author    Paul Mundt <lethal@linux-sh.org>  2011-01-14 02:06:31 -0500
committer Paul Mundt <lethal@linux-sh.org>  2011-01-14 02:06:31 -0500
commit    c488a4731abb53aa1bab9fccd8a7472083159bfd (patch)
tree      db6d4a664a1e4b7685c1d2d79da63263f40adf7b
parent    6d2ae89c36e2adab5cfa69fecb11290082817ac6 (diff)
parent    bba958783b1b4cb0a9420f4e11082467132a334c (diff)
Merge branch 'common/mmcif' into rmobile-latest
-rw-r--r--  Documentation/cgroups/memory.txt | 74
-rw-r--r--  Documentation/device-mapper/dm-crypt.txt | 7
-rw-r--r--  Documentation/device-mapper/dm-raid.txt | 70
-rw-r--r--  Documentation/filesystems/proc.txt | 7
-rw-r--r--  Documentation/gpio.txt | 2
-rw-r--r--  Documentation/vm/transhuge.txt | 298
-rw-r--r--  MAINTAINERS | 3
-rw-r--r--  arch/alpha/include/asm/mman.h | 3
-rw-r--r--  arch/arm/kernel/module.c | 14
-rw-r--r--  arch/arm/mm/pgd.c | 2
-rw-r--r--  arch/avr32/boards/atngw100/setup.c | 2
-rw-r--r--  arch/avr32/boards/atstk1000/atstk1002.c | 2
-rw-r--r--  arch/avr32/boards/favr-32/setup.c | 2
-rw-r--r--  arch/avr32/boards/hammerhead/setup.c | 2
-rw-r--r--  arch/avr32/boards/merisc/setup.c | 2
-rw-r--r--  arch/avr32/boards/mimc200/setup.c | 2
-rw-r--r--  arch/avr32/configs/atngw100_defconfig | 23
-rw-r--r--  arch/avr32/configs/atngw100_evklcd100_defconfig | 17
-rw-r--r--  arch/avr32/configs/atngw100_evklcd101_defconfig | 17
-rw-r--r--  arch/avr32/configs/atngw100mkii_defconfig | 22
-rw-r--r--  arch/avr32/configs/atngw100mkii_evklcd100_defconfig | 17
-rw-r--r--  arch/avr32/configs/atngw100mkii_evklcd101_defconfig | 17
-rw-r--r--  arch/avr32/configs/atstk1002_defconfig | 25
-rw-r--r--  arch/avr32/configs/atstk1003_defconfig | 41
-rw-r--r--  arch/avr32/configs/atstk1004_defconfig | 109
-rw-r--r--  arch/avr32/configs/atstk1006_defconfig | 23
-rw-r--r--  arch/avr32/configs/favr-32_defconfig | 2
-rw-r--r--  arch/avr32/configs/hammerhead_defconfig | 2
-rw-r--r--  arch/avr32/include/asm/syscalls.h | 14
-rw-r--r--  arch/avr32/kernel/process.c | 9
-rw-r--r--  arch/avr32/kernel/time.c | 5
-rw-r--r--  arch/ia64/kernel/perfmon.c | 2
-rw-r--r--  arch/ia64/mm/hugetlbpage.c | 2
-rw-r--r--  arch/mips/include/asm/mman.h | 3
-rw-r--r--  arch/mips/kernel/module.c | 14
-rw-r--r--  arch/parisc/include/asm/mman.h | 3
-rw-r--r--  arch/powerpc/mm/gup.c | 12
-rw-r--r--  arch/sh/mm/hugetlbpage.c | 2
-rw-r--r--  arch/sparc/kernel/module.c | 14
-rw-r--r--  arch/sparc/mm/generic_32.c | 2
-rw-r--r--  arch/sparc/mm/generic_64.c | 2
-rw-r--r--  arch/sparc/mm/hugetlbpage.c | 2
-rw-r--r--  arch/um/kernel/skas/mmu.c | 2
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 1
-rw-r--r--  arch/x86/include/asm/paravirt.h | 25
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 6
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h | 9
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 23
-rw-r--r--  arch/x86/include/asm/pgtable.h | 143
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 28
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 3
-rw-r--r--  arch/x86/include/asm/xen/page.h | 16
-rw-r--r--  arch/x86/kernel/module.c | 17
-rw-r--r--  arch/x86/kernel/paravirt.c | 3
-rw-r--r--  arch/x86/kernel/tboot.c | 2
-rw-r--r--  arch/x86/kernel/vm86_32.c | 1
-rw-r--r--  arch/x86/kvm/mmu.c | 125
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 9
-rw-r--r--  arch/x86/mm/gup.c | 28
-rw-r--r--  arch/x86/mm/pgtable.c | 66
-rw-r--r--  arch/x86/xen/Makefile | 3
-rw-r--r--  arch/x86/xen/mmu.c | 365
-rw-r--r--  arch/x86/xen/p2m.c | 510
-rw-r--r--  arch/xtensa/include/asm/mman.h | 3
-rw-r--r--  drivers/base/node.c | 21
-rw-r--r--  drivers/md/Kconfig | 24
-rw-r--r--  drivers/md/Makefile | 1
-rw-r--r--  drivers/md/bitmap.c | 12
-rw-r--r--  drivers/md/dm-crypt.c | 618
-rw-r--r--  drivers/md/dm-delay.c | 2
-rw-r--r--  drivers/md/dm-ioctl.c | 111
-rw-r--r--  drivers/md/dm-kcopyd.c | 57
-rw-r--r--  drivers/md/dm-log-userspace-base.c | 139
-rw-r--r--  drivers/md/dm-log-userspace-transfer.c | 1
-rw-r--r--  drivers/md/dm-log.c | 2
-rw-r--r--  drivers/md/dm-mpath.c | 67
-rw-r--r--  drivers/md/dm-raid.c | 697
-rw-r--r--  drivers/md/dm-raid1.c | 19
-rw-r--r--  drivers/md/dm-snap-persistent.c | 4
-rw-r--r--  drivers/md/dm-snap.c | 62
-rw-r--r--  drivers/md/dm-stripe.c | 27
-rw-r--r--  drivers/md/dm-table.c | 19
-rw-r--r--  drivers/md/dm.c | 23
-rw-r--r--  drivers/md/md.c | 197
-rw-r--r--  drivers/md/md.h | 13
-rw-r--r--  drivers/md/raid1.c | 33
-rw-r--r--  drivers/md/raid10.c | 17
-rw-r--r--  drivers/md/raid5.c | 16
-rw-r--r--  drivers/serial/atmel_serial.c | 5
-rw-r--r--  drivers/xen/Kconfig | 11
-rw-r--r--  drivers/xen/Makefile | 5
-rw-r--r--  drivers/xen/gntdev.c | 665
-rw-r--r--  drivers/xen/grant-table.c | 46
-rw-r--r--  drivers/xen/platform-pci.c | 21
-rw-r--r--  fs/ecryptfs/main.c | 5
-rw-r--r--  fs/fs-writeback.c | 105
-rw-r--r--  fs/mpage.c | 49
-rw-r--r--  fs/nfs/dir.c | 6
-rw-r--r--  fs/proc/base.c | 4
-rw-r--r--  fs/proc/meminfo.c | 14
-rw-r--r--  fs/proc/page.c | 14
-rw-r--r--  fs/proc/task_mmu.c | 7
-rw-r--r--  include/asm-generic/gpio.h | 10
-rw-r--r--  include/asm-generic/mman-common.h | 3
-rw-r--r--  include/asm-generic/pgtable.h | 225
-rw-r--r--  include/linux/compaction.h | 25
-rw-r--r--  include/linux/device-mapper.h | 12
-rw-r--r--  include/linux/dm-ioctl.h | 14
-rw-r--r--  include/linux/dm-log-userspace.h | 13
-rw-r--r--  include/linux/gfp.h | 15
-rw-r--r--  include/linux/gpio.h | 6
-rw-r--r--  include/linux/huge_mm.h | 179
-rw-r--r--  include/linux/irqdesc.h | 2
-rw-r--r--  include/linux/kernel.h | 7
-rw-r--r--  include/linux/kernel_stat.h | 19
-rw-r--r--  include/linux/khugepaged.h | 67
-rw-r--r--  include/linux/memcontrol.h | 36
-rw-r--r--  include/linux/memory_hotplug.h | 14
-rw-r--r--  include/linux/migrate.h | 12
-rw-r--r--  include/linux/mm.h | 140
-rw-r--r--  include/linux/mm_inline.h | 19
-rw-r--r--  include/linux/mm_types.h | 3
-rw-r--r--  include/linux/mmc/sh_mmcif.h | 4
-rw-r--r--  include/linux/mmu_notifier.h | 66
-rw-r--r--  include/linux/mmzone.h | 14
-rw-r--r--  include/linux/page-flags.h | 71
-rw-r--r--  include/linux/page_cgroup.h | 54
-rw-r--r--  include/linux/pagemap.h | 2
-rw-r--r--  include/linux/radix-tree.h | 16
-rw-r--r--  include/linux/rmap.h | 2
-rw-r--r--  include/linux/sched.h | 6
-rw-r--r--  include/linux/swap.h | 2
-rw-r--r--  include/linux/vmalloc.h | 10
-rw-r--r--  include/linux/vmstat.h | 7
-rw-r--r--  include/trace/events/compaction.h | 74
-rw-r--r--  include/trace/events/vmscan.h | 6
-rw-r--r--  include/trace/events/writeback.h | 1
-rw-r--r--  include/xen/gntdev.h | 119
-rw-r--r--  include/xen/grant_table.h | 44
-rw-r--r--  kernel/fork.c | 41
-rw-r--r--  kernel/futex.c | 55
-rw-r--r--  kernel/irq/irqdesc.c | 40
-rw-r--r--  mm/Kconfig | 38
-rw-r--r--  mm/Makefile | 3
-rw-r--r--  mm/compaction.c | 174
-rw-r--r--  mm/dmapool.c | 16
-rw-r--r--  mm/filemap.c | 17
-rw-r--r--  mm/huge_memory.c | 2346
-rw-r--r--  mm/hugetlb.c | 111
-rw-r--r--  mm/internal.h | 16
-rw-r--r--  mm/ksm.c | 81
-rw-r--r--  mm/madvise.c | 10
-rw-r--r--  mm/memcontrol.c | 258
-rw-r--r--  mm/memory-failure.c | 22
-rw-r--r--  mm/memory.c | 336
-rw-r--r--  mm/memory_hotplug.c | 17
-rw-r--r--  mm/mempolicy.c | 23
-rw-r--r--  mm/migrate.c | 123
-rw-r--r--  mm/mincore.c | 7
-rw-r--r--  mm/mlock.c | 163
-rw-r--r--  mm/mmap.c | 17
-rw-r--r--  mm/mmu_notifier.c | 20
-rw-r--r--  mm/mmzone.c | 21
-rw-r--r--  mm/mprotect.c | 20
-rw-r--r--  mm/mremap.c | 9
-rw-r--r--  mm/nommu.c | 6
-rw-r--r--  mm/page-writeback.c | 7
-rw-r--r--  mm/page_alloc.c | 163
-rw-r--r--  mm/pagewalk.c | 1
-rw-r--r--  mm/percpu-vm.c | 2
-rw-r--r--  mm/pgtable-generic.c | 123
-rw-r--r--  mm/rmap.c | 91
-rw-r--r--  mm/slub.c | 11
-rw-r--r--  mm/sparse.c | 4
-rw-r--r--  mm/swap.c | 322
-rw-r--r--  mm/swap_state.c | 6
-rw-r--r--  mm/swapfile.c | 2
-rw-r--r--  mm/vmalloc.c | 89
-rw-r--r--  mm/vmscan.c | 432
-rw-r--r--  mm/vmstat.c | 51
-rw-r--r--  virt/kvm/kvm_main.c | 39
181 files changed, 9940 insertions(+), 2132 deletions(-)
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 7781857dc940..bac328c232f5 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -385,6 +385,10 @@ mapped_file - # of bytes of mapped file (includes tmpfs/shmem)
 pgpgin - # of pages paged in (equivalent to # of charging events).
 pgpgout - # of pages paged out (equivalent to # of uncharging events).
 swap - # of bytes of swap usage
+dirty - # of bytes that are waiting to get written back to the disk.
+writeback - # of bytes that are actively being written back to the disk.
+nfs_unstable - # of bytes sent to the NFS server, but not yet committed to
+	the actual storage.
 inactive_anon - # of bytes of anonymous memory and swap cache memory on
 	LRU list.
 active_anon - # of bytes of anonymous and swap cache memory on active
@@ -406,6 +410,9 @@ total_mapped_file - sum of all children's "cache"
 total_pgpgin - sum of all children's "pgpgin"
 total_pgpgout - sum of all children's "pgpgout"
 total_swap - sum of all children's "swap"
+total_dirty - sum of all children's "dirty"
+total_writeback - sum of all children's "writeback"
+total_nfs_unstable - sum of all children's "nfs_unstable"
 total_inactive_anon - sum of all children's "inactive_anon"
 total_active_anon - sum of all children's "active_anon"
 total_inactive_file - sum of all children's "inactive_file"
@@ -453,6 +460,73 @@ memory under it will be reclaimed.
 You can reset failcnt by writing 0 to failcnt file.
 # echo 0 > .../memory.failcnt
 
+5.5 dirty memory
+
+Control the maximum amount of dirty pages a cgroup can have at any given time.
+
+Limiting dirty memory is like fixing the max amount of dirty (hard to reclaim)
+page cache used by a cgroup. So, in case of multiple cgroup writers, they will
+not be able to consume more than their designated share of dirty pages and will
+be forced to perform write-out if they cross that limit.
+
+The interface is equivalent to the procfs interface: /proc/sys/vm/dirty_*. It
+is possible to configure a limit to trigger either a direct writeback or a
+background writeback performed by per-bdi flusher threads. The root cgroup
+memory.dirty_* control files are read-only and match the contents of
+the /proc/sys/vm/dirty_* files.
+
+Per-cgroup dirty limits can be set using the following files in the cgroupfs:
+
+- memory.dirty_ratio: the amount of dirty memory (expressed as a percentage of
+  cgroup memory) at which a process generating dirty pages will itself start
+  writing out dirty data.
+
+- memory.dirty_limit_in_bytes: the amount of dirty memory (expressed in bytes)
+  in the cgroup at which a process generating dirty pages will itself start
+  writing out dirty data. A suffix (k, K, m, M, g, or G) can be used to
+  indicate that the value is in kilobytes, megabytes or gigabytes.
+
+  Note: memory.dirty_limit_in_bytes is the counterpart of memory.dirty_ratio.
+  Only one of them may be specified at a time. When one is written it is
+  immediately taken into account to evaluate the dirty memory limits and the
+  other appears as 0 when read.
+
+- memory.dirty_background_ratio: the amount of dirty memory of the cgroup
+  (expressed as a percentage of cgroup memory) at which background writeback
+  kernel threads will start writing out dirty data.
+
+- memory.dirty_background_limit_in_bytes: the amount of dirty memory (expressed
+  in bytes) in the cgroup at which background writeback kernel threads will
+  start writing out dirty data. A suffix (k, K, m, M, g, or G) can be used to
+  indicate that the value is in kilobytes, megabytes or gigabytes.
+
+  Note: memory.dirty_background_limit_in_bytes is the counterpart of
+  memory.dirty_background_ratio. Only one of them may be specified at a time.
+  When one is written it is immediately taken into account to evaluate the
+  dirty memory limits and the other appears as 0 when read.
+
+A cgroup may contain more dirty memory than its dirty limit. This is possible
+because of the principle that the first cgroup to touch a page is charged for
+it. Subsequent page counting events (dirty, writeback, nfs_unstable) are also
+counted to the originally charged cgroup.
+
+Example: if a page is allocated by a cgroup A task, then the page is charged
+to cgroup A. If the page is later dirtied by a task in cgroup B, then the
+cgroup A dirty count will be incremented. If cgroup A is over its dirty limit
+but cgroup B is not, then dirtying a cgroup A page from a cgroup B task may
+push cgroup A over its dirty limit without throttling the dirtying cgroup B
+task.
+
+When use_hierarchy=0, each cgroup has its own dirty memory usage and limits.
+System-wide dirty limits are also consulted. Dirty memory consumption is
+checked against both system-wide and per-cgroup dirty limits.
+
+The current implementation does not enforce per-cgroup dirty limits when
+use_hierarchy=1. System-wide dirty limits are used for processes in such
+cgroups. Attempts to read memory.dirty_* files return the system-wide
+values. Writes to the memory.dirty_* files return an error. An enhanced
+implementation is needed to check the chain of parents to ensure that no
+dirty limit is exceeded.
+
 6. Hierarchy support
 
 The memory controller supports a deep hierarchy and hierarchical accounting.
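[Editor's note: the memory.dirty_* knobs documented in the hunk above are ordinary cgroupfs files, so they can be driven from any language. Below is a minimal C sketch; the cgroup path /cgroup/web and the chosen limits are hypothetical placeholders, not part of the patch.]

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write a string value into one memory.dirty_* control file. */
static int write_knob(const char *path, const char *value)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0) {
		perror(path);
		return -1;
	}
	n = write(fd, value, strlen(value));
	if (n < 0)
		perror(path);
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	const char *grp = "/cgroup/web";	/* hypothetical cgroup mount point */
	char path[256];

	/* Start background writeback once 5% of the group's memory is dirty. */
	snprintf(path, sizeof(path), "%s/memory.dirty_background_ratio", grp);
	write_knob(path, "5");

	/* Throttle dirtiers in this group at an absolute 64 MB of dirty memory;
	 * per the text above, the ratio counterpart will then read back as 0. */
	snprintf(path, sizeof(path), "%s/memory.dirty_limit_in_bytes", grp);
	write_knob(path, "64M");

	return 0;
}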
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index 524de926290d..59293ac4a5d0 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -8,7 +8,7 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
 
 <cipher>
     Encryption cipher and an optional IV generation mode.
-    (In format cipher-chainmode-ivopts:ivmode).
+    (In format cipher[:keycount]-chainmode-ivopts:ivmode).
     Examples:
        des
        aes-cbc-essiv:sha256
@@ -20,6 +20,11 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
     Key used for encryption. It is encoded as a hexadecimal number.
     You can only use key sizes that are valid for the selected cipher.
 
+<keycount>
+    Multi-key compatibility mode. You can define <keycount> keys and
+    then sectors are encrypted according to their offsets (sector 0 uses key0;
+    sector 1 uses key1 etc.). <keycount> must be a power of two.
+
 <iv_offset>
     The IV offset is a sector count that is added to the sector number
     before creating the IV.
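[Editor's note: a short aside on how the multi-key mode described above maps sectors to keys. Because <keycount> must be a power of two, the key index is simply the low bits of the sector number; the helper below is an illustrative sketch of that arithmetic only (the function name and types are mine, not dm-crypt's).]

#include <stdint.h>

/*
 * Pick the key slot for a given sector in dm-crypt's multi-key
 * compatibility mode: sector 0 uses key0, sector 1 uses key1, and so on,
 * wrapping around after keycount keys. keycount must be a power of two.
 */
static inline unsigned int key_index(uint64_t sector, unsigned int keycount)
{
	return (unsigned int)(sector & (keycount - 1));
}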
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
new file mode 100644
index 000000000000..33b6b7071ac8
--- /dev/null
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -0,0 +1,70 @@
+Device-mapper RAID (dm-raid) is a bridge from DM to MD.  It
+provides a way to use device-mapper interfaces to access the MD RAID
+drivers.
+
+As with all device-mapper targets, the nominal public interfaces are the
+constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO
+and STATUSTYPE_TABLE).  The CTR table looks like the following:
+
+1: <s> <l> raid \
+2:	<raid_type> <#raid_params> <raid_params> \
+3:	<#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
+
+Line 1 contains the standard first three arguments to any device-mapper
+target - the start, length, and target type fields.  The target type in
+this case is "raid".
+
+Line 2 contains the arguments that define the particular raid
+type/personality/level, the required arguments for that raid type, and
+any optional arguments.  Possible raid types include: raid4, raid5_la,
+raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc.  (raid1 is
+planned for the future.)  The list of required and optional parameters
+is the same for all the current raid types.  The required parameters are
+positional, while the optional parameters are given as key/value pairs.
+The possible parameters are as follows:
+ <chunk_size>				Chunk size in sectors.
+ [[no]sync]				Force/Prevent RAID initialization
+ [rebuild <idx>]			Rebuild the drive indicated by the index
+ [daemon_sleep <ms>]			Time between bitmap daemon work to clear bits
+ [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
+ [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
+ [max_write_behind <sectors>]		See '-write-behind=' (man mdadm)
+ [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
+
+Line 3 contains the list of devices that compose the array in
+metadata/data device pairs.  If the metadata is stored separately, a '-'
+is given for the metadata device position.  If a drive has failed or is
+missing at creation time, a '-' can be given for both the metadata and
+data drives for a given position.
+
+NB. Currently all metadata devices must be specified as '-'.
+
+Examples:
+# RAID4 - 4 data drives, 1 parity
+# No metadata devices specified to hold superblock/bitmap info
+# Chunk size of 1MiB
+# (Lines separated for easy reading)
+0 1960893648 raid \
+	raid4 1 2048 \
+	5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+
+# RAID4 - 4 data drives, 1 parity (no metadata devices)
+# Chunk size of 1MiB, force RAID initialization,
+#	min recovery rate at 20 kiB/sec/disk
+0 1960893648 raid \
+	raid4 4 2048 min_recovery_rate 20 sync\
+	5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+
+Performing a 'dmsetup table' should display the CTR table used to
+construct the mapping (with possible reordering of optional
+parameters).
+
+Performing a 'dmsetup status' will yield information on the state and
+health of the array.  The output is as follows:
+1: <s> <l> raid \
+2:	<raid_type> <#devices> <1 health char for each dev> <resync_ratio>
+
+Line 1 is standard DM output.  Line 2 is best shown by example:
+	0 1960893648 raid raid4 5 AAAAA 2/490221568
+Here we can see the RAID type is raid4, there are 5 devices - all of
+which are 'A'live, and the array is 2/490221568 complete with recovery.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 9471225212c4..23cae6548d3a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -375,6 +375,7 @@ Anonymous: 0 kB
 Swap: 0 kB
 KernelPageSize: 4 kB
 MMUPageSize: 4 kB
+Locked: 374 kB
 
 The first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps. The remaining lines show the size of the mapping
@@ -670,6 +671,8 @@ varies by architecture and compile options. The following is from a
 
 > cat /proc/meminfo
 
+The "Locked" field indicates whether the mapping is locked in memory or not.
+
 
 MemTotal: 16344972 kB
 MemFree: 13634064 kB
@@ -1320,6 +1323,10 @@ scaled linearly with /proc/<pid>/oom_score_adj.
 Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
 other with its scaled value.
 
+The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last
+value set by a CAP_SYS_RESOURCE process. To reduce the value any lower
+requires CAP_SYS_RESOURCE.
+
 NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see
 Documentation/feature-removal-schedule.txt.
 
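[Editor's note: /proc/<pid>/oom_score_adj is an ordinary procfs file; a task can raise its own value freely, but per the note above it cannot lower it below the last value set by a CAP_SYS_RESOURCE process unless it holds that capability itself. A minimal sketch follows; the value 500 is an arbitrary example.]

#include <stdio.h>

int main(void)
{
	/* Make the current task a more attractive OOM-kill victim. */
	FILE *f = fopen("/proc/self/oom_score_adj", "w");

	if (!f) {
		perror("oom_score_adj");
		return 1;
	}
	fprintf(f, "%d\n", 500);
	fclose(f);
	return 0;
}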
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index a492d92bb098..792faa3c06cf 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -135,7 +135,7 @@ setting up a platform_device using the GPIO, is mark its direction:
 	int gpio_direction_input(unsigned gpio);
 	int gpio_direction_output(unsigned gpio, int value);
 
-The return value is zero for success, else a negative errno. It must
+The return value is zero for success, else a negative errno. It should
 be checked, since the get/set calls don't have error returns and since
 misconfiguration is possible. You should normally issue these calls from
 a task context. However, for spinlock-safe GPIOs it's OK to use them
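[Editor's note: to illustrate the "should be checked" guidance above, a typical consumer of the legacy integer-GPIO API checks the direction call even though the later get/set calls cannot report errors. This is a sketch only; the GPIO number and label are placeholders.]

#include <linux/gpio.h>

static int example_led_init(unsigned int gpio)
{
	int err;

	err = gpio_request(gpio, "example-led");	/* placeholder label */
	if (err)
		return err;

	/* Direction setup can fail; gpio_set_value() below cannot. */
	err = gpio_direction_output(gpio, 0);
	if (err) {
		gpio_free(gpio);
		return err;
	}

	gpio_set_value(gpio, 1);
	return 0;
}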
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
new file mode 100644
index 000000000000..0924aaca3302
--- /dev/null
+++ b/Documentation/vm/transhuge.txt
@@ -0,0 +1,298 @@
+= Transparent Hugepage Support =
+
+== Objective ==
+
+Performance critical computing applications dealing with large memory
+working sets are already running on top of libhugetlbfs and in turn
+hugetlbfs. Transparent Hugepage Support is an alternative way of
+backing virtual memory with huge pages, one that supports the
+automatic promotion and demotion of page sizes and avoids the
+shortcomings of hugetlbfs.
+
+Currently it only works for anonymous memory mappings, but in the
+future it can expand over the pagecache layer, starting with tmpfs.
+
+The reason applications run faster comes down to two factors. The
+first factor is almost completely irrelevant and not of significant
+interest, because it also has the downside of requiring larger
+clear-page and copy-page operations in page faults, which is a
+potentially negative effect. The first factor consists of taking a
+single page fault for each 2M virtual region touched by userland
+(thus reducing the enter/exit kernel frequency by a factor of 512).
+This only matters the first time the memory is accessed for the
+lifetime of a memory mapping. The second, long lasting and much more
+important, factor affects all subsequent accesses to the memory for
+the whole runtime of the application. The second factor consists of
+two components: 1) the TLB miss will run faster (especially with
+virtualization using nested pagetables, but almost always also on
+bare metal without virtualization) and 2) a single TLB entry will be
+mapping a much larger amount of virtual memory, in turn reducing the
+number of TLB misses. With virtualization and nested pagetables, the
+larger TLB entries can be used only if both KVM and the Linux guest
+are using hugepages, but a significant speedup already happens if
+only one of the two is using hugepages, just because the TLB miss is
+going to run faster.
+
+== Design ==
+
+- "graceful fallback": mm components which don't have transparent
+  hugepage knowledge fall back to breaking a transparent hugepage and
+  working on the regular pages and their respective regular pmd/pte
+  mappings
+
+- if a hugepage allocation fails because of memory fragmentation,
+  regular pages should be gracefully allocated instead and mixed in
+  the same vma without any failure or significant delay and without
+  userland noticing
+
+- if some task quits and more hugepages become available (either
+  immediately in the buddy or through the VM), guest physical memory
+  backed by regular pages should be relocated on hugepages
+  automatically (with khugepaged)
+
+- it doesn't require memory reservation and in turn it uses hugepages
+  whenever possible (the only possible reservation here is kernelcore=
+  to avoid unmovable pages fragmenting all the memory, but such a
+  tweak is not specific to transparent hugepage support and it's a
+  generic feature that applies to all dynamic high order allocations
+  in the kernel)
+
+- this initial support only offers the feature in the anonymous memory
+  regions but it'd be ideal to move it to tmpfs and the pagecache
+  later
+
+Compared to the reservation approach of hugetlbfs, Transparent
+Hugepage Support maximizes the usefulness of free memory by allowing
+all unused memory to be used as cache or other movable (or even
+unmovable) entities. It doesn't require reservation to prevent
+hugepage allocation failures from being noticeable from userland. It
+allows paging and all other advanced VM features to be available on
+the hugepages. It requires no modifications for applications to take
+advantage of it.
+
+Applications however can be further optimized to take advantage of
+this feature, as for example they've been optimized before to avoid
+a flood of mmap system calls for every malloc(4k). Optimizing
+userland is by far not mandatory and khugepaged already can take
+care of long lived page allocations even for hugepage unaware
+applications that deal with large amounts of memory.
+
+In certain cases when hugepages are enabled system wide, applications
+may end up allocating more memory resources. An application may mmap
+a large region but only touch 1 byte of it; in that case a 2M page
+might be allocated instead of a 4k page for no good reason. This is
+why it's possible to disable hugepages system-wide and to only have
+them inside MADV_HUGEPAGE madvise regions.
+
+Embedded systems should enable hugepages only inside madvise regions
+to eliminate any risk of wasting any precious byte of memory and to
+only run faster.
+
+Applications that get a lot of benefit from hugepages and that don't
+risk losing memory by using hugepages should use
+madvise(MADV_HUGEPAGE) on their critical mmapped regions.
+
+== sysfs ==
+
+Transparent Hugepage Support can be entirely disabled (mostly for
+debugging purposes), or only enabled inside MADV_HUGEPAGE regions (to
+avoid the risk of consuming more memory resources), or enabled system
+wide. This can be achieved with one of:
+
+echo always >/sys/kernel/mm/transparent_hugepage/enabled
+echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
+echo never >/sys/kernel/mm/transparent_hugepage/enabled
+
+It's also possible to limit the VM's defrag efforts to generate
+hugepages to madvise regions only (in case hugepages are not
+immediately free), or to never try to defrag memory and simply fall
+back to regular pages unless hugepages are immediately available.
+Clearly if we spend CPU time to defrag memory, we would expect to
+gain even more by the fact we use hugepages later instead of regular
+pages. This isn't always guaranteed, but it may be more likely in
+case the allocation is for a MADV_HUGEPAGE region.
+
+echo always >/sys/kernel/mm/transparent_hugepage/defrag
+echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
+echo never >/sys/kernel/mm/transparent_hugepage/defrag
+
+khugepaged will be automatically started when
+transparent_hugepage/enabled is set to "always" or "madvise", and it
+will be automatically shut down if it's set to "never".
+
+khugepaged usually runs at low frequency, so while one may not want
+to invoke defrag algorithms synchronously during the page faults, it
+should be worth invoking defrag at least in khugepaged. However it's
+also possible to disable defrag in khugepaged:
+
+echo yes >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+echo no >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+
+You can also control how many pages khugepaged should scan at each
+pass:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
+
+and how many milliseconds to wait in khugepaged between each pass (you
+can set this to 0 to run khugepaged at 100% utilization of one core):
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
+
+and how many milliseconds to wait in khugepaged if there's a hugepage
+allocation failure, to throttle the next allocation attempt:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
+
+The khugepaged progress can be seen in the number of pages collapsed:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
+
+and in the number of full scans performed:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
+
+== Boot parameter ==
+
+You can change the sysfs boot time defaults of Transparent Hugepage
+Support by passing the parameter "transparent_hugepage=always" or
+"transparent_hugepage=madvise" or "transparent_hugepage=never"
+(without "") to the kernel command line.
+
+== Need of application restart ==
+
+The transparent_hugepage/enabled values only affect future
+behavior. So to make them effective you need to restart any
+application that could have been using hugepages. This also applies
+to the regions registered in khugepaged.
+
+== get_user_pages and follow_page ==
+
+get_user_pages and follow_page, if run on a hugepage, will return the
+head or tail pages as usual (exactly as they would do on
+hugetlbfs). Most gup users will only care about the actual physical
+address of the page and its temporary pinning to release after the
+I/O is complete, so they won't ever notice the fact the page is
+huge. But if any driver is going to mangle over the page structure of
+the tail page (like for checking page->mapping or other bits that are
+relevant for the head page and not the tail page), it should be
+updated to check the head page instead (while serializing properly
+against split_huge_page() to avoid the head and tail pages
+disappearing from under it; see the futex code for an example of
+that, hugetlbfs also needed special handling in the futex code for
+similar reasons).
+
+NOTE: these aren't new constraints to the GUP API, and they match the
+same constraints that apply to hugetlbfs too, so any driver capable
+of handling GUP on hugetlbfs will also work fine on transparent
+hugepage backed mappings.
+
+In case you can't handle compound pages if they're returned by
+follow_page, the FOLL_SPLIT bit can be specified as a parameter to
+follow_page, so that it will split the hugepages before returning
+them. Migration for example passes FOLL_SPLIT as a parameter to
+follow_page because it's not hugepage aware and in fact it can't work
+at all on hugetlbfs (but it instead works fine on transparent
+hugepages thanks to FOLL_SPLIT). Migration simply can't deal with
+hugepages being returned (as it's not only checking the pfn of the
+page and pinning it during the copy, but it pretends to migrate the
+memory in regular page sizes and with regular pte/pmd mappings).
+
+== Optimizing the applications ==
+
+To be guaranteed that the kernel will map a 2M page immediately in
+any memory region, the mmap region has to be hugepage naturally
+aligned. posix_memalign() can provide that guarantee.
+
+== Hugetlbfs ==
+
+You can use hugetlbfs on a kernel that has transparent hugepage
+support enabled just fine as always. No difference can be noted in
+hugetlbfs other than there will be less overall fragmentation. All
+usual features belonging to hugetlbfs are preserved and
+unaffected. libhugetlbfs will also work fine as usual.
+
+== Graceful fallback ==
+
+Code walking pagetables but unaware about huge pmds can simply call
+split_huge_page_pmd(mm, pmd) where the pmd is the one returned by
+pmd_offset. It's trivial to make the code transparent hugepage aware
+by just grepping for "pmd_offset" and adding split_huge_page_pmd
+where missing after pmd_offset returns the pmd. Thanks to the
+graceful fallback design, with a one liner change, you can avoid
+writing hundreds if not thousands of lines of complex code to make
+your code hugepage aware.
+
+If you're not walking pagetables but you run into a physical hugepage
+that you can't handle natively in your code, you can split it by
+calling split_huge_page(page). This is what the Linux VM does before
+it tries to swap out the hugepage, for example.
+
+Example of making mremap.c transparent hugepage aware with a one
+liner change:
+
+diff --git a/mm/mremap.c b/mm/mremap.c
+--- a/mm/mremap.c
++++ b/mm/mremap.c
+@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
+		return NULL;
+
+	pmd = pmd_offset(pud, addr);
++	split_huge_page_pmd(mm, pmd);
+	if (pmd_none_or_clear_bad(pmd))
+		return NULL;
+
+== Locking in hugepage aware code ==
+
+We want as much code as possible hugepage aware, as calling
+split_huge_page() or split_huge_page_pmd() has a cost.
+
+To make pagetable walks huge pmd aware, all you need to do is to call
+pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
+mmap_sem in read (or write) mode to be sure a huge pmd cannot be
+created from under you by khugepaged (khugepaged collapse_huge_page
+takes the mmap_sem in write mode in addition to the anon_vma lock).
+If pmd_trans_huge returns false, you just fall back to the old code
+paths. If instead pmd_trans_huge returns true, you have to take the
+mm->page_table_lock and re-run pmd_trans_huge. Taking the
+page_table_lock will prevent the huge pmd from being converted into a
+regular pmd from under you (split_huge_page can run in parallel to
+the pagetable walk). If the second pmd_trans_huge returns false, you
+should just drop the page_table_lock and fall back to the old code as
+before. Otherwise you should run pmd_trans_splitting on the pmd. In
+case pmd_trans_splitting returns true, it means split_huge_page is
+already in the middle of splitting the page. So if pmd_trans_splitting
+returns true it's enough to drop the page_table_lock, call
+wait_split_huge_page and then fall back to the old code paths. You
+are guaranteed that, by the time wait_split_huge_page returns, the
+pmd isn't huge anymore. If pmd_trans_splitting returns false, you can
+proceed to process the huge pmd and the hugepage natively. Once
+finished you can drop the page_table_lock.
+
+== compound_lock, get_user_pages and put_page ==
+
+split_huge_page internally has to distribute the refcounts in the
+head page to the tail pages before clearing all PG_head/tail bits
+from the page structures. It can do that easily for refcounts taken
+by huge pmd mappings. But the gup API as created by hugetlbfs (which
+returns head and tail pages if running get_user_pages on an address
+backed by any hugepage) requires the refcount to be accounted on the
+tail pages and not only in the head pages, if we want to be able to
+run split_huge_page while there are gup pins established on any tail
+page. Not being able to run split_huge_page if there's any gup pin on
+any tail page would mean having to split all hugepages upfront in
+get_user_pages, which is unacceptable, as too many gup users are
+performance critical and they must work natively on hugepages like
+they work natively on hugetlbfs already (hugetlbfs is simpler because
+hugetlbfs pages cannot be split, so there's no requirement to account
+for the pins on the tail pages for hugetlbfs). If we didn't account
+the gup refcounts on the tail pages during gup, we wouldn't know
+anymore which tail page is pinned by gup and which is not while we
+run split_huge_page. But we still have to add the gup pin to the head
+page too, to know when we can free the compound page in case it's
+never split during its lifetime. That requires changing not just
+get_page, but put_page as well, so that when put_page runs on a tail
+page (and only on a tail page) it will find its respective head page,
+and then it will decrease the head page refcount in addition to the
+tail page refcount. To obtain a head page reliably and to decrease
+its refcount without race conditions, put_page has to serialize
+against __split_huge_page_refcount using a special per-page lock
+called compound_lock.
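[Editor's note: the "Optimizing the applications" section above boils down to aligning the mapping and then advising the kernel. Below is a minimal userspace sketch of that, assuming a 2M hugepage size; the MADV_HUGEPAGE value matches the one added by this series for the architectures touched here, and error handling is reduced to the essentials.]

#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* value used by this series; may differ per arch */
#endif

#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assumed 2M hugepage size */

int main(void)
{
	void *buf;
	size_t len = 64 * HPAGE_SIZE;

	/* Hugepage-aligned allocation, so the kernel can map 2M pages. */
	if (posix_memalign(&buf, HPAGE_SIZE, len))
		return 1;

	/* Ask for hugepages even when the system-wide policy is "madvise". */
	if (madvise(buf, len, MADV_HUGEPAGE))
		return 1;

	memset(buf, 0, len);	/* touch the region; faults may fill 2M pmds */
	free(buf);
	return 0;
}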
diff --git a/MAINTAINERS b/MAINTAINERS
index 3dd5c6fce989..af656ded404e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6592,13 +6592,12 @@ F:	Documentation/i2c/busses/i2c-viapro
 F:	drivers/i2c/busses/i2c-viapro.c
 
 VIA SD/MMC CARD CONTROLLER DRIVER
-M:	Joseph Chan <JosephChan@via.com.tw>
+M:	Bruce Chang <brucechang@via.com.tw>
 M:	Harald Welte <HaraldWelte@viatech.com>
 S:	Maintained
 F:	drivers/mmc/host/via-sdmmc.c
 
 VIA UNICHROME(PRO)/CHROME9 FRAMEBUFFER DRIVER
-M:	Joseph Chan <JosephChan@via.com.tw>
 M:	Florian Tobias Schandinat <FlorianSchandinat@gmx.de>
 L:	linux-fbdev@vger.kernel.org
 S:	Maintained
diff --git a/arch/alpha/include/asm/mman.h b/arch/alpha/include/asm/mman.h
index 99c56d47879d..72db984f8781 100644
--- a/arch/alpha/include/asm/mman.h
+++ b/arch/alpha/include/asm/mman.h
@@ -53,6 +53,9 @@
 #define MADV_MERGEABLE   12	/* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 13	/* KSM may not merge identical pages */
 
+#define MADV_HUGEPAGE	14	/* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE	15	/* Not worth backing with hugepages */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index 0c1bb68ff4a8..2cfe8161b478 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -38,17 +38,9 @@
 #ifdef CONFIG_MMU
 void *module_alloc(unsigned long size)
 {
-	struct vm_struct *area;
-
-	size = PAGE_ALIGN(size);
-	if (!size)
-		return NULL;
-
-	area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
-	if (!area)
-		return NULL;
-
-	return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
+	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+				GFP_KERNEL, PAGE_KERNEL_EXEC, -1,
+				__builtin_return_address(0));
 }
 #else /* CONFIG_MMU */
 void *module_alloc(unsigned long size)
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index 93292a18cf77..709244c66fa3 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -50,7 +50,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	if (!new_pmd)
 		goto no_pmd;
 
-	new_pte = pte_alloc_map(mm, new_pmd, 0);
+	new_pte = pte_alloc_map(mm, NULL, new_pmd, 0);
 	if (!new_pte)
 		goto no_pte;
 
diff --git a/arch/avr32/boards/atngw100/setup.c b/arch/avr32/boards/atngw100/setup.c
index 8c6a2440e345..659d119ce712 100644
--- a/arch/avr32/boards/atngw100/setup.c
+++ b/arch/avr32/boards/atngw100/setup.c
@@ -188,7 +188,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
 	 */
 	regs = (void __iomem __force *)res->start;
 	pclk = clk_get(&pdev->dev, "pclk");
-	if (!pclk)
+	if (IS_ERR(pclk))
 		return;
 
 	clk_enable(pclk);
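[Editor's note: this board fix, and the identical ones in the following files, exist because clk_get() reports failure with an ERR_PTR-encoded pointer rather than NULL, so a plain NULL check never triggers. The usual consumer pattern looks like the sketch below; the device and clock id are placeholders.]

#include <linux/clk.h>
#include <linux/err.h>

static int example_enable_pclk(struct device *dev)
{
	struct clk *pclk = clk_get(dev, "pclk");
	int ret;

	if (IS_ERR(pclk))		/* a NULL check would miss ERR_PTR values */
		return PTR_ERR(pclk);

	ret = clk_enable(pclk);
	if (ret) {
		clk_put(pclk);
		return ret;
	}

	/* ... use the peripheral ... */

	clk_disable(pclk);
	clk_put(pclk);
	return 0;
}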
diff --git a/arch/avr32/boards/atstk1000/atstk1002.c b/arch/avr32/boards/atstk1000/atstk1002.c
index 2adc261c9e3d..6ce30fb2ec94 100644
--- a/arch/avr32/boards/atstk1000/atstk1002.c
+++ b/arch/avr32/boards/atstk1000/atstk1002.c
@@ -203,7 +203,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
 	 */
 	regs = (void __iomem __force *)res->start;
 	pclk = clk_get(&pdev->dev, "pclk");
-	if (!pclk)
+	if (IS_ERR(pclk))
 		return;
 
 	clk_enable(pclk);
diff --git a/arch/avr32/boards/favr-32/setup.c b/arch/avr32/boards/favr-32/setup.c
index 75f19f47fb2f..86fab77a5a00 100644
--- a/arch/avr32/boards/favr-32/setup.c
+++ b/arch/avr32/boards/favr-32/setup.c
@@ -206,7 +206,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
 	 */
 	regs = (void __iomem __force *)res->start;
 	pclk = clk_get(&pdev->dev, "pclk");
-	if (!pclk)
+	if (IS_ERR(pclk))
 		return;
 
 	clk_enable(pclk);
diff --git a/arch/avr32/boards/hammerhead/setup.c b/arch/avr32/boards/hammerhead/setup.c
index dd009875a405..da14fbdd4e8e 100644
--- a/arch/avr32/boards/hammerhead/setup.c
+++ b/arch/avr32/boards/hammerhead/setup.c
@@ -150,7 +150,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
 	regs = (void __iomem __force *)res->start;
 	pclk = clk_get(&pdev->dev, "pclk");
 
-	if (!pclk)
+	if (IS_ERR(pclk))
 		return;
 
 	clk_enable(pclk);
diff --git a/arch/avr32/boards/merisc/setup.c b/arch/avr32/boards/merisc/setup.c
index 623b077594fc..e61bc948f959 100644
--- a/arch/avr32/boards/merisc/setup.c
+++ b/arch/avr32/boards/merisc/setup.c
@@ -134,7 +134,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
 
 	regs = (void __iomem __force *)res->start;
 	pclk = clk_get(&pdev->dev, "pclk");
-	if (!pclk)
+	if (IS_ERR(pclk))
 		return;
 
 	clk_enable(pclk);
diff --git a/arch/avr32/boards/mimc200/setup.c b/arch/avr32/boards/mimc200/setup.c
index 523d8e183bef..c4da5cba2dbf 100644
--- a/arch/avr32/boards/mimc200/setup.c
+++ b/arch/avr32/boards/mimc200/setup.c
@@ -162,7 +162,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
 	 */
 	regs = (void __iomem __force *)res->start;
 	pclk = clk_get(&pdev->dev, "pclk");
-	if (!pclk)
+	if (IS_ERR(pclk))
 		return;
 
 	clk_enable(pclk);
diff --git a/arch/avr32/configs/atngw100_defconfig b/arch/avr32/configs/atngw100_defconfig
index 9854013d2728..6f9ca56de1f6 100644
--- a/arch/avr32/configs/atngw100_defconfig
+++ b/arch/avr32/configs/atngw100_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
8CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
9CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
10# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
11# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 11CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 14CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y
19# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
20# CONFIG_IOSCHED_DEADLINE is not set 17# CONFIG_IOSCHED_DEADLINE is not set
21CONFIG_NO_HZ=y 18CONFIG_NO_HZ=y
@@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y
29CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 26CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
30CONFIG_CPU_FREQ_GOV_USERSPACE=y 27CONFIG_CPU_FREQ_GOV_USERSPACE=y
31CONFIG_CPU_FREQ_AT32AP=y 28CONFIG_CPU_FREQ_AT32AP=y
29CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
32CONFIG_NET=y 30CONFIG_NET=y
33CONFIG_PACKET=y 31CONFIG_PACKET=y
34CONFIG_UNIX=y 32CONFIG_UNIX=y
@@ -72,8 +70,8 @@ CONFIG_MTD_UBI=y
72CONFIG_BLK_DEV_LOOP=m 70CONFIG_BLK_DEV_LOOP=m
73CONFIG_BLK_DEV_NBD=m 71CONFIG_BLK_DEV_NBD=m
74CONFIG_BLK_DEV_RAM=m 72CONFIG_BLK_DEV_RAM=m
73CONFIG_MISC_DEVICES=y
75CONFIG_ATMEL_TCLIB=y 74CONFIG_ATMEL_TCLIB=y
76CONFIG_EEPROM_AT24=m
77CONFIG_NETDEVICES=y 75CONFIG_NETDEVICES=y
78CONFIG_TUN=m 76CONFIG_TUN=m
79CONFIG_NET_ETHERNET=y 77CONFIG_NET_ETHERNET=y
@@ -106,6 +104,7 @@ CONFIG_GPIO_SYSFS=y
106CONFIG_WATCHDOG=y 104CONFIG_WATCHDOG=y
107CONFIG_AT32AP700X_WDT=y 105CONFIG_AT32AP700X_WDT=y
108CONFIG_USB_GADGET=y 106CONFIG_USB_GADGET=y
107CONFIG_USB_GADGET_VBUS_DRAW=350
109CONFIG_USB_ZERO=m 108CONFIG_USB_ZERO=m
110CONFIG_USB_ETH=m 109CONFIG_USB_ETH=m
111CONFIG_USB_GADGETFS=m 110CONFIG_USB_GADGETFS=m
@@ -115,14 +114,12 @@ CONFIG_USB_CDC_COMPOSITE=m
115CONFIG_MMC=y 114CONFIG_MMC=y
116CONFIG_MMC_TEST=m 115CONFIG_MMC_TEST=m
117CONFIG_MMC_ATMELMCI=y 116CONFIG_MMC_ATMELMCI=y
118CONFIG_MMC_SPI=m
119CONFIG_NEW_LEDS=y 117CONFIG_NEW_LEDS=y
120CONFIG_LEDS_CLASS=y 118CONFIG_LEDS_CLASS=y
121CONFIG_LEDS_GPIO=y 119CONFIG_LEDS_GPIO=y
122CONFIG_LEDS_TRIGGERS=y 120CONFIG_LEDS_TRIGGERS=y
123CONFIG_LEDS_TRIGGER_TIMER=y 121CONFIG_LEDS_TRIGGER_TIMER=y
124CONFIG_LEDS_TRIGGER_HEARTBEAT=y 122CONFIG_LEDS_TRIGGER_HEARTBEAT=y
125CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
126CONFIG_RTC_CLASS=y 123CONFIG_RTC_CLASS=y
127CONFIG_RTC_DRV_AT32AP700X=y 124CONFIG_RTC_DRV_AT32AP700X=y
128CONFIG_DMADEVICES=y 125CONFIG_DMADEVICES=y
@@ -130,21 +127,23 @@ CONFIG_EXT2_FS=y
130CONFIG_EXT3_FS=y 127CONFIG_EXT3_FS=y
131# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 128# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
132# CONFIG_EXT3_FS_XATTR is not set 129# CONFIG_EXT3_FS_XATTR is not set
130CONFIG_EXT4_FS=y
131# CONFIG_EXT4_FS_XATTR is not set
133# CONFIG_DNOTIFY is not set 132# CONFIG_DNOTIFY is not set
134CONFIG_FUSE_FS=m 133CONFIG_FUSE_FS=m
135CONFIG_MSDOS_FS=m 134CONFIG_MSDOS_FS=m
136CONFIG_VFAT_FS=m 135CONFIG_VFAT_FS=m
137CONFIG_FAT_DEFAULT_CODEPAGE=850 136CONFIG_FAT_DEFAULT_CODEPAGE=850
137CONFIG_PROC_KCORE=y
138CONFIG_TMPFS=y 138CONFIG_TMPFS=y
139CONFIG_CONFIGFS_FS=m 139CONFIG_CONFIGFS_FS=y
140CONFIG_JFFS2_FS=y 140CONFIG_JFFS2_FS=y
141CONFIG_UFS_FS=y 141CONFIG_UBIFS_FS=y
142CONFIG_NFS_FS=y 142CONFIG_NFS_FS=y
143CONFIG_NFS_V3=y 143CONFIG_NFS_V3=y
144CONFIG_ROOT_NFS=y 144CONFIG_ROOT_NFS=y
145CONFIG_NFSD=m 145CONFIG_NFSD=m
146CONFIG_NFSD_V3=y 146CONFIG_NFSD_V3=y
147CONFIG_SMB_FS=m
148CONFIG_CIFS=m 147CONFIG_CIFS=m
149CONFIG_NLS_CODEPAGE_437=m 148CONFIG_NLS_CODEPAGE_437=m
150CONFIG_NLS_CODEPAGE_850=m 149CONFIG_NLS_CODEPAGE_850=m
@@ -155,5 +154,3 @@ CONFIG_DEBUG_FS=y
155CONFIG_DEBUG_KERNEL=y 154CONFIG_DEBUG_KERNEL=y
156CONFIG_DETECT_HUNG_TASK=y 155CONFIG_DETECT_HUNG_TASK=y
157CONFIG_FRAME_POINTER=y 156CONFIG_FRAME_POINTER=y
158# CONFIG_RCU_CPU_STALL_DETECTOR is not set
159CONFIG_CRYPTO_PCBC=m
diff --git a/arch/avr32/configs/atngw100_evklcd100_defconfig b/arch/avr32/configs/atngw100_evklcd100_defconfig
index 7ceda354597b..7eece0af34c9 100644
--- a/arch/avr32/configs/atngw100_evklcd100_defconfig
+++ b/arch/avr32/configs/atngw100_evklcd100_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
8CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
9CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
10# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
11# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 11CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 14CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y
19# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
20# CONFIG_IOSCHED_DEADLINE is not set 17# CONFIG_IOSCHED_DEADLINE is not set
21CONFIG_NO_HZ=y 18CONFIG_NO_HZ=y
@@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y
31CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 28CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
32CONFIG_CPU_FREQ_GOV_USERSPACE=y 29CONFIG_CPU_FREQ_GOV_USERSPACE=y
33CONFIG_CPU_FREQ_AT32AP=y 30CONFIG_CPU_FREQ_AT32AP=y
31CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
34CONFIG_NET=y 32CONFIG_NET=y
35CONFIG_PACKET=y 33CONFIG_PACKET=y
36CONFIG_UNIX=y 34CONFIG_UNIX=y
@@ -74,8 +72,10 @@ CONFIG_MTD_UBI=y
74CONFIG_BLK_DEV_LOOP=m 72CONFIG_BLK_DEV_LOOP=m
75CONFIG_BLK_DEV_NBD=m 73CONFIG_BLK_DEV_NBD=m
76CONFIG_BLK_DEV_RAM=m 74CONFIG_BLK_DEV_RAM=m
75CONFIG_MISC_DEVICES=y
77CONFIG_ATMEL_TCLIB=y 76CONFIG_ATMEL_TCLIB=y
78CONFIG_NETDEVICES=y 77CONFIG_NETDEVICES=y
78CONFIG_TUN=m
79CONFIG_NET_ETHERNET=y 79CONFIG_NET_ETHERNET=y
80CONFIG_MACB=y 80CONFIG_MACB=y
81# CONFIG_NETDEV_1000 is not set 81# CONFIG_NETDEV_1000 is not set
@@ -104,6 +104,7 @@ CONFIG_I2C_GPIO=m
104CONFIG_SPI=y 104CONFIG_SPI=y
105CONFIG_SPI_ATMEL=y 105CONFIG_SPI_ATMEL=y
106CONFIG_SPI_SPIDEV=m 106CONFIG_SPI_SPIDEV=m
107CONFIG_GPIO_SYSFS=y
107# CONFIG_HWMON is not set 108# CONFIG_HWMON is not set
108CONFIG_WATCHDOG=y 109CONFIG_WATCHDOG=y
109CONFIG_AT32AP700X_WDT=y 110CONFIG_AT32AP700X_WDT=y
@@ -127,6 +128,7 @@ CONFIG_USB_FILE_STORAGE=m
127CONFIG_USB_G_SERIAL=m 128CONFIG_USB_G_SERIAL=m
128CONFIG_USB_CDC_COMPOSITE=m 129CONFIG_USB_CDC_COMPOSITE=m
129CONFIG_MMC=y 130CONFIG_MMC=y
131CONFIG_MMC_TEST=m
130CONFIG_MMC_ATMELMCI=y 132CONFIG_MMC_ATMELMCI=y
131CONFIG_NEW_LEDS=y 133CONFIG_NEW_LEDS=y
132CONFIG_LEDS_CLASS=y 134CONFIG_LEDS_CLASS=y
@@ -141,11 +143,14 @@ CONFIG_EXT2_FS=y
141CONFIG_EXT3_FS=y 143CONFIG_EXT3_FS=y
142# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 144# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
143# CONFIG_EXT3_FS_XATTR is not set 145# CONFIG_EXT3_FS_XATTR is not set
146CONFIG_EXT4_FS=y
147# CONFIG_EXT4_FS_XATTR is not set
144# CONFIG_DNOTIFY is not set 148# CONFIG_DNOTIFY is not set
145CONFIG_FUSE_FS=m 149CONFIG_FUSE_FS=m
146CONFIG_MSDOS_FS=m 150CONFIG_MSDOS_FS=m
147CONFIG_VFAT_FS=m 151CONFIG_VFAT_FS=m
148CONFIG_FAT_DEFAULT_CODEPAGE=850 152CONFIG_FAT_DEFAULT_CODEPAGE=850
153CONFIG_PROC_KCORE=y
149CONFIG_TMPFS=y 154CONFIG_TMPFS=y
150CONFIG_CONFIGFS_FS=y 155CONFIG_CONFIGFS_FS=y
151CONFIG_JFFS2_FS=y 156CONFIG_JFFS2_FS=y
@@ -155,7 +160,6 @@ CONFIG_NFS_V3=y
155CONFIG_ROOT_NFS=y 160CONFIG_ROOT_NFS=y
156CONFIG_NFSD=m 161CONFIG_NFSD=m
157CONFIG_NFSD_V3=y 162CONFIG_NFSD_V3=y
158CONFIG_SMB_FS=m
159CONFIG_CIFS=m 163CONFIG_CIFS=m
160CONFIG_NLS_CODEPAGE_437=m 164CONFIG_NLS_CODEPAGE_437=m
161CONFIG_NLS_CODEPAGE_850=m 165CONFIG_NLS_CODEPAGE_850=m
@@ -166,4 +170,3 @@ CONFIG_DEBUG_FS=y
166CONFIG_DEBUG_KERNEL=y 170CONFIG_DEBUG_KERNEL=y
167CONFIG_DETECT_HUNG_TASK=y 171CONFIG_DETECT_HUNG_TASK=y
168CONFIG_FRAME_POINTER=y 172CONFIG_FRAME_POINTER=y
169# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/avr32/configs/atngw100_evklcd101_defconfig b/arch/avr32/configs/atngw100_evklcd101_defconfig
index 7bc5b2ce68d5..387eb9d6e423 100644
--- a/arch/avr32/configs/atngw100_evklcd101_defconfig
+++ b/arch/avr32/configs/atngw100_evklcd101_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
8CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
9CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
10# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
11# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 11CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 14CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y
19# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
20# CONFIG_IOSCHED_DEADLINE is not set 17# CONFIG_IOSCHED_DEADLINE is not set
21CONFIG_NO_HZ=y 18CONFIG_NO_HZ=y
@@ -30,6 +27,7 @@ CONFIG_CPU_FREQ=y
30CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 27CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
31CONFIG_CPU_FREQ_GOV_USERSPACE=y 28CONFIG_CPU_FREQ_GOV_USERSPACE=y
32CONFIG_CPU_FREQ_AT32AP=y 29CONFIG_CPU_FREQ_AT32AP=y
30CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
33CONFIG_NET=y 31CONFIG_NET=y
34CONFIG_PACKET=y 32CONFIG_PACKET=y
35CONFIG_UNIX=y 33CONFIG_UNIX=y
@@ -73,8 +71,10 @@ CONFIG_MTD_UBI=y
73CONFIG_BLK_DEV_LOOP=m 71CONFIG_BLK_DEV_LOOP=m
74CONFIG_BLK_DEV_NBD=m 72CONFIG_BLK_DEV_NBD=m
75CONFIG_BLK_DEV_RAM=m 73CONFIG_BLK_DEV_RAM=m
74CONFIG_MISC_DEVICES=y
76CONFIG_ATMEL_TCLIB=y 75CONFIG_ATMEL_TCLIB=y
77CONFIG_NETDEVICES=y 76CONFIG_NETDEVICES=y
77CONFIG_TUN=m
78CONFIG_NET_ETHERNET=y 78CONFIG_NET_ETHERNET=y
79CONFIG_MACB=y 79CONFIG_MACB=y
80# CONFIG_NETDEV_1000 is not set 80# CONFIG_NETDEV_1000 is not set
@@ -103,6 +103,7 @@ CONFIG_I2C_GPIO=m
103CONFIG_SPI=y 103CONFIG_SPI=y
104CONFIG_SPI_ATMEL=y 104CONFIG_SPI_ATMEL=y
105CONFIG_SPI_SPIDEV=m 105CONFIG_SPI_SPIDEV=m
106CONFIG_GPIO_SYSFS=y
106# CONFIG_HWMON is not set 107# CONFIG_HWMON is not set
107CONFIG_WATCHDOG=y 108CONFIG_WATCHDOG=y
108CONFIG_AT32AP700X_WDT=y 109CONFIG_AT32AP700X_WDT=y
@@ -126,6 +127,7 @@ CONFIG_USB_FILE_STORAGE=m
126CONFIG_USB_G_SERIAL=m 127CONFIG_USB_G_SERIAL=m
127CONFIG_USB_CDC_COMPOSITE=m 128CONFIG_USB_CDC_COMPOSITE=m
128CONFIG_MMC=y 129CONFIG_MMC=y
130CONFIG_MMC_TEST=m
129CONFIG_MMC_ATMELMCI=y 131CONFIG_MMC_ATMELMCI=y
130CONFIG_NEW_LEDS=y 132CONFIG_NEW_LEDS=y
131CONFIG_LEDS_CLASS=y 133CONFIG_LEDS_CLASS=y
@@ -140,11 +142,14 @@ CONFIG_EXT2_FS=y
140CONFIG_EXT3_FS=y 142CONFIG_EXT3_FS=y
141# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 143# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
142# CONFIG_EXT3_FS_XATTR is not set 144# CONFIG_EXT3_FS_XATTR is not set
145CONFIG_EXT4_FS=y
146# CONFIG_EXT4_FS_XATTR is not set
143# CONFIG_DNOTIFY is not set 147# CONFIG_DNOTIFY is not set
144CONFIG_FUSE_FS=m 148CONFIG_FUSE_FS=m
145CONFIG_MSDOS_FS=m 149CONFIG_MSDOS_FS=m
146CONFIG_VFAT_FS=m 150CONFIG_VFAT_FS=m
147CONFIG_FAT_DEFAULT_CODEPAGE=850 151CONFIG_FAT_DEFAULT_CODEPAGE=850
152CONFIG_PROC_KCORE=y
148CONFIG_TMPFS=y 153CONFIG_TMPFS=y
149CONFIG_CONFIGFS_FS=y 154CONFIG_CONFIGFS_FS=y
150CONFIG_JFFS2_FS=y 155CONFIG_JFFS2_FS=y
@@ -154,7 +159,6 @@ CONFIG_NFS_V3=y
154CONFIG_ROOT_NFS=y 159CONFIG_ROOT_NFS=y
155CONFIG_NFSD=m 160CONFIG_NFSD=m
156CONFIG_NFSD_V3=y 161CONFIG_NFSD_V3=y
157CONFIG_SMB_FS=m
158CONFIG_CIFS=m 162CONFIG_CIFS=m
159CONFIG_NLS_CODEPAGE_437=m 163CONFIG_NLS_CODEPAGE_437=m
160CONFIG_NLS_CODEPAGE_850=m 164CONFIG_NLS_CODEPAGE_850=m
@@ -165,4 +169,3 @@ CONFIG_DEBUG_FS=y
165CONFIG_DEBUG_KERNEL=y 169CONFIG_DEBUG_KERNEL=y
166CONFIG_DETECT_HUNG_TASK=y 170CONFIG_DETECT_HUNG_TASK=y
167CONFIG_FRAME_POINTER=y 171CONFIG_FRAME_POINTER=y
168# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/avr32/configs/atngw100mkii_defconfig b/arch/avr32/configs/atngw100mkii_defconfig
index 4bd36821d4a2..f0fe237133a9 100644
--- a/arch/avr32/configs/atngw100mkii_defconfig
+++ b/arch/avr32/configs/atngw100mkii_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
8CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
9CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
10# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
11# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 11CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 14CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y
19# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
20# CONFIG_IOSCHED_DEADLINE is not set 17# CONFIG_IOSCHED_DEADLINE is not set
21CONFIG_NO_HZ=y 18CONFIG_NO_HZ=y
@@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y
29CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 26CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
30CONFIG_CPU_FREQ_GOV_USERSPACE=y 27CONFIG_CPU_FREQ_GOV_USERSPACE=y
31CONFIG_CPU_FREQ_AT32AP=y 28CONFIG_CPU_FREQ_AT32AP=y
29CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
32CONFIG_NET=y 30CONFIG_NET=y
33CONFIG_PACKET=y 31CONFIG_PACKET=y
34CONFIG_UNIX=y 32CONFIG_UNIX=y
@@ -74,6 +72,7 @@ CONFIG_MTD_UBI=y
74CONFIG_BLK_DEV_LOOP=m 72CONFIG_BLK_DEV_LOOP=m
75CONFIG_BLK_DEV_NBD=m 73CONFIG_BLK_DEV_NBD=m
76CONFIG_BLK_DEV_RAM=m 74CONFIG_BLK_DEV_RAM=m
75CONFIG_MISC_DEVICES=y
77CONFIG_ATMEL_TCLIB=y 76CONFIG_ATMEL_TCLIB=y
78CONFIG_NETDEVICES=y 77CONFIG_NETDEVICES=y
79CONFIG_TUN=m 78CONFIG_TUN=m
@@ -107,6 +106,7 @@ CONFIG_GPIO_SYSFS=y
107CONFIG_WATCHDOG=y 106CONFIG_WATCHDOG=y
108CONFIG_AT32AP700X_WDT=y 107CONFIG_AT32AP700X_WDT=y
109CONFIG_USB_GADGET=y 108CONFIG_USB_GADGET=y
109CONFIG_USB_GADGET_VBUS_DRAW=350
110CONFIG_USB_ZERO=m 110CONFIG_USB_ZERO=m
111CONFIG_USB_ETH=m 111CONFIG_USB_ETH=m
112CONFIG_USB_GADGETFS=m 112CONFIG_USB_GADGETFS=m
@@ -116,14 +116,12 @@ CONFIG_USB_CDC_COMPOSITE=m
116CONFIG_MMC=y 116CONFIG_MMC=y
117CONFIG_MMC_TEST=m 117CONFIG_MMC_TEST=m
118CONFIG_MMC_ATMELMCI=y 118CONFIG_MMC_ATMELMCI=y
119CONFIG_MMC_SPI=m
120CONFIG_NEW_LEDS=y 119CONFIG_NEW_LEDS=y
121CONFIG_LEDS_CLASS=y 120CONFIG_LEDS_CLASS=y
122CONFIG_LEDS_GPIO=y 121CONFIG_LEDS_GPIO=y
123CONFIG_LEDS_TRIGGERS=y 122CONFIG_LEDS_TRIGGERS=y
124CONFIG_LEDS_TRIGGER_TIMER=y 123CONFIG_LEDS_TRIGGER_TIMER=y
125CONFIG_LEDS_TRIGGER_HEARTBEAT=y 124CONFIG_LEDS_TRIGGER_HEARTBEAT=y
126CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
127CONFIG_RTC_CLASS=y 125CONFIG_RTC_CLASS=y
128CONFIG_RTC_DRV_AT32AP700X=y 126CONFIG_RTC_DRV_AT32AP700X=y
129CONFIG_DMADEVICES=y 127CONFIG_DMADEVICES=y
@@ -131,21 +129,23 @@ CONFIG_EXT2_FS=y
131CONFIG_EXT3_FS=y 129CONFIG_EXT3_FS=y
132# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 130# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
133# CONFIG_EXT3_FS_XATTR is not set 131# CONFIG_EXT3_FS_XATTR is not set
132CONFIG_EXT4_FS=y
133# CONFIG_EXT4_FS_XATTR is not set
134# CONFIG_DNOTIFY is not set 134# CONFIG_DNOTIFY is not set
135CONFIG_FUSE_FS=m 135CONFIG_FUSE_FS=m
136CONFIG_MSDOS_FS=m 136CONFIG_MSDOS_FS=m
137CONFIG_VFAT_FS=m 137CONFIG_VFAT_FS=m
138CONFIG_FAT_DEFAULT_CODEPAGE=850 138CONFIG_FAT_DEFAULT_CODEPAGE=850
139CONFIG_PROC_KCORE=y
139CONFIG_TMPFS=y 140CONFIG_TMPFS=y
140CONFIG_CONFIGFS_FS=m 141CONFIG_CONFIGFS_FS=y
141CONFIG_JFFS2_FS=y 142CONFIG_JFFS2_FS=y
142CONFIG_UFS_FS=y 143CONFIG_UBIFS_FS=y
143CONFIG_NFS_FS=y 144CONFIG_NFS_FS=y
144CONFIG_NFS_V3=y 145CONFIG_NFS_V3=y
145CONFIG_ROOT_NFS=y 146CONFIG_ROOT_NFS=y
146CONFIG_NFSD=m 147CONFIG_NFSD=m
147CONFIG_NFSD_V3=y 148CONFIG_NFSD_V3=y
148CONFIG_SMB_FS=m
149CONFIG_CIFS=m 149CONFIG_CIFS=m
150CONFIG_NLS_CODEPAGE_437=m 150CONFIG_NLS_CODEPAGE_437=m
151CONFIG_NLS_CODEPAGE_850=m 151CONFIG_NLS_CODEPAGE_850=m
@@ -156,5 +156,3 @@ CONFIG_DEBUG_FS=y
156CONFIG_DEBUG_KERNEL=y 156CONFIG_DEBUG_KERNEL=y
157CONFIG_DETECT_HUNG_TASK=y 157CONFIG_DETECT_HUNG_TASK=y
158CONFIG_FRAME_POINTER=y 158CONFIG_FRAME_POINTER=y
159# CONFIG_RCU_CPU_STALL_DETECTOR is not set
160CONFIG_CRYPTO_PCBC=m
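The defconfig hunks above and below mostly move options between built-in (=y), module (=m) and unset states. As a purely illustrative C fragment (not part of this patch): kernel code of this era tests a tristate option such as the CONFIG_MMC_TEST=m added above with the preprocessor, since =m defines CONFIG_FOO_MODULE rather than CONFIG_FOO.

/* Illustrative only: CONFIG_MMC_TEST=m defines CONFIG_MMC_TEST_MODULE,
 * while CONFIG_MMC_TEST=y defines CONFIG_MMC_TEST itself. */
#if defined(CONFIG_MMC_TEST) || defined(CONFIG_MMC_TEST_MODULE)
static const int mmc_test_configured = 1;
#else
static const int mmc_test_configured = 0;
#endif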
diff --git a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig
index f8437ef3237f..e4a7c1dc8380 100644
--- a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig
+++ b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
8CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
9CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
10# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
11# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 11CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 14CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y
19# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
20# CONFIG_IOSCHED_DEADLINE is not set 17# CONFIG_IOSCHED_DEADLINE is not set
21CONFIG_NO_HZ=y 18CONFIG_NO_HZ=y
@@ -32,6 +29,7 @@ CONFIG_CPU_FREQ=y
32CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 29CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
33CONFIG_CPU_FREQ_GOV_USERSPACE=y 30CONFIG_CPU_FREQ_GOV_USERSPACE=y
34CONFIG_CPU_FREQ_AT32AP=y 31CONFIG_CPU_FREQ_AT32AP=y
32CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
35CONFIG_NET=y 33CONFIG_NET=y
36CONFIG_PACKET=y 34CONFIG_PACKET=y
37CONFIG_UNIX=y 35CONFIG_UNIX=y
@@ -77,8 +75,10 @@ CONFIG_MTD_UBI=y
77CONFIG_BLK_DEV_LOOP=m 75CONFIG_BLK_DEV_LOOP=m
78CONFIG_BLK_DEV_NBD=m 76CONFIG_BLK_DEV_NBD=m
79CONFIG_BLK_DEV_RAM=m 77CONFIG_BLK_DEV_RAM=m
78CONFIG_MISC_DEVICES=y
80CONFIG_ATMEL_TCLIB=y 79CONFIG_ATMEL_TCLIB=y
81CONFIG_NETDEVICES=y 80CONFIG_NETDEVICES=y
81CONFIG_TUN=m
82CONFIG_NET_ETHERNET=y 82CONFIG_NET_ETHERNET=y
83CONFIG_MACB=y 83CONFIG_MACB=y
84# CONFIG_NETDEV_1000 is not set 84# CONFIG_NETDEV_1000 is not set
@@ -107,6 +107,7 @@ CONFIG_I2C_GPIO=m
107CONFIG_SPI=y 107CONFIG_SPI=y
108CONFIG_SPI_ATMEL=y 108CONFIG_SPI_ATMEL=y
109CONFIG_SPI_SPIDEV=m 109CONFIG_SPI_SPIDEV=m
110CONFIG_GPIO_SYSFS=y
110# CONFIG_HWMON is not set 111# CONFIG_HWMON is not set
111CONFIG_WATCHDOG=y 112CONFIG_WATCHDOG=y
112CONFIG_AT32AP700X_WDT=y 113CONFIG_AT32AP700X_WDT=y
@@ -130,6 +131,7 @@ CONFIG_USB_FILE_STORAGE=m
130CONFIG_USB_G_SERIAL=m 131CONFIG_USB_G_SERIAL=m
131CONFIG_USB_CDC_COMPOSITE=m 132CONFIG_USB_CDC_COMPOSITE=m
132CONFIG_MMC=y 133CONFIG_MMC=y
134CONFIG_MMC_TEST=m
133CONFIG_MMC_ATMELMCI=y 135CONFIG_MMC_ATMELMCI=y
134CONFIG_NEW_LEDS=y 136CONFIG_NEW_LEDS=y
135CONFIG_LEDS_CLASS=y 137CONFIG_LEDS_CLASS=y
@@ -144,11 +146,14 @@ CONFIG_EXT2_FS=y
144CONFIG_EXT3_FS=y 146CONFIG_EXT3_FS=y
145# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 147# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
146# CONFIG_EXT3_FS_XATTR is not set 148# CONFIG_EXT3_FS_XATTR is not set
149CONFIG_EXT4_FS=y
150# CONFIG_EXT4_FS_XATTR is not set
147# CONFIG_DNOTIFY is not set 151# CONFIG_DNOTIFY is not set
148CONFIG_FUSE_FS=m 152CONFIG_FUSE_FS=m
149CONFIG_MSDOS_FS=m 153CONFIG_MSDOS_FS=m
150CONFIG_VFAT_FS=m 154CONFIG_VFAT_FS=m
151CONFIG_FAT_DEFAULT_CODEPAGE=850 155CONFIG_FAT_DEFAULT_CODEPAGE=850
156CONFIG_PROC_KCORE=y
152CONFIG_TMPFS=y 157CONFIG_TMPFS=y
153CONFIG_CONFIGFS_FS=y 158CONFIG_CONFIGFS_FS=y
154CONFIG_JFFS2_FS=y 159CONFIG_JFFS2_FS=y
@@ -158,7 +163,6 @@ CONFIG_NFS_V3=y
158CONFIG_ROOT_NFS=y 163CONFIG_ROOT_NFS=y
159CONFIG_NFSD=m 164CONFIG_NFSD=m
160CONFIG_NFSD_V3=y 165CONFIG_NFSD_V3=y
161CONFIG_SMB_FS=m
162CONFIG_CIFS=m 166CONFIG_CIFS=m
163CONFIG_NLS_CODEPAGE_437=m 167CONFIG_NLS_CODEPAGE_437=m
164CONFIG_NLS_CODEPAGE_850=m 168CONFIG_NLS_CODEPAGE_850=m
@@ -169,4 +173,3 @@ CONFIG_DEBUG_FS=y
169CONFIG_DEBUG_KERNEL=y 173CONFIG_DEBUG_KERNEL=y
170CONFIG_DETECT_HUNG_TASK=y 174CONFIG_DETECT_HUNG_TASK=y
171CONFIG_FRAME_POINTER=y 175CONFIG_FRAME_POINTER=y
172# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig
index 7f58f996d945..6f37f70c2c37 100644
--- a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig
+++ b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
8CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
9CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
10# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
11# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 11CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 14CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y
19# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
20# CONFIG_IOSCHED_DEADLINE is not set 17# CONFIG_IOSCHED_DEADLINE is not set
21CONFIG_NO_HZ=y 18CONFIG_NO_HZ=y
@@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y
31CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 28CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
32CONFIG_CPU_FREQ_GOV_USERSPACE=y 29CONFIG_CPU_FREQ_GOV_USERSPACE=y
33CONFIG_CPU_FREQ_AT32AP=y 30CONFIG_CPU_FREQ_AT32AP=y
31CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
34CONFIG_NET=y 32CONFIG_NET=y
35CONFIG_PACKET=y 33CONFIG_PACKET=y
36CONFIG_UNIX=y 34CONFIG_UNIX=y
@@ -76,8 +74,10 @@ CONFIG_MTD_UBI=y
76CONFIG_BLK_DEV_LOOP=m 74CONFIG_BLK_DEV_LOOP=m
77CONFIG_BLK_DEV_NBD=m 75CONFIG_BLK_DEV_NBD=m
78CONFIG_BLK_DEV_RAM=m 76CONFIG_BLK_DEV_RAM=m
77CONFIG_MISC_DEVICES=y
79CONFIG_ATMEL_TCLIB=y 78CONFIG_ATMEL_TCLIB=y
80CONFIG_NETDEVICES=y 79CONFIG_NETDEVICES=y
80CONFIG_TUN=m
81CONFIG_NET_ETHERNET=y 81CONFIG_NET_ETHERNET=y
82CONFIG_MACB=y 82CONFIG_MACB=y
83# CONFIG_NETDEV_1000 is not set 83# CONFIG_NETDEV_1000 is not set
@@ -106,6 +106,7 @@ CONFIG_I2C_GPIO=m
106CONFIG_SPI=y 106CONFIG_SPI=y
107CONFIG_SPI_ATMEL=y 107CONFIG_SPI_ATMEL=y
108CONFIG_SPI_SPIDEV=m 108CONFIG_SPI_SPIDEV=m
109CONFIG_GPIO_SYSFS=y
109# CONFIG_HWMON is not set 110# CONFIG_HWMON is not set
110CONFIG_WATCHDOG=y 111CONFIG_WATCHDOG=y
111CONFIG_AT32AP700X_WDT=y 112CONFIG_AT32AP700X_WDT=y
@@ -129,6 +130,7 @@ CONFIG_USB_FILE_STORAGE=m
129CONFIG_USB_G_SERIAL=m 130CONFIG_USB_G_SERIAL=m
130CONFIG_USB_CDC_COMPOSITE=m 131CONFIG_USB_CDC_COMPOSITE=m
131CONFIG_MMC=y 132CONFIG_MMC=y
133CONFIG_MMC_TEST=m
132CONFIG_MMC_ATMELMCI=y 134CONFIG_MMC_ATMELMCI=y
133CONFIG_NEW_LEDS=y 135CONFIG_NEW_LEDS=y
134CONFIG_LEDS_CLASS=y 136CONFIG_LEDS_CLASS=y
@@ -143,11 +145,14 @@ CONFIG_EXT2_FS=y
143CONFIG_EXT3_FS=y 145CONFIG_EXT3_FS=y
144# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 146# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
145# CONFIG_EXT3_FS_XATTR is not set 147# CONFIG_EXT3_FS_XATTR is not set
148CONFIG_EXT4_FS=y
149# CONFIG_EXT4_FS_XATTR is not set
146# CONFIG_DNOTIFY is not set 150# CONFIG_DNOTIFY is not set
147CONFIG_FUSE_FS=m 151CONFIG_FUSE_FS=m
148CONFIG_MSDOS_FS=m 152CONFIG_MSDOS_FS=m
149CONFIG_VFAT_FS=m 153CONFIG_VFAT_FS=m
150CONFIG_FAT_DEFAULT_CODEPAGE=850 154CONFIG_FAT_DEFAULT_CODEPAGE=850
155CONFIG_PROC_KCORE=y
151CONFIG_TMPFS=y 156CONFIG_TMPFS=y
152CONFIG_CONFIGFS_FS=y 157CONFIG_CONFIGFS_FS=y
153CONFIG_JFFS2_FS=y 158CONFIG_JFFS2_FS=y
@@ -157,7 +162,6 @@ CONFIG_NFS_V3=y
157CONFIG_ROOT_NFS=y 162CONFIG_ROOT_NFS=y
158CONFIG_NFSD=m 163CONFIG_NFSD=m
159CONFIG_NFSD_V3=y 164CONFIG_NFSD_V3=y
160CONFIG_SMB_FS=m
161CONFIG_CIFS=m 165CONFIG_CIFS=m
162CONFIG_NLS_CODEPAGE_437=m 166CONFIG_NLS_CODEPAGE_437=m
163CONFIG_NLS_CODEPAGE_850=m 167CONFIG_NLS_CODEPAGE_850=m
@@ -168,4 +172,3 @@ CONFIG_DEBUG_FS=y
168CONFIG_DEBUG_KERNEL=y 172CONFIG_DEBUG_KERNEL=y
169CONFIG_DETECT_HUNG_TASK=y 173CONFIG_DETECT_HUNG_TASK=y
170CONFIG_FRAME_POINTER=y 174CONFIG_FRAME_POINTER=y
171# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/avr32/configs/atstk1002_defconfig b/arch/avr32/configs/atstk1002_defconfig
index aec4c43a75da..4fb01f5ab42f 100644
--- a/arch/avr32/configs/atstk1002_defconfig
+++ b/arch/avr32/configs/atstk1002_defconfig
@@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
6CONFIG_SYSFS_DEPRECATED_V2=y
7CONFIG_RELAY=y 6CONFIG_RELAY=y
8CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
9# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
@@ -11,7 +10,7 @@ CONFIG_BLK_DEV_INITRD=y
11# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
12CONFIG_PROFILING=y 11CONFIG_PROFILING=y
13CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
14CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
15CONFIG_MODULES=y 14CONFIG_MODULES=y
16CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
17# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
@@ -26,6 +25,7 @@ CONFIG_CPU_FREQ=y
26CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 25CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
27CONFIG_CPU_FREQ_GOV_USERSPACE=y 26CONFIG_CPU_FREQ_GOV_USERSPACE=y
28CONFIG_CPU_FREQ_AT32AP=y 27CONFIG_CPU_FREQ_AT32AP=y
28CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
29CONFIG_NET=y 29CONFIG_NET=y
30CONFIG_PACKET=y 30CONFIG_PACKET=y
31CONFIG_UNIX=y 31CONFIG_UNIX=y
@@ -35,6 +35,7 @@ CONFIG_INET=y
35CONFIG_IP_PNP=y 35CONFIG_IP_PNP=y
36CONFIG_IP_PNP_DHCP=y 36CONFIG_IP_PNP_DHCP=y
37CONFIG_NET_IPIP=m 37CONFIG_NET_IPIP=m
38CONFIG_NET_IPGRE_DEMUX=m
38CONFIG_NET_IPGRE=m 39CONFIG_NET_IPGRE=m
39CONFIG_INET_AH=m 40CONFIG_INET_AH=m
40CONFIG_INET_ESP=m 41CONFIG_INET_ESP=m
@@ -58,16 +59,14 @@ CONFIG_MTD_BLOCK=y
58CONFIG_MTD_CFI=y 59CONFIG_MTD_CFI=y
59CONFIG_MTD_CFI_AMDSTD=y 60CONFIG_MTD_CFI_AMDSTD=y
60CONFIG_MTD_PHYSMAP=y 61CONFIG_MTD_PHYSMAP=y
61CONFIG_MTD_DATAFLASH=m
62CONFIG_MTD_M25P80=m
63CONFIG_MTD_UBI=y 62CONFIG_MTD_UBI=y
64CONFIG_BLK_DEV_LOOP=m 63CONFIG_BLK_DEV_LOOP=m
65CONFIG_BLK_DEV_NBD=m 64CONFIG_BLK_DEV_NBD=m
66CONFIG_BLK_DEV_RAM=m 65CONFIG_BLK_DEV_RAM=m
66CONFIG_MISC_DEVICES=y
67CONFIG_ATMEL_PWM=m 67CONFIG_ATMEL_PWM=m
68CONFIG_ATMEL_TCLIB=y 68CONFIG_ATMEL_TCLIB=y
69CONFIG_ATMEL_SSC=m 69CONFIG_ATMEL_SSC=m
70CONFIG_EEPROM_AT24=m
71# CONFIG_SCSI_PROC_FS is not set 70# CONFIG_SCSI_PROC_FS is not set
72CONFIG_BLK_DEV_SD=m 71CONFIG_BLK_DEV_SD=m
73CONFIG_BLK_DEV_SR=m 72CONFIG_BLK_DEV_SR=m
@@ -120,7 +119,6 @@ CONFIG_SND_MIXER_OSS=m
120CONFIG_SND_PCM_OSS=m 119CONFIG_SND_PCM_OSS=m
121# CONFIG_SND_SUPPORT_OLD_API is not set 120# CONFIG_SND_SUPPORT_OLD_API is not set
122# CONFIG_SND_VERBOSE_PROCFS is not set 121# CONFIG_SND_VERBOSE_PROCFS is not set
123# CONFIG_SND_DRIVERS is not set
124CONFIG_SND_AT73C213=m 122CONFIG_SND_AT73C213=m
125# CONFIG_HID_SUPPORT is not set 123# CONFIG_HID_SUPPORT is not set
126CONFIG_USB_GADGET=y 124CONFIG_USB_GADGET=y
@@ -131,16 +129,15 @@ CONFIG_USB_FILE_STORAGE=m
131CONFIG_USB_G_SERIAL=m 129CONFIG_USB_G_SERIAL=m
132CONFIG_USB_CDC_COMPOSITE=m 130CONFIG_USB_CDC_COMPOSITE=m
133CONFIG_MMC=y 131CONFIG_MMC=y
132CONFIG_MMC_TEST=m
134CONFIG_MMC_ATMELMCI=y 133CONFIG_MMC_ATMELMCI=y
135CONFIG_MMC_SPI=m
136CONFIG_NEW_LEDS=y 134CONFIG_NEW_LEDS=y
137CONFIG_LEDS_CLASS=m 135CONFIG_LEDS_CLASS=y
138CONFIG_LEDS_ATMEL_PWM=m 136CONFIG_LEDS_ATMEL_PWM=m
139CONFIG_LEDS_GPIO=m 137CONFIG_LEDS_GPIO=m
140CONFIG_LEDS_TRIGGERS=y 138CONFIG_LEDS_TRIGGERS=y
141CONFIG_LEDS_TRIGGER_TIMER=m 139CONFIG_LEDS_TRIGGER_TIMER=m
142CONFIG_LEDS_TRIGGER_HEARTBEAT=m 140CONFIG_LEDS_TRIGGER_HEARTBEAT=m
143CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
144CONFIG_RTC_CLASS=y 141CONFIG_RTC_CLASS=y
145CONFIG_RTC_DRV_AT32AP700X=y 142CONFIG_RTC_DRV_AT32AP700X=y
146CONFIG_DMADEVICES=y 143CONFIG_DMADEVICES=y
@@ -149,20 +146,23 @@ CONFIG_EXT3_FS=y
149# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set 146# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
150# CONFIG_EXT3_FS_XATTR is not set 147# CONFIG_EXT3_FS_XATTR is not set
151CONFIG_EXT4_FS=y 148CONFIG_EXT4_FS=y
149# CONFIG_EXT4_FS_XATTR is not set
152# CONFIG_DNOTIFY is not set 150# CONFIG_DNOTIFY is not set
153CONFIG_FUSE_FS=m 151CONFIG_FUSE_FS=m
154CONFIG_MSDOS_FS=m 152CONFIG_MSDOS_FS=m
155CONFIG_VFAT_FS=m 153CONFIG_VFAT_FS=m
154CONFIG_FAT_DEFAULT_CODEPAGE=850
156CONFIG_PROC_KCORE=y 155CONFIG_PROC_KCORE=y
157CONFIG_TMPFS=y 156CONFIG_TMPFS=y
157CONFIG_CONFIGFS_FS=y
158CONFIG_JFFS2_FS=y 158CONFIG_JFFS2_FS=y
159# CONFIG_JFFS2_FS_WRITEBUFFER is not set
160CONFIG_UBIFS_FS=y 159CONFIG_UBIFS_FS=y
161CONFIG_MINIX_FS=m
162CONFIG_NFS_FS=y 160CONFIG_NFS_FS=y
163CONFIG_NFS_V3=y 161CONFIG_NFS_V3=y
164CONFIG_ROOT_NFS=y 162CONFIG_ROOT_NFS=y
163CONFIG_CIFS=m
165CONFIG_NLS_CODEPAGE_437=m 164CONFIG_NLS_CODEPAGE_437=m
165CONFIG_NLS_CODEPAGE_850=m
166CONFIG_NLS_ISO8859_1=m 166CONFIG_NLS_ISO8859_1=m
167CONFIG_NLS_UTF8=m 167CONFIG_NLS_UTF8=m
168CONFIG_MAGIC_SYSRQ=y 168CONFIG_MAGIC_SYSRQ=y
@@ -170,6 +170,3 @@ CONFIG_DEBUG_FS=y
170CONFIG_DEBUG_KERNEL=y 170CONFIG_DEBUG_KERNEL=y
171CONFIG_DETECT_HUNG_TASK=y 171CONFIG_DETECT_HUNG_TASK=y
172CONFIG_FRAME_POINTER=y 172CONFIG_FRAME_POINTER=y
173# CONFIG_RCU_CPU_STALL_DETECTOR is not set
174# CONFIG_CRYPTO_HW is not set
175CONFIG_CRC_T10DIF=m
diff --git a/arch/avr32/configs/atstk1003_defconfig b/arch/avr32/configs/atstk1003_defconfig
index 50ba3db682ca..9faaf9b900f2 100644
--- a/arch/avr32/configs/atstk1003_defconfig
+++ b/arch/avr32/configs/atstk1003_defconfig
@@ -2,22 +2,15 @@ CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_BSD_PROCESS_ACCT=y
6CONFIG_BSD_PROCESS_ACCT_V3=y
7CONFIG_TASKSTATS=y
8CONFIG_TASK_DELAY_ACCT=y
9CONFIG_AUDIT=y
10CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
11CONFIG_SYSFS_DEPRECATED_V2=y
12CONFIG_RELAY=y 6CONFIG_RELAY=y
13CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
14# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
15# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
16# CONFIG_SLUB_DEBUG is not set
17# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
18CONFIG_PROFILING=y 11CONFIG_PROFILING=y
19CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
20CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
21CONFIG_MODULES=y 14CONFIG_MODULES=y
22CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
23# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
@@ -33,6 +26,7 @@ CONFIG_CPU_FREQ=y
33CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 26CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
34CONFIG_CPU_FREQ_GOV_USERSPACE=y 27CONFIG_CPU_FREQ_GOV_USERSPACE=y
35CONFIG_CPU_FREQ_AT32AP=y 28CONFIG_CPU_FREQ_AT32AP=y
29CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
36CONFIG_NET=y 30CONFIG_NET=y
37CONFIG_PACKET=y 31CONFIG_PACKET=y
38CONFIG_UNIX=y 32CONFIG_UNIX=y
@@ -54,18 +48,18 @@ CONFIG_MTD_BLOCK=y
54CONFIG_MTD_CFI=y 48CONFIG_MTD_CFI=y
55CONFIG_MTD_CFI_AMDSTD=y 49CONFIG_MTD_CFI_AMDSTD=y
56CONFIG_MTD_PHYSMAP=y 50CONFIG_MTD_PHYSMAP=y
57CONFIG_MTD_DATAFLASH=m 51CONFIG_MTD_UBI=y
58CONFIG_MTD_M25P80=m
59CONFIG_BLK_DEV_LOOP=m 52CONFIG_BLK_DEV_LOOP=m
60CONFIG_BLK_DEV_NBD=m 53CONFIG_BLK_DEV_NBD=m
61CONFIG_BLK_DEV_RAM=m 54CONFIG_BLK_DEV_RAM=m
55CONFIG_MISC_DEVICES=y
62CONFIG_ATMEL_PWM=m 56CONFIG_ATMEL_PWM=m
63CONFIG_ATMEL_TCLIB=y 57CONFIG_ATMEL_TCLIB=y
64CONFIG_ATMEL_SSC=m 58CONFIG_ATMEL_SSC=m
65CONFIG_EEPROM_AT24=m
66# CONFIG_SCSI_PROC_FS is not set 59# CONFIG_SCSI_PROC_FS is not set
67CONFIG_BLK_DEV_SD=m 60CONFIG_BLK_DEV_SD=m
68CONFIG_BLK_DEV_SR=m 61CONFIG_BLK_DEV_SR=m
62# CONFIG_SCSI_LOWLEVEL is not set
69CONFIG_ATA=m 63CONFIG_ATA=m
70# CONFIG_SATA_PMP is not set 64# CONFIG_SATA_PMP is not set
71CONFIG_PATA_AT32=m 65CONFIG_PATA_AT32=m
@@ -77,6 +71,7 @@ CONFIG_PPP_ASYNC=m
77CONFIG_PPP_DEFLATE=m 71CONFIG_PPP_DEFLATE=m
78CONFIG_PPP_BSDCOMP=m 72CONFIG_PPP_BSDCOMP=m
79CONFIG_INPUT=m 73CONFIG_INPUT=m
74CONFIG_INPUT_EVDEV=m
80# CONFIG_KEYBOARD_ATKBD is not set 75# CONFIG_KEYBOARD_ATKBD is not set
81CONFIG_KEYBOARD_GPIO=m 76CONFIG_KEYBOARD_GPIO=m
82# CONFIG_MOUSE_PS2 is not set 77# CONFIG_MOUSE_PS2 is not set
@@ -106,7 +101,6 @@ CONFIG_SND_PCM_OSS=m
106CONFIG_SND_AT73C213=m 101CONFIG_SND_AT73C213=m
107# CONFIG_HID_SUPPORT is not set 102# CONFIG_HID_SUPPORT is not set
108CONFIG_USB_GADGET=y 103CONFIG_USB_GADGET=y
109CONFIG_USB_GADGET_DEBUG_FS=y
110CONFIG_USB_ZERO=m 104CONFIG_USB_ZERO=m
111CONFIG_USB_ETH=m 105CONFIG_USB_ETH=m
112CONFIG_USB_GADGETFS=m 106CONFIG_USB_GADGETFS=m
@@ -116,36 +110,39 @@ CONFIG_USB_CDC_COMPOSITE=m
116CONFIG_MMC=y 110CONFIG_MMC=y
117CONFIG_MMC_TEST=m 111CONFIG_MMC_TEST=m
118CONFIG_MMC_ATMELMCI=y 112CONFIG_MMC_ATMELMCI=y
119CONFIG_MMC_SPI=m
120CONFIG_NEW_LEDS=y 113CONFIG_NEW_LEDS=y
121CONFIG_LEDS_CLASS=y 114CONFIG_LEDS_CLASS=y
122CONFIG_LEDS_ATMEL_PWM=m 115CONFIG_LEDS_ATMEL_PWM=m
123CONFIG_LEDS_GPIO=y 116CONFIG_LEDS_GPIO=m
124CONFIG_LEDS_TRIGGERS=y 117CONFIG_LEDS_TRIGGERS=y
125CONFIG_LEDS_TRIGGER_TIMER=y 118CONFIG_LEDS_TRIGGER_TIMER=m
126CONFIG_LEDS_TRIGGER_HEARTBEAT=y 119CONFIG_LEDS_TRIGGER_HEARTBEAT=m
127CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
128CONFIG_RTC_CLASS=y 120CONFIG_RTC_CLASS=y
129CONFIG_RTC_DRV_AT32AP700X=y 121CONFIG_RTC_DRV_AT32AP700X=y
130CONFIG_DMADEVICES=y 122CONFIG_DMADEVICES=y
131CONFIG_DW_DMAC=y 123CONFIG_EXT2_FS=y
132CONFIG_EXT2_FS=m 124CONFIG_EXT3_FS=y
133CONFIG_EXT3_FS=m 125# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
134# CONFIG_EXT3_FS_XATTR is not set 126# CONFIG_EXT3_FS_XATTR is not set
127CONFIG_EXT4_FS=y
128# CONFIG_EXT4_FS_XATTR is not set
135# CONFIG_DNOTIFY is not set 129# CONFIG_DNOTIFY is not set
136CONFIG_FUSE_FS=m 130CONFIG_FUSE_FS=m
137CONFIG_MSDOS_FS=m 131CONFIG_MSDOS_FS=m
138CONFIG_VFAT_FS=m 132CONFIG_VFAT_FS=m
133CONFIG_FAT_DEFAULT_CODEPAGE=850
139CONFIG_PROC_KCORE=y 134CONFIG_PROC_KCORE=y
140CONFIG_TMPFS=y 135CONFIG_TMPFS=y
141CONFIG_CONFIGFS_FS=m 136CONFIG_CONFIGFS_FS=y
142CONFIG_JFFS2_FS=y 137CONFIG_JFFS2_FS=y
138CONFIG_UBIFS_FS=y
143# CONFIG_NETWORK_FILESYSTEMS is not set 139# CONFIG_NETWORK_FILESYSTEMS is not set
144CONFIG_NLS_CODEPAGE_437=m 140CONFIG_NLS_CODEPAGE_437=m
141CONFIG_NLS_CODEPAGE_850=m
145CONFIG_NLS_ISO8859_1=m 142CONFIG_NLS_ISO8859_1=m
146CONFIG_NLS_UTF8=m 143CONFIG_NLS_UTF8=m
147CONFIG_MAGIC_SYSRQ=y 144CONFIG_MAGIC_SYSRQ=y
148CONFIG_DEBUG_FS=y 145CONFIG_DEBUG_FS=y
149CONFIG_DEBUG_KERNEL=y 146CONFIG_DEBUG_KERNEL=y
147CONFIG_DETECT_HUNG_TASK=y
150CONFIG_FRAME_POINTER=y 148CONFIG_FRAME_POINTER=y
151CONFIG_CRC_T10DIF=m
diff --git a/arch/avr32/configs/atstk1004_defconfig b/arch/avr32/configs/atstk1004_defconfig
index 329e10ba3b54..3d2a5d85f970 100644
--- a/arch/avr32/configs/atstk1004_defconfig
+++ b/arch/avr32/configs/atstk1004_defconfig
@@ -1,19 +1,32 @@
1CONFIG_EXPERIMENTAL=y 1CONFIG_EXPERIMENTAL=y
2# CONFIG_LOCALVERSION_AUTO is not set 2# CONFIG_LOCALVERSION_AUTO is not set
3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y
3CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
4CONFIG_SYSFS_DEPRECATED_V2=y 6CONFIG_RELAY=y
7CONFIG_BLK_DEV_INITRD=y
5# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
6# CONFIG_BASE_FULL is not set 9# CONFIG_BASE_FULL is not set
7# CONFIG_FUTEX is not set
8# CONFIG_EPOLL is not set
9# CONFIG_SIGNALFD is not set
10# CONFIG_TIMERFD is not set
11# CONFIG_EVENTFD is not set
12# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
13CONFIG_SLOB=y 11CONFIG_PROFILING=y
14# CONFIG_BLOCK is not set 12CONFIG_OPROFILE=m
13# CONFIG_KPROBES is not set
14CONFIG_MODULES=y
15CONFIG_MODULE_UNLOAD=y
16# CONFIG_BLK_DEV_BSG is not set
17# CONFIG_IOSCHED_DEADLINE is not set
18CONFIG_NO_HZ=y
19CONFIG_HIGH_RES_TIMERS=y
15CONFIG_BOARD_ATSTK1004=y 20CONFIG_BOARD_ATSTK1004=y
16# CONFIG_OWNERSHIP_TRACE is not set 21# CONFIG_OWNERSHIP_TRACE is not set
22CONFIG_NMI_DEBUGGING=y
23CONFIG_PM=y
24CONFIG_CPU_FREQ=y
25# CONFIG_CPU_FREQ_STAT is not set
26CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
27CONFIG_CPU_FREQ_GOV_USERSPACE=y
28CONFIG_CPU_FREQ_AT32AP=y
29CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
17CONFIG_NET=y 30CONFIG_NET=y
18CONFIG_PACKET=y 31CONFIG_PACKET=y
19CONFIG_UNIX=y 32CONFIG_UNIX=y
@@ -31,40 +44,104 @@ CONFIG_MTD=y
31CONFIG_MTD_PARTITIONS=y 44CONFIG_MTD_PARTITIONS=y
32CONFIG_MTD_CMDLINE_PARTS=y 45CONFIG_MTD_CMDLINE_PARTS=y
33CONFIG_MTD_CHAR=y 46CONFIG_MTD_CHAR=y
47CONFIG_MTD_BLOCK=y
34CONFIG_MTD_CFI=y 48CONFIG_MTD_CFI=y
35CONFIG_MTD_CFI_AMDSTD=y 49CONFIG_MTD_CFI_AMDSTD=y
36CONFIG_MTD_PHYSMAP=y 50CONFIG_MTD_PHYSMAP=y
37# CONFIG_MISC_DEVICES is not set 51CONFIG_MTD_UBI=y
38# CONFIG_INPUT is not set 52CONFIG_BLK_DEV_LOOP=m
53CONFIG_BLK_DEV_NBD=m
54CONFIG_BLK_DEV_RAM=m
55CONFIG_MISC_DEVICES=y
56CONFIG_ATMEL_PWM=m
57CONFIG_ATMEL_TCLIB=y
58CONFIG_ATMEL_SSC=m
59# CONFIG_SCSI_PROC_FS is not set
60CONFIG_BLK_DEV_SD=m
61CONFIG_BLK_DEV_SR=m
62# CONFIG_SCSI_LOWLEVEL is not set
63CONFIG_ATA=m
64# CONFIG_SATA_PMP is not set
65CONFIG_PATA_AT32=m
66CONFIG_NETDEVICES=y
67# CONFIG_NETDEV_1000 is not set
68# CONFIG_NETDEV_10000 is not set
69CONFIG_PPP=m
70CONFIG_PPP_ASYNC=m
71CONFIG_PPP_DEFLATE=m
72CONFIG_PPP_BSDCOMP=m
73CONFIG_INPUT=m
74CONFIG_INPUT_EVDEV=m
75# CONFIG_KEYBOARD_ATKBD is not set
76CONFIG_KEYBOARD_GPIO=m
77# CONFIG_MOUSE_PS2 is not set
78CONFIG_MOUSE_GPIO=m
39# CONFIG_SERIO is not set 79# CONFIG_SERIO is not set
40# CONFIG_VT is not set 80# CONFIG_VT is not set
41# CONFIG_DEVKMEM is not set 81# CONFIG_DEVKMEM is not set
42CONFIG_SERIAL_ATMEL=y 82CONFIG_SERIAL_ATMEL=y
43CONFIG_SERIAL_ATMEL_CONSOLE=y 83CONFIG_SERIAL_ATMEL_CONSOLE=y
44# CONFIG_SERIAL_ATMEL_PDC is not set
45# CONFIG_LEGACY_PTYS is not set 84# CONFIG_LEGACY_PTYS is not set
46# CONFIG_HW_RANDOM is not set 85# CONFIG_HW_RANDOM is not set
86CONFIG_I2C=m
87CONFIG_I2C_CHARDEV=m
88CONFIG_I2C_GPIO=m
47CONFIG_SPI=y 89CONFIG_SPI=y
48CONFIG_SPI_ATMEL=y 90CONFIG_SPI_ATMEL=y
91CONFIG_SPI_SPIDEV=m
92CONFIG_GPIO_SYSFS=y
49# CONFIG_HWMON is not set 93# CONFIG_HWMON is not set
50CONFIG_WATCHDOG=y 94CONFIG_WATCHDOG=y
51CONFIG_AT32AP700X_WDT=y 95CONFIG_AT32AP700X_WDT=y
52CONFIG_FB=y 96CONFIG_FB=y
53CONFIG_FB_ATMEL=y 97CONFIG_FB_ATMEL=y
54CONFIG_BACKLIGHT_LCD_SUPPORT=y 98CONFIG_BACKLIGHT_LCD_SUPPORT=y
99CONFIG_LCD_CLASS_DEVICE=y
55CONFIG_LCD_LTV350QV=y 100CONFIG_LCD_LTV350QV=y
56# CONFIG_BACKLIGHT_CLASS_DEVICE is not set 101# CONFIG_BACKLIGHT_CLASS_DEVICE is not set
57CONFIG_USB_GADGET=y 102CONFIG_USB_GADGET=y
58CONFIG_USB_ETH=y 103CONFIG_USB_ZERO=m
59# CONFIG_USB_ETH_RNDIS is not set 104CONFIG_USB_ETH=m
105CONFIG_USB_GADGETFS=m
106CONFIG_USB_FILE_STORAGE=m
107CONFIG_USB_G_SERIAL=m
108CONFIG_USB_CDC_COMPOSITE=m
109CONFIG_MMC=y
110CONFIG_MMC_TEST=m
111CONFIG_MMC_ATMELMCI=y
112CONFIG_NEW_LEDS=y
113CONFIG_LEDS_CLASS=y
114CONFIG_LEDS_ATMEL_PWM=m
115CONFIG_LEDS_GPIO=m
116CONFIG_LEDS_TRIGGERS=y
117CONFIG_LEDS_TRIGGER_TIMER=m
118CONFIG_LEDS_TRIGGER_HEARTBEAT=m
60CONFIG_RTC_CLASS=y 119CONFIG_RTC_CLASS=y
61# CONFIG_RTC_INTF_PROC is not set
62CONFIG_RTC_DRV_AT32AP700X=y 120CONFIG_RTC_DRV_AT32AP700X=y
121CONFIG_DMADEVICES=y
122CONFIG_EXT2_FS=y
123CONFIG_EXT3_FS=y
124# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
125# CONFIG_EXT3_FS_XATTR is not set
126CONFIG_EXT4_FS=y
127# CONFIG_EXT4_FS_XATTR is not set
63# CONFIG_DNOTIFY is not set 128# CONFIG_DNOTIFY is not set
129CONFIG_FUSE_FS=m
130CONFIG_MSDOS_FS=m
131CONFIG_VFAT_FS=m
132CONFIG_FAT_DEFAULT_CODEPAGE=850
64CONFIG_PROC_KCORE=y 133CONFIG_PROC_KCORE=y
65# CONFIG_PROC_PAGE_MONITOR is not set
66CONFIG_TMPFS=y 134CONFIG_TMPFS=y
135CONFIG_CONFIGFS_FS=y
67CONFIG_JFFS2_FS=y 136CONFIG_JFFS2_FS=y
68# CONFIG_JFFS2_FS_WRITEBUFFER is not set 137CONFIG_UBIFS_FS=y
69# CONFIG_NETWORK_FILESYSTEMS is not set 138# CONFIG_NETWORK_FILESYSTEMS is not set
139CONFIG_NLS_CODEPAGE_437=m
140CONFIG_NLS_CODEPAGE_850=m
141CONFIG_NLS_ISO8859_1=m
142CONFIG_NLS_UTF8=m
70CONFIG_MAGIC_SYSRQ=y 143CONFIG_MAGIC_SYSRQ=y
144CONFIG_DEBUG_FS=y
145CONFIG_DEBUG_KERNEL=y
146CONFIG_DETECT_HUNG_TASK=y
147CONFIG_FRAME_POINTER=y
diff --git a/arch/avr32/configs/atstk1006_defconfig b/arch/avr32/configs/atstk1006_defconfig
index dbcc1b51e506..1ed8f22d4fe2 100644
--- a/arch/avr32/configs/atstk1006_defconfig
+++ b/arch/avr32/configs/atstk1006_defconfig
@@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y
3CONFIG_SYSVIPC=y 3CONFIG_SYSVIPC=y
4CONFIG_POSIX_MQUEUE=y 4CONFIG_POSIX_MQUEUE=y
5CONFIG_LOG_BUF_SHIFT=14 5CONFIG_LOG_BUF_SHIFT=14
6CONFIG_SYSFS_DEPRECATED_V2=y
7CONFIG_RELAY=y 6CONFIG_RELAY=y
8CONFIG_BLK_DEV_INITRD=y 7CONFIG_BLK_DEV_INITRD=y
9# CONFIG_SYSCTL_SYSCALL is not set 8# CONFIG_SYSCTL_SYSCALL is not set
@@ -11,7 +10,7 @@ CONFIG_BLK_DEV_INITRD=y
11# CONFIG_COMPAT_BRK is not set 10# CONFIG_COMPAT_BRK is not set
12CONFIG_PROFILING=y 11CONFIG_PROFILING=y
13CONFIG_OPROFILE=m 12CONFIG_OPROFILE=m
14CONFIG_KPROBES=y 13# CONFIG_KPROBES is not set
15CONFIG_MODULES=y 14CONFIG_MODULES=y
16CONFIG_MODULE_UNLOAD=y 15CONFIG_MODULE_UNLOAD=y
17# CONFIG_BLK_DEV_BSG is not set 16# CONFIG_BLK_DEV_BSG is not set
@@ -37,6 +36,7 @@ CONFIG_INET=y
37CONFIG_IP_PNP=y 36CONFIG_IP_PNP=y
38CONFIG_IP_PNP_DHCP=y 37CONFIG_IP_PNP_DHCP=y
39CONFIG_NET_IPIP=m 38CONFIG_NET_IPIP=m
39CONFIG_NET_IPGRE_DEMUX=m
40CONFIG_NET_IPGRE=m 40CONFIG_NET_IPGRE=m
41CONFIG_INET_AH=m 41CONFIG_INET_AH=m
42CONFIG_INET_ESP=m 42CONFIG_INET_ESP=m
@@ -60,15 +60,13 @@ CONFIG_MTD_BLOCK=y
60CONFIG_MTD_CFI=y 60CONFIG_MTD_CFI=y
61CONFIG_MTD_CFI_AMDSTD=y 61CONFIG_MTD_CFI_AMDSTD=y
62CONFIG_MTD_PHYSMAP=y 62CONFIG_MTD_PHYSMAP=y
63CONFIG_MTD_DATAFLASH=m
64CONFIG_MTD_DATAFLASH_OTP=y
65CONFIG_MTD_M25P80=m
66CONFIG_MTD_NAND=y 63CONFIG_MTD_NAND=y
67CONFIG_MTD_NAND_ATMEL=y 64CONFIG_MTD_NAND_ATMEL=y
68CONFIG_MTD_UBI=y 65CONFIG_MTD_UBI=y
69CONFIG_BLK_DEV_LOOP=m 66CONFIG_BLK_DEV_LOOP=m
70CONFIG_BLK_DEV_NBD=m 67CONFIG_BLK_DEV_NBD=m
71CONFIG_BLK_DEV_RAM=m 68CONFIG_BLK_DEV_RAM=m
69CONFIG_MISC_DEVICES=y
72CONFIG_ATMEL_PWM=m 70CONFIG_ATMEL_PWM=m
73CONFIG_ATMEL_TCLIB=y 71CONFIG_ATMEL_TCLIB=y
74CONFIG_ATMEL_SSC=m 72CONFIG_ATMEL_SSC=m
@@ -132,17 +130,17 @@ CONFIG_USB_ETH=m
132CONFIG_USB_GADGETFS=m 130CONFIG_USB_GADGETFS=m
133CONFIG_USB_FILE_STORAGE=m 131CONFIG_USB_FILE_STORAGE=m
134CONFIG_USB_G_SERIAL=m 132CONFIG_USB_G_SERIAL=m
133CONFIG_USB_CDC_COMPOSITE=m
135CONFIG_MMC=y 134CONFIG_MMC=y
135CONFIG_MMC_TEST=m
136CONFIG_MMC_ATMELMCI=y 136CONFIG_MMC_ATMELMCI=y
137CONFIG_MMC_SPI=m
138CONFIG_NEW_LEDS=y 137CONFIG_NEW_LEDS=y
139CONFIG_LEDS_CLASS=m 138CONFIG_LEDS_CLASS=y
140CONFIG_LEDS_ATMEL_PWM=m 139CONFIG_LEDS_ATMEL_PWM=m
141CONFIG_LEDS_GPIO=m 140CONFIG_LEDS_GPIO=m
142CONFIG_LEDS_TRIGGERS=y 141CONFIG_LEDS_TRIGGERS=y
143CONFIG_LEDS_TRIGGER_TIMER=m 142CONFIG_LEDS_TRIGGER_TIMER=m
144CONFIG_LEDS_TRIGGER_HEARTBEAT=m 143CONFIG_LEDS_TRIGGER_HEARTBEAT=m
145CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
146CONFIG_RTC_CLASS=y 144CONFIG_RTC_CLASS=y
147CONFIG_RTC_DRV_AT32AP700X=y 145CONFIG_RTC_DRV_AT32AP700X=y
148CONFIG_DMADEVICES=y 146CONFIG_DMADEVICES=y
@@ -156,15 +154,18 @@ CONFIG_EXT4_FS=y
156CONFIG_FUSE_FS=m 154CONFIG_FUSE_FS=m
157CONFIG_MSDOS_FS=m 155CONFIG_MSDOS_FS=m
158CONFIG_VFAT_FS=m 156CONFIG_VFAT_FS=m
157CONFIG_FAT_DEFAULT_CODEPAGE=850
159CONFIG_PROC_KCORE=y 158CONFIG_PROC_KCORE=y
160CONFIG_TMPFS=y 159CONFIG_TMPFS=y
160CONFIG_CONFIGFS_FS=y
161CONFIG_JFFS2_FS=y 161CONFIG_JFFS2_FS=y
162CONFIG_UBIFS_FS=y 162CONFIG_UBIFS_FS=y
163CONFIG_MINIX_FS=m
164CONFIG_NFS_FS=y 163CONFIG_NFS_FS=y
165CONFIG_NFS_V3=y 164CONFIG_NFS_V3=y
166CONFIG_ROOT_NFS=y 165CONFIG_ROOT_NFS=y
166CONFIG_CIFS=m
167CONFIG_NLS_CODEPAGE_437=m 167CONFIG_NLS_CODEPAGE_437=m
168CONFIG_NLS_CODEPAGE_850=m
168CONFIG_NLS_ISO8859_1=m 169CONFIG_NLS_ISO8859_1=m
169CONFIG_NLS_UTF8=m 170CONFIG_NLS_UTF8=m
170CONFIG_MAGIC_SYSRQ=y 171CONFIG_MAGIC_SYSRQ=y
@@ -172,7 +173,3 @@ CONFIG_DEBUG_FS=y
172CONFIG_DEBUG_KERNEL=y 173CONFIG_DEBUG_KERNEL=y
173CONFIG_DETECT_HUNG_TASK=y 174CONFIG_DETECT_HUNG_TASK=y
174CONFIG_FRAME_POINTER=y 175CONFIG_FRAME_POINTER=y
175# CONFIG_RCU_CPU_STALL_DETECTOR is not set
176CONFIG_CRYPTO_FIPS=y
177# CONFIG_CRYPTO_HW is not set
178CONFIG_CRC_T10DIF=m
diff --git a/arch/avr32/configs/favr-32_defconfig b/arch/avr32/configs/favr-32_defconfig
index 0c813b661a0a..aeadc955db32 100644
--- a/arch/avr32/configs/favr-32_defconfig
+++ b/arch/avr32/configs/favr-32_defconfig
@@ -11,7 +11,7 @@ CONFIG_BLK_DEV_INITRD=y
11# CONFIG_COMPAT_BRK is not set 11# CONFIG_COMPAT_BRK is not set
12CONFIG_PROFILING=y 12CONFIG_PROFILING=y
13CONFIG_OPROFILE=m 13CONFIG_OPROFILE=m
14CONFIG_KPROBES=y 14# CONFIG_KPROBES is not set
15CONFIG_MODULES=y 15CONFIG_MODULES=y
16CONFIG_MODULE_UNLOAD=y 16CONFIG_MODULE_UNLOAD=y
17# CONFIG_BLK_DEV_BSG is not set 17# CONFIG_BLK_DEV_BSG is not set
diff --git a/arch/avr32/configs/hammerhead_defconfig b/arch/avr32/configs/hammerhead_defconfig
index dcc01f0eb294..1692beeb7ed3 100644
--- a/arch/avr32/configs/hammerhead_defconfig
+++ b/arch/avr32/configs/hammerhead_defconfig
@@ -12,7 +12,7 @@ CONFIG_BLK_DEV_INITRD=y
12# CONFIG_COMPAT_BRK is not set 12# CONFIG_COMPAT_BRK is not set
13CONFIG_PROFILING=y 13CONFIG_PROFILING=y
14CONFIG_OPROFILE=m 14CONFIG_OPROFILE=m
15CONFIG_KPROBES=y 15# CONFIG_KPROBES is not set
16CONFIG_MODULES=y 16CONFIG_MODULES=y
17CONFIG_MODULE_UNLOAD=y 17CONFIG_MODULE_UNLOAD=y
18CONFIG_MODULE_FORCE_UNLOAD=y 18CONFIG_MODULE_FORCE_UNLOAD=y
diff --git a/arch/avr32/include/asm/syscalls.h b/arch/avr32/include/asm/syscalls.h
index ab608b70b24d..244f2acab546 100644
--- a/arch/avr32/include/asm/syscalls.h
+++ b/arch/avr32/include/asm/syscalls.h
@@ -15,20 +15,6 @@
15#include <linux/types.h> 15#include <linux/types.h>
16#include <linux/signal.h> 16#include <linux/signal.h>
17 17
18/* kernel/process.c */
19asmlinkage int sys_fork(struct pt_regs *);
20asmlinkage int sys_clone(unsigned long, unsigned long,
21 unsigned long, unsigned long,
22 struct pt_regs *);
23asmlinkage int sys_vfork(struct pt_regs *);
24asmlinkage int sys_execve(const char __user *, char __user *__user *,
25 char __user *__user *, struct pt_regs *);
26
27/* kernel/signal.c */
28asmlinkage int sys_sigaltstack(const stack_t __user *, stack_t __user *,
29 struct pt_regs *);
30asmlinkage int sys_rt_sigreturn(struct pt_regs *);
31
32/* mm/cache.c */ 18/* mm/cache.c */
33asmlinkage int sys_cacheflush(int, void __user *, size_t); 19asmlinkage int sys_cacheflush(int, void __user *, size_t);
34 20
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 9c46aaad11ce..ef5a2a08fcca 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -367,14 +367,13 @@ asmlinkage int sys_fork(struct pt_regs *regs)
367} 367}
368 368
369asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp, 369asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp,
370 unsigned long parent_tidptr, 370 void __user *parent_tidptr, void __user *child_tidptr,
371 unsigned long child_tidptr, struct pt_regs *regs) 371 struct pt_regs *regs)
372{ 372{
373 if (!newsp) 373 if (!newsp)
374 newsp = regs->sp; 374 newsp = regs->sp;
375 return do_fork(clone_flags, newsp, regs, 0, 375 return do_fork(clone_flags, newsp, regs, 0, parent_tidptr,
376 (int __user *)parent_tidptr, 376 child_tidptr);
377 (int __user *)child_tidptr);
378} 377}
379 378
380asmlinkage int sys_vfork(struct pt_regs *regs) 379asmlinkage int sys_vfork(struct pt_regs *regs)
diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c
index 668ed2817e51..05ad29112ff4 100644
--- a/arch/avr32/kernel/time.c
+++ b/arch/avr32/kernel/time.c
@@ -35,7 +35,6 @@ static struct clocksource counter = {
35 .rating = 50, 35 .rating = 50,
36 .read = read_cycle_count, 36 .read = read_cycle_count,
37 .mask = CLOCKSOURCE_MASK(32), 37 .mask = CLOCKSOURCE_MASK(32),
38 .shift = 16,
39 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 38 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
40}; 39};
41 40
@@ -123,9 +122,7 @@ void __init time_init(void)
123 122
124 /* figure rate for counter */ 123 /* figure rate for counter */
125 counter_hz = clk_get_rate(boot_cpu_data.clk); 124 counter_hz = clk_get_rate(boot_cpu_data.clk);
126 counter.mult = clocksource_hz2mult(counter_hz, counter.shift); 125 ret = clocksource_register_hz(&counter, counter_hz);
127
128 ret = clocksource_register(&counter);
129 if (ret) 126 if (ret)
130 pr_debug("timer: could not register clocksource: %d\n", ret); 127 pr_debug("timer: could not register clocksource: %d\n", ret);
131 128
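The time.c hunk above drops the fixed .shift and the clocksource_hz2mult() step in favour of clocksource_register_hz(), which derives mult and shift from the counter rate. A minimal sketch of that registration pattern (illustrative names; it assumes a read_cycle_count() callback like the one already in this file):

static struct clocksource example_counter = {
        .name   = "example-counter",
        .rating = 50,
        .read   = read_cycle_count,     /* assumed callback, as above */
        .mask   = CLOCKSOURCE_MASK(32),
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init example_counter_register(unsigned long counter_hz)
{
        /* mult/shift are computed internally from counter_hz */
        return clocksource_register_hz(&example_counter, counter_hz);
}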
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index ac76da099a6d..89accc626b86 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -618,7 +618,7 @@ pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
618} 618}
619 619
620/* forward declaration */ 620/* forward declaration */
621static static const struct dentry_operations pfmfs_dentry_operations; 621static const struct dentry_operations pfmfs_dentry_operations;
622 622
623static struct dentry * 623static struct dentry *
624pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) 624pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 1841ee7e65f9..5ca674b74737 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -38,7 +38,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
38 if (pud) { 38 if (pud) {
39 pmd = pmd_alloc(mm, pud, taddr); 39 pmd = pmd_alloc(mm, pud, taddr);
40 if (pmd) 40 if (pmd)
41 pte = pte_alloc_map(mm, pmd, taddr); 41 pte = pte_alloc_map(mm, NULL, pmd, taddr);
42 } 42 }
43 return pte; 43 return pte;
44} 44}
diff --git a/arch/mips/include/asm/mman.h b/arch/mips/include/asm/mman.h
index c892bfb3e2c1..785b4ea4ec3f 100644
--- a/arch/mips/include/asm/mman.h
+++ b/arch/mips/include/asm/mman.h
@@ -77,6 +77,9 @@
77#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ 77#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
78#define MADV_HWPOISON 100 /* poison a page for testing */ 78#define MADV_HWPOISON 100 /* poison a page for testing */
79 79
80#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */
81#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */
82
80/* compatibility flags */ 83/* compatibility flags */
81#define MAP_FILE 0 84#define MAP_FILE 0
82 85
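MADV_HUGEPAGE and MADV_NOHUGEPAGE are new madvise() hints marking a range as worth (or not worth) backing with transparent hugepages. A hypothetical userspace sketch, not part of this patch; the macro may not yet be in libc headers, hence the guarded fallback to the generic value 14 (MIPS shares that value above, parisc uses 67 in a later hunk):

#include <stddef.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14        /* generic/MIPS value, see the hunks in this patch */
#endif

/* Ask for hugepage backing on an anonymous mapping; the hint may be ignored. */
static void *map_with_hugepage_hint(size_t len)
{
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return NULL;
        madvise(p, len, MADV_HUGEPAGE);
        return p;
}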
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 6f51dda87fce..d87a72e9fac7 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -46,17 +46,9 @@ static DEFINE_SPINLOCK(dbe_lock);
46void *module_alloc(unsigned long size) 46void *module_alloc(unsigned long size)
47{ 47{
48#ifdef MODULE_START 48#ifdef MODULE_START
49 struct vm_struct *area; 49 return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
50 50 GFP_KERNEL, PAGE_KERNEL, -1,
51 size = PAGE_ALIGN(size); 51 __builtin_return_address(0));
52 if (!size)
53 return NULL;
54
55 area = __get_vm_area(size, VM_ALLOC, MODULE_START, MODULE_END);
56 if (!area)
57 return NULL;
58
59 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL);
60#else 52#else
61 if (size == 0) 53 if (size == 0)
62 return NULL; 54 return NULL;
diff --git a/arch/parisc/include/asm/mman.h b/arch/parisc/include/asm/mman.h
index 9749c8afe83a..f5b7bf5fba68 100644
--- a/arch/parisc/include/asm/mman.h
+++ b/arch/parisc/include/asm/mman.h
@@ -59,6 +59,9 @@
59#define MADV_MERGEABLE 65 /* KSM may merge identical pages */ 59#define MADV_MERGEABLE 65 /* KSM may merge identical pages */
60#define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ 60#define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */
61 61
62#define MADV_HUGEPAGE 67 /* Worth backing with hugepages */
63#define MADV_NOHUGEPAGE 68 /* Not worth backing with hugepages */
64
62/* compatibility flags */ 65/* compatibility flags */
63#define MAP_FILE 0 66#define MAP_FILE 0
64#define MAP_VARIABLE 0 67#define MAP_VARIABLE 0
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index d7efdbf640c7..fec13200868f 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -16,6 +16,16 @@
16 16
17#ifdef __HAVE_ARCH_PTE_SPECIAL 17#ifdef __HAVE_ARCH_PTE_SPECIAL
18 18
19static inline void get_huge_page_tail(struct page *page)
20{
21 /*
22 * __split_huge_page_refcount() cannot run
23 * from under us.
24 */
25 VM_BUG_ON(atomic_read(&page->_count) < 0);
26 atomic_inc(&page->_count);
27}
28
19/* 29/*
20 * The performance critical leaf functions are made noinline otherwise gcc 30 * The performance critical leaf functions are made noinline otherwise gcc
21 * inlines everything into a single function which results in too much 31 * inlines everything into a single function which results in too much
@@ -47,6 +57,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
47 put_page(page); 57 put_page(page);
48 return 0; 58 return 0;
49 } 59 }
60 if (PageTail(page))
61 get_huge_page_tail(page);
50 pages[*nr] = page; 62 pages[*nr] = page;
51 (*nr)++; 63 (*nr)++;
52 64
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 9163db3e8d15..d7762349ea48 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -35,7 +35,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
35 if (pud) { 35 if (pud) {
36 pmd = pmd_alloc(mm, pud, addr); 36 pmd = pmd_alloc(mm, pud, addr);
37 if (pmd) 37 if (pmd)
38 pte = pte_alloc_map(mm, pmd, addr); 38 pte = pte_alloc_map(mm, NULL, pmd, addr);
39 } 39 }
40 } 40 }
41 41
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index ee3c7dde8d9f..8d348c474a2f 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -23,17 +23,11 @@
23 23
24static void *module_map(unsigned long size) 24static void *module_map(unsigned long size)
25{ 25{
26 struct vm_struct *area; 26 if (PAGE_ALIGN(size) > MODULES_LEN)
27
28 size = PAGE_ALIGN(size);
29 if (!size || size > MODULES_LEN)
30 return NULL;
31
32 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
33 if (!area)
34 return NULL; 27 return NULL;
35 28 return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
36 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); 29 GFP_KERNEL, PAGE_KERNEL, -1,
30 __builtin_return_address(0));
37} 31}
38 32
39static char *dot2underscore(char *name) 33static char *dot2underscore(char *name)
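Both module allocators above (MIPS with MODULE_START/MODULE_END, sparc with MODULES_VADDR/MODULES_END) collapse the old __get_vm_area() plus __vmalloc_area() pair into a single __vmalloc_node_range() call. A kernel-context sketch of the shared pattern, with the range passed in and the argument order exactly as in the hunks (size, align, start, end, gfp, prot, node, caller):

static void *module_region_alloc(unsigned long size,
                                 unsigned long start, unsigned long end)
{
        if (PAGE_ALIGN(size) > end - start)
                return NULL;    /* reject oversized or overflowing requests */
        return __vmalloc_node_range(size, 1, start, end,
                                    GFP_KERNEL, PAGE_KERNEL, -1,
                                    __builtin_return_address(0));
}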
diff --git a/arch/sparc/mm/generic_32.c b/arch/sparc/mm/generic_32.c
index 5edcac184eaf..e6067b75f11c 100644
--- a/arch/sparc/mm/generic_32.c
+++ b/arch/sparc/mm/generic_32.c
@@ -50,7 +50,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned
50 end = PGDIR_SIZE; 50 end = PGDIR_SIZE;
51 offset -= address; 51 offset -= address;
52 do { 52 do {
53 pte_t * pte = pte_alloc_map(mm, pmd, address); 53 pte_t *pte = pte_alloc_map(mm, NULL, pmd, address);
54 if (!pte) 54 if (!pte)
55 return -ENOMEM; 55 return -ENOMEM;
56 io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); 56 io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space);
diff --git a/arch/sparc/mm/generic_64.c b/arch/sparc/mm/generic_64.c
index 04f2bf4cd571..3cb00dfd4bd6 100644
--- a/arch/sparc/mm/generic_64.c
+++ b/arch/sparc/mm/generic_64.c
@@ -92,7 +92,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned
92 end = PGDIR_SIZE; 92 end = PGDIR_SIZE;
93 offset -= address; 93 offset -= address;
94 do { 94 do {
95 pte_t * pte = pte_alloc_map(mm, pmd, address); 95 pte_t *pte = pte_alloc_map(mm, NULL, pmd, address);
96 if (!pte) 96 if (!pte)
97 return -ENOMEM; 97 return -ENOMEM;
98 io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); 98 io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space);
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 5fdddf134caa..f4e97646ce23 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -214,7 +214,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
214 if (pud) { 214 if (pud) {
215 pmd = pmd_alloc(mm, pud, addr); 215 pmd = pmd_alloc(mm, pud, addr);
216 if (pmd) 216 if (pmd)
217 pte = pte_alloc_map(mm, pmd, addr); 217 pte = pte_alloc_map(mm, NULL, pmd, addr);
218 } 218 }
219 return pte; 219 return pte;
220} 220}
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 3d099f974785..1aee587e9c5d 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -31,7 +31,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
31 if (!pmd) 31 if (!pmd)
32 goto out_pmd; 32 goto out_pmd;
33 33
34 pte = pte_alloc_map(mm, pmd, proc); 34 pte = pte_alloc_map(mm, NULL, pmd, proc);
35 if (!pte) 35 if (!pte)
36 goto out_pte; 36 goto out_pte;
37 37
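The recurring one-liner in the ia64, sh, sparc and um hunks above is the same change: pte_alloc_map() grew an extra argument (presumably the vma that the transparent-hugepage series threads through; these callers have none, so they pass NULL). The updated caller shape, as a kernel-context sketch:

/* The second argument is assumed to be the optional vma, NULL when absent. */
static pte_t *alloc_pte_example(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long addr)
{
        return pte_alloc_map(mm, NULL, pmd, addr);
}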
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa75f21a9fba..ffd7f8d29187 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -822,6 +822,7 @@ extern bool kvm_rebooting;
822#define KVM_ARCH_WANT_MMU_NOTIFIER 822#define KVM_ARCH_WANT_MMU_NOTIFIER
823int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 823int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
824int kvm_age_hva(struct kvm *kvm, unsigned long hva); 824int kvm_age_hva(struct kvm *kvm, unsigned long hva);
825int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
825void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 826void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
826int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 827int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
827int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 828int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 7709c12431b8..2071a8b2b32f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr,
435{ 435{
436 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); 436 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
437} 437}
438static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
439 pmd_t *pmdp)
440{
441 PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
442}
438 443
439static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, 444static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
440 pte_t *ptep) 445 pte_t *ptep)
@@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
442 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); 447 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
443} 448}
444 449
450static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
451 pmd_t *pmdp)
452{
453 PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
454}
455
445static inline pte_t __pte(pteval_t val) 456static inline pte_t __pte(pteval_t val)
446{ 457{
447 pteval_t ret; 458 pteval_t ret;
@@ -543,6 +554,20 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
543 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); 554 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
544} 555}
545 556
557#ifdef CONFIG_TRANSPARENT_HUGEPAGE
558static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
559 pmd_t *pmdp, pmd_t pmd)
560{
561#if PAGETABLE_LEVELS >= 3
562 if (sizeof(pmdval_t) > sizeof(long))
563 /* 5 arg words */
564 pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
565 else
566 PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, pmd.pmd);
567#endif
568}
569#endif
570
546static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 571static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
547{ 572{
548 pmdval_t val = native_pmd_val(pmd); 573 pmdval_t val = native_pmd_val(pmd);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b82bac975250..82885099c869 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -265,10 +265,16 @@ struct pv_mmu_ops {
265 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, 265 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
266 pte_t *ptep, pte_t pteval); 266 pte_t *ptep, pte_t pteval);
267 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); 267 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
268 void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
269 pmd_t *pmdp, pmd_t pmdval);
268 void (*pte_update)(struct mm_struct *mm, unsigned long addr, 270 void (*pte_update)(struct mm_struct *mm, unsigned long addr,
269 pte_t *ptep); 271 pte_t *ptep);
270 void (*pte_update_defer)(struct mm_struct *mm, 272 void (*pte_update_defer)(struct mm_struct *mm,
271 unsigned long addr, pte_t *ptep); 273 unsigned long addr, pte_t *ptep);
274 void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
275 pmd_t *pmdp);
276 void (*pmd_update_defer)(struct mm_struct *mm,
277 unsigned long addr, pmd_t *pmdp);
272 278
273 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, 279 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
274 pte_t *ptep); 280 pte_t *ptep);
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 2334982b339e..98391db840c6 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
47#endif 47#endif
48 48
49#ifdef CONFIG_SMP
50static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
51{
52 return __pmd(xchg((pmdval_t *)xp, 0));
53}
54#else
55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
56#endif
57
49/* 58/*
50 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, 59 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
51 * split up the 29 bits of offset into this range: 60 * split up the 29 bits of offset into this range:
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 177b0165ea01..94b979d1b58d 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
104#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 104#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
105#endif 105#endif
106 106
107#ifdef CONFIG_SMP
108union split_pmd {
109 struct {
110 u32 pmd_low;
111 u32 pmd_high;
112 };
113 pmd_t pmd;
114};
115static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
116{
117 union split_pmd res, *orig = (union split_pmd *)pmdp;
118
119 /* xchg acts as a barrier before setting of the high bits */
120 res.pmd_low = xchg(&orig->pmd_low, 0);
121 res.pmd_high = orig->pmd_high;
122 orig->pmd_high = 0;
123
124 return res.pmd;
125}
126#else
127#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
128#endif
129
107/* 130/*
108 * Bits 0, 6 and 7 are taken in the low part of the pte, 131 * Bits 0, 6 and 7 are taken in the low part of the pte,
109 * put the 32 bits of offset into the high part. 132 * put the 32 bits of offset into the high part.
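One reason the PAE variant above clears the low word first: bit 0 of the low 32 bits is the hardware present bit, so once xchg() has zeroed pmd_low the entry is non-present and the high word can be read and cleared without a walker seeing a present entry with a torn high half. A tiny illustration of that invariant (not from this patch):

/* Bit 0 of the low 32 bits of a PAE pmd is _PAGE_PRESENT. */
static inline int pae_pmd_low_present(u32 pmd_low)
{
        return pmd_low & 1;
}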
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ada823a13c7c..18601c86fab1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
35#else /* !CONFIG_PARAVIRT */ 35#else /* !CONFIG_PARAVIRT */
36#define set_pte(ptep, pte) native_set_pte(ptep, pte) 36#define set_pte(ptep, pte) native_set_pte(ptep, pte)
37#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) 37#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
38#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
38 39
39#define set_pte_atomic(ptep, pte) \ 40#define set_pte_atomic(ptep, pte) \
40 native_set_pte_atomic(ptep, pte) 41 native_set_pte_atomic(ptep, pte)
@@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
59 60
60#define pte_update(mm, addr, ptep) do { } while (0) 61#define pte_update(mm, addr, ptep) do { } while (0)
61#define pte_update_defer(mm, addr, ptep) do { } while (0) 62#define pte_update_defer(mm, addr, ptep) do { } while (0)
63#define pmd_update(mm, addr, ptep) do { } while (0)
64#define pmd_update_defer(mm, addr, ptep) do { } while (0)
62 65
63#define pgd_val(x) native_pgd_val(x) 66#define pgd_val(x) native_pgd_val(x)
64#define __pgd(x) native_make_pgd(x) 67#define __pgd(x) native_make_pgd(x)
@@ -94,6 +97,11 @@ static inline int pte_young(pte_t pte)
94 return pte_flags(pte) & _PAGE_ACCESSED; 97 return pte_flags(pte) & _PAGE_ACCESSED;
95} 98}
96 99
100static inline int pmd_young(pmd_t pmd)
101{
102 return pmd_flags(pmd) & _PAGE_ACCESSED;
103}
104
97static inline int pte_write(pte_t pte) 105static inline int pte_write(pte_t pte)
98{ 106{
99 return pte_flags(pte) & _PAGE_RW; 107 return pte_flags(pte) & _PAGE_RW;
@@ -142,6 +150,23 @@ static inline int pmd_large(pmd_t pte)
142 (_PAGE_PSE | _PAGE_PRESENT); 150 (_PAGE_PSE | _PAGE_PRESENT);
143} 151}
144 152
153#ifdef CONFIG_TRANSPARENT_HUGEPAGE
154static inline int pmd_trans_splitting(pmd_t pmd)
155{
156 return pmd_val(pmd) & _PAGE_SPLITTING;
157}
158
159static inline int pmd_trans_huge(pmd_t pmd)
160{
161 return pmd_val(pmd) & _PAGE_PSE;
162}
163
164static inline int has_transparent_hugepage(void)
165{
166 return cpu_has_pse;
167}
168#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
169
145static inline pte_t pte_set_flags(pte_t pte, pteval_t set) 170static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
146{ 171{
147 pteval_t v = native_pte_val(pte); 172 pteval_t v = native_pte_val(pte);
@@ -216,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte)
216 return pte_set_flags(pte, _PAGE_SPECIAL); 241 return pte_set_flags(pte, _PAGE_SPECIAL);
217} 242}
218 243
244static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
245{
246 pmdval_t v = native_pmd_val(pmd);
247
248 return __pmd(v | set);
249}
250
251static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
252{
253 pmdval_t v = native_pmd_val(pmd);
254
255 return __pmd(v & ~clear);
256}
257
258static inline pmd_t pmd_mkold(pmd_t pmd)
259{
260 return pmd_clear_flags(pmd, _PAGE_ACCESSED);
261}
262
263static inline pmd_t pmd_wrprotect(pmd_t pmd)
264{
265 return pmd_clear_flags(pmd, _PAGE_RW);
266}
267
268static inline pmd_t pmd_mkdirty(pmd_t pmd)
269{
270 return pmd_set_flags(pmd, _PAGE_DIRTY);
271}
272
273static inline pmd_t pmd_mkhuge(pmd_t pmd)
274{
275 return pmd_set_flags(pmd, _PAGE_PSE);
276}
277
278static inline pmd_t pmd_mkyoung(pmd_t pmd)
279{
280 return pmd_set_flags(pmd, _PAGE_ACCESSED);
281}
282
283static inline pmd_t pmd_mkwrite(pmd_t pmd)
284{
285 return pmd_set_flags(pmd, _PAGE_RW);
286}
287
288static inline pmd_t pmd_mknotpresent(pmd_t pmd)
289{
290 return pmd_clear_flags(pmd, _PAGE_PRESENT);
291}
292
219/* 293/*
220 * Mask out unsupported bits in a present pgprot. Non-present pgprots 294 * Mask out unsupported bits in a present pgprot. Non-present pgprots
221 * can use those bits for other purposes, so leave them be. 295 * can use those bits for other purposes, so leave them be.
@@ -256,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
256 return __pte(val); 330 return __pte(val);
257} 331}
258 332
333static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
334{
335 pmdval_t val = pmd_val(pmd);
336
337 val &= _HPAGE_CHG_MASK;
338 val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
339
340 return __pmd(val);
341}
342
259/* mprotect needs to preserve PAT bits when updating vm_page_prot */ 343/* mprotect needs to preserve PAT bits when updating vm_page_prot */
260#define pgprot_modify pgprot_modify 344#define pgprot_modify pgprot_modify
261static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) 345static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
@@ -350,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
350 * Currently stuck as a macro due to indirect forward reference to 434 * Currently stuck as a macro due to indirect forward reference to
351 * linux/mmzone.h's __section_mem_map_addr() definition: 435 * linux/mmzone.h's __section_mem_map_addr() definition:
352 */ 436 */
353#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) 437#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
354 438
355/* 439/*
356 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] 440 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -524,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
524 return res; 608 return res;
525} 609}
526 610
611static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
612{
613 pmd_t res = *pmdp;
614
615 native_pmd_clear(pmdp);
616 return res;
617}
618
527static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, 619static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
528 pte_t *ptep , pte_t pte) 620 pte_t *ptep , pte_t pte)
529{ 621{
530 native_set_pte(ptep, pte); 622 native_set_pte(ptep, pte);
531} 623}
532 624
625static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
626 pmd_t *pmdp , pmd_t pmd)
627{
628 native_set_pmd(pmdp, pmd);
629}
630
533#ifndef CONFIG_PARAVIRT 631#ifndef CONFIG_PARAVIRT
534/* 632/*
535 * Rules for using pte_update - it must be called after any PTE update which 633 * Rules for using pte_update - it must be called after any PTE update which
@@ -607,6 +705,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
607 705
608#define flush_tlb_fix_spurious_fault(vma, address) 706#define flush_tlb_fix_spurious_fault(vma, address)
609 707
708#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
709
710#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
711extern int pmdp_set_access_flags(struct vm_area_struct *vma,
712 unsigned long address, pmd_t *pmdp,
713 pmd_t entry, int dirty);
714
715#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
716extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
717 unsigned long addr, pmd_t *pmdp);
718
719#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
720extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
721 unsigned long address, pmd_t *pmdp);
722
723
724#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
725extern void pmdp_splitting_flush(struct vm_area_struct *vma,
726 unsigned long addr, pmd_t *pmdp);
727
728#define __HAVE_ARCH_PMD_WRITE
729static inline int pmd_write(pmd_t pmd)
730{
731 return pmd_flags(pmd) & _PAGE_RW;
732}
733
734#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
735static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
736 pmd_t *pmdp)
737{
738 pmd_t pmd = native_pmdp_get_and_clear(pmdp);
739 pmd_update(mm, addr, pmdp);
740 return pmd;
741}
742
743#define __HAVE_ARCH_PMDP_SET_WRPROTECT
744static inline void pmdp_set_wrprotect(struct mm_struct *mm,
745 unsigned long addr, pmd_t *pmdp)
746{
747 clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
748 pmd_update(mm, addr, pmdp);
749}
750
610/* 751/*
611 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); 752 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
612 * 753 *
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f86da20347f2..975f709e09ae 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
59 native_set_pte(ptep, pte); 59 native_set_pte(ptep, pte);
60} 60}
61 61
62static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
63{
64 *pmdp = pmd;
65}
66
67static inline void native_pmd_clear(pmd_t *pmd)
68{
69 native_set_pmd(pmd, native_make_pmd(0));
70}
71
62static inline pte_t native_ptep_get_and_clear(pte_t *xp) 72static inline pte_t native_ptep_get_and_clear(pte_t *xp)
63{ 73{
64#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
@@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
72#endif 82#endif
73} 83}
74 84
75static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) 85static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
76{ 86{
77 *pmdp = pmd; 87#ifdef CONFIG_SMP
78} 88 return native_make_pmd(xchg(&xp->pmd, 0));
79 89#else
80static inline void native_pmd_clear(pmd_t *pmd) 90 /* native_local_pmdp_get_and_clear,
81{ 91 but duplicated because of cyclic dependency */
82 native_set_pmd(pmd, native_make_pmd(0)); 92 pmd_t ret = *xp;
93 native_pmd_clear(xp);
94 return ret;
95#endif
83} 96}
84 97
85static inline void native_set_pud(pud_t *pudp, pud_t pud) 98static inline void native_set_pud(pud_t *pudp, pud_t pud)
@@ -168,6 +181,7 @@ extern void cleanup_highmap(void);
168#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) 181#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
169 182
170#define __HAVE_ARCH_PTE_SAME 183#define __HAVE_ARCH_PTE_SAME
184
171#endif /* !__ASSEMBLY__ */ 185#endif /* !__ASSEMBLY__ */
172 186
173#endif /* _ASM_X86_PGTABLE_64_H */ 187#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d1f4a760be23..7db7723d1f32 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,6 +22,7 @@
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
25#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
25#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 26#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
26 27
27/* If _PAGE_BIT_PRESENT is clear, we use these: */ 28/* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -45,6 +46,7 @@
45#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
46#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
47#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
48#define __HAVE_ARCH_PTE_SPECIAL 50#define __HAVE_ARCH_PTE_SPECIAL
49 51
50#ifdef CONFIG_KMEMCHECK 52#ifdef CONFIG_KMEMCHECK
@@ -70,6 +72,7 @@
70/* Set of bits not changed in pte_modify */ 72/* Set of bits not changed in pte_modify */
71#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 73#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
72 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) 74 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
75#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
73 76
74#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) 77#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
75#define _PAGE_CACHE_WB (0) 78#define _PAGE_CACHE_WB (0)
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 8760cc60a21c..f25bdf238a33 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -42,6 +42,11 @@ extern unsigned int machine_to_phys_order;
42extern unsigned long get_phys_to_machine(unsigned long pfn); 42extern unsigned long get_phys_to_machine(unsigned long pfn);
43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
44 44
45extern int m2p_add_override(unsigned long mfn, struct page *page);
46extern int m2p_remove_override(struct page *page);
47extern struct page *m2p_find_override(unsigned long mfn);
48extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
49
45static inline unsigned long pfn_to_mfn(unsigned long pfn) 50static inline unsigned long pfn_to_mfn(unsigned long pfn)
46{ 51{
47 unsigned long mfn; 52 unsigned long mfn;
@@ -72,9 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
72 if (xen_feature(XENFEAT_auto_translated_physmap)) 77 if (xen_feature(XENFEAT_auto_translated_physmap))
73 return mfn; 78 return mfn;
74 79
75 if (unlikely((mfn >> machine_to_phys_order) != 0))
76 return ~0;
77
78 pfn = 0; 80 pfn = 0;
79 /* 81 /*
80 * The array access can fail (e.g., device space beyond end of RAM). 82 * The array access can fail (e.g., device space beyond end of RAM).
@@ -83,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
83 */ 85 */
84 __get_user(pfn, &machine_to_phys_mapping[mfn]); 86 __get_user(pfn, &machine_to_phys_mapping[mfn]);
85 87
88 /*
89 * If this appears to be a foreign mfn (because the pfn
90 * doesn't map back to the mfn), then check the local override
91 * table to see if there's a better pfn to use.
92 */
93 if (get_phys_to_machine(pfn) != mfn)
94 pfn = m2p_find_override_pfn(mfn, pfn);
95
86 return pfn; 96 return pfn;
87} 97}
88 98
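The m2p override hooks declared above exist so that a pfn whose backing frame is temporarily a foreign mfn (for example a grant-mapped frame from another domain) can still be resolved back to the local struct page by mfn_to_pfn(). A minimal, hedged sketch of how a caller might pair the two hooks follows; the function name and the local_page/foreign_mfn parameters are illustrative only and are not part of this patch.

	/*
	 * Hypothetical caller of the new override API: register a local page
	 * that is currently backed by a foreign mfn, so that mfn_to_pfn() on
	 * that mfn finds the local pfn, then drop the override when done.
	 */
	static int with_foreign_frame(struct page *local_page,
				      unsigned long foreign_mfn)
	{
		int err = m2p_add_override(foreign_mfn, local_page);
		if (err)
			return err;

		/* ... use the mapping; lookups on foreign_mfn now resolve here ... */

		return m2p_remove_override(local_page);
	}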
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 8f2956091735..ab23f1ad4bf1 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -37,20 +37,11 @@
37 37
38void *module_alloc(unsigned long size) 38void *module_alloc(unsigned long size)
39{ 39{
40 struct vm_struct *area; 40 if (PAGE_ALIGN(size) > MODULES_LEN)
41
42 if (!size)
43 return NULL;
44 size = PAGE_ALIGN(size);
45 if (size > MODULES_LEN)
46 return NULL; 41 return NULL;
47 42 return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
48 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); 43 GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
49 if (!area) 44 -1, __builtin_return_address(0));
50 return NULL;
51
52 return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
53 PAGE_KERNEL_EXEC);
54} 45}
55 46
56/* Free memory returned from module_alloc */ 47/* Free memory returned from module_alloc */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c5b250011fd4..869e1aeeb71b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = {
421 .set_pte = native_set_pte, 421 .set_pte = native_set_pte,
422 .set_pte_at = native_set_pte_at, 422 .set_pte_at = native_set_pte_at,
423 .set_pmd = native_set_pmd, 423 .set_pmd = native_set_pmd,
424 .set_pmd_at = native_set_pmd_at,
424 .pte_update = paravirt_nop, 425 .pte_update = paravirt_nop,
425 .pte_update_defer = paravirt_nop, 426 .pte_update_defer = paravirt_nop,
427 .pmd_update = paravirt_nop,
428 .pmd_update_defer = paravirt_nop,
426 429
427 .ptep_modify_prot_start = __ptep_modify_prot_start, 430 .ptep_modify_prot_start = __ptep_modify_prot_start,
428 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 431 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2f1b26141e2..998e972f3b1a 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
133 pmd = pmd_alloc(&tboot_mm, pud, vaddr); 133 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
134 if (!pmd) 134 if (!pmd)
135 return -1; 135 return -1;
136 pte = pte_alloc_map(&tboot_mm, pmd, vaddr); 136 pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
137 if (!pte) 137 if (!pte)
138 return -1; 138 return -1;
139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); 139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 61fb98519622..863f8753ab0a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
179 if (pud_none_or_clear_bad(pud)) 179 if (pud_none_or_clear_bad(pud))
180 goto out; 180 goto out;
181 pmd = pmd_offset(pud, 0xA0000); 181 pmd = pmd_offset(pud, 0xA0000);
182 split_huge_page_pmd(mm, pmd);
182 if (pmd_none_or_clear_bad(pmd)) 183 if (pmd_none_or_clear_bad(pmd))
183 goto out; 184 goto out;
184 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); 185 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9cafbb499813..f02b8edc3d44 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
554 return ret; 554 return ret;
555} 555}
556 556
557static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 557static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
558{ 558{
559 struct kvm_memory_slot *slot; 559 struct kvm_memory_slot *slot;
560 int host_level, level, max_level;
561
562 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 560 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
563 if (slot && slot->dirty_bitmap) 561 if (slot && slot->dirty_bitmap)
564 return PT_PAGE_TABLE_LEVEL; 562 return true;
563 return false;
564}
565
566static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 int host_level, level, max_level;
565 569
566 host_level = host_mapping_level(vcpu->kvm, large_gfn); 570 host_level = host_mapping_level(vcpu->kvm, large_gfn);
567 571
@@ -941,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
941 return young; 945 return young;
942} 946}
943 947
948static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
949 unsigned long data)
950{
951 u64 *spte;
952 int young = 0;
953
954 /*
955 * If there's no access bit in the secondary pte set by the
956 * hardware, it's up to gup-fast/gup to set the access bit in
957 * the primary pte or in the page structure.
958 */
959 if (!shadow_accessed_mask)
960 goto out;
961
962 spte = rmap_next(kvm, rmapp, NULL);
963 while (spte) {
964 u64 _spte = *spte;
965 BUG_ON(!(_spte & PT_PRESENT_MASK));
966 young = _spte & PT_ACCESSED_MASK;
967 if (young) {
968 young = 1;
969 break;
970 }
971 spte = rmap_next(kvm, rmapp, spte);
972 }
973out:
974 return young;
975}
976
944#define RMAP_RECYCLE_THRESHOLD 1000 977#define RMAP_RECYCLE_THRESHOLD 1000
945 978
946static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 979static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -961,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
961 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 994 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
962} 995}
963 996
997int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
998{
999 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1000}
1001
964#ifdef MMU_DEBUG 1002#ifdef MMU_DEBUG
965static int is_empty_shadow_page(u64 *spt) 1003static int is_empty_shadow_page(u64 *spt)
966{ 1004{
@@ -2281,6 +2319,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2281 return 1; 2319 return 1;
2282} 2320}
2283 2321
2322static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2323 gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2324{
2325 pfn_t pfn = *pfnp;
2326 gfn_t gfn = *gfnp;
2327 int level = *levelp;
2328
2329 /*
2330 * Check if it's a transparent hugepage. If this would be an
2331 * hugetlbfs page, level wouldn't be set to
2332 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2333 * here.
2334 */
2335 if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2336 level == PT_PAGE_TABLE_LEVEL &&
2337 PageTransCompound(pfn_to_page(pfn)) &&
2338 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2339 unsigned long mask;
2340 /*
2341 * mmu_notifier_retry was successful and we hold the
2342 * mmu_lock here, so the pmd can't become splitting
2343 * from under us, and in turn
2344 * __split_huge_page_refcount() can't run from under
2345 * us and we can safely transfer the refcount from
2346 * PG_tail to PG_head as we switch the pfn from tail to
2347 * head.
2348 */
2349 *levelp = level = PT_DIRECTORY_LEVEL;
2350 mask = KVM_PAGES_PER_HPAGE(level) - 1;
2351 VM_BUG_ON((gfn & mask) != (pfn & mask));
2352 if (pfn & mask) {
2353 gfn &= ~mask;
2354 *gfnp = gfn;
2355 kvm_release_pfn_clean(pfn);
2356 pfn &= ~mask;
2357 if (!get_page_unless_zero(pfn_to_page(pfn)))
2358 BUG();
2359 *pfnp = pfn;
2360 }
2361 }
2362}
2363
2284static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2364static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2285 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2365 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2286 2366
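The gfn/pfn masking inside transparent_hugepage_adjust() above only realigns both frame numbers to the start of the huge frame before mapping it at directory level. A hedged, stand-alone illustration of that arithmetic in plain user-space C, assuming 4 KB base pages and 2 MB directory-level pages (so KVM_PAGES_PER_HPAGE() would be 512) and using arbitrary sample frame numbers:

	#include <stdio.h>

	#define PAGES_PER_HPAGE 512UL	/* assumed: 2 MB huge page / 4 KB base page */

	int main(void)
	{
		unsigned long gfn = 0x12345;	/* arbitrary guest frame number */
		unsigned long pfn = 0x9a745;	/* arbitrary host frame number  */
		unsigned long mask = PAGES_PER_HPAGE - 1;	/* 0x1ff */

		/* Both frames must share the same offset within their huge page. */
		if ((gfn & mask) != (pfn & mask)) {
			printf("misaligned pair, no adjustment possible\n");
			return 1;
		}

		/* Round both down to the huge-frame boundary, as the adjust code does. */
		printf("gfn 0x%lx -> 0x%lx, pfn 0x%lx -> 0x%lx\n",
		       gfn, gfn & ~mask, pfn, pfn & ~mask);
		return 0;
	}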
@@ -2289,20 +2369,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2289{ 2369{
2290 int r; 2370 int r;
2291 int level; 2371 int level;
2372 int force_pt_level;
2292 pfn_t pfn; 2373 pfn_t pfn;
2293 unsigned long mmu_seq; 2374 unsigned long mmu_seq;
2294 bool map_writable; 2375 bool map_writable;
2295 2376
2296 level = mapping_level(vcpu, gfn); 2377 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2297 2378 if (likely(!force_pt_level)) {
2298 /* 2379 level = mapping_level(vcpu, gfn);
2299 * This path builds a PAE pagetable - so we can map 2mb pages at 2380 /*
2300 * maximum. Therefore check if the level is larger than that. 2381 * This path builds a PAE pagetable - so we can map
2301 */ 2382 * 2mb pages at maximum. Therefore check if the level
2302 if (level > PT_DIRECTORY_LEVEL) 2383 * is larger than that.
2303 level = PT_DIRECTORY_LEVEL; 2384 */
2385 if (level > PT_DIRECTORY_LEVEL)
2386 level = PT_DIRECTORY_LEVEL;
2304 2387
2305 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2388 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2389 } else
2390 level = PT_PAGE_TABLE_LEVEL;
2306 2391
2307 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2392 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2308 smp_rmb(); 2393 smp_rmb();
@@ -2318,6 +2403,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2318 if (mmu_notifier_retry(vcpu, mmu_seq)) 2403 if (mmu_notifier_retry(vcpu, mmu_seq))
2319 goto out_unlock; 2404 goto out_unlock;
2320 kvm_mmu_free_some_pages(vcpu); 2405 kvm_mmu_free_some_pages(vcpu);
2406 if (likely(!force_pt_level))
2407 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2321 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, 2408 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2322 prefault); 2409 prefault);
2323 spin_unlock(&vcpu->kvm->mmu_lock); 2410 spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2655,6 +2742,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2655 pfn_t pfn; 2742 pfn_t pfn;
2656 int r; 2743 int r;
2657 int level; 2744 int level;
2745 int force_pt_level;
2658 gfn_t gfn = gpa >> PAGE_SHIFT; 2746 gfn_t gfn = gpa >> PAGE_SHIFT;
2659 unsigned long mmu_seq; 2747 unsigned long mmu_seq;
2660 int write = error_code & PFERR_WRITE_MASK; 2748 int write = error_code & PFERR_WRITE_MASK;
@@ -2667,9 +2755,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2667 if (r) 2755 if (r)
2668 return r; 2756 return r;
2669 2757
2670 level = mapping_level(vcpu, gfn); 2758 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2671 2759 if (likely(!force_pt_level)) {
2672 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2760 level = mapping_level(vcpu, gfn);
2761 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2762 } else
2763 level = PT_PAGE_TABLE_LEVEL;
2673 2764
2674 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2765 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2675 smp_rmb(); 2766 smp_rmb();
@@ -2684,6 +2775,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2684 if (mmu_notifier_retry(vcpu, mmu_seq)) 2775 if (mmu_notifier_retry(vcpu, mmu_seq))
2685 goto out_unlock; 2776 goto out_unlock;
2686 kvm_mmu_free_some_pages(vcpu); 2777 kvm_mmu_free_some_pages(vcpu);
2778 if (likely(!force_pt_level))
2779 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2687 r = __direct_map(vcpu, gpa, write, map_writable, 2780 r = __direct_map(vcpu, gpa, write, map_writable,
2688 level, gfn, pfn, prefault); 2781 level, gfn, pfn, prefault);
2689 spin_unlock(&vcpu->kvm->mmu_lock); 2782 spin_unlock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 53210f1e94c2..6bccc24c4181 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
550 int r; 550 int r;
551 pfn_t pfn; 551 pfn_t pfn;
552 int level = PT_PAGE_TABLE_LEVEL; 552 int level = PT_PAGE_TABLE_LEVEL;
553 int force_pt_level;
553 unsigned long mmu_seq; 554 unsigned long mmu_seq;
554 bool map_writable; 555 bool map_writable;
555 556
@@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
577 return 0; 578 return 0;
578 } 579 }
579 580
580 if (walker.level >= PT_DIRECTORY_LEVEL) { 581 if (walker.level >= PT_DIRECTORY_LEVEL)
582 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
583 else
584 force_pt_level = 1;
585 if (!force_pt_level) {
581 level = min(walker.level, mapping_level(vcpu, walker.gfn)); 586 level = min(walker.level, mapping_level(vcpu, walker.gfn));
582 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); 587 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
583 } 588 }
@@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
599 604
600 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 605 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
601 kvm_mmu_free_some_pages(vcpu); 606 kvm_mmu_free_some_pages(vcpu);
607 if (!force_pt_level)
608 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
602 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 609 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
603 level, &write_pt, pfn, map_writable, prefault); 610 level, &write_pt, pfn, map_writable, prefault);
604 (void)sptep; 611 (void)sptep;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/vmstat.h> 9#include <linux/vmstat.h>
10#include <linux/highmem.h> 10#include <linux/highmem.h>
11#include <linux/swap.h>
11 12
12#include <asm/pgtable.h> 13#include <asm/pgtable.h>
13 14
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 90 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte); 91 page = pte_page(pte);
91 get_page(page); 92 get_page(page);
93 SetPageReferenced(page);
92 pages[*nr] = page; 94 pages[*nr] = page;
93 (*nr)++; 95 (*nr)++;
94 96
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
103 VM_BUG_ON(page != compound_head(page)); 105 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0); 106 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count); 107 atomic_add(nr, &page->_count);
108 SetPageReferenced(page);
109}
110
111static inline void get_huge_page_tail(struct page *page)
112{
113 /*
114 * __split_huge_page_refcount() cannot run
115 * from under us.
116 */
117 VM_BUG_ON(atomic_read(&page->_count) < 0);
118 atomic_inc(&page->_count);
106} 119}
107 120
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 121static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
128 do { 141 do {
129 VM_BUG_ON(compound_head(page) != head); 142 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page; 143 pages[*nr] = page;
144 if (PageTail(page))
145 get_huge_page_tail(page);
131 (*nr)++; 146 (*nr)++;
132 page++; 147 page++;
133 refs++; 148 refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
148 pmd_t pmd = *pmdp; 163 pmd_t pmd = *pmdp;
149 164
150 next = pmd_addr_end(addr, end); 165 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd)) 166 /*
167 * The pmd_trans_splitting() check below explains why
168 * pmdp_splitting_flush has to flush the tlb, to stop
169 * this gup-fast code from running while we set the
170 * splitting bit in the pmd. Returning zero will take
171 * the slow path that will call wait_split_huge_page()
172 * if the pmd is still in splitting state. gup-fast
173 * can't because it has irq disabled and
174 * wait_split_huge_page() would never return as the
175 * tlb flush IPI wouldn't run.
176 */
177 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
152 return 0; 178 return 0;
153 if (unlikely(pmd_large(pmd))) { 179 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) 180 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
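The comment added to gup_pmd_range() above leans on a pmd_trans_splitting() test that is not shown in this hunk. A plausible minimal sketch, assuming it merely tests the new _PAGE_SPLITTING software bit that pmdp_splitting_flush() sets under mmu_lock, would look like this (an assumption for illustration, not the authoritative definition):

	/* Hedged sketch: report whether the pmd is marked as being split. */
	static inline int pmd_trans_splitting(pmd_t pmd)
	{
		return pmd_flags(pmd) & _PAGE_SPLITTING;
	}

Because gup-fast runs with interrupts disabled, the TLB-flush IPI issued by pmdp_splitting_flush() cannot complete while this check runs, which is exactly the serialization the comment describes.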
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc89..500242d3c96d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
320 return changed; 320 return changed;
321} 321}
322 322
323#ifdef CONFIG_TRANSPARENT_HUGEPAGE
324int pmdp_set_access_flags(struct vm_area_struct *vma,
325 unsigned long address, pmd_t *pmdp,
326 pmd_t entry, int dirty)
327{
328 int changed = !pmd_same(*pmdp, entry);
329
330 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
331
332 if (changed && dirty) {
333 *pmdp = entry;
334 pmd_update_defer(vma->vm_mm, address, pmdp);
335 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
336 }
337
338 return changed;
339}
340#endif
341
323int ptep_test_and_clear_young(struct vm_area_struct *vma, 342int ptep_test_and_clear_young(struct vm_area_struct *vma,
324 unsigned long addr, pte_t *ptep) 343 unsigned long addr, pte_t *ptep)
325{ 344{
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
335 return ret; 354 return ret;
336} 355}
337 356
357#ifdef CONFIG_TRANSPARENT_HUGEPAGE
358int pmdp_test_and_clear_young(struct vm_area_struct *vma,
359 unsigned long addr, pmd_t *pmdp)
360{
361 int ret = 0;
362
363 if (pmd_young(*pmdp))
364 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
365 (unsigned long *)pmdp);
366
367 if (ret)
368 pmd_update(vma->vm_mm, addr, pmdp);
369
370 return ret;
371}
372#endif
373
338int ptep_clear_flush_young(struct vm_area_struct *vma, 374int ptep_clear_flush_young(struct vm_area_struct *vma,
339 unsigned long address, pte_t *ptep) 375 unsigned long address, pte_t *ptep)
340{ 376{
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
347 return young; 383 return young;
348} 384}
349 385
386#ifdef CONFIG_TRANSPARENT_HUGEPAGE
387int pmdp_clear_flush_young(struct vm_area_struct *vma,
388 unsigned long address, pmd_t *pmdp)
389{
390 int young;
391
392 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
393
394 young = pmdp_test_and_clear_young(vma, address, pmdp);
395 if (young)
396 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
397
398 return young;
399}
400
401void pmdp_splitting_flush(struct vm_area_struct *vma,
402 unsigned long address, pmd_t *pmdp)
403{
404 int set;
405 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
406 set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
407 (unsigned long *)pmdp);
408 if (set) {
409 pmd_update(vma->vm_mm, address, pmdp);
410 /* need tlb flush only to serialize against gup-fast */
411 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
412 }
413}
414#endif
415
350/** 416/**
351 * reserve_top_address - reserves a hole in the top of kernel address space 417 * reserve_top_address - reserves a hole in the top of kernel address space
352 * @reserve - size of hole to reserve 418 * @reserve - size of hole to reserve
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 779385158915..17c565de3d64 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp)
12 12
13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
14 time.o xen-asm.o xen-asm_$(BITS).o \ 14 time.o xen-asm.o xen-asm_$(BITS).o \
15 grant-table.o suspend.o platform-pci-unplug.o 15 grant-table.o suspend.o platform-pci-unplug.o \
16 p2m.o
16 17
17obj-$(CONFIG_SMP) += smp.o 18obj-$(CONFIG_SMP) += smp.o
18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 19obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 44924e551fde..7575e55cd52e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
173 */ 173 */
174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
175 175
176/*
177 * Xen leaves the responsibility for maintaining p2m mappings to the
178 * guests themselves, but it must also access and update the p2m array
179 * during suspend/resume when all the pages are reallocated.
180 *
181 * The p2m table is logically a flat array, but we implement it as a
182 * three-level tree to allow the address space to be sparse.
183 *
184 * Xen
185 * |
186 * p2m_top p2m_top_mfn
187 * / \ / \
188 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
189 * / \ / \ / /
190 * p2m p2m p2m p2m p2m p2m p2m ...
191 *
192 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
193 *
194 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
195 * maximum representable pseudo-physical address space is:
196 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
197 *
198 * P2M_PER_PAGE depends on the architecture, as a mfn is always
199 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
200 * 512 and 1024 entries respectively.
201 */
202
203unsigned long xen_max_p2m_pfn __read_mostly;
204
205#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
206#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
207#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
208
209#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
210
211/* Placeholders for holes in the address space */
212static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
213static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
214static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
215
216static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
217static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
218static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
219
220RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
221RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
222
223static inline unsigned p2m_top_index(unsigned long pfn)
224{
225 BUG_ON(pfn >= MAX_P2M_PFN);
226 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
227}
228
229static inline unsigned p2m_mid_index(unsigned long pfn)
230{
231 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
232}
233
234static inline unsigned p2m_index(unsigned long pfn)
235{
236 return pfn % P2M_PER_PAGE;
237}
238
239static void p2m_top_init(unsigned long ***top)
240{
241 unsigned i;
242
243 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
244 top[i] = p2m_mid_missing;
245}
246
247static void p2m_top_mfn_init(unsigned long *top)
248{
249 unsigned i;
250
251 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
252 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
253}
254
255static void p2m_top_mfn_p_init(unsigned long **top)
256{
257 unsigned i;
258
259 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
260 top[i] = p2m_mid_missing_mfn;
261}
262
263static void p2m_mid_init(unsigned long **mid)
264{
265 unsigned i;
266
267 for (i = 0; i < P2M_MID_PER_PAGE; i++)
268 mid[i] = p2m_missing;
269}
270
271static void p2m_mid_mfn_init(unsigned long *mid)
272{
273 unsigned i;
274
275 for (i = 0; i < P2M_MID_PER_PAGE; i++)
276 mid[i] = virt_to_mfn(p2m_missing);
277}
278
279static void p2m_init(unsigned long *p2m)
280{
281 unsigned i;
282
283 for (i = 0; i < P2M_MID_PER_PAGE; i++)
284 p2m[i] = INVALID_P2M_ENTRY;
285}
286
287/*
288 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
289 *
290 * This is called both at boot time, and after resuming from suspend:
291 * - At boot time we're called very early, and must use extend_brk()
292 * to allocate memory.
293 *
294 * - After resume we're called from within stop_machine, but the mfn
295 * tree should already be completely allocated.
296 */
297void xen_build_mfn_list_list(void)
298{
299 unsigned long pfn;
300
301 /* Pre-initialize p2m_top_mfn to be completely missing */
302 if (p2m_top_mfn == NULL) {
303 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
304 p2m_mid_mfn_init(p2m_mid_missing_mfn);
305
306 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
307 p2m_top_mfn_p_init(p2m_top_mfn_p);
308
309 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
310 p2m_top_mfn_init(p2m_top_mfn);
311 } else {
312 /* Reinitialise, mfn's all change after migration */
313 p2m_mid_mfn_init(p2m_mid_missing_mfn);
314 }
315
316 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
317 unsigned topidx = p2m_top_index(pfn);
318 unsigned mididx = p2m_mid_index(pfn);
319 unsigned long **mid;
320 unsigned long *mid_mfn_p;
321
322 mid = p2m_top[topidx];
323 mid_mfn_p = p2m_top_mfn_p[topidx];
324
325 /* Don't bother allocating any mfn mid levels if
326 * they're just missing, just update the stored mfn,
327 * since all could have changed over a migrate.
328 */
329 if (mid == p2m_mid_missing) {
330 BUG_ON(mididx);
331 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
332 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
333 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
334 continue;
335 }
336
337 if (mid_mfn_p == p2m_mid_missing_mfn) {
338 /*
339 * XXX boot-time only! We should never find
340 * missing parts of the mfn tree after
341 * runtime. extend_brk() will BUG if we call
342 * it too late.
343 */
344 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
345 p2m_mid_mfn_init(mid_mfn_p);
346
347 p2m_top_mfn_p[topidx] = mid_mfn_p;
348 }
349
350 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
351 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
352 }
353}
354
355void xen_setup_mfn_list_list(void)
356{
357 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
358
359 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
360 virt_to_mfn(p2m_top_mfn);
361 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
362}
363
364/* Set up p2m_top to point to the domain-builder provided p2m pages */
365void __init xen_build_dynamic_phys_to_machine(void)
366{
367 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
368 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
369 unsigned long pfn;
370
371 xen_max_p2m_pfn = max_pfn;
372
373 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
374 p2m_init(p2m_missing);
375
376 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
377 p2m_mid_init(p2m_mid_missing);
378
379 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
380 p2m_top_init(p2m_top);
381
382 /*
383 * The domain builder gives us a pre-constructed p2m array in
384 * mfn_list for all the pages initially given to us, so we just
385 * need to graft that into our tree structure.
386 */
387 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
388 unsigned topidx = p2m_top_index(pfn);
389 unsigned mididx = p2m_mid_index(pfn);
390
391 if (p2m_top[topidx] == p2m_mid_missing) {
392 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
393 p2m_mid_init(mid);
394
395 p2m_top[topidx] = mid;
396 }
397
398 p2m_top[topidx][mididx] = &mfn_list[pfn];
399 }
400}
401
402unsigned long get_phys_to_machine(unsigned long pfn)
403{
404 unsigned topidx, mididx, idx;
405
406 if (unlikely(pfn >= MAX_P2M_PFN))
407 return INVALID_P2M_ENTRY;
408
409 topidx = p2m_top_index(pfn);
410 mididx = p2m_mid_index(pfn);
411 idx = p2m_index(pfn);
412
413 return p2m_top[topidx][mididx][idx];
414}
415EXPORT_SYMBOL_GPL(get_phys_to_machine);
416
417static void *alloc_p2m_page(void)
418{
419 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
420}
421
422static void free_p2m_page(void *p)
423{
424 free_page((unsigned long)p);
425}
426
427/*
428 * Fully allocate the p2m structure for a given pfn. We need to check
429 * that both the top and mid levels are allocated, and make sure the
430 * parallel mfn tree is kept in sync. We may race with other cpus, so
431 * the new pages are installed with cmpxchg; if we lose the race then
432 * simply free the page we allocated and use the one that's there.
433 */
434static bool alloc_p2m(unsigned long pfn)
435{
436 unsigned topidx, mididx;
437 unsigned long ***top_p, **mid;
438 unsigned long *top_mfn_p, *mid_mfn;
439
440 topidx = p2m_top_index(pfn);
441 mididx = p2m_mid_index(pfn);
442
443 top_p = &p2m_top[topidx];
444 mid = *top_p;
445
446 if (mid == p2m_mid_missing) {
447 /* Mid level is missing, allocate a new one */
448 mid = alloc_p2m_page();
449 if (!mid)
450 return false;
451
452 p2m_mid_init(mid);
453
454 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
455 free_p2m_page(mid);
456 }
457
458 top_mfn_p = &p2m_top_mfn[topidx];
459 mid_mfn = p2m_top_mfn_p[topidx];
460
461 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
462
463 if (mid_mfn == p2m_mid_missing_mfn) {
464 /* Separately check the mid mfn level */
465 unsigned long missing_mfn;
466 unsigned long mid_mfn_mfn;
467
468 mid_mfn = alloc_p2m_page();
469 if (!mid_mfn)
470 return false;
471
472 p2m_mid_mfn_init(mid_mfn);
473
474 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
475 mid_mfn_mfn = virt_to_mfn(mid_mfn);
476 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
477 free_p2m_page(mid_mfn);
478 else
479 p2m_top_mfn_p[topidx] = mid_mfn;
480 }
481
482 if (p2m_top[topidx][mididx] == p2m_missing) {
483 /* p2m leaf page is missing */
484 unsigned long *p2m;
485
486 p2m = alloc_p2m_page();
487 if (!p2m)
488 return false;
489
490 p2m_init(p2m);
491
492 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
493 free_p2m_page(p2m);
494 else
495 mid_mfn[mididx] = virt_to_mfn(p2m);
496 }
497
498 return true;
499}
500
501/* Try to install p2m mapping; fail if intermediate bits missing */
502bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
503{
504 unsigned topidx, mididx, idx;
505
506 if (unlikely(pfn >= MAX_P2M_PFN)) {
507 BUG_ON(mfn != INVALID_P2M_ENTRY);
508 return true;
509 }
510
511 topidx = p2m_top_index(pfn);
512 mididx = p2m_mid_index(pfn);
513 idx = p2m_index(pfn);
514
515 if (p2m_top[topidx][mididx] == p2m_missing)
516 return mfn == INVALID_P2M_ENTRY;
517
518 p2m_top[topidx][mididx][idx] = mfn;
519
520 return true;
521}
522
523bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
524{
525 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
526 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
527 return true;
528 }
529
530 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
531 if (!alloc_p2m(pfn))
532 return false;
533
534 if (!__set_phys_to_machine(pfn, mfn))
535 return false;
536 }
537
538 return true;
539}
540
541unsigned long arbitrary_virt_to_mfn(void *vaddr) 176unsigned long arbitrary_virt_to_mfn(void *vaddr)
542{ 177{
543 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); 178 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 000000000000..8f2251d2a3f8
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,510 @@
1/*
2 * Xen leaves the responsibility for maintaining p2m mappings to the
3 * guests themselves, but it must also access and update the p2m array
4 * during suspend/resume when all the pages are reallocated.
5 *
6 * The p2m table is logically a flat array, but we implement it as a
7 * three-level tree to allow the address space to be sparse.
8 *
9 * Xen
10 * |
11 * p2m_top p2m_top_mfn
12 * / \ / \
13 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
14 * / \ / \ / /
15 * p2m p2m p2m p2m p2m p2m p2m ...
16 *
17 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
18 *
19 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
20 * maximum representable pseudo-physical address space is:
21 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
22 *
23 * P2M_PER_PAGE depends on the architecture, as a mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
25 * 512 and 1024 entries respectively.
26 */
27
28#include <linux/init.h>
29#include <linux/module.h>
30#include <linux/list.h>
31#include <linux/hash.h>
32#include <linux/sched.h>
33
34#include <asm/cache.h>
35#include <asm/setup.h>
36
37#include <asm/xen/page.h>
38#include <asm/xen/hypercall.h>
39#include <asm/xen/hypervisor.h>
40
41#include "xen-ops.h"
42
43static void __init m2p_override_init(void);
44
45unsigned long xen_max_p2m_pfn __read_mostly;
46
47#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
48#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
49#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
50
51#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
52
53/* Placeholders for holes in the address space */
54static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
55static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
56static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
57
58static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
59static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
60static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
61
62RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
63RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
64
65static inline unsigned p2m_top_index(unsigned long pfn)
66{
67 BUG_ON(pfn >= MAX_P2M_PFN);
68 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
69}
70
71static inline unsigned p2m_mid_index(unsigned long pfn)
72{
73 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
74}
75
76static inline unsigned p2m_index(unsigned long pfn)
77{
78 return pfn % P2M_PER_PAGE;
79}
80
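The three index helpers above decompose a pfn exactly as the header comment describes. A hedged user-space illustration of the arithmetic for the 64-bit layout (4096-byte pages holding 8-byte entries, hence 512 entries per level) is shown below; the sample pfn is arbitrary:

	#include <stdio.h>

	#define PER_PAGE 512UL	/* assumed 64-bit case: 4096 / sizeof(unsigned long) */

	int main(void)
	{
		unsigned long pfn = 0x123456;	/* arbitrary pseudo-physical frame */
		unsigned long topidx = pfn / (PER_PAGE * PER_PAGE);
		unsigned long mididx = (pfn / PER_PAGE) % PER_PAGE;
		unsigned long idx    = pfn % PER_PAGE;

		printf("pfn 0x%lx -> top %lu, mid %lu, leaf %lu\n",
		       pfn, topidx, mididx, idx);

		/* Maximum representable space: 512^3 frames of 4 KB, i.e. 512 GB. */
		printf("max representable pfns: %lu\n",
		       PER_PAGE * PER_PAGE * PER_PAGE);
		return 0;
	}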
81static void p2m_top_init(unsigned long ***top)
82{
83 unsigned i;
84
85 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
86 top[i] = p2m_mid_missing;
87}
88
89static void p2m_top_mfn_init(unsigned long *top)
90{
91 unsigned i;
92
93 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
94 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
95}
96
97static void p2m_top_mfn_p_init(unsigned long **top)
98{
99 unsigned i;
100
101 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
102 top[i] = p2m_mid_missing_mfn;
103}
104
105static void p2m_mid_init(unsigned long **mid)
106{
107 unsigned i;
108
109 for (i = 0; i < P2M_MID_PER_PAGE; i++)
110 mid[i] = p2m_missing;
111}
112
113static void p2m_mid_mfn_init(unsigned long *mid)
114{
115 unsigned i;
116
117 for (i = 0; i < P2M_MID_PER_PAGE; i++)
118 mid[i] = virt_to_mfn(p2m_missing);
119}
120
121static void p2m_init(unsigned long *p2m)
122{
123 unsigned i;
124
125 for (i = 0; i < P2M_MID_PER_PAGE; i++)
126 p2m[i] = INVALID_P2M_ENTRY;
127}
128
129/*
130 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
131 *
132 * This is called both at boot time, and after resuming from suspend:
133 * - At boot time we're called very early, and must use extend_brk()
134 * to allocate memory.
135 *
136 * - After resume we're called from within stop_machine, but the mfn
137 * tree should already be completely allocated.
138 */
139void xen_build_mfn_list_list(void)
140{
141 unsigned long pfn;
142
143 /* Pre-initialize p2m_top_mfn to be completely missing */
144 if (p2m_top_mfn == NULL) {
145 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
146 p2m_mid_mfn_init(p2m_mid_missing_mfn);
147
148 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
149 p2m_top_mfn_p_init(p2m_top_mfn_p);
150
151 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
152 p2m_top_mfn_init(p2m_top_mfn);
153 } else {
154 /* Reinitialise, mfn's all change after migration */
155 p2m_mid_mfn_init(p2m_mid_missing_mfn);
156 }
157
158 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
159 unsigned topidx = p2m_top_index(pfn);
160 unsigned mididx = p2m_mid_index(pfn);
161 unsigned long **mid;
162 unsigned long *mid_mfn_p;
163
164 mid = p2m_top[topidx];
165 mid_mfn_p = p2m_top_mfn_p[topidx];
166
167 /* Don't bother allocating any mfn mid levels if
168 * they're just missing, just update the stored mfn,
169 * since all could have changed over a migrate.
170 */
171 if (mid == p2m_mid_missing) {
172 BUG_ON(mididx);
173 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
174 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
175 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
176 continue;
177 }
178
179 if (mid_mfn_p == p2m_mid_missing_mfn) {
180 /*
181 * XXX boot-time only! We should never find
182 * missing parts of the mfn tree after
183 * runtime. extend_brk() will BUG if we call
184 * it too late.
185 */
186 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
187 p2m_mid_mfn_init(mid_mfn_p);
188
189 p2m_top_mfn_p[topidx] = mid_mfn_p;
190 }
191
192 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
193 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
194 }
195}
196
197void xen_setup_mfn_list_list(void)
198{
199 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
200
201 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
202 virt_to_mfn(p2m_top_mfn);
203 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
204}
205
206/* Set up p2m_top to point to the domain-builder provided p2m pages */
207void __init xen_build_dynamic_phys_to_machine(void)
208{
209 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
210 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
211 unsigned long pfn;
212
213 xen_max_p2m_pfn = max_pfn;
214
215 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
216 p2m_init(p2m_missing);
217
218 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
219 p2m_mid_init(p2m_mid_missing);
220
221 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
222 p2m_top_init(p2m_top);
223
224 /*
225 * The domain builder gives us a pre-constructed p2m array in
226 * mfn_list for all the pages initially given to us, so we just
227 * need to graft that into our tree structure.
228 */
229 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
230 unsigned topidx = p2m_top_index(pfn);
231 unsigned mididx = p2m_mid_index(pfn);
232
233 if (p2m_top[topidx] == p2m_mid_missing) {
234 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
235 p2m_mid_init(mid);
236
237 p2m_top[topidx] = mid;
238 }
239
240 p2m_top[topidx][mididx] = &mfn_list[pfn];
241 }
242
243 m2p_override_init();
244}
245
246unsigned long get_phys_to_machine(unsigned long pfn)
247{
248 unsigned topidx, mididx, idx;
249
250 if (unlikely(pfn >= MAX_P2M_PFN))
251 return INVALID_P2M_ENTRY;
252
253 topidx = p2m_top_index(pfn);
254 mididx = p2m_mid_index(pfn);
255 idx = p2m_index(pfn);
256
257 return p2m_top[topidx][mididx][idx];
258}
259EXPORT_SYMBOL_GPL(get_phys_to_machine);
260
261static void *alloc_p2m_page(void)
262{
263 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
264}
265
266static void free_p2m_page(void *p)
267{
268 free_page((unsigned long)p);
269}
270
271/*
272 * Fully allocate the p2m structure for a given pfn. We need to check
273 * that both the top and mid levels are allocated, and make sure the
274 * parallel mfn tree is kept in sync. We may race with other cpus, so
275 * the new pages are installed with cmpxchg; if we lose the race then
276 * simply free the page we allocated and use the one that's there.
277 */
278static bool alloc_p2m(unsigned long pfn)
279{
280 unsigned topidx, mididx;
281 unsigned long ***top_p, **mid;
282 unsigned long *top_mfn_p, *mid_mfn;
283
284 topidx = p2m_top_index(pfn);
285 mididx = p2m_mid_index(pfn);
286
287 top_p = &p2m_top[topidx];
288 mid = *top_p;
289
290 if (mid == p2m_mid_missing) {
291 /* Mid level is missing, allocate a new one */
292 mid = alloc_p2m_page();
293 if (!mid)
294 return false;
295
296 p2m_mid_init(mid);
297
298 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
299 free_p2m_page(mid);
300 }
301
302 top_mfn_p = &p2m_top_mfn[topidx];
303 mid_mfn = p2m_top_mfn_p[topidx];
304
305 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
306
307 if (mid_mfn == p2m_mid_missing_mfn) {
308 /* Separately check the mid mfn level */
309 unsigned long missing_mfn;
310 unsigned long mid_mfn_mfn;
311
312 mid_mfn = alloc_p2m_page();
313 if (!mid_mfn)
314 return false;
315
316 p2m_mid_mfn_init(mid_mfn);
317
318 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
319 mid_mfn_mfn = virt_to_mfn(mid_mfn);
320 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
321 free_p2m_page(mid_mfn);
322 else
323 p2m_top_mfn_p[topidx] = mid_mfn;
324 }
325
326 if (p2m_top[topidx][mididx] == p2m_missing) {
327 /* p2m leaf page is missing */
328 unsigned long *p2m;
329
330 p2m = alloc_p2m_page();
331 if (!p2m)
332 return false;
333
334 p2m_init(p2m);
335
336 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
337 free_p2m_page(p2m);
338 else
339 mid_mfn[mididx] = virt_to_mfn(p2m);
340 }
341
342 return true;
343}
344
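The comment before alloc_p2m() describes the lock-free pattern used at every level: allocate a candidate page, publish it with cmpxchg against the shared "missing" placeholder, and free the candidate if another CPU won the race. A compact, hedged user-space rendering of that pattern, using GCC's __sync builtin in place of the kernel's cmpxchg() and purely illustrative names, is:

	#include <stdlib.h>

	/* Stand-ins for p2m_mid_missing and p2m_top[topidx]; illustrative only. */
	static void *level_missing = (void *)0x1;
	static void *level_slot = (void *)0x1;

	static void *install_level(void)
	{
		void *candidate = malloc(4096);
		if (!candidate)
			return NULL;

		/* Publish only if the slot still holds the "missing" placeholder. */
		if (!__sync_bool_compare_and_swap(&level_slot, level_missing, candidate)) {
			free(candidate);	/* lost the race ...             */
			candidate = level_slot;	/* ... so use the winner's page  */
		}
		return candidate;
	}

	int main(void)
	{
		return install_level() ? 0 : 1;	/* exit 0 unless allocation failed */
	}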
345/* Try to install p2m mapping; fail if intermediate bits missing */
346bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
347{
348 unsigned topidx, mididx, idx;
349
350 if (unlikely(pfn >= MAX_P2M_PFN)) {
351 BUG_ON(mfn != INVALID_P2M_ENTRY);
352 return true;
353 }
354
355 topidx = p2m_top_index(pfn);
356 mididx = p2m_mid_index(pfn);
357 idx = p2m_index(pfn);
358
359 if (p2m_top[topidx][mididx] == p2m_missing)
360 return mfn == INVALID_P2M_ENTRY;
361
362 p2m_top[topidx][mididx][idx] = mfn;
363
364 return true;
365}
366
367bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
368{
369 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
370 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
371 return true;
372 }
373
374 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
375 if (!alloc_p2m(pfn))
376 return false;
377
378 if (!__set_phys_to_machine(pfn, mfn))
379 return false;
380 }
381
382 return true;
383}
384
385#define M2P_OVERRIDE_HASH_SHIFT 10
386#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
387
388static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
389static DEFINE_SPINLOCK(m2p_override_lock);
390
391static void __init m2p_override_init(void)
392{
393 unsigned i;
394
395 m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
396 sizeof(unsigned long));
397
398 for (i = 0; i < M2P_OVERRIDE_HASH; i++)
399 INIT_LIST_HEAD(&m2p_overrides[i]);
400}
401
402static unsigned long mfn_hash(unsigned long mfn)
403{
404 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
405}
406
407/* Add an MFN override for a particular page */
408int m2p_add_override(unsigned long mfn, struct page *page)
409{
410 unsigned long flags;
411 unsigned long pfn;
412 unsigned long address;
413 unsigned level;
414 pte_t *ptep = NULL;
415
416 pfn = page_to_pfn(page);
417 if (!PageHighMem(page)) {
418 address = (unsigned long)__va(pfn << PAGE_SHIFT);
419 ptep = lookup_address(address, &level);
420
421 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
422 "m2p_add_override: pfn %lx not mapped", pfn))
423 return -EINVAL;
424 }
425
426 page->private = mfn;
427 page->index = pfn_to_mfn(pfn);
428
429 __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
430 if (!PageHighMem(page))
431 /* Just zap old mapping for now */
432 pte_clear(&init_mm, address, ptep);
433
434 spin_lock_irqsave(&m2p_override_lock, flags);
435 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
436 spin_unlock_irqrestore(&m2p_override_lock, flags);
437
438 return 0;
439}
440
441int m2p_remove_override(struct page *page)
442{
443 unsigned long flags;
444 unsigned long mfn;
445 unsigned long pfn;
446 unsigned long address;
447 unsigned level;
448 pte_t *ptep = NULL;
449
450 pfn = page_to_pfn(page);
451 mfn = get_phys_to_machine(pfn);
452 if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
453 return -EINVAL;
454
455 if (!PageHighMem(page)) {
456 address = (unsigned long)__va(pfn << PAGE_SHIFT);
457 ptep = lookup_address(address, &level);
458
459 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
460 "m2p_remove_override: pfn %lx not mapped", pfn))
461 return -EINVAL;
462 }
463
464 spin_lock_irqsave(&m2p_override_lock, flags);
465 list_del(&page->lru);
466 spin_unlock_irqrestore(&m2p_override_lock, flags);
467 __set_phys_to_machine(pfn, page->index);
468
469 if (!PageHighMem(page))
470 set_pte_at(&init_mm, address, ptep,
471 pfn_pte(pfn, PAGE_KERNEL));
472 /* No tlb flush necessary because the caller already
473 * left the pte unmapped. */
474
475 return 0;
476}
477
478struct page *m2p_find_override(unsigned long mfn)
479{
480 unsigned long flags;
481 struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
482 struct page *p, *ret;
483
484 ret = NULL;
485
486 spin_lock_irqsave(&m2p_override_lock, flags);
487
488 list_for_each_entry(p, bucket, lru) {
489 if (p->private == mfn) {
490 ret = p;
491 break;
492 }
493 }
494
495 spin_unlock_irqrestore(&m2p_override_lock, flags);
496
497 return ret;
498}
499
500unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
501{
502 struct page *p = m2p_find_override(mfn);
503 unsigned long ret = pfn;
504
505 if (p)
506 ret = page_to_pfn(p);
507
508 return ret;
509}
510EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
diff --git a/arch/xtensa/include/asm/mman.h b/arch/xtensa/include/asm/mman.h
index fca4db425f6e..30789010733d 100644
--- a/arch/xtensa/include/asm/mman.h
+++ b/arch/xtensa/include/asm/mman.h
@@ -83,6 +83,9 @@
83#define MADV_MERGEABLE 12 /* KSM may merge identical pages */ 83#define MADV_MERGEABLE 12 /* KSM may merge identical pages */
84#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ 84#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
85 85
86#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */
87#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */
88
86/* compatibility flags */ 89/* compatibility flags */
87#define MAP_FILE 0 90#define MAP_FILE 0
88 91
diff --git a/drivers/base/node.c b/drivers/base/node.c
index ce012a9c6201..36b43052001d 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -117,12 +117,21 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
117 "Node %d WritebackTmp: %8lu kB\n" 117 "Node %d WritebackTmp: %8lu kB\n"
118 "Node %d Slab: %8lu kB\n" 118 "Node %d Slab: %8lu kB\n"
119 "Node %d SReclaimable: %8lu kB\n" 119 "Node %d SReclaimable: %8lu kB\n"
120 "Node %d SUnreclaim: %8lu kB\n", 120 "Node %d SUnreclaim: %8lu kB\n"
121#ifdef CONFIG_TRANSPARENT_HUGEPAGE
122 "Node %d AnonHugePages: %8lu kB\n"
123#endif
124 ,
121 nid, K(node_page_state(nid, NR_FILE_DIRTY)), 125 nid, K(node_page_state(nid, NR_FILE_DIRTY)),
122 nid, K(node_page_state(nid, NR_WRITEBACK)), 126 nid, K(node_page_state(nid, NR_WRITEBACK)),
123 nid, K(node_page_state(nid, NR_FILE_PAGES)), 127 nid, K(node_page_state(nid, NR_FILE_PAGES)),
124 nid, K(node_page_state(nid, NR_FILE_MAPPED)), 128 nid, K(node_page_state(nid, NR_FILE_MAPPED)),
125 nid, K(node_page_state(nid, NR_ANON_PAGES)), 129 nid, K(node_page_state(nid, NR_ANON_PAGES)
130#ifdef CONFIG_TRANSPARENT_HUGEPAGE
131 + node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) *
132 HPAGE_PMD_NR
133#endif
134 ),
126 nid, K(node_page_state(nid, NR_SHMEM)), 135 nid, K(node_page_state(nid, NR_SHMEM)),
127 nid, node_page_state(nid, NR_KERNEL_STACK) * 136 nid, node_page_state(nid, NR_KERNEL_STACK) *
128 THREAD_SIZE / 1024, 137 THREAD_SIZE / 1024,
@@ -133,7 +142,13 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
133 nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) + 142 nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
134 node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), 143 node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
135 nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)), 144 nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
136 nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); 145 nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))
146#ifdef CONFIG_TRANSPARENT_HUGEPAGE
147 , nid,
148 K(node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) *
149 HPAGE_PMD_NR)
150#endif
151 );
137 n += hugetlb_report_node_meminfo(nid, buf + n); 152 n += hugetlb_report_node_meminfo(nid, buf + n);
138 return n; 153 return n;
139} 154}
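The new AnonHugePages field above counts NR_ANON_TRANSPARENT_HUGEPAGES in units of huge pages and multiplies by HPAGE_PMD_NR so the existing K() pages-to-kB conversion still applies; the same product is folded into AnonPages so that figure keeps covering all anonymous memory. As a hedged worked example, assuming x86 with 4 KB base pages and 2 MB PMD-sized huge pages (HPAGE_PMD_NR = 512), a node holding 3 transparent huge pages would report:

	3 huge pages * 512 base pages * 4 kB = 6144 kB of AnonHugePages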
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf1a95e31559..98d9ec85e0eb 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -240,6 +240,30 @@ config DM_MIRROR
240 Allow volume managers to mirror logical volumes, also 240 Allow volume managers to mirror logical volumes, also
241 needed for live data migration tools such as 'pvmove'. 241 needed for live data migration tools such as 'pvmove'.
242 242
243config DM_RAID
244 tristate "RAID 4/5/6 target (EXPERIMENTAL)"
245 depends on BLK_DEV_DM && EXPERIMENTAL
246 select MD_RAID456
247 select BLK_DEV_MD
248 ---help---
249 A dm target that supports RAID4, RAID5 and RAID6 mappings
250
251 A RAID-5 set of N drives with a capacity of C MB per drive provides
252 the capacity of C * (N - 1) MB, and protects against a failure
253 of a single drive. For a given sector (row) number, (N - 1) drives
254 contain data sectors, and one drive contains the parity protection.
255 For a RAID-4 set, the parity blocks are present on a single drive,
256 while a RAID-5 set distributes the parity across the drives in one
257 of the available parity distribution methods.
258
259 A RAID-6 set of N drives with a capacity of C MB per drive
260 provides the capacity of C * (N - 2) MB, and protects
261 against a failure of any two drives. For a given sector
262 (row) number, (N - 2) drives contain data sectors, and two
263 drives contain two independent redundancy syndromes. Like
264 RAID-5, RAID-6 distributes the syndromes across the drives
265 in one of the available parity distribution methods.
266
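A hedged worked example of the capacity formulas in the help text above, with purely illustrative numbers: for N = 5 drives of C = 1000 MB each,

	RAID-5: C * (N - 1) = 1000 * 4 = 4000 MB usable, tolerating one failed drive
	RAID-6: C * (N - 2) = 1000 * 3 = 3000 MB usable, tolerating any two failed drives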
243config DM_LOG_USERSPACE 267config DM_LOG_USERSPACE
244 tristate "Mirror userspace logging (EXPERIMENTAL)" 268 tristate "Mirror userspace logging (EXPERIMENTAL)"
245 depends on DM_MIRROR && EXPERIMENTAL && NET 269 depends on DM_MIRROR && EXPERIMENTAL && NET
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5e3aac41919d..d0138606c2e8 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
36obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o 36obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
37obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o 37obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
38obj-$(CONFIG_DM_ZERO) += dm-zero.o 38obj-$(CONFIG_DM_ZERO) += dm-zero.o
39obj-$(CONFIG_DM_RAID) += dm-raid.o
39 40
40ifeq ($(CONFIG_DM_UEVENT),y) 41ifeq ($(CONFIG_DM_UEVENT),y)
41dm-mod-objs += dm-uevent.o 42dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5a1ffe3527aa..9a35320fb59f 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
210 || test_bit(Faulty, &rdev->flags)) 210 || test_bit(Faulty, &rdev->flags))
211 continue; 211 continue;
212 212
213 target = rdev->sb_start + offset + index * (PAGE_SIZE/512); 213 target = offset + index * (PAGE_SIZE/512);
214 214
215 if (sync_page_io(rdev, target, 215 if (sync_page_io(rdev, target,
216 roundup(size, bdev_logical_block_size(rdev->bdev)), 216 roundup(size, bdev_logical_block_size(rdev->bdev)),
217 page, READ)) { 217 page, READ, true)) {
218 page->index = index; 218 page->index = index;
219 attach_page_buffers(page, NULL); /* so that free_buffer will 219 attach_page_buffers(page, NULL); /* so that free_buffer will
220 * quietly no-op */ 220 * quietly no-op */
@@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
264static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) 264static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
265{ 265{
266 mdk_rdev_t *rdev = NULL; 266 mdk_rdev_t *rdev = NULL;
267 struct block_device *bdev;
267 mddev_t *mddev = bitmap->mddev; 268 mddev_t *mddev = bitmap->mddev;
268 269
269 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { 270 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
270 int size = PAGE_SIZE; 271 int size = PAGE_SIZE;
271 loff_t offset = mddev->bitmap_info.offset; 272 loff_t offset = mddev->bitmap_info.offset;
273
274 bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
275
272 if (page->index == bitmap->file_pages-1) 276 if (page->index == bitmap->file_pages-1)
273 size = roundup(bitmap->last_page_size, 277 size = roundup(bitmap->last_page_size,
274 bdev_logical_block_size(rdev->bdev)); 278 bdev_logical_block_size(bdev));
275 /* Just make sure we aren't corrupting data or 279 /* Just make sure we aren't corrupting data or
276 * metadata 280 * metadata
277 */ 281 */
@@ -1542,7 +1546,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1542 wait_event(bitmap->mddev->recovery_wait, 1546 wait_event(bitmap->mddev->recovery_wait,
1543 atomic_read(&bitmap->mddev->recovery_active) == 0); 1547 atomic_read(&bitmap->mddev->recovery_active) == 0);
1544 1548
1545 bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; 1549 bitmap->mddev->curr_resync_completed = sector;
1546 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); 1550 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
1547 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); 1551 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
1548 s = 0; 1552 s = 0;
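
Two related changes here: read_sb_page() now passes an offset relative to the superblock (the extra boolean argument to sync_page_io() appears to mark the I/O as a metadata operation, so the rdev->sb_start adjustment moves into that helper), and write_sb_page() sizes its final page against the metadata device when rdev->meta_bdev is set. The rounding itself is plain integer arithmetic; a small sketch, assuming a hypothetical 3000-byte last page and 512- or 4096-byte logical blocks:

#include <stdio.h>

/* same rounding as roundup(size, bdev_logical_block_size(bdev)) in the hunk */
#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned last_page_bytes = 3000;   /* hypothetical bitmap->last_page_size */

	printf("512B  blocks: write %u bytes\n", ROUNDUP(last_page_bytes, 512));
	printf("4096B blocks: write %u bytes\n", ROUNDUP(last_page_bytes, 4096));
	return 0;
}
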
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d5b0e4c0e702..4e054bd91664 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,10 +18,14 @@
18#include <linux/crypto.h> 18#include <linux/crypto.h>
19#include <linux/workqueue.h> 19#include <linux/workqueue.h>
20#include <linux/backing-dev.h> 20#include <linux/backing-dev.h>
21#include <linux/percpu.h>
21#include <asm/atomic.h> 22#include <asm/atomic.h>
22#include <linux/scatterlist.h> 23#include <linux/scatterlist.h>
23#include <asm/page.h> 24#include <asm/page.h>
24#include <asm/unaligned.h> 25#include <asm/unaligned.h>
26#include <crypto/hash.h>
27#include <crypto/md5.h>
28#include <crypto/algapi.h>
25 29
26#include <linux/device-mapper.h> 30#include <linux/device-mapper.h>
27 31
@@ -63,6 +67,7 @@ struct dm_crypt_request {
63 struct convert_context *ctx; 67 struct convert_context *ctx;
64 struct scatterlist sg_in; 68 struct scatterlist sg_in;
65 struct scatterlist sg_out; 69 struct scatterlist sg_out;
70 sector_t iv_sector;
66}; 71};
67 72
68struct crypt_config; 73struct crypt_config;
@@ -73,11 +78,13 @@ struct crypt_iv_operations {
73 void (*dtr)(struct crypt_config *cc); 78 void (*dtr)(struct crypt_config *cc);
74 int (*init)(struct crypt_config *cc); 79 int (*init)(struct crypt_config *cc);
75 int (*wipe)(struct crypt_config *cc); 80 int (*wipe)(struct crypt_config *cc);
76 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); 81 int (*generator)(struct crypt_config *cc, u8 *iv,
82 struct dm_crypt_request *dmreq);
83 int (*post)(struct crypt_config *cc, u8 *iv,
84 struct dm_crypt_request *dmreq);
77}; 85};
78 86
79struct iv_essiv_private { 87struct iv_essiv_private {
80 struct crypto_cipher *tfm;
81 struct crypto_hash *hash_tfm; 88 struct crypto_hash *hash_tfm;
82 u8 *salt; 89 u8 *salt;
83}; 90};
@@ -86,11 +93,32 @@ struct iv_benbi_private {
86 int shift; 93 int shift;
87}; 94};
88 95
96#define LMK_SEED_SIZE 64 /* hash + 0 */
97struct iv_lmk_private {
98 struct crypto_shash *hash_tfm;
99 u8 *seed;
100};
101
89/* 102/*
90 * Crypt: maps a linear range of a block device 103 * Crypt: maps a linear range of a block device
91 * and encrypts / decrypts at the same time. 104 * and encrypts / decrypts at the same time.
92 */ 105 */
93enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; 106enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
107
108/*
109 * Duplicated per-CPU state for cipher.
110 */
111struct crypt_cpu {
112 struct ablkcipher_request *req;
113 /* ESSIV: struct crypto_cipher *essiv_tfm */
114 void *iv_private;
115 struct crypto_ablkcipher *tfms[0];
116};
117
118/*
119 * The fields in here must be read only after initialization;
120 * any state that changes belongs in struct crypt_cpu.
121 */
94struct crypt_config { 122struct crypt_config {
95 struct dm_dev *dev; 123 struct dm_dev *dev;
96 sector_t start; 124 sector_t start;
@@ -108,17 +136,25 @@ struct crypt_config {
108 struct workqueue_struct *crypt_queue; 136 struct workqueue_struct *crypt_queue;
109 137
110 char *cipher; 138 char *cipher;
111 char *cipher_mode; 139 char *cipher_string;
112 140
113 struct crypt_iv_operations *iv_gen_ops; 141 struct crypt_iv_operations *iv_gen_ops;
114 union { 142 union {
115 struct iv_essiv_private essiv; 143 struct iv_essiv_private essiv;
116 struct iv_benbi_private benbi; 144 struct iv_benbi_private benbi;
145 struct iv_lmk_private lmk;
117 } iv_gen_private; 146 } iv_gen_private;
118 sector_t iv_offset; 147 sector_t iv_offset;
119 unsigned int iv_size; 148 unsigned int iv_size;
120 149
121 /* 150 /*
151 * Duplicated per cpu state. Access through
152 * per_cpu_ptr() only.
153 */
154 struct crypt_cpu __percpu *cpu;
155 unsigned tfms_count;
156
157 /*
122 * Layout of each crypto request: 158 * Layout of each crypto request:
123 * 159 *
124 * struct ablkcipher_request 160 * struct ablkcipher_request
@@ -132,11 +168,10 @@ struct crypt_config {
132 * correctly aligned. 168 * correctly aligned.
133 */ 169 */
134 unsigned int dmreq_start; 170 unsigned int dmreq_start;
135 struct ablkcipher_request *req;
136 171
137 struct crypto_ablkcipher *tfm;
138 unsigned long flags; 172 unsigned long flags;
139 unsigned int key_size; 173 unsigned int key_size;
174 unsigned int key_parts;
140 u8 key[0]; 175 u8 key[0];
141}; 176};
142 177
@@ -148,6 +183,20 @@ static struct kmem_cache *_crypt_io_pool;
148 183
149static void clone_init(struct dm_crypt_io *, struct bio *); 184static void clone_init(struct dm_crypt_io *, struct bio *);
150static void kcryptd_queue_crypt(struct dm_crypt_io *io); 185static void kcryptd_queue_crypt(struct dm_crypt_io *io);
186static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
187
188static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
189{
190 return this_cpu_ptr(cc->cpu);
191}
192
193/*
194 * Use this to access cipher attributes that are the same for each CPU.
195 */
196static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
197{
198 return __this_cpu_ptr(cc->cpu)->tfms[0];
199}
151 200
152/* 201/*
153 * Different IV generation algorithms: 202 * Different IV generation algorithms:
@@ -168,23 +217,38 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
168 * null: the initial vector is always zero. Provides compatibility with 217 * null: the initial vector is always zero. Provides compatibility with
169 * obsolete loop_fish2 devices. Do not use for new devices. 218 * obsolete loop_fish2 devices. Do not use for new devices.
170 * 219 *
220 * lmk: Compatible implementation of the block chaining mode used
221 * by the Loop-AES block device encryption system
222 * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/
223 * It operates on full 512 byte sectors and uses CBC
224 * with an IV derived from the sector number, the data and
225 * optionally an extra IV seed.
226 * This means that after decryption the first block
227 * of the sector must be tweaked according to the decrypted data.
228 * Loop-AES can use three encryption schemes:
229 * version 1: plain aes-cbc mode
230 * version 2: uses a 64-key (multikey) scheme with the lmk IV generator
231 * version 3: the same as version 2 with an additional IV seed
232 * (it uses 65 keys; the last key is used as the IV seed)
233 *
171 * plumb: unimplemented, see: 234 * plumb: unimplemented, see:
172 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 235 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
173 */ 236 */
174 237
175static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 238static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
239 struct dm_crypt_request *dmreq)
176{ 240{
177 memset(iv, 0, cc->iv_size); 241 memset(iv, 0, cc->iv_size);
178 *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); 242 *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
179 243
180 return 0; 244 return 0;
181} 245}
182 246
183static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, 247static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
184 sector_t sector) 248 struct dm_crypt_request *dmreq)
185{ 249{
186 memset(iv, 0, cc->iv_size); 250 memset(iv, 0, cc->iv_size);
187 *(u64 *)iv = cpu_to_le64(sector); 251 *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
188 252
189 return 0; 253 return 0;
190} 254}
@@ -195,7 +259,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
195 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 259 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
196 struct hash_desc desc; 260 struct hash_desc desc;
197 struct scatterlist sg; 261 struct scatterlist sg;
198 int err; 262 struct crypto_cipher *essiv_tfm;
263 int err, cpu;
199 264
200 sg_init_one(&sg, cc->key, cc->key_size); 265 sg_init_one(&sg, cc->key, cc->key_size);
201 desc.tfm = essiv->hash_tfm; 266 desc.tfm = essiv->hash_tfm;
@@ -205,8 +270,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
205 if (err) 270 if (err)
206 return err; 271 return err;
207 272
208 return crypto_cipher_setkey(essiv->tfm, essiv->salt, 273 for_each_possible_cpu(cpu) {
274 essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private,
275
276 err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
209 crypto_hash_digestsize(essiv->hash_tfm)); 277 crypto_hash_digestsize(essiv->hash_tfm));
278 if (err)
279 return err;
280 }
281
282 return 0;
210} 283}
211 284
212/* Wipe salt and reset key derived from volume key */ 285/* Wipe salt and reset key derived from volume key */
@@ -214,24 +287,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
214{ 287{
215 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 288 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
216 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); 289 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
290 struct crypto_cipher *essiv_tfm;
291 int cpu, r, err = 0;
217 292
218 memset(essiv->salt, 0, salt_size); 293 memset(essiv->salt, 0, salt_size);
219 294
220 return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); 295 for_each_possible_cpu(cpu) {
296 essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private;
297 r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
298 if (r)
299 err = r;
300 }
301
302 return err;
303}
304
305/* Set up per cpu cipher state */
306static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
307 struct dm_target *ti,
308 u8 *salt, unsigned saltsize)
309{
310 struct crypto_cipher *essiv_tfm;
311 int err;
312
313 /* Setup the essiv_tfm with the given salt */
314 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
315 if (IS_ERR(essiv_tfm)) {
316 ti->error = "Error allocating crypto tfm for ESSIV";
317 return essiv_tfm;
318 }
319
320 if (crypto_cipher_blocksize(essiv_tfm) !=
321 crypto_ablkcipher_ivsize(any_tfm(cc))) {
322 ti->error = "Block size of ESSIV cipher does "
323 "not match IV size of block cipher";
324 crypto_free_cipher(essiv_tfm);
325 return ERR_PTR(-EINVAL);
326 }
327
328 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
329 if (err) {
330 ti->error = "Failed to set key for ESSIV cipher";
331 crypto_free_cipher(essiv_tfm);
332 return ERR_PTR(err);
333 }
334
335 return essiv_tfm;
221} 336}
222 337
223static void crypt_iv_essiv_dtr(struct crypt_config *cc) 338static void crypt_iv_essiv_dtr(struct crypt_config *cc)
224{ 339{
340 int cpu;
341 struct crypt_cpu *cpu_cc;
342 struct crypto_cipher *essiv_tfm;
225 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 343 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
226 344
227 crypto_free_cipher(essiv->tfm);
228 essiv->tfm = NULL;
229
230 crypto_free_hash(essiv->hash_tfm); 345 crypto_free_hash(essiv->hash_tfm);
231 essiv->hash_tfm = NULL; 346 essiv->hash_tfm = NULL;
232 347
233 kzfree(essiv->salt); 348 kzfree(essiv->salt);
234 essiv->salt = NULL; 349 essiv->salt = NULL;
350
351 for_each_possible_cpu(cpu) {
352 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
353 essiv_tfm = cpu_cc->iv_private;
354
355 if (essiv_tfm)
356 crypto_free_cipher(essiv_tfm);
357
358 cpu_cc->iv_private = NULL;
359 }
235} 360}
236 361
237static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 362static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -240,7 +365,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
240 struct crypto_cipher *essiv_tfm = NULL; 365 struct crypto_cipher *essiv_tfm = NULL;
241 struct crypto_hash *hash_tfm = NULL; 366 struct crypto_hash *hash_tfm = NULL;
242 u8 *salt = NULL; 367 u8 *salt = NULL;
243 int err; 368 int err, cpu;
244 369
245 if (!opts) { 370 if (!opts) {
246 ti->error = "Digest algorithm missing for ESSIV mode"; 371 ti->error = "Digest algorithm missing for ESSIV mode";
@@ -262,48 +387,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
262 goto bad; 387 goto bad;
263 } 388 }
264 389
265 /* Allocate essiv_tfm */
266 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
267 if (IS_ERR(essiv_tfm)) {
268 ti->error = "Error allocating crypto tfm for ESSIV";
269 err = PTR_ERR(essiv_tfm);
270 goto bad;
271 }
272 if (crypto_cipher_blocksize(essiv_tfm) !=
273 crypto_ablkcipher_ivsize(cc->tfm)) {
274 ti->error = "Block size of ESSIV cipher does "
275 "not match IV size of block cipher";
276 err = -EINVAL;
277 goto bad;
278 }
279
280 cc->iv_gen_private.essiv.salt = salt; 390 cc->iv_gen_private.essiv.salt = salt;
281 cc->iv_gen_private.essiv.tfm = essiv_tfm;
282 cc->iv_gen_private.essiv.hash_tfm = hash_tfm; 391 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
283 392
393 for_each_possible_cpu(cpu) {
394 essiv_tfm = setup_essiv_cpu(cc, ti, salt,
395 crypto_hash_digestsize(hash_tfm));
396 if (IS_ERR(essiv_tfm)) {
397 crypt_iv_essiv_dtr(cc);
398 return PTR_ERR(essiv_tfm);
399 }
400 per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm;
401 }
402
284 return 0; 403 return 0;
285 404
286bad: 405bad:
287 if (essiv_tfm && !IS_ERR(essiv_tfm))
288 crypto_free_cipher(essiv_tfm);
289 if (hash_tfm && !IS_ERR(hash_tfm)) 406 if (hash_tfm && !IS_ERR(hash_tfm))
290 crypto_free_hash(hash_tfm); 407 crypto_free_hash(hash_tfm);
291 kfree(salt); 408 kfree(salt);
292 return err; 409 return err;
293} 410}
294 411
295static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 412static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
413 struct dm_crypt_request *dmreq)
296{ 414{
415 struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
416
297 memset(iv, 0, cc->iv_size); 417 memset(iv, 0, cc->iv_size);
298 *(u64 *)iv = cpu_to_le64(sector); 418 *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
299 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); 419 crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
420
300 return 0; 421 return 0;
301} 422}
302 423
303static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, 424static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
304 const char *opts) 425 const char *opts)
305{ 426{
306 unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); 427 unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
307 int log = ilog2(bs); 428 int log = ilog2(bs);
308 429
309 /* we need to calculate how far we must shift the sector count 430 /* we need to calculate how far we must shift the sector count
@@ -328,25 +449,177 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc)
328{ 449{
329} 450}
330 451
331static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 452static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv,
453 struct dm_crypt_request *dmreq)
332{ 454{
333 __be64 val; 455 __be64 val;
334 456
335 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ 457 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
336 458
337 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); 459 val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1);
338 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); 460 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
339 461
340 return 0; 462 return 0;
341} 463}
342 464
343static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 465static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv,
466 struct dm_crypt_request *dmreq)
344{ 467{
345 memset(iv, 0, cc->iv_size); 468 memset(iv, 0, cc->iv_size);
346 469
347 return 0; 470 return 0;
348} 471}
349 472
473static void crypt_iv_lmk_dtr(struct crypt_config *cc)
474{
475 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
476
477 if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm))
478 crypto_free_shash(lmk->hash_tfm);
479 lmk->hash_tfm = NULL;
480
481 kzfree(lmk->seed);
482 lmk->seed = NULL;
483}
484
485static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
486 const char *opts)
487{
488 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
489
490 lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
491 if (IS_ERR(lmk->hash_tfm)) {
492 ti->error = "Error initializing LMK hash";
493 return PTR_ERR(lmk->hash_tfm);
494 }
495
496 /* No seed in LMK version 2 */
497 if (cc->key_parts == cc->tfms_count) {
498 lmk->seed = NULL;
499 return 0;
500 }
501
502 lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL);
503 if (!lmk->seed) {
504 crypt_iv_lmk_dtr(cc);
505 ti->error = "Error kmallocing seed storage in LMK";
506 return -ENOMEM;
507 }
508
509 return 0;
510}
511
512static int crypt_iv_lmk_init(struct crypt_config *cc)
513{
514 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
515 int subkey_size = cc->key_size / cc->key_parts;
516
517 /* The LMK seed is stored in the position of the (LMK_KEYS + 1)-th key */
518 if (lmk->seed)
519 memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size),
520 crypto_shash_digestsize(lmk->hash_tfm));
521
522 return 0;
523}
524
525static int crypt_iv_lmk_wipe(struct crypt_config *cc)
526{
527 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
528
529 if (lmk->seed)
530 memset(lmk->seed, 0, LMK_SEED_SIZE);
531
532 return 0;
533}
534
535static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
536 struct dm_crypt_request *dmreq,
537 u8 *data)
538{
539 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
540 struct {
541 struct shash_desc desc;
542 char ctx[crypto_shash_descsize(lmk->hash_tfm)];
543 } sdesc;
544 struct md5_state md5state;
545 u32 buf[4];
546 int i, r;
547
548 sdesc.desc.tfm = lmk->hash_tfm;
549 sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
550
551 r = crypto_shash_init(&sdesc.desc);
552 if (r)
553 return r;
554
555 if (lmk->seed) {
556 r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE);
557 if (r)
558 return r;
559 }
560
561 /* Sector is always 512B, block size 16, add data of blocks 1-31 */
562 r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31);
563 if (r)
564 return r;
565
566 /* Sector is cropped to 56 bits here */
567 buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF);
568 buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000);
569 buf[2] = cpu_to_le32(4024);
570 buf[3] = 0;
571 r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf));
572 if (r)
573 return r;
574
575 /* No MD5 padding here */
576 r = crypto_shash_export(&sdesc.desc, &md5state);
577 if (r)
578 return r;
579
580 for (i = 0; i < MD5_HASH_WORDS; i++)
581 __cpu_to_le32s(&md5state.hash[i]);
582 memcpy(iv, &md5state.hash, cc->iv_size);
583
584 return 0;
585}
586
587static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
588 struct dm_crypt_request *dmreq)
589{
590 u8 *src;
591 int r = 0;
592
593 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
594 src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0);
595 r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
596 kunmap_atomic(src, KM_USER0);
597 } else
598 memset(iv, 0, cc->iv_size);
599
600 return r;
601}
602
603static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
604 struct dm_crypt_request *dmreq)
605{
606 u8 *dst;
607 int r;
608
609 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
610 return 0;
611
612 dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0);
613 r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
614
615 /* Tweak the first block of plaintext sector */
616 if (!r)
617 crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
618
619 kunmap_atomic(dst, KM_USER0);
620 return r;
621}
622
350static struct crypt_iv_operations crypt_iv_plain_ops = { 623static struct crypt_iv_operations crypt_iv_plain_ops = {
351 .generator = crypt_iv_plain_gen 624 .generator = crypt_iv_plain_gen
352}; 625};
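
The trailer that crypt_iv_lmk_one() hashes after the sector payload packs the sector number "cropped to 56 bits": the low 32 bits, then the next 24 bits OR-ed with 0x80000000, then the constants 4024 and 0. A standalone sketch of just that packing (the kernel additionally forces little-endian byte order with cpu_to_le32(); the MD5 state export is omitted):

#include <stdint.h>
#include <stdio.h>

/* build the 16-byte trailer hashed after the sector data (little-endian words) */
static void lmk_pack_sector(uint64_t sector, uint32_t buf[4])
{
	buf[0] = (uint32_t)(sector & 0xFFFFFFFF);
	buf[1] = (uint32_t)(((sector >> 32) & 0x00FFFFFF) | 0x80000000);
	buf[2] = 4024;
	buf[3] = 0;
}

int main(void)
{
	uint32_t buf[4];

	lmk_pack_sector(0x123456789AULL, buf);   /* example sector number */
	printf("%08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3]);
	return 0;
}
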
@@ -373,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
373 .generator = crypt_iv_null_gen 646 .generator = crypt_iv_null_gen
374}; 647};
375 648
649static struct crypt_iv_operations crypt_iv_lmk_ops = {
650 .ctr = crypt_iv_lmk_ctr,
651 .dtr = crypt_iv_lmk_dtr,
652 .init = crypt_iv_lmk_init,
653 .wipe = crypt_iv_lmk_wipe,
654 .generator = crypt_iv_lmk_gen,
655 .post = crypt_iv_lmk_post
656};
657
376static void crypt_convert_init(struct crypt_config *cc, 658static void crypt_convert_init(struct crypt_config *cc,
377 struct convert_context *ctx, 659 struct convert_context *ctx,
378 struct bio *bio_out, struct bio *bio_in, 660 struct bio *bio_out, struct bio *bio_in,
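
The new .post callback in crypt_iv_operations is optional: only the lmk generator provides one, and callers check it for NULL before invoking it (see crypt_convert_block() and kcryptd_async_done() further down in this diff). A reduced sketch of that optional-hook pattern, with made-up names, outside the kernel:

#include <stddef.h>

struct iv_ops {                         /* hypothetical, mirrors crypt_iv_operations */
	int (*generator)(unsigned char *iv, unsigned long sector);
	int (*post)(unsigned char *iv, unsigned char *data); /* may be NULL */
};

static int gen_plain(unsigned char *iv, unsigned long sector)
{
	iv[0] = sector & 0xff;          /* toy generator */
	return 0;
}

static int run_block(const struct iv_ops *ops, unsigned char *iv,
		     unsigned long sector, unsigned char *data)
{
	int r = ops->generator(iv, sector);

	if (!r && ops->post)            /* post-processing only if the mode needs it */
		r = ops->post(iv, data);
	return r;
}

int main(void)
{
	const struct iv_ops plain = { .generator = gen_plain, .post = NULL };
	unsigned char iv[16] = { 0 }, data[16] = { 0 };

	return run_block(&plain, iv, 0x1234, data);
}
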
@@ -400,6 +682,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
400 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); 682 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
401} 683}
402 684
685static u8 *iv_of_dmreq(struct crypt_config *cc,
686 struct dm_crypt_request *dmreq)
687{
688 return (u8 *)ALIGN((unsigned long)(dmreq + 1),
689 crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
690}
691
403static int crypt_convert_block(struct crypt_config *cc, 692static int crypt_convert_block(struct crypt_config *cc,
404 struct convert_context *ctx, 693 struct convert_context *ctx,
405 struct ablkcipher_request *req) 694 struct ablkcipher_request *req)
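
iv_of_dmreq() recomputes where the IV lives inside each pre-allocated request: immediately after struct dm_crypt_request, rounded up to the cipher's alignment (alignmask + 1). A userspace sketch of the ALIGN() arithmetic with an assumed 16-byte alignment (the real value comes from crypto_ablkcipher_alignmask()):

#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

struct dm_crypt_request_stub { void *ctx; uint64_t iv_sector; }; /* stand-in */

int main(void)
{
	unsigned char buf[256];
	struct dm_crypt_request_stub *dmreq = (void *)buf;
	unsigned align = 16;            /* assumed: cipher alignmask + 1 */
	unsigned char *iv = (unsigned char *)ALIGN_UP((uintptr_t)(dmreq + 1), align);

	printf("dmreq at %p, iv at %p (offset %zu into the buffer)\n",
	       (void *)dmreq, (void *)iv, (size_t)(iv - buf));
	return 0;
}
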
@@ -411,9 +700,9 @@ static int crypt_convert_block(struct crypt_config *cc,
411 int r = 0; 700 int r = 0;
412 701
413 dmreq = dmreq_of_req(cc, req); 702 dmreq = dmreq_of_req(cc, req);
414 iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), 703 iv = iv_of_dmreq(cc, dmreq);
415 crypto_ablkcipher_alignmask(cc->tfm) + 1);
416 704
705 dmreq->iv_sector = ctx->sector;
417 dmreq->ctx = ctx; 706 dmreq->ctx = ctx;
418 sg_init_table(&dmreq->sg_in, 1); 707 sg_init_table(&dmreq->sg_in, 1);
419 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, 708 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -436,7 +725,7 @@ static int crypt_convert_block(struct crypt_config *cc,
436 } 725 }
437 726
438 if (cc->iv_gen_ops) { 727 if (cc->iv_gen_ops) {
439 r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); 728 r = cc->iv_gen_ops->generator(cc, iv, dmreq);
440 if (r < 0) 729 if (r < 0)
441 return r; 730 return r;
442 } 731 }
@@ -449,21 +738,28 @@ static int crypt_convert_block(struct crypt_config *cc,
449 else 738 else
450 r = crypto_ablkcipher_decrypt(req); 739 r = crypto_ablkcipher_decrypt(req);
451 740
741 if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
742 r = cc->iv_gen_ops->post(cc, iv, dmreq);
743
452 return r; 744 return r;
453} 745}
454 746
455static void kcryptd_async_done(struct crypto_async_request *async_req, 747static void kcryptd_async_done(struct crypto_async_request *async_req,
456 int error); 748 int error);
749
457static void crypt_alloc_req(struct crypt_config *cc, 750static void crypt_alloc_req(struct crypt_config *cc,
458 struct convert_context *ctx) 751 struct convert_context *ctx)
459{ 752{
460 if (!cc->req) 753 struct crypt_cpu *this_cc = this_crypt_config(cc);
461 cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); 754 unsigned key_index = ctx->sector & (cc->tfms_count - 1);
462 ablkcipher_request_set_tfm(cc->req, cc->tfm); 755
463 ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | 756 if (!this_cc->req)
464 CRYPTO_TFM_REQ_MAY_SLEEP, 757 this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
465 kcryptd_async_done, 758
466 dmreq_of_req(cc, cc->req)); 759 ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]);
760 ablkcipher_request_set_callback(this_cc->req,
761 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
762 kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
467} 763}
468 764
469/* 765/*
@@ -472,6 +768,7 @@ static void crypt_alloc_req(struct crypt_config *cc,
472static int crypt_convert(struct crypt_config *cc, 768static int crypt_convert(struct crypt_config *cc,
473 struct convert_context *ctx) 769 struct convert_context *ctx)
474{ 770{
771 struct crypt_cpu *this_cc = this_crypt_config(cc);
475 int r; 772 int r;
476 773
477 atomic_set(&ctx->pending, 1); 774 atomic_set(&ctx->pending, 1);
@@ -483,7 +780,7 @@ static int crypt_convert(struct crypt_config *cc,
483 780
484 atomic_inc(&ctx->pending); 781 atomic_inc(&ctx->pending);
485 782
486 r = crypt_convert_block(cc, ctx, cc->req); 783 r = crypt_convert_block(cc, ctx, this_cc->req);
487 784
488 switch (r) { 785 switch (r) {
489 /* async */ 786 /* async */
@@ -492,7 +789,7 @@ static int crypt_convert(struct crypt_config *cc,
492 INIT_COMPLETION(ctx->restart); 789 INIT_COMPLETION(ctx->restart);
493 /* fall through*/ 790 /* fall through*/
494 case -EINPROGRESS: 791 case -EINPROGRESS:
495 cc->req = NULL; 792 this_cc->req = NULL;
496 ctx->sector++; 793 ctx->sector++;
497 continue; 794 continue;
498 795
@@ -651,6 +948,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
651 * They must be separated as otherwise the final stages could be 948 * They must be separated as otherwise the final stages could be
652 * starved by new requests which can block in the first stages due 949 * starved by new requests which can block in the first stages due
653 * to memory allocation. 950 * to memory allocation.
951 *
952 * The work is done per CPU, globally for all dm-crypt instances.
953 * They should not depend on each other and do not block.
654 */ 954 */
655static void crypt_endio(struct bio *clone, int error) 955static void crypt_endio(struct bio *clone, int error)
656{ 956{
@@ -691,26 +991,30 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
691 clone->bi_destructor = dm_crypt_bio_destructor; 991 clone->bi_destructor = dm_crypt_bio_destructor;
692} 992}
693 993
694static void kcryptd_io_read(struct dm_crypt_io *io) 994static void kcryptd_unplug(struct crypt_config *cc)
995{
996 blk_unplug(bdev_get_queue(cc->dev->bdev));
997}
998
999static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
695{ 1000{
696 struct crypt_config *cc = io->target->private; 1001 struct crypt_config *cc = io->target->private;
697 struct bio *base_bio = io->base_bio; 1002 struct bio *base_bio = io->base_bio;
698 struct bio *clone; 1003 struct bio *clone;
699 1004
700 crypt_inc_pending(io);
701
702 /* 1005 /*
703 * The block layer might modify the bvec array, so always 1006 * The block layer might modify the bvec array, so always
704 * copy the required bvecs because we need the original 1007 * copy the required bvecs because we need the original
705 * one in order to decrypt the whole bio data *afterwards*. 1008 * one in order to decrypt the whole bio data *afterwards*.
706 */ 1009 */
707 clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); 1010 clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
708 if (unlikely(!clone)) { 1011 if (!clone) {
709 io->error = -ENOMEM; 1012 kcryptd_unplug(cc);
710 crypt_dec_pending(io); 1013 return 1;
711 return;
712 } 1014 }
713 1015
1016 crypt_inc_pending(io);
1017
714 clone_init(io, clone); 1018 clone_init(io, clone);
715 clone->bi_idx = 0; 1019 clone->bi_idx = 0;
716 clone->bi_vcnt = bio_segments(base_bio); 1020 clone->bi_vcnt = bio_segments(base_bio);
@@ -720,6 +1024,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
720 sizeof(struct bio_vec) * clone->bi_vcnt); 1024 sizeof(struct bio_vec) * clone->bi_vcnt);
721 1025
722 generic_make_request(clone); 1026 generic_make_request(clone);
1027 return 0;
723} 1028}
724 1029
725static void kcryptd_io_write(struct dm_crypt_io *io) 1030static void kcryptd_io_write(struct dm_crypt_io *io)
@@ -732,9 +1037,12 @@ static void kcryptd_io(struct work_struct *work)
732{ 1037{
733 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); 1038 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
734 1039
735 if (bio_data_dir(io->base_bio) == READ) 1040 if (bio_data_dir(io->base_bio) == READ) {
736 kcryptd_io_read(io); 1041 crypt_inc_pending(io);
737 else 1042 if (kcryptd_io_read(io, GFP_NOIO))
1043 io->error = -ENOMEM;
1044 crypt_dec_pending(io);
1045 } else
738 kcryptd_io_write(io); 1046 kcryptd_io_write(io);
739} 1047}
740 1048
@@ -901,6 +1209,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
901 return; 1209 return;
902 } 1210 }
903 1211
1212 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
1213 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
1214
904 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); 1215 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
905 1216
906 if (!atomic_dec_and_test(&ctx->pending)) 1217 if (!atomic_dec_and_test(&ctx->pending))
@@ -971,34 +1282,84 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
971 } 1282 }
972} 1283}
973 1284
974static int crypt_set_key(struct crypt_config *cc, char *key) 1285static void crypt_free_tfms(struct crypt_config *cc, int cpu)
975{ 1286{
976 unsigned key_size = strlen(key) >> 1; 1287 struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1288 unsigned i;
977 1289
978 if (cc->key_size && cc->key_size != key_size) 1290 for (i = 0; i < cc->tfms_count; i++)
1291 if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) {
1292 crypto_free_ablkcipher(cpu_cc->tfms[i]);
1293 cpu_cc->tfms[i] = NULL;
1294 }
1295}
1296
1297static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
1298{
1299 struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1300 unsigned i;
1301 int err;
1302
1303 for (i = 0; i < cc->tfms_count; i++) {
1304 cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
1305 if (IS_ERR(cpu_cc->tfms[i])) {
1306 err = PTR_ERR(cpu_cc->tfms[i]);
1307 crypt_free_tfms(cc, cpu);
1308 return err;
1309 }
1310 }
1311
1312 return 0;
1313}
1314
1315static int crypt_setkey_allcpus(struct crypt_config *cc)
1316{
1317 unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
1318 int cpu, err = 0, i, r;
1319
1320 for_each_possible_cpu(cpu) {
1321 for (i = 0; i < cc->tfms_count; i++) {
1322 r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i],
1323 cc->key + (i * subkey_size), subkey_size);
1324 if (r)
1325 err = r;
1326 }
1327 }
1328
1329 return err;
1330}
1331
1332static int crypt_set_key(struct crypt_config *cc, char *key)
1333{
1334 /* The key size may not be changed. */
1335 if (cc->key_size != (strlen(key) >> 1))
979 return -EINVAL; 1336 return -EINVAL;
980 1337
981 cc->key_size = key_size; /* initial settings */ 1338 /* Hyphen (which gives a key_size of zero) means there is no key. */
1339 if (!cc->key_size && strcmp(key, "-"))
1340 return -EINVAL;
982 1341
983 if ((!key_size && strcmp(key, "-")) || 1342 if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
984 (key_size && crypt_decode_key(cc->key, key, key_size) < 0))
985 return -EINVAL; 1343 return -EINVAL;
986 1344
987 set_bit(DM_CRYPT_KEY_VALID, &cc->flags); 1345 set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
988 1346
989 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); 1347 return crypt_setkey_allcpus(cc);
990} 1348}
991 1349
992static int crypt_wipe_key(struct crypt_config *cc) 1350static int crypt_wipe_key(struct crypt_config *cc)
993{ 1351{
994 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); 1352 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
995 memset(&cc->key, 0, cc->key_size * sizeof(u8)); 1353 memset(&cc->key, 0, cc->key_size * sizeof(u8));
996 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); 1354
1355 return crypt_setkey_allcpus(cc);
997} 1356}
998 1357
999static void crypt_dtr(struct dm_target *ti) 1358static void crypt_dtr(struct dm_target *ti)
1000{ 1359{
1001 struct crypt_config *cc = ti->private; 1360 struct crypt_config *cc = ti->private;
1361 struct crypt_cpu *cpu_cc;
1362 int cpu;
1002 1363
1003 ti->private = NULL; 1364 ti->private = NULL;
1004 1365
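
crypt_setkey_allcpus() above splits the single supplied key into tfms_count equal slices (tfms_count is a power of two, so the ilog2() shift is an exact division) and sets slice i on tfm i of every CPU. A sketch of the slicing, using an invented 32-byte, two-key example:

#include <stdio.h>

int main(void)
{
	unsigned key_size = 32;         /* hypothetical: hex key decoded to 32 bytes */
	unsigned tfms_count = 2;        /* e.g. "aes:2-..."; must be a power of two */
	unsigned subkey_size = key_size / tfms_count;   /* kernel: key_size >> ilog2() */
	unsigned i;

	for (i = 0; i < tfms_count; i++)
		printf("tfm %u gets key bytes [%u, %u)\n",
		       i, i * subkey_size, (i + 1) * subkey_size);
	return 0;
}
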
@@ -1010,6 +1371,14 @@ static void crypt_dtr(struct dm_target *ti)
1010 if (cc->crypt_queue) 1371 if (cc->crypt_queue)
1011 destroy_workqueue(cc->crypt_queue); 1372 destroy_workqueue(cc->crypt_queue);
1012 1373
1374 if (cc->cpu)
1375 for_each_possible_cpu(cpu) {
1376 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1377 if (cpu_cc->req)
1378 mempool_free(cpu_cc->req, cc->req_pool);
1379 crypt_free_tfms(cc, cpu);
1380 }
1381
1013 if (cc->bs) 1382 if (cc->bs)
1014 bioset_free(cc->bs); 1383 bioset_free(cc->bs);
1015 1384
@@ -1023,14 +1392,14 @@ static void crypt_dtr(struct dm_target *ti)
1023 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 1392 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
1024 cc->iv_gen_ops->dtr(cc); 1393 cc->iv_gen_ops->dtr(cc);
1025 1394
1026 if (cc->tfm && !IS_ERR(cc->tfm))
1027 crypto_free_ablkcipher(cc->tfm);
1028
1029 if (cc->dev) 1395 if (cc->dev)
1030 dm_put_device(ti, cc->dev); 1396 dm_put_device(ti, cc->dev);
1031 1397
1398 if (cc->cpu)
1399 free_percpu(cc->cpu);
1400
1032 kzfree(cc->cipher); 1401 kzfree(cc->cipher);
1033 kzfree(cc->cipher_mode); 1402 kzfree(cc->cipher_string);
1034 1403
1035 /* Must zero key material before freeing */ 1404 /* Must zero key material before freeing */
1036 kzfree(cc); 1405 kzfree(cc);
@@ -1040,9 +1409,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1040 char *cipher_in, char *key) 1409 char *cipher_in, char *key)
1041{ 1410{
1042 struct crypt_config *cc = ti->private; 1411 struct crypt_config *cc = ti->private;
1043 char *tmp, *cipher, *chainmode, *ivmode, *ivopts; 1412 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
1044 char *cipher_api = NULL; 1413 char *cipher_api = NULL;
1045 int ret = -EINVAL; 1414 int cpu, ret = -EINVAL;
1046 1415
1047 /* Convert to crypto api definition? */ 1416 /* Convert to crypto api definition? */
1048 if (strchr(cipher_in, '(')) { 1417 if (strchr(cipher_in, '(')) {
@@ -1050,23 +1419,31 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1050 return -EINVAL; 1419 return -EINVAL;
1051 } 1420 }
1052 1421
1422 cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
1423 if (!cc->cipher_string)
1424 goto bad_mem;
1425
1053 /* 1426 /*
1054 * Legacy dm-crypt cipher specification 1427 * Legacy dm-crypt cipher specification
1055 * cipher-mode-iv:ivopts 1428 * cipher[:keycount]-mode-iv:ivopts
1056 */ 1429 */
1057 tmp = cipher_in; 1430 tmp = cipher_in;
1058 cipher = strsep(&tmp, "-"); 1431 keycount = strsep(&tmp, "-");
1432 cipher = strsep(&keycount, ":");
1433
1434 if (!keycount)
1435 cc->tfms_count = 1;
1436 else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
1437 !is_power_of_2(cc->tfms_count)) {
1438 ti->error = "Bad cipher key count specification";
1439 return -EINVAL;
1440 }
1441 cc->key_parts = cc->tfms_count;
1059 1442
1060 cc->cipher = kstrdup(cipher, GFP_KERNEL); 1443 cc->cipher = kstrdup(cipher, GFP_KERNEL);
1061 if (!cc->cipher) 1444 if (!cc->cipher)
1062 goto bad_mem; 1445 goto bad_mem;
1063 1446
1064 if (tmp) {
1065 cc->cipher_mode = kstrdup(tmp, GFP_KERNEL);
1066 if (!cc->cipher_mode)
1067 goto bad_mem;
1068 }
1069
1070 chainmode = strsep(&tmp, "-"); 1447 chainmode = strsep(&tmp, "-");
1071 ivopts = strsep(&tmp, "-"); 1448 ivopts = strsep(&tmp, "-");
1072 ivmode = strsep(&ivopts, ":"); 1449 ivmode = strsep(&ivopts, ":");
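
The constructor now understands an optional key count wedged between the cipher name and the chaining mode, i.e. cipher[:keycount]-mode-iv[:ivopts]. A userspace sketch of the same strsep()/sscanf() parsing, fed an illustrative Loop-AES-style string (the input is made up, not taken from the patch):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
	char spec[] = "aes:64-cbc-lmk";  /* example mapping string */
	char *tmp = spec, *keycount, *cipher, *chainmode, *ivopts, *ivmode;
	unsigned tfms_count = 1;

	keycount = strsep(&tmp, "-");    /* "aes:64" */
	cipher = strsep(&keycount, ":"); /* "aes"; keycount -> "64" or NULL */
	if (keycount && sscanf(keycount, "%u", &tfms_count) != 1)
		return 1;

	chainmode = strsep(&tmp, "-");   /* "cbc" */
	ivopts = strsep(&tmp, "-");      /* "lmk" plus optional options */
	ivmode = strsep(&ivopts, ":");   /* "lmk"; ivopts -> NULL */

	printf("cipher=%s keys=%u mode=%s iv=%s\n",
	       cipher, tfms_count, chainmode, ivmode);
	return 0;
}
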
@@ -1074,10 +1451,19 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1074 if (tmp) 1451 if (tmp)
1075 DMWARN("Ignoring unexpected additional cipher options"); 1452 DMWARN("Ignoring unexpected additional cipher options");
1076 1453
1077 /* Compatibility mode for old dm-crypt mappings */ 1454 cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) +
1455 cc->tfms_count * sizeof(*(cc->cpu->tfms)),
1456 __alignof__(struct crypt_cpu));
1457 if (!cc->cpu) {
1458 ti->error = "Cannot allocate per cpu state";
1459 goto bad_mem;
1460 }
1461
1462 /*
1463 * For compatibility with the original dm-crypt mapping format, if
1464 * only the cipher name is supplied, use cbc-plain.
1465 */
1078 if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { 1466 if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
1079 kfree(cc->cipher_mode);
1080 cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL);
1081 chainmode = "cbc"; 1467 chainmode = "cbc";
1082 ivmode = "plain"; 1468 ivmode = "plain";
1083 } 1469 }
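
__alloc_percpu() is handed sizeof(struct crypt_cpu) plus room for tfms_count pointers in the trailing tfms[0] array, aligned to the struct, so each CPU gets its request pointer, its ESSIV tfm and one ablkcipher tfm per key in a single allocation. A rough userspace analogue of the sizing (malloc stands in for the per-CPU allocator):

#include <stdio.h>
#include <stdlib.h>

struct crypt_cpu_stub {                 /* shape mirrors struct crypt_cpu */
	void *req;
	void *iv_private;
	void *tfms[];                   /* one entry per key/tfm */
};

int main(void)
{
	unsigned tfms_count = 4, nr_cpus = 2, cpu;
	size_t sz = sizeof(struct crypt_cpu_stub) + tfms_count * sizeof(void *);
	struct crypt_cpu_stub *percpu[2];

	for (cpu = 0; cpu < nr_cpus; cpu++)
		percpu[cpu] = calloc(1, sz);    /* kernel: __alloc_percpu() */

	printf("%zu bytes per CPU\n", sz);
	for (cpu = 0; cpu < nr_cpus; cpu++)
		free(percpu[cpu]);
	return 0;
}
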
@@ -1099,11 +1485,12 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1099 } 1485 }
1100 1486
1101 /* Allocate cipher */ 1487 /* Allocate cipher */
1102 cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); 1488 for_each_possible_cpu(cpu) {
1103 if (IS_ERR(cc->tfm)) { 1489 ret = crypt_alloc_tfms(cc, cpu, cipher_api);
1104 ret = PTR_ERR(cc->tfm); 1490 if (ret < 0) {
1105 ti->error = "Error allocating crypto tfm"; 1491 ti->error = "Error allocating crypto tfm";
1106 goto bad; 1492 goto bad;
1493 }
1107 } 1494 }
1108 1495
1109 /* Initialize and set key */ 1496 /* Initialize and set key */
@@ -1114,7 +1501,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1114 } 1501 }
1115 1502
1116 /* Initialize IV */ 1503 /* Initialize IV */
1117 cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); 1504 cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
1118 if (cc->iv_size) 1505 if (cc->iv_size)
1119 /* at least a 64 bit sector number should fit in our buffer */ 1506 /* at least a 64 bit sector number should fit in our buffer */
1120 cc->iv_size = max(cc->iv_size, 1507 cc->iv_size = max(cc->iv_size,
@@ -1137,7 +1524,15 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1137 cc->iv_gen_ops = &crypt_iv_benbi_ops; 1524 cc->iv_gen_ops = &crypt_iv_benbi_ops;
1138 else if (strcmp(ivmode, "null") == 0) 1525 else if (strcmp(ivmode, "null") == 0)
1139 cc->iv_gen_ops = &crypt_iv_null_ops; 1526 cc->iv_gen_ops = &crypt_iv_null_ops;
1140 else { 1527 else if (strcmp(ivmode, "lmk") == 0) {
1528 cc->iv_gen_ops = &crypt_iv_lmk_ops;
1529 /* Versions 2 and 3 are recognised according
1530 * to the length of the provided multi-key string.
1531 * If present (version 3), the last key is used as the IV seed.
1532 */
1533 if (cc->key_size % cc->key_parts)
1534 cc->key_parts++;
1535 } else {
1141 ret = -EINVAL; 1536 ret = -EINVAL;
1142 ti->error = "Invalid IV mode"; 1537 ti->error = "Invalid IV mode";
1143 goto bad; 1538 goto bad;
@@ -1194,6 +1589,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1194 ti->error = "Cannot allocate encryption context"; 1589 ti->error = "Cannot allocate encryption context";
1195 return -ENOMEM; 1590 return -ENOMEM;
1196 } 1591 }
1592 cc->key_size = key_size;
1197 1593
1198 ti->private = cc; 1594 ti->private = cc;
1199 ret = crypt_ctr_cipher(ti, argv[0], argv[1]); 1595 ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
@@ -1208,9 +1604,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1208 } 1604 }
1209 1605
1210 cc->dmreq_start = sizeof(struct ablkcipher_request); 1606 cc->dmreq_start = sizeof(struct ablkcipher_request);
1211 cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); 1607 cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
1212 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); 1608 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
1213 cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & 1609 cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
1214 ~(crypto_tfm_ctx_alignment() - 1); 1610 ~(crypto_tfm_ctx_alignment() - 1);
1215 1611
1216 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + 1612 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
@@ -1219,7 +1615,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1219 ti->error = "Cannot allocate crypt request mempool"; 1615 ti->error = "Cannot allocate crypt request mempool";
1220 goto bad; 1616 goto bad;
1221 } 1617 }
1222 cc->req = NULL;
1223 1618
1224 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); 1619 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
1225 if (!cc->page_pool) { 1620 if (!cc->page_pool) {
@@ -1252,13 +1647,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1252 cc->start = tmpll; 1647 cc->start = tmpll;
1253 1648
1254 ret = -ENOMEM; 1649 ret = -ENOMEM;
1255 cc->io_queue = create_singlethread_workqueue("kcryptd_io"); 1650 cc->io_queue = alloc_workqueue("kcryptd_io",
1651 WQ_NON_REENTRANT|
1652 WQ_MEM_RECLAIM,
1653 1);
1256 if (!cc->io_queue) { 1654 if (!cc->io_queue) {
1257 ti->error = "Couldn't create kcryptd io queue"; 1655 ti->error = "Couldn't create kcryptd io queue";
1258 goto bad; 1656 goto bad;
1259 } 1657 }
1260 1658
1261 cc->crypt_queue = create_singlethread_workqueue("kcryptd"); 1659 cc->crypt_queue = alloc_workqueue("kcryptd",
1660 WQ_NON_REENTRANT|
1661 WQ_CPU_INTENSIVE|
1662 WQ_MEM_RECLAIM,
1663 1);
1262 if (!cc->crypt_queue) { 1664 if (!cc->crypt_queue) {
1263 ti->error = "Couldn't create kcryptd queue"; 1665 ti->error = "Couldn't create kcryptd queue";
1264 goto bad; 1666 goto bad;
@@ -1286,9 +1688,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
1286 1688
1287 io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); 1689 io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
1288 1690
1289 if (bio_data_dir(io->base_bio) == READ) 1691 if (bio_data_dir(io->base_bio) == READ) {
1290 kcryptd_queue_io(io); 1692 if (kcryptd_io_read(io, GFP_NOWAIT))
1291 else 1693 kcryptd_queue_io(io);
1694 } else
1292 kcryptd_queue_crypt(io); 1695 kcryptd_queue_crypt(io);
1293 1696
1294 return DM_MAPIO_SUBMITTED; 1697 return DM_MAPIO_SUBMITTED;
@@ -1306,10 +1709,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
1306 break; 1709 break;
1307 1710
1308 case STATUSTYPE_TABLE: 1711 case STATUSTYPE_TABLE:
1309 if (cc->cipher_mode) 1712 DMEMIT("%s ", cc->cipher_string);
1310 DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode);
1311 else
1312 DMEMIT("%s ", cc->cipher);
1313 1713
1314 if (cc->key_size > 0) { 1714 if (cc->key_size > 0) {
1315 if ((maxlen - sz) < ((cc->key_size << 1) + 1)) 1715 if ((maxlen - sz) < ((cc->key_size << 1) + 1))
@@ -1421,7 +1821,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
1421 1821
1422static struct target_type crypt_target = { 1822static struct target_type crypt_target = {
1423 .name = "crypt", 1823 .name = "crypt",
1424 .version = {1, 7, 0}, 1824 .version = {1, 10, 0},
1425 .module = THIS_MODULE, 1825 .module = THIS_MODULE,
1426 .ctr = crypt_ctr, 1826 .ctr = crypt_ctr,
1427 .dtr = crypt_dtr, 1827 .dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index baa11912cc94..f18375dcedd9 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -352,7 +352,7 @@ static int __init dm_delay_init(void)
352{ 352{
353 int r = -ENOMEM; 353 int r = -ENOMEM;
354 354
355 kdelayd_wq = create_workqueue("kdelayd"); 355 kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
356 if (!kdelayd_wq) { 356 if (!kdelayd_wq) {
357 DMERR("Couldn't start kdelayd"); 357 DMERR("Couldn't start kdelayd");
358 goto bad_queue; 358 goto bad_queue;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4b54618b4159..6d12775a1061 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -295,19 +295,55 @@ retry:
295 DMWARN("remove_all left %d open device(s)", dev_skipped); 295 DMWARN("remove_all left %d open device(s)", dev_skipped);
296} 296}
297 297
298/*
299 * Set the uuid of a hash_cell that isn't already set.
300 */
301static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid)
302{
303 mutex_lock(&dm_hash_cells_mutex);
304 hc->uuid = new_uuid;
305 mutex_unlock(&dm_hash_cells_mutex);
306
307 list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid));
308}
309
310/*
311 * Changes the name of a hash_cell and returns the old name for
312 * the caller to free.
313 */
314static char *__change_cell_name(struct hash_cell *hc, char *new_name)
315{
316 char *old_name;
317
318 /*
319 * Rename and move the name cell.
320 */
321 list_del(&hc->name_list);
322 old_name = hc->name;
323
324 mutex_lock(&dm_hash_cells_mutex);
325 hc->name = new_name;
326 mutex_unlock(&dm_hash_cells_mutex);
327
328 list_add(&hc->name_list, _name_buckets + hash_str(new_name));
329
330 return old_name;
331}
332
298static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, 333static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
299 const char *new) 334 const char *new)
300{ 335{
301 char *new_name, *old_name; 336 char *new_data, *old_name = NULL;
302 struct hash_cell *hc; 337 struct hash_cell *hc;
303 struct dm_table *table; 338 struct dm_table *table;
304 struct mapped_device *md; 339 struct mapped_device *md;
340 unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
305 341
306 /* 342 /*
307 * duplicate new. 343 * duplicate new.
308 */ 344 */
309 new_name = kstrdup(new, GFP_KERNEL); 345 new_data = kstrdup(new, GFP_KERNEL);
310 if (!new_name) 346 if (!new_data)
311 return ERR_PTR(-ENOMEM); 347 return ERR_PTR(-ENOMEM);
312 348
313 down_write(&_hash_lock); 349 down_write(&_hash_lock);
@@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
315 /* 351 /*
316 * Is new free ? 352 * Is new free ?
317 */ 353 */
318 hc = __get_name_cell(new); 354 if (change_uuid)
355 hc = __get_uuid_cell(new);
356 else
357 hc = __get_name_cell(new);
358
319 if (hc) { 359 if (hc) {
320 DMWARN("asked to rename to an already-existing name %s -> %s", 360 DMWARN("Unable to change %s on mapped device %s to one that "
361 "already exists: %s",
362 change_uuid ? "uuid" : "name",
321 param->name, new); 363 param->name, new);
322 dm_put(hc->md); 364 dm_put(hc->md);
323 up_write(&_hash_lock); 365 up_write(&_hash_lock);
324 kfree(new_name); 366 kfree(new_data);
325 return ERR_PTR(-EBUSY); 367 return ERR_PTR(-EBUSY);
326 } 368 }
327 369
@@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
330 */ 372 */
331 hc = __get_name_cell(param->name); 373 hc = __get_name_cell(param->name);
332 if (!hc) { 374 if (!hc) {
333 DMWARN("asked to rename a non-existent device %s -> %s", 375 DMWARN("Unable to rename non-existent device, %s to %s%s",
334 param->name, new); 376 param->name, change_uuid ? "uuid " : "", new);
335 up_write(&_hash_lock); 377 up_write(&_hash_lock);
336 kfree(new_name); 378 kfree(new_data);
337 return ERR_PTR(-ENXIO); 379 return ERR_PTR(-ENXIO);
338 } 380 }
339 381
340 /* 382 /*
341 * rename and move the name cell. 383 * Does this device already have a uuid?
342 */ 384 */
343 list_del(&hc->name_list); 385 if (change_uuid && hc->uuid) {
344 old_name = hc->name; 386 DMWARN("Unable to change uuid of mapped device %s to %s "
345 mutex_lock(&dm_hash_cells_mutex); 387 "because uuid is already set to %s",
346 hc->name = new_name; 388 param->name, new, hc->uuid);
347 mutex_unlock(&dm_hash_cells_mutex); 389 dm_put(hc->md);
348 list_add(&hc->name_list, _name_buckets + hash_str(new_name)); 390 up_write(&_hash_lock);
391 kfree(new_data);
392 return ERR_PTR(-EINVAL);
393 }
394
395 if (change_uuid)
396 __set_cell_uuid(hc, new_data);
397 else
398 old_name = __change_cell_name(hc, new_data);
349 399
350 /* 400 /*
351 * Wake up any dm event waiters. 401 * Wake up any dm event waiters.
@@ -729,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
729 hc = __find_device_hash_cell(param); 779 hc = __find_device_hash_cell(param);
730 780
731 if (!hc) { 781 if (!hc) {
732 DMWARN("device doesn't appear to be in the dev hash table."); 782 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
733 up_write(&_hash_lock); 783 up_write(&_hash_lock);
734 return -ENXIO; 784 return -ENXIO;
735 } 785 }
@@ -741,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
741 */ 791 */
742 r = dm_lock_for_deletion(md); 792 r = dm_lock_for_deletion(md);
743 if (r) { 793 if (r) {
744 DMWARN("unable to remove open device %s", hc->name); 794 DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
745 up_write(&_hash_lock); 795 up_write(&_hash_lock);
746 dm_put(md); 796 dm_put(md);
747 return r; 797 return r;
@@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end)
774static int dev_rename(struct dm_ioctl *param, size_t param_size) 824static int dev_rename(struct dm_ioctl *param, size_t param_size)
775{ 825{
776 int r; 826 int r;
777 char *new_name = (char *) param + param->data_start; 827 char *new_data = (char *) param + param->data_start;
778 struct mapped_device *md; 828 struct mapped_device *md;
829 unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
779 830
780 if (new_name < param->data || 831 if (new_data < param->data ||
781 invalid_str(new_name, (void *) param + param_size) || 832 invalid_str(new_data, (void *) param + param_size) ||
782 strlen(new_name) > DM_NAME_LEN - 1) { 833 strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) {
783 DMWARN("Invalid new logical volume name supplied."); 834 DMWARN("Invalid new mapped device name or uuid string supplied.");
784 return -EINVAL; 835 return -EINVAL;
785 } 836 }
786 837
787 r = check_name(new_name); 838 if (!change_uuid) {
788 if (r) 839 r = check_name(new_data);
789 return r; 840 if (r)
841 return r;
842 }
790 843
791 md = dm_hash_rename(param, new_name); 844 md = dm_hash_rename(param, new_data);
792 if (IS_ERR(md)) 845 if (IS_ERR(md))
793 return PTR_ERR(md); 846 return PTR_ERR(md);
794 847
@@ -885,7 +938,7 @@ static int do_resume(struct dm_ioctl *param)
885 938
886 hc = __find_device_hash_cell(param); 939 hc = __find_device_hash_cell(param);
887 if (!hc) { 940 if (!hc) {
888 DMWARN("device doesn't appear to be in the dev hash table."); 941 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
889 up_write(&_hash_lock); 942 up_write(&_hash_lock);
890 return -ENXIO; 943 return -ENXIO;
891 } 944 }
@@ -1212,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1212 1265
1213 hc = __find_device_hash_cell(param); 1266 hc = __find_device_hash_cell(param);
1214 if (!hc) { 1267 if (!hc) {
1215 DMWARN("device doesn't appear to be in the dev hash table."); 1268 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
1216 up_write(&_hash_lock); 1269 up_write(&_hash_lock);
1217 return -ENXIO; 1270 return -ENXIO;
1218 } 1271 }
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index d8587bac5682..924f5f0084c2 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -37,6 +37,13 @@ struct dm_kcopyd_client {
37 unsigned int nr_pages; 37 unsigned int nr_pages;
38 unsigned int nr_free_pages; 38 unsigned int nr_free_pages;
39 39
40 /*
41 * Block devices to unplug.
42 * Non-NULL pointer means that a block device has some pending requests
43 * and needs to be unplugged.
44 */
45 struct block_device *unplug[2];
46
40 struct dm_io_client *io_client; 47 struct dm_io_client *io_client;
41 48
42 wait_queue_head_t destroyq; 49 wait_queue_head_t destroyq;
@@ -308,6 +315,31 @@ static int run_complete_job(struct kcopyd_job *job)
308 return 0; 315 return 0;
309} 316}
310 317
318/*
319 * Unplug the block device at the specified index.
320 */
321static void unplug(struct dm_kcopyd_client *kc, int rw)
322{
323 if (kc->unplug[rw] != NULL) {
324 blk_unplug(bdev_get_queue(kc->unplug[rw]));
325 kc->unplug[rw] = NULL;
326 }
327}
328
329/*
330 * Prepare block device unplug. If there's another device
331 * to be unplugged at the same array index, we unplug that
332 * device first.
333 */
334static void prepare_unplug(struct dm_kcopyd_client *kc, int rw,
335 struct block_device *bdev)
336{
337 if (likely(kc->unplug[rw] == bdev))
338 return;
339 unplug(kc, rw);
340 kc->unplug[rw] = bdev;
341}
342
311static void complete_io(unsigned long error, void *context) 343static void complete_io(unsigned long error, void *context)
312{ 344{
313 struct kcopyd_job *job = (struct kcopyd_job *) context; 345 struct kcopyd_job *job = (struct kcopyd_job *) context;
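
prepare_unplug() above keeps a one-element cache per direction: remember the last block device an I/O went to and only unplug when a different device shows up, with a final unplug at the end of do_work(). The same pattern in plain C, with printf standing in for blk_unplug():

#include <stdio.h>
#include <string.h>

static const char *pending[2];          /* READ = 0, WRITE = 1 */

static void unplug(int rw)
{
	if (pending[rw]) {
		printf("unplug %s queue of %s\n", rw ? "write" : "read", pending[rw]);
		pending[rw] = NULL;
	}
}

static void prepare_unplug(int rw, const char *bdev)
{
	if (pending[rw] && !strcmp(pending[rw], bdev))
		return;                 /* same device: keep batching */
	unplug(rw);                     /* different device: flush the old one first */
	pending[rw] = bdev;
}

int main(void)
{
	prepare_unplug(0, "sda");
	prepare_unplug(0, "sda");       /* coalesced */
	prepare_unplug(0, "sdb");       /* forces an unplug of sda */
	unplug(0);
	unplug(1);
	return 0;
}
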
@@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job)
345{ 377{
346 int r; 378 int r;
347 struct dm_io_request io_req = { 379 struct dm_io_request io_req = {
348 .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, 380 .bi_rw = job->rw,
349 .mem.type = DM_IO_PAGE_LIST, 381 .mem.type = DM_IO_PAGE_LIST,
350 .mem.ptr.pl = job->pages, 382 .mem.ptr.pl = job->pages,
351 .mem.offset = job->offset, 383 .mem.offset = job->offset,
@@ -354,10 +386,16 @@ static int run_io_job(struct kcopyd_job *job)
354 .client = job->kc->io_client, 386 .client = job->kc->io_client,
355 }; 387 };
356 388
357 if (job->rw == READ) 389 if (job->rw == READ) {
358 r = dm_io(&io_req, 1, &job->source, NULL); 390 r = dm_io(&io_req, 1, &job->source, NULL);
359 else 391 prepare_unplug(job->kc, READ, job->source.bdev);
392 } else {
393 if (job->num_dests > 1)
394 io_req.bi_rw |= REQ_UNPLUG;
360 r = dm_io(&io_req, job->num_dests, job->dests, NULL); 395 r = dm_io(&io_req, job->num_dests, job->dests, NULL);
396 if (!(io_req.bi_rw & REQ_UNPLUG))
397 prepare_unplug(job->kc, WRITE, job->dests[0].bdev);
398 }
361 399
362 return r; 400 return r;
363} 401}
@@ -435,10 +473,18 @@ static void do_work(struct work_struct *work)
435 * Pages jobs when successful will jump onto the io jobs 473 * Pages jobs when successful will jump onto the io jobs
436 * list. io jobs call wake when they complete and it all 474 * list. io jobs call wake when they complete and it all
437 * starts again. 475 * starts again.
476 *
477 * Note that io_jobs add block devices to the unplug array;
478 * this array is cleared with "unplug" calls. It is thus
479 * forbidden to run complete_jobs after io_jobs and before
480 * unplug because the block device could be destroyed in
481 * the job completion callback.
438 */ 482 */
439 process_jobs(&kc->complete_jobs, kc, run_complete_job); 483 process_jobs(&kc->complete_jobs, kc, run_complete_job);
440 process_jobs(&kc->pages_jobs, kc, run_pages_job); 484 process_jobs(&kc->pages_jobs, kc, run_pages_job);
441 process_jobs(&kc->io_jobs, kc, run_io_job); 485 process_jobs(&kc->io_jobs, kc, run_io_job);
486 unplug(kc, READ);
487 unplug(kc, WRITE);
442} 488}
443 489
444/* 490/*
@@ -619,12 +665,15 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
619 INIT_LIST_HEAD(&kc->io_jobs); 665 INIT_LIST_HEAD(&kc->io_jobs);
620 INIT_LIST_HEAD(&kc->pages_jobs); 666 INIT_LIST_HEAD(&kc->pages_jobs);
621 667
668 memset(kc->unplug, 0, sizeof(kc->unplug));
669
622 kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); 670 kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
623 if (!kc->job_pool) 671 if (!kc->job_pool)
624 goto bad_slab; 672 goto bad_slab;
625 673
626 INIT_WORK(&kc->kcopyd_work, do_work); 674 INIT_WORK(&kc->kcopyd_work, do_work);
627 kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); 675 kc->kcopyd_wq = alloc_workqueue("kcopyd",
676 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
628 if (!kc->kcopyd_wq) 677 if (!kc->kcopyd_wq)
629 goto bad_workqueue; 678 goto bad_workqueue;
630 679
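
The dm-kcopyd hunks above drop the unconditional REQ_SYNC | REQ_UNPLUG from run_io_job() and replace it with a small batching scheme: the client remembers the most recently used block device per I/O direction and only calls blk_unplug() when a different device shows up (prepare_unplug) or when the work cycle in do_work() ends; multi-destination writes still set REQ_UNPLUG directly. The following userspace C sketch only illustrates that "one pending item per slot, flush on change or at the end of the batch" pattern; the batcher struct, device names and printf stand-ins are invented for the example and are not kernel APIs.

/* Userspace sketch of the "one pending unplug per direction" idea. */
#include <stdio.h>
#include <string.h>

enum { RD = 0, WR = 1 };

struct batcher {
	const char *pending[2];		/* last device seen per direction */
};

/* Flush ("unplug") whatever is pending for one direction. */
static void flush_slot(struct batcher *b, int rw)
{
	if (b->pending[rw]) {
		printf("unplug %s (%s)\n", b->pending[rw],
		       rw == RD ? "read" : "write");
		b->pending[rw] = NULL;
	}
}

/* Record a device for a later flush; if a different device is already
 * pending in this slot, flush that one first (mirrors prepare_unplug). */
static void prepare(struct batcher *b, int rw, const char *dev)
{
	if (b->pending[rw] && !strcmp(b->pending[rw], dev))
		return;
	flush_slot(b, rw);
	b->pending[rw] = dev;
}

int main(void)
{
	struct batcher b = { { NULL, NULL } };

	prepare(&b, RD, "sda");		/* several jobs on one device ... */
	prepare(&b, RD, "sda");		/* ... collapse into one flush */
	prepare(&b, RD, "sdb");		/* device change flushes sda first */
	prepare(&b, WR, "sdc");

	flush_slot(&b, RD);		/* end of the work cycle, as in do_work() */
	flush_slot(&b, WR);
	return 0;
}

Running it shows three read submissions producing only two unplug calls, which is the saving the kernel change is after.
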
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 1ed0094f064b..aa2e0c374ab3 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -12,12 +12,22 @@
12 12
13#include "dm-log-userspace-transfer.h" 13#include "dm-log-userspace-transfer.h"
14 14
15#define DM_LOG_USERSPACE_VSN "1.1.0"
16
15struct flush_entry { 17struct flush_entry {
16 int type; 18 int type;
17 region_t region; 19 region_t region;
18 struct list_head list; 20 struct list_head list;
19}; 21};
20 22
23/*
24 * This limit on the number of mark and clear requests is, to a degree,
25 * arbitrary. However, there is some basis for the choice in the limits
26 * imposed on the size of the data payload by dm-log-userspace-transfer.c:
27 * dm_consult_userspace().
28 */
29#define MAX_FLUSH_GROUP_COUNT 32
30
21struct log_c { 31struct log_c {
22 struct dm_target *ti; 32 struct dm_target *ti;
23 uint32_t region_size; 33 uint32_t region_size;
@@ -37,8 +47,15 @@ struct log_c {
37 */ 47 */
38 uint64_t in_sync_hint; 48 uint64_t in_sync_hint;
39 49
50 /*
51 * Mark and clear requests are held until a flush is issued
52 * so that we can group, and thereby limit, the amount of
53 * network traffic between kernel and userspace. The 'flush_lock'
54 * is used to protect these lists.
55 */
40 spinlock_t flush_lock; 56 spinlock_t flush_lock;
41 struct list_head flush_list; /* only for clear and mark requests */ 57 struct list_head mark_list;
58 struct list_head clear_list;
42}; 59};
43 60
44static mempool_t *flush_entry_pool; 61static mempool_t *flush_entry_pool;
@@ -169,7 +186,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
169 186
170 strncpy(lc->uuid, argv[0], DM_UUID_LEN); 187 strncpy(lc->uuid, argv[0], DM_UUID_LEN);
171 spin_lock_init(&lc->flush_lock); 188 spin_lock_init(&lc->flush_lock);
172 INIT_LIST_HEAD(&lc->flush_list); 189 INIT_LIST_HEAD(&lc->mark_list);
190 INIT_LIST_HEAD(&lc->clear_list);
173 191
174 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); 192 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
175 if (str_size < 0) { 193 if (str_size < 0) {
@@ -181,8 +199,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
181 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, 199 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
182 ctr_str, str_size, NULL, NULL); 200 ctr_str, str_size, NULL, NULL);
183 201
184 if (r == -ESRCH) { 202 if (r < 0) {
185 DMERR("Userspace log server not found"); 203 if (r == -ESRCH)
204 DMERR("Userspace log server not found");
205 else
206 DMERR("Userspace log server failed to create log");
186 goto out; 207 goto out;
187 } 208 }
188 209
@@ -214,10 +235,9 @@ out:
214 235
215static void userspace_dtr(struct dm_dirty_log *log) 236static void userspace_dtr(struct dm_dirty_log *log)
216{ 237{
217 int r;
218 struct log_c *lc = log->context; 238 struct log_c *lc = log->context;
219 239
220 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, 240 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
221 NULL, 0, 241 NULL, 0,
222 NULL, NULL); 242 NULL, NULL);
223 243
@@ -338,6 +358,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
338 return (r) ? 0 : (int)in_sync; 358 return (r) ? 0 : (int)in_sync;
339} 359}
340 360
361static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
362{
363 int r = 0;
364 struct flush_entry *fe;
365
366 list_for_each_entry(fe, flush_list, list) {
367 r = userspace_do_request(lc, lc->uuid, fe->type,
368 (char *)&fe->region,
369 sizeof(fe->region),
370 NULL, NULL);
371 if (r)
372 break;
373 }
374
375 return r;
376}
377
378static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
379{
380 int r = 0;
381 int count;
382 uint32_t type = 0;
383 struct flush_entry *fe, *tmp_fe;
384 LIST_HEAD(tmp_list);
385 uint64_t group[MAX_FLUSH_GROUP_COUNT];
386
387 /*
388 * Group process the requests
389 */
390 while (!list_empty(flush_list)) {
391 count = 0;
392
393 list_for_each_entry_safe(fe, tmp_fe, flush_list, list) {
394 group[count] = fe->region;
395 count++;
396
397 list_del(&fe->list);
398 list_add(&fe->list, &tmp_list);
399
400 type = fe->type;
401 if (count >= MAX_FLUSH_GROUP_COUNT)
402 break;
403 }
404
405 r = userspace_do_request(lc, lc->uuid, type,
406 (char *)(group),
407 count * sizeof(uint64_t),
408 NULL, NULL);
409 if (r) {
410 /* Group send failed. Attempt one-by-one. */
411 list_splice_init(&tmp_list, flush_list);
412 r = flush_one_by_one(lc, flush_list);
413 break;
414 }
415 }
416
417 /*
418 * Must collect the flush_entry structures that were successfully processed
419 * as a group so that they will be freed by the caller.
420 */
421 list_splice_init(&tmp_list, flush_list);
422
423 return r;
424}
425
341/* 426/*
342 * userspace_flush 427 * userspace_flush
343 * 428 *
@@ -360,31 +445,25 @@ static int userspace_flush(struct dm_dirty_log *log)
360 int r = 0; 445 int r = 0;
361 unsigned long flags; 446 unsigned long flags;
362 struct log_c *lc = log->context; 447 struct log_c *lc = log->context;
363 LIST_HEAD(flush_list); 448 LIST_HEAD(mark_list);
449 LIST_HEAD(clear_list);
364 struct flush_entry *fe, *tmp_fe; 450 struct flush_entry *fe, *tmp_fe;
365 451
366 spin_lock_irqsave(&lc->flush_lock, flags); 452 spin_lock_irqsave(&lc->flush_lock, flags);
367 list_splice_init(&lc->flush_list, &flush_list); 453 list_splice_init(&lc->mark_list, &mark_list);
454 list_splice_init(&lc->clear_list, &clear_list);
368 spin_unlock_irqrestore(&lc->flush_lock, flags); 455 spin_unlock_irqrestore(&lc->flush_lock, flags);
369 456
370 if (list_empty(&flush_list)) 457 if (list_empty(&mark_list) && list_empty(&clear_list))
371 return 0; 458 return 0;
372 459
373 /* 460 r = flush_by_group(lc, &mark_list);
374 * FIXME: Count up requests, group request types, 461 if (r)
375 * allocate memory to stick all requests in and 462 goto fail;
376 * send to server in one go. Failing the allocation,
377 * do it one by one.
378 */
379 463
380 list_for_each_entry(fe, &flush_list, list) { 464 r = flush_by_group(lc, &clear_list);
381 r = userspace_do_request(lc, lc->uuid, fe->type, 465 if (r)
382 (char *)&fe->region, 466 goto fail;
383 sizeof(fe->region),
384 NULL, NULL);
385 if (r)
386 goto fail;
387 }
388 467
389 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 468 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
390 NULL, 0, NULL, NULL); 469 NULL, 0, NULL, NULL);
@@ -395,7 +474,11 @@ fail:
395 * Calling code will receive an error and will know that 474 * Calling code will receive an error and will know that
396 * the log facility has failed. 475 * the log facility has failed.
397 */ 476 */
398 list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { 477 list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) {
478 list_del(&fe->list);
479 mempool_free(fe, flush_entry_pool);
480 }
481 list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) {
399 list_del(&fe->list); 482 list_del(&fe->list);
400 mempool_free(fe, flush_entry_pool); 483 mempool_free(fe, flush_entry_pool);
401 } 484 }
@@ -425,7 +508,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
425 spin_lock_irqsave(&lc->flush_lock, flags); 508 spin_lock_irqsave(&lc->flush_lock, flags);
426 fe->type = DM_ULOG_MARK_REGION; 509 fe->type = DM_ULOG_MARK_REGION;
427 fe->region = region; 510 fe->region = region;
428 list_add(&fe->list, &lc->flush_list); 511 list_add(&fe->list, &lc->mark_list);
429 spin_unlock_irqrestore(&lc->flush_lock, flags); 512 spin_unlock_irqrestore(&lc->flush_lock, flags);
430 513
431 return; 514 return;
@@ -462,7 +545,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
462 spin_lock_irqsave(&lc->flush_lock, flags); 545 spin_lock_irqsave(&lc->flush_lock, flags);
463 fe->type = DM_ULOG_CLEAR_REGION; 546 fe->type = DM_ULOG_CLEAR_REGION;
464 fe->region = region; 547 fe->region = region;
465 list_add(&fe->list, &lc->flush_list); 548 list_add(&fe->list, &lc->clear_list);
466 spin_unlock_irqrestore(&lc->flush_lock, flags); 549 spin_unlock_irqrestore(&lc->flush_lock, flags);
467 550
468 return; 551 return;
@@ -684,7 +767,7 @@ static int __init userspace_dirty_log_init(void)
684 return r; 767 return r;
685 } 768 }
686 769
687 DMINFO("version 1.0.0 loaded"); 770 DMINFO("version " DM_LOG_USERSPACE_VSN " loaded");
688 return 0; 771 return 0;
689} 772}
690 773
@@ -694,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void)
694 dm_ulog_tfr_exit(); 777 dm_ulog_tfr_exit();
695 mempool_destroy(flush_entry_pool); 778 mempool_destroy(flush_entry_pool);
696 779
697 DMINFO("version 1.0.0 unloaded"); 780 DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
698 return; 781 return;
699} 782}
700 783
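
The rework above splits the single flush_list into mark_list and clear_list and sends the queued region numbers to the userspace log server in groups of at most MAX_FLUSH_GROUP_COUNT (32) per round trip, falling back to one-by-one transmission if a grouped request fails. A minimal userspace sketch of the grouping loop follows; send_batch() is only a stand-in for dm_consult_userspace() and always succeeds here.

#include <stdint.h>
#include <stdio.h>

#define GROUP_MAX 32

/* Stand-in for dm_consult_userspace(); always succeeds in this sketch. */
static int send_batch(const uint64_t *regions, int count)
{
	printf("sending %d region(s), first is %llu\n",
	       count, (unsigned long long)regions[0]);
	return 0;
}

static int flush_regions(const uint64_t *regions, int nr)
{
	uint64_t group[GROUP_MAX];
	int i, count = 0, r;

	for (i = 0; i < nr; i++) {
		group[count++] = regions[i];
		if (count == GROUP_MAX || i == nr - 1) {
			r = send_batch(group, count);
			if (r)
				return r;	/* caller could retry one by one */
			count = 0;
		}
	}
	return 0;
}

int main(void)
{
	uint64_t regions[70];
	int i;

	for (i = 0; i < 70; i++)
		regions[i] = i;
	return flush_regions(regions, 70);	/* three requests: 32 + 32 + 6 */
}

With 70 queued regions the sketch issues three requests instead of seventy, which is the kernel-to-userspace traffic reduction the MAX_FLUSH_GROUP_COUNT comment is describing.
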
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 075cbcf8a9f5..049eaf12aaab 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -198,6 +198,7 @@ resend:
198 198
199 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); 199 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
200 memcpy(tfr->uuid, uuid, DM_UUID_LEN); 200 memcpy(tfr->uuid, uuid, DM_UUID_LEN);
201 tfr->version = DM_ULOG_REQUEST_VERSION;
201 tfr->luid = luid; 202 tfr->luid = luid;
202 tfr->seq = dm_ulog_seq++; 203 tfr->seq = dm_ulog_seq++;
203 204
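
The one-line change above stamps every outgoing request with DM_ULOG_REQUEST_VERSION so the userspace log server can detect a kernel/daemon protocol mismatch. The sketch below shows the idea with an invented ulog_request layout and an example version constant; neither matches the real dm-log-userspace structures.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ULOG_REQUEST_VERSION 2		/* example value, not the kernel's */

struct ulog_request {
	uint32_t version;
	uint64_t seq;
	char payload[64];
};

static void build_request(struct ulog_request *tfr, uint64_t seq, const char *msg)
{
	memset(tfr, 0, sizeof(*tfr));
	tfr->version = ULOG_REQUEST_VERSION;	/* stamped like dm_consult_userspace() does */
	tfr->seq = seq;
	snprintf(tfr->payload, sizeof(tfr->payload), "%s", msg);
}

static int handle_request(const struct ulog_request *tfr)
{
	if (tfr->version != ULOG_REQUEST_VERSION) {
		fprintf(stderr, "request %llu: unsupported version %u\n",
			(unsigned long long)tfr->seq, (unsigned)tfr->version);
		return -1;
	}
	printf("request %llu ok: %s\n",
	       (unsigned long long)tfr->seq, tfr->payload);
	return 0;
}

int main(void)
{
	struct ulog_request tfr;

	build_request(&tfr, 1, "flush");
	return handle_request(&tfr);
}
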
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 33420e68d153..6951536ea29c 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -455,7 +455,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
455 r = PTR_ERR(lc->io_req.client); 455 r = PTR_ERR(lc->io_req.client);
456 DMWARN("couldn't allocate disk io client"); 456 DMWARN("couldn't allocate disk io client");
457 kfree(lc); 457 kfree(lc);
458 return -ENOMEM; 458 return r;
459 } 459 }
460 460
461 lc->disk_header = vmalloc(buf_size); 461 lc->disk_header = vmalloc(buf_size);
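
The dm-log fix above stops replacing the encoded error from the io client constructor with a hard-coded -ENOMEM and returns PTR_ERR(lc->io_req.client) instead, so the caller sees the real cause of the failure. Below is a simplified userspace re-implementation of the ERR_PTR/PTR_ERR/IS_ERR convention, just to make that point; the 4095 threshold mirrors the kernel's MAX_ERRNO, everything else is illustrative.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

static inline void *ERR_PTR(long err)       { return (void *)(intptr_t)err; }
static inline long PTR_ERR(const void *p)   { return (long)(intptr_t)p; }
static inline int IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-4095;
}

/* Pretend constructor that can fail for different reasons. */
static void *client_create(int fail_perm)
{
	return fail_perm ? ERR_PTR(-EPERM) : ERR_PTR(-ENOMEM);
}

int main(void)
{
	void *c = client_create(1);

	if (IS_ERR(c))	/* propagate the original error, not a blanket -ENOMEM */
		printf("create failed: %ld\n", PTR_ERR(c));
	return 0;
}
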
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 487ecda90ad4..b82d28819e2a 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -23,6 +23,8 @@
23 23
24#define DM_MSG_PREFIX "multipath" 24#define DM_MSG_PREFIX "multipath"
25#define MESG_STR(x) x, sizeof(x) 25#define MESG_STR(x) x, sizeof(x)
26#define DM_PG_INIT_DELAY_MSECS 2000
27#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
26 28
27/* Path properties */ 29/* Path properties */
28struct pgpath { 30struct pgpath {
@@ -33,8 +35,7 @@ struct pgpath {
33 unsigned fail_count; /* Cumulative failure count */ 35 unsigned fail_count; /* Cumulative failure count */
34 36
35 struct dm_path path; 37 struct dm_path path;
36 struct work_struct deactivate_path; 38 struct delayed_work activate_path;
37 struct work_struct activate_path;
38}; 39};
39 40
40#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) 41#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -65,11 +66,15 @@ struct multipath {
65 66
66 const char *hw_handler_name; 67 const char *hw_handler_name;
67 char *hw_handler_params; 68 char *hw_handler_params;
69
68 unsigned nr_priority_groups; 70 unsigned nr_priority_groups;
69 struct list_head priority_groups; 71 struct list_head priority_groups;
72
73 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
74
70 unsigned pg_init_required; /* pg_init needs calling? */ 75 unsigned pg_init_required; /* pg_init needs calling? */
71 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ 76 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
72 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ 77 unsigned pg_init_delay_retry; /* Delay pg_init retry? */
73 78
74 unsigned nr_valid_paths; /* Total number of usable paths */ 79 unsigned nr_valid_paths; /* Total number of usable paths */
75 struct pgpath *current_pgpath; 80 struct pgpath *current_pgpath;
@@ -82,6 +87,7 @@ struct multipath {
82 unsigned saved_queue_if_no_path;/* Saved state during suspension */ 87 unsigned saved_queue_if_no_path;/* Saved state during suspension */
83 unsigned pg_init_retries; /* Number of times to retry pg_init */ 88 unsigned pg_init_retries; /* Number of times to retry pg_init */
84 unsigned pg_init_count; /* Number of times pg_init called */ 89 unsigned pg_init_count; /* Number of times pg_init called */
90 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
85 91
86 struct work_struct process_queued_ios; 92 struct work_struct process_queued_ios;
87 struct list_head queued_ios; 93 struct list_head queued_ios;
@@ -116,7 +122,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
116static void process_queued_ios(struct work_struct *work); 122static void process_queued_ios(struct work_struct *work);
117static void trigger_event(struct work_struct *work); 123static void trigger_event(struct work_struct *work);
118static void activate_path(struct work_struct *work); 124static void activate_path(struct work_struct *work);
119static void deactivate_path(struct work_struct *work);
120 125
121 126
122/*----------------------------------------------- 127/*-----------------------------------------------
@@ -129,8 +134,7 @@ static struct pgpath *alloc_pgpath(void)
129 134
130 if (pgpath) { 135 if (pgpath) {
131 pgpath->is_active = 1; 136 pgpath->is_active = 1;
132 INIT_WORK(&pgpath->deactivate_path, deactivate_path); 137 INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
133 INIT_WORK(&pgpath->activate_path, activate_path);
134 } 138 }
135 139
136 return pgpath; 140 return pgpath;
@@ -141,14 +145,6 @@ static void free_pgpath(struct pgpath *pgpath)
141 kfree(pgpath); 145 kfree(pgpath);
142} 146}
143 147
144static void deactivate_path(struct work_struct *work)
145{
146 struct pgpath *pgpath =
147 container_of(work, struct pgpath, deactivate_path);
148
149 blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue);
150}
151
152static struct priority_group *alloc_priority_group(void) 148static struct priority_group *alloc_priority_group(void)
153{ 149{
154 struct priority_group *pg; 150 struct priority_group *pg;
@@ -199,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
199 INIT_LIST_HEAD(&m->queued_ios); 195 INIT_LIST_HEAD(&m->queued_ios);
200 spin_lock_init(&m->lock); 196 spin_lock_init(&m->lock);
201 m->queue_io = 1; 197 m->queue_io = 1;
198 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
202 INIT_WORK(&m->process_queued_ios, process_queued_ios); 199 INIT_WORK(&m->process_queued_ios, process_queued_ios);
203 INIT_WORK(&m->trigger_event, trigger_event); 200 INIT_WORK(&m->trigger_event, trigger_event);
204 init_waitqueue_head(&m->pg_init_wait); 201 init_waitqueue_head(&m->pg_init_wait);
@@ -238,14 +235,19 @@ static void free_multipath(struct multipath *m)
238static void __pg_init_all_paths(struct multipath *m) 235static void __pg_init_all_paths(struct multipath *m)
239{ 236{
240 struct pgpath *pgpath; 237 struct pgpath *pgpath;
238 unsigned long pg_init_delay = 0;
241 239
242 m->pg_init_count++; 240 m->pg_init_count++;
243 m->pg_init_required = 0; 241 m->pg_init_required = 0;
242 if (m->pg_init_delay_retry)
243 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
244 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
244 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { 245 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
245 /* Skip failed paths */ 246 /* Skip failed paths */
246 if (!pgpath->is_active) 247 if (!pgpath->is_active)
247 continue; 248 continue;
248 if (queue_work(kmpath_handlerd, &pgpath->activate_path)) 249 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
250 pg_init_delay))
249 m->pg_init_in_progress++; 251 m->pg_init_in_progress++;
250 } 252 }
251} 253}
@@ -793,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m)
793 const char *param_name; 795 const char *param_name;
794 796
795 static struct param _params[] = { 797 static struct param _params[] = {
796 {0, 3, "invalid number of feature args"}, 798 {0, 5, "invalid number of feature args"},
797 {1, 50, "pg_init_retries must be between 1 and 50"}, 799 {1, 50, "pg_init_retries must be between 1 and 50"},
800 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
798 }; 801 };
799 802
800 r = read_param(_params, shift(as), &argc, &ti->error); 803 r = read_param(_params, shift(as), &argc, &ti->error);
@@ -821,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m)
821 continue; 824 continue;
822 } 825 }
823 826
827 if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) &&
828 (argc >= 1)) {
829 r = read_param(_params + 2, shift(as),
830 &m->pg_init_delay_msecs, &ti->error);
831 argc--;
832 continue;
833 }
834
824 ti->error = "Unrecognised multipath feature request"; 835 ti->error = "Unrecognised multipath feature request";
825 r = -EINVAL; 836 r = -EINVAL;
826 } while (argc && !r); 837 } while (argc && !r);
@@ -931,7 +942,7 @@ static void flush_multipath_work(struct multipath *m)
931 flush_workqueue(kmpath_handlerd); 942 flush_workqueue(kmpath_handlerd);
932 multipath_wait_for_pg_init_completion(m); 943 multipath_wait_for_pg_init_completion(m);
933 flush_workqueue(kmultipathd); 944 flush_workqueue(kmultipathd);
934 flush_scheduled_work(); 945 flush_work_sync(&m->trigger_event);
935} 946}
936 947
937static void multipath_dtr(struct dm_target *ti) 948static void multipath_dtr(struct dm_target *ti)
@@ -995,7 +1006,6 @@ static int fail_path(struct pgpath *pgpath)
995 pgpath->path.dev->name, m->nr_valid_paths); 1006 pgpath->path.dev->name, m->nr_valid_paths);
996 1007
997 schedule_work(&m->trigger_event); 1008 schedule_work(&m->trigger_event);
998 queue_work(kmultipathd, &pgpath->deactivate_path);
999 1009
1000out: 1010out:
1001 spin_unlock_irqrestore(&m->lock, flags); 1011 spin_unlock_irqrestore(&m->lock, flags);
@@ -1034,7 +1044,7 @@ static int reinstate_path(struct pgpath *pgpath)
1034 m->current_pgpath = NULL; 1044 m->current_pgpath = NULL;
1035 queue_work(kmultipathd, &m->process_queued_ios); 1045 queue_work(kmultipathd, &m->process_queued_ios);
1036 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { 1046 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1037 if (queue_work(kmpath_handlerd, &pgpath->activate_path)) 1047 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1038 m->pg_init_in_progress++; 1048 m->pg_init_in_progress++;
1039 } 1049 }
1040 1050
@@ -1169,6 +1179,7 @@ static void pg_init_done(void *data, int errors)
1169 struct priority_group *pg = pgpath->pg; 1179 struct priority_group *pg = pgpath->pg;
1170 struct multipath *m = pg->m; 1180 struct multipath *m = pg->m;
1171 unsigned long flags; 1181 unsigned long flags;
1182 unsigned delay_retry = 0;
1172 1183
1173 /* device or driver problems */ 1184 /* device or driver problems */
1174 switch (errors) { 1185 switch (errors) {
@@ -1193,8 +1204,9 @@ static void pg_init_done(void *data, int errors)
1193 */ 1204 */
1194 bypass_pg(m, pg, 1); 1205 bypass_pg(m, pg, 1);
1195 break; 1206 break;
1196 /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */
1197 case SCSI_DH_RETRY: 1207 case SCSI_DH_RETRY:
1208 /* Wait before retrying. */
1209 delay_retry = 1;
1198 case SCSI_DH_IMM_RETRY: 1210 case SCSI_DH_IMM_RETRY:
1199 case SCSI_DH_RES_TEMP_UNAVAIL: 1211 case SCSI_DH_RES_TEMP_UNAVAIL:
1200 if (pg_init_limit_reached(m, pgpath)) 1212 if (pg_init_limit_reached(m, pgpath))
@@ -1227,6 +1239,7 @@ static void pg_init_done(void *data, int errors)
1227 if (!m->pg_init_required) 1239 if (!m->pg_init_required)
1228 m->queue_io = 0; 1240 m->queue_io = 0;
1229 1241
1242 m->pg_init_delay_retry = delay_retry;
1230 queue_work(kmultipathd, &m->process_queued_ios); 1243 queue_work(kmultipathd, &m->process_queued_ios);
1231 1244
1232 /* 1245 /*
@@ -1241,7 +1254,7 @@ out:
1241static void activate_path(struct work_struct *work) 1254static void activate_path(struct work_struct *work)
1242{ 1255{
1243 struct pgpath *pgpath = 1256 struct pgpath *pgpath =
1244 container_of(work, struct pgpath, activate_path); 1257 container_of(work, struct pgpath, activate_path.work);
1245 1258
1246 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), 1259 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
1247 pg_init_done, pgpath); 1260 pg_init_done, pgpath);
@@ -1382,11 +1395,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1382 DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); 1395 DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
1383 else { 1396 else {
1384 DMEMIT("%u ", m->queue_if_no_path + 1397 DMEMIT("%u ", m->queue_if_no_path +
1385 (m->pg_init_retries > 0) * 2); 1398 (m->pg_init_retries > 0) * 2 +
1399 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2);
1386 if (m->queue_if_no_path) 1400 if (m->queue_if_no_path)
1387 DMEMIT("queue_if_no_path "); 1401 DMEMIT("queue_if_no_path ");
1388 if (m->pg_init_retries) 1402 if (m->pg_init_retries)
1389 DMEMIT("pg_init_retries %u ", m->pg_init_retries); 1403 DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1404 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1405 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1390 } 1406 }
1391 1407
1392 if (!m->hw_handler_name || type == STATUSTYPE_INFO) 1408 if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1655,7 +1671,7 @@ out:
1655 *---------------------------------------------------------------*/ 1671 *---------------------------------------------------------------*/
1656static struct target_type multipath_target = { 1672static struct target_type multipath_target = {
1657 .name = "multipath", 1673 .name = "multipath",
1658 .version = {1, 1, 1}, 1674 .version = {1, 2, 0},
1659 .module = THIS_MODULE, 1675 .module = THIS_MODULE,
1660 .ctr = multipath_ctr, 1676 .ctr = multipath_ctr,
1661 .dtr = multipath_dtr, 1677 .dtr = multipath_dtr,
@@ -1687,7 +1703,7 @@ static int __init dm_multipath_init(void)
1687 return -EINVAL; 1703 return -EINVAL;
1688 } 1704 }
1689 1705
1690 kmultipathd = create_workqueue("kmpathd"); 1706 kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
1691 if (!kmultipathd) { 1707 if (!kmultipathd) {
1692 DMERR("failed to create workqueue kmpathd"); 1708 DMERR("failed to create workqueue kmpathd");
1693 dm_unregister_target(&multipath_target); 1709 dm_unregister_target(&multipath_target);
@@ -1701,7 +1717,8 @@ static int __init dm_multipath_init(void)
1701 * old workqueue would also create a bottleneck in the 1717 * old workqueue would also create a bottleneck in the
1702 * path of the storage hardware device activation. 1718 * path of the storage hardware device activation.
1703 */ 1719 */
1704 kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); 1720 kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
1721 WQ_MEM_RECLAIM);
1705 if (!kmpath_handlerd) { 1722 if (!kmpath_handlerd) {
1706 DMERR("failed to create workqueue kmpath_handlerd"); 1723 DMERR("failed to create workqueue kmpath_handlerd");
1707 destroy_workqueue(kmultipathd); 1724 destroy_workqueue(kmultipathd);
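
The multipath changes above turn activate_path into a delayed work item and add an optional pg_init_delay_msecs feature argument (0 to 60000, defaulting to 2000 ms) that throttles pg_init retries after SCSI_DH_RETRY. The userspace sketch below only mirrors the argument handling and the delay selection done in __pg_init_all_paths(); parse_feature() and mpath_cfg are invented for the example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PG_INIT_DELAY_MSECS	2000		/* built-in default */
#define PG_INIT_DELAY_UNSET	((unsigned)-1)

struct mpath_cfg {
	unsigned pg_init_delay_msecs;
};

static int parse_feature(struct mpath_cfg *c, const char *key, const char *val)
{
	if (!strcmp(key, "pg_init_delay_msecs")) {
		unsigned long v = strtoul(val, NULL, 10);

		if (v > 60000) {
			fprintf(stderr, "pg_init_delay_msecs must be between 0 and 60000\n");
			return -1;
		}
		c->pg_init_delay_msecs = (unsigned)v;
		return 0;
	}
	fprintf(stderr, "Unrecognised feature: %s\n", key);
	return -1;
}

/* Delay used when a retry is requested, as in __pg_init_all_paths(). */
static unsigned effective_delay(const struct mpath_cfg *c, int delay_retry)
{
	if (!delay_retry)
		return 0;
	return c->pg_init_delay_msecs != PG_INIT_DELAY_UNSET ?
	       c->pg_init_delay_msecs : PG_INIT_DELAY_MSECS;
}

int main(void)
{
	struct mpath_cfg c = { PG_INIT_DELAY_UNSET };

	printf("default retry delay: %u ms\n", effective_delay(&c, 1));
	parse_feature(&c, "pg_init_delay_msecs", "500");
	printf("configured retry delay: %u ms\n", effective_delay(&c, 1));
	return 0;
}
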
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
new file mode 100644
index 000000000000..b9e1e15ef11c
--- /dev/null
+++ b/drivers/md/dm-raid.c
@@ -0,0 +1,697 @@
1/*
2 * Copyright (C) 2010-2011 Neil Brown
3 * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/slab.h>
9
10#include "md.h"
11#include "raid5.h"
12#include "dm.h"
13#include "bitmap.h"
14
15#define DM_MSG_PREFIX "raid"
16
17/*
18 * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
19 * make it so the flag doesn't set anything.
20 */
21#ifndef MD_SYNC_STATE_FORCED
22#define MD_SYNC_STATE_FORCED 0
23#endif
24
25struct raid_dev {
26 /*
27 * Two DM devices, one to hold metadata and one to hold the
28 * actual data/parity. The reason for this is to not confuse
29 * ti->len and give more flexibility in altering size and
30 * characteristics.
31 *
32 * While it is possible for this device to be associated
33 * with a different physical device than the data_dev, it
34 * is intended for it to be the same.
35 * |--------- Physical Device ---------|
36 * |- meta_dev -|------ data_dev ------|
37 */
38 struct dm_dev *meta_dev;
39 struct dm_dev *data_dev;
40 struct mdk_rdev_s rdev;
41};
42
43/*
44 * Flags for rs->print_flags field.
45 */
46#define DMPF_DAEMON_SLEEP 0x1
47#define DMPF_MAX_WRITE_BEHIND 0x2
48#define DMPF_SYNC 0x4
49#define DMPF_NOSYNC 0x8
50#define DMPF_STRIPE_CACHE 0x10
51#define DMPF_MIN_RECOVERY_RATE 0x20
52#define DMPF_MAX_RECOVERY_RATE 0x40
53
54struct raid_set {
55 struct dm_target *ti;
56
57 uint64_t print_flags;
58
59 struct mddev_s md;
60 struct raid_type *raid_type;
61 struct dm_target_callbacks callbacks;
62
63 struct raid_dev dev[0];
64};
65
66/* Supported raid types and properties. */
67static struct raid_type {
68 const char *name; /* RAID algorithm. */
69 const char *descr; /* Descriptor text for logging. */
70 const unsigned parity_devs; /* # of parity devices. */
71 const unsigned minimal_devs; /* minimal # of devices in set. */
72 const unsigned level; /* RAID level. */
73 const unsigned algorithm; /* RAID algorithm. */
74} raid_types[] = {
75 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
76 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
77 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
78 {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
79 {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
80 {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
81 {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
82 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
83};
84
85static struct raid_type *get_raid_type(char *name)
86{
87 int i;
88
89 for (i = 0; i < ARRAY_SIZE(raid_types); i++)
90 if (!strcmp(raid_types[i].name, name))
91 return &raid_types[i];
92
93 return NULL;
94}
95
96static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
97{
98 unsigned i;
99 struct raid_set *rs;
100 sector_t sectors_per_dev;
101
102 if (raid_devs <= raid_type->parity_devs) {
103 ti->error = "Insufficient number of devices";
104 return ERR_PTR(-EINVAL);
105 }
106
107 sectors_per_dev = ti->len;
108 if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
109 ti->error = "Target length not divisible by number of data devices";
110 return ERR_PTR(-EINVAL);
111 }
112
113 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
114 if (!rs) {
115 ti->error = "Cannot allocate raid context";
116 return ERR_PTR(-ENOMEM);
117 }
118
119 mddev_init(&rs->md);
120
121 rs->ti = ti;
122 rs->raid_type = raid_type;
123 rs->md.raid_disks = raid_devs;
124 rs->md.level = raid_type->level;
125 rs->md.new_level = rs->md.level;
126 rs->md.dev_sectors = sectors_per_dev;
127 rs->md.layout = raid_type->algorithm;
128 rs->md.new_layout = rs->md.layout;
129 rs->md.delta_disks = 0;
130 rs->md.recovery_cp = 0;
131
132 for (i = 0; i < raid_devs; i++)
133 md_rdev_init(&rs->dev[i].rdev);
134
135 /*
136 * Remaining items to be initialized by further RAID params:
137 * rs->md.persistent
138 * rs->md.external
139 * rs->md.chunk_sectors
140 * rs->md.new_chunk_sectors
141 */
142
143 return rs;
144}
145
146static void context_free(struct raid_set *rs)
147{
148 int i;
149
150 for (i = 0; i < rs->md.raid_disks; i++)
151 if (rs->dev[i].data_dev)
152 dm_put_device(rs->ti, rs->dev[i].data_dev);
153
154 kfree(rs);
155}
156
157/*
158 * For every device we have two words
159 * <meta_dev>: meta device name or '-' if missing
160 * <data_dev>: data device name or '-' if missing
161 *
162 * This code parses those words.
163 */
164static int dev_parms(struct raid_set *rs, char **argv)
165{
166 int i;
167 int rebuild = 0;
168 int metadata_available = 0;
169 int ret = 0;
170
171 for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
172 rs->dev[i].rdev.raid_disk = i;
173
174 rs->dev[i].meta_dev = NULL;
175 rs->dev[i].data_dev = NULL;
176
177 /*
178 * There are no offsets, since there is a separate device
179 * for data and metadata.
180 */
181 rs->dev[i].rdev.data_offset = 0;
182 rs->dev[i].rdev.mddev = &rs->md;
183
184 if (strcmp(argv[0], "-")) {
185 rs->ti->error = "Metadata devices not supported";
186 return -EINVAL;
187 }
188
189 if (!strcmp(argv[1], "-")) {
190 if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
191 (!rs->dev[i].rdev.recovery_offset)) {
192 rs->ti->error = "Drive designated for rebuild not specified";
193 return -EINVAL;
194 }
195
196 continue;
197 }
198
199 ret = dm_get_device(rs->ti, argv[1],
200 dm_table_get_mode(rs->ti->table),
201 &rs->dev[i].data_dev);
202 if (ret) {
203 rs->ti->error = "RAID device lookup failure";
204 return ret;
205 }
206
207 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
208 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
209 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
210 rebuild++;
211 }
212
213 if (metadata_available) {
214 rs->md.external = 0;
215 rs->md.persistent = 1;
216 rs->md.major_version = 2;
217 } else if (rebuild && !rs->md.recovery_cp) {
218 /*
219 * Without metadata, we will not be able to tell if the array
220 * is in-sync or not - we must assume it is not. Therefore,
221 * it is impossible to rebuild a drive.
222 *
223 * Even if there is metadata, the on-disk information may
224 * indicate that the array is not in-sync and it will then
225 * fail at that time.
226 *
227 * User could specify 'nosync' option if desperate.
228 */
229 DMERR("Unable to rebuild drive while array is not in-sync");
230 rs->ti->error = "RAID device lookup failure";
231 return -EINVAL;
232 }
233
234 return 0;
235}
236
237/*
238 * Possible arguments are...
239 * RAID456:
240 * <chunk_size> [optional_args]
241 *
242 * Optional args:
243 * [[no]sync] Force or prevent recovery of the entire array
244 * [rebuild <idx>] Rebuild the drive indicated by the index
245 * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits
246 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
247 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
248 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
249 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
250 */
251static int parse_raid_params(struct raid_set *rs, char **argv,
252 unsigned num_raid_params)
253{
254 unsigned i, rebuild_cnt = 0;
255 unsigned long value;
256 char *key;
257
258 /*
259 * First, parse the in-order required arguments
260 */
261 if ((strict_strtoul(argv[0], 10, &value) < 0) ||
262 !is_power_of_2(value) || (value < 8)) {
263 rs->ti->error = "Bad chunk size";
264 return -EINVAL;
265 }
266
267 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
268 argv++;
269 num_raid_params--;
270
271 /*
272 * Second, parse the unordered optional arguments
273 */
274 for (i = 0; i < rs->md.raid_disks; i++)
275 set_bit(In_sync, &rs->dev[i].rdev.flags);
276
277 for (i = 0; i < num_raid_params; i++) {
278 if (!strcmp(argv[i], "nosync")) {
279 rs->md.recovery_cp = MaxSector;
280 rs->print_flags |= DMPF_NOSYNC;
281 rs->md.flags |= MD_SYNC_STATE_FORCED;
282 continue;
283 }
284 if (!strcmp(argv[i], "sync")) {
285 rs->md.recovery_cp = 0;
286 rs->print_flags |= DMPF_SYNC;
287 rs->md.flags |= MD_SYNC_STATE_FORCED;
288 continue;
289 }
290
291 /* The rest of the optional arguments come in key/value pairs */
292 if ((i + 1) >= num_raid_params) {
293 rs->ti->error = "Wrong number of raid parameters given";
294 return -EINVAL;
295 }
296
297 key = argv[i++];
298 if (strict_strtoul(argv[i], 10, &value) < 0) {
299 rs->ti->error = "Bad numerical argument given in raid params";
300 return -EINVAL;
301 }
302
303 if (!strcmp(key, "rebuild")) {
304 if (++rebuild_cnt > rs->raid_type->parity_devs) {
305 rs->ti->error = "Too many rebuild drives given";
306 return -EINVAL;
307 }
308 if (value > rs->md.raid_disks) {
309 rs->ti->error = "Invalid rebuild index given";
310 return -EINVAL;
311 }
312 clear_bit(In_sync, &rs->dev[value].rdev.flags);
313 rs->dev[value].rdev.recovery_offset = 0;
314 } else if (!strcmp(key, "max_write_behind")) {
315 rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
316
317 /*
318 * In device-mapper, we specify things in sectors, but
319 * MD records this value in kB
320 */
321 value /= 2;
322 if (value > COUNTER_MAX) {
323 rs->ti->error = "Max write-behind limit out of range";
324 return -EINVAL;
325 }
326 rs->md.bitmap_info.max_write_behind = value;
327 } else if (!strcmp(key, "daemon_sleep")) {
328 rs->print_flags |= DMPF_DAEMON_SLEEP;
329 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
330 rs->ti->error = "daemon sleep period out of range";
331 return -EINVAL;
332 }
333 rs->md.bitmap_info.daemon_sleep = value;
334 } else if (!strcmp(key, "stripe_cache")) {
335 rs->print_flags |= DMPF_STRIPE_CACHE;
336
337 /*
338 * In device-mapper, we specify things in sectors, but
339 * MD records this value in kB
340 */
341 value /= 2;
342
343 if (rs->raid_type->level < 5) {
344 rs->ti->error = "Inappropriate argument: stripe_cache";
345 return -EINVAL;
346 }
347 if (raid5_set_cache_size(&rs->md, (int)value)) {
348 rs->ti->error = "Bad stripe_cache size";
349 return -EINVAL;
350 }
351 } else if (!strcmp(key, "min_recovery_rate")) {
352 rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
353 if (value > INT_MAX) {
354 rs->ti->error = "min_recovery_rate out of range";
355 return -EINVAL;
356 }
357 rs->md.sync_speed_min = (int)value;
358 } else if (!strcmp(key, "max_recovery_rate")) {
359 rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
360 if (value > INT_MAX) {
361 rs->ti->error = "max_recovery_rate out of range";
362 return -EINVAL;
363 }
364 rs->md.sync_speed_max = (int)value;
365 } else {
366 DMERR("Unable to parse RAID parameter: %s", key);
367 rs->ti->error = "Unable to parse RAID parameters";
368 return -EINVAL;
369 }
370 }
371
372 /* Assume there are no metadata devices until the drives are parsed */
373 rs->md.persistent = 0;
374 rs->md.external = 1;
375
376 return 0;
377}
378
379static void do_table_event(struct work_struct *ws)
380{
381 struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
382
383 dm_table_event(rs->ti->table);
384}
385
386static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
387{
388 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
389
390 return md_raid5_congested(&rs->md, bits);
391}
392
393static void raid_unplug(struct dm_target_callbacks *cb)
394{
395 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
396
397 md_raid5_unplug_device(rs->md.private);
398}
399
400/*
401 * Construct a RAID4/5/6 mapping:
402 * Args:
403 * <raid_type> <#raid_params> <raid_params> \
404 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
405 *
406 * ** metadata devices are not supported yet, use '-' instead **
407 *
408 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
409 * details on possible <raid_params>.
410 */
411static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
412{
413 int ret;
414 struct raid_type *rt;
415 unsigned long num_raid_params, num_raid_devs;
416 struct raid_set *rs = NULL;
417
418 /* Must have at least <raid_type> <#raid_params> */
419 if (argc < 2) {
420 ti->error = "Too few arguments";
421 return -EINVAL;
422 }
423
424 /* raid type */
425 rt = get_raid_type(argv[0]);
426 if (!rt) {
427 ti->error = "Unrecognised raid_type";
428 return -EINVAL;
429 }
430 argc--;
431 argv++;
432
433 /* number of RAID parameters */
434 if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
435 ti->error = "Cannot understand number of RAID parameters";
436 return -EINVAL;
437 }
438 argc--;
439 argv++;
440
441 /* Skip over RAID params for now and find out # of devices */
442 if (num_raid_params + 1 > argc) {
443 ti->error = "Arguments do not agree with counts given";
444 return -EINVAL;
445 }
446
447 if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
448 (num_raid_devs >= INT_MAX)) {
449 ti->error = "Cannot understand number of raid devices";
450 return -EINVAL;
451 }
452
453 rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
454 if (IS_ERR(rs))
455 return PTR_ERR(rs);
456
457 ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
458 if (ret)
459 goto bad;
460
461 ret = -EINVAL;
462
463 argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
464 argv += num_raid_params + 1;
465
466 if (argc != (num_raid_devs * 2)) {
467		ti->error = "Supplied RAID devices do not match the count given";
468 goto bad;
469 }
470
471 ret = dev_parms(rs, argv);
472 if (ret)
473 goto bad;
474
475 INIT_WORK(&rs->md.event_work, do_table_event);
476 ti->split_io = rs->md.chunk_sectors;
477 ti->private = rs;
478
479 mutex_lock(&rs->md.reconfig_mutex);
480 ret = md_run(&rs->md);
481 rs->md.in_sync = 0; /* Assume already marked dirty */
482 mutex_unlock(&rs->md.reconfig_mutex);
483
484 if (ret) {
485		ti->error = "Failed to run raid array";
486 goto bad;
487 }
488
489 rs->callbacks.congested_fn = raid_is_congested;
490 rs->callbacks.unplug_fn = raid_unplug;
491 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
492
493 return 0;
494
495bad:
496 context_free(rs);
497
498 return ret;
499}
500
501static void raid_dtr(struct dm_target *ti)
502{
503 struct raid_set *rs = ti->private;
504
505 list_del_init(&rs->callbacks.list);
506 md_stop(&rs->md);
507 context_free(rs);
508}
509
510static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
511{
512 struct raid_set *rs = ti->private;
513 mddev_t *mddev = &rs->md;
514
515 mddev->pers->make_request(mddev, bio);
516
517 return DM_MAPIO_SUBMITTED;
518}
519
520static int raid_status(struct dm_target *ti, status_type_t type,
521 char *result, unsigned maxlen)
522{
523 struct raid_set *rs = ti->private;
524 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
525 unsigned sz = 0;
526 int i;
527 sector_t sync;
528
529 switch (type) {
530 case STATUSTYPE_INFO:
531 DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
532
533 for (i = 0; i < rs->md.raid_disks; i++) {
534 if (test_bit(Faulty, &rs->dev[i].rdev.flags))
535 DMEMIT("D");
536 else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
537 DMEMIT("A");
538 else
539 DMEMIT("a");
540 }
541
542 if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
543 sync = rs->md.curr_resync_completed;
544 else
545 sync = rs->md.recovery_cp;
546
547 if (sync > rs->md.resync_max_sectors)
548 sync = rs->md.resync_max_sectors;
549
550 DMEMIT(" %llu/%llu",
551 (unsigned long long) sync,
552 (unsigned long long) rs->md.resync_max_sectors);
553
554 break;
555 case STATUSTYPE_TABLE:
556 /* The string you would use to construct this array */
557 for (i = 0; i < rs->md.raid_disks; i++)
558 if (rs->dev[i].data_dev &&
559 !test_bit(In_sync, &rs->dev[i].rdev.flags))
560 raid_param_cnt++; /* for rebuilds */
561
562 raid_param_cnt += (hweight64(rs->print_flags) * 2);
563 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
564 raid_param_cnt--;
565
566 DMEMIT("%s %u %u", rs->raid_type->name,
567 raid_param_cnt, rs->md.chunk_sectors);
568
569 if ((rs->print_flags & DMPF_SYNC) &&
570 (rs->md.recovery_cp == MaxSector))
571 DMEMIT(" sync");
572 if (rs->print_flags & DMPF_NOSYNC)
573 DMEMIT(" nosync");
574
575 for (i = 0; i < rs->md.raid_disks; i++)
576 if (rs->dev[i].data_dev &&
577 !test_bit(In_sync, &rs->dev[i].rdev.flags))
578 DMEMIT(" rebuild %u", i);
579
580 if (rs->print_flags & DMPF_DAEMON_SLEEP)
581 DMEMIT(" daemon_sleep %lu",
582 rs->md.bitmap_info.daemon_sleep);
583
584 if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
585 DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
586
587 if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
588 DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
589
590 if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
591 DMEMIT(" max_write_behind %lu",
592 rs->md.bitmap_info.max_write_behind);
593
594 if (rs->print_flags & DMPF_STRIPE_CACHE) {
595 raid5_conf_t *conf = rs->md.private;
596
597 /* convert from kiB to sectors */
598 DMEMIT(" stripe_cache %d",
599 conf ? conf->max_nr_stripes * 2 : 0);
600 }
601
602 DMEMIT(" %d", rs->md.raid_disks);
603 for (i = 0; i < rs->md.raid_disks; i++) {
604 DMEMIT(" -"); /* metadata device */
605
606 if (rs->dev[i].data_dev)
607 DMEMIT(" %s", rs->dev[i].data_dev->name);
608 else
609 DMEMIT(" -");
610 }
611 }
612
613 return 0;
614}
615
616static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
617{
618 struct raid_set *rs = ti->private;
619 unsigned i;
620 int ret = 0;
621
622 for (i = 0; !ret && i < rs->md.raid_disks; i++)
623 if (rs->dev[i].data_dev)
624 ret = fn(ti,
625 rs->dev[i].data_dev,
626 0, /* No offset on data devs */
627 rs->md.dev_sectors,
628 data);
629
630 return ret;
631}
632
633static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
634{
635 struct raid_set *rs = ti->private;
636 unsigned chunk_size = rs->md.chunk_sectors << 9;
637 raid5_conf_t *conf = rs->md.private;
638
639 blk_limits_io_min(limits, chunk_size);
640 blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
641}
642
643static void raid_presuspend(struct dm_target *ti)
644{
645 struct raid_set *rs = ti->private;
646
647 md_stop_writes(&rs->md);
648}
649
650static void raid_postsuspend(struct dm_target *ti)
651{
652 struct raid_set *rs = ti->private;
653
654 mddev_suspend(&rs->md);
655}
656
657static void raid_resume(struct dm_target *ti)
658{
659 struct raid_set *rs = ti->private;
660
661 mddev_resume(&rs->md);
662}
663
664static struct target_type raid_target = {
665 .name = "raid",
666 .version = {1, 0, 0},
667 .module = THIS_MODULE,
668 .ctr = raid_ctr,
669 .dtr = raid_dtr,
670 .map = raid_map,
671 .status = raid_status,
672 .iterate_devices = raid_iterate_devices,
673 .io_hints = raid_io_hints,
674 .presuspend = raid_presuspend,
675 .postsuspend = raid_postsuspend,
676 .resume = raid_resume,
677};
678
679static int __init dm_raid_init(void)
680{
681 return dm_register_target(&raid_target);
682}
683
684static void __exit dm_raid_exit(void)
685{
686 dm_unregister_target(&raid_target);
687}
688
689module_init(dm_raid_init);
690module_exit(dm_raid_exit);
691
692MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
693MODULE_ALIAS("dm-raid4");
694MODULE_ALIAS("dm-raid5");
695MODULE_ALIAS("dm-raid6");
696MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
697MODULE_LICENSE("GPL");
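
The new dm-raid target above maps a table line of the form <raid_type> <#raid_params> <raid_params> <#raid_devs> <meta_dev1> <data_dev1> ... onto an MD RAID4/5/6 personality. The standalone sketch below mirrors only the raid_types[] lookup done by get_raid_type(); the table contents match the names and levels in the new file, but the program around it is illustrative.

#include <stdio.h>
#include <string.h>

struct raid_type {
	const char *name;
	unsigned parity_devs;
	unsigned minimal_devs;
	unsigned level;
};

static const struct raid_type raid_types[] = {
	{ "raid4",    1, 2, 5 },
	{ "raid5_la", 1, 2, 5 },
	{ "raid5_ra", 1, 2, 5 },
	{ "raid5_ls", 1, 2, 5 },
	{ "raid5_rs", 1, 2, 5 },
	{ "raid6_zr", 2, 4, 6 },
	{ "raid6_nr", 2, 4, 6 },
	{ "raid6_nc", 2, 4, 6 },
};

static const struct raid_type *get_raid_type(const char *name)
{
	size_t i;

	for (i = 0; i < sizeof(raid_types) / sizeof(raid_types[0]); i++)
		if (!strcmp(raid_types[i].name, name))
			return &raid_types[i];
	return NULL;
}

int main(int argc, char **argv)
{
	const struct raid_type *rt = get_raid_type(argc > 1 ? argv[1] : "raid5_ls");

	if (!rt) {
		fprintf(stderr, "Unrecognised raid_type\n");
		return 1;
	}
	printf("%s: level %u, %u parity device(s), minimal set of %u device(s)\n",
	       rt->name, rt->level, rt->parity_devs, rt->minimal_devs);
	return 0;
}

A plausible table line for a three-disk left-symmetric RAID5 with 64 KiB (128-sector) chunks would be "<start> <len> raid raid5_ls 1 128 3 - /dev/sda - /dev/sdb - /dev/sdc"; the metadata devices must still be given as '-' because the constructor rejects anything else.
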
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 19a59b041c27..dee326775c60 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti)
261 struct dm_io_request io_req = { 261 struct dm_io_request io_req = {
262 .bi_rw = WRITE_FLUSH, 262 .bi_rw = WRITE_FLUSH,
263 .mem.type = DM_IO_KMEM, 263 .mem.type = DM_IO_KMEM,
264 .mem.ptr.bvec = NULL, 264 .mem.ptr.addr = NULL,
265 .client = ms->io_client, 265 .client = ms->io_client,
266 }; 266 };
267 267
@@ -637,6 +637,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
637 .client = ms->io_client, 637 .client = ms->io_client,
638 }; 638 };
639 639
640 if (bio->bi_rw & REQ_DISCARD) {
641 io_req.bi_rw |= REQ_DISCARD;
642 io_req.mem.type = DM_IO_KMEM;
643 io_req.mem.ptr.addr = NULL;
644 }
645
640 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) 646 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
641 map_region(dest++, m, bio); 647 map_region(dest++, m, bio);
642 648
@@ -670,7 +676,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
670 bio_list_init(&requeue); 676 bio_list_init(&requeue);
671 677
672 while ((bio = bio_list_pop(writes))) { 678 while ((bio = bio_list_pop(writes))) {
673 if (bio->bi_rw & REQ_FLUSH) { 679 if ((bio->bi_rw & REQ_FLUSH) ||
680 (bio->bi_rw & REQ_DISCARD)) {
674 bio_list_add(&sync, bio); 681 bio_list_add(&sync, bio);
675 continue; 682 continue;
676 } 683 }
@@ -1076,8 +1083,10 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1076 ti->private = ms; 1083 ti->private = ms;
1077 ti->split_io = dm_rh_get_region_size(ms->rh); 1084 ti->split_io = dm_rh_get_region_size(ms->rh);
1078 ti->num_flush_requests = 1; 1085 ti->num_flush_requests = 1;
1086 ti->num_discard_requests = 1;
1079 1087
1080 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); 1088 ms->kmirrord_wq = alloc_workqueue("kmirrord",
1089 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1081 if (!ms->kmirrord_wq) { 1090 if (!ms->kmirrord_wq) {
1082 DMERR("couldn't start kmirrord"); 1091 DMERR("couldn't start kmirrord");
1083 r = -ENOMEM; 1092 r = -ENOMEM;
@@ -1130,7 +1139,7 @@ static void mirror_dtr(struct dm_target *ti)
1130 1139
1131 del_timer_sync(&ms->timer); 1140 del_timer_sync(&ms->timer);
1132 flush_workqueue(ms->kmirrord_wq); 1141 flush_workqueue(ms->kmirrord_wq);
1133 flush_scheduled_work(); 1142 flush_work_sync(&ms->trigger_event);
1134 dm_kcopyd_client_destroy(ms->kcopyd_client); 1143 dm_kcopyd_client_destroy(ms->kcopyd_client);
1135 destroy_workqueue(ms->kmirrord_wq); 1144 destroy_workqueue(ms->kmirrord_wq);
1136 free_context(ms, ti, ms->nr_mirrors); 1145 free_context(ms, ti, ms->nr_mirrors);
@@ -1406,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
1406 1415
1407static struct target_type mirror_target = { 1416static struct target_type mirror_target = {
1408 .name = "mirror", 1417 .name = "mirror",
1409 .version = {1, 12, 0}, 1418 .version = {1, 12, 1},
1410 .module = THIS_MODULE, 1419 .module = THIS_MODULE,
1411 .ctr = mirror_ctr, 1420 .ctr = mirror_ctr,
1412 .dtr = mirror_dtr, 1421 .dtr = mirror_dtr,
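
The mirror changes above start accepting discards: do_writes() now routes REQ_DISCARD bios down the same synchronous path as REQ_FLUSH, and do_write() issues them as DM_IO_KMEM requests with a NULL address. The toy classifier below only illustrates that routing decision; the flag values are invented and are not the block layer's bits.

#include <stdio.h>

#define F_FLUSH		(1u << 0)
#define F_DISCARD	(1u << 1)

static const char *classify(unsigned flags)
{
	if (flags & (F_FLUSH | F_DISCARD))
		return "sync path";	/* handled like a flush */
	return "normal write path";
}

int main(void)
{
	printf("plain write -> %s\n", classify(0));
	printf("flush       -> %s\n", classify(F_FLUSH));
	printf("discard     -> %s\n", classify(F_DISCARD));
	return 0;
}
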
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2129cdb115dc..95891dfcbca0 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
256 */ 256 */
257 INIT_WORK_ONSTACK(&req.work, do_metadata); 257 INIT_WORK_ONSTACK(&req.work, do_metadata);
258 queue_work(ps->metadata_wq, &req.work); 258 queue_work(ps->metadata_wq, &req.work);
259 flush_workqueue(ps->metadata_wq); 259 flush_work(&req.work);
260 260
261 return req.result; 261 return req.result;
262} 262}
@@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store,
818 atomic_set(&ps->pending_count, 0); 818 atomic_set(&ps->pending_count, 0);
819 ps->callbacks = NULL; 819 ps->callbacks = NULL;
820 820
821 ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); 821 ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0);
822 if (!ps->metadata_wq) { 822 if (!ps->metadata_wq) {
823 kfree(ps); 823 kfree(ps);
824 DMERR("couldn't start header metadata update thread"); 824 DMERR("couldn't start header metadata update thread");
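
chunk_io() above now waits with flush_work(&req.work) instead of flush_workqueue(), i.e. it blocks on its own request rather than draining everything queued on ksnaphd. The pthreads sketch below makes the same point in userspace with a hand-rolled work item; none of it is the kernel workqueue API, and it needs -pthread to build.

#include <pthread.h>
#include <stdio.h>

struct work {
	void (*fn)(struct work *w);
	int done;
	pthread_mutex_t lock;
	pthread_cond_t cond;
};

static void *worker(void *arg)
{
	struct work *w = arg;

	w->fn(w);			/* run the request */
	pthread_mutex_lock(&w->lock);
	w->done = 1;
	pthread_cond_signal(&w->cond);	/* wake only this item's waiter */
	pthread_mutex_unlock(&w->lock);
	return NULL;
}

/* Analogue of flush_work(): block until this particular item completes. */
static void wait_for_work(struct work *w)
{
	pthread_mutex_lock(&w->lock);
	while (!w->done)
		pthread_cond_wait(&w->cond, &w->lock);
	pthread_mutex_unlock(&w->lock);
}

static void do_metadata(struct work *w)
{
	printf("metadata I/O for work %p done\n", (void *)w);
}

int main(void)
{
	pthread_t t;
	struct work w;

	w.fn = do_metadata;
	w.done = 0;
	pthread_mutex_init(&w.lock, NULL);
	pthread_cond_init(&w.cond, NULL);

	pthread_create(&t, NULL, worker, &w);
	wait_for_work(&w);		/* like flush_work(&req.work) */
	pthread_join(&t, NULL);
	return 0;
}

The important property is that wait_for_work() returns as soon as this item finishes, independent of anything else the worker may still have queued.
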
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 53cf79d8bcbc..fdde53cd12b7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -19,7 +19,6 @@
19#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
20#include <linux/log2.h> 20#include <linux/log2.h>
21#include <linux/dm-kcopyd.h> 21#include <linux/dm-kcopyd.h>
22#include <linux/workqueue.h>
23 22
24#include "dm-exception-store.h" 23#include "dm-exception-store.h"
25 24
@@ -80,9 +79,6 @@ struct dm_snapshot {
80 /* Origin writes don't trigger exceptions until this is set */ 79 /* Origin writes don't trigger exceptions until this is set */
81 int active; 80 int active;
82 81
83 /* Whether or not owning mapped_device is suspended */
84 int suspended;
85
86 atomic_t pending_exceptions_count; 82 atomic_t pending_exceptions_count;
87 83
88 mempool_t *pending_pool; 84 mempool_t *pending_pool;
@@ -106,10 +102,6 @@ struct dm_snapshot {
106 102
107 struct dm_kcopyd_client *kcopyd_client; 103 struct dm_kcopyd_client *kcopyd_client;
108 104
109 /* Queue of snapshot writes for ksnapd to flush */
110 struct bio_list queued_bios;
111 struct work_struct queued_bios_work;
112
113 /* Wait for events based on state_bits */ 105 /* Wait for events based on state_bits */
114 unsigned long state_bits; 106 unsigned long state_bits;
115 107
@@ -160,9 +152,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
160} 152}
161EXPORT_SYMBOL(dm_snap_cow); 153EXPORT_SYMBOL(dm_snap_cow);
162 154
163static struct workqueue_struct *ksnapd;
164static void flush_queued_bios(struct work_struct *work);
165
166static sector_t chunk_to_sector(struct dm_exception_store *store, 155static sector_t chunk_to_sector(struct dm_exception_store *store,
167 chunk_t chunk) 156 chunk_t chunk)
168{ 157{
@@ -1110,7 +1099,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1110 s->ti = ti; 1099 s->ti = ti;
1111 s->valid = 1; 1100 s->valid = 1;
1112 s->active = 0; 1101 s->active = 0;
1113 s->suspended = 0;
1114 atomic_set(&s->pending_exceptions_count, 0); 1102 atomic_set(&s->pending_exceptions_count, 0);
1115 init_rwsem(&s->lock); 1103 init_rwsem(&s->lock);
1116 INIT_LIST_HEAD(&s->list); 1104 INIT_LIST_HEAD(&s->list);
@@ -1153,9 +1141,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1153 1141
1154 spin_lock_init(&s->tracked_chunk_lock); 1142 spin_lock_init(&s->tracked_chunk_lock);
1155 1143
1156 bio_list_init(&s->queued_bios);
1157 INIT_WORK(&s->queued_bios_work, flush_queued_bios);
1158
1159 ti->private = s; 1144 ti->private = s;
1160 ti->num_flush_requests = num_flush_requests; 1145 ti->num_flush_requests = num_flush_requests;
1161 1146
@@ -1279,8 +1264,6 @@ static void snapshot_dtr(struct dm_target *ti)
1279 struct dm_snapshot *s = ti->private; 1264 struct dm_snapshot *s = ti->private;
1280 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; 1265 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1281 1266
1282 flush_workqueue(ksnapd);
1283
1284 down_read(&_origins_lock); 1267 down_read(&_origins_lock);
1285 /* Check whether exception handover must be cancelled */ 1268 /* Check whether exception handover must be cancelled */
1286 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 1269 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
@@ -1342,20 +1325,6 @@ static void flush_bios(struct bio *bio)
1342 } 1325 }
1343} 1326}
1344 1327
1345static void flush_queued_bios(struct work_struct *work)
1346{
1347 struct dm_snapshot *s =
1348 container_of(work, struct dm_snapshot, queued_bios_work);
1349 struct bio *queued_bios;
1350 unsigned long flags;
1351
1352 spin_lock_irqsave(&s->pe_lock, flags);
1353 queued_bios = bio_list_get(&s->queued_bios);
1354 spin_unlock_irqrestore(&s->pe_lock, flags);
1355
1356 flush_bios(queued_bios);
1357}
1358
1359static int do_origin(struct dm_dev *origin, struct bio *bio); 1328static int do_origin(struct dm_dev *origin, struct bio *bio);
1360 1329
1361/* 1330/*
@@ -1760,15 +1729,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti)
1760 stop_merge(s); 1729 stop_merge(s);
1761} 1730}
1762 1731
1763static void snapshot_postsuspend(struct dm_target *ti)
1764{
1765 struct dm_snapshot *s = ti->private;
1766
1767 down_write(&s->lock);
1768 s->suspended = 1;
1769 up_write(&s->lock);
1770}
1771
1772static int snapshot_preresume(struct dm_target *ti) 1732static int snapshot_preresume(struct dm_target *ti)
1773{ 1733{
1774 int r = 0; 1734 int r = 0;
@@ -1783,7 +1743,7 @@ static int snapshot_preresume(struct dm_target *ti)
1783 DMERR("Unable to resume snapshot source until " 1743 DMERR("Unable to resume snapshot source until "
1784 "handover completes."); 1744 "handover completes.");
1785 r = -EINVAL; 1745 r = -EINVAL;
1786 } else if (!snap_src->suspended) { 1746 } else if (!dm_suspended(snap_src->ti)) {
1787 DMERR("Unable to perform snapshot handover until " 1747 DMERR("Unable to perform snapshot handover until "
1788 "source is suspended."); 1748 "source is suspended.");
1789 r = -EINVAL; 1749 r = -EINVAL;
@@ -1816,7 +1776,6 @@ static void snapshot_resume(struct dm_target *ti)
1816 1776
1817 down_write(&s->lock); 1777 down_write(&s->lock);
1818 s->active = 1; 1778 s->active = 1;
1819 s->suspended = 0;
1820 up_write(&s->lock); 1779 up_write(&s->lock);
1821} 1780}
1822 1781
@@ -2194,7 +2153,7 @@ static int origin_iterate_devices(struct dm_target *ti,
2194 2153
2195static struct target_type origin_target = { 2154static struct target_type origin_target = {
2196 .name = "snapshot-origin", 2155 .name = "snapshot-origin",
2197 .version = {1, 7, 0}, 2156 .version = {1, 7, 1},
2198 .module = THIS_MODULE, 2157 .module = THIS_MODULE,
2199 .ctr = origin_ctr, 2158 .ctr = origin_ctr,
2200 .dtr = origin_dtr, 2159 .dtr = origin_dtr,
@@ -2207,13 +2166,12 @@ static struct target_type origin_target = {
2207 2166
2208static struct target_type snapshot_target = { 2167static struct target_type snapshot_target = {
2209 .name = "snapshot", 2168 .name = "snapshot",
2210 .version = {1, 9, 0}, 2169 .version = {1, 10, 0},
2211 .module = THIS_MODULE, 2170 .module = THIS_MODULE,
2212 .ctr = snapshot_ctr, 2171 .ctr = snapshot_ctr,
2213 .dtr = snapshot_dtr, 2172 .dtr = snapshot_dtr,
2214 .map = snapshot_map, 2173 .map = snapshot_map,
2215 .end_io = snapshot_end_io, 2174 .end_io = snapshot_end_io,
2216 .postsuspend = snapshot_postsuspend,
2217 .preresume = snapshot_preresume, 2175 .preresume = snapshot_preresume,
2218 .resume = snapshot_resume, 2176 .resume = snapshot_resume,
2219 .status = snapshot_status, 2177 .status = snapshot_status,
@@ -2222,14 +2180,13 @@ static struct target_type snapshot_target = {
2222 2180
2223static struct target_type merge_target = { 2181static struct target_type merge_target = {
2224 .name = dm_snapshot_merge_target_name, 2182 .name = dm_snapshot_merge_target_name,
2225 .version = {1, 0, 0}, 2183 .version = {1, 1, 0},
2226 .module = THIS_MODULE, 2184 .module = THIS_MODULE,
2227 .ctr = snapshot_ctr, 2185 .ctr = snapshot_ctr,
2228 .dtr = snapshot_dtr, 2186 .dtr = snapshot_dtr,
2229 .map = snapshot_merge_map, 2187 .map = snapshot_merge_map,
2230 .end_io = snapshot_end_io, 2188 .end_io = snapshot_end_io,
2231 .presuspend = snapshot_merge_presuspend, 2189 .presuspend = snapshot_merge_presuspend,
2232 .postsuspend = snapshot_postsuspend,
2233 .preresume = snapshot_preresume, 2190 .preresume = snapshot_preresume,
2234 .resume = snapshot_merge_resume, 2191 .resume = snapshot_merge_resume,
2235 .status = snapshot_status, 2192 .status = snapshot_status,
@@ -2291,17 +2248,8 @@ static int __init dm_snapshot_init(void)
2291 goto bad_tracked_chunk_cache; 2248 goto bad_tracked_chunk_cache;
2292 } 2249 }
2293 2250
2294 ksnapd = create_singlethread_workqueue("ksnapd");
2295 if (!ksnapd) {
2296 DMERR("Failed to create ksnapd workqueue.");
2297 r = -ENOMEM;
2298 goto bad_pending_pool;
2299 }
2300
2301 return 0; 2251 return 0;
2302 2252
2303bad_pending_pool:
2304 kmem_cache_destroy(tracked_chunk_cache);
2305bad_tracked_chunk_cache: 2253bad_tracked_chunk_cache:
2306 kmem_cache_destroy(pending_cache); 2254 kmem_cache_destroy(pending_cache);
2307bad_pending_cache: 2255bad_pending_cache:
@@ -2322,8 +2270,6 @@ bad_register_snapshot_target:
2322 2270
2323static void __exit dm_snapshot_exit(void) 2271static void __exit dm_snapshot_exit(void)
2324{ 2272{
2325 destroy_workqueue(ksnapd);
2326
2327 dm_unregister_target(&snapshot_target); 2273 dm_unregister_target(&snapshot_target);
2328 dm_unregister_target(&origin_target); 2274 dm_unregister_target(&origin_target);
2329 dm_unregister_target(&merge_target); 2275 dm_unregister_target(&merge_target);
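
The snapshot handover check above now asks the core with dm_suspended(snap_src->ti) instead of maintaining a private s->suspended flag that had to be toggled in postsuspend/resume hooks. The sketch below shows the shape of that simplification with invented types; it is not the device-mapper interface.

#include <stdio.h>

struct mapped_device {
	int suspended;
};

struct snapshot {
	struct mapped_device *md;
	/* no private "suspended" copy to keep in sync any more */
};

static int dev_suspended(const struct mapped_device *md)
{
	return md->suspended;
}

static int can_handover(const struct snapshot *src)
{
	return dev_suspended(src->md);	/* single source of truth */
}

int main(void)
{
	struct mapped_device md = { 0 };
	struct snapshot src = { &md };

	printf("handover allowed: %d\n", can_handover(&src));
	md.suspended = 1;		/* state changes in exactly one place */
	printf("handover allowed: %d\n", can_handover(&src));
	return 0;
}
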
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index f0371b4c4fbf..dddfa14f2982 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -39,23 +39,20 @@ struct stripe_c {
39 struct dm_target *ti; 39 struct dm_target *ti;
40 40
41 /* Work struct used for triggering events*/ 41 /* Work struct used for triggering events*/
42 struct work_struct kstriped_ws; 42 struct work_struct trigger_event;
43 43
44 struct stripe stripe[0]; 44 struct stripe stripe[0];
45}; 45};
46 46
47static struct workqueue_struct *kstriped;
48
49/* 47/*
50 * An event is triggered whenever a drive 48 * An event is triggered whenever a drive
51 * drops out of a stripe volume. 49 * drops out of a stripe volume.
52 */ 50 */
53static void trigger_event(struct work_struct *work) 51static void trigger_event(struct work_struct *work)
54{ 52{
55 struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); 53 struct stripe_c *sc = container_of(work, struct stripe_c,
56 54 trigger_event);
57 dm_table_event(sc->ti->table); 55 dm_table_event(sc->ti->table);
58
59} 56}
60 57
61static inline struct stripe_c *alloc_context(unsigned int stripes) 58static inline struct stripe_c *alloc_context(unsigned int stripes)
@@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
160 return -ENOMEM; 157 return -ENOMEM;
161 } 158 }
162 159
163 INIT_WORK(&sc->kstriped_ws, trigger_event); 160 INIT_WORK(&sc->trigger_event, trigger_event);
164 161
165 /* Set pointer to dm target; used in trigger_event */ 162 /* Set pointer to dm target; used in trigger_event */
166 sc->ti = ti; 163 sc->ti = ti;
@@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti)
211 for (i = 0; i < sc->stripes; i++) 208 for (i = 0; i < sc->stripes; i++)
212 dm_put_device(ti, sc->stripe[i].dev); 209 dm_put_device(ti, sc->stripe[i].dev);
213 210
214 flush_workqueue(kstriped); 211 flush_work_sync(&sc->trigger_event);
215 kfree(sc); 212 kfree(sc);
216} 213}
217 214
@@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
367 atomic_inc(&(sc->stripe[i].error_count)); 364 atomic_inc(&(sc->stripe[i].error_count));
368 if (atomic_read(&(sc->stripe[i].error_count)) < 365 if (atomic_read(&(sc->stripe[i].error_count)) <
369 DM_IO_ERROR_THRESHOLD) 366 DM_IO_ERROR_THRESHOLD)
370 queue_work(kstriped, &sc->kstriped_ws); 367 schedule_work(&sc->trigger_event);
371 } 368 }
372 369
373 return error; 370 return error;
@@ -401,7 +398,7 @@ static void stripe_io_hints(struct dm_target *ti,
401 398
402static struct target_type stripe_target = { 399static struct target_type stripe_target = {
403 .name = "striped", 400 .name = "striped",
404 .version = {1, 3, 0}, 401 .version = {1, 3, 1},
405 .module = THIS_MODULE, 402 .module = THIS_MODULE,
406 .ctr = stripe_ctr, 403 .ctr = stripe_ctr,
407 .dtr = stripe_dtr, 404 .dtr = stripe_dtr,
@@ -422,20 +419,10 @@ int __init dm_stripe_init(void)
422 return r; 419 return r;
423 } 420 }
424 421
425 kstriped = create_singlethread_workqueue("kstriped");
426 if (!kstriped) {
427 DMERR("failed to create workqueue kstriped");
428 dm_unregister_target(&stripe_target);
429 return -ENOMEM;
430 }
431
432 return r; 422 return r;
433} 423}
434 424
435void dm_stripe_exit(void) 425void dm_stripe_exit(void)
436{ 426{
437 dm_unregister_target(&stripe_target); 427 dm_unregister_target(&stripe_target);
438 destroy_workqueue(kstriped);
439
440 return;
441} 428}
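
Note: the dm-snapshot and dm-stripe hunks above remove the driver-private ksnapd/kstriped singlethread workqueues. Each target now embeds a struct work_struct, queues it on the shared kernel workqueue with schedule_work(), and drains it with flush_work_sync() in its destructor. A minimal sketch of the same conversion, using a hypothetical foo_ctx context rather than the real dm structures:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo_ctx {
	struct work_struct trigger_event;	/* replaces the private workqueue */
};

static void foo_trigger_event(struct work_struct *work)
{
	struct foo_ctx *ctx = container_of(work, struct foo_ctx, trigger_event);

	/* ... report the event on behalf of ctx ... */
	(void)ctx;
}

static struct foo_ctx *foo_create(void)
{
	struct foo_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (ctx)
		INIT_WORK(&ctx->trigger_event, foo_trigger_event);
	return ctx;
}

static void foo_event(struct foo_ctx *ctx)
{
	schedule_work(&ctx->trigger_event);	/* was queue_work(kstriped, ...) */
}

static void foo_destroy(struct foo_ctx *ctx)
{
	flush_work_sync(&ctx->trigger_event);	/* was flush_workqueue(kstriped) */
	kfree(ctx);
}
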
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 985c20a4f30e..dffa0ac7c4f0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -71,6 +71,8 @@ struct dm_table {
71 void *event_context; 71 void *event_context;
72 72
73 struct dm_md_mempools *mempools; 73 struct dm_md_mempools *mempools;
74
75 struct list_head target_callbacks;
74}; 76};
75 77
76/* 78/*
@@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
204 return -ENOMEM; 206 return -ENOMEM;
205 207
206 INIT_LIST_HEAD(&t->devices); 208 INIT_LIST_HEAD(&t->devices);
209 INIT_LIST_HEAD(&t->target_callbacks);
207 atomic_set(&t->holders, 0); 210 atomic_set(&t->holders, 0);
208 t->discards_supported = 1; 211 t->discards_supported = 1;
209 212
@@ -1225,10 +1228,17 @@ int dm_table_resume_targets(struct dm_table *t)
1225 return 0; 1228 return 0;
1226} 1229}
1227 1230
1231void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb)
1232{
1233 list_add(&cb->list, &t->target_callbacks);
1234}
1235EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks);
1236
1228int dm_table_any_congested(struct dm_table *t, int bdi_bits) 1237int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1229{ 1238{
1230 struct dm_dev_internal *dd; 1239 struct dm_dev_internal *dd;
1231 struct list_head *devices = dm_table_get_devices(t); 1240 struct list_head *devices = dm_table_get_devices(t);
1241 struct dm_target_callbacks *cb;
1232 int r = 0; 1242 int r = 0;
1233 1243
1234 list_for_each_entry(dd, devices, list) { 1244 list_for_each_entry(dd, devices, list) {
@@ -1243,6 +1253,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1243 bdevname(dd->dm_dev.bdev, b)); 1253 bdevname(dd->dm_dev.bdev, b));
1244 } 1254 }
1245 1255
1256 list_for_each_entry(cb, &t->target_callbacks, list)
1257 if (cb->congested_fn)
1258 r |= cb->congested_fn(cb, bdi_bits);
1259
1246 return r; 1260 return r;
1247} 1261}
1248 1262
@@ -1264,6 +1278,7 @@ void dm_table_unplug_all(struct dm_table *t)
1264{ 1278{
1265 struct dm_dev_internal *dd; 1279 struct dm_dev_internal *dd;
1266 struct list_head *devices = dm_table_get_devices(t); 1280 struct list_head *devices = dm_table_get_devices(t);
1281 struct dm_target_callbacks *cb;
1267 1282
1268 list_for_each_entry(dd, devices, list) { 1283 list_for_each_entry(dd, devices, list) {
1269 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); 1284 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
@@ -1276,6 +1291,10 @@ void dm_table_unplug_all(struct dm_table *t)
1276 dm_device_name(t->md), 1291 dm_device_name(t->md),
1277 bdevname(dd->dm_dev.bdev, b)); 1292 bdevname(dd->dm_dev.bdev, b));
1278 } 1293 }
1294
1295 list_for_each_entry(cb, &t->target_callbacks, list)
1296 if (cb->unplug_fn)
1297 cb->unplug_fn(cb);
1279} 1298}
1280 1299
1281struct mapped_device *dm_table_get_md(struct dm_table *t) 1300struct mapped_device *dm_table_get_md(struct dm_table *t)
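
Note: the dm-table hunks above add a per-table target_callbacks list and export dm_table_add_target_callbacks(); dm_table_any_congested() and dm_table_unplug_all() now also walk that list and call each entry's congested_fn/unplug_fn. A sketch of how a target constructor might register such a callback; the my_target wrapper and its fields are illustrative, and only the list/congested_fn/unplug_fn members implied by the hunks are assumed:

#include <linux/kernel.h>
#include <linux/device-mapper.h>

struct my_target {
	struct dm_target_callbacks callbacks;
	/* ... target-private state ... */
};

static int my_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	struct my_target *mt = container_of(cb, struct my_target, callbacks);

	/* return whichever bdi_bits are congested for mt's internal queues */
	(void)mt;
	return 0;
}

static void my_register_callbacks(struct dm_table *table, struct my_target *mt)
{
	mt->callbacks.congested_fn = my_congested;
	mt->callbacks.unplug_fn = NULL;		/* optional hook */
	dm_table_add_target_callbacks(table, &mt->callbacks);
}
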
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f48a2f359ac4..eaa3af0e0632 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -32,7 +32,6 @@
32#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 32#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
33#define DM_COOKIE_LENGTH 24 33#define DM_COOKIE_LENGTH 24
34 34
35static DEFINE_MUTEX(dm_mutex);
36static const char *_name = DM_NAME; 35static const char *_name = DM_NAME;
37 36
38static unsigned int major = 0; 37static unsigned int major = 0;
@@ -328,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
328{ 327{
329 struct mapped_device *md; 328 struct mapped_device *md;
330 329
331 mutex_lock(&dm_mutex);
332 spin_lock(&_minor_lock); 330 spin_lock(&_minor_lock);
333 331
334 md = bdev->bd_disk->private_data; 332 md = bdev->bd_disk->private_data;
@@ -346,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
346 344
347out: 345out:
348 spin_unlock(&_minor_lock); 346 spin_unlock(&_minor_lock);
349 mutex_unlock(&dm_mutex);
350 347
351 return md ? 0 : -ENXIO; 348 return md ? 0 : -ENXIO;
352} 349}
@@ -355,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode)
355{ 352{
356 struct mapped_device *md = disk->private_data; 353 struct mapped_device *md = disk->private_data;
357 354
358 mutex_lock(&dm_mutex); 355 spin_lock(&_minor_lock);
356
359 atomic_dec(&md->open_count); 357 atomic_dec(&md->open_count);
360 dm_put(md); 358 dm_put(md);
361 mutex_unlock(&dm_mutex); 359
360 spin_unlock(&_minor_lock);
362 361
363 return 0; 362 return 0;
364} 363}
@@ -1638,13 +1637,15 @@ static void dm_request_fn(struct request_queue *q)
1638 if (map_request(ti, clone, md)) 1637 if (map_request(ti, clone, md))
1639 goto requeued; 1638 goto requeued;
1640 1639
1641 spin_lock_irq(q->queue_lock); 1640 BUG_ON(!irqs_disabled());
1641 spin_lock(q->queue_lock);
1642 } 1642 }
1643 1643
1644 goto out; 1644 goto out;
1645 1645
1646requeued: 1646requeued:
1647 spin_lock_irq(q->queue_lock); 1647 BUG_ON(!irqs_disabled());
1648 spin_lock(q->queue_lock);
1648 1649
1649plug_and_out: 1650plug_and_out:
1650 if (!elv_queue_empty(q)) 1651 if (!elv_queue_empty(q))
@@ -1884,7 +1885,8 @@ static struct mapped_device *alloc_dev(int minor)
1884 add_disk(md->disk); 1885 add_disk(md->disk);
1885 format_dev_t(md->name, MKDEV(_major, minor)); 1886 format_dev_t(md->name, MKDEV(_major, minor));
1886 1887
1887 md->wq = create_singlethread_workqueue("kdmflush"); 1888 md->wq = alloc_workqueue("kdmflush",
1889 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1888 if (!md->wq) 1890 if (!md->wq)
1889 goto bad_thread; 1891 goto bad_thread;
1890 1892
@@ -1992,13 +1994,14 @@ static void event_callback(void *context)
1992 wake_up(&md->eventq); 1994 wake_up(&md->eventq);
1993} 1995}
1994 1996
1997/*
1998 * Protected by md->suspend_lock obtained by dm_swap_table().
1999 */
1995static void __set_size(struct mapped_device *md, sector_t size) 2000static void __set_size(struct mapped_device *md, sector_t size)
1996{ 2001{
1997 set_capacity(md->disk, size); 2002 set_capacity(md->disk, size);
1998 2003
1999 mutex_lock(&md->bdev->bd_inode->i_mutex);
2000 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2004 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2001 mutex_unlock(&md->bdev->bd_inode->i_mutex);
2002} 2005}
2003 2006
2004/* 2007/*
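
Note: among the dm.c hunks, create_singlethread_workqueue("kdmflush") becomes alloc_workqueue() so the flags are explicit: WQ_MEM_RECLAIM keeps a rescuer thread so flush work can make forward progress under memory pressure, and WQ_NON_REENTRANT preserves the old single-thread non-reentrancy guarantee. A minimal before/after sketch with an illustrative setup/teardown pair:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *flush_wq;

static int flush_wq_create(void)
{
	/* before: flush_wq = create_singlethread_workqueue("kdmflush"); */
	flush_wq = alloc_workqueue("kdmflush",
				   WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
	return flush_wq ? 0 : -ENOMEM;
}

static void flush_wq_destroy(void)
{
	destroy_workqueue(flush_wq);
}
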
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7fc090ac9e28..cf8594c5ea21 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -288,10 +288,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
288 int rv; 288 int rv;
289 int cpu; 289 int cpu;
290 290
291 if (mddev == NULL || mddev->pers == NULL) { 291 if (mddev == NULL || mddev->pers == NULL
292 || !mddev->ready) {
292 bio_io_error(bio); 293 bio_io_error(bio);
293 return 0; 294 return 0;
294 } 295 }
296 smp_rmb(); /* Ensure implications of 'active' are visible */
295 rcu_read_lock(); 297 rcu_read_lock();
296 if (mddev->suspended) { 298 if (mddev->suspended) {
297 DEFINE_WAIT(__wait); 299 DEFINE_WAIT(__wait);
@@ -703,9 +705,9 @@ static struct mdk_personality *find_pers(int level, char *clevel)
703} 705}
704 706
705/* return the offset of the super block in 512byte sectors */ 707/* return the offset of the super block in 512byte sectors */
706static inline sector_t calc_dev_sboffset(struct block_device *bdev) 708static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
707{ 709{
708 sector_t num_sectors = i_size_read(bdev->bd_inode) / 512; 710 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
709 return MD_NEW_SIZE_SECTORS(num_sectors); 711 return MD_NEW_SIZE_SECTORS(num_sectors);
710} 712}
711 713
@@ -763,7 +765,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
763 */ 765 */
764 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); 766 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
765 767
766 bio->bi_bdev = rdev->bdev; 768 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
767 bio->bi_sector = sector; 769 bio->bi_sector = sector;
768 bio_add_page(bio, page, size, 0); 770 bio_add_page(bio, page, size, 0);
769 bio->bi_private = rdev; 771 bio->bi_private = rdev;
@@ -793,7 +795,7 @@ static void bi_complete(struct bio *bio, int error)
793} 795}
794 796
795int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, 797int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
796 struct page *page, int rw) 798 struct page *page, int rw, bool metadata_op)
797{ 799{
798 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); 800 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
799 struct completion event; 801 struct completion event;
@@ -801,8 +803,12 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
801 803
802 rw |= REQ_SYNC | REQ_UNPLUG; 804 rw |= REQ_SYNC | REQ_UNPLUG;
803 805
804 bio->bi_bdev = rdev->bdev; 806 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
805 bio->bi_sector = sector; 807 rdev->meta_bdev : rdev->bdev;
808 if (metadata_op)
809 bio->bi_sector = sector + rdev->sb_start;
810 else
811 bio->bi_sector = sector + rdev->data_offset;
806 bio_add_page(bio, page, size, 0); 812 bio_add_page(bio, page, size, 0);
807 init_completion(&event); 813 init_completion(&event);
808 bio->bi_private = &event; 814 bio->bi_private = &event;
@@ -827,7 +833,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
827 return 0; 833 return 0;
828 834
829 835
830 if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ)) 836 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
831 goto fail; 837 goto fail;
832 rdev->sb_loaded = 1; 838 rdev->sb_loaded = 1;
833 return 0; 839 return 0;
@@ -989,7 +995,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
989 * 995 *
990 * It also happens to be a multiple of 4Kb. 996 * It also happens to be a multiple of 4Kb.
991 */ 997 */
992 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 998 rdev->sb_start = calc_dev_sboffset(rdev);
993 999
994 ret = read_disk_sb(rdev, MD_SB_BYTES); 1000 ret = read_disk_sb(rdev, MD_SB_BYTES);
995 if (ret) return ret; 1001 if (ret) return ret;
@@ -1330,7 +1336,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1330 return 0; /* component must fit device */ 1336 return 0; /* component must fit device */
1331 if (rdev->mddev->bitmap_info.offset) 1337 if (rdev->mddev->bitmap_info.offset)
1332 return 0; /* can't move bitmap */ 1338 return 0; /* can't move bitmap */
1333 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 1339 rdev->sb_start = calc_dev_sboffset(rdev);
1334 if (!num_sectors || num_sectors > rdev->sb_start) 1340 if (!num_sectors || num_sectors > rdev->sb_start)
1335 num_sectors = rdev->sb_start; 1341 num_sectors = rdev->sb_start;
1336 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1342 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
@@ -2465,6 +2471,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2465 if (rdev2->raid_disk == slot) 2471 if (rdev2->raid_disk == slot)
2466 return -EEXIST; 2472 return -EEXIST;
2467 2473
2474 if (slot >= rdev->mddev->raid_disks &&
2475 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2476 return -ENOSPC;
2477
2468 rdev->raid_disk = slot; 2478 rdev->raid_disk = slot;
2469 if (test_bit(In_sync, &rdev->flags)) 2479 if (test_bit(In_sync, &rdev->flags))
2470 rdev->saved_raid_disk = slot; 2480 rdev->saved_raid_disk = slot;
@@ -2482,7 +2492,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2482 /* failure here is OK */; 2492 /* failure here is OK */;
2483 /* don't wakeup anyone, leave that to userspace. */ 2493 /* don't wakeup anyone, leave that to userspace. */
2484 } else { 2494 } else {
2485 if (slot >= rdev->mddev->raid_disks) 2495 if (slot >= rdev->mddev->raid_disks &&
2496 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2486 return -ENOSPC; 2497 return -ENOSPC;
2487 rdev->raid_disk = slot; 2498 rdev->raid_disk = slot;
2488 /* assume it is working */ 2499 /* assume it is working */
@@ -3107,7 +3118,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
3107 char nm[20]; 3118 char nm[20];
3108 if (rdev->raid_disk < 0) 3119 if (rdev->raid_disk < 0)
3109 continue; 3120 continue;
3110 if (rdev->new_raid_disk > mddev->raid_disks) 3121 if (rdev->new_raid_disk >= mddev->raid_disks)
3111 rdev->new_raid_disk = -1; 3122 rdev->new_raid_disk = -1;
3112 if (rdev->new_raid_disk == rdev->raid_disk) 3123 if (rdev->new_raid_disk == rdev->raid_disk)
3113 continue; 3124 continue;
@@ -3736,6 +3747,8 @@ action_show(mddev_t *mddev, char *page)
3736 return sprintf(page, "%s\n", type); 3747 return sprintf(page, "%s\n", type);
3737} 3748}
3738 3749
3750static void reap_sync_thread(mddev_t *mddev);
3751
3739static ssize_t 3752static ssize_t
3740action_store(mddev_t *mddev, const char *page, size_t len) 3753action_store(mddev_t *mddev, const char *page, size_t len)
3741{ 3754{
@@ -3750,9 +3763,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
3750 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 3763 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
3751 if (mddev->sync_thread) { 3764 if (mddev->sync_thread) {
3752 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3765 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3753 md_unregister_thread(mddev->sync_thread); 3766 reap_sync_thread(mddev);
3754 mddev->sync_thread = NULL;
3755 mddev->recovery = 0;
3756 } 3767 }
3757 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3768 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3758 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3769 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -3904,7 +3915,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3904static ssize_t 3915static ssize_t
3905sync_completed_show(mddev_t *mddev, char *page) 3916sync_completed_show(mddev_t *mddev, char *page)
3906{ 3917{
3907 unsigned long max_sectors, resync; 3918 unsigned long long max_sectors, resync;
3908 3919
3909 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3920 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3910 return sprintf(page, "none\n"); 3921 return sprintf(page, "none\n");
@@ -3915,7 +3926,7 @@ sync_completed_show(mddev_t *mddev, char *page)
3915 max_sectors = mddev->dev_sectors; 3926 max_sectors = mddev->dev_sectors;
3916 3927
3917 resync = mddev->curr_resync_completed; 3928 resync = mddev->curr_resync_completed;
3918 return sprintf(page, "%lu / %lu\n", resync, max_sectors); 3929 return sprintf(page, "%llu / %llu\n", resync, max_sectors);
3919} 3930}
3920 3931
3921static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3932static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -4002,19 +4013,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
4002{ 4013{
4003 char *e; 4014 char *e;
4004 unsigned long long new = simple_strtoull(buf, &e, 10); 4015 unsigned long long new = simple_strtoull(buf, &e, 10);
4016 unsigned long long old = mddev->suspend_lo;
4005 4017
4006 if (mddev->pers == NULL || 4018 if (mddev->pers == NULL ||
4007 mddev->pers->quiesce == NULL) 4019 mddev->pers->quiesce == NULL)
4008 return -EINVAL; 4020 return -EINVAL;
4009 if (buf == e || (*e && *e != '\n')) 4021 if (buf == e || (*e && *e != '\n'))
4010 return -EINVAL; 4022 return -EINVAL;
4011 if (new >= mddev->suspend_hi || 4023
4012 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 4024 mddev->suspend_lo = new;
4013 mddev->suspend_lo = new; 4025 if (new >= old)
4026 /* Shrinking suspended region */
4014 mddev->pers->quiesce(mddev, 2); 4027 mddev->pers->quiesce(mddev, 2);
4015 return len; 4028 else {
4016 } else 4029 /* Expanding suspended region - need to wait */
4017 return -EINVAL; 4030 mddev->pers->quiesce(mddev, 1);
4031 mddev->pers->quiesce(mddev, 0);
4032 }
4033 return len;
4018} 4034}
4019static struct md_sysfs_entry md_suspend_lo = 4035static struct md_sysfs_entry md_suspend_lo =
4020__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 4036__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4031,20 +4047,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
4031{ 4047{
4032 char *e; 4048 char *e;
4033 unsigned long long new = simple_strtoull(buf, &e, 10); 4049 unsigned long long new = simple_strtoull(buf, &e, 10);
4050 unsigned long long old = mddev->suspend_hi;
4034 4051
4035 if (mddev->pers == NULL || 4052 if (mddev->pers == NULL ||
4036 mddev->pers->quiesce == NULL) 4053 mddev->pers->quiesce == NULL)
4037 return -EINVAL; 4054 return -EINVAL;
4038 if (buf == e || (*e && *e != '\n')) 4055 if (buf == e || (*e && *e != '\n'))
4039 return -EINVAL; 4056 return -EINVAL;
4040 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 4057
4041 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 4058 mddev->suspend_hi = new;
4042 mddev->suspend_hi = new; 4059 if (new <= old)
4060 /* Shrinking suspended region */
4061 mddev->pers->quiesce(mddev, 2);
4062 else {
4063 /* Expanding suspended region - need to wait */
4043 mddev->pers->quiesce(mddev, 1); 4064 mddev->pers->quiesce(mddev, 1);
4044 mddev->pers->quiesce(mddev, 0); 4065 mddev->pers->quiesce(mddev, 0);
4045 return len; 4066 }
4046 } else 4067 return len;
4047 return -EINVAL;
4048} 4068}
4049static struct md_sysfs_entry md_suspend_hi = 4069static struct md_sysfs_entry md_suspend_hi =
4050__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4070__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -4422,7 +4442,9 @@ int md_run(mddev_t *mddev)
4422 * We don't want the data to overlap the metadata, 4442 * We don't want the data to overlap the metadata,
4423 * Internal Bitmap issues have been handled elsewhere. 4443 * Internal Bitmap issues have been handled elsewhere.
4424 */ 4444 */
4425 if (rdev->data_offset < rdev->sb_start) { 4445 if (rdev->meta_bdev) {
4446 /* Nothing to check */;
4447 } else if (rdev->data_offset < rdev->sb_start) {
4426 if (mddev->dev_sectors && 4448 if (mddev->dev_sectors &&
4427 rdev->data_offset + mddev->dev_sectors 4449 rdev->data_offset + mddev->dev_sectors
4428 > rdev->sb_start) { 4450 > rdev->sb_start) {
@@ -4556,7 +4578,8 @@ int md_run(mddev_t *mddev)
4556 mddev->safemode_timer.data = (unsigned long) mddev; 4578 mddev->safemode_timer.data = (unsigned long) mddev;
4557 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 4579 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
4558 mddev->in_sync = 1; 4580 mddev->in_sync = 1;
4559 4581 smp_wmb();
4582 mddev->ready = 1;
4560 list_for_each_entry(rdev, &mddev->disks, same_set) 4583 list_for_each_entry(rdev, &mddev->disks, same_set)
4561 if (rdev->raid_disk >= 0) { 4584 if (rdev->raid_disk >= 0) {
4562 char nm[20]; 4585 char nm[20];
@@ -4693,13 +4716,12 @@ static void md_clean(mddev_t *mddev)
4693 mddev->plug = NULL; 4716 mddev->plug = NULL;
4694} 4717}
4695 4718
4696void md_stop_writes(mddev_t *mddev) 4719static void __md_stop_writes(mddev_t *mddev)
4697{ 4720{
4698 if (mddev->sync_thread) { 4721 if (mddev->sync_thread) {
4699 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4722 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4700 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4723 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4701 md_unregister_thread(mddev->sync_thread); 4724 reap_sync_thread(mddev);
4702 mddev->sync_thread = NULL;
4703 } 4725 }
4704 4726
4705 del_timer_sync(&mddev->safemode_timer); 4727 del_timer_sync(&mddev->safemode_timer);
@@ -4713,10 +4735,18 @@ void md_stop_writes(mddev_t *mddev)
4713 md_update_sb(mddev, 1); 4735 md_update_sb(mddev, 1);
4714 } 4736 }
4715} 4737}
4738
4739void md_stop_writes(mddev_t *mddev)
4740{
4741 mddev_lock(mddev);
4742 __md_stop_writes(mddev);
4743 mddev_unlock(mddev);
4744}
4716EXPORT_SYMBOL_GPL(md_stop_writes); 4745EXPORT_SYMBOL_GPL(md_stop_writes);
4717 4746
4718void md_stop(mddev_t *mddev) 4747void md_stop(mddev_t *mddev)
4719{ 4748{
4749 mddev->ready = 0;
4720 mddev->pers->stop(mddev); 4750 mddev->pers->stop(mddev);
4721 if (mddev->pers->sync_request && mddev->to_remove == NULL) 4751 if (mddev->pers->sync_request && mddev->to_remove == NULL)
4722 mddev->to_remove = &md_redundancy_group; 4752 mddev->to_remove = &md_redundancy_group;
@@ -4736,7 +4766,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open)
4736 goto out; 4766 goto out;
4737 } 4767 }
4738 if (mddev->pers) { 4768 if (mddev->pers) {
4739 md_stop_writes(mddev); 4769 __md_stop_writes(mddev);
4740 4770
4741 err = -ENXIO; 4771 err = -ENXIO;
4742 if (mddev->ro==1) 4772 if (mddev->ro==1)
@@ -4773,7 +4803,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4773 if (mddev->ro) 4803 if (mddev->ro)
4774 set_disk_ro(disk, 0); 4804 set_disk_ro(disk, 0);
4775 4805
4776 md_stop_writes(mddev); 4806 __md_stop_writes(mddev);
4777 md_stop(mddev); 4807 md_stop(mddev);
4778 mddev->queue->merge_bvec_fn = NULL; 4808 mddev->queue->merge_bvec_fn = NULL;
4779 mddev->queue->unplug_fn = NULL; 4809 mddev->queue->unplug_fn = NULL;
@@ -5151,9 +5181,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
5151 /* set saved_raid_disk if appropriate */ 5181 /* set saved_raid_disk if appropriate */
5152 if (!mddev->persistent) { 5182 if (!mddev->persistent) {
5153 if (info->state & (1<<MD_DISK_SYNC) && 5183 if (info->state & (1<<MD_DISK_SYNC) &&
5154 info->raid_disk < mddev->raid_disks) 5184 info->raid_disk < mddev->raid_disks) {
5155 rdev->raid_disk = info->raid_disk; 5185 rdev->raid_disk = info->raid_disk;
5156 else 5186 set_bit(In_sync, &rdev->flags);
5187 } else
5157 rdev->raid_disk = -1; 5188 rdev->raid_disk = -1;
5158 } else 5189 } else
5159 super_types[mddev->major_version]. 5190 super_types[mddev->major_version].
@@ -5230,7 +5261,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
5230 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 5261 printk(KERN_INFO "md: nonpersistent superblock ...\n");
5231 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 5262 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5232 } else 5263 } else
5233 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 5264 rdev->sb_start = calc_dev_sboffset(rdev);
5234 rdev->sectors = rdev->sb_start; 5265 rdev->sectors = rdev->sb_start;
5235 5266
5236 err = bind_rdev_to_array(rdev, mddev); 5267 err = bind_rdev_to_array(rdev, mddev);
@@ -5297,7 +5328,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
5297 } 5328 }
5298 5329
5299 if (mddev->persistent) 5330 if (mddev->persistent)
5300 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 5331 rdev->sb_start = calc_dev_sboffset(rdev);
5301 else 5332 else
5302 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 5333 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5303 5334
@@ -5510,7 +5541,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
5510 * sb_start or, if that is <data_offset, it must fit before the size 5541 * sb_start or, if that is <data_offset, it must fit before the size
5511 * of each device. If num_sectors is zero, we find the largest size 5542 * of each device. If num_sectors is zero, we find the largest size
5512 * that fits. 5543 * that fits.
5513
5514 */ 5544 */
5515 if (mddev->sync_thread) 5545 if (mddev->sync_thread)
5516 return -EBUSY; 5546 return -EBUSY;
@@ -6033,7 +6063,8 @@ static int md_thread(void * arg)
6033 || kthread_should_stop(), 6063 || kthread_should_stop(),
6034 thread->timeout); 6064 thread->timeout);
6035 6065
6036 if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags)) 6066 clear_bit(THREAD_WAKEUP, &thread->flags);
6067 if (!kthread_should_stop())
6037 thread->run(thread->mddev); 6068 thread->run(thread->mddev);
6038 } 6069 }
6039 6070
@@ -6799,7 +6830,7 @@ void md_do_sync(mddev_t *mddev)
6799 desc, mdname(mddev)); 6830 desc, mdname(mddev));
6800 mddev->curr_resync = j; 6831 mddev->curr_resync = j;
6801 } 6832 }
6802 mddev->curr_resync_completed = mddev->curr_resync; 6833 mddev->curr_resync_completed = j;
6803 6834
6804 while (j < max_sectors) { 6835 while (j < max_sectors) {
6805 sector_t sectors; 6836 sector_t sectors;
@@ -6817,8 +6848,7 @@ void md_do_sync(mddev_t *mddev)
6817 md_unplug(mddev); 6848 md_unplug(mddev);
6818 wait_event(mddev->recovery_wait, 6849 wait_event(mddev->recovery_wait,
6819 atomic_read(&mddev->recovery_active) == 0); 6850 atomic_read(&mddev->recovery_active) == 0);
6820 mddev->curr_resync_completed = 6851 mddev->curr_resync_completed = j;
6821 mddev->curr_resync;
6822 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6852 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6823 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6853 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6824 } 6854 }
@@ -7023,6 +7053,45 @@ static int remove_and_add_spares(mddev_t *mddev)
7023 } 7053 }
7024 return spares; 7054 return spares;
7025} 7055}
7056
7057static void reap_sync_thread(mddev_t *mddev)
7058{
7059 mdk_rdev_t *rdev;
7060
7061 /* resync has finished, collect result */
7062 md_unregister_thread(mddev->sync_thread);
7063 mddev->sync_thread = NULL;
7064 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7065 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7066 /* success...*/
7067 /* activate any spares */
7068 if (mddev->pers->spare_active(mddev))
7069 sysfs_notify(&mddev->kobj, NULL,
7070 "degraded");
7071 }
7072 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7073 mddev->pers->finish_reshape)
7074 mddev->pers->finish_reshape(mddev);
7075 md_update_sb(mddev, 1);
7076
7077 /* if array is no-longer degraded, then any saved_raid_disk
7078 * information must be scrapped
7079 */
7080 if (!mddev->degraded)
7081 list_for_each_entry(rdev, &mddev->disks, same_set)
7082 rdev->saved_raid_disk = -1;
7083
7084 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7085 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7086 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7087 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7088 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7089 /* flag recovery needed just to double check */
7090 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7091 sysfs_notify_dirent_safe(mddev->sysfs_action);
7092 md_new_event(mddev);
7093}
7094
7026/* 7095/*
7027 * This routine is regularly called by all per-raid-array threads to 7096 * This routine is regularly called by all per-raid-array threads to
7028 * deal with generic issues like resync and super-block update. 7097 * deal with generic issues like resync and super-block update.
@@ -7047,9 +7116,6 @@ static int remove_and_add_spares(mddev_t *mddev)
7047 */ 7116 */
7048void md_check_recovery(mddev_t *mddev) 7117void md_check_recovery(mddev_t *mddev)
7049{ 7118{
7050 mdk_rdev_t *rdev;
7051
7052
7053 if (mddev->bitmap) 7119 if (mddev->bitmap)
7054 bitmap_daemon_work(mddev); 7120 bitmap_daemon_work(mddev);
7055 7121
@@ -7117,34 +7183,7 @@ void md_check_recovery(mddev_t *mddev)
7117 goto unlock; 7183 goto unlock;
7118 } 7184 }
7119 if (mddev->sync_thread) { 7185 if (mddev->sync_thread) {
7120 /* resync has finished, collect result */ 7186 reap_sync_thread(mddev);
7121 md_unregister_thread(mddev->sync_thread);
7122 mddev->sync_thread = NULL;
7123 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7124 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7125 /* success...*/
7126 /* activate any spares */
7127 if (mddev->pers->spare_active(mddev))
7128 sysfs_notify(&mddev->kobj, NULL,
7129 "degraded");
7130 }
7131 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7132 mddev->pers->finish_reshape)
7133 mddev->pers->finish_reshape(mddev);
7134 md_update_sb(mddev, 1);
7135
7136 /* if array is no-longer degraded, then any saved_raid_disk
7137 * information must be scrapped
7138 */
7139 if (!mddev->degraded)
7140 list_for_each_entry(rdev, &mddev->disks, same_set)
7141 rdev->saved_raid_disk = -1;
7142
7143 mddev->recovery = 0;
7144 /* flag recovery needed just to double check */
7145 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7146 sysfs_notify_dirent_safe(mddev->sysfs_action);
7147 md_new_event(mddev);
7148 goto unlock; 7187 goto unlock;
7149 } 7188 }
7150 /* Set RUNNING before clearing NEEDED to avoid 7189 /* Set RUNNING before clearing NEEDED to avoid
@@ -7202,7 +7241,11 @@ void md_check_recovery(mddev_t *mddev)
7202 " thread...\n", 7241 " thread...\n",
7203 mdname(mddev)); 7242 mdname(mddev));
7204 /* leave the spares where they are, it shouldn't hurt */ 7243 /* leave the spares where they are, it shouldn't hurt */
7205 mddev->recovery = 0; 7244 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7245 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7246 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7247 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7248 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7206 } else 7249 } else
7207 md_wakeup_thread(mddev->sync_thread); 7250 md_wakeup_thread(mddev->sync_thread);
7208 sysfs_notify_dirent_safe(mddev->sysfs_action); 7251 sysfs_notify_dirent_safe(mddev->sysfs_action);
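
Note: several md.c hunks cooperate around the new mddev->ready flag: md_run() issues smp_wmb() and sets ready only after the personality is fully initialised, md_stop() clears ready before tearing the personality down, and md_make_request() fails bios while ready is unset, with an smp_rmb() after the check. The underlying publish/consume pattern, sketched with hypothetical names and assuming the usual kernel barrier primitives:

#include <linux/errno.h>

struct dev_state {
	int ready;		/* published last, checked first */
	void *pers;		/* ops installed during setup */
};

static void publish(struct dev_state *s, void *pers)
{
	s->pers = pers;		/* complete all initialisation ... */
	smp_wmb();		/* ... and make it visible ... */
	s->ready = 1;		/* ... before advertising readiness */
}

static int submit(struct dev_state *s)
{
	if (!s->ready)
		return -ENXIO;	/* md_make_request errors the bio here */
	smp_rmb();		/* pairs with smp_wmb(): setup is visible now */
	/* safe to use s->pers from this point on */
	return 0;
}
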
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d05bab55df4e..eec517ced31a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -60,6 +60,12 @@ struct mdk_rdev_s
60 mddev_t *mddev; /* RAID array if running */ 60 mddev_t *mddev; /* RAID array if running */
61 int last_events; /* IO event timestamp */ 61 int last_events; /* IO event timestamp */
62 62
63 /*
64 * If meta_bdev is non-NULL, it means that a separate device is
65 * being used to store the metadata (superblock/bitmap) which
66 * would otherwise be contained on the same device as the data (bdev).
67 */
68 struct block_device *meta_bdev;
63 struct block_device *bdev; /* block device handle */ 69 struct block_device *bdev; /* block device handle */
64 70
65 struct page *sb_page; 71 struct page *sb_page;
@@ -148,7 +154,8 @@ struct mddev_s
148 * are happening, so run/ 154 * are happening, so run/
149 * takeover/stop are not safe 155 * takeover/stop are not safe
150 */ 156 */
151 157 int ready; /* See when safe to pass
158 * IO requests down */
152 struct gendisk *gendisk; 159 struct gendisk *gendisk;
153 160
154 struct kobject kobj; 161 struct kobject kobj;
@@ -497,8 +504,8 @@ extern void md_flush_request(mddev_t *mddev, struct bio *bio);
497extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 504extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
498 sector_t sector, int size, struct page *page); 505 sector_t sector, int size, struct page *page);
499extern void md_super_wait(mddev_t *mddev); 506extern void md_super_wait(mddev_t *mddev);
500extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, 507extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
501 struct page *page, int rw); 508 struct page *page, int rw, bool metadata_op);
502extern void md_do_sync(mddev_t *mddev); 509extern void md_do_sync(mddev_t *mddev);
503extern void md_new_event(mddev_t *mddev); 510extern void md_new_event(mddev_t *mddev);
504extern int md_allow_write(mddev_t *mddev); 511extern int md_allow_write(mddev_t *mddev);
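
Note: the md.h hunk above changes the exported sync_page_io() prototype to take a metadata_op flag, matching the md.c hunks where the function now adds rdev->data_offset (or rdev->sb_start for metadata) itself and routes metadata I/O to rdev->meta_bdev when one is configured. A sketch of the calling convention before and after, as the raid1/raid10 hunks below apply it (assuming the md.h definitions above are in scope):

/* Old interface: the caller added the on-device offset by hand and every
 * request went to rdev->bdev:
 *
 *	sync_page_io(rdev, sect + rdev->data_offset, size, page, READ);
 *
 * New interface: pass the logical sector and say whether this is metadata. */
static int read_data_block(mdk_rdev_t *rdev, sector_t sect, int size,
			   struct page *page)
{
	return sync_page_io(rdev, sect, size, page, READ, false);
}

static int read_superblock(mdk_rdev_t *rdev, int size)
{
	/* metadata_op = true: offset by sb_start, may use rdev->meta_bdev */
	return sync_page_io(rdev, 0, size, rdev->sb_page, READ, true);
}
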
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 845cf95b612c..a23ffa397ba9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1027,8 +1027,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1027 } else 1027 } else
1028 set_bit(Faulty, &rdev->flags); 1028 set_bit(Faulty, &rdev->flags);
1029 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1029 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1030 printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" 1030 printk(KERN_ALERT
1031 KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", 1031 "md/raid1:%s: Disk failure on %s, disabling device.\n"
1032 "md/raid1:%s: Operation continuing on %d devices.\n",
1032 mdname(mddev), bdevname(rdev->bdev, b), 1033 mdname(mddev), bdevname(rdev->bdev, b),
1033 mdname(mddev), conf->raid_disks - mddev->degraded); 1034 mdname(mddev), conf->raid_disks - mddev->degraded);
1034} 1035}
@@ -1364,10 +1365,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1364 */ 1365 */
1365 rdev = conf->mirrors[d].rdev; 1366 rdev = conf->mirrors[d].rdev;
1366 if (sync_page_io(rdev, 1367 if (sync_page_io(rdev,
1367 sect + rdev->data_offset, 1368 sect,
1368 s<<9, 1369 s<<9,
1369 bio->bi_io_vec[idx].bv_page, 1370 bio->bi_io_vec[idx].bv_page,
1370 READ)) { 1371 READ, false)) {
1371 success = 1; 1372 success = 1;
1372 break; 1373 break;
1373 } 1374 }
@@ -1390,10 +1391,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1390 rdev = conf->mirrors[d].rdev; 1391 rdev = conf->mirrors[d].rdev;
1391 atomic_add(s, &rdev->corrected_errors); 1392 atomic_add(s, &rdev->corrected_errors);
1392 if (sync_page_io(rdev, 1393 if (sync_page_io(rdev,
1393 sect + rdev->data_offset, 1394 sect,
1394 s<<9, 1395 s<<9,
1395 bio->bi_io_vec[idx].bv_page, 1396 bio->bi_io_vec[idx].bv_page,
1396 WRITE) == 0) 1397 WRITE, false) == 0)
1397 md_error(mddev, rdev); 1398 md_error(mddev, rdev);
1398 } 1399 }
1399 d = start; 1400 d = start;
@@ -1405,10 +1406,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1405 continue; 1406 continue;
1406 rdev = conf->mirrors[d].rdev; 1407 rdev = conf->mirrors[d].rdev;
1407 if (sync_page_io(rdev, 1408 if (sync_page_io(rdev,
1408 sect + rdev->data_offset, 1409 sect,
1409 s<<9, 1410 s<<9,
1410 bio->bi_io_vec[idx].bv_page, 1411 bio->bi_io_vec[idx].bv_page,
1411 READ) == 0) 1412 READ, false) == 0)
1412 md_error(mddev, rdev); 1413 md_error(mddev, rdev);
1413 } 1414 }
1414 } else { 1415 } else {
@@ -1488,10 +1489,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1488 rdev = conf->mirrors[d].rdev; 1489 rdev = conf->mirrors[d].rdev;
1489 if (rdev && 1490 if (rdev &&
1490 test_bit(In_sync, &rdev->flags) && 1491 test_bit(In_sync, &rdev->flags) &&
1491 sync_page_io(rdev, 1492 sync_page_io(rdev, sect, s<<9,
1492 sect + rdev->data_offset, 1493 conf->tmppage, READ, false))
1493 s<<9,
1494 conf->tmppage, READ))
1495 success = 1; 1494 success = 1;
1496 else { 1495 else {
1497 d++; 1496 d++;
@@ -1514,9 +1513,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1514 rdev = conf->mirrors[d].rdev; 1513 rdev = conf->mirrors[d].rdev;
1515 if (rdev && 1514 if (rdev &&
1516 test_bit(In_sync, &rdev->flags)) { 1515 test_bit(In_sync, &rdev->flags)) {
1517 if (sync_page_io(rdev, 1516 if (sync_page_io(rdev, sect, s<<9,
1518 sect + rdev->data_offset, 1517 conf->tmppage, WRITE, false)
1519 s<<9, conf->tmppage, WRITE)
1520 == 0) 1518 == 0)
1521 /* Well, this device is dead */ 1519 /* Well, this device is dead */
1522 md_error(mddev, rdev); 1520 md_error(mddev, rdev);
@@ -1531,9 +1529,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1531 rdev = conf->mirrors[d].rdev; 1529 rdev = conf->mirrors[d].rdev;
1532 if (rdev && 1530 if (rdev &&
1533 test_bit(In_sync, &rdev->flags)) { 1531 test_bit(In_sync, &rdev->flags)) {
1534 if (sync_page_io(rdev, 1532 if (sync_page_io(rdev, sect, s<<9,
1535 sect + rdev->data_offset, 1533 conf->tmppage, READ, false)
1536 s<<9, conf->tmppage, READ)
1537 == 0) 1534 == 0)
1538 /* Well, this device is dead */ 1535 /* Well, this device is dead */
1539 md_error(mddev, rdev); 1536 md_error(mddev, rdev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0641674827f0..69b659544390 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1051,8 +1051,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1051 } 1051 }
1052 set_bit(Faulty, &rdev->flags); 1052 set_bit(Faulty, &rdev->flags);
1053 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1053 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1054 printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" 1054 printk(KERN_ALERT
1055 KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", 1055 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1056 "md/raid10:%s: Operation continuing on %d devices.\n",
1056 mdname(mddev), bdevname(rdev->bdev, b), 1057 mdname(mddev), bdevname(rdev->bdev, b),
1057 mdname(mddev), conf->raid_disks - mddev->degraded); 1058 mdname(mddev), conf->raid_disks - mddev->degraded);
1058} 1059}
@@ -1559,9 +1560,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1559 rcu_read_unlock(); 1560 rcu_read_unlock();
1560 success = sync_page_io(rdev, 1561 success = sync_page_io(rdev,
1561 r10_bio->devs[sl].addr + 1562 r10_bio->devs[sl].addr +
1562 sect + rdev->data_offset, 1563 sect,
1563 s<<9, 1564 s<<9,
1564 conf->tmppage, READ); 1565 conf->tmppage, READ, false);
1565 rdev_dec_pending(rdev, mddev); 1566 rdev_dec_pending(rdev, mddev);
1566 rcu_read_lock(); 1567 rcu_read_lock();
1567 if (success) 1568 if (success)
@@ -1598,8 +1599,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1598 atomic_add(s, &rdev->corrected_errors); 1599 atomic_add(s, &rdev->corrected_errors);
1599 if (sync_page_io(rdev, 1600 if (sync_page_io(rdev,
1600 r10_bio->devs[sl].addr + 1601 r10_bio->devs[sl].addr +
1601 sect + rdev->data_offset, 1602 sect,
1602 s<<9, conf->tmppage, WRITE) 1603 s<<9, conf->tmppage, WRITE, false)
1603 == 0) { 1604 == 0) {
1604 /* Well, this device is dead */ 1605 /* Well, this device is dead */
1605 printk(KERN_NOTICE 1606 printk(KERN_NOTICE
@@ -1635,9 +1636,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1635 rcu_read_unlock(); 1636 rcu_read_unlock();
1636 if (sync_page_io(rdev, 1637 if (sync_page_io(rdev,
1637 r10_bio->devs[sl].addr + 1638 r10_bio->devs[sl].addr +
1638 sect + rdev->data_offset, 1639 sect,
1639 s<<9, conf->tmppage, 1640 s<<9, conf->tmppage,
1640 READ) == 0) { 1641 READ, false) == 0) {
1641 /* Well, this device is dead */ 1642 /* Well, this device is dead */
1642 printk(KERN_NOTICE 1643 printk(KERN_NOTICE
1643 "md/raid10:%s: unable to read back " 1644 "md/raid10:%s: unable to read back "
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dc574f303f8b..5044babfcda0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1721,7 +1721,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1721 set_bit(Faulty, &rdev->flags); 1721 set_bit(Faulty, &rdev->flags);
1722 printk(KERN_ALERT 1722 printk(KERN_ALERT
1723 "md/raid:%s: Disk failure on %s, disabling device.\n" 1723 "md/raid:%s: Disk failure on %s, disabling device.\n"
1724 KERN_ALERT
1725 "md/raid:%s: Operation continuing on %d devices.\n", 1724 "md/raid:%s: Operation continuing on %d devices.\n",
1726 mdname(mddev), 1725 mdname(mddev),
1727 bdevname(rdev->bdev, b), 1726 bdevname(rdev->bdev, b),
@@ -4237,7 +4236,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
4237 wait_event(conf->wait_for_overlap, 4236 wait_event(conf->wait_for_overlap,
4238 atomic_read(&conf->reshape_stripes)==0); 4237 atomic_read(&conf->reshape_stripes)==0);
4239 mddev->reshape_position = conf->reshape_progress; 4238 mddev->reshape_position = conf->reshape_progress;
4240 mddev->curr_resync_completed = mddev->curr_resync; 4239 mddev->curr_resync_completed = sector_nr;
4241 conf->reshape_checkpoint = jiffies; 4240 conf->reshape_checkpoint = jiffies;
4242 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4241 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4243 md_wakeup_thread(mddev->thread); 4242 md_wakeup_thread(mddev->thread);
@@ -4338,7 +4337,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
4338 wait_event(conf->wait_for_overlap, 4337 wait_event(conf->wait_for_overlap,
4339 atomic_read(&conf->reshape_stripes) == 0); 4338 atomic_read(&conf->reshape_stripes) == 0);
4340 mddev->reshape_position = conf->reshape_progress; 4339 mddev->reshape_position = conf->reshape_progress;
4341 mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; 4340 mddev->curr_resync_completed = sector_nr;
4342 conf->reshape_checkpoint = jiffies; 4341 conf->reshape_checkpoint = jiffies;
4343 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4342 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4344 md_wakeup_thread(mddev->thread); 4343 md_wakeup_thread(mddev->thread);
@@ -5339,7 +5338,7 @@ static int raid5_spare_active(mddev_t *mddev)
5339 && !test_bit(Faulty, &tmp->rdev->flags) 5338 && !test_bit(Faulty, &tmp->rdev->flags)
5340 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5339 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
5341 count++; 5340 count++;
5342 sysfs_notify_dirent(tmp->rdev->sysfs_state); 5341 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
5343 } 5342 }
5344 } 5343 }
5345 spin_lock_irqsave(&conf->device_lock, flags); 5344 spin_lock_irqsave(&conf->device_lock, flags);
@@ -5528,8 +5527,8 @@ static int raid5_start_reshape(mddev_t *mddev)
5528 return -ENOSPC; 5527 return -ENOSPC;
5529 5528
5530 list_for_each_entry(rdev, &mddev->disks, same_set) 5529 list_for_each_entry(rdev, &mddev->disks, same_set)
5531 if (rdev->raid_disk < 0 && 5530 if ((rdev->raid_disk < 0 || rdev->raid_disk >= conf->raid_disks)
5532 !test_bit(Faulty, &rdev->flags)) 5531 && !test_bit(Faulty, &rdev->flags))
5533 spares++; 5532 spares++;
5534 5533
5535 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5534 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
@@ -5589,6 +5588,11 @@ static int raid5_start_reshape(mddev_t *mddev)
5589 /* Failure here is OK */; 5588 /* Failure here is OK */;
5590 } else 5589 } else
5591 break; 5590 break;
5591 } else if (rdev->raid_disk >= conf->previous_raid_disks
5592 && !test_bit(Faulty, &rdev->flags)) {
5593 /* This is a spare that was manually added */
5594 set_bit(In_sync, &rdev->flags);
5595 added_devices++;
5592 } 5596 }
5593 5597
5594 /* When a reshape changes the number of devices, ->degraded 5598 /* When a reshape changes the number of devices, ->degraded
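
Note: the raid1, raid10 and raid5 hunks above all make the same fix: a single printk() takes one KERN_<level> prefix at the very start of its format string, so a second KERN_ALERT spliced between the concatenated lines is not reinterpreted as a level and only ends up as stray text in the log. A sketch of the corrected form with placeholder arguments:

#include <linux/kernel.h>

static void report_disk_failure(const char *array, const char *disk, int left)
{
	/* one level prefix, then plain continuation lines in the same call */
	printk(KERN_ALERT
	       "md/raid:%s: Disk failure on %s, disabling device.\n"
	       "md/raid:%s: Operation continuing on %d devices.\n",
	       array, disk, array, left);
}
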
diff --git a/drivers/serial/atmel_serial.c b/drivers/serial/atmel_serial.c
index 3892666b5fbd..2a1d52fb4936 100644
--- a/drivers/serial/atmel_serial.c
+++ b/drivers/serial/atmel_serial.c
@@ -1732,6 +1732,11 @@ static int __devinit atmel_serial_probe(struct platform_device *pdev)
1732 device_init_wakeup(&pdev->dev, 1); 1732 device_init_wakeup(&pdev->dev, 1);
1733 platform_set_drvdata(pdev, port); 1733 platform_set_drvdata(pdev, port);
1734 1734
1735 if (port->rs485.flags & SER_RS485_ENABLED) {
1736 UART_PUT_MR(&port->uart, ATMEL_US_USMODE_NORMAL);
1737 UART_PUT_CR(&port->uart, ATMEL_US_RTSEN);
1738 }
1739
1735 return 0; 1740 return 0;
1736 1741
1737err_add_port: 1742err_add_port:
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 5a48ce996dea..07bec09d1dad 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -71,11 +71,18 @@ config XEN_SYS_HYPERVISOR
71 but will have no xen contents. 71 but will have no xen contents.
72 72
73config XEN_XENBUS_FRONTEND 73config XEN_XENBUS_FRONTEND
74 tristate 74 tristate
75
76config XEN_GNTDEV
77 tristate "userspace grant access device driver"
78 depends on XEN
79 select MMU_NOTIFIER
80 help
81 Allows userspace processes to use grants.
75 82
76config XEN_PLATFORM_PCI 83config XEN_PLATFORM_PCI
77 tristate "xen platform pci device driver" 84 tristate "xen platform pci device driver"
78 depends on XEN_PVHVM 85 depends on XEN_PVHVM && PCI
79 default m 86 default m
80 help 87 help
81 Driver for the Xen PCI Platform device: it is responsible for 88 Driver for the Xen PCI Platform device: it is responsible for
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 533a199e7a3f..5088cc2e6fe2 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -9,11 +9,14 @@ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
9obj-$(CONFIG_XEN_XENCOMM) += xencomm.o 9obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
10obj-$(CONFIG_XEN_BALLOON) += balloon.o 10obj-$(CONFIG_XEN_BALLOON) += balloon.o
11obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o 11obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
12obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
12obj-$(CONFIG_XENFS) += xenfs/ 13obj-$(CONFIG_XENFS) += xenfs/
13obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o 14obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
14obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o 15obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o
15obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o 16obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
16obj-$(CONFIG_XEN_DOM0) += pci.o 17obj-$(CONFIG_XEN_DOM0) += pci.o
17 18
18xen-evtchn-y := evtchn.o 19xen-evtchn-y := evtchn.o
20xen-gntdev-y := gntdev.o
19 21
22xen-platform-pci-y := platform-pci.o
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
new file mode 100644
index 000000000000..1e31cdcdae1e
--- /dev/null
+++ b/drivers/xen/gntdev.c
@@ -0,0 +1,665 @@
1/******************************************************************************
2 * gntdev.c
3 *
4 * Device for accessing (in user-space) pages that have been granted by other
5 * domains.
6 *
7 * Copyright (c) 2006-2007, D G Murray.
8 * (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#undef DEBUG
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24#include <linux/init.h>
25#include <linux/miscdevice.h>
26#include <linux/fs.h>
27#include <linux/mm.h>
28#include <linux/mman.h>
29#include <linux/mmu_notifier.h>
30#include <linux/types.h>
31#include <linux/uaccess.h>
32#include <linux/sched.h>
33#include <linux/spinlock.h>
34#include <linux/slab.h>
35
36#include <xen/xen.h>
37#include <xen/grant_table.h>
38#include <xen/gntdev.h>
39#include <asm/xen/hypervisor.h>
40#include <asm/xen/hypercall.h>
41#include <asm/xen/page.h>
42
43MODULE_LICENSE("GPL");
44MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
45 "Gerd Hoffmann <kraxel@redhat.com>");
46MODULE_DESCRIPTION("User-space granted page access driver");
47
48static int limit = 1024;
49module_param(limit, int, 0644);
50MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at "
51 "once by a gntdev instance");
52
53struct gntdev_priv {
54 struct list_head maps;
55 uint32_t used;
56 uint32_t limit;
57 /* lock protects maps from concurrent changes */
58 spinlock_t lock;
59 struct mm_struct *mm;
60 struct mmu_notifier mn;
61};
62
63struct grant_map {
64 struct list_head next;
65 struct gntdev_priv *priv;
66 struct vm_area_struct *vma;
67 int index;
68 int count;
69 int flags;
70 int is_mapped;
71 struct ioctl_gntdev_grant_ref *grants;
72 struct gnttab_map_grant_ref *map_ops;
73 struct gnttab_unmap_grant_ref *unmap_ops;
74 struct page **pages;
75};
76
77/* ------------------------------------------------------------------ */
78
79static void gntdev_print_maps(struct gntdev_priv *priv,
80 char *text, int text_index)
81{
82#ifdef DEBUG
83 struct grant_map *map;
84
85 pr_debug("maps list (priv %p, usage %d/%d)\n",
86 priv, priv->used, priv->limit);
87
88 list_for_each_entry(map, &priv->maps, next)
89 pr_debug(" index %2d, count %2d %s\n",
90 map->index, map->count,
91 map->index == text_index && text ? text : "");
92#endif
93}
94
95static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
96{
97 struct grant_map *add;
98 int i;
99
100 add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
101 if (NULL == add)
102 return NULL;
103
104 add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL);
105 add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL);
106 add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
107 add->pages = kzalloc(sizeof(add->pages[0]) * count, GFP_KERNEL);
108 if (NULL == add->grants ||
109 NULL == add->map_ops ||
110 NULL == add->unmap_ops ||
111 NULL == add->pages)
112 goto err;
113
114 for (i = 0; i < count; i++) {
115 add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
116 if (add->pages[i] == NULL)
117 goto err;
118 }
119
120 add->index = 0;
121 add->count = count;
122 add->priv = priv;
123
124 if (add->count + priv->used > priv->limit)
125 goto err;
126
127 return add;
128
129err:
130 if (add->pages)
131 for (i = 0; i < count; i++) {
132 if (add->pages[i])
133 __free_page(add->pages[i]);
134 }
135 kfree(add->pages);
136 kfree(add->grants);
137 kfree(add->map_ops);
138 kfree(add->unmap_ops);
139 kfree(add);
140 return NULL;
141}
142
143static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
144{
145 struct grant_map *map;
146
147 list_for_each_entry(map, &priv->maps, next) {
148 if (add->index + add->count < map->index) {
149 list_add_tail(&add->next, &map->next);
150 goto done;
151 }
152 add->index = map->index + map->count;
153 }
154 list_add_tail(&add->next, &priv->maps);
155
156done:
157 priv->used += add->count;
158 gntdev_print_maps(priv, "[new]", add->index);
159}
160
161static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv,
162 int index, int count)
163{
164 struct grant_map *map;
165
166 list_for_each_entry(map, &priv->maps, next) {
167 if (map->index != index)
168 continue;
169 if (map->count != count)
170 continue;
171 return map;
172 }
173 return NULL;
174}
175
176static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
177 unsigned long vaddr)
178{
179 struct grant_map *map;
180
181 list_for_each_entry(map, &priv->maps, next) {
182 if (!map->vma)
183 continue;
184 if (vaddr < map->vma->vm_start)
185 continue;
186 if (vaddr >= map->vma->vm_end)
187 continue;
188 return map;
189 }
190 return NULL;
191}
192
193static int gntdev_del_map(struct grant_map *map)
194{
195 int i;
196
197 if (map->vma)
198 return -EBUSY;
199 for (i = 0; i < map->count; i++)
200 if (map->unmap_ops[i].handle)
201 return -EBUSY;
202
203 map->priv->used -= map->count;
204 list_del(&map->next);
205 return 0;
206}
207
208static void gntdev_free_map(struct grant_map *map)
209{
210 int i;
211
212 if (!map)
213 return;
214
215 if (map->pages)
216 for (i = 0; i < map->count; i++) {
217 if (map->pages[i])
218 __free_page(map->pages[i]);
219 }
220 kfree(map->pages);
221 kfree(map->grants);
222 kfree(map->map_ops);
223 kfree(map->unmap_ops);
224 kfree(map);
225}
226
227/* ------------------------------------------------------------------ */
228
229static int find_grant_ptes(pte_t *pte, pgtable_t token,
230 unsigned long addr, void *data)
231{
232 struct grant_map *map = data;
233 unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
234 u64 pte_maddr;
235
236 BUG_ON(pgnr >= map->count);
237 pte_maddr = arbitrary_virt_to_machine(pte).maddr;
238
239 gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr,
240 GNTMAP_contains_pte | map->flags,
241 map->grants[pgnr].ref,
242 map->grants[pgnr].domid);
243 gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr,
244 GNTMAP_contains_pte | map->flags,
245 0 /* handle */);
246 return 0;
247}
248
249static int map_grant_pages(struct grant_map *map)
250{
251 int i, err = 0;
252
253 pr_debug("map %d+%d\n", map->index, map->count);
254 err = gnttab_map_refs(map->map_ops, map->pages, map->count);
255 if (err)
256 return err;
257
258 for (i = 0; i < map->count; i++) {
259 if (map->map_ops[i].status)
260 err = -EINVAL;
261 map->unmap_ops[i].handle = map->map_ops[i].handle;
262 }
263 return err;
264}
265
266static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
267{
268 int i, err = 0;
269
270 pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
271 err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages);
272 if (err)
273 return err;
274
275 for (i = 0; i < pages; i++) {
276 if (map->unmap_ops[offset+i].status)
277 err = -EINVAL;
278 map->unmap_ops[offset+i].handle = 0;
279 }
280 return err;
281}
282
283/* ------------------------------------------------------------------ */
284
285static void gntdev_vma_close(struct vm_area_struct *vma)
286{
287 struct grant_map *map = vma->vm_private_data;
288
289 pr_debug("close %p\n", vma);
290 map->is_mapped = 0;
291 map->vma = NULL;
292 vma->vm_private_data = NULL;
293}
294
295static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
296{
297 pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n",
298 vmf->virtual_address, vmf->pgoff);
299 vmf->flags = VM_FAULT_ERROR;
300 return 0;
301}
302
303static struct vm_operations_struct gntdev_vmops = {
304 .close = gntdev_vma_close,
305 .fault = gntdev_vma_fault,
306};
307
308/* ------------------------------------------------------------------ */
309
310static void mn_invl_range_start(struct mmu_notifier *mn,
311 struct mm_struct *mm,
312 unsigned long start, unsigned long end)
313{
314 struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
315 struct grant_map *map;
316 unsigned long mstart, mend;
317 int err;
318
319 spin_lock(&priv->lock);
320 list_for_each_entry(map, &priv->maps, next) {
321 if (!map->vma)
322 continue;
323 if (!map->is_mapped)
324 continue;
325 if (map->vma->vm_start >= end)
326 continue;
327 if (map->vma->vm_end <= start)
328 continue;
329 mstart = max(start, map->vma->vm_start);
330 mend = min(end, map->vma->vm_end);
331 pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
332 map->index, map->count,
333 map->vma->vm_start, map->vma->vm_end,
334 start, end, mstart, mend);
335 err = unmap_grant_pages(map,
336 (mstart - map->vma->vm_start) >> PAGE_SHIFT,
337 (mend - mstart) >> PAGE_SHIFT);
338 WARN_ON(err);
339 }
340 spin_unlock(&priv->lock);
341}
342
343static void mn_invl_page(struct mmu_notifier *mn,
344 struct mm_struct *mm,
345 unsigned long address)
346{
347 mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
348}
349
350static void mn_release(struct mmu_notifier *mn,
351 struct mm_struct *mm)
352{
353 struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
354 struct grant_map *map;
355 int err;
356
357 spin_lock(&priv->lock);
358 list_for_each_entry(map, &priv->maps, next) {
359 if (!map->vma)
360 continue;
361 pr_debug("map %d+%d (%lx %lx)\n",
362 map->index, map->count,
363 map->vma->vm_start, map->vma->vm_end);
364 err = unmap_grant_pages(map, /* offset */ 0, map->count);
365 WARN_ON(err);
366 }
367 spin_unlock(&priv->lock);
368}
369
370struct mmu_notifier_ops gntdev_mmu_ops = {
371 .release = mn_release,
372 .invalidate_page = mn_invl_page,
373 .invalidate_range_start = mn_invl_range_start,
374};
375
376/* ------------------------------------------------------------------ */
377
378static int gntdev_open(struct inode *inode, struct file *flip)
379{
380 struct gntdev_priv *priv;
381 int ret = 0;
382
383 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
384 if (!priv)
385 return -ENOMEM;
386
387 INIT_LIST_HEAD(&priv->maps);
388 spin_lock_init(&priv->lock);
389 priv->limit = limit;
390
391 priv->mm = get_task_mm(current);
392 if (!priv->mm) {
393 kfree(priv);
394 return -ENOMEM;
395 }
396 priv->mn.ops = &gntdev_mmu_ops;
397 ret = mmu_notifier_register(&priv->mn, priv->mm);
398 mmput(priv->mm);
399
400 if (ret) {
401 kfree(priv);
402 return ret;
403 }
404
405 flip->private_data = priv;
406 pr_debug("priv %p\n", priv);
407
408 return 0;
409}
410
411static int gntdev_release(struct inode *inode, struct file *flip)
412{
413 struct gntdev_priv *priv = flip->private_data;
414 struct grant_map *map;
415 int err;
416
417 pr_debug("priv %p\n", priv);
418
419 spin_lock(&priv->lock);
420 while (!list_empty(&priv->maps)) {
421 map = list_entry(priv->maps.next, struct grant_map, next);
422 err = gntdev_del_map(map);
423 if (WARN_ON(err))
424 gntdev_free_map(map);
425
426 }
427 spin_unlock(&priv->lock);
428
429 mmu_notifier_unregister(&priv->mn, priv->mm);
430 kfree(priv);
431 return 0;
432}
433
434static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
435 struct ioctl_gntdev_map_grant_ref __user *u)
436{
437 struct ioctl_gntdev_map_grant_ref op;
438 struct grant_map *map;
439 int err;
440
441 if (copy_from_user(&op, u, sizeof(op)) != 0)
442 return -EFAULT;
443 pr_debug("priv %p, add %d\n", priv, op.count);
444 if (unlikely(op.count <= 0))
445 return -EINVAL;
446 if (unlikely(op.count > priv->limit))
447 return -EINVAL;
448
449 err = -ENOMEM;
450 map = gntdev_alloc_map(priv, op.count);
451 if (!map)
452 return err;
453 if (copy_from_user(map->grants, &u->refs,
454 sizeof(map->grants[0]) * op.count) != 0) {
455 gntdev_free_map(map);
456 return err;
457 }
458
459 spin_lock(&priv->lock);
460 gntdev_add_map(priv, map);
461 op.index = map->index << PAGE_SHIFT;
462 spin_unlock(&priv->lock);
463
464 if (copy_to_user(u, &op, sizeof(op)) != 0) {
465 spin_lock(&priv->lock);
466 gntdev_del_map(map);
467 spin_unlock(&priv->lock);
468 gntdev_free_map(map);
469 return err;
470 }
471 return 0;
472}
473
474static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
475 struct ioctl_gntdev_unmap_grant_ref __user *u)
476{
477 struct ioctl_gntdev_unmap_grant_ref op;
478 struct grant_map *map;
479 int err = -ENOENT;
480
481 if (copy_from_user(&op, u, sizeof(op)) != 0)
482 return -EFAULT;
483 pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count);
484
485 spin_lock(&priv->lock);
486 map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
487 if (map)
488 err = gntdev_del_map(map);
489 spin_unlock(&priv->lock);
490 if (!err)
491 gntdev_free_map(map);
492 return err;
493}
494
495static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
496 struct ioctl_gntdev_get_offset_for_vaddr __user *u)
497{
498 struct ioctl_gntdev_get_offset_for_vaddr op;
499 struct grant_map *map;
500
501 if (copy_from_user(&op, u, sizeof(op)) != 0)
502 return -EFAULT;
503 pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);
504
505 spin_lock(&priv->lock);
506 map = gntdev_find_map_vaddr(priv, op.vaddr);
507 if (map == NULL ||
508 map->vma->vm_start != op.vaddr) {
509 spin_unlock(&priv->lock);
510 return -EINVAL;
511 }
512 op.offset = map->index << PAGE_SHIFT;
513 op.count = map->count;
514 spin_unlock(&priv->lock);
515
516 if (copy_to_user(u, &op, sizeof(op)) != 0)
517 return -EFAULT;
518 return 0;
519}
520
521static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
522 struct ioctl_gntdev_set_max_grants __user *u)
523{
524 struct ioctl_gntdev_set_max_grants op;
525
526 if (copy_from_user(&op, u, sizeof(op)) != 0)
527 return -EFAULT;
528 pr_debug("priv %p, limit %d\n", priv, op.count);
529 if (op.count > limit)
530 return -E2BIG;
531
532 spin_lock(&priv->lock);
533 priv->limit = op.count;
534 spin_unlock(&priv->lock);
535 return 0;
536}
537
538static long gntdev_ioctl(struct file *flip,
539 unsigned int cmd, unsigned long arg)
540{
541 struct gntdev_priv *priv = flip->private_data;
542 void __user *ptr = (void __user *)arg;
543
544 switch (cmd) {
545 case IOCTL_GNTDEV_MAP_GRANT_REF:
546 return gntdev_ioctl_map_grant_ref(priv, ptr);
547
548 case IOCTL_GNTDEV_UNMAP_GRANT_REF:
549 return gntdev_ioctl_unmap_grant_ref(priv, ptr);
550
551 case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
552 return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
553
554 case IOCTL_GNTDEV_SET_MAX_GRANTS:
555 return gntdev_ioctl_set_max_grants(priv, ptr);
556
557 default:
558 pr_debug("priv %p, unknown cmd %x\n", priv, cmd);
559 return -ENOIOCTLCMD;
560 }
561
562 return 0;
563}
564
565static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
566{
567 struct gntdev_priv *priv = flip->private_data;
568 int index = vma->vm_pgoff;
569 int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
570 struct grant_map *map;
571 int err = -EINVAL;
572
573 if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
574 return -EINVAL;
575
576 pr_debug("map %d+%d at %lx (pgoff %lx)\n",
577 index, count, vma->vm_start, vma->vm_pgoff);
578
579 spin_lock(&priv->lock);
580 map = gntdev_find_map_index(priv, index, count);
581 if (!map)
582 goto unlock_out;
583 if (map->vma)
584 goto unlock_out;
585 if (priv->mm != vma->vm_mm) {
586 printk(KERN_WARNING "Huh? Other mm?\n");
587 goto unlock_out;
588 }
589
590 vma->vm_ops = &gntdev_vmops;
591
592 vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP;
593
594 vma->vm_private_data = map;
595 map->vma = vma;
596
597 map->flags = GNTMAP_host_map | GNTMAP_application_map;
598 if (!(vma->vm_flags & VM_WRITE))
599 map->flags |= GNTMAP_readonly;
600
601 spin_unlock(&priv->lock);
602
603 err = apply_to_page_range(vma->vm_mm, vma->vm_start,
604 vma->vm_end - vma->vm_start,
605 find_grant_ptes, map);
606 if (err) {
607 printk(KERN_WARNING "find_grant_ptes() failure.\n");
608 return err;
609 }
610
611 err = map_grant_pages(map);
612 if (err) {
613 printk(KERN_WARNING "map_grant_pages() failure.\n");
614 return err;
615 }
616
617 map->is_mapped = 1;
618
619 return 0;
620
621unlock_out:
622 spin_unlock(&priv->lock);
623 return err;
624}
625
626static const struct file_operations gntdev_fops = {
627 .owner = THIS_MODULE,
628 .open = gntdev_open,
629 .release = gntdev_release,
630 .mmap = gntdev_mmap,
631 .unlocked_ioctl = gntdev_ioctl
632};
633
634static struct miscdevice gntdev_miscdev = {
635 .minor = MISC_DYNAMIC_MINOR,
636 .name = "xen/gntdev",
637 .fops = &gntdev_fops,
638};
639
640/* ------------------------------------------------------------------ */
641
642static int __init gntdev_init(void)
643{
644 int err;
645
646 if (!xen_domain())
647 return -ENODEV;
648
649 err = misc_register(&gntdev_miscdev);
650 if (err != 0) {
651 printk(KERN_ERR "Could not register gntdev device\n");
652 return err;
653 }
654 return 0;
655}
656
657static void __exit gntdev_exit(void)
658{
659 misc_deregister(&gntdev_miscdev);
660}
661
662module_init(gntdev_init);
663module_exit(gntdev_exit);
664
665/* ------------------------------------------------------------------ */
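
For orientation, a minimal, hypothetical userspace sketch of how the character device above is meant to be driven: register one grant reference from a remote domain, mmap it at the offset gntdev returns, then unmap it. The ioctl structure layouts are assumed to come from the public gntdev header (IOCTL_GNTDEV_MAP_GRANT_REF and friends); the remote domid and grant reference are taken from the command line, and error handling is kept minimal. This is not part of the patch, only an illustration of the interface.

/* Hypothetical usage sketch for /dev/xen/gntdev (illustration only). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/gntdev.h>		/* assumed to provide the ioctl structs */

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <domid> <gntref>\n", argv[0]);
		return 1;
	}

	int fd = open("/dev/xen/gntdev", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Ask gntdev to track one grant; it hands back an mmap offset. */
	struct ioctl_gntdev_map_grant_ref map;
	memset(&map, 0, sizeof(map));
	map.count = 1;
	map.refs[0].domid = (uint32_t)atoi(argv[1]);
	map.refs[0].ref   = (uint32_t)atoi(argv[2]);
	if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map)) {
		perror("IOCTL_GNTDEV_MAP_GRANT_REF");
		return 1;
	}

	/* The actual grant mapping is performed by gntdev_mmap() above. */
	void *page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, map.index);
	if (page == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("first byte of shared page: %02x\n",
	       ((unsigned char *)page)[0]);
	munmap(page, 4096);

	struct ioctl_gntdev_unmap_grant_ref unmap;
	memset(&unmap, 0, sizeof(unmap));
	unmap.index = map.index;
	unmap.count = 1;
	ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap);

	close(fd);
	return 0;
}
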
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 6c4531816496..9ef54ebc1194 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -447,6 +447,52 @@ unsigned int gnttab_max_grant_frames(void)
447} 447}
448EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); 448EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
449 449
450int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
451 struct page **pages, unsigned int count)
452{
453 int i, ret;
454 pte_t *pte;
455 unsigned long mfn;
456
457 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count);
458 if (ret)
459 return ret;
460
461 for (i = 0; i < count; i++) {
462 /* m2p override only supported for GNTMAP_contains_pte mappings */
463 if (!(map_ops[i].flags & GNTMAP_contains_pte))
464 continue;
465 pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
466 (map_ops[i].host_addr & ~PAGE_MASK));
467 mfn = pte_mfn(*pte);
468 ret = m2p_add_override(mfn, pages[i]);
469 if (ret)
470 return ret;
471 }
472
473 return ret;
474}
475EXPORT_SYMBOL_GPL(gnttab_map_refs);
476
477int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
478 struct page **pages, unsigned int count)
479{
480 int i, ret;
481
482 ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count);
483 if (ret)
484 return ret;
485
486 for (i = 0; i < count; i++) {
487 ret = m2p_remove_override(pages[i]);
488 if (ret)
489 return ret;
490 }
491
492 return ret;
493}
494EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
495
450static int gnttab_map(unsigned int start_idx, unsigned int end_idx) 496static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
451{ 497{
452 struct gnttab_setup_table setup; 498 struct gnttab_setup_table setup;
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
index c01b5ddce529..afbe041f42c5 100644
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
@@ -105,7 +105,7 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
105 const struct pci_device_id *ent) 105 const struct pci_device_id *ent)
106{ 106{
107 int i, ret; 107 int i, ret;
108 long ioaddr, iolen; 108 long ioaddr;
109 long mmio_addr, mmio_len; 109 long mmio_addr, mmio_len;
110 unsigned int max_nr_gframes; 110 unsigned int max_nr_gframes;
111 111
@@ -114,7 +114,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
114 return i; 114 return i;
115 115
116 ioaddr = pci_resource_start(pdev, 0); 116 ioaddr = pci_resource_start(pdev, 0);
117 iolen = pci_resource_len(pdev, 0);
118 117
119 mmio_addr = pci_resource_start(pdev, 1); 118 mmio_addr = pci_resource_start(pdev, 1);
120 mmio_len = pci_resource_len(pdev, 1); 119 mmio_len = pci_resource_len(pdev, 1);
@@ -125,19 +124,13 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
125 goto pci_out; 124 goto pci_out;
126 } 125 }
127 126
128 if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) { 127 ret = pci_request_region(pdev, 1, DRV_NAME);
129 dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n", 128 if (ret < 0)
130 mmio_addr, mmio_len);
131 ret = -EBUSY;
132 goto pci_out; 129 goto pci_out;
133 }
134 130
135 if (request_region(ioaddr, iolen, DRV_NAME) == NULL) { 131 ret = pci_request_region(pdev, 0, DRV_NAME);
136 dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n", 132 if (ret < 0)
137 iolen, ioaddr);
138 ret = -EBUSY;
139 goto mem_out; 133 goto mem_out;
140 }
141 134
142 platform_mmio = mmio_addr; 135 platform_mmio = mmio_addr;
143 platform_mmiolen = mmio_len; 136 platform_mmiolen = mmio_len;
@@ -169,9 +162,9 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
169 return 0; 162 return 0;
170 163
171out: 164out:
172 release_region(ioaddr, iolen); 165 pci_release_region(pdev, 0);
173mem_out: 166mem_out:
174 release_mem_region(mmio_addr, mmio_len); 167 pci_release_region(pdev, 1);
175pci_out: 168pci_out:
176 pci_disable_device(pdev); 169 pci_disable_device(pdev);
177 return ret; 170 return ret;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9ed476906327..d3b28abdd6aa 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -141,13 +141,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
141 return rc; 141 return rc;
142} 142}
143 143
144static inode *ecryptfs_get_inode(struct inode *lower_inode, 144static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
145 struct super_block *sb) 145 struct super_block *sb)
146{ 146{
147 struct inode *inode; 147 struct inode *inode;
148 int rc = 0; 148 int rc = 0;
149 149
150 lower_inode = lower_dentry->d_inode;
151 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) { 150 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
152 rc = -EXDEV; 151 rc = -EXDEV;
153 goto out; 152 goto out;
@@ -202,7 +201,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
202{ 201{
203 struct inode *lower_inode = lower_dentry->d_inode; 202 struct inode *lower_inode = lower_dentry->d_inode;
204 struct inode *inode = ecryptfs_get_inode(lower_inode, sb); 203 struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
205 if (IS_ERR(inode) 204 if (IS_ERR(inode))
206 return PTR_ERR(inode); 205 return PTR_ERR(inode);
207 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) 206 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
208 d_add(dentry, inode); 207 d_add(dentry, inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d06ccc953aa..59c6e4956786 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head)
84 return list_entry(head, struct inode, i_wb_list); 84 return list_entry(head, struct inode, i_wb_list);
85} 85}
86 86
87static void bdi_queue_work(struct backing_dev_info *bdi, 87/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
88 struct wb_writeback_work *work) 88static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
89{ 89{
90 trace_writeback_queue(bdi, work);
91
92 spin_lock_bh(&bdi->wb_lock);
93 list_add_tail(&work->list, &bdi->work_list);
94 if (bdi->wb.task) { 90 if (bdi->wb.task) {
95 wake_up_process(bdi->wb.task); 91 wake_up_process(bdi->wb.task);
96 } else { 92 } else {
@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
98 * The bdi thread isn't there, wake up the forker thread which 94 * The bdi thread isn't there, wake up the forker thread which
99 * will create and run it. 95 * will create and run it.
100 */ 96 */
101 trace_writeback_nothread(bdi, work);
102 wake_up_process(default_backing_dev_info.wb.task); 97 wake_up_process(default_backing_dev_info.wb.task);
103 } 98 }
99}
100
101static void bdi_queue_work(struct backing_dev_info *bdi,
102 struct wb_writeback_work *work)
103{
104 trace_writeback_queue(bdi, work);
105
106 spin_lock_bh(&bdi->wb_lock);
107 list_add_tail(&work->list, &bdi->work_list);
108 if (!bdi->wb.task)
109 trace_writeback_nothread(bdi, work);
110 bdi_wakeup_flusher(bdi);
104 spin_unlock_bh(&bdi->wb_lock); 111 spin_unlock_bh(&bdi->wb_lock);
105} 112}
106 113
107static void 114static void
108__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 115__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
109 bool range_cyclic, bool for_background) 116 bool range_cyclic)
110{ 117{
111 struct wb_writeback_work *work; 118 struct wb_writeback_work *work;
112 119
@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
126 work->sync_mode = WB_SYNC_NONE; 133 work->sync_mode = WB_SYNC_NONE;
127 work->nr_pages = nr_pages; 134 work->nr_pages = nr_pages;
128 work->range_cyclic = range_cyclic; 135 work->range_cyclic = range_cyclic;
129 work->for_background = for_background;
130 136
131 bdi_queue_work(bdi, work); 137 bdi_queue_work(bdi, work);
132} 138}
@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
144 */ 150 */
145void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 151void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
146{ 152{
147 __bdi_start_writeback(bdi, nr_pages, true, false); 153 __bdi_start_writeback(bdi, nr_pages, true);
148} 154}
149 155
150/** 156/**
@@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
152 * @bdi: the backing device to write from 158 * @bdi: the backing device to write from
153 * 159 *
154 * Description: 160 * Description:
155 * This does WB_SYNC_NONE background writeback. The IO is only 161 * This makes sure WB_SYNC_NONE background writeback happens. When
156 * started when this function returns, we make no guarentees on 162 * this function returns, it is only guaranteed that for given BDI
157 * completion. Caller need not hold sb s_umount semaphore. 163 * some IO is happening if we are over background dirty threshold.
164 * Caller need not hold sb s_umount semaphore.
158 */ 165 */
159void bdi_start_background_writeback(struct backing_dev_info *bdi) 166void bdi_start_background_writeback(struct backing_dev_info *bdi)
160{ 167{
161 __bdi_start_writeback(bdi, LONG_MAX, true, true); 168 /*
169 * We just wake up the flusher thread. It will perform background
170 * writeback as soon as there is no other work to do.
171 */
172 trace_writeback_wake_background(bdi);
173 spin_lock_bh(&bdi->wb_lock);
174 bdi_wakeup_flusher(bdi);
175 spin_unlock_bh(&bdi->wb_lock);
162} 176}
163 177
164/* 178/*
@@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
616 }; 630 };
617 unsigned long oldest_jif; 631 unsigned long oldest_jif;
618 long wrote = 0; 632 long wrote = 0;
633 long write_chunk;
619 struct inode *inode; 634 struct inode *inode;
620 635
621 if (wbc.for_kupdate) { 636 if (wbc.for_kupdate) {
@@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
628 wbc.range_end = LLONG_MAX; 643 wbc.range_end = LLONG_MAX;
629 } 644 }
630 645
646 /*
647 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
648 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
649 * here avoids calling into writeback_inodes_wb() more than once.
650 *
651 * The intended call sequence for WB_SYNC_ALL writeback is:
652 *
653 * wb_writeback()
654 * __writeback_inodes_sb() <== called only once
655 * write_cache_pages() <== called once for each inode
656 * (quickly) tag currently dirty pages
657 * (maybe slowly) sync all tagged pages
658 */
659 if (wbc.sync_mode == WB_SYNC_NONE)
660 write_chunk = MAX_WRITEBACK_PAGES;
661 else
662 write_chunk = LONG_MAX;
663
631 wbc.wb_start = jiffies; /* livelock avoidance */ 664 wbc.wb_start = jiffies; /* livelock avoidance */
632 for (;;) { 665 for (;;) {
633 /* 666 /*
@@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
637 break; 670 break;
638 671
639 /* 672 /*
673 * Background writeout and kupdate-style writeback may
674 * run forever. Stop them if there is other work to do
675 * so that e.g. sync can proceed. They'll be restarted
676 * after the other works are all done.
677 */
678 if ((work->for_background || work->for_kupdate) &&
679 !list_empty(&wb->bdi->work_list))
680 break;
681
682 /*
640 * For background writeout, stop when we are below the 683 * For background writeout, stop when we are below the
641 * background dirty threshold 684 * background dirty threshold
642 */ 685 */
@@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
644 break; 687 break;
645 688
646 wbc.more_io = 0; 689 wbc.more_io = 0;
647 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 690 wbc.nr_to_write = write_chunk;
648 wbc.pages_skipped = 0; 691 wbc.pages_skipped = 0;
649 692
650 trace_wbc_writeback_start(&wbc, wb->bdi); 693 trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
654 writeback_inodes_wb(wb, &wbc); 697 writeback_inodes_wb(wb, &wbc);
655 trace_wbc_writeback_written(&wbc, wb->bdi); 698 trace_wbc_writeback_written(&wbc, wb->bdi);
656 699
657 work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 700 work->nr_pages -= write_chunk - wbc.nr_to_write;
658 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 701 wrote += write_chunk - wbc.nr_to_write;
659 702
660 /* 703 /*
661 * If we consumed everything, see if we have more 704 * If we consumed everything, see if we have more
@@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
670 /* 713 /*
671 * Did we write something? Try for more 714 * Did we write something? Try for more
672 */ 715 */
673 if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) 716 if (wbc.nr_to_write < write_chunk)
674 continue; 717 continue;
675 /* 718 /*
676 * Nothing written. Wait for some inode to 719 * Nothing written. Wait for some inode to
@@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void)
718 get_nr_dirty_inodes(); 761 get_nr_dirty_inodes();
719} 762}
720 763
764static long wb_check_background_flush(struct bdi_writeback *wb)
765{
766 if (over_bground_thresh()) {
767
768 struct wb_writeback_work work = {
769 .nr_pages = LONG_MAX,
770 .sync_mode = WB_SYNC_NONE,
771 .for_background = 1,
772 .range_cyclic = 1,
773 };
774
775 return wb_writeback(wb, &work);
776 }
777
778 return 0;
779}
780
721static long wb_check_old_data_flush(struct bdi_writeback *wb) 781static long wb_check_old_data_flush(struct bdi_writeback *wb)
722{ 782{
723 unsigned long expired; 783 unsigned long expired;
@@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
787 * Check for periodic writeback, kupdated() style 847 * Check for periodic writeback, kupdated() style
788 */ 848 */
789 wrote += wb_check_old_data_flush(wb); 849 wrote += wb_check_old_data_flush(wb);
850 wrote += wb_check_background_flush(wb);
790 clear_bit(BDI_writeback_running, &wb->bdi->state); 851 clear_bit(BDI_writeback_running, &wb->bdi->state);
791 852
792 return wrote; 853 return wrote;
@@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
873 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 934 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
874 if (!bdi_has_dirty_io(bdi)) 935 if (!bdi_has_dirty_io(bdi))
875 continue; 936 continue;
876 __bdi_start_writeback(bdi, nr_pages, false, false); 937 __bdi_start_writeback(bdi, nr_pages, false);
877 } 938 }
878 rcu_read_unlock(); 939 rcu_read_unlock();
879} 940}
@@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
1164 * @sb: the superblock 1225 * @sb: the superblock
1165 * 1226 *
1166 * This function writes and waits on any dirty inode belonging to this 1227 * This function writes and waits on any dirty inode belonging to this
1167 * super_block. The number of pages synced is returned. 1228 * super_block.
1168 */ 1229 */
1169void sync_inodes_sb(struct super_block *sb) 1230void sync_inodes_sb(struct super_block *sb)
1170{ 1231{
@@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1242EXPORT_SYMBOL(sync_inode); 1303EXPORT_SYMBOL(sync_inode);
1243 1304
1244/** 1305/**
1245 * sync_inode - write an inode to disk 1306 * sync_inode_metadata - write an inode to disk
1246 * @inode: the inode to sync 1307 * @inode: the inode to sync
1247 * @wait: wait for I/O to complete. 1308 * @wait: wait for I/O to complete.
1248 * 1309 *
1249 * Write an inode to disk and adjust it's dirty state after completion. 1310 * Write an inode to disk and adjust its dirty state after completion.
1250 * 1311 *
1251 * Note: only writes the actual inode, no associated data or other metadata. 1312 * Note: only writes the actual inode, no associated data or other metadata.
1252 */ 1313 */
diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2ea556..d78455a81ec9 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -40,7 +40,7 @@
40 * status of that page is hard. See end_buffer_async_read() for the details. 40 * status of that page is hard. See end_buffer_async_read() for the details.
41 * There is no point in duplicating all that complexity. 41 * There is no point in duplicating all that complexity.
42 */ 42 */
43static void mpage_end_io_read(struct bio *bio, int err) 43static void mpage_end_io(struct bio *bio, int err)
44{ 44{
45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err)
50 50
51 if (--bvec >= bio->bi_io_vec) 51 if (--bvec >= bio->bi_io_vec)
52 prefetchw(&bvec->bv_page->flags); 52 prefetchw(&bvec->bv_page->flags);
53 53 if (bio_data_dir(bio) == READ) {
54 if (uptodate) { 54 if (uptodate) {
55 SetPageUptodate(page); 55 SetPageUptodate(page);
56 } else { 56 } else {
57 ClearPageUptodate(page); 57 ClearPageUptodate(page);
58 SetPageError(page); 58 SetPageError(page);
59 } 59 }
60 unlock_page(page); 60 unlock_page(page);
61 } while (bvec >= bio->bi_io_vec); 61 } else { /* bio_data_dir(bio) == WRITE */
62 bio_put(bio); 62 if (!uptodate) {
63} 63 SetPageError(page);
64 64 if (page->mapping)
65static void mpage_end_io_write(struct bio *bio, int err) 65 set_bit(AS_EIO, &page->mapping->flags);
66{ 66 }
67 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 67 end_page_writeback(page);
68 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
69
70 do {
71 struct page *page = bvec->bv_page;
72
73 if (--bvec >= bio->bi_io_vec)
74 prefetchw(&bvec->bv_page->flags);
75
76 if (!uptodate){
77 SetPageError(page);
78 if (page->mapping)
79 set_bit(AS_EIO, &page->mapping->flags);
80 } 68 }
81 end_page_writeback(page);
82 } while (bvec >= bio->bi_io_vec); 69 } while (bvec >= bio->bi_io_vec);
83 bio_put(bio); 70 bio_put(bio);
84} 71}
85 72
86static struct bio *mpage_bio_submit(int rw, struct bio *bio) 73static struct bio *mpage_bio_submit(int rw, struct bio *bio)
87{ 74{
88 bio->bi_end_io = mpage_end_io_read; 75 bio->bi_end_io = mpage_end_io;
89 if (rw == WRITE)
90 bio->bi_end_io = mpage_end_io_write;
91 submit_bio(rw, bio); 76 submit_bio(rw, bio);
92 return NULL; 77 return NULL;
93} 78}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 95b081bc9e25..64ee240f3c80 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1579,6 +1579,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1579{ 1579{
1580 struct iattr attr; 1580 struct iattr attr;
1581 int error; 1581 int error;
1582 int open_flags = 0;
1582 1583
1583 dfprintk(VFS, "NFS: create(%s/%ld), %s\n", 1584 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1584 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1585 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1586,7 +1587,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1586 attr.ia_mode = mode; 1587 attr.ia_mode = mode;
1587 attr.ia_valid = ATTR_MODE; 1588 attr.ia_valid = ATTR_MODE;
1588 1589
1589 error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL); 1590 if ((nd->flags & LOOKUP_CREATE) != 0)
1591 open_flags = nd->intent.open.flags;
1592
1593 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
1590 if (error != 0) 1594 if (error != 0)
1591 goto out_err; 1595 goto out_err;
1592 return 0; 1596 return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 93f1cdd5d3d7..9d096e82b201 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1151 goto err_task_lock; 1151 goto err_task_lock;
1152 } 1152 }
1153 1153
1154 if (oom_score_adj < task->signal->oom_score_adj && 1154 if (oom_score_adj < task->signal->oom_score_adj_min &&
1155 !capable(CAP_SYS_RESOURCE)) { 1155 !capable(CAP_SYS_RESOURCE)) {
1156 err = -EACCES; 1156 err = -EACCES;
1157 goto err_sighand; 1157 goto err_sighand;
@@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1164 atomic_dec(&task->mm->oom_disable_count); 1164 atomic_dec(&task->mm->oom_disable_count);
1165 } 1165 }
1166 task->signal->oom_score_adj = oom_score_adj; 1166 task->signal->oom_score_adj = oom_score_adj;
1167 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1168 task->signal->oom_score_adj_min = oom_score_adj;
1167 /* 1169 /*
1168 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is 1170 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
1169 * always attainable. 1171 * always attainable.
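
As a small illustration of the interface this hunk hardens: a task tunes its own OOM-killer weight by writing a value in [-1000, 1000] to /proc/self/oom_score_adj, and with the change above, lowering the value below the recorded oom_score_adj_min without CAP_SYS_RESOURCE is refused with EACCES. A hedged sketch, raising (not lowering) the score, which stays unprivileged:

/* Sketch: make this process a more likely OOM victim. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/oom_score_adj", "w");
	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Raising the value is always allowed; writing a value below the
	 * current oom_score_adj_min would need CAP_SYS_RESOURCE. */
	if (fprintf(f, "500\n") < 0)
		perror("fprintf");
	fclose(f);
	return 0;
}
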
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239cfd97e..ed257d141568 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
101#ifdef CONFIG_MEMORY_FAILURE 101#ifdef CONFIG_MEMORY_FAILURE
102 "HardwareCorrupted: %5lu kB\n" 102 "HardwareCorrupted: %5lu kB\n"
103#endif 103#endif
104#ifdef CONFIG_TRANSPARENT_HUGEPAGE
105 "AnonHugePages: %8lu kB\n"
106#endif
104 , 107 ,
105 K(i.totalram), 108 K(i.totalram),
106 K(i.freeram), 109 K(i.freeram),
@@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
128 K(i.freeswap), 131 K(i.freeswap),
129 K(global_page_state(NR_FILE_DIRTY)), 132 K(global_page_state(NR_FILE_DIRTY)),
130 K(global_page_state(NR_WRITEBACK)), 133 K(global_page_state(NR_WRITEBACK)),
131 K(global_page_state(NR_ANON_PAGES)), 134 K(global_page_state(NR_ANON_PAGES)
135#ifdef CONFIG_TRANSPARENT_HUGEPAGE
136 + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
137 HPAGE_PMD_NR
138#endif
139 ),
132 K(global_page_state(NR_FILE_MAPPED)), 140 K(global_page_state(NR_FILE_MAPPED)),
133 K(global_page_state(NR_SHMEM)), 141 K(global_page_state(NR_SHMEM)),
134 K(global_page_state(NR_SLAB_RECLAIMABLE) + 142 K(global_page_state(NR_SLAB_RECLAIMABLE) +
@@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
151#ifdef CONFIG_MEMORY_FAILURE 159#ifdef CONFIG_MEMORY_FAILURE
152 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) 160 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
153#endif 161#endif
162#ifdef CONFIG_TRANSPARENT_HUGEPAGE
163 ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
164 HPAGE_PMD_NR)
165#endif
154 ); 166 );
155 167
156 hugetlb_report_meminfo(m); 168 hugetlb_report_meminfo(m);
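
Once this hunk is in place, the new counter reads like any other /proc/meminfo field; a trivial sketch that prints it:

/* Sketch: print the AnonHugePages line added above. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");
	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "AnonHugePages:", 14)) {
			fputs(line, stdout);
			break;
		}
	}
	fclose(f);
	return 0;
}
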
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b06c674624e6..6d8e6a9e93ab 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page)
116 if (PageHuge(page)) 116 if (PageHuge(page))
117 u |= 1 << KPF_HUGE; 117 u |= 1 << KPF_HUGE;
118 118
119 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
120
121 /* 119 /*
122 * Caveats on high order pages: 120 * Caveats on high order pages: page->_count will only be set
123 * PG_buddy will only be set on the head page; SLUB/SLQB do the same 121 * -1 on the head page; SLUB/SLQB do the same for PG_slab;
124 * for PG_slab; SLOB won't set PG_slab at all on compound pages. 122 * SLOB won't set PG_slab at all on compound pages.
125 */ 123 */
124 if (PageBuddy(page))
125 u |= 1 << KPF_BUDDY;
126
127 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
128
126 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); 129 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
127 u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy);
128 130
129 u |= kpf_copy_bit(k, KPF_ERROR, PG_error); 131 u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
130 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); 132 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c3755bd8dd3e..60b914860f81 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -418,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v)
418 "Anonymous: %8lu kB\n" 418 "Anonymous: %8lu kB\n"
419 "Swap: %8lu kB\n" 419 "Swap: %8lu kB\n"
420 "KernelPageSize: %8lu kB\n" 420 "KernelPageSize: %8lu kB\n"
421 "MMUPageSize: %8lu kB\n", 421 "MMUPageSize: %8lu kB\n"
422 "Locked: %8lu kB\n",
422 (vma->vm_end - vma->vm_start) >> 10, 423 (vma->vm_end - vma->vm_start) >> 10,
423 mss.resident >> 10, 424 mss.resident >> 10,
424 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), 425 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -430,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v)
430 mss.anonymous >> 10, 431 mss.anonymous >> 10,
431 mss.swap >> 10, 432 mss.swap >> 10,
432 vma_kernel_pagesize(vma) >> 10, 433 vma_kernel_pagesize(vma) >> 10,
433 vma_mmu_pagesize(vma) >> 10); 434 vma_mmu_pagesize(vma) >> 10,
435 (vma->vm_flags & VM_LOCKED) ?
436 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
434 437
435 if (m->count < m->size) /* vma is copied successfully */ 438 if (m->count < m->size) /* vma is copied successfully */
436 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; 439 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h
index 6098cae2af8e..ff5c66080c8c 100644
--- a/include/asm-generic/gpio.h
+++ b/include/asm-generic/gpio.h
@@ -147,11 +147,11 @@ extern struct gpio_chip *gpiochip_find(void *data,
147/* Always use the library code for GPIO management calls, 147/* Always use the library code for GPIO management calls,
148 * or when sleeping may be involved. 148 * or when sleeping may be involved.
149 */ 149 */
150extern int __must_check gpio_request(unsigned gpio, const char *label); 150extern int gpio_request(unsigned gpio, const char *label);
151extern void gpio_free(unsigned gpio); 151extern void gpio_free(unsigned gpio);
152 152
153extern int __must_check gpio_direction_input(unsigned gpio); 153extern int gpio_direction_input(unsigned gpio);
154extern int __must_check gpio_direction_output(unsigned gpio, int value); 154extern int gpio_direction_output(unsigned gpio, int value);
155 155
156extern int gpio_set_debounce(unsigned gpio, unsigned debounce); 156extern int gpio_set_debounce(unsigned gpio, unsigned debounce);
157 157
@@ -192,8 +192,8 @@ struct gpio {
192 const char *label; 192 const char *label;
193}; 193};
194 194
195extern int __must_check gpio_request_one(unsigned gpio, unsigned long flags, const char *label); 195extern int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
196extern int __must_check gpio_request_array(struct gpio *array, size_t num); 196extern int gpio_request_array(struct gpio *array, size_t num);
197extern void gpio_free_array(struct gpio *array, size_t num); 197extern void gpio_free_array(struct gpio *array, size_t num);
198 198
199#ifdef CONFIG_GPIO_SYSFS 199#ifdef CONFIG_GPIO_SYSFS
diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h
index 3da9e2742fa0..787abbb6d867 100644
--- a/include/asm-generic/mman-common.h
+++ b/include/asm-generic/mman-common.h
@@ -45,6 +45,9 @@
45#define MADV_MERGEABLE 12 /* KSM may merge identical pages */ 45#define MADV_MERGEABLE 12 /* KSM may merge identical pages */
46#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ 46#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
47 47
48#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */
49#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */
50
48/* compatibility flags */ 51/* compatibility flags */
49#define MAP_FILE 0 52#define MAP_FILE 0
50 53
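
These advice values are consumed by madvise(2). A minimal sketch of how an application would opt a region in to transparent hugepages; it assumes libc headers do not yet expose the new constant, so it falls back to the value 14 defined above:

/* Sketch: advise a 4 MiB anonymous region as worth backing with THP. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* matches the definition added above */
#endif

int main(void)
{
	size_t len = 4UL << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");	/* e.g. THP not built in */
	/* Faults in this range may now be served by PMD-sized mappings. */
	return 0;
}
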
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 6f3c6ae4fe03..f1eddf71dd0c 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -5,67 +5,108 @@
5#ifdef CONFIG_MMU 5#ifdef CONFIG_MMU
6 6
7#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS 7#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
8/* 8extern int ptep_set_access_flags(struct vm_area_struct *vma,
9 * Largely same as above, but only sets the access flags (dirty, 9 unsigned long address, pte_t *ptep,
10 * accessed, and writable). Furthermore, we know it always gets set 10 pte_t entry, int dirty);
11 * to a "more permissive" setting, which allows most architectures 11#endif
12 * to optimize this. We return whether the PTE actually changed, which 12
13 * in turn instructs the caller to do things like update__mmu_cache. 13#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
14 * This used to be done in the caller, but sparc needs minor faults to 14extern int pmdp_set_access_flags(struct vm_area_struct *vma,
15 * force that call on sun4c so we changed this macro slightly 15 unsigned long address, pmd_t *pmdp,
16 */ 16 pmd_t entry, int dirty);
17#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
18({ \
19 int __changed = !pte_same(*(__ptep), __entry); \
20 if (__changed) { \
21 set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \
22 flush_tlb_page(__vma, __address); \
23 } \
24 __changed; \
25})
26#endif 17#endif
27 18
28#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG 19#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
29#define ptep_test_and_clear_young(__vma, __address, __ptep) \ 20static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
30({ \ 21 unsigned long address,
31 pte_t __pte = *(__ptep); \ 22 pte_t *ptep)
32 int r = 1; \ 23{
33 if (!pte_young(__pte)) \ 24 pte_t pte = *ptep;
34 r = 0; \ 25 int r = 1;
35 else \ 26 if (!pte_young(pte))
36 set_pte_at((__vma)->vm_mm, (__address), \ 27 r = 0;
37 (__ptep), pte_mkold(__pte)); \ 28 else
38 r; \ 29 set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte));
39}) 30 return r;
31}
32#endif
33
34#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
35#ifdef CONFIG_TRANSPARENT_HUGEPAGE
36static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
37 unsigned long address,
38 pmd_t *pmdp)
39{
40 pmd_t pmd = *pmdp;
41 int r = 1;
42 if (!pmd_young(pmd))
43 r = 0;
44 else
45 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd));
46 return r;
47}
48#else /* CONFIG_TRANSPARENT_HUGEPAGE */
49static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
50 unsigned long address,
51 pmd_t *pmdp)
52{
53 BUG();
54 return 0;
55}
56#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
40#endif 57#endif
41 58
42#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH 59#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
43#define ptep_clear_flush_young(__vma, __address, __ptep) \ 60int ptep_clear_flush_young(struct vm_area_struct *vma,
44({ \ 61 unsigned long address, pte_t *ptep);
45 int __young; \ 62#endif
46 __young = ptep_test_and_clear_young(__vma, __address, __ptep); \ 63
47 if (__young) \ 64#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
48 flush_tlb_page(__vma, __address); \ 65int pmdp_clear_flush_young(struct vm_area_struct *vma,
49 __young; \ 66 unsigned long address, pmd_t *pmdp);
50})
51#endif 67#endif
52 68
53#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR 69#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
54#define ptep_get_and_clear(__mm, __address, __ptep) \ 70static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
55({ \ 71 unsigned long address,
56 pte_t __pte = *(__ptep); \ 72 pte_t *ptep)
57 pte_clear((__mm), (__address), (__ptep)); \ 73{
58 __pte; \ 74 pte_t pte = *ptep;
75 pte_clear(mm, address, ptep);
76 return pte;
77}
78#endif
79
80#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR
81#ifdef CONFIG_TRANSPARENT_HUGEPAGE
82static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
83 unsigned long address,
84 pmd_t *pmdp)
85{
86 pmd_t pmd = *pmdp;
87 pmd_clear(mm, address, pmdp);
88 return pmd;
 59}) 89}
90#else /* CONFIG_TRANSPARENT_HUGEPAGE */
91static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
92 unsigned long address,
93 pmd_t *pmdp)
94{
95 BUG();
96 return __pmd(0);
97}
98#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
60#endif 99#endif
61 100
62#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL 101#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
63#define ptep_get_and_clear_full(__mm, __address, __ptep, __full) \ 102static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
64({ \ 103 unsigned long address, pte_t *ptep,
65 pte_t __pte; \ 104 int full)
66 __pte = ptep_get_and_clear((__mm), (__address), (__ptep)); \ 105{
67 __pte; \ 106 pte_t pte;
68}) 107 pte = ptep_get_and_clear(mm, address, ptep);
108 return pte;
109}
69#endif 110#endif
70 111
71/* 112/*
@@ -74,20 +115,25 @@
74 * not present, or in the process of an address space destruction. 115 * not present, or in the process of an address space destruction.
75 */ 116 */
76#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL 117#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
77#define pte_clear_not_present_full(__mm, __address, __ptep, __full) \ 118static inline void pte_clear_not_present_full(struct mm_struct *mm,
78do { \ 119 unsigned long address,
79 pte_clear((__mm), (__address), (__ptep)); \ 120 pte_t *ptep,
80} while (0) 121 int full)
122{
123 pte_clear(mm, address, ptep);
124}
81#endif 125#endif
82 126
83#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH 127#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
84#define ptep_clear_flush(__vma, __address, __ptep) \ 128extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
85({ \ 129 unsigned long address,
86 pte_t __pte; \ 130 pte_t *ptep);
87 __pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep); \ 131#endif
88 flush_tlb_page(__vma, __address); \ 132
89 __pte; \ 133#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
90}) 134extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
135 unsigned long address,
136 pmd_t *pmdp);
91#endif 137#endif
92 138
93#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT 139#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
@@ -99,8 +145,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
99} 145}
100#endif 146#endif
101 147
148#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
149#ifdef CONFIG_TRANSPARENT_HUGEPAGE
150static inline void pmdp_set_wrprotect(struct mm_struct *mm,
151 unsigned long address, pmd_t *pmdp)
152{
153 pmd_t old_pmd = *pmdp;
154 set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd));
155}
156#else /* CONFIG_TRANSPARENT_HUGEPAGE */
157static inline void pmdp_set_wrprotect(struct mm_struct *mm,
158 unsigned long address, pmd_t *pmdp)
159{
160 BUG();
161}
162#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
163#endif
164
165#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
166extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
167 unsigned long address,
168 pmd_t *pmdp);
169#endif
170
102#ifndef __HAVE_ARCH_PTE_SAME 171#ifndef __HAVE_ARCH_PTE_SAME
103#define pte_same(A,B) (pte_val(A) == pte_val(B)) 172static inline int pte_same(pte_t pte_a, pte_t pte_b)
173{
174 return pte_val(pte_a) == pte_val(pte_b);
175}
176#endif
177
178#ifndef __HAVE_ARCH_PMD_SAME
179#ifdef CONFIG_TRANSPARENT_HUGEPAGE
180static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
181{
182 return pmd_val(pmd_a) == pmd_val(pmd_b);
183}
184#else /* CONFIG_TRANSPARENT_HUGEPAGE */
185static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
186{
187 BUG();
188 return 0;
189}
190#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
104#endif 191#endif
105 192
106#ifndef __HAVE_ARCH_PAGE_TEST_DIRTY 193#ifndef __HAVE_ARCH_PAGE_TEST_DIRTY
@@ -348,6 +435,24 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
348 unsigned long size); 435 unsigned long size);
349#endif 436#endif
350 437
438#ifndef CONFIG_TRANSPARENT_HUGEPAGE
439static inline int pmd_trans_huge(pmd_t pmd)
440{
441 return 0;
442}
443static inline int pmd_trans_splitting(pmd_t pmd)
444{
445 return 0;
446}
447#ifndef __HAVE_ARCH_PMD_WRITE
448static inline int pmd_write(pmd_t pmd)
449{
450 BUG();
451 return 0;
452}
453#endif /* __HAVE_ARCH_PMD_WRITE */
454#endif
455
351#endif /* !__ASSEMBLY__ */ 456#endif /* !__ASSEMBLY__ */
352 457
353#endif /* _ASM_GENERIC_PGTABLE_H */ 458#endif /* _ASM_GENERIC_PGTABLE_H */
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 5ac51552d908..dfa2ed4c0d26 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -11,6 +11,9 @@
11/* The full zone was compacted */ 11/* The full zone was compacted */
12#define COMPACT_COMPLETE 3 12#define COMPACT_COMPLETE 3
13 13
14#define COMPACT_MODE_DIRECT_RECLAIM 0
15#define COMPACT_MODE_KSWAPD 1
16
14#ifdef CONFIG_COMPACTION 17#ifdef CONFIG_COMPACTION
15extern int sysctl_compact_memory; 18extern int sysctl_compact_memory;
16extern int sysctl_compaction_handler(struct ctl_table *table, int write, 19extern int sysctl_compaction_handler(struct ctl_table *table, int write,
@@ -21,7 +24,12 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
21 24
22extern int fragmentation_index(struct zone *zone, unsigned int order); 25extern int fragmentation_index(struct zone *zone, unsigned int order);
23extern unsigned long try_to_compact_pages(struct zonelist *zonelist, 26extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
24 int order, gfp_t gfp_mask, nodemask_t *mask); 27 int order, gfp_t gfp_mask, nodemask_t *mask,
28 bool sync);
29extern unsigned long compaction_suitable(struct zone *zone, int order);
30extern unsigned long compact_zone_order(struct zone *zone, int order,
31 gfp_t gfp_mask, bool sync,
32 int compact_mode);
25 33
26/* Do not skip compaction more than 64 times */ 34/* Do not skip compaction more than 64 times */
27#define COMPACT_MAX_DEFER_SHIFT 6 35#define COMPACT_MAX_DEFER_SHIFT 6
@@ -54,7 +62,20 @@ static inline bool compaction_deferred(struct zone *zone)
54 62
55#else 63#else
56static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, 64static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
57 int order, gfp_t gfp_mask, nodemask_t *nodemask) 65 int order, gfp_t gfp_mask, nodemask_t *nodemask,
66 bool sync)
67{
68 return COMPACT_CONTINUE;
69}
70
71static inline unsigned long compaction_suitable(struct zone *zone, int order)
72{
73 return COMPACT_SKIPPED;
74}
75
76static inline unsigned long compact_zone_order(struct zone *zone, int order,
77 gfp_t gfp_mask, bool sync,
78 int compact_mode)
58{ 79{
59 return COMPACT_CONTINUE; 80 return COMPACT_CONTINUE;
60} 81}
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 2970022faa63..272496d1fae4 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -193,6 +193,13 @@ struct dm_target {
193 char *error; 193 char *error;
194}; 194};
195 195
196/* Each target can link one of these into the table */
197struct dm_target_callbacks {
198 struct list_head list;
199 int (*congested_fn) (struct dm_target_callbacks *, int);
200 void (*unplug_fn)(struct dm_target_callbacks *);
201};
202
196int dm_register_target(struct target_type *t); 203int dm_register_target(struct target_type *t);
197void dm_unregister_target(struct target_type *t); 204void dm_unregister_target(struct target_type *t);
198 205
@@ -269,6 +276,11 @@ int dm_table_add_target(struct dm_table *t, const char *type,
269 sector_t start, sector_t len, char *params); 276 sector_t start, sector_t len, char *params);
270 277
271/* 278/*
279 * Target_ctr should call this if it needs to add any callbacks.
280 */
281void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb);
282
283/*
272 * Finally call this to make the table ready for use. 284 * Finally call this to make the table ready for use.
273 */ 285 */
274int dm_table_complete(struct dm_table *t); 286int dm_table_complete(struct dm_table *t);
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 49eab360d5d4..78bbf47bbb96 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -44,7 +44,7 @@
44 * Remove a device, destroy any tables. 44 * Remove a device, destroy any tables.
45 * 45 *
46 * DM_DEV_RENAME: 46 * DM_DEV_RENAME:
47 * Rename a device. 47 * Rename a device or set its uuid if none was previously supplied.
48 * 48 *
49 * DM_SUSPEND: 49 * DM_SUSPEND:
50 * This performs both suspend and resume, depending which flag is 50 * This performs both suspend and resume, depending which flag is
@@ -267,9 +267,9 @@ enum {
267#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) 267#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
268 268
269#define DM_VERSION_MAJOR 4 269#define DM_VERSION_MAJOR 4
270#define DM_VERSION_MINOR 18 270#define DM_VERSION_MINOR 19
271#define DM_VERSION_PATCHLEVEL 0 271#define DM_VERSION_PATCHLEVEL 1
272#define DM_VERSION_EXTRA "-ioctl (2010-06-29)" 272#define DM_VERSION_EXTRA "-ioctl (2011-01-07)"
273 273
274/* Status bits */ 274/* Status bits */
275#define DM_READONLY_FLAG (1 << 0) /* In/Out */ 275#define DM_READONLY_FLAG (1 << 0) /* In/Out */
@@ -322,4 +322,10 @@ enum {
322 */ 322 */
323#define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */ 323#define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */
324 324
325/*
326 * If set, rename changes the uuid not the name. Only permitted
327 * if no uuid was previously supplied: an existing uuid cannot be changed.
328 */
329#define DM_UUID_FLAG (1 << 14) /* In */
330
325#endif /* _LINUX_DM_IOCTL_H */ 331#endif /* _LINUX_DM_IOCTL_H */
diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h
index 0c3c3a2110c4..eeace7d3ff15 100644
--- a/include/linux/dm-log-userspace.h
+++ b/include/linux/dm-log-userspace.h
@@ -370,6 +370,16 @@
370#define DM_ULOG_REQUEST_TYPE(request_type) \ 370#define DM_ULOG_REQUEST_TYPE(request_type) \
371 (DM_ULOG_REQUEST_MASK & (request_type)) 371 (DM_ULOG_REQUEST_MASK & (request_type))
372 372
373/*
374 * DM_ULOG_REQUEST_VERSION is incremented when there is a
375 * change to the way information is passed between kernel
376 * and userspace. This could be a structure change of
377 * dm_ulog_request or a change in the way requests are
378 * issued/handled. Changes are outlined here:
379 * version 1: Initial implementation
380 */
381#define DM_ULOG_REQUEST_VERSION 1
382
373struct dm_ulog_request { 383struct dm_ulog_request {
374 /* 384 /*
375 * The local unique identifier (luid) and the universally unique 385 * The local unique identifier (luid) and the universally unique
@@ -383,8 +393,9 @@ struct dm_ulog_request {
383 */ 393 */
384 uint64_t luid; 394 uint64_t luid;
385 char uuid[DM_UUID_LEN]; 395 char uuid[DM_UUID_LEN];
386 char padding[7]; /* Padding because DM_UUID_LEN = 129 */ 396 char padding[3]; /* Padding because DM_UUID_LEN = 129 */
387 397
398 uint32_t version; /* See DM_ULOG_REQUEST_VERSION */
388 int32_t error; /* Used to report back processing errors */ 399 int32_t error; /* Used to report back processing errors */
389 400
390 uint32_t seq; /* Sequence number for request */ 401 uint32_t seq; /* Sequence number for request */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f54adfcbec9c..a3b148a91874 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -34,6 +34,7 @@ struct vm_area_struct;
34#else 34#else
35#define ___GFP_NOTRACK 0 35#define ___GFP_NOTRACK 0
36#endif 36#endif
37#define ___GFP_NO_KSWAPD 0x400000u
37 38
38/* 39/*
39 * GFP bitmasks.. 40 * GFP bitmasks..
@@ -81,13 +82,15 @@ struct vm_area_struct;
81#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ 82#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
82#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ 83#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */
83 84
85#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
86
84/* 87/*
85 * This may seem redundant, but it's a way of annotating false positives vs. 88 * This may seem redundant, but it's a way of annotating false positives vs.
86 * allocations that simply cannot be supported (e.g. page tables). 89 * allocations that simply cannot be supported (e.g. page tables).
87 */ 90 */
88#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) 91#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
89 92
90#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ 93#define __GFP_BITS_SHIFT 23 /* Room for 23 __GFP_FOO bits */
91#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) 94#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
92 95
93/* This equals 0, but use constants in case they ever change */ 96/* This equals 0, but use constants in case they ever change */
@@ -106,6 +109,9 @@ struct vm_area_struct;
106 __GFP_HARDWALL | __GFP_HIGHMEM | \ 109 __GFP_HARDWALL | __GFP_HIGHMEM | \
107 __GFP_MOVABLE) 110 __GFP_MOVABLE)
108#define GFP_IOFS (__GFP_IO | __GFP_FS) 111#define GFP_IOFS (__GFP_IO | __GFP_FS)
112#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
113 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
114 __GFP_NO_KSWAPD)
109 115
110#ifdef CONFIG_NUMA 116#ifdef CONFIG_NUMA
111#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) 117#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
@@ -325,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
325{ 331{
326 return alloc_pages_current(gfp_mask, order); 332 return alloc_pages_current(gfp_mask, order);
327} 333}
328extern struct page *alloc_page_vma(gfp_t gfp_mask, 334extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
329 struct vm_area_struct *vma, unsigned long addr); 335 struct vm_area_struct *vma, unsigned long addr);
330#else 336#else
331#define alloc_pages(gfp_mask, order) \ 337#define alloc_pages(gfp_mask, order) \
332 alloc_pages_node(numa_node_id(), gfp_mask, order) 338 alloc_pages_node(numa_node_id(), gfp_mask, order)
333#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) 339#define alloc_pages_vma(gfp_mask, order, vma, addr) \
340 alloc_pages(gfp_mask, order)
334#endif 341#endif
335#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) 342#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
343#define alloc_page_vma(gfp_mask, vma, addr) \
344 alloc_pages_vma(gfp_mask, 0, vma, addr)
336 345
337extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); 346extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
338extern unsigned long get_zeroed_page(gfp_t gfp_mask); 347extern unsigned long get_zeroed_page(gfp_t gfp_mask);
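
To show how the pieces in this hunk are meant to be combined, a hedged kernel-side sketch (not a call site from the patch itself): a huge page allocation passes the new GFP_TRANSHUGE mask and a PMD-sized order to the now order-aware alloc_pages_vma(), with HPAGE_PMD_ORDER coming from the huge_mm.h header added later in this series.

/* Hedged sketch only: GFP_TRANSHUGE + alloc_pages_vma(..., order, ...). */
#include <linux/gfp.h>
#include <linux/huge_mm.h>
#include <linux/mm.h>

static struct page *alloc_thp_sketch(struct vm_area_struct *vma,
				     unsigned long haddr)
{
	/* __GFP_NO_KSWAPD keeps kswapd out of the way; __GFP_NORETRY and
	 * __GFP_NOWARN make failure cheap so the fault path can fall back
	 * to ordinary 4k pages. */
	return alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, haddr);
}
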
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index f79d67f413e4..4b47ed96f131 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -30,7 +30,7 @@ static inline int gpio_is_valid(int number)
30 return 0; 30 return 0;
31} 31}
32 32
33static inline int __must_check gpio_request(unsigned gpio, const char *label) 33static inline int gpio_request(unsigned gpio, const char *label)
34{ 34{
35 return -ENOSYS; 35 return -ENOSYS;
36} 36}
@@ -62,12 +62,12 @@ static inline void gpio_free_array(struct gpio *array, size_t num)
62 WARN_ON(1); 62 WARN_ON(1);
63} 63}
64 64
65static inline int __must_check gpio_direction_input(unsigned gpio) 65static inline int gpio_direction_input(unsigned gpio)
66{ 66{
67 return -ENOSYS; 67 return -ENOSYS;
68} 68}
69 69
70static inline int __must_check gpio_direction_output(unsigned gpio, int value) 70static inline int gpio_direction_output(unsigned gpio, int value)
71{ 71{
72 return -ENOSYS; 72 return -ENOSYS;
73} 73}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
new file mode 100644
index 000000000000..8e6c8c42bc3c
--- /dev/null
+++ b/include/linux/huge_mm.h
@@ -0,0 +1,179 @@
1#ifndef _LINUX_HUGE_MM_H
2#define _LINUX_HUGE_MM_H
3
4extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
5 struct vm_area_struct *vma,
6 unsigned long address, pmd_t *pmd,
7 unsigned int flags);
8extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
9 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
10 struct vm_area_struct *vma);
11extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
12 unsigned long address, pmd_t *pmd,
13 pmd_t orig_pmd);
14extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm);
15extern struct page *follow_trans_huge_pmd(struct mm_struct *mm,
16 unsigned long addr,
17 pmd_t *pmd,
18 unsigned int flags);
19extern int zap_huge_pmd(struct mmu_gather *tlb,
20 struct vm_area_struct *vma,
21 pmd_t *pmd);
22extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
23 unsigned long addr, unsigned long end,
24 unsigned char *vec);
25extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
26 unsigned long addr, pgprot_t newprot);
27
28enum transparent_hugepage_flag {
29 TRANSPARENT_HUGEPAGE_FLAG,
30 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
31 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
32 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
33 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
34#ifdef CONFIG_DEBUG_VM
35 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
36#endif
37};
38
39enum page_check_address_pmd_flag {
40 PAGE_CHECK_ADDRESS_PMD_FLAG,
41 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG,
42 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG,
43};
44extern pmd_t *page_check_address_pmd(struct page *page,
45 struct mm_struct *mm,
46 unsigned long address,
47 enum page_check_address_pmd_flag flag);
48
49#ifdef CONFIG_TRANSPARENT_HUGEPAGE
50#define HPAGE_PMD_SHIFT HPAGE_SHIFT
51#define HPAGE_PMD_MASK HPAGE_MASK
52#define HPAGE_PMD_SIZE HPAGE_SIZE
53
54#define transparent_hugepage_enabled(__vma) \
55 ((transparent_hugepage_flags & \
56 (1<<TRANSPARENT_HUGEPAGE_FLAG) || \
57 (transparent_hugepage_flags & \
58 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) && \
59 ((__vma)->vm_flags & VM_HUGEPAGE))) && \
60 !((__vma)->vm_flags & VM_NOHUGEPAGE))
61#define transparent_hugepage_defrag(__vma) \
62 ((transparent_hugepage_flags & \
63 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)) || \
64 (transparent_hugepage_flags & \
65 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) && \
66 (__vma)->vm_flags & VM_HUGEPAGE))
67#ifdef CONFIG_DEBUG_VM
68#define transparent_hugepage_debug_cow() \
69 (transparent_hugepage_flags & \
70 (1<<TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG))
71#else /* CONFIG_DEBUG_VM */
72#define transparent_hugepage_debug_cow() 0
73#endif /* CONFIG_DEBUG_VM */
74
75extern unsigned long transparent_hugepage_flags;
76extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
77 pmd_t *dst_pmd, pmd_t *src_pmd,
78 struct vm_area_struct *vma,
79 unsigned long addr, unsigned long end);
80extern int handle_pte_fault(struct mm_struct *mm,
81 struct vm_area_struct *vma, unsigned long address,
82 pte_t *pte, pmd_t *pmd, unsigned int flags);
83extern int split_huge_page(struct page *page);
84extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
85#define split_huge_page_pmd(__mm, __pmd) \
86 do { \
87 pmd_t *____pmd = (__pmd); \
88 if (unlikely(pmd_trans_huge(*____pmd))) \
89 __split_huge_page_pmd(__mm, ____pmd); \
90 } while (0)
91#define wait_split_huge_page(__anon_vma, __pmd) \
92 do { \
93 pmd_t *____pmd = (__pmd); \
94 spin_unlock_wait(&(__anon_vma)->root->lock); \
95 /* \
96 * spin_unlock_wait() is just a loop in C and so the \
97 * CPU can reorder anything around it. \
98 */ \
99 smp_mb(); \
100 BUG_ON(pmd_trans_splitting(*____pmd) || \
101 pmd_trans_huge(*____pmd)); \
102 } while (0)
103#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
104#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
105#if HPAGE_PMD_ORDER > MAX_ORDER
106#error "hugepages can't be allocated by the buddy allocator"
107#endif
108extern int hugepage_madvise(struct vm_area_struct *vma,
109 unsigned long *vm_flags, int advice);
110extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
111 unsigned long start,
112 unsigned long end,
113 long adjust_next);
114static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
115 unsigned long start,
116 unsigned long end,
117 long adjust_next)
118{
119 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
120 return;
121 __vma_adjust_trans_huge(vma, start, end, adjust_next);
122}
123static inline int hpage_nr_pages(struct page *page)
124{
125 if (unlikely(PageTransHuge(page)))
126 return HPAGE_PMD_NR;
127 return 1;
128}
129static inline struct page *compound_trans_head(struct page *page)
130{
131 if (PageTail(page)) {
132 struct page *head;
133 head = page->first_page;
134 smp_rmb();
135 /*
136 * head may be a dangling pointer.
137 * __split_huge_page_refcount clears PageTail before
138 * overwriting first_page, so if PageTail is still
139 * there it means the head pointer isn't dangling.
140 */
141 if (PageTail(page))
142 return head;
143 }
144 return page;
145}
146#else /* CONFIG_TRANSPARENT_HUGEPAGE */
147#define HPAGE_PMD_SHIFT ({ BUG(); 0; })
148#define HPAGE_PMD_MASK ({ BUG(); 0; })
149#define HPAGE_PMD_SIZE ({ BUG(); 0; })
150
151#define hpage_nr_pages(x) 1
152
153#define transparent_hugepage_enabled(__vma) 0
154
155#define transparent_hugepage_flags 0UL
156static inline int split_huge_page(struct page *page)
157{
158 return 0;
159}
160#define split_huge_page_pmd(__mm, __pmd) \
161 do { } while (0)
162#define wait_split_huge_page(__anon_vma, __pmd) \
163 do { } while (0)
164#define compound_trans_head(page) compound_head(page)
165static inline int hugepage_madvise(struct vm_area_struct *vma,
166 unsigned long *vm_flags, int advice)
167{
168 BUG();
169 return 0;
170}
171static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
172 unsigned long start,
173 unsigned long end,
174 long adjust_next)
175{
176}
177#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
178
179#endif /* _LINUX_HUGE_MM_H */
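For reference, a minimal sketch (not part of the patch) of how a caller is expected to use the split_huge_page_pmd() wrapper above before doing pte-level work; walk_one_pmd() is a hypothetical helper name:

#include <linux/mm.h>

/* Hypothetical helper: force a huge pmd back to regular ptes before
 * touching pte-level entries under it. */
static void walk_one_pmd(struct mm_struct *mm, pmd_t *pmd)
{
	/* No-op unless *pmd is a transparent huge pmd. */
	split_huge_page_pmd(mm, pmd);

	/* pte_offset_map()/pte_offset_map_lock() are safe on this pmd now. */
}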
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 979c68cc7458..6a64c6fa81af 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -57,7 +57,7 @@ struct irq_desc {
57#endif 57#endif
58 58
59 struct timer_rand_state *timer_rand_state; 59 struct timer_rand_state *timer_rand_state;
60 unsigned int *kstat_irqs; 60 unsigned int __percpu *kstat_irqs;
61 irq_flow_handler_t handle_irq; 61 irq_flow_handler_t handle_irq;
62 struct irqaction *action; /* IRQ action list */ 62 struct irqaction *action; /* IRQ action list */
63 unsigned int status; /* IRQ status */ 63 unsigned int status; /* IRQ status */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 57dac7022b63..5a9d9059520b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -600,6 +600,13 @@ struct sysinfo {
600#define NUMA_BUILD 0 600#define NUMA_BUILD 0
601#endif 601#endif
602 602
603/* This helps us avoid #ifdef CONFIG_COMPACTION */
604#ifdef CONFIG_COMPACTION
605#define COMPACTION_BUILD 1
606#else
607#define COMPACTION_BUILD 0
608#endif
609
603/* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ 610/* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
604#ifdef CONFIG_FTRACE_MCOUNT_RECORD 611#ifdef CONFIG_FTRACE_MCOUNT_RECORD
605# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD 612# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 44e83ba12b5b..0cce2db580c3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -46,16 +46,14 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
46extern unsigned long long nr_context_switches(void); 46extern unsigned long long nr_context_switches(void);
47 47
48#ifndef CONFIG_GENERIC_HARDIRQS 48#ifndef CONFIG_GENERIC_HARDIRQS
49#define kstat_irqs_this_cpu(irq) \
50 (this_cpu_read(kstat.irqs[irq])
51 49
52struct irq_desc; 50struct irq_desc;
53 51
54static inline void kstat_incr_irqs_this_cpu(unsigned int irq, 52static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
55 struct irq_desc *desc) 53 struct irq_desc *desc)
56{ 54{
57 kstat_this_cpu.irqs[irq]++; 55 __this_cpu_inc(kstat.irqs[irq]);
58 kstat_this_cpu.irqs_sum++; 56 __this_cpu_inc(kstat.irqs_sum);
59} 57}
60 58
61static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 59static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
@@ -65,17 +63,18 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
65#else 63#else
66#include <linux/irq.h> 64#include <linux/irq.h>
67extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); 65extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
68#define kstat_irqs_this_cpu(DESC) \ 66
69 ((DESC)->kstat_irqs[smp_processor_id()]) 67#define kstat_incr_irqs_this_cpu(irqno, DESC) \
70#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\ 68do { \
71 ((DESC)->kstat_irqs[smp_processor_id()]++);\ 69 __this_cpu_inc(*(DESC)->kstat_irqs); \
72 kstat_this_cpu.irqs_sum++; } while (0) 70 __this_cpu_inc(kstat.irqs_sum); \
71} while (0)
73 72
74#endif 73#endif
75 74
76static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) 75static inline void kstat_incr_softirqs_this_cpu(unsigned int irq)
77{ 76{
78 kstat_this_cpu.softirqs[irq]++; 77 __this_cpu_inc(kstat.softirqs[irq]);
79} 78}
80 79
81static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) 80static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu)
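With kstat_irqs now a per-cpu pointer, flow handlers bump the counters through the macro rather than indexing an array by CPU id; a rough, hypothetical handler fragment (the real handlers live in kernel/irq/):

#include <linux/irq.h>
#include <linux/kernel_stat.h>

/* Hypothetical flow-handler fragment. */
static void example_handle_irq(unsigned int irq, struct irq_desc *desc)
{
	kstat_incr_irqs_this_cpu(irq, desc);	/* __this_cpu_inc() underneath */

	/* ... ack the chip, run desc->action handlers, unmask ... */
}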
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
new file mode 100644
index 000000000000..6b394f0b5148
--- /dev/null
+++ b/include/linux/khugepaged.h
@@ -0,0 +1,67 @@
1#ifndef _LINUX_KHUGEPAGED_H
2#define _LINUX_KHUGEPAGED_H
3
4#include <linux/sched.h> /* MMF_VM_HUGEPAGE */
5
6#ifdef CONFIG_TRANSPARENT_HUGEPAGE
7extern int __khugepaged_enter(struct mm_struct *mm);
8extern void __khugepaged_exit(struct mm_struct *mm);
9extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma);
10
11#define khugepaged_enabled() \
12 (transparent_hugepage_flags & \
13 ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \
14 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
15#define khugepaged_always() \
16 (transparent_hugepage_flags & \
17 (1<<TRANSPARENT_HUGEPAGE_FLAG))
18#define khugepaged_req_madv() \
19 (transparent_hugepage_flags & \
20 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
21#define khugepaged_defrag() \
22 (transparent_hugepage_flags & \
23 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
24
25static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
26{
27 if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
28 return __khugepaged_enter(mm);
29 return 0;
30}
31
32static inline void khugepaged_exit(struct mm_struct *mm)
33{
34 if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
35 __khugepaged_exit(mm);
36}
37
38static inline int khugepaged_enter(struct vm_area_struct *vma)
39{
40 if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
41 if ((khugepaged_always() ||
42 (khugepaged_req_madv() &&
43 vma->vm_flags & VM_HUGEPAGE)) &&
44 !(vma->vm_flags & VM_NOHUGEPAGE))
45 if (__khugepaged_enter(vma->vm_mm))
46 return -ENOMEM;
47 return 0;
48}
49#else /* CONFIG_TRANSPARENT_HUGEPAGE */
50static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
51{
52 return 0;
53}
54static inline void khugepaged_exit(struct mm_struct *mm)
55{
56}
57static inline int khugepaged_enter(struct vm_area_struct *vma)
58{
59 return 0;
60}
61static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
62{
63 return 0;
64}
65#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
66
67#endif /* _LINUX_KHUGEPAGED_H */
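From user space, the corresponding knobs are MADV_HUGEPAGE and MADV_NOHUGEPAGE; a small stand-alone sketch follows (the fallback values match the common asm-generic numbering added by this series and may differ on some architectures):

#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE	14	/* asm-generic value; arch-specific on a few ports */
#endif
#ifndef MADV_NOHUGEPAGE
#define MADV_NOHUGEPAGE	15
#endif

int main(void)
{
	size_t len = 16ul << 20;		/* 16 MiB of anonymous memory */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (madvise(p, len, MADV_HUGEPAGE))	/* sets VM_HUGEPAGE on the vma */
		perror("madvise(MADV_HUGEPAGE)");
	/* ... touch the memory; khugepaged may later collapse it ... */
	munmap(p, len);
	return 0;
}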
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 159a0762aeaf..6a576f989437 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -25,6 +25,11 @@ struct page_cgroup;
25struct page; 25struct page;
26struct mm_struct; 26struct mm_struct;
27 27
28/* Stats that can be updated by the kernel. */
29enum mem_cgroup_page_stat_item {
30 MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
31};
32
28extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 33extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
29 struct list_head *dst, 34 struct list_head *dst,
30 unsigned long *scanned, int order, 35 unsigned long *scanned, int order,
@@ -93,7 +98,7 @@ extern int
93mem_cgroup_prepare_migration(struct page *page, 98mem_cgroup_prepare_migration(struct page *page,
94 struct page *newpage, struct mem_cgroup **ptr); 99 struct page *newpage, struct mem_cgroup **ptr);
95extern void mem_cgroup_end_migration(struct mem_cgroup *mem, 100extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
96 struct page *oldpage, struct page *newpage); 101 struct page *oldpage, struct page *newpage, bool migration_ok);
97 102
98/* 103/*
99 * For memory reclaim. 104 * For memory reclaim.
@@ -121,7 +126,22 @@ static inline bool mem_cgroup_disabled(void)
121 return false; 126 return false;
122} 127}
123 128
124void mem_cgroup_update_file_mapped(struct page *page, int val); 129void mem_cgroup_update_page_stat(struct page *page,
130 enum mem_cgroup_page_stat_item idx,
131 int val);
132
133static inline void mem_cgroup_inc_page_stat(struct page *page,
134 enum mem_cgroup_page_stat_item idx)
135{
136 mem_cgroup_update_page_stat(page, idx, 1);
137}
138
139static inline void mem_cgroup_dec_page_stat(struct page *page,
140 enum mem_cgroup_page_stat_item idx)
141{
142 mem_cgroup_update_page_stat(page, idx, -1);
143}
144
125unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 145unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
126 gfp_t gfp_mask); 146 gfp_t gfp_mask);
127u64 mem_cgroup_get_limit(struct mem_cgroup *mem); 147u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
@@ -231,8 +251,7 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
231} 251}
232 252
233static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, 253static inline void mem_cgroup_end_migration(struct mem_cgroup *mem,
234 struct page *oldpage, 254 struct page *oldpage, struct page *newpage, bool migration_ok)
235 struct page *newpage)
236{ 255{
237} 256}
238 257
@@ -293,8 +312,13 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
293{ 312{
294} 313}
295 314
296static inline void mem_cgroup_update_file_mapped(struct page *page, 315static inline void mem_cgroup_inc_page_stat(struct page *page,
297 int val) 316 enum mem_cgroup_page_stat_item idx)
317{
318}
319
320static inline void mem_cgroup_dec_page_stat(struct page *page,
321 enum mem_cgroup_page_stat_item idx)
298{ 322{
299} 323}
300 324
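Callers of the removed mem_cgroup_update_file_mapped() are expected to switch to the inc/dec wrappers; a hypothetical rmap-side fragment (example_account_file_mapped() is illustrative only):

#include <linux/mm.h>
#include <linux/memcontrol.h>

/* Hypothetical fragment: account a file page being mapped or unmapped. */
static void example_account_file_mapped(struct page *page, bool mapped)
{
	if (mapped)
		mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
	else
		mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
}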
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 31c237a00c48..24376fe7ee68 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -13,12 +13,16 @@ struct mem_section;
13#ifdef CONFIG_MEMORY_HOTPLUG 13#ifdef CONFIG_MEMORY_HOTPLUG
14 14
15/* 15/*
16 * Types for free bootmem. 16 * Types for free bootmem stored in page->lru.next. These have to be in
17 * The normal smallest mapcount is -1. Here is smaller value than it. 17 * some random range in unsigned long space for debugging purposes.
18 */ 18 */
19#define SECTION_INFO (-1 - 1) 19enum {
20#define MIX_SECTION_INFO (-1 - 2) 20 MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
21#define NODE_INFO (-1 - 3) 21 SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
22 MIX_SECTION_INFO,
23 NODE_INFO,
24 MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
25};
22 26
23/* 27/*
24 * pgdat resizing functions 28 * pgdat resizing functions
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 085527fb8261..e39aeecfe9a2 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -13,9 +13,11 @@ extern void putback_lru_pages(struct list_head *l);
13extern int migrate_page(struct address_space *, 13extern int migrate_page(struct address_space *,
14 struct page *, struct page *); 14 struct page *, struct page *);
15extern int migrate_pages(struct list_head *l, new_page_t x, 15extern int migrate_pages(struct list_head *l, new_page_t x,
16 unsigned long private, int offlining); 16 unsigned long private, bool offlining,
17 bool sync);
17extern int migrate_huge_pages(struct list_head *l, new_page_t x, 18extern int migrate_huge_pages(struct list_head *l, new_page_t x,
18 unsigned long private, int offlining); 19 unsigned long private, bool offlining,
20 bool sync);
19 21
20extern int fail_migrate_page(struct address_space *, 22extern int fail_migrate_page(struct address_space *,
21 struct page *, struct page *); 23 struct page *, struct page *);
@@ -33,9 +35,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
33 35
34static inline void putback_lru_pages(struct list_head *l) {} 36static inline void putback_lru_pages(struct list_head *l) {}
35static inline int migrate_pages(struct list_head *l, new_page_t x, 37static inline int migrate_pages(struct list_head *l, new_page_t x,
36 unsigned long private, int offlining) { return -ENOSYS; } 38 unsigned long private, bool offlining,
39 bool sync) { return -ENOSYS; }
37static inline int migrate_huge_pages(struct list_head *l, new_page_t x, 40static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
38 unsigned long private, int offlining) { return -ENOSYS; } 41 unsigned long private, bool offlining,
42 bool sync) { return -ENOSYS; }
39 43
40static inline int migrate_prep(void) { return -ENOSYS; } 44static inline int migrate_prep(void) { return -ENOSYS; }
41static inline int migrate_prep_local(void) { return -ENOSYS; } 45static inline int migrate_prep_local(void) { return -ENOSYS; }
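Existing migrate_pages() callers gain two explicit booleans; a hedged sketch of an updated call site (get_new_page and the surrounding names are placeholders):

#include <linux/migrate.h>

/* Hypothetical call site: not offlining memory, asynchronous migration. */
static int example_migrate_list(struct list_head *pagelist,
				new_page_t get_new_page, unsigned long private)
{
	return migrate_pages(pagelist, get_new_page, private,
			     false,	/* offlining */
			     false);	/* sync */
}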
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 721f451c3029..956a35532f47 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -14,6 +14,7 @@
14#include <linux/mm_types.h> 14#include <linux/mm_types.h>
15#include <linux/range.h> 15#include <linux/range.h>
16#include <linux/pfn.h> 16#include <linux/pfn.h>
17#include <linux/bit_spinlock.h>
17 18
18struct mempolicy; 19struct mempolicy;
19struct anon_vma; 20struct anon_vma;
@@ -82,6 +83,7 @@ extern unsigned int kobjsize(const void *objp);
82#define VM_GROWSUP 0x00000200 83#define VM_GROWSUP 0x00000200
83#else 84#else
84#define VM_GROWSUP 0x00000000 85#define VM_GROWSUP 0x00000000
86#define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */
85#endif 87#endif
86#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ 88#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
87#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ 89#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
@@ -101,7 +103,11 @@ extern unsigned int kobjsize(const void *objp);
101#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ 103#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
102#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ 104#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
103#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ 105#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
106#ifndef CONFIG_TRANSPARENT_HUGEPAGE
104#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ 107#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */
108#else
109#define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */
110#endif
105#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ 111#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
106#define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ 112#define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */
107 113
@@ -242,6 +248,7 @@ struct inode;
242 * files which need it (119 of them) 248 * files which need it (119 of them)
243 */ 249 */
244#include <linux/page-flags.h> 250#include <linux/page-flags.h>
251#include <linux/huge_mm.h>
245 252
246/* 253/*
247 * Methods to modify the page usage count. 254 * Methods to modify the page usage count.
@@ -305,6 +312,39 @@ static inline int is_vmalloc_or_module_addr(const void *x)
305} 312}
306#endif 313#endif
307 314
315static inline void compound_lock(struct page *page)
316{
317#ifdef CONFIG_TRANSPARENT_HUGEPAGE
318 bit_spin_lock(PG_compound_lock, &page->flags);
319#endif
320}
321
322static inline void compound_unlock(struct page *page)
323{
324#ifdef CONFIG_TRANSPARENT_HUGEPAGE
325 bit_spin_unlock(PG_compound_lock, &page->flags);
326#endif
327}
328
329static inline unsigned long compound_lock_irqsave(struct page *page)
330{
331 unsigned long uninitialized_var(flags);
332#ifdef CONFIG_TRANSPARENT_HUGEPAGE
333 local_irq_save(flags);
334 compound_lock(page);
335#endif
336 return flags;
337}
338
339static inline void compound_unlock_irqrestore(struct page *page,
340 unsigned long flags)
341{
342#ifdef CONFIG_TRANSPARENT_HUGEPAGE
343 compound_unlock(page);
344 local_irq_restore(flags);
345#endif
346}
347
308static inline struct page *compound_head(struct page *page) 348static inline struct page *compound_head(struct page *page)
309{ 349{
310 if (unlikely(PageTail(page))) 350 if (unlikely(PageTail(page)))
@@ -319,9 +359,29 @@ static inline int page_count(struct page *page)
319 359
320static inline void get_page(struct page *page) 360static inline void get_page(struct page *page)
321{ 361{
322 page = compound_head(page); 362 /*
323 VM_BUG_ON(atomic_read(&page->_count) == 0); 363 * Getting a normal page or the head of a compound page
364 * requires already having an elevated page->_count. Only if
365 * we're getting a tail page, the elevated page->_count is
366 * required only in the head page, so for tail pages the
367 * bugcheck only verifies that the page->_count isn't
368 * negative.
369 */
370 VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page));
324 atomic_inc(&page->_count); 371 atomic_inc(&page->_count);
372 /*
373 * Getting a tail page will elevate both the head and tail
374 * page->_count(s).
375 */
376 if (unlikely(PageTail(page))) {
377 /*
378 * This is safe only because
379 * __split_huge_page_refcount can't run under
380 * get_page().
381 */
382 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
383 atomic_inc(&page->first_page->_count);
384 }
325} 385}
326 386
327static inline struct page *virt_to_head_page(const void *x) 387static inline struct page *virt_to_head_page(const void *x)
@@ -339,6 +399,27 @@ static inline void init_page_count(struct page *page)
339 atomic_set(&page->_count, 1); 399 atomic_set(&page->_count, 1);
340} 400}
341 401
402/*
403 * PageBuddy() indicates that the page is free and in the buddy system
404 * (see mm/page_alloc.c).
405 */
406static inline int PageBuddy(struct page *page)
407{
408 return atomic_read(&page->_mapcount) == -2;
409}
410
411static inline void __SetPageBuddy(struct page *page)
412{
413 VM_BUG_ON(atomic_read(&page->_mapcount) != -1);
414 atomic_set(&page->_mapcount, -2);
415}
416
417static inline void __ClearPageBuddy(struct page *page)
418{
419 VM_BUG_ON(!PageBuddy(page));
420 atomic_set(&page->_mapcount, -1);
421}
422
342void put_page(struct page *page); 423void put_page(struct page *page);
343void put_pages_list(struct list_head *pages); 424void put_pages_list(struct list_head *pages);
344 425
@@ -370,12 +451,39 @@ static inline int compound_order(struct page *page)
370 return (unsigned long)page[1].lru.prev; 451 return (unsigned long)page[1].lru.prev;
371} 452}
372 453
454static inline int compound_trans_order(struct page *page)
455{
456 int order;
457 unsigned long flags;
458
459 if (!PageHead(page))
460 return 0;
461
462 flags = compound_lock_irqsave(page);
463 order = compound_order(page);
464 compound_unlock_irqrestore(page, flags);
465 return order;
466}
467
373static inline void set_compound_order(struct page *page, unsigned long order) 468static inline void set_compound_order(struct page *page, unsigned long order)
374{ 469{
375 page[1].lru.prev = (void *)order; 470 page[1].lru.prev = (void *)order;
376} 471}
377 472
378/* 473/*
474 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
475 * servicing faults for write access. In the normal case, do always want
476 * pte_mkwrite. But get_user_pages can cause write faults for mappings
477 * that do not have writing enabled, when used by access_process_vm.
478 */
479static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
480{
481 if (likely(vma->vm_flags & VM_WRITE))
482 pte = pte_mkwrite(pte);
483 return pte;
484}
485
486/*
379 * Multiple processes may "see" the same page. E.g. for untouched 487 * Multiple processes may "see" the same page. E.g. for untouched
380 * mappings of /dev/null, all processes see the same page full of 488 * mappings of /dev/null, all processes see the same page full of
381 * zeroes, and text pages of executables and shared libraries have 489 * zeroes, and text pages of executables and shared libraries have
@@ -657,7 +765,7 @@ static inline struct address_space *page_mapping(struct page *page)
657 VM_BUG_ON(PageSlab(page)); 765 VM_BUG_ON(PageSlab(page));
658 if (unlikely(PageSwapCache(page))) 766 if (unlikely(PageSwapCache(page)))
659 mapping = &swapper_space; 767 mapping = &swapper_space;
660 else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON)) 768 else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
661 mapping = NULL; 769 mapping = NULL;
662 return mapping; 770 return mapping;
663} 771}
@@ -1064,7 +1172,8 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
1064int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); 1172int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
1065#endif 1173#endif
1066 1174
1067int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); 1175int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
1176 pmd_t *pmd, unsigned long address);
1068int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); 1177int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
1069 1178
1070/* 1179/*
@@ -1133,16 +1242,18 @@ static inline void pgtable_page_dtor(struct page *page)
1133 pte_unmap(pte); \ 1242 pte_unmap(pte); \
1134} while (0) 1243} while (0)
1135 1244
1136#define pte_alloc_map(mm, pmd, address) \ 1245#define pte_alloc_map(mm, vma, pmd, address) \
1137 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ 1246 ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \
1138 NULL: pte_offset_map(pmd, address)) 1247 pmd, address))? \
1248 NULL: pte_offset_map(pmd, address))
1139 1249
1140#define pte_alloc_map_lock(mm, pmd, address, ptlp) \ 1250#define pte_alloc_map_lock(mm, pmd, address, ptlp) \
1141 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ 1251 ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \
1252 pmd, address))? \
1142 NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) 1253 NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
1143 1254
1144#define pte_alloc_kernel(pmd, address) \ 1255#define pte_alloc_kernel(pmd, address) \
1145 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ 1256 ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
1146 NULL: pte_offset_kernel(pmd, address)) 1257 NULL: pte_offset_kernel(pmd, address))
1147 1258
1148extern void free_area_init(unsigned long * zones_size); 1259extern void free_area_init(unsigned long * zones_size);
@@ -1415,6 +1526,8 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
1415#define FOLL_GET 0x04 /* do get_page on page */ 1526#define FOLL_GET 0x04 /* do get_page on page */
1416#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ 1527#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */
1417#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ 1528#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
1529#define FOLL_MLOCK 0x40 /* mark page as mlocked */
1530#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
1418 1531
1419typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, 1532typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
1420 void *data); 1533 void *data);
@@ -1518,5 +1631,14 @@ static inline int is_hwpoison_address(unsigned long addr)
1518 1631
1519extern void dump_page(struct page *page); 1632extern void dump_page(struct page *page);
1520 1633
1634#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
1635extern void clear_huge_page(struct page *page,
1636 unsigned long addr,
1637 unsigned int pages_per_huge_page);
1638extern void copy_user_huge_page(struct page *dst, struct page *src,
1639 unsigned long addr, struct vm_area_struct *vma,
1640 unsigned int pages_per_huge_page);
1641#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
1642
1521#endif /* __KERNEL__ */ 1643#endif /* __KERNEL__ */
1522#endif /* _LINUX_MM_H */ 1644#endif /* _LINUX_MM_H */
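maybe_mkwrite() is now available to any fault path that builds a pte; a minimal sketch under that assumption (example_mk_pte() is hypothetical):

#include <linux/mm.h>
#include <asm/pgtable.h>

/* Hypothetical fragment: build a pte for a freshly faulted-in page. */
static pte_t example_mk_pte(struct page *page, struct vm_area_struct *vma,
			    bool write_fault)
{
	pte_t entry = mk_pte(page, vma->vm_page_prot);

	if (write_fault)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	return entry;
}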
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8835b877b8db..8f7d24712dc1 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -1,6 +1,8 @@
1#ifndef LINUX_MM_INLINE_H 1#ifndef LINUX_MM_INLINE_H
2#define LINUX_MM_INLINE_H 2#define LINUX_MM_INLINE_H
3 3
4#include <linux/huge_mm.h>
5
4/** 6/**
5 * page_is_file_cache - should the page be on a file LRU or anon LRU? 7 * page_is_file_cache - should the page be on a file LRU or anon LRU?
6 * @page: the page to test 8 * @page: the page to test
@@ -20,18 +22,25 @@ static inline int page_is_file_cache(struct page *page)
20} 22}
21 23
22static inline void 24static inline void
23add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) 25__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l,
26 struct list_head *head)
24{ 27{
25 list_add(&page->lru, &zone->lru[l].list); 28 list_add(&page->lru, head);
26 __inc_zone_state(zone, NR_LRU_BASE + l); 29 __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page));
27 mem_cgroup_add_lru_list(page, l); 30 mem_cgroup_add_lru_list(page, l);
28} 31}
29 32
30static inline void 33static inline void
34add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
35{
36 __add_page_to_lru_list(zone, page, l, &zone->lru[l].list);
37}
38
39static inline void
31del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) 40del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
32{ 41{
33 list_del(&page->lru); 42 list_del(&page->lru);
34 __dec_zone_state(zone, NR_LRU_BASE + l); 43 __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
35 mem_cgroup_del_lru_list(page, l); 44 mem_cgroup_del_lru_list(page, l);
36} 45}
37 46
@@ -66,7 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page)
66 l += LRU_ACTIVE; 75 l += LRU_ACTIVE;
67 } 76 }
68 } 77 }
69 __dec_zone_state(zone, NR_LRU_BASE + l); 78 __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
70 mem_cgroup_del_lru_list(page, l); 79 mem_cgroup_del_lru_list(page, l);
71} 80}
72 81
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bb7288a782fd..26bc4e2cd275 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -310,6 +310,9 @@ struct mm_struct {
310#ifdef CONFIG_MMU_NOTIFIER 310#ifdef CONFIG_MMU_NOTIFIER
311 struct mmu_notifier_mm *mmu_notifier_mm; 311 struct mmu_notifier_mm *mmu_notifier_mm;
312#endif 312#endif
313#ifdef CONFIG_TRANSPARENT_HUGEPAGE
314 pgtable_t pmd_huge_pte; /* protected by page_table_lock */
315#endif
313 /* How many tasks sharing this mm are OOM_DISABLE */ 316 /* How many tasks sharing this mm are OOM_DISABLE */
314 atomic_t oom_disable_count; 317 atomic_t oom_disable_count;
315}; 318};
diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h
index bf173502d744..38d393092812 100644
--- a/include/linux/mmc/sh_mmcif.h
+++ b/include/linux/mmc/sh_mmcif.h
@@ -94,12 +94,12 @@ struct sh_mmcif_plat_data {
94 94
95static inline u32 sh_mmcif_readl(void __iomem *addr, int reg) 95static inline u32 sh_mmcif_readl(void __iomem *addr, int reg)
96{ 96{
97 return readl(addr + reg); 97 return __raw_readl(addr + reg);
98} 98}
99 99
100static inline void sh_mmcif_writel(void __iomem *addr, int reg, u32 val) 100static inline void sh_mmcif_writel(void __iomem *addr, int reg, u32 val)
101{ 101{
102 writel(val, addr + reg); 102 __raw_writel(val, addr + reg);
103} 103}
104 104
105#define SH_MMCIF_BBS 512 /* boot block size */ 105#define SH_MMCIF_BBS 512 /* boot block size */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 43dcfbdc39de..cc2e7dfea9d7 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -62,6 +62,16 @@ struct mmu_notifier_ops {
62 unsigned long address); 62 unsigned long address);
63 63
64 /* 64 /*
65 * test_young is called to check the young/accessed bitflag in
66 * the secondary pte. This is used to know if the page is
67 * frequently used without actually clearing the flag or tearing
68 * down the secondary mapping on the page.
69 */
70 int (*test_young)(struct mmu_notifier *mn,
71 struct mm_struct *mm,
72 unsigned long address);
73
74 /*
65 * change_pte is called in cases that pte mapping to page is changed: 75 * change_pte is called in cases that pte mapping to page is changed:
66 * for example, when ksm remaps pte to point to a new shared page. 76 * for example, when ksm remaps pte to point to a new shared page.
67 */ 77 */
@@ -163,6 +173,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
163extern void __mmu_notifier_release(struct mm_struct *mm); 173extern void __mmu_notifier_release(struct mm_struct *mm);
164extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, 174extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
165 unsigned long address); 175 unsigned long address);
176extern int __mmu_notifier_test_young(struct mm_struct *mm,
177 unsigned long address);
166extern void __mmu_notifier_change_pte(struct mm_struct *mm, 178extern void __mmu_notifier_change_pte(struct mm_struct *mm,
167 unsigned long address, pte_t pte); 179 unsigned long address, pte_t pte);
168extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, 180extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -186,6 +198,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
186 return 0; 198 return 0;
187} 199}
188 200
201static inline int mmu_notifier_test_young(struct mm_struct *mm,
202 unsigned long address)
203{
204 if (mm_has_notifiers(mm))
205 return __mmu_notifier_test_young(mm, address);
206 return 0;
207}
208
189static inline void mmu_notifier_change_pte(struct mm_struct *mm, 209static inline void mmu_notifier_change_pte(struct mm_struct *mm,
190 unsigned long address, pte_t pte) 210 unsigned long address, pte_t pte)
191{ 211{
@@ -243,6 +263,32 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
243 __pte; \ 263 __pte; \
244}) 264})
245 265
266#define pmdp_clear_flush_notify(__vma, __address, __pmdp) \
267({ \
268 pmd_t __pmd; \
269 struct vm_area_struct *___vma = __vma; \
270 unsigned long ___address = __address; \
271 VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \
272 mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \
273 (__address)+HPAGE_PMD_SIZE);\
274 __pmd = pmdp_clear_flush(___vma, ___address, __pmdp); \
275 mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \
276 (__address)+HPAGE_PMD_SIZE); \
277 __pmd; \
278})
279
280#define pmdp_splitting_flush_notify(__vma, __address, __pmdp) \
281({ \
282 struct vm_area_struct *___vma = __vma; \
283 unsigned long ___address = __address; \
284 VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \
285 mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \
286 (__address)+HPAGE_PMD_SIZE);\
287 pmdp_splitting_flush(___vma, ___address, __pmdp); \
288 mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \
289 (__address)+HPAGE_PMD_SIZE); \
290})
291
246#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ 292#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
247({ \ 293({ \
248 int __young; \ 294 int __young; \
@@ -254,6 +300,17 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
254 __young; \ 300 __young; \
255}) 301})
256 302
303#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \
304({ \
305 int __young; \
306 struct vm_area_struct *___vma = __vma; \
307 unsigned long ___address = __address; \
308 __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \
309 __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \
310 ___address); \
311 __young; \
312})
313
257#define set_pte_at_notify(__mm, __address, __ptep, __pte) \ 314#define set_pte_at_notify(__mm, __address, __ptep, __pte) \
258({ \ 315({ \
259 struct mm_struct *___mm = __mm; \ 316 struct mm_struct *___mm = __mm; \
@@ -276,6 +333,12 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
276 return 0; 333 return 0;
277} 334}
278 335
336static inline int mmu_notifier_test_young(struct mm_struct *mm,
337 unsigned long address)
338{
339 return 0;
340}
341
279static inline void mmu_notifier_change_pte(struct mm_struct *mm, 342static inline void mmu_notifier_change_pte(struct mm_struct *mm,
280 unsigned long address, pte_t pte) 343 unsigned long address, pte_t pte)
281{ 344{
@@ -305,7 +368,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
305} 368}
306 369
307#define ptep_clear_flush_young_notify ptep_clear_flush_young 370#define ptep_clear_flush_young_notify ptep_clear_flush_young
371#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
308#define ptep_clear_flush_notify ptep_clear_flush 372#define ptep_clear_flush_notify ptep_clear_flush
373#define pmdp_clear_flush_notify pmdp_clear_flush
374#define pmdp_splitting_flush_notify pmdp_splitting_flush
309#define set_pte_at_notify set_pte_at 375#define set_pte_at_notify set_pte_at
310 376
311#endif /* CONFIG_MMU_NOTIFIER */ 377#endif /* CONFIG_MMU_NOTIFIER */
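The pmd notifier variants mirror the pte ones but cover the whole HPAGE_PMD_SIZE range with invalidate_range_start/end; a hedged fragment (example_zap_huge_pmd() is not from the patch):

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

/* Hypothetical fragment: tear down one huge pmd with notifier coverage.
 * haddr must be HPAGE_PMD_SIZE aligned, as the VM_BUG_ON in the macro checks. */
static pmd_t example_zap_huge_pmd(struct vm_area_struct *vma,
				  unsigned long haddr, pmd_t *pmd)
{
	return pmdp_clear_flush_notify(vma, haddr, pmd);
}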
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 39c24ebe9cfd..02ecb0189b1d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -114,6 +114,7 @@ enum zone_stat_item {
114 NUMA_LOCAL, /* allocation from local node */ 114 NUMA_LOCAL, /* allocation from local node */
115 NUMA_OTHER, /* allocation from other node */ 115 NUMA_OTHER, /* allocation from other node */
116#endif 116#endif
117 NR_ANON_TRANSPARENT_HUGEPAGES,
117 NR_VM_ZONE_STAT_ITEMS }; 118 NR_VM_ZONE_STAT_ITEMS };
118 119
119/* 120/*
@@ -458,12 +459,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
458 return test_bit(ZONE_OOM_LOCKED, &zone->flags); 459 return test_bit(ZONE_OOM_LOCKED, &zone->flags);
459} 460}
460 461
461#ifdef CONFIG_SMP
462unsigned long zone_nr_free_pages(struct zone *zone);
463#else
464#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
465#endif /* CONFIG_SMP */
466
467/* 462/*
468 * The "priority" of VM scanning is how much of the queues we will scan in one 463 * The "priority" of VM scanning is how much of the queues we will scan in one
469 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the 464 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -645,6 +640,7 @@ typedef struct pglist_data {
645 wait_queue_head_t kswapd_wait; 640 wait_queue_head_t kswapd_wait;
646 struct task_struct *kswapd; 641 struct task_struct *kswapd;
647 int kswapd_max_order; 642 int kswapd_max_order;
643 enum zone_type classzone_idx;
648} pg_data_t; 644} pg_data_t;
649 645
650#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) 646#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
@@ -660,8 +656,10 @@ typedef struct pglist_data {
660 656
661extern struct mutex zonelists_mutex; 657extern struct mutex zonelists_mutex;
662void build_all_zonelists(void *data); 658void build_all_zonelists(void *data);
663void wakeup_kswapd(struct zone *zone, int order); 659void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
664int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 660bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
661 int classzone_idx, int alloc_flags);
662bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
665 int classzone_idx, int alloc_flags); 663 int classzone_idx, int alloc_flags);
666enum memmap_context { 664enum memmap_context {
667 MEMMAP_EARLY, 665 MEMMAP_EARLY,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5f38c460367e..0db8037e2725 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -48,9 +48,6 @@
48 * struct page (these bits with information) are always mapped into kernel 48 * struct page (these bits with information) are always mapped into kernel
49 * address space... 49 * address space...
50 * 50 *
51 * PG_buddy is set to indicate that the page is free and in the buddy system
52 * (see mm/page_alloc.c).
53 *
54 * PG_hwpoison indicates that a page got corrupted in hardware and contains 51 * PG_hwpoison indicates that a page got corrupted in hardware and contains
55 * data with incorrect ECC bits that triggered a machine check. Accessing is 52 * data with incorrect ECC bits that triggered a machine check. Accessing is
56 * not safe since it may cause another machine check. Don't touch! 53 * not safe since it may cause another machine check. Don't touch!
@@ -96,7 +93,6 @@ enum pageflags {
96 PG_swapcache, /* Swap page: swp_entry_t in private */ 93 PG_swapcache, /* Swap page: swp_entry_t in private */
97 PG_mappedtodisk, /* Has blocks allocated on-disk */ 94 PG_mappedtodisk, /* Has blocks allocated on-disk */
98 PG_reclaim, /* To be reclaimed asap */ 95 PG_reclaim, /* To be reclaimed asap */
99 PG_buddy, /* Page is free, on buddy lists */
100 PG_swapbacked, /* Page is backed by RAM/swap */ 96 PG_swapbacked, /* Page is backed by RAM/swap */
101 PG_unevictable, /* Page is "unevictable" */ 97 PG_unevictable, /* Page is "unevictable" */
102#ifdef CONFIG_MMU 98#ifdef CONFIG_MMU
@@ -108,6 +104,9 @@ enum pageflags {
108#ifdef CONFIG_MEMORY_FAILURE 104#ifdef CONFIG_MEMORY_FAILURE
109 PG_hwpoison, /* hardware poisoned page. Don't touch */ 105 PG_hwpoison, /* hardware poisoned page. Don't touch */
110#endif 106#endif
107#ifdef CONFIG_TRANSPARENT_HUGEPAGE
108 PG_compound_lock,
109#endif
111 __NR_PAGEFLAGS, 110 __NR_PAGEFLAGS,
112 111
113 /* Filesystems */ 112 /* Filesystems */
@@ -198,7 +197,7 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
198struct page; /* forward declaration */ 197struct page; /* forward declaration */
199 198
200TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked) 199TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked)
201PAGEFLAG(Error, error) 200PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
202PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) 201PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
203PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) 202PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
204PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) 203PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
@@ -230,7 +229,6 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
230 * risky: they bypass page accounting. 229 * risky: they bypass page accounting.
231 */ 230 */
232TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) 231TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
233__PAGEFLAG(Buddy, buddy)
234PAGEFLAG(MappedToDisk, mappedtodisk) 232PAGEFLAG(MappedToDisk, mappedtodisk)
235 233
236/* PG_readahead is only used for file reads; PG_reclaim is only for writes */ 234/* PG_readahead is only used for file reads; PG_reclaim is only for writes */
@@ -344,7 +342,7 @@ static inline void set_page_writeback(struct page *page)
344 * tests can be used in performance sensitive paths. PageCompound is 342 * tests can be used in performance sensitive paths. PageCompound is
345 * generally not used in hot code paths. 343 * generally not used in hot code paths.
346 */ 344 */
347__PAGEFLAG(Head, head) 345__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head)
348__PAGEFLAG(Tail, tail) 346__PAGEFLAG(Tail, tail)
349 347
350static inline int PageCompound(struct page *page) 348static inline int PageCompound(struct page *page)
@@ -352,6 +350,13 @@ static inline int PageCompound(struct page *page)
352 return page->flags & ((1L << PG_head) | (1L << PG_tail)); 350 return page->flags & ((1L << PG_head) | (1L << PG_tail));
353 351
354} 352}
353#ifdef CONFIG_TRANSPARENT_HUGEPAGE
354static inline void ClearPageCompound(struct page *page)
355{
356 BUG_ON(!PageHead(page));
357 ClearPageHead(page);
358}
359#endif
355#else 360#else
356/* 361/*
357 * Reduce page flag use as much as possible by overlapping 362 * Reduce page flag use as much as possible by overlapping
@@ -389,14 +394,61 @@ static inline void __ClearPageTail(struct page *page)
389 page->flags &= ~PG_head_tail_mask; 394 page->flags &= ~PG_head_tail_mask;
390} 395}
391 396
397#ifdef CONFIG_TRANSPARENT_HUGEPAGE
398static inline void ClearPageCompound(struct page *page)
399{
400 BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound));
401 clear_bit(PG_compound, &page->flags);
402}
403#endif
404
392#endif /* !PAGEFLAGS_EXTENDED */ 405#endif /* !PAGEFLAGS_EXTENDED */
393 406
407#ifdef CONFIG_TRANSPARENT_HUGEPAGE
408/*
409 * PageHuge() only returns true for hugetlbfs pages, but not for
410 * normal or transparent huge pages.
411 *
412 * PageTransHuge() returns true for both transparent huge and
413 * hugetlbfs pages, but not normal pages. PageTransHuge() can only be
414 * called in the core VM paths where hugetlbfs pages can't exist.
415 */
416static inline int PageTransHuge(struct page *page)
417{
418 VM_BUG_ON(PageTail(page));
419 return PageHead(page);
420}
421
422static inline int PageTransCompound(struct page *page)
423{
424 return PageCompound(page);
425}
426
427#else
428
429static inline int PageTransHuge(struct page *page)
430{
431 return 0;
432}
433
434static inline int PageTransCompound(struct page *page)
435{
436 return 0;
437}
438#endif
439
394#ifdef CONFIG_MMU 440#ifdef CONFIG_MMU
395#define __PG_MLOCKED (1 << PG_mlocked) 441#define __PG_MLOCKED (1 << PG_mlocked)
396#else 442#else
397#define __PG_MLOCKED 0 443#define __PG_MLOCKED 0
398#endif 444#endif
399 445
446#ifdef CONFIG_TRANSPARENT_HUGEPAGE
447#define __PG_COMPOUND_LOCK (1 << PG_compound_lock)
448#else
449#define __PG_COMPOUND_LOCK 0
450#endif
451
400/* 452/*
401 * Flags checked when a page is freed. Pages being freed should not have 453 * Flags checked when a page is freed. Pages being freed should not have
402 * these flags set. If they are, there is a problem. 454 * these flags set. If they are, there is a problem.
@@ -404,9 +456,10 @@ static inline void __ClearPageTail(struct page *page)
404#define PAGE_FLAGS_CHECK_AT_FREE \ 456#define PAGE_FLAGS_CHECK_AT_FREE \
405 (1 << PG_lru | 1 << PG_locked | \ 457 (1 << PG_lru | 1 << PG_locked | \
406 1 << PG_private | 1 << PG_private_2 | \ 458 1 << PG_private | 1 << PG_private_2 | \
407 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 459 1 << PG_writeback | 1 << PG_reserved | \
408 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ 460 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
409 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON) 461 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \
462 __PG_COMPOUND_LOCK)
410 463
411/* 464/*
412 * Flags checked when a page is prepped for return by the page allocator. 465 * Flags checked when a page is prepped for return by the page allocator.
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index b02195dfc1b0..5b0c971d7cae 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -35,12 +35,18 @@ struct page_cgroup *lookup_page_cgroup(struct page *page);
35 35
36enum { 36enum {
37 /* flags for mem_cgroup */ 37 /* flags for mem_cgroup */
38 PCG_LOCK, /* page cgroup is locked */ 38 PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */
39 PCG_CACHE, /* charged as cache */ 39 PCG_CACHE, /* charged as cache */
40 PCG_USED, /* this object is in use. */ 40 PCG_USED, /* this object is in use. */
41 PCG_ACCT_LRU, /* page has been accounted for */
42 PCG_FILE_MAPPED, /* page is accounted as "mapped" */
43 PCG_MIGRATION, /* under page migration */ 41 PCG_MIGRATION, /* under page migration */
42 /* flags for mem_cgroup and file and I/O status */
43 PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
44 PCG_FILE_MAPPED, /* page is accounted as "mapped" */
45 PCG_FILE_DIRTY, /* page is dirty */
46 PCG_FILE_WRITEBACK, /* page is under writeback */
47 PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */
48 /* No lock in page_cgroup */
49 PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */
44}; 50};
45 51
46#define TESTPCGFLAG(uname, lname) \ 52#define TESTPCGFLAG(uname, lname) \
@@ -59,6 +65,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
59static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ 65static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
60 { return test_and_clear_bit(PCG_##lname, &pc->flags); } 66 { return test_and_clear_bit(PCG_##lname, &pc->flags); }
61 67
68#define TESTSETPCGFLAG(uname, lname) \
69static inline int TestSetPageCgroup##uname(struct page_cgroup *pc) \
70 { return test_and_set_bit(PCG_##lname, &pc->flags); }
71
62/* Cache flag is set only once (at allocation) */ 72/* Cache flag is set only once (at allocation) */
63TESTPCGFLAG(Cache, CACHE) 73TESTPCGFLAG(Cache, CACHE)
64CLEARPCGFLAG(Cache, CACHE) 74CLEARPCGFLAG(Cache, CACHE)
@@ -78,6 +88,22 @@ SETPCGFLAG(FileMapped, FILE_MAPPED)
78CLEARPCGFLAG(FileMapped, FILE_MAPPED) 88CLEARPCGFLAG(FileMapped, FILE_MAPPED)
79TESTPCGFLAG(FileMapped, FILE_MAPPED) 89TESTPCGFLAG(FileMapped, FILE_MAPPED)
80 90
91SETPCGFLAG(FileDirty, FILE_DIRTY)
92CLEARPCGFLAG(FileDirty, FILE_DIRTY)
93TESTPCGFLAG(FileDirty, FILE_DIRTY)
94TESTCLEARPCGFLAG(FileDirty, FILE_DIRTY)
95TESTSETPCGFLAG(FileDirty, FILE_DIRTY)
96
97SETPCGFLAG(FileWriteback, FILE_WRITEBACK)
98CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK)
99TESTPCGFLAG(FileWriteback, FILE_WRITEBACK)
100
101SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
102CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
103TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
104TESTCLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
105TESTSETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
106
81SETPCGFLAG(Migration, MIGRATION) 107SETPCGFLAG(Migration, MIGRATION)
82CLEARPCGFLAG(Migration, MIGRATION) 108CLEARPCGFLAG(Migration, MIGRATION)
83TESTPCGFLAG(Migration, MIGRATION) 109TESTPCGFLAG(Migration, MIGRATION)
@@ -94,6 +120,10 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
94 120
95static inline void lock_page_cgroup(struct page_cgroup *pc) 121static inline void lock_page_cgroup(struct page_cgroup *pc)
96{ 122{
123 /*
124 * Don't take this lock in IRQ context.
125 * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION
126 */
97 bit_spin_lock(PCG_LOCK, &pc->flags); 127 bit_spin_lock(PCG_LOCK, &pc->flags);
98} 128}
99 129
@@ -107,6 +137,24 @@ static inline int page_is_cgroup_locked(struct page_cgroup *pc)
107 return bit_spin_is_locked(PCG_LOCK, &pc->flags); 137 return bit_spin_is_locked(PCG_LOCK, &pc->flags);
108} 138}
109 139
140static inline void move_lock_page_cgroup(struct page_cgroup *pc,
141 unsigned long *flags)
142{
143 /*
144 * Updates to the page-cache stat bits in pc->flags can come from both
145 * normal context and IRQ context. Disable IRQs to avoid deadlock.
146 */
147 local_irq_save(*flags);
148 bit_spin_lock(PCG_MOVE_LOCK, &pc->flags);
149}
150
151static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
152 unsigned long *flags)
153{
154 bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags);
155 local_irq_restore(*flags);
156}
157
110#else /* CONFIG_CGROUP_MEM_RES_CTLR */ 158#else /* CONFIG_CGROUP_MEM_RES_CTLR */
111struct page_cgroup; 159struct page_cgroup;
112 160
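The move lock is meant for stat bits that can be flipped from IRQ context; a hedged sketch of a hypothetical update site (the function name is illustrative):

#include <linux/page_cgroup.h>

/* Hypothetical fragment: flip a file-stat bit that may be updated from IRQ
 * context, so the move lock (which disables IRQs) is used instead of
 * lock_page_cgroup(). */
static void example_set_file_writeback(struct page_cgroup *pc)
{
	unsigned long flags;

	move_lock_page_cgroup(pc, &flags);
	SetPageCgroupFileWriteback(pc);
	move_unlock_page_cgroup(pc, &flags);
}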
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2d1ffe3cf1ee..9c66e994540f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -48,7 +48,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping)
48 48
49static inline int mapping_unevictable(struct address_space *mapping) 49static inline int mapping_unevictable(struct address_space *mapping)
50{ 50{
51 if (likely(mapping)) 51 if (mapping)
52 return test_bit(AS_UNEVICTABLE, &mapping->flags); 52 return test_bit(AS_UNEVICTABLE, &mapping->flags);
53 return !!mapping; 53 return !!mapping;
54} 54}
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index ab2baa5c4884..23241c2fecce 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -146,6 +146,22 @@ static inline void *radix_tree_deref_slot(void **pslot)
146} 146}
147 147
148/** 148/**
149 * radix_tree_deref_slot_protected - dereference a slot without RCU lock but with tree lock held
150 * @pslot: pointer to slot, returned by radix_tree_lookup_slot
151 * Returns: item that was stored in that slot with any direct pointer flag
152 * removed.
153 *
154 * Similar to radix_tree_deref_slot but only used during migration when a page's
155 * mapping is being moved. The caller does not hold the RCU read lock but it
156 * must hold the tree lock to prevent parallel updates.
157 */
158static inline void *radix_tree_deref_slot_protected(void **pslot,
159 spinlock_t *treelock)
160{
161 return rcu_dereference_protected(*pslot, lockdep_is_held(treelock));
162}
163
164/**
149 * radix_tree_deref_retry - check radix_tree_deref_slot 165 * radix_tree_deref_retry - check radix_tree_deref_slot
150 * @arg: pointer returned by radix_tree_deref_slot 166 * @arg: pointer returned by radix_tree_deref_slot
151 * Returns: 0 if retry is not required, otherwise retry is required 167 * Returns: 0 if retry is not required, otherwise retry is required
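A rough sketch of the intended caller pattern for the new protected dereference, assuming the usual mapping->tree_lock locking (example_lookup_locked() is hypothetical):

#include <linux/fs.h>
#include <linux/radix-tree.h>

/* Hypothetical fragment: look up a slot while holding mapping->tree_lock,
 * as migration does when replacing a page in the radix tree. */
static struct page *example_lookup_locked(struct address_space *mapping,
					  pgoff_t index)
{
	void **pslot = radix_tree_lookup_slot(&mapping->page_tree, index);

	if (!pslot)
		return NULL;
	return radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
}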
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bb83c0da2071..e9fd04ca1e51 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -198,6 +198,8 @@ enum ttu_flags {
198}; 198};
199#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) 199#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
200 200
201bool is_vma_temporary_stack(struct vm_area_struct *vma);
202
201int try_to_unmap(struct page *, enum ttu_flags flags); 203int try_to_unmap(struct page *, enum ttu_flags flags);
202int try_to_unmap_one(struct page *, struct vm_area_struct *, 204int try_to_unmap_one(struct page *, struct vm_area_struct *,
203 unsigned long address, enum ttu_flags flags); 205 unsigned long address, enum ttu_flags flags);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 96e23215e276..d747f948b34e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -21,7 +21,8 @@
21#define CLONE_DETACHED 0x00400000 /* Unused, ignored */ 21#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
22#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ 22#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
23#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ 23#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
24#define CLONE_STOPPED 0x02000000 /* Start in stopped state */ 24/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
25 and is now available for re-use. */
25#define CLONE_NEWUTS 0x04000000 /* New utsname group? */ 26#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
26#define CLONE_NEWIPC 0x08000000 /* New ipcs */ 27#define CLONE_NEWIPC 0x08000000 /* New ipcs */
27#define CLONE_NEWUSER 0x10000000 /* New user namespace */ 28#define CLONE_NEWUSER 0x10000000 /* New user namespace */
@@ -433,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm);
433#endif 434#endif
434 /* leave room for more dump flags */ 435 /* leave room for more dump flags */
435#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ 436#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
437#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
436 438
437#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) 439#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
438 440
@@ -633,6 +635,8 @@ struct signal_struct {
633 635
634 int oom_adj; /* OOM kill score adjustment (bit shift) */ 636 int oom_adj; /* OOM kill score adjustment (bit shift) */
635 int oom_score_adj; /* OOM kill score adjustment */ 637 int oom_score_adj; /* OOM kill score adjustment */
638 int oom_score_adj_min; /* OOM kill score adjustment minimum value.
639 * Only settable by CAP_SYS_RESOURCE. */
636 640
637 struct mutex cred_guard_mutex; /* guard against foreign influences on 641 struct mutex cred_guard_mutex; /* guard against foreign influences on
638 * credential calculations 642 * credential calculations
diff --git a/include/linux/swap.h b/include/linux/swap.h
index eba53e71d2cc..4d559325d919 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -208,6 +208,8 @@ extern unsigned int nr_free_pagecache_pages(void);
208/* linux/mm/swap.c */ 208/* linux/mm/swap.c */
209extern void __lru_cache_add(struct page *, enum lru_list lru); 209extern void __lru_cache_add(struct page *, enum lru_list lru);
210extern void lru_cache_add_lru(struct page *, enum lru_list lru); 210extern void lru_cache_add_lru(struct page *, enum lru_list lru);
211extern void lru_add_page_tail(struct zone* zone,
212 struct page *page, struct page *page_tail);
211extern void activate_page(struct page *); 213extern void activate_page(struct page *);
212extern void mark_page_accessed(struct page *); 214extern void mark_page_accessed(struct page *);
213extern void lru_add_drain(void); 215extern void lru_add_drain(void);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 44b54f619ac6..4ed6fcd6b726 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -59,8 +59,9 @@ extern void *vmalloc_exec(unsigned long size);
59extern void *vmalloc_32(unsigned long size); 59extern void *vmalloc_32(unsigned long size);
60extern void *vmalloc_32_user(unsigned long size); 60extern void *vmalloc_32_user(unsigned long size);
61extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); 61extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
62extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, 62extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
63 pgprot_t prot); 63 unsigned long start, unsigned long end, gfp_t gfp_mask,
64 pgprot_t prot, int node, void *caller);
64extern void vfree(const void *addr); 65extern void vfree(const void *addr);
65 66
66extern void *vmap(struct page **pages, unsigned int count, 67extern void *vmap(struct page **pages, unsigned int count,
@@ -90,9 +91,6 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size,
90 unsigned long flags, 91 unsigned long flags,
91 unsigned long start, unsigned long end, 92 unsigned long start, unsigned long end,
92 void *caller); 93 void *caller);
93extern struct vm_struct *get_vm_area_node(unsigned long size,
94 unsigned long flags, int node,
95 gfp_t gfp_mask);
96extern struct vm_struct *remove_vm_area(const void *addr); 94extern struct vm_struct *remove_vm_area(const void *addr);
97 95
98extern int map_vm_area(struct vm_struct *area, pgprot_t prot, 96extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
@@ -120,7 +118,7 @@ extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
120#ifdef CONFIG_SMP 118#ifdef CONFIG_SMP
121struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 119struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
122 const size_t *sizes, int nr_vms, 120 const size_t *sizes, int nr_vms,
123 size_t align, gfp_t gfp_mask); 121 size_t align);
124 122
125void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); 123void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
126#endif 124#endif
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index eaaea37b3b75..833e676d6d92 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,6 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
254extern void __dec_zone_state(struct zone *, enum zone_stat_item); 254extern void __dec_zone_state(struct zone *, enum zone_stat_item);
255 255
256void refresh_cpu_vm_stats(int); 256void refresh_cpu_vm_stats(int);
257
258int calculate_pressure_threshold(struct zone *zone);
259int calculate_normal_threshold(struct zone *zone);
260void set_pgdat_percpu_threshold(pg_data_t *pgdat,
261 int (*calculate_pressure)(struct zone *));
257#else /* CONFIG_SMP */ 262#else /* CONFIG_SMP */
258 263
259/* 264/*
@@ -298,6 +303,8 @@ static inline void __dec_zone_page_state(struct page *page,
298#define dec_zone_page_state __dec_zone_page_state 303#define dec_zone_page_state __dec_zone_page_state
299#define mod_zone_page_state __mod_zone_page_state 304#define mod_zone_page_state __mod_zone_page_state
300 305
306#define set_pgdat_percpu_threshold(pgdat, callback) { }
307
301static inline void refresh_cpu_vm_stats(int cpu) { } 308static inline void refresh_cpu_vm_stats(int cpu) { }
302#endif 309#endif
303 310
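The hunk above only declares the new per-cpu threshold interface; its call sites are outside this section. A hedged sketch of how the two callbacks might be paired with set_pgdat_percpu_threshold() follows; only the prototypes declared above are assumed, everything else is illustrative.

/*
 * Sketch (not part of this series): tighten a node's per-cpu vmstat
 * thresholds while it is under memory pressure, then restore the
 * normal values afterwards.
 */
static void example_enter_pressure(pg_data_t *pgdat)
{
	set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
}

static void example_leave_pressure(pg_data_t *pgdat)
{
	set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
}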
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
new file mode 100644
index 000000000000..388bcdd26d46
--- /dev/null
+++ b/include/trace/events/compaction.h
@@ -0,0 +1,74 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM compaction
3
4#if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_COMPACTION_H
6
7#include <linux/types.h>
8#include <linux/tracepoint.h>
9#include "gfpflags.h"
10
11DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
12
13 TP_PROTO(unsigned long nr_scanned,
14 unsigned long nr_taken),
15
16 TP_ARGS(nr_scanned, nr_taken),
17
18 TP_STRUCT__entry(
19 __field(unsigned long, nr_scanned)
20 __field(unsigned long, nr_taken)
21 ),
22
23 TP_fast_assign(
24 __entry->nr_scanned = nr_scanned;
25 __entry->nr_taken = nr_taken;
26 ),
27
28 TP_printk("nr_scanned=%lu nr_taken=%lu",
29 __entry->nr_scanned,
30 __entry->nr_taken)
31);
32
33DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages,
34
35 TP_PROTO(unsigned long nr_scanned,
36 unsigned long nr_taken),
37
38 TP_ARGS(nr_scanned, nr_taken)
39);
40
41DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
42 TP_PROTO(unsigned long nr_scanned,
43 unsigned long nr_taken),
44
45 TP_ARGS(nr_scanned, nr_taken)
46);
47
48TRACE_EVENT(mm_compaction_migratepages,
49
50 TP_PROTO(unsigned long nr_migrated,
51 unsigned long nr_failed),
52
53 TP_ARGS(nr_migrated, nr_failed),
54
55 TP_STRUCT__entry(
56 __field(unsigned long, nr_migrated)
57 __field(unsigned long, nr_failed)
58 ),
59
60 TP_fast_assign(
61 __entry->nr_migrated = nr_migrated;
62 __entry->nr_failed = nr_failed;
63 ),
64
65 TP_printk("nr_migrated=%lu nr_failed=%lu",
66 __entry->nr_migrated,
67 __entry->nr_failed)
68);
69
70
71#endif /* _TRACE_COMPACTION_H */
72
73/* This part must be outside protection */
74#include <trace/define_trace.h>
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index c255fcc587bf..ea422aaa23e1 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -25,13 +25,13 @@
25 25
26#define trace_reclaim_flags(page, sync) ( \ 26#define trace_reclaim_flags(page, sync) ( \
27 (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ 27 (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
28 (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ 28 (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
29 ) 29 )
30 30
31#define trace_shrink_flags(file, sync) ( \ 31#define trace_shrink_flags(file, sync) ( \
32 (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ 32 (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_MIXED : \
33 (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ 33 (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \
34 (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ 34 (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
35 ) 35 )
36 36
37TRACE_EVENT(mm_vmscan_kswapd_sleep, 37TRACE_EVENT(mm_vmscan_kswapd_sleep,
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 89a2b2db4375..4e249b927eaa 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -81,6 +81,7 @@ DEFINE_EVENT(writeback_class, name, \
81 TP_ARGS(bdi)) 81 TP_ARGS(bdi))
82 82
83DEFINE_WRITEBACK_EVENT(writeback_nowork); 83DEFINE_WRITEBACK_EVENT(writeback_nowork);
84DEFINE_WRITEBACK_EVENT(writeback_wake_background);
84DEFINE_WRITEBACK_EVENT(writeback_wake_thread); 85DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
85DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); 86DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
86DEFINE_WRITEBACK_EVENT(writeback_bdi_register); 87DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h
new file mode 100644
index 000000000000..eb23f4188f5a
--- /dev/null
+++ b/include/xen/gntdev.h
@@ -0,0 +1,119 @@
1/******************************************************************************
2 * gntdev.h
3 *
4 * Interface to /dev/xen/gntdev.
5 *
6 * Copyright (c) 2007, D G Murray
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#ifndef __LINUX_PUBLIC_GNTDEV_H__
34#define __LINUX_PUBLIC_GNTDEV_H__
35
36struct ioctl_gntdev_grant_ref {
37 /* The domain ID of the grant to be mapped. */
38 uint32_t domid;
39 /* The grant reference of the grant to be mapped. */
40 uint32_t ref;
41};
42
43/*
44 * Inserts the grant references into the mapping table of an instance
45 * of gntdev. N.B. This does not perform the mapping, which is deferred
46 * until mmap() is called with @index as the offset.
47 */
48#define IOCTL_GNTDEV_MAP_GRANT_REF \
49_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
50struct ioctl_gntdev_map_grant_ref {
51 /* IN parameters */
52 /* The number of grants to be mapped. */
53 uint32_t count;
54 uint32_t pad;
55 /* OUT parameters */
56 /* The offset to be used on a subsequent call to mmap(). */
57 uint64_t index;
58 /* Variable IN parameter. */
59 /* Array of grant references, of size @count. */
60 struct ioctl_gntdev_grant_ref refs[1];
61};
62
63/*
64 * Removes the grant references from the mapping table of an instance of
65 * of gntdev. N.B. munmap() must be called on the relevant virtual address(es)
66 * before this ioctl is called, or an error will result.
67 */
68#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
69_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
70struct ioctl_gntdev_unmap_grant_ref {
71 /* IN parameters */
72 /* The offset that was returned by the corresponding map operation. */
73 uint64_t index;
74 /* The number of pages to be unmapped. */
75 uint32_t count;
76 uint32_t pad;
77};
78
79/*
80 * Returns the offset in the driver's address space that corresponds
81 * to @vaddr. This can be used to perform a munmap(), followed by an
82 * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
83 * the caller. The number of pages that were allocated at the same time as
84 * @vaddr is returned in @count.
85 *
86 * N.B. Where more than one page has been mapped into a contiguous range, the
87 * supplied @vaddr must correspond to the start of the range; otherwise
88 * an error will result. It is only possible to munmap() the entire
89 * contiguously-allocated range at once, and not any subrange thereof.
90 */
91#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
92_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
93struct ioctl_gntdev_get_offset_for_vaddr {
94 /* IN parameters */
95 /* The virtual address of the first mapped page in a range. */
96 uint64_t vaddr;
97 /* OUT parameters */
98 /* The offset that was used in the initial mmap() operation. */
99 uint64_t offset;
100 /* The number of pages mapped in the VM area that begins at @vaddr. */
101 uint32_t count;
102 uint32_t pad;
103};
104
105/*
106 * Sets the maximum number of grants that may be mapped at once by this gntdev
107 * instance.
108 *
109 * N.B. This must be called before any other ioctl is performed on the device.
110 */
111#define IOCTL_GNTDEV_SET_MAX_GRANTS \
112_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
113struct ioctl_gntdev_set_max_grants {
114 /* IN parameter */
115 /* The maximum number of grants that may be mapped at once. */
116 uint32_t count;
117};
118
119#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
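The comments above describe the ioctl flow, but a compact userspace example may help. This is a hedged sketch only: the <xen/gntdev.h> include path, the PROT/MAP flags and the minimal error handling are assumptions for illustration, not part of the patch.

/* Map a single foreign grant through /dev/xen/gntdev. */
#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/gntdev.h>		/* assumed install path of the header above */

static void *map_one_grant(int fd, uint32_t domid, uint32_t gref, size_t len)
{
	struct ioctl_gntdev_map_grant_ref op = {
		.count = 1,
		.refs[0] = { .domid = domid, .ref = gref },
	};
	void *p;

	/* Insert the grant into the instance's table; the mapping itself
	 * is deferred until mmap() is called with op.index as the offset. */
	if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &op) < 0)
		return NULL;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, op.index);
	return p == MAP_FAILED ? NULL : p;
}

/*
 * Typical call sequence (the gntdev file descriptor must stay open for
 * the lifetime of the mapping):
 *
 *	int fd = open("/dev/xen/gntdev", O_RDWR);
 *	void *p = map_one_grant(fd, remote_domid, gref, 4096);
 */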
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index 9a731706a016..b1fab6b5b3ef 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -37,10 +37,16 @@
37#ifndef __ASM_GNTTAB_H__ 37#ifndef __ASM_GNTTAB_H__
38#define __ASM_GNTTAB_H__ 38#define __ASM_GNTTAB_H__
39 39
40#include <asm/xen/hypervisor.h> 40#include <asm/page.h>
41
42#include <xen/interface/xen.h>
41#include <xen/interface/grant_table.h> 43#include <xen/interface/grant_table.h>
44
45#include <asm/xen/hypervisor.h>
42#include <asm/xen/grant_table.h> 46#include <asm/xen/grant_table.h>
43 47
48#include <xen/features.h>
49
44/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */ 50/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
45#define NR_GRANT_FRAMES 4 51#define NR_GRANT_FRAMES 4
46 52
@@ -107,6 +113,37 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
107void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, 113void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
108 unsigned long pfn); 114 unsigned long pfn);
109 115
116static inline void
117gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr,
118 uint32_t flags, grant_ref_t ref, domid_t domid)
119{
120 if (flags & GNTMAP_contains_pte)
121 map->host_addr = addr;
122 else if (xen_feature(XENFEAT_auto_translated_physmap))
123 map->host_addr = __pa(addr);
124 else
125 map->host_addr = addr;
126
127 map->flags = flags;
128 map->ref = ref;
129 map->dom = domid;
130}
131
132static inline void
133gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
134 uint32_t flags, grant_handle_t handle)
135{
136 if (flags & GNTMAP_contains_pte)
137 unmap->host_addr = addr;
138 else if (xen_feature(XENFEAT_auto_translated_physmap))
139 unmap->host_addr = __pa(addr);
140 else
141 unmap->host_addr = addr;
142
143 unmap->handle = handle;
144 unmap->dev_bus_addr = 0;
145}
146
110int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, 147int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
111 unsigned long max_nr_gframes, 148 unsigned long max_nr_gframes,
112 struct grant_entry **__shared); 149 struct grant_entry **__shared);
@@ -118,4 +155,9 @@ unsigned int gnttab_max_grant_frames(void);
118 155
119#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) 156#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
120 157
158int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
159 struct page **pages, unsigned int count);
160int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
161 struct page **pages, unsigned int count);
162
121#endif /* __ASM_GNTTAB_H__ */ 163#endif /* __ASM_GNTTAB_H__ */
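Together with the new gnttab_map_refs()/gnttab_unmap_refs() declarations, the helpers above suggest the following kernel-side pairing. Hedged sketch only: GNTMAP_host_map, the op.status field and pfn_to_kaddr() come from the surrounding Xen/arch headers and are assumptions here, not something added by this hunk.

/* Map one foreign grant into an already-allocated page. */
static int example_map_grant(struct page *page, grant_ref_t ref, domid_t domid)
{
	struct gnttab_map_grant_ref op;
	unsigned long addr = (unsigned long)pfn_to_kaddr(page_to_pfn(page));

	/* Fill the hypercall argument with the new helper... */
	gnttab_set_map_op(&op, addr, GNTMAP_host_map, ref, domid);

	/* ...and let gnttab_map_refs() perform the map and track the page. */
	if (gnttab_map_refs(&op, &page, 1))
		return -EFAULT;

	return op.status ? -EINVAL : 0;
}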
diff --git a/kernel/fork.c b/kernel/fork.c
index d9b44f20b6b0..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h> 68#include <linux/oom.h>
69#include <linux/khugepaged.h>
69 70
70#include <asm/pgtable.h> 71#include <asm/pgtable.h>
71#include <asm/pgalloc.h> 72#include <asm/pgalloc.h>
@@ -330,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
330 retval = ksm_fork(mm, oldmm); 331 retval = ksm_fork(mm, oldmm);
331 if (retval) 332 if (retval)
332 goto out; 333 goto out;
334 retval = khugepaged_fork(mm, oldmm);
335 if (retval)
336 goto out;
333 337
334 prev = NULL; 338 prev = NULL;
335 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 339 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -529,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
529 mm_free_pgd(mm); 533 mm_free_pgd(mm);
530 destroy_context(mm); 534 destroy_context(mm);
531 mmu_notifier_mm_destroy(mm); 535 mmu_notifier_mm_destroy(mm);
536#ifdef CONFIG_TRANSPARENT_HUGEPAGE
537 VM_BUG_ON(mm->pmd_huge_pte);
538#endif
532 free_mm(mm); 539 free_mm(mm);
533} 540}
534EXPORT_SYMBOL_GPL(__mmdrop); 541EXPORT_SYMBOL_GPL(__mmdrop);
@@ -543,6 +550,7 @@ void mmput(struct mm_struct *mm)
543 if (atomic_dec_and_test(&mm->mm_users)) { 550 if (atomic_dec_and_test(&mm->mm_users)) {
544 exit_aio(mm); 551 exit_aio(mm);
545 ksm_exit(mm); 552 ksm_exit(mm);
553 khugepaged_exit(mm); /* must run before exit_mmap */
546 exit_mmap(mm); 554 exit_mmap(mm);
547 set_mm_exe_file(mm, NULL); 555 set_mm_exe_file(mm, NULL);
548 if (!list_empty(&mm->mmlist)) { 556 if (!list_empty(&mm->mmlist)) {
@@ -669,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
669 mm->token_priority = 0; 677 mm->token_priority = 0;
670 mm->last_interval = 0; 678 mm->last_interval = 0;
671 679
680#ifdef CONFIG_TRANSPARENT_HUGEPAGE
681 mm->pmd_huge_pte = NULL;
682#endif
683
672 if (!mm_init(mm, tsk)) 684 if (!mm_init(mm, tsk))
673 goto fail_nomem; 685 goto fail_nomem;
674 686
@@ -910,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
910 922
911 sig->oom_adj = current->signal->oom_adj; 923 sig->oom_adj = current->signal->oom_adj;
912 sig->oom_score_adj = current->signal->oom_score_adj; 924 sig->oom_score_adj = current->signal->oom_score_adj;
925 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
913 926
914 mutex_init(&sig->cred_guard_mutex); 927 mutex_init(&sig->cred_guard_mutex);
915 928
@@ -1410,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
1410 } 1423 }
1411 1424
1412 /* 1425 /*
1413 * We hope to recycle these flags after 2.6.26
1414 */
1415 if (unlikely(clone_flags & CLONE_STOPPED)) {
1416 static int __read_mostly count = 100;
1417
1418 if (count > 0 && printk_ratelimit()) {
1419 char comm[TASK_COMM_LEN];
1420
1421 count--;
1422 printk(KERN_INFO "fork(): process `%s' used deprecated "
1423 "clone flags 0x%lx\n",
1424 get_task_comm(comm, current),
1425 clone_flags & CLONE_STOPPED);
1426 }
1427 }
1428
1429 /*
1430 * When called from kernel_thread, don't do user tracing stuff. 1426 * When called from kernel_thread, don't do user tracing stuff.
1431 */ 1427 */
1432 if (likely(user_mode(regs))) 1428 if (likely(user_mode(regs)))
@@ -1464,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
1464 */ 1460 */
1465 p->flags &= ~PF_STARTING; 1461 p->flags &= ~PF_STARTING;
1466 1462
1467 if (unlikely(clone_flags & CLONE_STOPPED)) { 1463 wake_up_new_task(p, clone_flags);
1468 /*
1469 * We'll start up with an immediate SIGSTOP.
1470 */
1471 sigaddset(&p->pending.signal, SIGSTOP);
1472 set_tsk_thread_flag(p, TIF_SIGPENDING);
1473 __set_task_state(p, TASK_STOPPED);
1474 } else {
1475 wake_up_new_task(p, clone_flags);
1476 }
1477 1464
1478 tracehook_report_clone_complete(trace, regs, 1465 tracehook_report_clone_complete(trace, regs,
1479 clone_flags, nr, p); 1466 clone_flags, nr, p);
diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92e6917..52075633373f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
233{ 233{
234 unsigned long address = (unsigned long)uaddr; 234 unsigned long address = (unsigned long)uaddr;
235 struct mm_struct *mm = current->mm; 235 struct mm_struct *mm = current->mm;
236 struct page *page; 236 struct page *page, *page_head;
237 int err; 237 int err;
238 238
239 /* 239 /*
@@ -265,11 +265,46 @@ again:
265 if (err < 0) 265 if (err < 0)
266 return err; 266 return err;
267 267
268 page = compound_head(page); 268#ifdef CONFIG_TRANSPARENT_HUGEPAGE
269 lock_page(page); 269 page_head = page;
270 if (!page->mapping) { 270 if (unlikely(PageTail(page))) {
271 unlock_page(page);
272 put_page(page); 271 put_page(page);
272 /* serialize against __split_huge_page_splitting() */
273 local_irq_disable();
274 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
275 page_head = compound_head(page);
276 /*
277 * page_head is a valid pointer but we must pin
278 * it before taking the PG_lock and/or
279 * PG_compound_lock. The moment we re-enable
280 * irqs, __split_huge_page_splitting() can
281 * return and the head page can be freed from
282 * under us. We can't take the PG_lock and/or
283 * PG_compound_lock on a page that could be
284 * freed from under us.
285 */
286 if (page != page_head) {
287 get_page(page_head);
288 put_page(page);
289 }
290 local_irq_enable();
291 } else {
292 local_irq_enable();
293 goto again;
294 }
295 }
296#else
297 page_head = compound_head(page);
298 if (page != page_head) {
299 get_page(page_head);
300 put_page(page);
301 }
302#endif
303
304 lock_page(page_head);
305 if (!page_head->mapping) {
306 unlock_page(page_head);
307 put_page(page_head);
273 goto again; 308 goto again;
274 } 309 }
275 310
@@ -280,20 +315,20 @@ again:
280 * it's a read-only handle, it's expected that futexes attach to 315 * it's a read-only handle, it's expected that futexes attach to
281 * the object not the particular process. 316 * the object not the particular process.
282 */ 317 */
283 if (PageAnon(page)) { 318 if (PageAnon(page_head)) {
284 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 319 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
285 key->private.mm = mm; 320 key->private.mm = mm;
286 key->private.address = address; 321 key->private.address = address;
287 } else { 322 } else {
288 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 323 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
289 key->shared.inode = page->mapping->host; 324 key->shared.inode = page_head->mapping->host;
290 key->shared.pgoff = page->index; 325 key->shared.pgoff = page_head->index;
291 } 326 }
292 327
293 get_futex_key_refs(key); 328 get_futex_key_refs(key);
294 329
295 unlock_page(page); 330 unlock_page(page_head);
296 put_page(page); 331 put_page(page_head);
297 return 0; 332 return 0;
298} 333}
299 334
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f5..282f20230e67 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
72 72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) 73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{ 74{
75 int cpu;
76
75 desc->irq_data.irq = irq; 77 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip; 78 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
83 desc->irq_count = 0; 85 desc->irq_count = 0;
84 desc->irqs_unhandled = 0; 86 desc->irqs_unhandled = 0;
85 desc->name = NULL; 87 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); 88 for_each_possible_cpu(cpu)
89 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
87 desc_smp_init(desc, node); 90 desc_smp_init(desc, node);
88} 91}
89 92
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
133 if (!desc) 136 if (!desc)
134 return NULL; 137 return NULL;
135 /* allocate based on nr_cpu_ids */ 138 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), 139 desc->kstat_irqs = alloc_percpu(unsigned int);
137 gfp, node);
138 if (!desc->kstat_irqs) 140 if (!desc->kstat_irqs)
139 goto err_desc; 141 goto err_desc;
140 142
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
149 return desc; 151 return desc;
150 152
151err_kstat: 153err_kstat:
152 kfree(desc->kstat_irqs); 154 free_percpu(desc->kstat_irqs);
153err_desc: 155err_desc:
154 kfree(desc); 156 kfree(desc);
155 return NULL; 157 return NULL;
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq)
166 mutex_unlock(&sparse_irq_lock); 168 mutex_unlock(&sparse_irq_lock);
167 169
168 free_masks(desc); 170 free_masks(desc);
169 kfree(desc->kstat_irqs); 171 free_percpu(desc->kstat_irqs);
170 kfree(desc); 172 kfree(desc);
171} 173}
172 174
@@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
234 } 236 }
235}; 237};
236 238
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void) 239int __init early_irq_init(void)
239{ 240{
240 int count, i, node = first_online_node; 241 int count, i, node = first_online_node;
@@ -250,7 +251,8 @@ int __init early_irq_init(void)
250 for (i = 0; i < count; i++) { 251 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i; 252 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip; 253 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i]; 254 /* TODO : do this allocation on-demand ... */
255 desc[i].kstat_irqs = alloc_percpu(unsigned int);
254 alloc_masks(desc + i, GFP_KERNEL, node); 256 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node); 257 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 258 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -275,6 +277,22 @@ static void free_desc(unsigned int irq)
275 277
276static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 278static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
277{ 279{
280#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
281 struct irq_desc *desc;
282 unsigned int i;
283
284 for (i = 0; i < cnt; i++) {
285 desc = irq_to_desc(start + i);
286 if (desc && !desc->kstat_irqs) {
287 unsigned int __percpu *stats = alloc_percpu(unsigned int);
288
289 if (!stats)
290 return -1;
291 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
292 free_percpu(stats);
293 }
294 }
295#endif
278 return start; 296 return start;
279} 297}
280#endif /* !CONFIG_SPARSE_IRQ */ 298#endif /* !CONFIG_SPARSE_IRQ */
@@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq)
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 409unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{ 410{
393 struct irq_desc *desc = irq_to_desc(irq); 411 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0; 412
413 return desc && desc->kstat_irqs ?
414 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
395} 415}
396 416
397#ifdef CONFIG_GENERIC_HARDIRQS 417#ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq)
401 int cpu; 421 int cpu;
402 int sum = 0; 422 int sum = 0;
403 423
404 if (!desc) 424 if (!desc || !desc->kstat_irqs)
405 return 0; 425 return 0;
406 for_each_possible_cpu(cpu) 426 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu]; 427 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
408 return sum; 428 return sum;
409} 429}
410#endif /* CONFIG_GENERIC_HARDIRQS */ 430#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/mm/Kconfig b/mm/Kconfig
index c2c8a4a11898..3ad483bdf505 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS
302 302
303 See Documentation/nommu-mmap.txt for more information. 303 See Documentation/nommu-mmap.txt for more information.
304 304
305config TRANSPARENT_HUGEPAGE
306 bool "Transparent Hugepage Support"
307 depends on X86 && MMU
308 select COMPACTION
309 help
310 Transparent Hugepage support allows the kernel to use huge pages
311 and huge TLB entries transparently for applications whenever
312 possible. This feature can improve performance for certain
313 applications by speeding up page faults during memory allocation,
314 by reducing the number of TLB misses and by speeding up pagetable
315 walks.
316
317 If the system is memory constrained, e.g. embedded, you may want to say N.
318
319choice
320 prompt "Transparent Hugepage Support sysfs defaults"
321 depends on TRANSPARENT_HUGEPAGE
322 default TRANSPARENT_HUGEPAGE_ALWAYS
323 help
324 Selects the sysfs defaults for Transparent Hugepage Support.
325
326 config TRANSPARENT_HUGEPAGE_ALWAYS
327 bool "always"
328 help
329 Enabling Transparent Hugepage always, can increase the
330 memory footprint of applications without a guaranteed
331 benefit but it will work automatically for all applications.
332
333 config TRANSPARENT_HUGEPAGE_MADVISE
334 bool "madvise"
335 help
336 Enabling Transparent Hugepage madvise will only provide a
337 performance benefit to applications that use
338 madvise(MADV_HUGEPAGE), and it won't risk increasing the
339 memory footprint of applications without a guaranteed
340 benefit.
341endchoice
342
305# 343#
306# UP and nommu archs use km based percpu allocator 344# UP and nommu archs use km based percpu allocator
307# 345#
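Because the "madvise" default above only covers regions that have explicitly opted in, here is a hedged userspace sketch of that opt-in. MADV_HUGEPAGE is assumed to be exposed by the installed <sys/mman.h>; the flags and the ignored madvise() return value are illustrative only.

/* Request THP backing for an anonymous mapping. */
#include <stddef.h>
#include <sys/mman.h>

static void *alloc_thp_hinted(size_t len)
{
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return NULL;

	/* Hint that this range should use transparent huge pages. */
	madvise(p, len, MADV_HUGEPAGE);
	return p;
}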
diff --git a/mm/Makefile b/mm/Makefile
index f73f75a29f82..2b1b575ae712 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,7 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o 8 vmalloc.o pagewalk.o pgtable-generic.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 11 maccess.o page_alloc.o page-writeback.o \
@@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
37obj-$(CONFIG_FS_XIP) += filemap_xip.o 37obj-$(CONFIG_FS_XIP) += filemap_xip.o
38obj-$(CONFIG_MIGRATION) += migrate.o 38obj-$(CONFIG_MIGRATION) += migrate.o
39obj-$(CONFIG_QUICKLIST) += quicklist.o 39obj-$(CONFIG_QUICKLIST) += quicklist.o
40obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
40obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 41obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
41obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 42obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
42obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 43obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 1a8894eadf72..6d592a021072 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include "internal.h" 17#include "internal.h"
18 18
19#define CREATE_TRACE_POINTS
20#include <trace/events/compaction.h>
21
19/* 22/*
20 * compact_control is used to track pages being migrated and the free pages 23 * compact_control is used to track pages being migrated and the free pages
21 * they are being migrated to during memory compaction. The free_pfn starts 24 * they are being migrated to during memory compaction. The free_pfn starts
@@ -30,6 +33,7 @@ struct compact_control {
30 unsigned long nr_migratepages; /* Number of pages to migrate */ 33 unsigned long nr_migratepages; /* Number of pages to migrate */
31 unsigned long free_pfn; /* isolate_freepages search base */ 34 unsigned long free_pfn; /* isolate_freepages search base */
32 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */
33 37
34 /* Account for isolated anon and file pages */ 38 /* Account for isolated anon and file pages */
35 unsigned long nr_anon; 39 unsigned long nr_anon;
@@ -38,6 +42,8 @@ struct compact_control {
38 unsigned int order; /* order a direct compactor needs */ 42 unsigned int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 43 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone; 44 struct zone *zone;
45
46 int compact_mode;
41}; 47};
42 48
43static unsigned long release_freepages(struct list_head *freelist) 49static unsigned long release_freepages(struct list_head *freelist)
@@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
60 struct list_head *freelist) 66 struct list_head *freelist)
61{ 67{
62 unsigned long zone_end_pfn, end_pfn; 68 unsigned long zone_end_pfn, end_pfn;
63 int total_isolated = 0; 69 int nr_scanned = 0, total_isolated = 0;
64 struct page *cursor; 70 struct page *cursor;
65 71
66 /* Get the last PFN we should scan for free pages at */ 72 /* Get the last PFN we should scan for free pages at */
@@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
81 87
82 if (!pfn_valid_within(blockpfn)) 88 if (!pfn_valid_within(blockpfn))
83 continue; 89 continue;
90 nr_scanned++;
84 91
85 if (!PageBuddy(page)) 92 if (!PageBuddy(page))
86 continue; 93 continue;
@@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
100 } 107 }
101 } 108 }
102 109
110 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
103 return total_isolated; 111 return total_isolated;
104} 112}
105 113
@@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
234 struct compact_control *cc) 242 struct compact_control *cc)
235{ 243{
236 unsigned long low_pfn, end_pfn; 244 unsigned long low_pfn, end_pfn;
245 unsigned long last_pageblock_nr = 0, pageblock_nr;
246 unsigned long nr_scanned = 0, nr_isolated = 0;
237 struct list_head *migratelist = &cc->migratepages; 247 struct list_head *migratelist = &cc->migratepages;
238 248
239 /* Do not scan outside zone boundaries */ 249 /* Do not scan outside zone boundaries */
@@ -266,20 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone,
266 struct page *page; 276 struct page *page;
267 if (!pfn_valid_within(low_pfn)) 277 if (!pfn_valid_within(low_pfn))
268 continue; 278 continue;
279 nr_scanned++;
269 280
270 /* Get the page and skip if free */ 281 /* Get the page and skip if free */
271 page = pfn_to_page(low_pfn); 282 page = pfn_to_page(low_pfn);
272 if (PageBuddy(page)) 283 if (PageBuddy(page))
273 continue; 284 continue;
274 285
286 /*
287 * For async migration, also only scan in MOVABLE blocks. Async
288 * migration is optimistic and checks whether a minimal amount of
289 * work will satisfy the allocation.
290 */
291 pageblock_nr = low_pfn >> pageblock_order;
292 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
293 get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
294 low_pfn += pageblock_nr_pages;
295 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
296 last_pageblock_nr = pageblock_nr;
297 continue;
298 }
299
300 if (!PageLRU(page))
301 continue;
302
303 /*
304 * PageLRU is set, and lru_lock excludes isolation,
305 * splitting and collapsing (collapsing has already
306 * happened if PageLRU is set).
307 */
308 if (PageTransHuge(page)) {
309 low_pfn += (1 << compound_order(page)) - 1;
310 continue;
311 }
312
275 /* Try isolate the page */ 313 /* Try isolate the page */
276 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) 314 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
277 continue; 315 continue;
278 316
317 VM_BUG_ON(PageTransCompound(page));
318
279 /* Successfully isolated */ 319 /* Successfully isolated */
280 del_page_from_lru_list(zone, page, page_lru(page)); 320 del_page_from_lru_list(zone, page, page_lru(page));
281 list_add(&page->lru, migratelist); 321 list_add(&page->lru, migratelist);
282 cc->nr_migratepages++; 322 cc->nr_migratepages++;
323 nr_isolated++;
283 324
284 /* Avoid isolating too much */ 325 /* Avoid isolating too much */
285 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) 326 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
@@ -291,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
291 spin_unlock_irq(&zone->lru_lock); 332 spin_unlock_irq(&zone->lru_lock);
292 cc->migrate_pfn = low_pfn; 333 cc->migrate_pfn = low_pfn;
293 334
335 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
336
294 return cc->nr_migratepages; 337 return cc->nr_migratepages;
295} 338}
296 339
@@ -341,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc)
341} 384}
342 385
343static int compact_finished(struct zone *zone, 386static int compact_finished(struct zone *zone,
344 struct compact_control *cc) 387 struct compact_control *cc)
345{ 388{
346 unsigned int order; 389 unsigned int order;
347 unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); 390 unsigned long watermark;
348 391
349 if (fatal_signal_pending(current)) 392 if (fatal_signal_pending(current))
350 return COMPACT_PARTIAL; 393 return COMPACT_PARTIAL;
@@ -354,12 +397,27 @@ static int compact_finished(struct zone *zone,
354 return COMPACT_COMPLETE; 397 return COMPACT_COMPLETE;
355 398
356 /* Compaction run is not finished if the watermark is not met */ 399 /* Compaction run is not finished if the watermark is not met */
400 if (cc->compact_mode != COMPACT_MODE_KSWAPD)
401 watermark = low_wmark_pages(zone);
402 else
403 watermark = high_wmark_pages(zone);
404 watermark += (1 << cc->order);
405
357 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) 406 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
358 return COMPACT_CONTINUE; 407 return COMPACT_CONTINUE;
359 408
360 if (cc->order == -1) 409 if (cc->order == -1)
361 return COMPACT_CONTINUE; 410 return COMPACT_CONTINUE;
362 411
412 /*
413 * Generating only one page of the right order is not enough
414 * for kswapd; we must continue until we're above the high
415 * watermark as a pool for high order GFP_ATOMIC allocations
416 * too.
417 */
418 if (cc->compact_mode == COMPACT_MODE_KSWAPD)
419 return COMPACT_CONTINUE;
420
363 /* Direct compactor: Is a suitable page free? */ 421 /* Direct compactor: Is a suitable page free? */
364 for (order = cc->order; order < MAX_ORDER; order++) { 422 for (order = cc->order; order < MAX_ORDER; order++) {
365 /* Job done if page is free of the right migratetype */ 423 /* Job done if page is free of the right migratetype */
@@ -374,10 +432,62 @@ static int compact_finished(struct zone *zone,
374 return COMPACT_CONTINUE; 432 return COMPACT_CONTINUE;
375} 433}
376 434
435/*
436 * compaction_suitable: Is this suitable to run compaction on this zone now?
437 * Returns
438 * COMPACT_SKIPPED - If there are too few free pages for compaction
439 * COMPACT_PARTIAL - If the allocation would succeed without compaction
440 * COMPACT_CONTINUE - If compaction should run now
441 */
442unsigned long compaction_suitable(struct zone *zone, int order)
443{
444 int fragindex;
445 unsigned long watermark;
446
447 /*
448 * Watermarks for order-0 must be met for compaction. Note the 2UL.
449 * This is because during migration, copies of pages need to be
450 * allocated and for a short time, the footprint is higher
451 */
452 watermark = low_wmark_pages(zone) + (2UL << order);
453 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
454 return COMPACT_SKIPPED;
455
456 /*
457 * fragmentation index determines if allocation failures are due to
458 * low memory or external fragmentation
459 *
460 * index of -1 implies allocations might succeed depending on watermarks
461 * index towards 0 implies failure is due to lack of memory
462 * index towards 1000 implies failure is due to fragmentation
463 *
464 * Only compact if a failure would be due to fragmentation.
465 */
466 fragindex = fragmentation_index(zone, order);
467 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
468 return COMPACT_SKIPPED;
469
470 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
471 return COMPACT_PARTIAL;
472
473 return COMPACT_CONTINUE;
474}
475
377static int compact_zone(struct zone *zone, struct compact_control *cc) 476static int compact_zone(struct zone *zone, struct compact_control *cc)
378{ 477{
379 int ret; 478 int ret;
380 479
480 ret = compaction_suitable(zone, cc->order);
481 switch (ret) {
482 case COMPACT_PARTIAL:
483 case COMPACT_SKIPPED:
484 /* Compaction is likely to fail */
485 return ret;
486 case COMPACT_CONTINUE:
487 /* Fall through to compaction */
488 ;
489 }
490
381 /* Setup to move all movable pages to the end of the zone */ 491 /* Setup to move all movable pages to the end of the zone */
382 cc->migrate_pfn = zone->zone_start_pfn; 492 cc->migrate_pfn = zone->zone_start_pfn;
383 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 493 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -393,7 +503,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
393 503
394 nr_migrate = cc->nr_migratepages; 504 nr_migrate = cc->nr_migratepages;
395 migrate_pages(&cc->migratepages, compaction_alloc, 505 migrate_pages(&cc->migratepages, compaction_alloc,
396 (unsigned long)cc, 0); 506 (unsigned long)cc, false,
507 cc->sync);
397 update_nr_listpages(cc); 508 update_nr_listpages(cc);
398 nr_remaining = cc->nr_migratepages; 509 nr_remaining = cc->nr_migratepages;
399 510
@@ -401,6 +512,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
401 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); 512 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
402 if (nr_remaining) 513 if (nr_remaining)
403 count_vm_events(COMPACTPAGEFAILED, nr_remaining); 514 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
515 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
516 nr_remaining);
404 517
405 /* Release LRU pages not migrated */ 518 /* Release LRU pages not migrated */
406 if (!list_empty(&cc->migratepages)) { 519 if (!list_empty(&cc->migratepages)) {
@@ -417,8 +530,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
417 return ret; 530 return ret;
418} 531}
419 532
420static unsigned long compact_zone_order(struct zone *zone, 533unsigned long compact_zone_order(struct zone *zone,
421 int order, gfp_t gfp_mask) 534 int order, gfp_t gfp_mask,
535 bool sync,
536 int compact_mode)
422{ 537{
423 struct compact_control cc = { 538 struct compact_control cc = {
424 .nr_freepages = 0, 539 .nr_freepages = 0,
@@ -426,6 +541,8 @@ static unsigned long compact_zone_order(struct zone *zone,
426 .order = order, 541 .order = order,
427 .migratetype = allocflags_to_migratetype(gfp_mask), 542 .migratetype = allocflags_to_migratetype(gfp_mask),
428 .zone = zone, 543 .zone = zone,
544 .sync = sync,
545 .compact_mode = compact_mode,
429 }; 546 };
430 INIT_LIST_HEAD(&cc.freepages); 547 INIT_LIST_HEAD(&cc.freepages);
431 INIT_LIST_HEAD(&cc.migratepages); 548 INIT_LIST_HEAD(&cc.migratepages);
@@ -441,16 +558,17 @@ int sysctl_extfrag_threshold = 500;
441 * @order: The order of the current allocation 558 * @order: The order of the current allocation
442 * @gfp_mask: The GFP mask of the current allocation 559 * @gfp_mask: The GFP mask of the current allocation
443 * @nodemask: The allowed nodes to allocate from 560 * @nodemask: The allowed nodes to allocate from
561 * @sync: Whether migration is synchronous or not
444 * 562 *
445 * This is the main entry point for direct page compaction. 563 * This is the main entry point for direct page compaction.
446 */ 564 */
447unsigned long try_to_compact_pages(struct zonelist *zonelist, 565unsigned long try_to_compact_pages(struct zonelist *zonelist,
448 int order, gfp_t gfp_mask, nodemask_t *nodemask) 566 int order, gfp_t gfp_mask, nodemask_t *nodemask,
567 bool sync)
449{ 568{
450 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 569 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
451 int may_enter_fs = gfp_mask & __GFP_FS; 570 int may_enter_fs = gfp_mask & __GFP_FS;
452 int may_perform_io = gfp_mask & __GFP_IO; 571 int may_perform_io = gfp_mask & __GFP_IO;
453 unsigned long watermark;
454 struct zoneref *z; 572 struct zoneref *z;
455 struct zone *zone; 573 struct zone *zone;
456 int rc = COMPACT_SKIPPED; 574 int rc = COMPACT_SKIPPED;
@@ -460,7 +578,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
460 * made because an assumption is made that the page allocator can satisfy 578 * made because an assumption is made that the page allocator can satisfy
461 * the "cheaper" orders without taking special steps 579 * the "cheaper" orders without taking special steps
462 */ 580 */
463 if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) 581 if (!order || !may_enter_fs || !may_perform_io)
464 return rc; 582 return rc;
465 583
466 count_vm_event(COMPACTSTALL); 584 count_vm_event(COMPACTSTALL);
@@ -468,43 +586,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
468 /* Compact each zone in the list */ 586 /* Compact each zone in the list */
469 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 587 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
470 nodemask) { 588 nodemask) {
471 int fragindex;
472 int status; 589 int status;
473 590
474 /* 591 status = compact_zone_order(zone, order, gfp_mask, sync,
475 * Watermarks for order-0 must be met for compaction. Note 592 COMPACT_MODE_DIRECT_RECLAIM);
476 * the 2UL. This is because during migration, copies of
477 * pages need to be allocated and for a short time, the
478 * footprint is higher
479 */
480 watermark = low_wmark_pages(zone) + (2UL << order);
481 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
482 continue;
483
484 /*
485 * fragmentation index determines if allocation failures are
486 * due to low memory or external fragmentation
487 *
488 * index of -1 implies allocations might succeed depending
489 * on watermarks
490 * index towards 0 implies failure is due to lack of memory
491 * index towards 1000 implies failure is due to fragmentation
492 *
493 * Only compact if a failure would be due to fragmentation.
494 */
495 fragindex = fragmentation_index(zone, order);
496 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
497 continue;
498
499 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
500 rc = COMPACT_PARTIAL;
501 break;
502 }
503
504 status = compact_zone_order(zone, order, gfp_mask);
505 rc = max(status, rc); 593 rc = max(status, rc);
506 594
507 if (zone_watermark_ok(zone, order, watermark, 0, 0)) 595 /* If a normal allocation would succeed, stop compacting */
596 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
508 break; 597 break;
509 } 598 }
510 599
@@ -531,6 +620,7 @@ static int compact_node(int nid)
531 .nr_freepages = 0, 620 .nr_freepages = 0,
532 .nr_migratepages = 0, 621 .nr_migratepages = 0,
533 .order = -1, 622 .order = -1,
623 .compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
534 }; 624 };
535 625
536 zone = &pgdat->node_zones[zoneid]; 626 zone = &pgdat->node_zones[zoneid];
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 4df2de77e069..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
324 if (mem_flags & __GFP_WAIT) { 324 if (mem_flags & __GFP_WAIT) {
325 DECLARE_WAITQUEUE(wait, current); 325 DECLARE_WAITQUEUE(wait, current);
326 326
327 __set_current_state(TASK_INTERRUPTIBLE); 327 __set_current_state(TASK_UNINTERRUPTIBLE);
328 __add_wait_queue(&pool->waitq, &wait); 328 __add_wait_queue(&pool->waitq, &wait);
329 spin_unlock_irqrestore(&pool->lock, flags); 329 spin_unlock_irqrestore(&pool->lock, flags);
330 330
@@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc);
355 355
356static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) 356static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
357{ 357{
358 unsigned long flags;
359 struct dma_page *page; 358 struct dma_page *page;
360 359
361 spin_lock_irqsave(&pool->lock, flags);
362 list_for_each_entry(page, &pool->page_list, page_list) { 360 list_for_each_entry(page, &pool->page_list, page_list) {
363 if (dma < page->dma) 361 if (dma < page->dma)
364 continue; 362 continue;
365 if (dma < (page->dma + pool->allocation)) 363 if (dma < (page->dma + pool->allocation))
366 goto done; 364 return page;
367 } 365 }
368 page = NULL; 366 return NULL;
369 done:
370 spin_unlock_irqrestore(&pool->lock, flags);
371 return page;
372} 367}
373 368
374/** 369/**
@@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
386 unsigned long flags; 381 unsigned long flags;
387 unsigned int offset; 382 unsigned int offset;
388 383
384 spin_lock_irqsave(&pool->lock, flags);
389 page = pool_find_page(pool, dma); 385 page = pool_find_page(pool, dma);
390 if (!page) { 386 if (!page) {
387 spin_unlock_irqrestore(&pool->lock, flags);
391 if (pool->dev) 388 if (pool->dev)
392 dev_err(pool->dev, 389 dev_err(pool->dev,
393 "dma_pool_free %s, %p/%lx (bad dma)\n", 390 "dma_pool_free %s, %p/%lx (bad dma)\n",
@@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
401 offset = vaddr - page->vaddr; 398 offset = vaddr - page->vaddr;
402#ifdef DMAPOOL_DEBUG 399#ifdef DMAPOOL_DEBUG
403 if ((dma - page->dma) != offset) { 400 if ((dma - page->dma) != offset) {
401 spin_unlock_irqrestore(&pool->lock, flags);
404 if (pool->dev) 402 if (pool->dev)
405 dev_err(pool->dev, 403 dev_err(pool->dev,
406 "dma_pool_free %s, %p (bad vaddr)/%Lx\n", 404 "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
418 chain = *(int *)(page->vaddr + chain); 416 chain = *(int *)(page->vaddr + chain);
419 continue; 417 continue;
420 } 418 }
419 spin_unlock_irqrestore(&pool->lock, flags);
421 if (pool->dev) 420 if (pool->dev)
422 dev_err(pool->dev, "dma_pool_free %s, dma %Lx " 421 dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
423 "already free\n", pool->name, 422 "already free\n", pool->name,
@@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
432 memset(vaddr, POOL_POISON_FREED, pool->size); 431 memset(vaddr, POOL_POISON_FREED, pool->size);
433#endif 432#endif
434 433
435 spin_lock_irqsave(&pool->lock, flags);
436 page->in_use--; 434 page->in_use--;
437 *(int *)vaddr = page->offset; 435 *(int *)vaddr = page->offset;
438 page->offset = offset; 436 page->offset = offset;
diff --git a/mm/filemap.c b/mm/filemap.c
index ca389394fa2a..83a45d35468b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -298,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
298 continue; 298 continue;
299 299
300 wait_on_page_writeback(page); 300 wait_on_page_writeback(page);
301 if (PageError(page)) 301 if (TestClearPageError(page))
302 ret = -EIO; 302 ret = -EIO;
303 } 303 }
304 pagevec_release(&pvec); 304 pagevec_release(&pvec);
@@ -837,9 +837,6 @@ repeat:
837 if (radix_tree_deref_retry(page)) 837 if (radix_tree_deref_retry(page))
838 goto restart; 838 goto restart;
839 839
840 if (page->mapping == NULL || page->index != index)
841 break;
842
843 if (!page_cache_get_speculative(page)) 840 if (!page_cache_get_speculative(page))
844 goto repeat; 841 goto repeat;
845 842
@@ -849,6 +846,16 @@ repeat:
849 goto repeat; 846 goto repeat;
850 } 847 }
851 848
849 /*
850 * Must check mapping and index after taking the ref;
851 * otherwise we can get both false positives and false
852 * negatives, which is just confusing to the caller.
853 */
854 if (page->mapping == NULL || page->index != index) {
855 page_cache_release(page);
856 break;
857 }
858
852 pages[ret] = page; 859 pages[ret] = page;
853 ret++; 860 ret++;
854 index++; 861 index++;
@@ -2220,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
2220 gfp_notmask = __GFP_FS; 2227 gfp_notmask = __GFP_FS;
2221repeat: 2228repeat:
2222 page = find_lock_page(mapping, index); 2229 page = find_lock_page(mapping, index);
2223 if (likely(page)) 2230 if (page)
2224 return page; 2231 return page;
2225 2232
2226 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); 2233 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..004c9c2aac78
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2346 @@
1/*
2 * Copyright (C) 2009 Red Hat, Inc.
3 *
4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory.
6 */
7
8#include <linux/mm.h>
9#include <linux/sched.h>
10#include <linux/highmem.h>
11#include <linux/hugetlb.h>
12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h>
14#include <linux/swap.h>
15#include <linux/mm_inline.h>
16#include <linux/kthread.h>
17#include <linux/khugepaged.h>
18#include <linux/freezer.h>
19#include <linux/mman.h>
20#include <asm/tlb.h>
21#include <asm/pgalloc.h>
22#include "internal.h"
23
24/*
25 * By default transparent hugepage support is enabled for all mappings
26 * and khugepaged scans all mappings. Defrag is only invoked by
27 * khugepaged hugepage allocations and by page faults inside
28 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
29 * allocations.
30 */
31unsigned long transparent_hugepage_flags __read_mostly =
32#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
33 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
34#endif
35#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
36 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
37#endif
38 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
39 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
40
41/* by default, scan 8*512 ptes (or vmas) every 10 seconds */
42static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
43static unsigned int khugepaged_pages_collapsed;
44static unsigned int khugepaged_full_scans;
45static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
46/* during fragmentation poll the hugepage allocator once every minute */
47static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
48static struct task_struct *khugepaged_thread __read_mostly;
49static DEFINE_MUTEX(khugepaged_mutex);
50static DEFINE_SPINLOCK(khugepaged_mm_lock);
51static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
52/*
53 * by default, collapse a hugepage if at least one pte is mapped, as
54 * would have happened if the vma had been large enough at page-fault
55 * time.
56 */
57static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
58
59static int khugepaged(void *none);
60static int mm_slots_hash_init(void);
61static int khugepaged_slab_init(void);
62static void khugepaged_slab_free(void);
63
64#define MM_SLOTS_HASH_HEADS 1024
65static struct hlist_head *mm_slots_hash __read_mostly;
66static struct kmem_cache *mm_slot_cache __read_mostly;
67
68/**
69 * struct mm_slot - hash lookup from mm to mm_slot
70 * @hash: hash collision list
71 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
72 * @mm: the mm that this information is valid for
73 */
74struct mm_slot {
75 struct hlist_node hash;
76 struct list_head mm_node;
77 struct mm_struct *mm;
78};
79
80/**
81 * struct khugepaged_scan - cursor for scanning
82 * @mm_head: the head of the mm list to scan
83 * @mm_slot: the current mm_slot we are scanning
84 * @address: the next address inside that to be scanned
85 *
86 * There is only one khugepaged_scan instance of this cursor structure.
87 */
88struct khugepaged_scan {
89 struct list_head mm_head;
90 struct mm_slot *mm_slot;
91 unsigned long address;
92} khugepaged_scan = {
93 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
94};
95
96
97static int set_recommended_min_free_kbytes(void)
98{
99 struct zone *zone;
100 int nr_zones = 0;
101 unsigned long recommended_min;
102 extern int min_free_kbytes;
103
104 if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
105 &transparent_hugepage_flags) &&
106 !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
107 &transparent_hugepage_flags))
108 return 0;
109
110 for_each_populated_zone(zone)
111 nr_zones++;
112
113 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
114 recommended_min = pageblock_nr_pages * nr_zones * 2;
115
116 /*
117 * Make sure that on average at least two pageblocks are almost free
118 * of another type, one for a migratetype to fall back to and a
119 * second to avoid subsequent fallbacks of other types. There are 3
120 * MIGRATE_TYPES we care about.
121 */
122 recommended_min += pageblock_nr_pages * nr_zones *
123 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
124
125 /* never allow reserving more than 5% of the lowmem */
126 recommended_min = min(recommended_min,
127 (unsigned long) nr_free_buffer_pages() / 20);
128 recommended_min <<= (PAGE_SHIFT-10);
129
130 if (recommended_min > min_free_kbytes)
131 min_free_kbytes = recommended_min;
132 setup_per_zone_wmarks();
133 return 0;
134}
135late_initcall(set_recommended_min_free_kbytes);
136
137static int start_khugepaged(void)
138{
139 int err = 0;
140 if (khugepaged_enabled()) {
141 int wakeup;
142 if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
143 err = -ENOMEM;
144 goto out;
145 }
146 mutex_lock(&khugepaged_mutex);
147 if (!khugepaged_thread)
148 khugepaged_thread = kthread_run(khugepaged, NULL,
149 "khugepaged");
150 if (unlikely(IS_ERR(khugepaged_thread))) {
151 printk(KERN_ERR
152 "khugepaged: kthread_run(khugepaged) failed\n");
153 err = PTR_ERR(khugepaged_thread);
154 khugepaged_thread = NULL;
155 }
156 wakeup = !list_empty(&khugepaged_scan.mm_head);
157 mutex_unlock(&khugepaged_mutex);
158 if (wakeup)
159 wake_up_interruptible(&khugepaged_wait);
160
161 set_recommended_min_free_kbytes();
162 } else
163 /* wakeup to exit */
164 wake_up_interruptible(&khugepaged_wait);
165out:
166 return err;
167}
168
169#ifdef CONFIG_SYSFS
170
171static ssize_t double_flag_show(struct kobject *kobj,
172 struct kobj_attribute *attr, char *buf,
173 enum transparent_hugepage_flag enabled,
174 enum transparent_hugepage_flag req_madv)
175{
176 if (test_bit(enabled, &transparent_hugepage_flags)) {
177 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
178 return sprintf(buf, "[always] madvise never\n");
179 } else if (test_bit(req_madv, &transparent_hugepage_flags))
180 return sprintf(buf, "always [madvise] never\n");
181 else
182 return sprintf(buf, "always madvise [never]\n");
183}
184static ssize_t double_flag_store(struct kobject *kobj,
185 struct kobj_attribute *attr,
186 const char *buf, size_t count,
187 enum transparent_hugepage_flag enabled,
188 enum transparent_hugepage_flag req_madv)
189{
190 if (!memcmp("always", buf,
191 min(sizeof("always")-1, count))) {
192 set_bit(enabled, &transparent_hugepage_flags);
193 clear_bit(req_madv, &transparent_hugepage_flags);
194 } else if (!memcmp("madvise", buf,
195 min(sizeof("madvise")-1, count))) {
196 clear_bit(enabled, &transparent_hugepage_flags);
197 set_bit(req_madv, &transparent_hugepage_flags);
198 } else if (!memcmp("never", buf,
199 min(sizeof("never")-1, count))) {
200 clear_bit(enabled, &transparent_hugepage_flags);
201 clear_bit(req_madv, &transparent_hugepage_flags);
202 } else
203 return -EINVAL;
204
205 return count;
206}
207
208static ssize_t enabled_show(struct kobject *kobj,
209 struct kobj_attribute *attr, char *buf)
210{
211 return double_flag_show(kobj, attr, buf,
212 TRANSPARENT_HUGEPAGE_FLAG,
213 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
214}
215static ssize_t enabled_store(struct kobject *kobj,
216 struct kobj_attribute *attr,
217 const char *buf, size_t count)
218{
219 ssize_t ret;
220
221 ret = double_flag_store(kobj, attr, buf, count,
222 TRANSPARENT_HUGEPAGE_FLAG,
223 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
224
225 if (ret > 0) {
226 int err = start_khugepaged();
227 if (err)
228 ret = err;
229 }
230
231 if (ret > 0 &&
232 (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
233 &transparent_hugepage_flags) ||
234 test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
235 &transparent_hugepage_flags)))
236 set_recommended_min_free_kbytes();
237
238 return ret;
239}
240static struct kobj_attribute enabled_attr =
241 __ATTR(enabled, 0644, enabled_show, enabled_store);
242
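/*
 * Helpers for the boolean yes/no sysfs attributes backed by a single
 * bit in transparent_hugepage_flags.
 */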
243static ssize_t single_flag_show(struct kobject *kobj,
244 struct kobj_attribute *attr, char *buf,
245 enum transparent_hugepage_flag flag)
246{
247 if (test_bit(flag, &transparent_hugepage_flags))
248 return sprintf(buf, "[yes] no\n");
249 else
250 return sprintf(buf, "yes [no]\n");
251}
252static ssize_t single_flag_store(struct kobject *kobj,
253 struct kobj_attribute *attr,
254 const char *buf, size_t count,
255 enum transparent_hugepage_flag flag)
256{
257 if (!memcmp("yes", buf,
258 min(sizeof("yes")-1, count))) {
259 set_bit(flag, &transparent_hugepage_flags);
260 } else if (!memcmp("no", buf,
261 min(sizeof("no")-1, count))) {
262 clear_bit(flag, &transparent_hugepage_flags);
263 } else
264 return -EINVAL;
265
266 return count;
267}
268
269/*
270 * Currently defrag only controls whether __GFP_WAIT is used for the
271 * allocation. A blind __GFP_REPEAT would be too aggressive: it's never
272 * worth swapping tons of memory just to allocate one more hugepage.
273 */
274static ssize_t defrag_show(struct kobject *kobj,
275 struct kobj_attribute *attr, char *buf)
276{
277 return double_flag_show(kobj, attr, buf,
278 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
279 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
280}
281static ssize_t defrag_store(struct kobject *kobj,
282 struct kobj_attribute *attr,
283 const char *buf, size_t count)
284{
285 return double_flag_store(kobj, attr, buf, count,
286 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
287 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
288}
289static struct kobj_attribute defrag_attr =
290 __ATTR(defrag, 0644, defrag_show, defrag_store);
291
292#ifdef CONFIG_DEBUG_VM
293static ssize_t debug_cow_show(struct kobject *kobj,
294 struct kobj_attribute *attr, char *buf)
295{
296 return single_flag_show(kobj, attr, buf,
297 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
298}
299static ssize_t debug_cow_store(struct kobject *kobj,
300 struct kobj_attribute *attr,
301 const char *buf, size_t count)
302{
303 return single_flag_store(kobj, attr, buf, count,
304 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
305}
306static struct kobj_attribute debug_cow_attr =
307 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
308#endif /* CONFIG_DEBUG_VM */
309
310static struct attribute *hugepage_attr[] = {
311 &enabled_attr.attr,
312 &defrag_attr.attr,
313#ifdef CONFIG_DEBUG_VM
314 &debug_cow_attr.attr,
315#endif
316 NULL,
317};
318
319static struct attribute_group hugepage_attr_group = {
320 .attrs = hugepage_attr,
321};
322
323static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
324 struct kobj_attribute *attr,
325 char *buf)
326{
327 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
328}
329
330static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
331 struct kobj_attribute *attr,
332 const char *buf, size_t count)
333{
334 unsigned long msecs;
335 int err;
336
337 err = strict_strtoul(buf, 10, &msecs);
338 if (err || msecs > UINT_MAX)
339 return -EINVAL;
340
341 khugepaged_scan_sleep_millisecs = msecs;
342 wake_up_interruptible(&khugepaged_wait);
343
344 return count;
345}
346static struct kobj_attribute scan_sleep_millisecs_attr =
347 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
348 scan_sleep_millisecs_store);
349
350static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
351 struct kobj_attribute *attr,
352 char *buf)
353{
354 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
355}
356
357static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
358 struct kobj_attribute *attr,
359 const char *buf, size_t count)
360{
361 unsigned long msecs;
362 int err;
363
364 err = strict_strtoul(buf, 10, &msecs);
365 if (err || msecs > UINT_MAX)
366 return -EINVAL;
367
368 khugepaged_alloc_sleep_millisecs = msecs;
369 wake_up_interruptible(&khugepaged_wait);
370
371 return count;
372}
373static struct kobj_attribute alloc_sleep_millisecs_attr =
374 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
375 alloc_sleep_millisecs_store);
376
377static ssize_t pages_to_scan_show(struct kobject *kobj,
378 struct kobj_attribute *attr,
379 char *buf)
380{
381 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
382}
383static ssize_t pages_to_scan_store(struct kobject *kobj,
384 struct kobj_attribute *attr,
385 const char *buf, size_t count)
386{
387 int err;
388 unsigned long pages;
389
390 err = strict_strtoul(buf, 10, &pages);
391 if (err || !pages || pages > UINT_MAX)
392 return -EINVAL;
393
394 khugepaged_pages_to_scan = pages;
395
396 return count;
397}
398static struct kobj_attribute pages_to_scan_attr =
399 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
400 pages_to_scan_store);
401
402static ssize_t pages_collapsed_show(struct kobject *kobj,
403 struct kobj_attribute *attr,
404 char *buf)
405{
406 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
407}
408static struct kobj_attribute pages_collapsed_attr =
409 __ATTR_RO(pages_collapsed);
410
411static ssize_t full_scans_show(struct kobject *kobj,
412 struct kobj_attribute *attr,
413 char *buf)
414{
415 return sprintf(buf, "%u\n", khugepaged_full_scans);
416}
417static struct kobj_attribute full_scans_attr =
418 __ATTR_RO(full_scans);
419
420static ssize_t khugepaged_defrag_show(struct kobject *kobj,
421 struct kobj_attribute *attr, char *buf)
422{
423 return single_flag_show(kobj, attr, buf,
424 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
425}
426static ssize_t khugepaged_defrag_store(struct kobject *kobj,
427 struct kobj_attribute *attr,
428 const char *buf, size_t count)
429{
430 return single_flag_store(kobj, attr, buf, count,
431 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
432}
433static struct kobj_attribute khugepaged_defrag_attr =
434 __ATTR(defrag, 0644, khugepaged_defrag_show,
435 khugepaged_defrag_store);
436
437/*
438 * max_ptes_none controls whether khugepaged may collapse hugepages
439 * over unmapped (none) ptes, which potentially increases the memory
440 * footprint of the vmas. When max_ptes_none is 0, khugepaged will not
441 * reduce the available free memory in the system as it
442 * runs. Increasing max_ptes_none will instead potentially reduce the
443 * free memory in the system during the khugepaged scan.
444 */
445static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
446 struct kobj_attribute *attr,
447 char *buf)
448{
449 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
450}
451static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
452 struct kobj_attribute *attr,
453 const char *buf, size_t count)
454{
455 int err;
456 unsigned long max_ptes_none;
457
458 err = strict_strtoul(buf, 10, &max_ptes_none);
459 if (err || max_ptes_none > HPAGE_PMD_NR-1)
460 return -EINVAL;
461
462 khugepaged_max_ptes_none = max_ptes_none;
463
464 return count;
465}
466static struct kobj_attribute khugepaged_max_ptes_none_attr =
467 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
468 khugepaged_max_ptes_none_store);
469
470static struct attribute *khugepaged_attr[] = {
471 &khugepaged_defrag_attr.attr,
472 &khugepaged_max_ptes_none_attr.attr,
473 &pages_to_scan_attr.attr,
474 &pages_collapsed_attr.attr,
475 &full_scans_attr.attr,
476 &scan_sleep_millisecs_attr.attr,
477 &alloc_sleep_millisecs_attr.attr,
478 NULL,
479};
480
481static struct attribute_group khugepaged_attr_group = {
482 .attrs = khugepaged_attr,
483 .name = "khugepaged",
484};
485#endif /* CONFIG_SYSFS */
486
487static int __init hugepage_init(void)
488{
489 int err;
490#ifdef CONFIG_SYSFS
491 static struct kobject *hugepage_kobj;
492#endif
493
494 err = -EINVAL;
495 if (!has_transparent_hugepage()) {
496 transparent_hugepage_flags = 0;
497 goto out;
498 }
499
500#ifdef CONFIG_SYSFS
501 err = -ENOMEM;
502 hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
503 if (unlikely(!hugepage_kobj)) {
504 printk(KERN_ERR "hugepage: failed kobject create\n");
505 goto out;
506 }
507
508 err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
509 if (err) {
510 printk(KERN_ERR "hugepage: failed to register hugepage group\n");
511 goto out;
512 }
513
514 err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
515 if (err) {
516 printk(KERN_ERR "hugepage: failed to register khugepaged group\n");
517 goto out;
518 }
519#endif
520
521 err = khugepaged_slab_init();
522 if (err)
523 goto out;
524
525 err = mm_slots_hash_init();
526 if (err) {
527 khugepaged_slab_free();
528 goto out;
529 }
530
531 /*
532 * By default disable transparent hugepages on smaller systems,
533 * where the extra memory used could hurt more than TLB overhead
534 * is likely to save. The admin can still enable it through /sys.
535 */
536 if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
537 transparent_hugepage_flags = 0;
538
539 start_khugepaged();
540
541 set_recommended_min_free_kbytes();
542
543out:
544 return err;
545}
546module_init(hugepage_init)
547
548static int __init setup_transparent_hugepage(char *str)
549{
550 int ret = 0;
551 if (!str)
552 goto out;
553 if (!strcmp(str, "always")) {
554 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
555 &transparent_hugepage_flags);
556 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
557 &transparent_hugepage_flags);
558 ret = 1;
559 } else if (!strcmp(str, "madvise")) {
560 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
561 &transparent_hugepage_flags);
562 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
563 &transparent_hugepage_flags);
564 ret = 1;
565 } else if (!strcmp(str, "never")) {
566 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
567 &transparent_hugepage_flags);
568 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
569 &transparent_hugepage_flags);
570 ret = 1;
571 }
572out:
573 if (!ret)
574 printk(KERN_WARNING
575 "transparent_hugepage= cannot parse, ignored\n");
576 return ret;
577}
578__setup("transparent_hugepage=", setup_transparent_hugepage);
579
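/*
 * Deposit a preallocated pte page table on the per-mm FIFO list so it
 * can be withdrawn later by get_pmd_huge_pte() when the huge pmd is
 * split or zapped.  Caller must hold mm->page_table_lock.
 */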
580static void prepare_pmd_huge_pte(pgtable_t pgtable,
581 struct mm_struct *mm)
582{
583 assert_spin_locked(&mm->page_table_lock);
584
585 /* FIFO */
586 if (!mm->pmd_huge_pte)
587 INIT_LIST_HEAD(&pgtable->lru);
588 else
589 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
590 mm->pmd_huge_pte = pgtable;
591}
592
593static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
594{
595 if (likely(vma->vm_flags & VM_WRITE))
596 pmd = pmd_mkwrite(pmd);
597 return pmd;
598}
599
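/*
 * Map a freshly allocated hugepage at @haddr: clear it, take the
 * page_table_lock and install the huge pmd.  Returns VM_FAULT_OOM if
 * the pte page table cannot be preallocated, 0 otherwise (including
 * the benign race where another thread populated the pmd first).
 */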
600static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
601 struct vm_area_struct *vma,
602 unsigned long haddr, pmd_t *pmd,
603 struct page *page)
604{
605 int ret = 0;
606 pgtable_t pgtable;
607
608 VM_BUG_ON(!PageCompound(page));
609 pgtable = pte_alloc_one(mm, haddr);
610 if (unlikely(!pgtable)) {
611 mem_cgroup_uncharge_page(page);
612 put_page(page);
613 return VM_FAULT_OOM;
614 }
615
616 clear_huge_page(page, haddr, HPAGE_PMD_NR);
617 __SetPageUptodate(page);
618
619 spin_lock(&mm->page_table_lock);
620 if (unlikely(!pmd_none(*pmd))) {
621 spin_unlock(&mm->page_table_lock);
622 mem_cgroup_uncharge_page(page);
623 put_page(page);
624 pte_free(mm, pgtable);
625 } else {
626 pmd_t entry;
627 entry = mk_pmd(page, vma->vm_page_prot);
628 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
629 entry = pmd_mkhuge(entry);
630 /*
631 * The spinlocking to take the lru_lock inside
632 * page_add_new_anon_rmap() acts as a full memory
633 * barrier to be sure the clear_huge_page writes become
634 * visible before the set_pmd_at() write.
635 */
636 page_add_new_anon_rmap(page, vma, haddr);
637 set_pmd_at(mm, haddr, pmd, entry);
638 prepare_pmd_huge_pte(pgtable, mm);
639 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
640 spin_unlock(&mm->page_table_lock);
641 }
642
643 return ret;
644}
645
646static inline gfp_t alloc_hugepage_gfpmask(int defrag)
647{
648 return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
649}
650
651static inline struct page *alloc_hugepage_vma(int defrag,
652 struct vm_area_struct *vma,
653 unsigned long haddr)
654{
655 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
656 HPAGE_PMD_ORDER, vma, haddr);
657}
658
659#ifndef CONFIG_NUMA
660static inline struct page *alloc_hugepage(int defrag)
661{
662 return alloc_pages(alloc_hugepage_gfpmask(defrag),
663 HPAGE_PMD_ORDER);
664}
665#endif
666
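/*
 * Anonymous hugepage fault handler: if the pmd-aligned range around
 * @address fits entirely inside the vma, try to allocate and map a
 * hugepage; otherwise fall back to the regular pte fault path below.
 */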
667int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
668 unsigned long address, pmd_t *pmd,
669 unsigned int flags)
670{
671 struct page *page;
672 unsigned long haddr = address & HPAGE_PMD_MASK;
673 pte_t *pte;
674
675 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
676 if (unlikely(anon_vma_prepare(vma)))
677 return VM_FAULT_OOM;
678 if (unlikely(khugepaged_enter(vma)))
679 return VM_FAULT_OOM;
680 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
681 vma, haddr);
682 if (unlikely(!page))
683 goto out;
684 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
685 put_page(page);
686 goto out;
687 }
688
689 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
690 }
691out:
692 /*
693 * Use __pte_alloc instead of pte_alloc_map, because we can't
694 * run pte_offset_map on the pmd, as a huge pmd could
695 * materialize from under us from a different thread.
696 */
697 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
698 return VM_FAULT_OOM;
699 /* if an huge pmd materialized from under us just retry later */
700 if (unlikely(pmd_trans_huge(*pmd)))
701 return 0;
702 /*
703 * A regular pmd is established and it can't morph into a huge pmd
704 * from under us anymore at this point because we hold the mmap_sem
705 * read mode and khugepaged takes it in write mode. So now it's
706 * safe to run pte_offset_map().
707 */
708 pte = pte_offset_map(pmd, address);
709 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
710}
711
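/*
 * Duplicate a transparent huge pmd at fork(): write-protect the
 * parent's pmd and map the same hugepage read-only in the child.
 * Returns -EAGAIN if the source pmd is not (or is no longer) a stable
 * huge pmd, so the caller can fall back to copying regular ptes.
 */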
712int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
713 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
714 struct vm_area_struct *vma)
715{
716 struct page *src_page;
717 pmd_t pmd;
718 pgtable_t pgtable;
719 int ret;
720
721 ret = -ENOMEM;
722 pgtable = pte_alloc_one(dst_mm, addr);
723 if (unlikely(!pgtable))
724 goto out;
725
726 spin_lock(&dst_mm->page_table_lock);
727 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
728
729 ret = -EAGAIN;
730 pmd = *src_pmd;
731 if (unlikely(!pmd_trans_huge(pmd))) {
732 pte_free(dst_mm, pgtable);
733 goto out_unlock;
734 }
735 if (unlikely(pmd_trans_splitting(pmd))) {
736 /* split huge page running from under us */
737 spin_unlock(&src_mm->page_table_lock);
738 spin_unlock(&dst_mm->page_table_lock);
739 pte_free(dst_mm, pgtable);
740
741 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
742 goto out;
743 }
744 src_page = pmd_page(pmd);
745 VM_BUG_ON(!PageHead(src_page));
746 get_page(src_page);
747 page_dup_rmap(src_page);
748 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
749
750 pmdp_set_wrprotect(src_mm, addr, src_pmd);
751 pmd = pmd_mkold(pmd_wrprotect(pmd));
752 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
753 prepare_pmd_huge_pte(pgtable, dst_mm);
754
755 ret = 0;
756out_unlock:
757 spin_unlock(&src_mm->page_table_lock);
758 spin_unlock(&dst_mm->page_table_lock);
759out:
760 return ret;
761}
762
763/* no "address" argument, so it destroys page coloring on some archs */
764pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
765{
766 pgtable_t pgtable;
767
768 assert_spin_locked(&mm->page_table_lock);
769
770 /* FIFO */
771 pgtable = mm->pmd_huge_pte;
772 if (list_empty(&pgtable->lru))
773 mm->pmd_huge_pte = NULL;
774 else {
775 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
776 struct page, lru);
777 list_del(&pgtable->lru);
778 }
779 return pgtable;
780}
781
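/*
 * COW fault fallback used when no replacement hugepage can be
 * allocated: copy the data into HPAGE_PMD_NR small pages and remap
 * the range with regular ptes, effectively splitting the hugepage.
 */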
782static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
783 struct vm_area_struct *vma,
784 unsigned long address,
785 pmd_t *pmd, pmd_t orig_pmd,
786 struct page *page,
787 unsigned long haddr)
788{
789 pgtable_t pgtable;
790 pmd_t _pmd;
791 int ret = 0, i;
792 struct page **pages;
793
794 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
795 GFP_KERNEL);
796 if (unlikely(!pages)) {
797 ret |= VM_FAULT_OOM;
798 goto out;
799 }
800
801 for (i = 0; i < HPAGE_PMD_NR; i++) {
802 pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
803 vma, address);
804 if (unlikely(!pages[i] ||
805 mem_cgroup_newpage_charge(pages[i], mm,
806 GFP_KERNEL))) {
807 if (pages[i])
808 put_page(pages[i]);
809 mem_cgroup_uncharge_start();
810 while (--i >= 0) {
811 mem_cgroup_uncharge_page(pages[i]);
812 put_page(pages[i]);
813 }
814 mem_cgroup_uncharge_end();
815 kfree(pages);
816 ret |= VM_FAULT_OOM;
817 goto out;
818 }
819 }
820
821 for (i = 0; i < HPAGE_PMD_NR; i++) {
822 copy_user_highpage(pages[i], page + i,
823 haddr + PAGE_SIZE*i, vma);
824 __SetPageUptodate(pages[i]);
825 cond_resched();
826 }
827
828 spin_lock(&mm->page_table_lock);
829 if (unlikely(!pmd_same(*pmd, orig_pmd)))
830 goto out_free_pages;
831 VM_BUG_ON(!PageHead(page));
832
833 pmdp_clear_flush_notify(vma, haddr, pmd);
834 /* leave pmd empty until pte is filled */
835
836 pgtable = get_pmd_huge_pte(mm);
837 pmd_populate(mm, &_pmd, pgtable);
838
839 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
840 pte_t *pte, entry;
841 entry = mk_pte(pages[i], vma->vm_page_prot);
842 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
843 page_add_new_anon_rmap(pages[i], vma, haddr);
844 pte = pte_offset_map(&_pmd, haddr);
845 VM_BUG_ON(!pte_none(*pte));
846 set_pte_at(mm, haddr, pte, entry);
847 pte_unmap(pte);
848 }
849 kfree(pages);
850
851 mm->nr_ptes++;
852 smp_wmb(); /* make pte visible before pmd */
853 pmd_populate(mm, pmd, pgtable);
854 page_remove_rmap(page);
855 spin_unlock(&mm->page_table_lock);
856
857 ret |= VM_FAULT_WRITE;
858 put_page(page);
859
860out:
861 return ret;
862
863out_free_pages:
864 spin_unlock(&mm->page_table_lock);
865 mem_cgroup_uncharge_start();
866 for (i = 0; i < HPAGE_PMD_NR; i++) {
867 mem_cgroup_uncharge_page(pages[i]);
868 put_page(pages[i]);
869 }
870 mem_cgroup_uncharge_end();
871 kfree(pages);
872 goto out;
873}
874
875int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
876 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
877{
878 int ret = 0;
879 struct page *page, *new_page;
880 unsigned long haddr;
881
882 VM_BUG_ON(!vma->anon_vma);
883 spin_lock(&mm->page_table_lock);
884 if (unlikely(!pmd_same(*pmd, orig_pmd)))
885 goto out_unlock;
886
887 page = pmd_page(orig_pmd);
888 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
889 haddr = address & HPAGE_PMD_MASK;
890 if (page_mapcount(page) == 1) {
891 pmd_t entry;
892 entry = pmd_mkyoung(orig_pmd);
893 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
894 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
895 update_mmu_cache(vma, address, entry);
896 ret |= VM_FAULT_WRITE;
897 goto out_unlock;
898 }
899 get_page(page);
900 spin_unlock(&mm->page_table_lock);
901
902 if (transparent_hugepage_enabled(vma) &&
903 !transparent_hugepage_debug_cow())
904 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
905 vma, haddr);
906 else
907 new_page = NULL;
908
909 if (unlikely(!new_page)) {
910 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
911 pmd, orig_pmd, page, haddr);
912 put_page(page);
913 goto out;
914 }
915
916 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
917 put_page(new_page);
918 put_page(page);
919 ret |= VM_FAULT_OOM;
920 goto out;
921 }
922
923 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
924 __SetPageUptodate(new_page);
925
926 spin_lock(&mm->page_table_lock);
927 put_page(page);
928 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
929 mem_cgroup_uncharge_page(new_page);
930 put_page(new_page);
931 } else {
932 pmd_t entry;
933 VM_BUG_ON(!PageHead(page));
934 entry = mk_pmd(new_page, vma->vm_page_prot);
935 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
936 entry = pmd_mkhuge(entry);
937 pmdp_clear_flush_notify(vma, haddr, pmd);
938 page_add_new_anon_rmap(new_page, vma, haddr);
939 set_pmd_at(mm, haddr, pmd, entry);
940 update_mmu_cache(vma, address, entry);
941 page_remove_rmap(page);
942 put_page(page);
943 ret |= VM_FAULT_WRITE;
944 }
945out_unlock:
946 spin_unlock(&mm->page_table_lock);
947out:
948 return ret;
949}
950
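/*
 * follow_page() helper for transparent huge pmds: return the subpage
 * corresponding to @addr, honouring FOLL_WRITE, FOLL_TOUCH and
 * FOLL_GET.  Caller must hold mm->page_table_lock.
 */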
951struct page *follow_trans_huge_pmd(struct mm_struct *mm,
952 unsigned long addr,
953 pmd_t *pmd,
954 unsigned int flags)
955{
956 struct page *page = NULL;
957
958 assert_spin_locked(&mm->page_table_lock);
959
960 if (flags & FOLL_WRITE && !pmd_write(*pmd))
961 goto out;
962
963 page = pmd_page(*pmd);
964 VM_BUG_ON(!PageHead(page));
965 if (flags & FOLL_TOUCH) {
966 pmd_t _pmd;
967 /*
968 * We should set the dirty bit only for FOLL_WRITE but
969 * for now the dirty bit in the pmd is meaningless.
970		 * If the dirty bit ever becomes meaningful and we
971		 * only set it for FOLL_WRITE, an atomic set_bit will
972		 * be required on the pmd to set the young bit,
973		 * instead of the current set_pmd_at.
974 */
975 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
976 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
977 }
978 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
979 VM_BUG_ON(!PageCompound(page));
980 if (flags & FOLL_GET)
981 get_page(page);
982
983out:
984 return page;
985}
986
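/*
 * Tear down a huge pmd during zap/munmap.  Returns 1 if a huge pmd
 * was cleared here, 0 if the pmd was not (or no longer) huge and the
 * caller must zap it as regular ptes.
 */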
987int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
988 pmd_t *pmd)
989{
990 int ret = 0;
991
992 spin_lock(&tlb->mm->page_table_lock);
993 if (likely(pmd_trans_huge(*pmd))) {
994 if (unlikely(pmd_trans_splitting(*pmd))) {
995 spin_unlock(&tlb->mm->page_table_lock);
996 wait_split_huge_page(vma->anon_vma,
997 pmd);
998 } else {
999 struct page *page;
1000 pgtable_t pgtable;
1001 pgtable = get_pmd_huge_pte(tlb->mm);
1002 page = pmd_page(*pmd);
1003 pmd_clear(pmd);
1004 page_remove_rmap(page);
1005 VM_BUG_ON(page_mapcount(page) < 0);
1006 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1007 VM_BUG_ON(!PageHead(page));
1008 spin_unlock(&tlb->mm->page_table_lock);
1009 tlb_remove_page(tlb, page);
1010 pte_free(tlb->mm, pgtable);
1011 ret = 1;
1012 }
1013 } else
1014 spin_unlock(&tlb->mm->page_table_lock);
1015
1016 return ret;
1017}
1018
1019int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1020 unsigned long addr, unsigned long end,
1021 unsigned char *vec)
1022{
1023 int ret = 0;
1024
1025 spin_lock(&vma->vm_mm->page_table_lock);
1026 if (likely(pmd_trans_huge(*pmd))) {
1027 ret = !pmd_trans_splitting(*pmd);
1028 spin_unlock(&vma->vm_mm->page_table_lock);
1029 if (unlikely(!ret))
1030 wait_split_huge_page(vma->anon_vma, pmd);
1031 else {
1032 /*
1033 * All logical pages in the range are present
1034 * if backed by a huge page.
1035 */
1036 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1037 }
1038 } else
1039 spin_unlock(&vma->vm_mm->page_table_lock);
1040
1041 return ret;
1042}
1043
1044int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1045 unsigned long addr, pgprot_t newprot)
1046{
1047 struct mm_struct *mm = vma->vm_mm;
1048 int ret = 0;
1049
1050 spin_lock(&mm->page_table_lock);
1051 if (likely(pmd_trans_huge(*pmd))) {
1052 if (unlikely(pmd_trans_splitting(*pmd))) {
1053 spin_unlock(&mm->page_table_lock);
1054 wait_split_huge_page(vma->anon_vma, pmd);
1055 } else {
1056 pmd_t entry;
1057
1058 entry = pmdp_get_and_clear(mm, addr, pmd);
1059 entry = pmd_modify(entry, newprot);
1060 set_pmd_at(mm, addr, pmd, entry);
1061 spin_unlock(&vma->vm_mm->page_table_lock);
1062 flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
1063 ret = 1;
1064 }
1065 } else
1066 spin_unlock(&vma->vm_mm->page_table_lock);
1067
1068 return ret;
1069}
1070
1071pmd_t *page_check_address_pmd(struct page *page,
1072 struct mm_struct *mm,
1073 unsigned long address,
1074 enum page_check_address_pmd_flag flag)
1075{
1076 pgd_t *pgd;
1077 pud_t *pud;
1078 pmd_t *pmd, *ret = NULL;
1079
1080 if (address & ~HPAGE_PMD_MASK)
1081 goto out;
1082
1083 pgd = pgd_offset(mm, address);
1084 if (!pgd_present(*pgd))
1085 goto out;
1086
1087 pud = pud_offset(pgd, address);
1088 if (!pud_present(*pud))
1089 goto out;
1090
1091 pmd = pmd_offset(pud, address);
1092 if (pmd_none(*pmd))
1093 goto out;
1094 if (pmd_page(*pmd) != page)
1095 goto out;
1096 /*
1097 * split_vma() may create temporary aliased mappings. There is
1098 * no risk as long as all huge pmds are found and have their
1099 * splitting bit set before __split_huge_page_refcount
1100 * runs. Finding the same huge pmd more than once during the
1101 * same rmap walk is not a problem.
1102 */
1103 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1104 pmd_trans_splitting(*pmd))
1105 goto out;
1106 if (pmd_trans_huge(*pmd)) {
1107 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1108 !pmd_trans_splitting(*pmd));
1109 ret = pmd;
1110 }
1111out:
1112 return ret;
1113}
1114
1115static int __split_huge_page_splitting(struct page *page,
1116 struct vm_area_struct *vma,
1117 unsigned long address)
1118{
1119 struct mm_struct *mm = vma->vm_mm;
1120 pmd_t *pmd;
1121 int ret = 0;
1122
1123 spin_lock(&mm->page_table_lock);
1124 pmd = page_check_address_pmd(page, mm, address,
1125 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1126 if (pmd) {
1127 /*
1128 * We can't temporarily set the pmd to null in order
1129 * to split it, the pmd must remain marked huge at all
1130 * times or the VM won't take the pmd_trans_huge paths
1131 * and it won't wait on the anon_vma->root->lock to
1132 * serialize against split_huge_page*.
1133 */
1134 pmdp_splitting_flush_notify(vma, address, pmd);
1135 ret = 1;
1136 }
1137 spin_unlock(&mm->page_table_lock);
1138
1139 return ret;
1140}
1141
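/*
 * Second phase of split_huge_page(): distribute the head page's
 * refcount, mapcount and flags to the tail pages and clear the
 * compound state, turning the hugepage into HPAGE_PMD_NR independent
 * pages.  Runs under the zone lru_lock and the compound lock.
 */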
1142static void __split_huge_page_refcount(struct page *page)
1143{
1144 int i;
1145 unsigned long head_index = page->index;
1146 struct zone *zone = page_zone(page);
1147 int zonestat;
1148
1149 /* prevent PageLRU to go away from under us, and freeze lru stats */
1150 spin_lock_irq(&zone->lru_lock);
1151 compound_lock(page);
1152
1153 for (i = 1; i < HPAGE_PMD_NR; i++) {
1154 struct page *page_tail = page + i;
1155
1156 /* tail_page->_count cannot change */
1157 atomic_sub(atomic_read(&page_tail->_count), &page->_count);
1158 BUG_ON(page_count(page) <= 0);
1159 atomic_add(page_mapcount(page) + 1, &page_tail->_count);
1160 BUG_ON(atomic_read(&page_tail->_count) <= 0);
1161
1162 /* after clearing PageTail the gup refcount can be released */
1163 smp_mb();
1164
1165 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1166 page_tail->flags |= (page->flags &
1167 ((1L << PG_referenced) |
1168 (1L << PG_swapbacked) |
1169 (1L << PG_mlocked) |
1170 (1L << PG_uptodate)));
1171 page_tail->flags |= (1L << PG_dirty);
1172
1173 /*
1174 * 1) clear PageTail before overwriting first_page
1175 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1176 */
1177 smp_wmb();
1178
1179 /*
1180 * __split_huge_page_splitting() already set the
1181 * splitting bit in all pmd that could map this
1182 * hugepage, that will ensure no CPU can alter the
1183 * mapcount on the head page. The mapcount is only
1184 * accounted in the head page and it has to be
1185 * transferred to all tail pages in the below code. So
1186		 * for this code to be safe, during the split the mapcount
1187 * can't change. But that doesn't mean userland can't
1188 * keep changing and reading the page contents while
1189 * we transfer the mapcount, so the pmd splitting
1190 * status is achieved setting a reserved bit in the
1191 * pmd, not by clearing the present bit.
1192 */
1193 BUG_ON(page_mapcount(page_tail));
1194 page_tail->_mapcount = page->_mapcount;
1195
1196 BUG_ON(page_tail->mapping);
1197 page_tail->mapping = page->mapping;
1198
1199 page_tail->index = ++head_index;
1200
1201 BUG_ON(!PageAnon(page_tail));
1202 BUG_ON(!PageUptodate(page_tail));
1203 BUG_ON(!PageDirty(page_tail));
1204 BUG_ON(!PageSwapBacked(page_tail));
1205
1206 lru_add_page_tail(zone, page, page_tail);
1207 }
1208
1209 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1210 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1211
1212 /*
1213 * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
1214 * so adjust those appropriately if this page is on the LRU.
1215 */
1216 if (PageLRU(page)) {
1217 zonestat = NR_LRU_BASE + page_lru(page);
1218 __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
1219 }
1220
1221 ClearPageCompound(page);
1222 compound_unlock(page);
1223 spin_unlock_irq(&zone->lru_lock);
1224
1225 for (i = 1; i < HPAGE_PMD_NR; i++) {
1226 struct page *page_tail = page + i;
1227 BUG_ON(page_count(page_tail) <= 0);
1228 /*
1229 * Tail pages may be freed if there wasn't any mapping
1230		 * e.g. if add_to_swap() is running on a lru page that
1231 * had its mapping zapped. And freeing these pages
1232 * requires taking the lru_lock so we do the put_page
1233 * of the tail pages after the split is complete.
1234 */
1235 put_page(page_tail);
1236 }
1237
1238 /*
1239 * Only the head page (now become a regular page) is required
1240 * to be pinned by the caller.
1241 */
1242 BUG_ON(page_count(page) <= 0);
1243}
1244
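/*
 * Final phase of the split for one vma: replace the huge pmd with a
 * page table mapping the individual subpages, preserving the write
 * and young bits of the original pmd.
 */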
1245static int __split_huge_page_map(struct page *page,
1246 struct vm_area_struct *vma,
1247 unsigned long address)
1248{
1249 struct mm_struct *mm = vma->vm_mm;
1250 pmd_t *pmd, _pmd;
1251 int ret = 0, i;
1252 pgtable_t pgtable;
1253 unsigned long haddr;
1254
1255 spin_lock(&mm->page_table_lock);
1256 pmd = page_check_address_pmd(page, mm, address,
1257 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1258 if (pmd) {
1259 pgtable = get_pmd_huge_pte(mm);
1260 pmd_populate(mm, &_pmd, pgtable);
1261
1262 for (i = 0, haddr = address; i < HPAGE_PMD_NR;
1263 i++, haddr += PAGE_SIZE) {
1264 pte_t *pte, entry;
1265 BUG_ON(PageCompound(page+i));
1266 entry = mk_pte(page + i, vma->vm_page_prot);
1267 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1268 if (!pmd_write(*pmd))
1269 entry = pte_wrprotect(entry);
1270 else
1271 BUG_ON(page_mapcount(page) != 1);
1272 if (!pmd_young(*pmd))
1273 entry = pte_mkold(entry);
1274 pte = pte_offset_map(&_pmd, haddr);
1275 BUG_ON(!pte_none(*pte));
1276 set_pte_at(mm, haddr, pte, entry);
1277 pte_unmap(pte);
1278 }
1279
1280 mm->nr_ptes++;
1281 smp_wmb(); /* make pte visible before pmd */
1282 /*
1283 * Up to this point the pmd is present and huge and
1284		 * userland has full access to the hugepage
1285 * during the split (which happens in place). If we
1286 * overwrite the pmd with the not-huge version
1287 * pointing to the pte here (which of course we could
1288 * if all CPUs were bug free), userland could trigger
1289 * a small page size TLB miss on the small sized TLB
1290 * while the hugepage TLB entry is still established
1291 * in the huge TLB. Some CPU doesn't like that. See
1292 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
1293		 * Erratum 383 on page 93. Intel should be safe but it
1294		 * also warns that it's only safe if the permission
1295		 * and cache attributes of the two entries loaded in
1296		 * the two TLBs are identical (which should be the case
1297 * here). But it is generally safer to never allow
1298 * small and huge TLB entries for the same virtual
1299 * address to be loaded simultaneously. So instead of
1300 * doing "pmd_populate(); flush_tlb_range();" we first
1301 * mark the current pmd notpresent (atomically because
1302 * here the pmd_trans_huge and pmd_trans_splitting
1303 * must remain set at all times on the pmd until the
1304 * split is complete for this pmd), then we flush the
1305 * SMP TLB and finally we write the non-huge version
1306 * of the pmd entry with pmd_populate.
1307 */
1308 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
1309 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1310 pmd_populate(mm, pmd, pgtable);
1311 ret = 1;
1312 }
1313 spin_unlock(&mm->page_table_lock);
1314
1315 return ret;
1316}
1317
1318/* must be called with anon_vma->root->lock held */
1319static void __split_huge_page(struct page *page,
1320 struct anon_vma *anon_vma)
1321{
1322 int mapcount, mapcount2;
1323 struct anon_vma_chain *avc;
1324
1325 BUG_ON(!PageHead(page));
1326 BUG_ON(PageTail(page));
1327
1328 mapcount = 0;
1329 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1330 struct vm_area_struct *vma = avc->vma;
1331 unsigned long addr = vma_address(page, vma);
1332 BUG_ON(is_vma_temporary_stack(vma));
1333 if (addr == -EFAULT)
1334 continue;
1335 mapcount += __split_huge_page_splitting(page, vma, addr);
1336 }
1337 /*
1338 * It is critical that new vmas are added to the tail of the
1339 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1340 * and establishes a child pmd before
1341 * __split_huge_page_splitting() freezes the parent pmd (so if
1342 * we fail to prevent copy_huge_pmd() from running until the
1343 * whole __split_huge_page() is complete), we will still see
1344 * the newly established pmd of the child later during the
1345 * walk, to be able to set it as pmd_trans_splitting too.
1346 */
1347 if (mapcount != page_mapcount(page))
1348 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1349 mapcount, page_mapcount(page));
1350 BUG_ON(mapcount != page_mapcount(page));
1351
1352 __split_huge_page_refcount(page);
1353
1354 mapcount2 = 0;
1355 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1356 struct vm_area_struct *vma = avc->vma;
1357 unsigned long addr = vma_address(page, vma);
1358 BUG_ON(is_vma_temporary_stack(vma));
1359 if (addr == -EFAULT)
1360 continue;
1361 mapcount2 += __split_huge_page_map(page, vma, addr);
1362 }
1363 if (mapcount != mapcount2)
1364 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1365 mapcount, mapcount2, page_mapcount(page));
1366 BUG_ON(mapcount != mapcount2);
1367}
1368
1369int split_huge_page(struct page *page)
1370{
1371 struct anon_vma *anon_vma;
1372 int ret = 1;
1373
1374 BUG_ON(!PageAnon(page));
1375 anon_vma = page_lock_anon_vma(page);
1376 if (!anon_vma)
1377 goto out;
1378 ret = 0;
1379 if (!PageCompound(page))
1380 goto out_unlock;
1381
1382 BUG_ON(!PageSwapBacked(page));
1383 __split_huge_page(page, anon_vma);
1384
1385 BUG_ON(PageCompound(page));
1386out_unlock:
1387 page_unlock_anon_vma(anon_vma);
1388out:
1389 return ret;
1390}
1391
1392int hugepage_madvise(struct vm_area_struct *vma,
1393 unsigned long *vm_flags, int advice)
1394{
1395 switch (advice) {
1396 case MADV_HUGEPAGE:
1397 /*
1398 * Be somewhat over-protective like KSM for now!
1399 */
1400 if (*vm_flags & (VM_HUGEPAGE |
1401 VM_SHARED | VM_MAYSHARE |
1402 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1403 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1404 VM_MIXEDMAP | VM_SAO))
1405 return -EINVAL;
1406 *vm_flags &= ~VM_NOHUGEPAGE;
1407 *vm_flags |= VM_HUGEPAGE;
1408 /*
1409		 * If the vma becomes good for khugepaged to scan,
1410		 * register it here without waiting for a page fault that
1411 * may not happen any time soon.
1412 */
1413 if (unlikely(khugepaged_enter_vma_merge(vma)))
1414 return -ENOMEM;
1415 break;
1416 case MADV_NOHUGEPAGE:
1417 /*
1418 * Be somewhat over-protective like KSM for now!
1419 */
1420 if (*vm_flags & (VM_NOHUGEPAGE |
1421 VM_SHARED | VM_MAYSHARE |
1422 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1423 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1424 VM_MIXEDMAP | VM_SAO))
1425 return -EINVAL;
1426 *vm_flags &= ~VM_HUGEPAGE;
1427 *vm_flags |= VM_NOHUGEPAGE;
1428 /*
1429 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
1430 * this vma even if we leave the mm registered in khugepaged if
1431 * it got registered before VM_NOHUGEPAGE was set.
1432 */
1433 break;
1434 }
1435
1436 return 0;
1437}
1438
1439static int __init khugepaged_slab_init(void)
1440{
1441 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1442 sizeof(struct mm_slot),
1443 __alignof__(struct mm_slot), 0, NULL);
1444 if (!mm_slot_cache)
1445 return -ENOMEM;
1446
1447 return 0;
1448}
1449
1450static void __init khugepaged_slab_free(void)
1451{
1452 kmem_cache_destroy(mm_slot_cache);
1453 mm_slot_cache = NULL;
1454}
1455
1456static inline struct mm_slot *alloc_mm_slot(void)
1457{
1458 if (!mm_slot_cache) /* initialization failed */
1459 return NULL;
1460 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1461}
1462
1463static inline void free_mm_slot(struct mm_slot *mm_slot)
1464{
1465 kmem_cache_free(mm_slot_cache, mm_slot);
1466}
1467
1468static int __init mm_slots_hash_init(void)
1469{
1470 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1471 GFP_KERNEL);
1472 if (!mm_slots_hash)
1473 return -ENOMEM;
1474 return 0;
1475}
1476
1477#if 0
1478static void __init mm_slots_hash_free(void)
1479{
1480 kfree(mm_slots_hash);
1481 mm_slots_hash = NULL;
1482}
1483#endif
1484
1485static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1486{
1487 struct mm_slot *mm_slot;
1488 struct hlist_head *bucket;
1489 struct hlist_node *node;
1490
1491 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1492 % MM_SLOTS_HASH_HEADS];
1493 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1494 if (mm == mm_slot->mm)
1495 return mm_slot;
1496 }
1497 return NULL;
1498}
1499
1500static void insert_to_mm_slots_hash(struct mm_struct *mm,
1501 struct mm_slot *mm_slot)
1502{
1503 struct hlist_head *bucket;
1504
1505 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1506 % MM_SLOTS_HASH_HEADS];
1507 mm_slot->mm = mm;
1508 hlist_add_head(&mm_slot->hash, bucket);
1509}
1510
1511static inline int khugepaged_test_exit(struct mm_struct *mm)
1512{
1513 return atomic_read(&mm->mm_users) == 0;
1514}
1515
1516int __khugepaged_enter(struct mm_struct *mm)
1517{
1518 struct mm_slot *mm_slot;
1519 int wakeup;
1520
1521 mm_slot = alloc_mm_slot();
1522 if (!mm_slot)
1523 return -ENOMEM;
1524
1525 /* __khugepaged_exit() must not run from under us */
1526 VM_BUG_ON(khugepaged_test_exit(mm));
1527 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1528 free_mm_slot(mm_slot);
1529 return 0;
1530 }
1531
1532 spin_lock(&khugepaged_mm_lock);
1533 insert_to_mm_slots_hash(mm, mm_slot);
1534 /*
1535 * Insert just behind the scanning cursor, to let the area settle
1536 * down a little.
1537 */
1538 wakeup = list_empty(&khugepaged_scan.mm_head);
1539 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1540 spin_unlock(&khugepaged_mm_lock);
1541
1542 atomic_inc(&mm->mm_count);
1543 if (wakeup)
1544 wake_up_interruptible(&khugepaged_wait);
1545
1546 return 0;
1547}
1548
1549int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1550{
1551 unsigned long hstart, hend;
1552 if (!vma->anon_vma)
1553 /*
1554 * Not yet faulted in so we will register later in the
1555 * page fault if needed.
1556 */
1557 return 0;
1558 if (vma->vm_file || vma->vm_ops)
1559 /* khugepaged not yet working on file or special mappings */
1560 return 0;
1561 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1562 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1563 hend = vma->vm_end & HPAGE_PMD_MASK;
1564 if (hstart < hend)
1565 return khugepaged_enter(vma);
1566 return 0;
1567}
1568
1569void __khugepaged_exit(struct mm_struct *mm)
1570{
1571 struct mm_slot *mm_slot;
1572 int free = 0;
1573
1574 spin_lock(&khugepaged_mm_lock);
1575 mm_slot = get_mm_slot(mm);
1576 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1577 hlist_del(&mm_slot->hash);
1578 list_del(&mm_slot->mm_node);
1579 free = 1;
1580 }
1581
1582 if (free) {
1583 spin_unlock(&khugepaged_mm_lock);
1584 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1585 free_mm_slot(mm_slot);
1586 mmdrop(mm);
1587 } else if (mm_slot) {
1588 spin_unlock(&khugepaged_mm_lock);
1589 /*
1590 * This is required to serialize against
1591 * khugepaged_test_exit() (which is guaranteed to run
1592 * under mmap sem read mode). Stop here (after we
1593		 * return, all pagetables will be destroyed) until
1594 * khugepaged has finished working on the pagetables
1595 * under the mmap_sem.
1596 */
1597 down_write(&mm->mmap_sem);
1598 up_write(&mm->mmap_sem);
1599 } else
1600 spin_unlock(&khugepaged_mm_lock);
1601}
1602
1603static void release_pte_page(struct page *page)
1604{
1605 /* 0 stands for page_is_file_cache(page) == false */
1606 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
1607 unlock_page(page);
1608 putback_lru_page(page);
1609}
1610
1611static void release_pte_pages(pte_t *pte, pte_t *_pte)
1612{
1613 while (--_pte >= pte) {
1614 pte_t pteval = *_pte;
1615 if (!pte_none(pteval))
1616 release_pte_page(pte_page(pteval));
1617 }
1618}
1619
1620static void release_all_pte_pages(pte_t *pte)
1621{
1622 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1623}
1624
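/*
 * Validate and isolate the small pages that are about to be collapsed:
 * every present pte must map a writable, anonymous, unpinned page, at
 * most khugepaged_max_ptes_none ptes may be none, and at least one pte
 * or page must be young/referenced.  Returns 1 if all pages were
 * locked and isolated from the LRU.
 */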
1625static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1626 unsigned long address,
1627 pte_t *pte)
1628{
1629 struct page *page;
1630 pte_t *_pte;
1631 int referenced = 0, isolated = 0, none = 0;
1632 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1633 _pte++, address += PAGE_SIZE) {
1634 pte_t pteval = *_pte;
1635 if (pte_none(pteval)) {
1636 if (++none <= khugepaged_max_ptes_none)
1637 continue;
1638 else {
1639 release_pte_pages(pte, _pte);
1640 goto out;
1641 }
1642 }
1643 if (!pte_present(pteval) || !pte_write(pteval)) {
1644 release_pte_pages(pte, _pte);
1645 goto out;
1646 }
1647 page = vm_normal_page(vma, address, pteval);
1648 if (unlikely(!page)) {
1649 release_pte_pages(pte, _pte);
1650 goto out;
1651 }
1652 VM_BUG_ON(PageCompound(page));
1653 BUG_ON(!PageAnon(page));
1654 VM_BUG_ON(!PageSwapBacked(page));
1655
1656 /* cannot use mapcount: can't collapse if there's a gup pin */
1657 if (page_count(page) != 1) {
1658 release_pte_pages(pte, _pte);
1659 goto out;
1660 }
1661 /*
1662 * We can do it before isolate_lru_page because the
1663 * page can't be freed from under us. NOTE: PG_lock
1664 * is needed to serialize against split_huge_page
1665 * when invoked from the VM.
1666 */
1667 if (!trylock_page(page)) {
1668 release_pte_pages(pte, _pte);
1669 goto out;
1670 }
1671 /*
1672 * Isolate the page to avoid collapsing an hugepage
1673 * currently in use by the VM.
1674 */
1675 if (isolate_lru_page(page)) {
1676 unlock_page(page);
1677 release_pte_pages(pte, _pte);
1678 goto out;
1679 }
1680 /* 0 stands for page_is_file_cache(page) == false */
1681 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
1682 VM_BUG_ON(!PageLocked(page));
1683 VM_BUG_ON(PageLRU(page));
1684
1685 /* If no mapped pte is young, don't collapse the page */
1686 if (pte_young(pteval) || PageReferenced(page) ||
1687 mmu_notifier_test_young(vma->vm_mm, address))
1688 referenced = 1;
1689 }
1690 if (unlikely(!referenced))
1691 release_all_pte_pages(pte);
1692 else
1693 isolated = 1;
1694out:
1695 return isolated;
1696}
1697
1698static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1699 struct vm_area_struct *vma,
1700 unsigned long address,
1701 spinlock_t *ptl)
1702{
1703 pte_t *_pte;
1704 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
1705 pte_t pteval = *_pte;
1706 struct page *src_page;
1707
1708 if (pte_none(pteval)) {
1709 clear_user_highpage(page, address);
1710 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
1711 } else {
1712 src_page = pte_page(pteval);
1713 copy_user_highpage(page, src_page, address, vma);
1714 VM_BUG_ON(page_mapcount(src_page) != 1);
1715 VM_BUG_ON(page_count(src_page) != 2);
1716 release_pte_page(src_page);
1717 /*
1718 * ptl mostly unnecessary, but preempt has to
1719 * be disabled to update the per-cpu stats
1720 * inside page_remove_rmap().
1721 */
1722 spin_lock(ptl);
1723 /*
1724 * paravirt calls inside pte_clear here are
1725 * superfluous.
1726 */
1727 pte_clear(vma->vm_mm, address, _pte);
1728 page_remove_rmap(src_page);
1729 spin_unlock(ptl);
1730 free_page_and_swap_cache(src_page);
1731 }
1732
1733 address += PAGE_SIZE;
1734 page++;
1735 }
1736}
1737
1738static void collapse_huge_page(struct mm_struct *mm,
1739 unsigned long address,
1740 struct page **hpage,
1741 struct vm_area_struct *vma)
1742{
1743 pgd_t *pgd;
1744 pud_t *pud;
1745 pmd_t *pmd, _pmd;
1746 pte_t *pte;
1747 pgtable_t pgtable;
1748 struct page *new_page;
1749 spinlock_t *ptl;
1750 int isolated;
1751 unsigned long hstart, hend;
1752
1753 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1754#ifndef CONFIG_NUMA
1755 VM_BUG_ON(!*hpage);
1756 new_page = *hpage;
1757#else
1758 VM_BUG_ON(*hpage);
1759 /*
1760 * Allocate the page while the vma is still valid and under
1761 * the mmap_sem read mode so there is no memory allocation
1762	 * later when we take the mmap_sem in write mode. This is
1763	 * friendlier behavior (OTOH it may actually hide bugs) towards
1764	 * filesystems in userland whose daemons allocate memory in
1765	 * the userland I/O paths. Allocating memory with the
1766	 * mmap_sem held in read mode is also a good idea to allow
1767	 * greater scalability.
1768 */
1769 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
1770 if (unlikely(!new_page)) {
1771 up_read(&mm->mmap_sem);
1772 *hpage = ERR_PTR(-ENOMEM);
1773 return;
1774 }
1775#endif
1776 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1777 up_read(&mm->mmap_sem);
1778 put_page(new_page);
1779 return;
1780 }
1781
1782 /* after allocating the hugepage upgrade to mmap_sem write mode */
1783 up_read(&mm->mmap_sem);
1784
1785 /*
1786 * Prevent all access to pagetables with the exception of
1787	 * gup_fast (later handled by the ptep_clear_flush) and the VM
1788 * handled by the anon_vma lock + PG_lock.
1789 */
1790 down_write(&mm->mmap_sem);
1791 if (unlikely(khugepaged_test_exit(mm)))
1792 goto out;
1793
1794 vma = find_vma(mm, address);
1795 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1796 hend = vma->vm_end & HPAGE_PMD_MASK;
1797 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1798 goto out;
1799
1800 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1801 (vma->vm_flags & VM_NOHUGEPAGE))
1802 goto out;
1803
1804 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
1805 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
1806 goto out;
1807 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1808
1809 pgd = pgd_offset(mm, address);
1810 if (!pgd_present(*pgd))
1811 goto out;
1812
1813 pud = pud_offset(pgd, address);
1814 if (!pud_present(*pud))
1815 goto out;
1816
1817 pmd = pmd_offset(pud, address);
1818 /* pmd can't go away or become huge under us */
1819 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1820 goto out;
1821
1822 anon_vma_lock(vma->anon_vma);
1823
1824 pte = pte_offset_map(pmd, address);
1825 ptl = pte_lockptr(mm, pmd);
1826
1827 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1828 /*
1829 * After this gup_fast can't run anymore. This also removes
1830 * any huge TLB entry from the CPU so we won't allow
1831 * huge and small TLB entries for the same virtual address
1832 * to avoid the risk of CPU bugs in that area.
1833 */
1834 _pmd = pmdp_clear_flush_notify(vma, address, pmd);
1835 spin_unlock(&mm->page_table_lock);
1836
1837 spin_lock(ptl);
1838 isolated = __collapse_huge_page_isolate(vma, address, pte);
1839 spin_unlock(ptl);
1840 pte_unmap(pte);
1841
1842 if (unlikely(!isolated)) {
1843 spin_lock(&mm->page_table_lock);
1844 BUG_ON(!pmd_none(*pmd));
1845 set_pmd_at(mm, address, pmd, _pmd);
1846 spin_unlock(&mm->page_table_lock);
1847 anon_vma_unlock(vma->anon_vma);
1848 mem_cgroup_uncharge_page(new_page);
1849 goto out;
1850 }
1851
1852 /*
1853 * All pages are isolated and locked so anon_vma rmap
1854 * can't run anymore.
1855 */
1856 anon_vma_unlock(vma->anon_vma);
1857
1858 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
1859 __SetPageUptodate(new_page);
1860 pgtable = pmd_pgtable(_pmd);
1861 VM_BUG_ON(page_count(pgtable) != 1);
1862 VM_BUG_ON(page_mapcount(pgtable) != 0);
1863
1864 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1865 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1866 _pmd = pmd_mkhuge(_pmd);
1867
1868 /*
1869 * spin_lock() below is not the equivalent of smp_wmb(), so
1870	 * this is needed to prevent the __collapse_huge_page_copy()
1871	 * writes from becoming visible after the set_pmd_at() write.
1872 */
1873 smp_wmb();
1874
1875 spin_lock(&mm->page_table_lock);
1876 BUG_ON(!pmd_none(*pmd));
1877 page_add_new_anon_rmap(new_page, vma, address);
1878 set_pmd_at(mm, address, pmd, _pmd);
1879 update_mmu_cache(vma, address, _pmd);
1880 prepare_pmd_huge_pte(pgtable, mm);
1881 mm->nr_ptes--;
1882 spin_unlock(&mm->page_table_lock);
1883
1884#ifndef CONFIG_NUMA
1885 *hpage = NULL;
1886#endif
1887 khugepaged_pages_collapsed++;
1888out_up_write:
1889 up_write(&mm->mmap_sem);
1890 return;
1891
1892out:
1893#ifdef CONFIG_NUMA
1894 put_page(new_page);
1895#endif
1896 goto out_up_write;
1897}
1898
1899static int khugepaged_scan_pmd(struct mm_struct *mm,
1900 struct vm_area_struct *vma,
1901 unsigned long address,
1902 struct page **hpage)
1903{
1904 pgd_t *pgd;
1905 pud_t *pud;
1906 pmd_t *pmd;
1907 pte_t *pte, *_pte;
1908 int ret = 0, referenced = 0, none = 0;
1909 struct page *page;
1910 unsigned long _address;
1911 spinlock_t *ptl;
1912
1913 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1914
1915 pgd = pgd_offset(mm, address);
1916 if (!pgd_present(*pgd))
1917 goto out;
1918
1919 pud = pud_offset(pgd, address);
1920 if (!pud_present(*pud))
1921 goto out;
1922
1923 pmd = pmd_offset(pud, address);
1924 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1925 goto out;
1926
1927 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1928 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
1929 _pte++, _address += PAGE_SIZE) {
1930 pte_t pteval = *_pte;
1931 if (pte_none(pteval)) {
1932 if (++none <= khugepaged_max_ptes_none)
1933 continue;
1934 else
1935 goto out_unmap;
1936 }
1937 if (!pte_present(pteval) || !pte_write(pteval))
1938 goto out_unmap;
1939 page = vm_normal_page(vma, _address, pteval);
1940 if (unlikely(!page))
1941 goto out_unmap;
1942 VM_BUG_ON(PageCompound(page));
1943 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
1944 goto out_unmap;
1945 /* cannot use mapcount: can't collapse if there's a gup pin */
1946 if (page_count(page) != 1)
1947 goto out_unmap;
1948 if (pte_young(pteval) || PageReferenced(page) ||
1949 mmu_notifier_test_young(vma->vm_mm, address))
1950 referenced = 1;
1951 }
1952 if (referenced)
1953 ret = 1;
1954out_unmap:
1955 pte_unmap_unlock(pte, ptl);
1956 if (ret)
1957 /* collapse_huge_page will return with the mmap_sem released */
1958 collapse_huge_page(mm, address, hpage, vma);
1959out:
1960 return ret;
1961}
1962
1963static void collect_mm_slot(struct mm_slot *mm_slot)
1964{
1965 struct mm_struct *mm = mm_slot->mm;
1966
1967 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
1968
1969 if (khugepaged_test_exit(mm)) {
1970 /* free mm_slot */
1971 hlist_del(&mm_slot->hash);
1972 list_del(&mm_slot->mm_node);
1973
1974 /*
1975 * Not strictly needed because the mm exited already.
1976 *
1977 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1978 */
1979
1980 /* khugepaged_mm_lock actually not necessary for the below */
1981 free_mm_slot(mm_slot);
1982 mmdrop(mm);
1983 }
1984}
1985
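/*
 * Scan up to @pages ptes of the mm at the current cursor position,
 * collapsing eligible pmd-aligned ranges as they are found.  Returns
 * the amount of progress made; the cursor is advanced (and the
 * mm_slot released) once the mm has been fully scanned or is exiting.
 * Called with khugepaged_mm_lock held, which is dropped and retaken.
 */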
1986static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
1987 struct page **hpage)
1988{
1989 struct mm_slot *mm_slot;
1990 struct mm_struct *mm;
1991 struct vm_area_struct *vma;
1992 int progress = 0;
1993
1994 VM_BUG_ON(!pages);
1995 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
1996
1997 if (khugepaged_scan.mm_slot)
1998 mm_slot = khugepaged_scan.mm_slot;
1999 else {
2000 mm_slot = list_entry(khugepaged_scan.mm_head.next,
2001 struct mm_slot, mm_node);
2002 khugepaged_scan.address = 0;
2003 khugepaged_scan.mm_slot = mm_slot;
2004 }
2005 spin_unlock(&khugepaged_mm_lock);
2006
2007 mm = mm_slot->mm;
2008 down_read(&mm->mmap_sem);
2009 if (unlikely(khugepaged_test_exit(mm)))
2010 vma = NULL;
2011 else
2012 vma = find_vma(mm, khugepaged_scan.address);
2013
2014 progress++;
2015 for (; vma; vma = vma->vm_next) {
2016 unsigned long hstart, hend;
2017
2018 cond_resched();
2019 if (unlikely(khugepaged_test_exit(mm))) {
2020 progress++;
2021 break;
2022 }
2023
2024 if ((!(vma->vm_flags & VM_HUGEPAGE) &&
2025 !khugepaged_always()) ||
2026 (vma->vm_flags & VM_NOHUGEPAGE)) {
2027 progress++;
2028 continue;
2029 }
2030
2031 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
2032 if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
2033 khugepaged_scan.address = vma->vm_end;
2034 progress++;
2035 continue;
2036 }
2037 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
2038
2039 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2040 hend = vma->vm_end & HPAGE_PMD_MASK;
2041 if (hstart >= hend) {
2042 progress++;
2043 continue;
2044 }
2045 if (khugepaged_scan.address < hstart)
2046 khugepaged_scan.address = hstart;
2047 if (khugepaged_scan.address > hend) {
2048 khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
2049 progress++;
2050 continue;
2051 }
2052 BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2053
2054 while (khugepaged_scan.address < hend) {
2055 int ret;
2056 cond_resched();
2057 if (unlikely(khugepaged_test_exit(mm)))
2058 goto breakouterloop;
2059
2060 VM_BUG_ON(khugepaged_scan.address < hstart ||
2061 khugepaged_scan.address + HPAGE_PMD_SIZE >
2062 hend);
2063 ret = khugepaged_scan_pmd(mm, vma,
2064 khugepaged_scan.address,
2065 hpage);
2066 /* move to next address */
2067 khugepaged_scan.address += HPAGE_PMD_SIZE;
2068 progress += HPAGE_PMD_NR;
2069 if (ret)
2070 /* we released mmap_sem so break loop */
2071 goto breakouterloop_mmap_sem;
2072 if (progress >= pages)
2073 goto breakouterloop;
2074 }
2075 }
2076breakouterloop:
2077 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2078breakouterloop_mmap_sem:
2079
2080 spin_lock(&khugepaged_mm_lock);
2081 BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2082 /*
2083 * Release the current mm_slot if this mm is about to die, or
2084 * if we scanned all vmas of this mm.
2085 */
2086 if (khugepaged_test_exit(mm) || !vma) {
2087 /*
2088 * Make sure that if mm_users is reaching zero while
2089 * khugepaged runs here, khugepaged_exit will find
2090 * mm_slot not pointing to the exiting mm.
2091 */
2092 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2093 khugepaged_scan.mm_slot = list_entry(
2094 mm_slot->mm_node.next,
2095 struct mm_slot, mm_node);
2096 khugepaged_scan.address = 0;
2097 } else {
2098 khugepaged_scan.mm_slot = NULL;
2099 khugepaged_full_scans++;
2100 }
2101
2102 collect_mm_slot(mm_slot);
2103 }
2104
2105 return progress;
2106}
2107
2108static int khugepaged_has_work(void)
2109{
2110 return !list_empty(&khugepaged_scan.mm_head) &&
2111 khugepaged_enabled();
2112}
2113
2114static int khugepaged_wait_event(void)
2115{
2116 return !list_empty(&khugepaged_scan.mm_head) ||
2117 !khugepaged_enabled();
2118}
2119
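/*
 * One scan pass of the khugepaged main loop: keep calling
 * khugepaged_scan_mm_slot() until khugepaged_pages_to_scan ptes have
 * been scanned, there is no more work, or the mm list head has been
 * passed twice.
 */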
2120static void khugepaged_do_scan(struct page **hpage)
2121{
2122 unsigned int progress = 0, pass_through_head = 0;
2123 unsigned int pages = khugepaged_pages_to_scan;
2124
2125 barrier(); /* write khugepaged_pages_to_scan to local stack */
2126
2127 while (progress < pages) {
2128 cond_resched();
2129
2130#ifndef CONFIG_NUMA
2131 if (!*hpage) {
2132 *hpage = alloc_hugepage(khugepaged_defrag());
2133 if (unlikely(!*hpage))
2134 break;
2135 }
2136#else
2137 if (IS_ERR(*hpage))
2138 break;
2139#endif
2140
2141 if (unlikely(kthread_should_stop() || freezing(current)))
2142 break;
2143
2144 spin_lock(&khugepaged_mm_lock);
2145 if (!khugepaged_scan.mm_slot)
2146 pass_through_head++;
2147 if (khugepaged_has_work() &&
2148 pass_through_head < 2)
2149 progress += khugepaged_scan_mm_slot(pages - progress,
2150 hpage);
2151 else
2152 progress = pages;
2153 spin_unlock(&khugepaged_mm_lock);
2154 }
2155}
2156
2157static void khugepaged_alloc_sleep(void)
2158{
2159 DEFINE_WAIT(wait);
2160 add_wait_queue(&khugepaged_wait, &wait);
2161 schedule_timeout_interruptible(
2162 msecs_to_jiffies(
2163 khugepaged_alloc_sleep_millisecs));
2164 remove_wait_queue(&khugepaged_wait, &wait);
2165}
2166
2167#ifndef CONFIG_NUMA
2168static struct page *khugepaged_alloc_hugepage(void)
2169{
2170 struct page *hpage;
2171
2172 do {
2173 hpage = alloc_hugepage(khugepaged_defrag());
2174 if (!hpage)
2175 khugepaged_alloc_sleep();
2176 } while (unlikely(!hpage) &&
2177 likely(khugepaged_enabled()));
2178 return hpage;
2179}
2180#endif
2181
2182static void khugepaged_loop(void)
2183{
2184 struct page *hpage;
2185
2186#ifdef CONFIG_NUMA
2187 hpage = NULL;
2188#endif
2189 while (likely(khugepaged_enabled())) {
2190#ifndef CONFIG_NUMA
2191 hpage = khugepaged_alloc_hugepage();
2192 if (unlikely(!hpage))
2193 break;
2194#else
2195 if (IS_ERR(hpage)) {
2196 khugepaged_alloc_sleep();
2197 hpage = NULL;
2198 }
2199#endif
2200
2201 khugepaged_do_scan(&hpage);
2202#ifndef CONFIG_NUMA
2203 if (hpage)
2204 put_page(hpage);
2205#endif
2206 try_to_freeze();
2207 if (unlikely(kthread_should_stop()))
2208 break;
2209 if (khugepaged_has_work()) {
2210 DEFINE_WAIT(wait);
2211 if (!khugepaged_scan_sleep_millisecs)
2212 continue;
2213 add_wait_queue(&khugepaged_wait, &wait);
2214 schedule_timeout_interruptible(
2215 msecs_to_jiffies(
2216 khugepaged_scan_sleep_millisecs));
2217 remove_wait_queue(&khugepaged_wait, &wait);
2218 } else if (khugepaged_enabled())
2219 wait_event_freezable(khugepaged_wait,
2220 khugepaged_wait_event());
2221 }
2222}
2223
2224static int khugepaged(void *none)
2225{
2226 struct mm_slot *mm_slot;
2227
2228 set_freezable();
2229 set_user_nice(current, 19);
2230
2231 /* serialize with start_khugepaged() */
2232 mutex_lock(&khugepaged_mutex);
2233
2234 for (;;) {
2235 mutex_unlock(&khugepaged_mutex);
2236 BUG_ON(khugepaged_thread != current);
2237 khugepaged_loop();
2238 BUG_ON(khugepaged_thread != current);
2239
2240 mutex_lock(&khugepaged_mutex);
2241 if (!khugepaged_enabled())
2242 break;
2243 if (unlikely(kthread_should_stop()))
2244 break;
2245 }
2246
2247 spin_lock(&khugepaged_mm_lock);
2248 mm_slot = khugepaged_scan.mm_slot;
2249 khugepaged_scan.mm_slot = NULL;
2250 if (mm_slot)
2251 collect_mm_slot(mm_slot);
2252 spin_unlock(&khugepaged_mm_lock);
2253
2254 khugepaged_thread = NULL;
2255 mutex_unlock(&khugepaged_mutex);
2256
2257 return 0;
2258}
2259
2260void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2261{
2262 struct page *page;
2263
2264 spin_lock(&mm->page_table_lock);
2265 if (unlikely(!pmd_trans_huge(*pmd))) {
2266 spin_unlock(&mm->page_table_lock);
2267 return;
2268 }
2269 page = pmd_page(*pmd);
2270 VM_BUG_ON(!page_count(page));
2271 get_page(page);
2272 spin_unlock(&mm->page_table_lock);
2273
2274 split_huge_page(page);
2275
2276 put_page(page);
2277 BUG_ON(pmd_trans_huge(*pmd));
2278}
2279
2280static void split_huge_page_address(struct mm_struct *mm,
2281 unsigned long address)
2282{
2283 pgd_t *pgd;
2284 pud_t *pud;
2285 pmd_t *pmd;
2286
2287 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2288
2289 pgd = pgd_offset(mm, address);
2290 if (!pgd_present(*pgd))
2291 return;
2292
2293 pud = pud_offset(pgd, address);
2294 if (!pud_present(*pud))
2295 return;
2296
2297 pmd = pmd_offset(pud, address);
2298 if (!pmd_present(*pmd))
2299 return;
2300 /*
2301	 * Caller holds mmap_sem in write mode, so a huge pmd cannot
2302 * materialize from under us.
2303 */
2304 split_huge_page_pmd(mm, pmd);
2305}
2306
2307void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2308 unsigned long start,
2309 unsigned long end,
2310 long adjust_next)
2311{
2312 /*
2313	 * If the new start address isn't hpage aligned and it could
2314	 * previously have contained a hugepage, check whether we need
2315	 * to split a huge pmd.
2316 */
2317 if (start & ~HPAGE_PMD_MASK &&
2318 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2319 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2320 split_huge_page_address(vma->vm_mm, start);
2321
2322 /*
2323	 * If the new end address isn't hpage aligned and it could
2324	 * previously have contained a hugepage, check whether we need
2325	 * to split a huge pmd.
2326 */
2327 if (end & ~HPAGE_PMD_MASK &&
2328 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2329 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2330 split_huge_page_address(vma->vm_mm, end);
2331
2332 /*
2333	 * If we're also updating vma->vm_next->vm_start and the new
2334	 * vm_next->vm_start isn't page aligned and it could previously
2335	 * have contained a hugepage, check whether we need to split a huge pmd.
2336 */
2337 if (adjust_next > 0) {
2338 struct vm_area_struct *next = vma->vm_next;
2339 unsigned long nstart = next->vm_start;
2340 nstart += adjust_next << PAGE_SHIFT;
2341 if (nstart & ~HPAGE_PMD_MASK &&
2342 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2343 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2344 split_huge_page_address(next->vm_mm, nstart);
2345 }
2346}
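
The hstart/hend computation at the top of the khugepaged scan loop above, and the mask checks in __vma_adjust_trans_huge, are plain round-up/round-down arithmetic on the PMD huge page size. A minimal stand-alone sketch of that arithmetic, assuming the common x86-64 value of 2 MiB for HPAGE_PMD_SIZE (the kernel derives it from HPAGE_PMD_SHIFT; the addresses below are made-up examples):

#include <stdio.h>
#include <stdint.h>

#define HPAGE_PMD_SIZE (2ULL * 1024 * 1024)    /* assumed 2 MiB (x86-64 default) */
#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	uint64_t vm_start = 0x7f0000001000ULL;  /* example range, not hpage aligned */
	uint64_t vm_end   = 0x7f0000a00000ULL;

	/* round the start up and the end down, as hstart/hend in the scan loop */
	uint64_t hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	uint64_t hend   = vm_end & HPAGE_PMD_MASK;

	if (hstart >= hend)
		printf("range too small or misaligned for a huge pmd\n");
	else
		printf("huge-pmd candidate range: %#llx-%#llx (%llu huge pages)\n",
		       (unsigned long long)hstart, (unsigned long long)hend,
		       (unsigned long long)((hend - hstart) / HPAGE_PMD_SIZE));
	return 0;
}
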
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 85855240933d..bb0b7c128015 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma)
394 return 0; 394 return 0;
395} 395}
396 396
397static void clear_gigantic_page(struct page *page,
398 unsigned long addr, unsigned long sz)
399{
400 int i;
401 struct page *p = page;
402
403 might_sleep();
404 for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
405 cond_resched();
406 clear_user_highpage(p, addr + i * PAGE_SIZE);
407 }
408}
409static void clear_huge_page(struct page *page,
410 unsigned long addr, unsigned long sz)
411{
412 int i;
413
414 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
415 clear_gigantic_page(page, addr, sz);
416 return;
417 }
418
419 might_sleep();
420 for (i = 0; i < sz/PAGE_SIZE; i++) {
421 cond_resched();
422 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
423 }
424}
425
426static void copy_user_gigantic_page(struct page *dst, struct page *src,
427 unsigned long addr, struct vm_area_struct *vma)
428{
429 int i;
430 struct hstate *h = hstate_vma(vma);
431 struct page *dst_base = dst;
432 struct page *src_base = src;
433
434 for (i = 0; i < pages_per_huge_page(h); ) {
435 cond_resched();
436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
437
438 i++;
439 dst = mem_map_next(dst, dst_base, i);
440 src = mem_map_next(src, src_base, i);
441 }
442}
443
444static void copy_user_huge_page(struct page *dst, struct page *src,
445 unsigned long addr, struct vm_area_struct *vma)
446{
447 int i;
448 struct hstate *h = hstate_vma(vma);
449
450 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
451 copy_user_gigantic_page(dst, src, addr, vma);
452 return;
453 }
454
455 might_sleep();
456 for (i = 0; i < pages_per_huge_page(h); i++) {
457 cond_resched();
458 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
459 }
460}
461
462static void copy_gigantic_page(struct page *dst, struct page *src) 397static void copy_gigantic_page(struct page *dst, struct page *src)
463{ 398{
464 int i; 399 int i;
@@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1428 1363
1429 return sprintf(buf, "%lu\n", nr_huge_pages); 1364 return sprintf(buf, "%lu\n", nr_huge_pages);
1430} 1365}
1366
1431static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 1367static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1432 struct kobject *kobj, struct kobj_attribute *attr, 1368 struct kobject *kobj, struct kobj_attribute *attr,
1433 const char *buf, size_t len) 1369 const char *buf, size_t len)
@@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1440 1376
1441 err = strict_strtoul(buf, 10, &count); 1377 err = strict_strtoul(buf, 10, &count);
1442 if (err) 1378 if (err)
1443 return 0; 1379 goto out;
1444 1380
1445 h = kobj_to_hstate(kobj, &nid); 1381 h = kobj_to_hstate(kobj, &nid);
1382 if (h->order >= MAX_ORDER) {
1383 err = -EINVAL;
1384 goto out;
1385 }
1386
1446 if (nid == NUMA_NO_NODE) { 1387 if (nid == NUMA_NO_NODE) {
1447 /* 1388 /*
1448 * global hstate attribute 1389 * global hstate attribute
@@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1468 NODEMASK_FREE(nodes_allowed); 1409 NODEMASK_FREE(nodes_allowed);
1469 1410
1470 return len; 1411 return len;
1412out:
1413 NODEMASK_FREE(nodes_allowed);
1414 return err;
1471} 1415}
1472 1416
1473static ssize_t nr_hugepages_show(struct kobject *kobj, 1417static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1510 struct hstate *h = kobj_to_hstate(kobj, NULL); 1454 struct hstate *h = kobj_to_hstate(kobj, NULL);
1511 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1455 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1512} 1456}
1457
1513static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1458static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1514 struct kobj_attribute *attr, const char *buf, size_t count) 1459 struct kobj_attribute *attr, const char *buf, size_t count)
1515{ 1460{
@@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1517 unsigned long input; 1462 unsigned long input;
1518 struct hstate *h = kobj_to_hstate(kobj, NULL); 1463 struct hstate *h = kobj_to_hstate(kobj, NULL);
1519 1464
1465 if (h->order >= MAX_ORDER)
1466 return -EINVAL;
1467
1520 err = strict_strtoul(buf, 10, &input); 1468 err = strict_strtoul(buf, 10, &input);
1521 if (err) 1469 if (err)
1522 return 0; 1470 return err;
1523 1471
1524 spin_lock(&hugetlb_lock); 1472 spin_lock(&hugetlb_lock);
1525 h->nr_overcommit_huge_pages = input; 1473 h->nr_overcommit_huge_pages = input;
@@ -1922,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1922{ 1870{
1923 struct hstate *h = &default_hstate; 1871 struct hstate *h = &default_hstate;
1924 unsigned long tmp; 1872 unsigned long tmp;
1873 int ret;
1925 1874
1926 if (!write) 1875 if (!write)
1927 tmp = h->max_huge_pages; 1876 tmp = h->max_huge_pages;
1928 1877
1878 if (write && h->order >= MAX_ORDER)
1879 return -EINVAL;
1880
1929 table->data = &tmp; 1881 table->data = &tmp;
1930 table->maxlen = sizeof(unsigned long); 1882 table->maxlen = sizeof(unsigned long);
1931 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1883 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1884 if (ret)
1885 goto out;
1932 1886
1933 if (write) { 1887 if (write) {
1934 NODEMASK_ALLOC(nodemask_t, nodes_allowed, 1888 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1943,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1943 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1897 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1944 NODEMASK_FREE(nodes_allowed); 1898 NODEMASK_FREE(nodes_allowed);
1945 } 1899 }
1946 1900out:
1947 return 0; 1901 return ret;
1948} 1902}
1949 1903
1950int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1904int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1982,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1982{ 1936{
1983 struct hstate *h = &default_hstate; 1937 struct hstate *h = &default_hstate;
1984 unsigned long tmp; 1938 unsigned long tmp;
1939 int ret;
1985 1940
1986 if (!write) 1941 if (!write)
1987 tmp = h->nr_overcommit_huge_pages; 1942 tmp = h->nr_overcommit_huge_pages;
1988 1943
1944 if (write && h->order >= MAX_ORDER)
1945 return -EINVAL;
1946
1989 table->data = &tmp; 1947 table->data = &tmp;
1990 table->maxlen = sizeof(unsigned long); 1948 table->maxlen = sizeof(unsigned long);
1991 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1949 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1950 if (ret)
1951 goto out;
1992 1952
1993 if (write) { 1953 if (write) {
1994 spin_lock(&hugetlb_lock); 1954 spin_lock(&hugetlb_lock);
1995 h->nr_overcommit_huge_pages = tmp; 1955 h->nr_overcommit_huge_pages = tmp;
1996 spin_unlock(&hugetlb_lock); 1956 spin_unlock(&hugetlb_lock);
1997 } 1957 }
1998 1958out:
1999 return 0; 1959 return ret;
2000} 1960}
2001 1961
2002#endif /* CONFIG_SYSCTL */ 1962#endif /* CONFIG_SYSCTL */
@@ -2454,7 +2414,8 @@ retry_avoidcopy:
2454 return VM_FAULT_OOM; 2414 return VM_FAULT_OOM;
2455 } 2415 }
2456 2416
2457 copy_user_huge_page(new_page, old_page, address, vma); 2417 copy_user_huge_page(new_page, old_page, address, vma,
2418 pages_per_huge_page(h));
2458 __SetPageUptodate(new_page); 2419 __SetPageUptodate(new_page);
2459 2420
2460 /* 2421 /*
@@ -2558,7 +2519,7 @@ retry:
2558 ret = -PTR_ERR(page); 2519 ret = -PTR_ERR(page);
2559 goto out; 2520 goto out;
2560 } 2521 }
2561 clear_huge_page(page, address, huge_page_size(h)); 2522 clear_huge_page(page, address, pages_per_huge_page(h));
2562 __SetPageUptodate(page); 2523 __SetPageUptodate(page);
2563 2524
2564 if (vma->vm_flags & VM_MAYSHARE) { 2525 if (vma->vm_flags & VM_MAYSHARE) {
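
The hugetlb hunks above reject writes for gigantic hstates (h->order >= MAX_ORDER) and propagate parse and sysctl errors instead of silently returning success. From user space that means a rejected request to /proc/sys/vm/nr_hugepages is now reported through the write() return value. A rough sketch (requires root; whether the write succeeds depends on the configured default huge page size and on available contiguous memory):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/proc/sys/vm/nr_hugepages";
	const char *val = "64\n";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		/* e.g. EINVAL when the default hstate is a gigantic page */
		fprintf(stderr, "write %s: %s\n", path, strerror(errno));
	else
		printf("requested 64 persistent huge pages\n");
	close(fd);
	return 0;
}
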
diff --git a/mm/internal.h b/mm/internal.h
index dedb0aff673f..4c98630f0f77 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,6 +39,15 @@ static inline void __put_page(struct page *page)
39 39
40extern unsigned long highest_memmap_pfn; 40extern unsigned long highest_memmap_pfn;
41 41
42#ifdef CONFIG_SMP
43extern int putback_active_lru_page(struct zone *zone, struct page *page);
44#else
45static inline int putback_active_lru_page(struct zone *zone, struct page *page)
46{
47 return 0;
48}
49#endif
50
42/* 51/*
43 * in mm/vmscan.c: 52 * in mm/vmscan.c:
44 */ 53 */
@@ -134,6 +143,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
134 } 143 }
135} 144}
136 145
146#ifdef CONFIG_TRANSPARENT_HUGEPAGE
147extern unsigned long vma_address(struct page *page,
148 struct vm_area_struct *vma);
149#endif
137#else /* !CONFIG_MMU */ 150#else /* !CONFIG_MMU */
138static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 151static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
139{ 152{
@@ -243,7 +256,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
243 256
244int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 257int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
245 unsigned long start, int len, unsigned int foll_flags, 258 unsigned long start, int len, unsigned int foll_flags,
246 struct page **pages, struct vm_area_struct **vmas); 259 struct page **pages, struct vm_area_struct **vmas,
260 int *nonblocking);
247 261
248#define ZONE_RECLAIM_NOSCAN -2 262#define ZONE_RECLAIM_NOSCAN -2
249#define ZONE_RECLAIM_FULL -1 263#define ZONE_RECLAIM_FULL -1
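
The prototype change to __get_user_pages() adds a nonblocking out-parameter through which the callee can report that it had to drop mmap_sem while faulting pages in. The snippet below is only a stand-alone illustration of that general pattern, an out-parameter telling the caller whether a lock survived the call; the names and the pthread lock are illustrative, not kernel API (build with -pthread):

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Illustrative only: *still_locked reports whether the lock survived the
 * call; the callee may drop it to service a slow path, much as gup may
 * drop mmap_sem while blocking on a page fault.
 */
static int do_work(int slow_path, int *still_locked)
{
	*still_locked = 1;
	if (slow_path) {
		pthread_mutex_unlock(&lock);    /* released while "sleeping" */
		*still_locked = 0;
	}
	return 0;
}

int main(void)
{
	int still_locked;

	pthread_mutex_lock(&lock);
	do_work(1, &still_locked);
	if (still_locked)
		pthread_mutex_unlock(&lock);
	printf("lock %s across the call\n", still_locked ? "held" : "dropped");
	return 0;
}
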
diff --git a/mm/ksm.c b/mm/ksm.c
index 43bc893470b4..c2b2a94f9d67 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -34,6 +34,7 @@
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/freezer.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
39#include "internal.h" 40#include "internal.h"
@@ -411,6 +412,20 @@ out:
411 up_read(&mm->mmap_sem); 412 up_read(&mm->mmap_sem);
412} 413}
413 414
415static struct page *page_trans_compound_anon(struct page *page)
416{
417 if (PageTransCompound(page)) {
418 struct page *head = compound_trans_head(page);
419 /*
420	 * head may actually be split and freed from under
421 * us but it's ok here.
422 */
423 if (PageAnon(head))
424 return head;
425 }
426 return NULL;
427}
428
414static struct page *get_mergeable_page(struct rmap_item *rmap_item) 429static struct page *get_mergeable_page(struct rmap_item *rmap_item)
415{ 430{
416 struct mm_struct *mm = rmap_item->mm; 431 struct mm_struct *mm = rmap_item->mm;
@@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
430 page = follow_page(vma, addr, FOLL_GET); 445 page = follow_page(vma, addr, FOLL_GET);
431 if (IS_ERR_OR_NULL(page)) 446 if (IS_ERR_OR_NULL(page))
432 goto out; 447 goto out;
433 if (PageAnon(page)) { 448 if (PageAnon(page) || page_trans_compound_anon(page)) {
434 flush_anon_page(vma, page, addr); 449 flush_anon_page(vma, page, addr);
435 flush_dcache_page(page); 450 flush_dcache_page(page);
436 } else { 451 } else {
@@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
708 if (addr == -EFAULT) 723 if (addr == -EFAULT)
709 goto out; 724 goto out;
710 725
726 BUG_ON(PageTransCompound(page));
711 ptep = page_check_address(page, mm, addr, &ptl, 0); 727 ptep = page_check_address(page, mm, addr, &ptl, 0);
712 if (!ptep) 728 if (!ptep)
713 goto out; 729 goto out;
@@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
783 goto out; 799 goto out;
784 800
785 pmd = pmd_offset(pud, addr); 801 pmd = pmd_offset(pud, addr);
802 BUG_ON(pmd_trans_huge(*pmd));
786 if (!pmd_present(*pmd)) 803 if (!pmd_present(*pmd))
787 goto out; 804 goto out;
788 805
@@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
800 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 817 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
801 818
802 page_remove_rmap(page); 819 page_remove_rmap(page);
820 if (!page_mapped(page))
821 try_to_free_swap(page);
803 put_page(page); 822 put_page(page);
804 823
805 pte_unmap_unlock(ptep, ptl); 824 pte_unmap_unlock(ptep, ptl);
@@ -808,6 +827,33 @@ out:
808 return err; 827 return err;
809} 828}
810 829
830static int page_trans_compound_anon_split(struct page *page)
831{
832 int ret = 0;
833 struct page *transhuge_head = page_trans_compound_anon(page);
834 if (transhuge_head) {
835 /* Get the reference on the head to split it. */
836 if (get_page_unless_zero(transhuge_head)) {
837 /*
838 * Recheck we got the reference while the head
839 * was still anonymous.
840 */
841 if (PageAnon(transhuge_head))
842 ret = split_huge_page(transhuge_head);
843 else
844 /*
845	 * Retry later if split_huge_page ran
846 * from under us.
847 */
848 ret = 1;
849 put_page(transhuge_head);
850 } else
851	 /* Retry later if split_huge_page ran from under us. */
852 ret = 1;
853 }
854 return ret;
855}
856
811/* 857/*
812 * try_to_merge_one_page - take two pages and merge them into one 858 * try_to_merge_one_page - take two pages and merge them into one
813 * @vma: the vma that holds the pte pointing to page 859 * @vma: the vma that holds the pte pointing to page
@@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
828 874
829 if (!(vma->vm_flags & VM_MERGEABLE)) 875 if (!(vma->vm_flags & VM_MERGEABLE))
830 goto out; 876 goto out;
877 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
878 goto out;
879 BUG_ON(PageTransCompound(page));
831 if (!PageAnon(page)) 880 if (!PageAnon(page))
832 goto out; 881 goto out;
833 882
@@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1247 1296
1248 slot = ksm_scan.mm_slot; 1297 slot = ksm_scan.mm_slot;
1249 if (slot == &ksm_mm_head) { 1298 if (slot == &ksm_mm_head) {
1299 /*
1300 * A number of pages can hang around indefinitely on per-cpu
1301 * pagevecs, raised page count preventing write_protect_page
1302 * from merging them. Though it doesn't really matter much,
1303 * it is puzzling to see some stuck in pages_volatile until
1304 * other activity jostles them out, and they also prevented
1305 * LTP's KSM test from succeeding deterministically; so drain
1306 * them here (here rather than on entry to ksm_do_scan(),
1307 * so we don't IPI too often when pages_to_scan is set low).
1308 */
1309 lru_add_drain_all();
1310
1250 root_unstable_tree = RB_ROOT; 1311 root_unstable_tree = RB_ROOT;
1251 1312
1252 spin_lock(&ksm_mmlist_lock); 1313 spin_lock(&ksm_mmlist_lock);
@@ -1277,7 +1338,13 @@ next_mm:
1277 if (ksm_test_exit(mm)) 1338 if (ksm_test_exit(mm))
1278 break; 1339 break;
1279 *page = follow_page(vma, ksm_scan.address, FOLL_GET); 1340 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1280 if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { 1341 if (IS_ERR_OR_NULL(*page)) {
1342 ksm_scan.address += PAGE_SIZE;
1343 cond_resched();
1344 continue;
1345 }
1346 if (PageAnon(*page) ||
1347 page_trans_compound_anon(*page)) {
1281 flush_anon_page(vma, *page, ksm_scan.address); 1348 flush_anon_page(vma, *page, ksm_scan.address);
1282 flush_dcache_page(*page); 1349 flush_dcache_page(*page);
1283 rmap_item = get_next_rmap_item(slot, 1350 rmap_item = get_next_rmap_item(slot,
@@ -1291,8 +1358,7 @@ next_mm:
1291 up_read(&mm->mmap_sem); 1358 up_read(&mm->mmap_sem);
1292 return rmap_item; 1359 return rmap_item;
1293 } 1360 }
1294 if (!IS_ERR_OR_NULL(*page)) 1361 put_page(*page);
1295 put_page(*page);
1296 ksm_scan.address += PAGE_SIZE; 1362 ksm_scan.address += PAGE_SIZE;
1297 cond_resched(); 1363 cond_resched();
1298 } 1364 }
@@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages)
1352 struct rmap_item *rmap_item; 1418 struct rmap_item *rmap_item;
1353 struct page *uninitialized_var(page); 1419 struct page *uninitialized_var(page);
1354 1420
1355 while (scan_npages--) { 1421 while (scan_npages-- && likely(!freezing(current))) {
1356 cond_resched(); 1422 cond_resched();
1357 rmap_item = scan_get_next_rmap_item(&page); 1423 rmap_item = scan_get_next_rmap_item(&page);
1358 if (!rmap_item) 1424 if (!rmap_item)
@@ -1370,6 +1436,7 @@ static int ksmd_should_run(void)
1370 1436
1371static int ksm_scan_thread(void *nothing) 1437static int ksm_scan_thread(void *nothing)
1372{ 1438{
1439 set_freezable();
1373 set_user_nice(current, 5); 1440 set_user_nice(current, 5);
1374 1441
1375 while (!kthread_should_stop()) { 1442 while (!kthread_should_stop()) {
@@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing)
1378 ksm_do_scan(ksm_thread_pages_to_scan); 1445 ksm_do_scan(ksm_thread_pages_to_scan);
1379 mutex_unlock(&ksm_thread_mutex); 1446 mutex_unlock(&ksm_thread_mutex);
1380 1447
1448 try_to_freeze();
1449
1381 if (ksmd_should_run()) { 1450 if (ksmd_should_run()) {
1382 schedule_timeout_interruptible( 1451 schedule_timeout_interruptible(
1383 msecs_to_jiffies(ksm_thread_sleep_millisecs)); 1452 msecs_to_jiffies(ksm_thread_sleep_millisecs));
1384 } else { 1453 } else {
1385 wait_event_interruptible(ksm_thread_wait, 1454 wait_event_freezable(ksm_thread_wait,
1386 ksmd_should_run() || kthread_should_stop()); 1455 ksmd_should_run() || kthread_should_stop());
1387 } 1456 }
1388 } 1457 }
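
The KSM changes above let ksmd handle anonymous pages that are part of a transparent huge page: page_trans_compound_anon_split() splits the compound page before the normal merge path runs, so MADV_MERGEABLE regions no longer need to avoid THP. A minimal user-space sketch of registering a region with KSM, assuming CONFIG_KSM is enabled and the daemon has been started separately (echo 1 > /sys/kernel/mm/ksm/run):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 1024 * 1024;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0xaa, len);                 /* identical content -> mergeable */
	if (madvise(buf, len, MADV_MERGEABLE))
		perror("madvise(MADV_MERGEABLE)");
	else
		printf("registered %zu bytes with KSM\n", len);
	pause();                                /* give ksmd time to scan and merge */
	return 0;
}
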
diff --git a/mm/madvise.c b/mm/madvise.c
index 319528b8db74..2221491ed503 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
71 if (error) 71 if (error)
72 goto out; 72 goto out;
73 break; 73 break;
74 case MADV_HUGEPAGE:
75 case MADV_NOHUGEPAGE:
76 error = hugepage_madvise(vma, &new_flags, behavior);
77 if (error)
78 goto out;
79 break;
74 } 80 }
75 81
76 if (new_flags == vma->vm_flags) { 82 if (new_flags == vma->vm_flags) {
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior)
283 case MADV_MERGEABLE: 289 case MADV_MERGEABLE:
284 case MADV_UNMERGEABLE: 290 case MADV_UNMERGEABLE:
285#endif 291#endif
292#ifdef CONFIG_TRANSPARENT_HUGEPAGE
293 case MADV_HUGEPAGE:
294 case MADV_NOHUGEPAGE:
295#endif
286 return 1; 296 return 1;
287 297
288 default: 298 default:
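
The madvise() hunk wires up the user-visible side of this series: MADV_HUGEPAGE and MADV_NOHUGEPAGE let a process opt a mapping into or out of transparent huge pages. A minimal sketch, assuming the libc headers expose the two constants and the kernel was built with CONFIG_TRANSPARENT_HUGEPAGE; growth of AnonHugePages in /proc/self/smaps is one way to observe the effect:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64 * 1024 * 1024;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	if (madvise(buf, len, MADV_HUGEPAGE))   /* ask for huge pmd backing */
		perror("madvise(MADV_HUGEPAGE)");
	memset(buf, 0, len);                    /* touch the range to fault it in */
	if (madvise(buf, len, MADV_NOHUGEPAGE)) /* opt this range back out */
		perror("madvise(MADV_NOHUGEPAGE)");
	munmap(buf, len);
	return 0;
}
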
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 00bb8a64d028..8ab841031436 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -292,7 +292,6 @@ static struct move_charge_struct {
292 unsigned long moved_charge; 292 unsigned long moved_charge;
293 unsigned long moved_swap; 293 unsigned long moved_swap;
294 struct task_struct *moving_task; /* a task moving charges */ 294 struct task_struct *moving_task; /* a task moving charges */
295 struct mm_struct *mm;
296 wait_queue_head_t waitq; /* a waitq for other context */ 295 wait_queue_head_t waitq; /* a waitq for other context */
297} mc = { 296} mc = {
298 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 297 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -821,7 +820,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
821 return; 820 return;
822 VM_BUG_ON(list_empty(&pc->lru)); 821 VM_BUG_ON(list_empty(&pc->lru));
823 list_del_init(&pc->lru); 822 list_del_init(&pc->lru);
824 return;
825} 823}
826 824
827void mem_cgroup_del_lru(struct page *page) 825void mem_cgroup_del_lru(struct page *page)
@@ -1087,7 +1085,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1087 case 0: 1085 case 0:
1088 list_move(&page->lru, dst); 1086 list_move(&page->lru, dst);
1089 mem_cgroup_del_lru(page); 1087 mem_cgroup_del_lru(page);
1090 nr_taken++; 1088 nr_taken += hpage_nr_pages(page);
1091 break; 1089 break;
1092 case -EBUSY: 1090 case -EBUSY:
1093 /* we don't affect global LRU but rotate in our LRU */ 1091 /* we don't affect global LRU but rotate in our LRU */
@@ -1312,8 +1310,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1312 u64 limit; 1310 u64 limit;
1313 u64 memsw; 1311 u64 memsw;
1314 1312
1315 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + 1313 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1316 total_swap_pages; 1314 limit += total_swap_pages << PAGE_SHIFT;
1315
1317 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1316 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1318 /* 1317 /*
1319 * If memsw is finite and limits the amount of swap space available 1318 * If memsw is finite and limits the amount of swap space available
@@ -1600,11 +1599,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1600 * possibility of race condition. If there is, we take a lock. 1599 * possibility of race condition. If there is, we take a lock.
1601 */ 1600 */
1602 1601
1603static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) 1602void mem_cgroup_update_page_stat(struct page *page,
1603 enum mem_cgroup_page_stat_item idx, int val)
1604{ 1604{
1605 struct mem_cgroup *mem; 1605 struct mem_cgroup *mem;
1606 struct page_cgroup *pc = lookup_page_cgroup(page); 1606 struct page_cgroup *pc = lookup_page_cgroup(page);
1607 bool need_unlock = false; 1607 bool need_unlock = false;
1608 unsigned long uninitialized_var(flags);
1608 1609
1609 if (unlikely(!pc)) 1610 if (unlikely(!pc))
1610 return; 1611 return;
@@ -1616,37 +1617,34 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
1616 /* pc->mem_cgroup is unstable ? */ 1617 /* pc->mem_cgroup is unstable ? */
1617 if (unlikely(mem_cgroup_stealed(mem))) { 1618 if (unlikely(mem_cgroup_stealed(mem))) {
1618 /* take a lock against to access pc->mem_cgroup */ 1619 /* take a lock against to access pc->mem_cgroup */
1619 lock_page_cgroup(pc); 1620 move_lock_page_cgroup(pc, &flags);
1620 need_unlock = true; 1621 need_unlock = true;
1621 mem = pc->mem_cgroup; 1622 mem = pc->mem_cgroup;
1622 if (!mem || !PageCgroupUsed(pc)) 1623 if (!mem || !PageCgroupUsed(pc))
1623 goto out; 1624 goto out;
1624 } 1625 }
1625 1626
1626 this_cpu_add(mem->stat->count[idx], val);
1627
1628 switch (idx) { 1627 switch (idx) {
1629 case MEM_CGROUP_STAT_FILE_MAPPED: 1628 case MEMCG_NR_FILE_MAPPED:
1630 if (val > 0) 1629 if (val > 0)
1631 SetPageCgroupFileMapped(pc); 1630 SetPageCgroupFileMapped(pc);
1632 else if (!page_mapped(page)) 1631 else if (!page_mapped(page))
1633 ClearPageCgroupFileMapped(pc); 1632 ClearPageCgroupFileMapped(pc);
1633 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1634 break; 1634 break;
1635 default: 1635 default:
1636 BUG(); 1636 BUG();
1637 } 1637 }
1638 1638
1639 this_cpu_add(mem->stat->count[idx], val);
1640
1639out: 1641out:
1640 if (unlikely(need_unlock)) 1642 if (unlikely(need_unlock))
1641 unlock_page_cgroup(pc); 1643 move_unlock_page_cgroup(pc, &flags);
1642 rcu_read_unlock(); 1644 rcu_read_unlock();
1643 return; 1645 return;
1644} 1646}
1645 1647EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1646void mem_cgroup_update_file_mapped(struct page *page, int val)
1647{
1648 mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
1649}
1650 1648
1651/* 1649/*
1652 * size of first charge trial. "32" comes from vmscan.c's magic value. 1650 * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1887,12 +1885,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1887 * oom-killer can be invoked. 1885 * oom-killer can be invoked.
1888 */ 1886 */
1889static int __mem_cgroup_try_charge(struct mm_struct *mm, 1887static int __mem_cgroup_try_charge(struct mm_struct *mm,
1890 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) 1888 gfp_t gfp_mask,
1889 struct mem_cgroup **memcg, bool oom,
1890 int page_size)
1891{ 1891{
1892 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 1892 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1893 struct mem_cgroup *mem = NULL; 1893 struct mem_cgroup *mem = NULL;
1894 int ret; 1894 int ret;
1895 int csize = CHARGE_SIZE; 1895 int csize = max(CHARGE_SIZE, (unsigned long) page_size);
1896 1896
1897 /* 1897 /*
1898	 * Unlike global VM's OOM-kill, we're not in memory shortage 1898	 * Unlike global VM's OOM-kill, we're not in memory shortage
@@ -1917,7 +1917,7 @@ again:
1917 VM_BUG_ON(css_is_removed(&mem->css)); 1917 VM_BUG_ON(css_is_removed(&mem->css));
1918 if (mem_cgroup_is_root(mem)) 1918 if (mem_cgroup_is_root(mem))
1919 goto done; 1919 goto done;
1920 if (consume_stock(mem)) 1920 if (page_size == PAGE_SIZE && consume_stock(mem))
1921 goto done; 1921 goto done;
1922 css_get(&mem->css); 1922 css_get(&mem->css);
1923 } else { 1923 } else {
@@ -1940,7 +1940,7 @@ again:
1940 rcu_read_unlock(); 1940 rcu_read_unlock();
1941 goto done; 1941 goto done;
1942 } 1942 }
1943 if (consume_stock(mem)) { 1943 if (page_size == PAGE_SIZE && consume_stock(mem)) {
1944 /* 1944 /*
1945	 * It seems dangerous to access memcg without css_get(). 1945	 * It seems dangerous to access memcg without css_get().
1946	 * But considering how consume_stock works, it's not 1946	 * But considering how consume_stock works, it's not
@@ -1981,7 +1981,7 @@ again:
1981 case CHARGE_OK: 1981 case CHARGE_OK:
1982 break; 1982 break;
1983 case CHARGE_RETRY: /* not in OOM situation but retry */ 1983 case CHARGE_RETRY: /* not in OOM situation but retry */
1984 csize = PAGE_SIZE; 1984 csize = page_size;
1985 css_put(&mem->css); 1985 css_put(&mem->css);
1986 mem = NULL; 1986 mem = NULL;
1987 goto again; 1987 goto again;
@@ -2002,8 +2002,8 @@ again:
2002 } 2002 }
2003 } while (ret != CHARGE_OK); 2003 } while (ret != CHARGE_OK);
2004 2004
2005 if (csize > PAGE_SIZE) 2005 if (csize > page_size)
2006 refill_stock(mem, csize - PAGE_SIZE); 2006 refill_stock(mem, csize - page_size);
2007 css_put(&mem->css); 2007 css_put(&mem->css);
2008done: 2008done:
2009 *memcg = mem; 2009 *memcg = mem;
@@ -2031,9 +2031,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2031 } 2031 }
2032} 2032}
2033 2033
2034static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) 2034static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2035 int page_size)
2035{ 2036{
2036 __mem_cgroup_cancel_charge(mem, 1); 2037 __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
2037} 2038}
2038 2039
2039/* 2040/*
@@ -2087,22 +2088,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2087 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be 2088 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
2088 * USED state. If already USED, uncharge and return. 2089 * USED state. If already USED, uncharge and return.
2089 */ 2090 */
2090 2091static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
2091static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2092 struct page_cgroup *pc,
2092 struct page_cgroup *pc, 2093 enum charge_type ctype)
2093 enum charge_type ctype)
2094{ 2094{
2095 /* try_charge() can return NULL to *memcg, taking care of it. */
2096 if (!mem)
2097 return;
2098
2099 lock_page_cgroup(pc);
2100 if (unlikely(PageCgroupUsed(pc))) {
2101 unlock_page_cgroup(pc);
2102 mem_cgroup_cancel_charge(mem);
2103 return;
2104 }
2105
2106 pc->mem_cgroup = mem; 2095 pc->mem_cgroup = mem;
2107 /* 2096 /*
2108 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2097 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2127,6 +2116,33 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2127 } 2116 }
2128 2117
2129 mem_cgroup_charge_statistics(mem, pc, true); 2118 mem_cgroup_charge_statistics(mem, pc, true);
2119}
2120
2121static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2122 struct page_cgroup *pc,
2123 enum charge_type ctype,
2124 int page_size)
2125{
2126 int i;
2127 int count = page_size >> PAGE_SHIFT;
2128
2129 /* try_charge() can return NULL to *memcg, taking care of it. */
2130 if (!mem)
2131 return;
2132
2133 lock_page_cgroup(pc);
2134 if (unlikely(PageCgroupUsed(pc))) {
2135 unlock_page_cgroup(pc);
2136 mem_cgroup_cancel_charge(mem, page_size);
2137 return;
2138 }
2139
2140 /*
2141	 * we don't need page_cgroup_lock for tail pages, because they are not
2142 * accessed by any other context at this point.
2143 */
2144 for (i = 0; i < count; i++)
2145 ____mem_cgroup_commit_charge(mem, pc + i, ctype);
2130 2146
2131 unlock_page_cgroup(pc); 2147 unlock_page_cgroup(pc);
2132 /* 2148 /*
@@ -2173,7 +2189,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
2173 mem_cgroup_charge_statistics(from, pc, false); 2189 mem_cgroup_charge_statistics(from, pc, false);
2174 if (uncharge) 2190 if (uncharge)
2175 /* This is not "cancel", but cancel_charge does all we need. */ 2191 /* This is not "cancel", but cancel_charge does all we need. */
2176 mem_cgroup_cancel_charge(from); 2192 mem_cgroup_cancel_charge(from, PAGE_SIZE);
2177 2193
2178 /* caller should have done css_get */ 2194 /* caller should have done css_get */
2179 pc->mem_cgroup = to; 2195 pc->mem_cgroup = to;
@@ -2195,9 +2211,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
2195 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 2211 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
2196{ 2212{
2197 int ret = -EINVAL; 2213 int ret = -EINVAL;
2214 unsigned long flags;
2215
2198 lock_page_cgroup(pc); 2216 lock_page_cgroup(pc);
2199 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { 2217 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
2218 move_lock_page_cgroup(pc, &flags);
2200 __mem_cgroup_move_account(pc, from, to, uncharge); 2219 __mem_cgroup_move_account(pc, from, to, uncharge);
2220 move_unlock_page_cgroup(pc, &flags);
2201 ret = 0; 2221 ret = 0;
2202 } 2222 }
2203 unlock_page_cgroup(pc); 2223 unlock_page_cgroup(pc);
@@ -2234,13 +2254,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2234 goto put; 2254 goto put;
2235 2255
2236 parent = mem_cgroup_from_cont(pcg); 2256 parent = mem_cgroup_from_cont(pcg);
2237 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 2257 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false,
2258 PAGE_SIZE);
2238 if (ret || !parent) 2259 if (ret || !parent)
2239 goto put_back; 2260 goto put_back;
2240 2261
2241 ret = mem_cgroup_move_account(pc, child, parent, true); 2262 ret = mem_cgroup_move_account(pc, child, parent, true);
2242 if (ret) 2263 if (ret)
2243 mem_cgroup_cancel_charge(parent); 2264 mem_cgroup_cancel_charge(parent, PAGE_SIZE);
2244put_back: 2265put_back:
2245 putback_lru_page(page); 2266 putback_lru_page(page);
2246put: 2267put:
@@ -2261,6 +2282,12 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2261 struct mem_cgroup *mem = NULL; 2282 struct mem_cgroup *mem = NULL;
2262 struct page_cgroup *pc; 2283 struct page_cgroup *pc;
2263 int ret; 2284 int ret;
2285 int page_size = PAGE_SIZE;
2286
2287 if (PageTransHuge(page)) {
2288 page_size <<= compound_order(page);
2289 VM_BUG_ON(!PageTransHuge(page));
2290 }
2264 2291
2265 pc = lookup_page_cgroup(page); 2292 pc = lookup_page_cgroup(page);
2266 /* can happen at boot */ 2293 /* can happen at boot */
@@ -2268,11 +2295,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2268 return 0; 2295 return 0;
2269 prefetchw(pc); 2296 prefetchw(pc);
2270 2297
2271 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 2298 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
2272 if (ret || !mem) 2299 if (ret || !mem)
2273 return ret; 2300 return ret;
2274 2301
2275 __mem_cgroup_commit_charge(mem, pc, ctype); 2302 __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
2276 return 0; 2303 return 0;
2277} 2304}
2278 2305
@@ -2281,8 +2308,6 @@ int mem_cgroup_newpage_charge(struct page *page,
2281{ 2308{
2282 if (mem_cgroup_disabled()) 2309 if (mem_cgroup_disabled())
2283 return 0; 2310 return 0;
2284 if (PageCompound(page))
2285 return 0;
2286 /* 2311 /*
2287 * If already mapped, we don't have to account. 2312 * If already mapped, we don't have to account.
2288 * If page cache, page->mapping has address_space. 2313 * If page cache, page->mapping has address_space.
@@ -2388,13 +2413,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2388 if (!mem) 2413 if (!mem)
2389 goto charge_cur_mm; 2414 goto charge_cur_mm;
2390 *ptr = mem; 2415 *ptr = mem;
2391 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2416 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
2392 css_put(&mem->css); 2417 css_put(&mem->css);
2393 return ret; 2418 return ret;
2394charge_cur_mm: 2419charge_cur_mm:
2395 if (unlikely(!mm)) 2420 if (unlikely(!mm))
2396 mm = &init_mm; 2421 mm = &init_mm;
2397 return __mem_cgroup_try_charge(mm, mask, ptr, true); 2422 return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
2398} 2423}
2399 2424
2400static void 2425static void
@@ -2410,7 +2435,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2410 cgroup_exclude_rmdir(&ptr->css); 2435 cgroup_exclude_rmdir(&ptr->css);
2411 pc = lookup_page_cgroup(page); 2436 pc = lookup_page_cgroup(page);
2412 mem_cgroup_lru_del_before_commit_swapcache(page); 2437 mem_cgroup_lru_del_before_commit_swapcache(page);
2413 __mem_cgroup_commit_charge(ptr, pc, ctype); 2438 __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
2414 mem_cgroup_lru_add_after_commit_swapcache(page); 2439 mem_cgroup_lru_add_after_commit_swapcache(page);
2415 /* 2440 /*
2416 * Now swap is on-memory. This means this page may be 2441 * Now swap is on-memory. This means this page may be
@@ -2459,11 +2484,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2459 return; 2484 return;
2460 if (!mem) 2485 if (!mem)
2461 return; 2486 return;
2462 mem_cgroup_cancel_charge(mem); 2487 mem_cgroup_cancel_charge(mem, PAGE_SIZE);
2463} 2488}
2464 2489
2465static void 2490static void
2466__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) 2491__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
2492 int page_size)
2467{ 2493{
2468 struct memcg_batch_info *batch = NULL; 2494 struct memcg_batch_info *batch = NULL;
2469 bool uncharge_memsw = true; 2495 bool uncharge_memsw = true;
@@ -2490,6 +2516,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2490 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2516 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2491 goto direct_uncharge; 2517 goto direct_uncharge;
2492 2518
2519 if (page_size != PAGE_SIZE)
2520 goto direct_uncharge;
2521
2493 /* 2522 /*
2494 * In typical case, batch->memcg == mem. This means we can 2523 * In typical case, batch->memcg == mem. This means we can
2495 * merge a series of uncharges to an uncharge of res_counter. 2524 * merge a series of uncharges to an uncharge of res_counter.
@@ -2503,9 +2532,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2503 batch->memsw_bytes += PAGE_SIZE; 2532 batch->memsw_bytes += PAGE_SIZE;
2504 return; 2533 return;
2505direct_uncharge: 2534direct_uncharge:
2506 res_counter_uncharge(&mem->res, PAGE_SIZE); 2535 res_counter_uncharge(&mem->res, page_size);
2507 if (uncharge_memsw) 2536 if (uncharge_memsw)
2508 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 2537 res_counter_uncharge(&mem->memsw, page_size);
2509 if (unlikely(batch->memcg != mem)) 2538 if (unlikely(batch->memcg != mem))
2510 memcg_oom_recover(mem); 2539 memcg_oom_recover(mem);
2511 return; 2540 return;
@@ -2517,8 +2546,11 @@ direct_uncharge:
2517static struct mem_cgroup * 2546static struct mem_cgroup *
2518__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2547__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2519{ 2548{
2549 int i;
2550 int count;
2520 struct page_cgroup *pc; 2551 struct page_cgroup *pc;
2521 struct mem_cgroup *mem = NULL; 2552 struct mem_cgroup *mem = NULL;
2553 int page_size = PAGE_SIZE;
2522 2554
2523 if (mem_cgroup_disabled()) 2555 if (mem_cgroup_disabled())
2524 return NULL; 2556 return NULL;
@@ -2526,6 +2558,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2526 if (PageSwapCache(page)) 2558 if (PageSwapCache(page))
2527 return NULL; 2559 return NULL;
2528 2560
2561 if (PageTransHuge(page)) {
2562 page_size <<= compound_order(page);
2563 VM_BUG_ON(!PageTransHuge(page));
2564 }
2565
2566 count = page_size >> PAGE_SHIFT;
2529 /* 2567 /*
2530 * Check if our page_cgroup is valid 2568 * Check if our page_cgroup is valid
2531 */ 2569 */
@@ -2558,7 +2596,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2558 break; 2596 break;
2559 } 2597 }
2560 2598
2561 mem_cgroup_charge_statistics(mem, pc, false); 2599 for (i = 0; i < count; i++)
2600 mem_cgroup_charge_statistics(mem, pc + i, false);
2562 2601
2563 ClearPageCgroupUsed(pc); 2602 ClearPageCgroupUsed(pc);
2564 /* 2603 /*
@@ -2579,7 +2618,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2579 mem_cgroup_get(mem); 2618 mem_cgroup_get(mem);
2580 } 2619 }
2581 if (!mem_cgroup_is_root(mem)) 2620 if (!mem_cgroup_is_root(mem))
2582 __do_uncharge(mem, ctype); 2621 __do_uncharge(mem, ctype, page_size);
2583 2622
2584 return mem; 2623 return mem;
2585 2624
@@ -2774,6 +2813,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2774 enum charge_type ctype; 2813 enum charge_type ctype;
2775 int ret = 0; 2814 int ret = 0;
2776 2815
2816 VM_BUG_ON(PageTransHuge(page));
2777 if (mem_cgroup_disabled()) 2817 if (mem_cgroup_disabled())
2778 return 0; 2818 return 0;
2779 2819
@@ -2823,7 +2863,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2823 return 0; 2863 return 0;
2824 2864
2825 *ptr = mem; 2865 *ptr = mem;
2826 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); 2866 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
2827 css_put(&mem->css);/* drop extra refcnt */ 2867 css_put(&mem->css);/* drop extra refcnt */
2828 if (ret || *ptr == NULL) { 2868 if (ret || *ptr == NULL) {
2829 if (PageAnon(page)) { 2869 if (PageAnon(page)) {
@@ -2850,13 +2890,13 @@ int mem_cgroup_prepare_migration(struct page *page,
2850 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2890 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2851 else 2891 else
2852 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2892 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2853 __mem_cgroup_commit_charge(mem, pc, ctype); 2893 __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
2854 return ret; 2894 return ret;
2855} 2895}
2856 2896
2857/* remove redundant charge if migration failed*/ 2897/* remove redundant charge if migration failed*/
2858void mem_cgroup_end_migration(struct mem_cgroup *mem, 2898void mem_cgroup_end_migration(struct mem_cgroup *mem,
2859 struct page *oldpage, struct page *newpage) 2899 struct page *oldpage, struct page *newpage, bool migration_ok)
2860{ 2900{
2861 struct page *used, *unused; 2901 struct page *used, *unused;
2862 struct page_cgroup *pc; 2902 struct page_cgroup *pc;
@@ -2865,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
2865 return; 2905 return;
2866 /* blocks rmdir() */ 2906 /* blocks rmdir() */
2867 cgroup_exclude_rmdir(&mem->css); 2907 cgroup_exclude_rmdir(&mem->css);
2868 /* at migration success, oldpage->mapping is NULL. */ 2908 if (!migration_ok) {
2869 if (oldpage->mapping) {
2870 used = oldpage; 2909 used = oldpage;
2871 unused = newpage; 2910 unused = newpage;
2872 } else { 2911 } else {
@@ -4176,13 +4215,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4176 */ 4215 */
4177 if (!node_state(node, N_NORMAL_MEMORY)) 4216 if (!node_state(node, N_NORMAL_MEMORY))
4178 tmp = -1; 4217 tmp = -1;
4179 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4218 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4180 if (!pn) 4219 if (!pn)
4181 return 1; 4220 return 1;
4182 4221
4183 mem->info.nodeinfo[node] = pn; 4222 mem->info.nodeinfo[node] = pn;
4184 memset(pn, 0, sizeof(*pn));
4185
4186 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4223 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4187 mz = &pn->zoneinfo[zone]; 4224 mz = &pn->zoneinfo[zone];
4188 for_each_lru(l) 4225 for_each_lru(l)
@@ -4206,14 +4243,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
4206 4243
4207 /* Can be very big if MAX_NUMNODES is very big */ 4244 /* Can be very big if MAX_NUMNODES is very big */
4208 if (size < PAGE_SIZE) 4245 if (size < PAGE_SIZE)
4209 mem = kmalloc(size, GFP_KERNEL); 4246 mem = kzalloc(size, GFP_KERNEL);
4210 else 4247 else
4211 mem = vmalloc(size); 4248 mem = vzalloc(size);
4212 4249
4213 if (!mem) 4250 if (!mem)
4214 return NULL; 4251 return NULL;
4215 4252
4216 memset(mem, 0, size);
4217 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4253 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4218 if (!mem->stat) 4254 if (!mem->stat)
4219 goto out_free; 4255 goto out_free;
@@ -4461,7 +4497,8 @@ one_by_one:
4461 batch_count = PRECHARGE_COUNT_AT_ONCE; 4497 batch_count = PRECHARGE_COUNT_AT_ONCE;
4462 cond_resched(); 4498 cond_resched();
4463 } 4499 }
4464 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 4500 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
4501 PAGE_SIZE);
4465 if (ret || !mem) 4502 if (ret || !mem)
4466 /* mem_cgroup_clear_mc() will do uncharge later */ 4503 /* mem_cgroup_clear_mc() will do uncharge later */
4467 return -ENOMEM; 4504 return -ENOMEM;
@@ -4623,6 +4660,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4623 pte_t *pte; 4660 pte_t *pte;
4624 spinlock_t *ptl; 4661 spinlock_t *ptl;
4625 4662
4663 VM_BUG_ON(pmd_trans_huge(*pmd));
4626 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4664 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4627 for (; addr != end; pte++, addr += PAGE_SIZE) 4665 for (; addr != end; pte++, addr += PAGE_SIZE)
4628 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 4666 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4638,7 +4676,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4638 unsigned long precharge; 4676 unsigned long precharge;
4639 struct vm_area_struct *vma; 4677 struct vm_area_struct *vma;
4640 4678
4641 /* We've already held the mmap_sem */ 4679 down_read(&mm->mmap_sem);
4642 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4680 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4643 struct mm_walk mem_cgroup_count_precharge_walk = { 4681 struct mm_walk mem_cgroup_count_precharge_walk = {
4644 .pmd_entry = mem_cgroup_count_precharge_pte_range, 4682 .pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4650,6 +4688,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4650 walk_page_range(vma->vm_start, vma->vm_end, 4688 walk_page_range(vma->vm_start, vma->vm_end,
4651 &mem_cgroup_count_precharge_walk); 4689 &mem_cgroup_count_precharge_walk);
4652 } 4690 }
4691 up_read(&mm->mmap_sem);
4653 4692
4654 precharge = mc.precharge; 4693 precharge = mc.precharge;
4655 mc.precharge = 0; 4694 mc.precharge = 0;
@@ -4659,10 +4698,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4659 4698
4660static int mem_cgroup_precharge_mc(struct mm_struct *mm) 4699static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4661{ 4700{
4662 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); 4701 unsigned long precharge = mem_cgroup_count_precharge(mm);
4702
4703 VM_BUG_ON(mc.moving_task);
4704 mc.moving_task = current;
4705 return mem_cgroup_do_precharge(precharge);
4663} 4706}
4664 4707
4665static void mem_cgroup_clear_mc(void) 4708/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
4709static void __mem_cgroup_clear_mc(void)
4666{ 4710{
4667 struct mem_cgroup *from = mc.from; 4711 struct mem_cgroup *from = mc.from;
4668 struct mem_cgroup *to = mc.to; 4712 struct mem_cgroup *to = mc.to;
@@ -4697,23 +4741,28 @@ static void mem_cgroup_clear_mc(void)
4697 PAGE_SIZE * mc.moved_swap); 4741 PAGE_SIZE * mc.moved_swap);
4698 } 4742 }
4699 /* we've already done mem_cgroup_get(mc.to) */ 4743 /* we've already done mem_cgroup_get(mc.to) */
4700
4701 mc.moved_swap = 0; 4744 mc.moved_swap = 0;
4702 } 4745 }
4703 if (mc.mm) { 4746 memcg_oom_recover(from);
4704 up_read(&mc.mm->mmap_sem); 4747 memcg_oom_recover(to);
4705 mmput(mc.mm); 4748 wake_up_all(&mc.waitq);
4706 } 4749}
4750
4751static void mem_cgroup_clear_mc(void)
4752{
4753 struct mem_cgroup *from = mc.from;
4754
4755 /*
4756 * we must clear moving_task before waking up waiters at the end of
4757 * task migration.
4758 */
4759 mc.moving_task = NULL;
4760 __mem_cgroup_clear_mc();
4707 spin_lock(&mc.lock); 4761 spin_lock(&mc.lock);
4708 mc.from = NULL; 4762 mc.from = NULL;
4709 mc.to = NULL; 4763 mc.to = NULL;
4710 spin_unlock(&mc.lock); 4764 spin_unlock(&mc.lock);
4711 mc.moving_task = NULL;
4712 mc.mm = NULL;
4713 mem_cgroup_end_move(from); 4765 mem_cgroup_end_move(from);
4714 memcg_oom_recover(from);
4715 memcg_oom_recover(to);
4716 wake_up_all(&mc.waitq);
4717} 4766}
4718 4767
4719static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4768static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4735,38 +4784,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4735 return 0; 4784 return 0;
4736	 /* We move charges only when we move an owner of the mm */ 4785	 /* We move charges only when we move an owner of the mm */
4737 if (mm->owner == p) { 4786 if (mm->owner == p) {
4738 /*
4739 * We do all the move charge works under one mmap_sem to
4740 * avoid deadlock with down_write(&mmap_sem)
4741 * -> try_charge() -> if (mc.moving_task) -> sleep.
4742 */
4743 down_read(&mm->mmap_sem);
4744
4745 VM_BUG_ON(mc.from); 4787 VM_BUG_ON(mc.from);
4746 VM_BUG_ON(mc.to); 4788 VM_BUG_ON(mc.to);
4747 VM_BUG_ON(mc.precharge); 4789 VM_BUG_ON(mc.precharge);
4748 VM_BUG_ON(mc.moved_charge); 4790 VM_BUG_ON(mc.moved_charge);
4749 VM_BUG_ON(mc.moved_swap); 4791 VM_BUG_ON(mc.moved_swap);
4750 VM_BUG_ON(mc.moving_task);
4751 VM_BUG_ON(mc.mm);
4752
4753 mem_cgroup_start_move(from); 4792 mem_cgroup_start_move(from);
4754 spin_lock(&mc.lock); 4793 spin_lock(&mc.lock);
4755 mc.from = from; 4794 mc.from = from;
4756 mc.to = mem; 4795 mc.to = mem;
4757 mc.precharge = 0;
4758 mc.moved_charge = 0;
4759 mc.moved_swap = 0;
4760 spin_unlock(&mc.lock); 4796 spin_unlock(&mc.lock);
4761 mc.moving_task = current; 4797 /* We set mc.moving_task later */
4762 mc.mm = mm;
4763 4798
4764 ret = mem_cgroup_precharge_mc(mm); 4799 ret = mem_cgroup_precharge_mc(mm);
4765 if (ret) 4800 if (ret)
4766 mem_cgroup_clear_mc(); 4801 mem_cgroup_clear_mc();
4767 /* We call up_read() and mmput() in clear_mc(). */ 4802 }
4768 } else 4803 mmput(mm);
4769 mmput(mm);
4770 } 4804 }
4771 return ret; 4805 return ret;
4772} 4806}
@@ -4789,6 +4823,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4789 spinlock_t *ptl; 4823 spinlock_t *ptl;
4790 4824
4791retry: 4825retry:
4826 VM_BUG_ON(pmd_trans_huge(*pmd));
4792 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4827 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4793 for (; addr != end; addr += PAGE_SIZE) { 4828 for (; addr != end; addr += PAGE_SIZE) {
4794 pte_t ptent = *(pte++); 4829 pte_t ptent = *(pte++);
@@ -4854,7 +4889,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4854 struct vm_area_struct *vma; 4889 struct vm_area_struct *vma;
4855 4890
4856 lru_add_drain_all(); 4891 lru_add_drain_all();
4857 /* We've already held the mmap_sem */ 4892retry:
4893 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
4894 /*
4895	 * Someone who is holding the mmap_sem might be waiting in
4896 * waitq. So we cancel all extra charges, wake up all waiters,
4897 * and retry. Because we cancel precharges, we might not be able
4898 * to move enough charges, but moving charge is a best-effort
4899 * feature anyway, so it wouldn't be a big problem.
4900 */
4901 __mem_cgroup_clear_mc();
4902 cond_resched();
4903 goto retry;
4904 }
4858 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4905 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4859 int ret; 4906 int ret;
4860 struct mm_walk mem_cgroup_move_charge_walk = { 4907 struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4873,6 +4920,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4873 */ 4920 */
4874 break; 4921 break;
4875 } 4922 }
4923 up_read(&mm->mmap_sem);
4876} 4924}
4877 4925
4878static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4926static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4881,11 +4929,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4881 struct task_struct *p, 4929 struct task_struct *p,
4882 bool threadgroup) 4930 bool threadgroup)
4883{ 4931{
4884 if (!mc.mm) 4932 struct mm_struct *mm;
4933
4934 if (!mc.to)
4885 /* no need to move charge */ 4935 /* no need to move charge */
4886 return; 4936 return;
4887 4937
4888 mem_cgroup_move_charge(mc.mm); 4938 mm = get_task_mm(p);
4939 if (mm) {
4940 mem_cgroup_move_charge(mm);
4941 mmput(mm);
4942 }
4889 mem_cgroup_clear_mc(); 4943 mem_cgroup_clear_mc();
4890} 4944}
4891#else /* !CONFIG_MMU */ 4945#else /* !CONFIG_MMU */
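
The memcg changes above thread a page_size argument through try_charge/commit/uncharge so that a transparent huge page is charged and uncharged as a single huge-page-sized unit. One observable consequence is that a THP fault should move the cgroup's memory.usage_in_bytes by the full huge page size at once. A sketch of reading that counter; the /sys/fs/cgroup/memory mount point is an assumption, substitute wherever the memory controller is mounted:

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/memory/memory.usage_in_bytes";
	unsigned long long usage;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%llu", &usage) == 1)
		printf("memcg usage: %llu bytes\n", usage);
	fclose(f);
	return 0;
}
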
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 46ab2c044b0e..548fbd70f026 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
203#ifdef __ARCH_SI_TRAPNO 203#ifdef __ARCH_SI_TRAPNO
204 si.si_trapno = trapno; 204 si.si_trapno = trapno;
205#endif 205#endif
206 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; 206 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
207 /* 207 /*
208 * Don't use force here, it's convenient if the signal 208 * Don't use force here, it's convenient if the signal
209 * can be temporarily blocked. 209 * can be temporarily blocked.
@@ -386,6 +386,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
386 struct task_struct *tsk; 386 struct task_struct *tsk;
387 struct anon_vma *av; 387 struct anon_vma *av;
388 388
389 if (!PageHuge(page) && unlikely(split_huge_page(page)))
390 return;
389 read_lock(&tasklist_lock); 391 read_lock(&tasklist_lock);
390 av = page_lock_anon_vma(page); 392 av = page_lock_anon_vma(page);
391 if (av == NULL) /* Not actually mapped anymore */ 393 if (av == NULL) /* Not actually mapped anymore */
@@ -928,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
928static void set_page_hwpoison_huge_page(struct page *hpage) 930static void set_page_hwpoison_huge_page(struct page *hpage)
929{ 931{
930 int i; 932 int i;
931 int nr_pages = 1 << compound_order(hpage); 933 int nr_pages = 1 << compound_trans_order(hpage);
932 for (i = 0; i < nr_pages; i++) 934 for (i = 0; i < nr_pages; i++)
933 SetPageHWPoison(hpage + i); 935 SetPageHWPoison(hpage + i);
934} 936}
@@ -936,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
936static void clear_page_hwpoison_huge_page(struct page *hpage) 938static void clear_page_hwpoison_huge_page(struct page *hpage)
937{ 939{
938 int i; 940 int i;
939 int nr_pages = 1 << compound_order(hpage); 941 int nr_pages = 1 << compound_trans_order(hpage);
940 for (i = 0; i < nr_pages; i++) 942 for (i = 0; i < nr_pages; i++)
941 ClearPageHWPoison(hpage + i); 943 ClearPageHWPoison(hpage + i);
942} 944}
@@ -966,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
966 return 0; 968 return 0;
967 } 969 }
968 970
969 nr_pages = 1 << compound_order(hpage); 971 nr_pages = 1 << compound_trans_order(hpage);
970 atomic_long_add(nr_pages, &mce_bad_pages); 972 atomic_long_add(nr_pages, &mce_bad_pages);
971 973
972 /* 974 /*
@@ -1164,7 +1166,7 @@ int unpoison_memory(unsigned long pfn)
1164 return 0; 1166 return 0;
1165 } 1167 }
1166 1168
1167 nr_pages = 1 << compound_order(page); 1169 nr_pages = 1 << compound_trans_order(page);
1168 1170
1169 if (!get_page_unless_zero(page)) { 1171 if (!get_page_unless_zero(page)) {
1170 /* 1172 /*
@@ -1290,9 +1292,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
1290 /* Keep page count to indicate a given hugepage is isolated. */ 1292 /* Keep page count to indicate a given hugepage is isolated. */
1291 1293
1292 list_add(&hpage->lru, &pagelist); 1294 list_add(&hpage->lru, &pagelist);
1293 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1295 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
1296 true);
1294 if (ret) { 1297 if (ret) {
1295 putback_lru_pages(&pagelist); 1298 putback_lru_pages(&pagelist);
1296 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1299 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1297 pfn, ret, page->flags); 1300 pfn, ret, page->flags);
1298 if (ret > 0) 1301 if (ret > 0)
@@ -1301,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1301 } 1304 }
1302done: 1305done:
1303 if (!PageHWPoison(hpage)) 1306 if (!PageHWPoison(hpage))
1304 atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); 1307 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
1305 set_page_hwpoison_huge_page(hpage); 1308 set_page_hwpoison_huge_page(hpage);
1306 dequeue_hwpoisoned_huge_page(hpage); 1309 dequeue_hwpoisoned_huge_page(hpage);
1307 /* keep elevated page count for bad page */ 1310 /* keep elevated page count for bad page */
@@ -1413,7 +1416,8 @@ int soft_offline_page(struct page *page, int flags)
1413 LIST_HEAD(pagelist); 1416 LIST_HEAD(pagelist);
1414 1417
1415 list_add(&page->lru, &pagelist); 1418 list_add(&page->lru, &pagelist);
1416 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1419 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1420 0, true);
1417 if (ret) { 1421 if (ret) {
1418 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1422 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1419 pfn, ret, page->flags); 1423 pfn, ret, page->flags);
diff --git a/mm/memory.c b/mm/memory.c
index 02e48aa0ed13..31250faff390 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
394 } 394 }
395} 395}
396 396
397int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 397int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
398 pmd_t *pmd, unsigned long address)
398{ 399{
399 pgtable_t new = pte_alloc_one(mm, address); 400 pgtable_t new = pte_alloc_one(mm, address);
401 int wait_split_huge_page;
400 if (!new) 402 if (!new)
401 return -ENOMEM; 403 return -ENOMEM;
402 404
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
416 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 418 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
417 419
418 spin_lock(&mm->page_table_lock); 420 spin_lock(&mm->page_table_lock);
419 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 421 wait_split_huge_page = 0;
422 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
420 mm->nr_ptes++; 423 mm->nr_ptes++;
421 pmd_populate(mm, pmd, new); 424 pmd_populate(mm, pmd, new);
422 new = NULL; 425 new = NULL;
423 } 426 } else if (unlikely(pmd_trans_splitting(*pmd)))
427 wait_split_huge_page = 1;
424 spin_unlock(&mm->page_table_lock); 428 spin_unlock(&mm->page_table_lock);
425 if (new) 429 if (new)
426 pte_free(mm, new); 430 pte_free(mm, new);
431 if (wait_split_huge_page)
432 wait_split_huge_page(vma->anon_vma, pmd);
427 return 0; 433 return 0;
428} 434}
429 435
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
436 smp_wmb(); /* See comment in __pte_alloc */ 442 smp_wmb(); /* See comment in __pte_alloc */
437 443
438 spin_lock(&init_mm.page_table_lock); 444 spin_lock(&init_mm.page_table_lock);
439 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 445 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
440 pmd_populate_kernel(&init_mm, pmd, new); 446 pmd_populate_kernel(&init_mm, pmd, new);
441 new = NULL; 447 new = NULL;
442 } 448 } else
449 VM_BUG_ON(pmd_trans_splitting(*pmd));
443 spin_unlock(&init_mm.page_table_lock); 450 spin_unlock(&init_mm.page_table_lock);
444 if (new) 451 if (new)
445 pte_free_kernel(&init_mm, new); 452 pte_free_kernel(&init_mm, new);
@@ -719,9 +726,9 @@ out_set_pte:
719 return 0; 726 return 0;
720} 727}
721 728
722static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 729int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
723 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 730 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
724 unsigned long addr, unsigned long end) 731 unsigned long addr, unsigned long end)
725{ 732{
726 pte_t *orig_src_pte, *orig_dst_pte; 733 pte_t *orig_src_pte, *orig_dst_pte;
727 pte_t *src_pte, *dst_pte; 734 pte_t *src_pte, *dst_pte;
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
795 src_pmd = pmd_offset(src_pud, addr); 802 src_pmd = pmd_offset(src_pud, addr);
796 do { 803 do {
797 next = pmd_addr_end(addr, end); 804 next = pmd_addr_end(addr, end);
805 if (pmd_trans_huge(*src_pmd)) {
806 int err;
807 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
808 err = copy_huge_pmd(dst_mm, src_mm,
809 dst_pmd, src_pmd, addr, vma);
810 if (err == -ENOMEM)
811 return -ENOMEM;
812 if (!err)
813 continue;
814 /* fall through */
815 }
798 if (pmd_none_or_clear_bad(src_pmd)) 816 if (pmd_none_or_clear_bad(src_pmd))
799 continue; 817 continue;
800 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 818 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
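This copy_pmd_range() hunk is the first of several page-table walkers in this merge (zap_pmd_range() just below, and the mincore and mprotect walkers later in this section) that grow the same shape: deal with a transparent huge pmd at pmd granularity when possible, otherwise split it and fall through to the existing pte loop. A generic sketch of that shape, where handle_huge_pmd() and handle_pte_range() are placeholder names for whatever the particular walker does:

pmd = pmd_offset(pud, addr);
do {
	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd)) {
		if (handle_huge_pmd(vma, pmd, addr, next))
			continue;	/* whole huge pmd handled in one go */
		/* the handler split the pmd: fall through to pte level */
	}
	if (pmd_none_or_clear_bad(pmd))
		continue;
	handle_pte_range(vma, pmd, addr, next);
} while (pmd++, addr = next, addr != end);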
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
997 pmd = pmd_offset(pud, addr); 1015 pmd = pmd_offset(pud, addr);
998 do { 1016 do {
999 next = pmd_addr_end(addr, end); 1017 next = pmd_addr_end(addr, end);
1018 if (pmd_trans_huge(*pmd)) {
1019 if (next-addr != HPAGE_PMD_SIZE) {
1020 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1021 split_huge_page_pmd(vma->vm_mm, pmd);
1022 } else if (zap_huge_pmd(tlb, vma, pmd)) {
1023 (*zap_work)--;
1024 continue;
1025 }
1026 /* fall through */
1027 }
1000 if (pmd_none_or_clear_bad(pmd)) { 1028 if (pmd_none_or_clear_bad(pmd)) {
1001 (*zap_work)--; 1029 (*zap_work)--;
1002 continue; 1030 continue;
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1262 pud = pud_offset(pgd, address); 1290 pud = pud_offset(pgd, address);
1263 if (pud_none(*pud)) 1291 if (pud_none(*pud))
1264 goto no_page_table; 1292 goto no_page_table;
1265 if (pud_huge(*pud)) { 1293 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1266 BUG_ON(flags & FOLL_GET); 1294 BUG_ON(flags & FOLL_GET);
1267 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1295 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1268 goto out; 1296 goto out;
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1273 pmd = pmd_offset(pud, address); 1301 pmd = pmd_offset(pud, address);
1274 if (pmd_none(*pmd)) 1302 if (pmd_none(*pmd))
1275 goto no_page_table; 1303 goto no_page_table;
1276 if (pmd_huge(*pmd)) { 1304 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1277 BUG_ON(flags & FOLL_GET); 1305 BUG_ON(flags & FOLL_GET);
1278 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1306 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1279 goto out; 1307 goto out;
1280 } 1308 }
1309 if (pmd_trans_huge(*pmd)) {
1310 if (flags & FOLL_SPLIT) {
1311 split_huge_page_pmd(mm, pmd);
1312 goto split_fallthrough;
1313 }
1314 spin_lock(&mm->page_table_lock);
1315 if (likely(pmd_trans_huge(*pmd))) {
1316 if (unlikely(pmd_trans_splitting(*pmd))) {
1317 spin_unlock(&mm->page_table_lock);
1318 wait_split_huge_page(vma->anon_vma, pmd);
1319 } else {
1320 page = follow_trans_huge_pmd(mm, address,
1321 pmd, flags);
1322 spin_unlock(&mm->page_table_lock);
1323 goto out;
1324 }
1325 } else
1326 spin_unlock(&mm->page_table_lock);
1327 /* fall through */
1328 }
1329split_fallthrough:
1281 if (unlikely(pmd_bad(*pmd))) 1330 if (unlikely(pmd_bad(*pmd)))
1282 goto no_page_table; 1331 goto no_page_table;
1283 1332
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1310 */ 1359 */
1311 mark_page_accessed(page); 1360 mark_page_accessed(page);
1312 } 1361 }
1362 if (flags & FOLL_MLOCK) {
1363 /*
1364 * The preliminary mapping check is mainly to avoid the
1365 * pointless overhead of lock_page on the ZERO_PAGE
1366 * which might bounce very badly if there is contention.
1367 *
1368 * If the page is already locked, we don't need to
1369 * handle it now - vmscan will handle it later if and
1370 * when it attempts to reclaim the page.
1371 */
1372 if (page->mapping && trylock_page(page)) {
1373 lru_add_drain(); /* push cached pages to LRU */
1374 /*
1375 * Because we lock page here and migration is
1376 * blocked by the pte's page reference, we need
1377 * only check for file-cache page truncation.
1378 */
1379 if (page->mapping)
1380 mlock_vma_page(page);
1381 unlock_page(page);
1382 }
1383 }
1313unlock: 1384unlock:
1314 pte_unmap_unlock(ptep, ptl); 1385 pte_unmap_unlock(ptep, ptl);
1315out: 1386out:
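follow_page() above learns two new flags: FOLL_SPLIT forces a transparent huge page covering the address to be split so a regular 4k page is returned, and FOLL_MLOCK mlocks an already-mapped page in place for the reworked mlock path later in this section. A hypothetical FOLL_SPLIT caller; the migrate.c hunk below uses the same flag combination in move_pages():

/* Hypothetical: guarantee a 4k page by splitting any THP first. */
page = follow_page(vma, addr, FOLL_GET | FOLL_SPLIT);
if (IS_ERR(page))
	return PTR_ERR(page);
if (!page)
	return -EFAULT;		/* placeholder: nothing mapped here */
/* ... use the base page, then ... */
put_page(page);			/* balance the FOLL_GET reference */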
@@ -1341,7 +1412,8 @@ no_page_table:
1341 1412
1342int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1413int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1343 unsigned long start, int nr_pages, unsigned int gup_flags, 1414 unsigned long start, int nr_pages, unsigned int gup_flags,
1344 struct page **pages, struct vm_area_struct **vmas) 1415 struct page **pages, struct vm_area_struct **vmas,
1416 int *nonblocking)
1345{ 1417{
1346 int i; 1418 int i;
1347 unsigned long vm_flags; 1419 unsigned long vm_flags;
@@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1386 pmd = pmd_offset(pud, pg); 1458 pmd = pmd_offset(pud, pg);
1387 if (pmd_none(*pmd)) 1459 if (pmd_none(*pmd))
1388 return i ? : -EFAULT; 1460 return i ? : -EFAULT;
1461 VM_BUG_ON(pmd_trans_huge(*pmd));
1389 pte = pte_offset_map(pmd, pg); 1462 pte = pte_offset_map(pmd, pg);
1390 if (pte_none(*pte)) { 1463 if (pte_none(*pte)) {
1391 pte_unmap(pte); 1464 pte_unmap(pte);
@@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1441 cond_resched(); 1514 cond_resched();
1442 while (!(page = follow_page(vma, start, foll_flags))) { 1515 while (!(page = follow_page(vma, start, foll_flags))) {
1443 int ret; 1516 int ret;
1517 unsigned int fault_flags = 0;
1518
1519 if (foll_flags & FOLL_WRITE)
1520 fault_flags |= FAULT_FLAG_WRITE;
1521 if (nonblocking)
1522 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1444 1523
1445 ret = handle_mm_fault(mm, vma, start, 1524 ret = handle_mm_fault(mm, vma, start,
1446 (foll_flags & FOLL_WRITE) ? 1525 fault_flags);
1447 FAULT_FLAG_WRITE : 0);
1448 1526
1449 if (ret & VM_FAULT_ERROR) { 1527 if (ret & VM_FAULT_ERROR) {
1450 if (ret & VM_FAULT_OOM) 1528 if (ret & VM_FAULT_OOM)
@@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1460 else 1538 else
1461 tsk->min_flt++; 1539 tsk->min_flt++;
1462 1540
1541 if (ret & VM_FAULT_RETRY) {
1542 *nonblocking = 0;
1543 return i;
1544 }
1545
1463 /* 1546 /*
1464 * The VM_FAULT_WRITE bit tells us that 1547 * The VM_FAULT_WRITE bit tells us that
1465 * do_wp_page has broken COW when necessary, 1548 * do_wp_page has broken COW when necessary,
@@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1559 if (force) 1642 if (force)
1560 flags |= FOLL_FORCE; 1643 flags |= FOLL_FORCE;
1561 1644
1562 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 1645 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1646 NULL);
1563} 1647}
1564EXPORT_SYMBOL(get_user_pages); 1648EXPORT_SYMBOL(get_user_pages);
1565 1649
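__get_user_pages() above gains a nonblocking output parameter: when a fault is allowed to retry (FAULT_FLAG_ALLOW_RETRY) and ends up dropping mmap_sem, *nonblocking is cleared and the walk returns early with the pages pinned so far; get_user_pages() itself simply passes NULL. A sketch of the caller-side contract, mirroring what do_mlock_pages() in the mlock.c hunks further down does:

/* Sketch: 'locked' tracks whether mmap_sem is still held on return. */
int locked = 1;
int ret;

down_read(&mm->mmap_sem);
ret = __get_user_pages(current, mm, start, nr_pages, FOLL_TOUCH,
		       NULL, NULL, &locked);
if (locked)
	up_read(&mm->mmap_sem);	/* else a retried fault already dropped it */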
@@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr)
1584 struct page *page; 1668 struct page *page;
1585 1669
1586 if (__get_user_pages(current, current->mm, addr, 1, 1670 if (__get_user_pages(current, current->mm, addr, 1,
1587 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) 1671 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1672 NULL) < 1)
1588 return NULL; 1673 return NULL;
1589 flush_cache_page(vma, addr, page_to_pfn(page)); 1674 flush_cache_page(vma, addr, page_to_pfn(page));
1590 return page; 1675 return page;
@@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1598 pud_t * pud = pud_alloc(mm, pgd, addr); 1683 pud_t * pud = pud_alloc(mm, pgd, addr);
1599 if (pud) { 1684 if (pud) {
1600 pmd_t * pmd = pmd_alloc(mm, pud, addr); 1685 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1601 if (pmd) 1686 if (pmd) {
1687 VM_BUG_ON(pmd_trans_huge(*pmd));
1602 return pte_alloc_map_lock(mm, pmd, addr, ptl); 1688 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1689 }
1603 } 1690 }
1604 return NULL; 1691 return NULL;
1605} 1692}
@@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1818 pmd = pmd_alloc(mm, pud, addr); 1905 pmd = pmd_alloc(mm, pud, addr);
1819 if (!pmd) 1906 if (!pmd)
1820 return -ENOMEM; 1907 return -ENOMEM;
1908 VM_BUG_ON(pmd_trans_huge(*pmd));
1821 do { 1909 do {
1822 next = pmd_addr_end(addr, end); 1910 next = pmd_addr_end(addr, end);
1823 if (remap_pte_range(mm, pmd, addr, next, 1911 if (remap_pte_range(mm, pmd, addr, next,
@@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2048 return same; 2136 return same;
2049} 2137}
2050 2138
2051/*
2052 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
2053 * servicing faults for write access. In the normal case, do always want
2054 * pte_mkwrite. But get_user_pages can cause write faults for mappings
2055 * that do not have writing enabled, when used by access_process_vm.
2056 */
2057static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
2058{
2059 if (likely(vma->vm_flags & VM_WRITE))
2060 pte = pte_mkwrite(pte);
2061 return pte;
2062}
2063
2064static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2139static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2065{ 2140{
2066 /* 2141 /*
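The deletion above removes maybe_mkwrite() from memory.c, yet the reuse path of do_wp_page() still calls it a few lines further down (new line 2304), so the helper presumably moves to a shared header elsewhere in this merge where the huge-page fault code can also use it; that relocation is not visible in this section. The removed body, reproduced verbatim for reference:

static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pte = pte_mkwrite(pte);
	return pte;
}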
@@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2112{ 2187{
2113 struct page *old_page, *new_page; 2188 struct page *old_page, *new_page;
2114 pte_t entry; 2189 pte_t entry;
2115 int reuse = 0, ret = 0; 2190 int ret = 0;
2116 int page_mkwrite = 0; 2191 int page_mkwrite = 0;
2117 struct page *dirty_page = NULL; 2192 struct page *dirty_page = NULL;
2118 2193
@@ -2149,14 +2224,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2149 } 2224 }
2150 page_cache_release(old_page); 2225 page_cache_release(old_page);
2151 } 2226 }
2152 reuse = reuse_swap_page(old_page); 2227 if (reuse_swap_page(old_page)) {
2153 if (reuse)
2154 /* 2228 /*
2155 * The page is all ours. Move it to our anon_vma so 2229 * The page is all ours. Move it to our anon_vma so
2156 * the rmap code will not search our parent or siblings. 2230 * the rmap code will not search our parent or siblings.
2157 * Protected against the rmap code by the page lock. 2231 * Protected against the rmap code by the page lock.
2158 */ 2232 */
2159 page_move_anon_rmap(old_page, vma, address); 2233 page_move_anon_rmap(old_page, vma, address);
2234 unlock_page(old_page);
2235 goto reuse;
2236 }
2160 unlock_page(old_page); 2237 unlock_page(old_page);
2161 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2238 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2162 (VM_WRITE|VM_SHARED))) { 2239 (VM_WRITE|VM_SHARED))) {
@@ -2220,18 +2297,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2220 } 2297 }
2221 dirty_page = old_page; 2298 dirty_page = old_page;
2222 get_page(dirty_page); 2299 get_page(dirty_page);
2223 reuse = 1;
2224 }
2225 2300
2226 if (reuse) {
2227reuse: 2301reuse:
2228 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2302 flush_cache_page(vma, address, pte_pfn(orig_pte));
2229 entry = pte_mkyoung(orig_pte); 2303 entry = pte_mkyoung(orig_pte);
2230 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2304 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2231 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2305 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2232 update_mmu_cache(vma, address, page_table); 2306 update_mmu_cache(vma, address, page_table);
2307 pte_unmap_unlock(page_table, ptl);
2233 ret |= VM_FAULT_WRITE; 2308 ret |= VM_FAULT_WRITE;
2234 goto unlock; 2309
2310 if (!dirty_page)
2311 return ret;
2312
2313 /*
2314 * Yes, Virginia, this is actually required to prevent a race
2315 * with clear_page_dirty_for_io() from clearing the page dirty
2316 * bit after it clear all dirty ptes, but before a racing
2317 * do_wp_page installs a dirty pte.
2318 *
2319 * do_no_page is protected similarly.
2320 */
2321 if (!page_mkwrite) {
2322 wait_on_page_locked(dirty_page);
2323 set_page_dirty_balance(dirty_page, page_mkwrite);
2324 }
2325 put_page(dirty_page);
2326 if (page_mkwrite) {
2327 struct address_space *mapping = dirty_page->mapping;
2328
2329 set_page_dirty(dirty_page);
2330 unlock_page(dirty_page);
2331 page_cache_release(dirty_page);
2332 if (mapping) {
2333 /*
2334 * Some device drivers do not set page.mapping
2335 * but still dirty their pages
2336 */
2337 balance_dirty_pages_ratelimited(mapping);
2338 }
2339 }
2340
2341 /* file_update_time outside page_lock */
2342 if (vma->vm_file)
2343 file_update_time(vma->vm_file);
2344
2345 return ret;
2235 } 2346 }
2236 2347
2237 /* 2348 /*
@@ -2337,39 +2448,6 @@ gotten:
2337 page_cache_release(old_page); 2448 page_cache_release(old_page);
2338unlock: 2449unlock:
2339 pte_unmap_unlock(page_table, ptl); 2450 pte_unmap_unlock(page_table, ptl);
2340 if (dirty_page) {
2341 /*
2342 * Yes, Virginia, this is actually required to prevent a race
2343 * with clear_page_dirty_for_io() from clearing the page dirty
2344 * bit after it clear all dirty ptes, but before a racing
2345 * do_wp_page installs a dirty pte.
2346 *
2347 * do_no_page is protected similarly.
2348 */
2349 if (!page_mkwrite) {
2350 wait_on_page_locked(dirty_page);
2351 set_page_dirty_balance(dirty_page, page_mkwrite);
2352 }
2353 put_page(dirty_page);
2354 if (page_mkwrite) {
2355 struct address_space *mapping = dirty_page->mapping;
2356
2357 set_page_dirty(dirty_page);
2358 unlock_page(dirty_page);
2359 page_cache_release(dirty_page);
2360 if (mapping) {
2361 /*
2362 * Some device drivers do not set page.mapping
2363 * but still dirty their pages
2364 */
2365 balance_dirty_pages_ratelimited(mapping);
2366 }
2367 }
2368
2369 /* file_update_time outside page_lock */
2370 if (vma->vm_file)
2371 file_update_time(vma->vm_file);
2372 }
2373 return ret; 2451 return ret;
2374oom_free_new: 2452oom_free_new:
2375 page_cache_release(new_page); 2453 page_cache_release(new_page);
@@ -3147,9 +3225,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3147 * but allow concurrent faults), and pte mapped but not yet locked. 3225 * but allow concurrent faults), and pte mapped but not yet locked.
3148 * We return with mmap_sem still held, but pte unmapped and unlocked. 3226 * We return with mmap_sem still held, but pte unmapped and unlocked.
3149 */ 3227 */
3150static inline int handle_pte_fault(struct mm_struct *mm, 3228int handle_pte_fault(struct mm_struct *mm,
3151 struct vm_area_struct *vma, unsigned long address, 3229 struct vm_area_struct *vma, unsigned long address,
3152 pte_t *pte, pmd_t *pmd, unsigned int flags) 3230 pte_t *pte, pmd_t *pmd, unsigned int flags)
3153{ 3231{
3154 pte_t entry; 3232 pte_t entry;
3155 spinlock_t *ptl; 3233 spinlock_t *ptl;
@@ -3228,9 +3306,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3228 pmd = pmd_alloc(mm, pud, address); 3306 pmd = pmd_alloc(mm, pud, address);
3229 if (!pmd) 3307 if (!pmd)
3230 return VM_FAULT_OOM; 3308 return VM_FAULT_OOM;
3231 pte = pte_alloc_map(mm, pmd, address); 3309 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3232 if (!pte) 3310 if (!vma->vm_ops)
3311 return do_huge_pmd_anonymous_page(mm, vma, address,
3312 pmd, flags);
3313 } else {
3314 pmd_t orig_pmd = *pmd;
3315 barrier();
3316 if (pmd_trans_huge(orig_pmd)) {
3317 if (flags & FAULT_FLAG_WRITE &&
3318 !pmd_write(orig_pmd) &&
3319 !pmd_trans_splitting(orig_pmd))
3320 return do_huge_pmd_wp_page(mm, vma, address,
3321 pmd, orig_pmd);
3322 return 0;
3323 }
3324 }
3325
3326 /*
3327 * Use __pte_alloc instead of pte_alloc_map, because we can't
3328 * run pte_offset_map on the pmd, if an huge pmd could
3329 * materialize from under us from a different thread.
3330 */
3331 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
3233 return VM_FAULT_OOM; 3332 return VM_FAULT_OOM;
3333 /* if an huge pmd materialized from under us just retry later */
3334 if (unlikely(pmd_trans_huge(*pmd)))
3335 return 0;
3336 /*
3337 * A regular pmd is established and it can't morph into a huge pmd
3338 * from under us anymore at this point because we hold the mmap_sem
3339 * read mode and khugepaged takes it in write mode. So now it's
3340 * safe to run pte_offset_map().
3341 */
3342 pte = pte_offset_map(pmd, address);
3234 3343
3235 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3344 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3236} 3345}
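handle_mm_fault() above now checks for a transparent huge pmd before it ever touches the pte level, and it replaces pte_alloc_map() with __pte_alloc() plus a bare pte_offset_map() so nothing maps a pmd that might concurrently turn huge. The key idiom is taking a local snapshot of the pmd with a compiler barrier, because with mmap_sem held only for read a huge pmd can materialize or start splitting underneath the fault. A stripped-down sketch of that idiom (not the fault handler itself):

/* Sketch: act on one consistent snapshot of *pmd. */
pmd_t orig_pmd = *pmd;

barrier();	/* keep the compiler from re-reading *pmd below */
if (pmd_trans_huge(orig_pmd)) {
	/* huge path: never pte_offset_map() this pmd */
} else {
	/* regular path: a pte table may be allocated and mapped */
}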
@@ -3296,7 +3405,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
3296 vma = find_vma(current->mm, addr); 3405 vma = find_vma(current->mm, addr);
3297 if (!vma) 3406 if (!vma)
3298 return -ENOMEM; 3407 return -ENOMEM;
3299 write = (vma->vm_flags & VM_WRITE) != 0; 3408 /*
3409 * We want to touch writable mappings with a write fault in order
3410 * to break COW, except for shared mappings because these don't COW
3411 * and we would not want to dirty them for nothing.
3412 */
3413 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3300 BUG_ON(addr >= end); 3414 BUG_ON(addr >= end);
3301 BUG_ON(end > vma->vm_end); 3415 BUG_ON(end > vma->vm_end);
3302 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 3416 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3368,6 +3482,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
3368 goto out; 3482 goto out;
3369 3483
3370 pmd = pmd_offset(pud, address); 3484 pmd = pmd_offset(pud, address);
3485 VM_BUG_ON(pmd_trans_huge(*pmd));
3371 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 3486 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3372 goto out; 3487 goto out;
3373 3488
@@ -3608,3 +3723,74 @@ void might_fault(void)
3608} 3723}
3609EXPORT_SYMBOL(might_fault); 3724EXPORT_SYMBOL(might_fault);
3610#endif 3725#endif
3726
3727#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3728static void clear_gigantic_page(struct page *page,
3729 unsigned long addr,
3730 unsigned int pages_per_huge_page)
3731{
3732 int i;
3733 struct page *p = page;
3734
3735 might_sleep();
3736 for (i = 0; i < pages_per_huge_page;
3737 i++, p = mem_map_next(p, page, i)) {
3738 cond_resched();
3739 clear_user_highpage(p, addr + i * PAGE_SIZE);
3740 }
3741}
3742void clear_huge_page(struct page *page,
3743 unsigned long addr, unsigned int pages_per_huge_page)
3744{
3745 int i;
3746
3747 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3748 clear_gigantic_page(page, addr, pages_per_huge_page);
3749 return;
3750 }
3751
3752 might_sleep();
3753 for (i = 0; i < pages_per_huge_page; i++) {
3754 cond_resched();
3755 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3756 }
3757}
3758
3759static void copy_user_gigantic_page(struct page *dst, struct page *src,
3760 unsigned long addr,
3761 struct vm_area_struct *vma,
3762 unsigned int pages_per_huge_page)
3763{
3764 int i;
3765 struct page *dst_base = dst;
3766 struct page *src_base = src;
3767
3768 for (i = 0; i < pages_per_huge_page; ) {
3769 cond_resched();
3770 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3771
3772 i++;
3773 dst = mem_map_next(dst, dst_base, i);
3774 src = mem_map_next(src, src_base, i);
3775 }
3776}
3777
3778void copy_user_huge_page(struct page *dst, struct page *src,
3779 unsigned long addr, struct vm_area_struct *vma,
3780 unsigned int pages_per_huge_page)
3781{
3782 int i;
3783
3784 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3785 copy_user_gigantic_page(dst, src, addr, vma,
3786 pages_per_huge_page);
3787 return;
3788 }
3789
3790 might_sleep();
3791 for (i = 0; i < pages_per_huge_page; i++) {
3792 cond_resched();
3793 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3794 }
3795}
3796#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
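The new helpers above finish the memory.c changes: clear_huge_page() and copy_user_huge_page() walk the subpages with cond_resched() between them, switching to the slower mem_map_next() iteration only for gigantic pages larger than MAX_ORDER_NR_PAGES. Hypothetical callers follow; HPAGE_PMD_NR is assumed from the THP headers added elsewhere in this merge, and a hugetlbfs caller would pass its hstate's subpage count instead (e.g. 1 << huge_page_order(h)):

/* Hypothetical: zero a freshly allocated transparent huge page before
 * mapping it at the pmd-aligned user address 'haddr'. */
clear_huge_page(new_page, haddr, HPAGE_PMD_NR);

/* Hypothetical: COW-copy an existing huge page for the same address. */
copy_user_huge_page(new_page, old_page, haddr, vma, HPAGE_PMD_NR);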
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2c6523af5473..e92f04749fcb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res)
82 82
83#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 83#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
84#ifndef CONFIG_SPARSEMEM_VMEMMAP 84#ifndef CONFIG_SPARSEMEM_VMEMMAP
85static void get_page_bootmem(unsigned long info, struct page *page, int type) 85static void get_page_bootmem(unsigned long info, struct page *page,
86 unsigned long type)
86{ 87{
87 atomic_set(&page->_mapcount, type); 88 page->lru.next = (struct list_head *) type;
88 SetPagePrivate(page); 89 SetPagePrivate(page);
89 set_page_private(page, info); 90 set_page_private(page, info);
90 atomic_inc(&page->_count); 91 atomic_inc(&page->_count);
@@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
94 * so use __ref to tell modpost not to generate a warning */ 95 * so use __ref to tell modpost not to generate a warning */
95void __ref put_page_bootmem(struct page *page) 96void __ref put_page_bootmem(struct page *page)
96{ 97{
97 int type; 98 unsigned long type;
98 99
99 type = atomic_read(&page->_mapcount); 100 type = (unsigned long) page->lru.next;
100 BUG_ON(type >= -1); 101 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
102 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
101 103
102 if (atomic_dec_return(&page->_count) == 1) { 104 if (atomic_dec_return(&page->_count) == 1) {
103 ClearPagePrivate(page); 105 ClearPagePrivate(page);
104 set_page_private(page, 0); 106 set_page_private(page, 0);
105 reset_page_mapcount(page); 107 INIT_LIST_HEAD(&page->lru);
106 __free_pages_bootmem(page, 0); 108 __free_pages_bootmem(page, 0);
107 } 109 }
108 110
@@ -733,7 +735,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
733 goto out; 735 goto out;
734 } 736 }
735 /* this function returns # of failed pages */ 737 /* this function returns # of failed pages */
736 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); 738 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
739 true, true);
737 if (ret) 740 if (ret)
738 putback_lru_pages(&source); 741 putback_lru_pages(&source);
739 } 742 }
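The memory_hotplug.c hunk stops encoding the bootmem section-info type in page->_mapcount and stashes it in page->lru.next instead, now range-checked against MEMORY_HOTPLUG_MIN/MAX_BOOTMEM_TYPE on release; the likely motivation is that the THP work elsewhere in this merge wants _mapcount on compound tail pages, though that is inferred rather than shown here. The store/load pair boils down to a cast in each direction:

/* Sketch of the new encoding: the type rides in the otherwise unused
 * lru.next pointer of a bootmem-reserved page. */
page->lru.next = (struct list_head *)type;	/* get_page_bootmem() */
type = (unsigned long)page->lru.next;		/* put_page_bootmem() */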
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 11ff260fb282..368fc9d23610 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
514 pmd = pmd_offset(pud, addr); 514 pmd = pmd_offset(pud, addr);
515 do { 515 do {
516 next = pmd_addr_end(addr, end); 516 next = pmd_addr_end(addr, end);
517 split_huge_page_pmd(vma->vm_mm, pmd);
517 if (pmd_none_or_clear_bad(pmd)) 518 if (pmd_none_or_clear_bad(pmd))
518 continue; 519 continue;
519 if (check_pte_range(vma, pmd, addr, next, nodes, 520 if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
935 return PTR_ERR(vma); 936 return PTR_ERR(vma);
936 937
937 if (!list_empty(&pagelist)) { 938 if (!list_empty(&pagelist)) {
938 err = migrate_pages(&pagelist, new_node_page, dest, 0); 939 err = migrate_pages(&pagelist, new_node_page, dest,
940 false, true);
939 if (err) 941 if (err)
940 putback_lru_pages(&pagelist); 942 putback_lru_pages(&pagelist);
941 } 943 }
@@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len,
1155 1157
1156 if (!list_empty(&pagelist)) { 1158 if (!list_empty(&pagelist)) {
1157 nr_failed = migrate_pages(&pagelist, new_vma_page, 1159 nr_failed = migrate_pages(&pagelist, new_vma_page,
1158 (unsigned long)vma, 0); 1160 (unsigned long)vma,
1161 false, true);
1159 if (nr_failed) 1162 if (nr_failed)
1160 putback_lru_pages(&pagelist); 1163 putback_lru_pages(&pagelist);
1161 } 1164 }
@@ -1308,16 +1311,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1308 1311
1309 /* Find the mm_struct */ 1312 /* Find the mm_struct */
1310 rcu_read_lock(); 1313 rcu_read_lock();
1311 read_lock(&tasklist_lock);
1312 task = pid ? find_task_by_vpid(pid) : current; 1314 task = pid ? find_task_by_vpid(pid) : current;
1313 if (!task) { 1315 if (!task) {
1314 read_unlock(&tasklist_lock);
1315 rcu_read_unlock(); 1316 rcu_read_unlock();
1316 err = -ESRCH; 1317 err = -ESRCH;
1317 goto out; 1318 goto out;
1318 } 1319 }
1319 mm = get_task_mm(task); 1320 mm = get_task_mm(task);
1320 read_unlock(&tasklist_lock);
1321 rcu_read_unlock(); 1321 rcu_read_unlock();
1322 1322
1323 err = -EINVAL; 1323 err = -EINVAL;
@@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1796} 1796}
1797 1797
1798/** 1798/**
1799 * alloc_page_vma - Allocate a page for a VMA. 1799 * alloc_pages_vma - Allocate a page for a VMA.
1800 * 1800 *
1801 * @gfp: 1801 * @gfp:
1802 * %GFP_USER user allocation. 1802 * %GFP_USER user allocation.
@@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1805 * %GFP_FS allocation should not call back into a file system. 1805 * %GFP_FS allocation should not call back into a file system.
1806 * %GFP_ATOMIC don't sleep. 1806 * %GFP_ATOMIC don't sleep.
1807 * 1807 *
1808 * @order:Order of the GFP allocation.
1808 * @vma: Pointer to VMA or NULL if not available. 1809 * @vma: Pointer to VMA or NULL if not available.
1809 * @addr: Virtual Address of the allocation. Must be inside the VMA. 1810 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1810 * 1811 *
@@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1818 * Should be called with the mm_sem of the vma hold. 1819 * Should be called with the mm_sem of the vma hold.
1819 */ 1820 */
1820struct page * 1821struct page *
1821alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) 1822alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1823 unsigned long addr)
1822{ 1824{
1823 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1825 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1824 struct zonelist *zl; 1826 struct zonelist *zl;
@@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1830 1832
1831 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1833 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1832 mpol_cond_put(pol); 1834 mpol_cond_put(pol);
1833 page = alloc_page_interleave(gfp, 0, nid); 1835 page = alloc_page_interleave(gfp, order, nid);
1834 put_mems_allowed(); 1836 put_mems_allowed();
1835 return page; 1837 return page;
1836 } 1838 }
@@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1839 /* 1841 /*
1840 * slow path: ref counted shared policy 1842 * slow path: ref counted shared policy
1841 */ 1843 */
1842 struct page *page = __alloc_pages_nodemask(gfp, 0, 1844 struct page *page = __alloc_pages_nodemask(gfp, order,
1843 zl, policy_nodemask(gfp, pol)); 1845 zl, policy_nodemask(gfp, pol));
1844 __mpol_put(pol); 1846 __mpol_put(pol);
1845 put_mems_allowed(); 1847 put_mems_allowed();
@@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1848 /* 1850 /*
1849 * fast path: default or task policy 1851 * fast path: default or task policy
1850 */ 1852 */
1851 page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); 1853 page = __alloc_pages_nodemask(gfp, order, zl,
1854 policy_nodemask(gfp, pol));
1852 put_mems_allowed(); 1855 put_mems_allowed();
1853 return page; 1856 return page;
1854} 1857}
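alloc_page_vma() grows into alloc_pages_vma() with an explicit order argument so the policy-aware allocator can hand back huge-page-sized blocks as well as single pages. A hypothetical call for a pmd-sized allocation; the gfp mask is illustrative only, and HPAGE_PMD_ORDER is assumed from the THP headers added elsewhere in this merge:

/* Hypothetical: policy-aware allocation of one pmd-sized compound page. */
struct page *page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE | __GFP_COMP,
				    HPAGE_PMD_ORDER, vma, addr);
if (!page)
	return VM_FAULT_OOM;	/* placeholder error handling */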
diff --git a/mm/migrate.c b/mm/migrate.c
index 6ae8a66a7045..46fe8cc13d67 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -113,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
113 goto out; 113 goto out;
114 114
115 pmd = pmd_offset(pud, addr); 115 pmd = pmd_offset(pud, addr);
116 if (pmd_trans_huge(*pmd))
117 goto out;
116 if (!pmd_present(*pmd)) 118 if (!pmd_present(*pmd))
117 goto out; 119 goto out;
118 120
@@ -246,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
246 248
247 expected_count = 2 + page_has_private(page); 249 expected_count = 2 + page_has_private(page);
248 if (page_count(page) != expected_count || 250 if (page_count(page) != expected_count ||
249 (struct page *)radix_tree_deref_slot(pslot) != page) { 251 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
250 spin_unlock_irq(&mapping->tree_lock); 252 spin_unlock_irq(&mapping->tree_lock);
251 return -EAGAIN; 253 return -EAGAIN;
252 } 254 }
@@ -318,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
318 320
319 expected_count = 2 + page_has_private(page); 321 expected_count = 2 + page_has_private(page);
320 if (page_count(page) != expected_count || 322 if (page_count(page) != expected_count ||
321 (struct page *)radix_tree_deref_slot(pslot) != page) { 323 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
322 spin_unlock_irq(&mapping->tree_lock); 324 spin_unlock_irq(&mapping->tree_lock);
323 return -EAGAIN; 325 return -EAGAIN;
324 } 326 }
@@ -614,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page,
614 * to the newly allocated page in newpage. 616 * to the newly allocated page in newpage.
615 */ 617 */
616static int unmap_and_move(new_page_t get_new_page, unsigned long private, 618static int unmap_and_move(new_page_t get_new_page, unsigned long private,
617 struct page *page, int force, int offlining) 619 struct page *page, int force, bool offlining, bool sync)
618{ 620{
619 int rc = 0; 621 int rc = 0;
620 int *result = NULL; 622 int *result = NULL;
621 struct page *newpage = get_new_page(page, private, &result); 623 struct page *newpage = get_new_page(page, private, &result);
622 int remap_swapcache = 1; 624 int remap_swapcache = 1;
623 int rcu_locked = 0;
624 int charge = 0; 625 int charge = 0;
625 struct mem_cgroup *mem = NULL; 626 struct mem_cgroup *mem = NULL;
626 struct anon_vma *anon_vma = NULL; 627 struct anon_vma *anon_vma = NULL;
@@ -632,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
632 /* page was freed from under us. So we are done. */ 633 /* page was freed from under us. So we are done. */
633 goto move_newpage; 634 goto move_newpage;
634 } 635 }
636 if (unlikely(PageTransHuge(page)))
637 if (unlikely(split_huge_page(page)))
638 goto move_newpage;
635 639
636 /* prepare cgroup just returns 0 or -ENOMEM */ 640 /* prepare cgroup just returns 0 or -ENOMEM */
637 rc = -EAGAIN; 641 rc = -EAGAIN;
@@ -639,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
639 if (!trylock_page(page)) { 643 if (!trylock_page(page)) {
640 if (!force) 644 if (!force)
641 goto move_newpage; 645 goto move_newpage;
646
647 /*
648 * It's not safe for direct compaction to call lock_page.
649 * For example, during page readahead pages are added locked
650 * to the LRU. Later, when the IO completes the pages are
651 * marked uptodate and unlocked. However, the queueing
652 * could be merging multiple pages for one bio (e.g.
653 * mpage_readpages). If an allocation happens for the
654 * second or third page, the process can end up locking
655 * the same page twice and deadlocking. Rather than
656 * trying to be clever about what pages can be locked,
657 * avoid the use of lock_page for direct compaction
658 * altogether.
659 */
660 if (current->flags & PF_MEMALLOC)
661 goto move_newpage;
662
642 lock_page(page); 663 lock_page(page);
643 } 664 }
644 665
@@ -665,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
665 BUG_ON(charge); 686 BUG_ON(charge);
666 687
667 if (PageWriteback(page)) { 688 if (PageWriteback(page)) {
668 if (!force) 689 if (!force || !sync)
669 goto uncharge; 690 goto uncharge;
670 wait_on_page_writeback(page); 691 wait_on_page_writeback(page);
671 } 692 }
672 /* 693 /*
673 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, 694 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
674 * we cannot notice that anon_vma is freed while we migrates a page. 695 * we cannot notice that anon_vma is freed while we migrates a page.
675 * This rcu_read_lock() delays freeing anon_vma pointer until the end 696 * This get_anon_vma() delays freeing anon_vma pointer until the end
676 * of migration. File cache pages are no problem because of page_lock() 697 * of migration. File cache pages are no problem because of page_lock()
677 * File Caches may use write_page() or lock_page() in migration, then, 698 * File Caches may use write_page() or lock_page() in migration, then,
678 * just care Anon page here. 699 * just care Anon page here.
679 */ 700 */
680 if (PageAnon(page)) { 701 if (PageAnon(page)) {
681 rcu_read_lock(); 702 /*
682 rcu_locked = 1; 703 * Only page_lock_anon_vma() understands the subtleties of
683 704 * getting a hold on an anon_vma from outside one of its mms.
684 /* Determine how to safely use anon_vma */ 705 */
685 if (!page_mapped(page)) { 706 anon_vma = page_lock_anon_vma(page);
686 if (!PageSwapCache(page)) 707 if (anon_vma) {
687 goto rcu_unlock; 708 /*
688 709 * Take a reference count on the anon_vma if the
710 * page is mapped so that it is guaranteed to
711 * exist when the page is remapped later
712 */
713 get_anon_vma(anon_vma);
714 page_unlock_anon_vma(anon_vma);
715 } else if (PageSwapCache(page)) {
689 /* 716 /*
690 * We cannot be sure that the anon_vma of an unmapped 717 * We cannot be sure that the anon_vma of an unmapped
691 * swapcache page is safe to use because we don't 718 * swapcache page is safe to use because we don't
@@ -700,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
700 */ 727 */
701 remap_swapcache = 0; 728 remap_swapcache = 0;
702 } else { 729 } else {
703 /* 730 goto uncharge;
704 * Take a reference count on the anon_vma if the
705 * page is mapped so that it is guaranteed to
706 * exist when the page is remapped later
707 */
708 anon_vma = page_anon_vma(page);
709 get_anon_vma(anon_vma);
710 } 731 }
711 } 732 }
712 733
@@ -723,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
723 * free the metadata, so the page can be freed. 744 * free the metadata, so the page can be freed.
724 */ 745 */
725 if (!page->mapping) { 746 if (!page->mapping) {
726 if (!PageAnon(page) && page_has_private(page)) { 747 VM_BUG_ON(PageAnon(page));
727 /* 748 if (page_has_private(page)) {
728 * Go direct to try_to_free_buffers() here because
729 * a) that's what try_to_release_page() would do anyway
730 * b) we may be under rcu_read_lock() here, so we can't
731 * use GFP_KERNEL which is what try_to_release_page()
732 * needs to be effective.
733 */
734 try_to_free_buffers(page); 749 try_to_free_buffers(page);
735 goto rcu_unlock; 750 goto uncharge;
736 } 751 }
737 goto skip_unmap; 752 goto skip_unmap;
738 } 753 }
@@ -746,17 +761,14 @@ skip_unmap:
746 761
747 if (rc && remap_swapcache) 762 if (rc && remap_swapcache)
748 remove_migration_ptes(page, page); 763 remove_migration_ptes(page, page);
749rcu_unlock:
750 764
751 /* Drop an anon_vma reference if we took one */ 765 /* Drop an anon_vma reference if we took one */
752 if (anon_vma) 766 if (anon_vma)
753 drop_anon_vma(anon_vma); 767 drop_anon_vma(anon_vma);
754 768
755 if (rcu_locked)
756 rcu_read_unlock();
757uncharge: 769uncharge:
758 if (!charge) 770 if (!charge)
759 mem_cgroup_end_migration(mem, page, newpage); 771 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
760unlock: 772unlock:
761 unlock_page(page); 773 unlock_page(page);
762 774
@@ -810,12 +822,11 @@ move_newpage:
810 */ 822 */
811static int unmap_and_move_huge_page(new_page_t get_new_page, 823static int unmap_and_move_huge_page(new_page_t get_new_page,
812 unsigned long private, struct page *hpage, 824 unsigned long private, struct page *hpage,
813 int force, int offlining) 825 int force, bool offlining, bool sync)
814{ 826{
815 int rc = 0; 827 int rc = 0;
816 int *result = NULL; 828 int *result = NULL;
817 struct page *new_hpage = get_new_page(hpage, private, &result); 829 struct page *new_hpage = get_new_page(hpage, private, &result);
818 int rcu_locked = 0;
819 struct anon_vma *anon_vma = NULL; 830 struct anon_vma *anon_vma = NULL;
820 831
821 if (!new_hpage) 832 if (!new_hpage)
@@ -824,18 +835,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
824 rc = -EAGAIN; 835 rc = -EAGAIN;
825 836
826 if (!trylock_page(hpage)) { 837 if (!trylock_page(hpage)) {
827 if (!force) 838 if (!force || !sync)
828 goto out; 839 goto out;
829 lock_page(hpage); 840 lock_page(hpage);
830 } 841 }
831 842
832 if (PageAnon(hpage)) { 843 if (PageAnon(hpage)) {
833 rcu_read_lock(); 844 anon_vma = page_lock_anon_vma(hpage);
834 rcu_locked = 1; 845 if (anon_vma) {
835 846 get_anon_vma(anon_vma);
836 if (page_mapped(hpage)) { 847 page_unlock_anon_vma(anon_vma);
837 anon_vma = page_anon_vma(hpage);
838 atomic_inc(&anon_vma->external_refcount);
839 } 848 }
840 } 849 }
841 850
@@ -847,16 +856,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
847 if (rc) 856 if (rc)
848 remove_migration_ptes(hpage, hpage); 857 remove_migration_ptes(hpage, hpage);
849 858
850 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, 859 if (anon_vma)
851 &anon_vma->lock)) { 860 drop_anon_vma(anon_vma);
852 int empty = list_empty(&anon_vma->head);
853 spin_unlock(&anon_vma->lock);
854 if (empty)
855 anon_vma_free(anon_vma);
856 }
857
858 if (rcu_locked)
859 rcu_read_unlock();
860out: 861out:
861 unlock_page(hpage); 862 unlock_page(hpage);
862 863
@@ -892,7 +893,8 @@ out:
892 * Return: Number of pages not migrated or error code. 893 * Return: Number of pages not migrated or error code.
893 */ 894 */
894int migrate_pages(struct list_head *from, 895int migrate_pages(struct list_head *from,
895 new_page_t get_new_page, unsigned long private, int offlining) 896 new_page_t get_new_page, unsigned long private, bool offlining,
897 bool sync)
896{ 898{
897 int retry = 1; 899 int retry = 1;
898 int nr_failed = 0; 900 int nr_failed = 0;
@@ -912,7 +914,8 @@ int migrate_pages(struct list_head *from,
912 cond_resched(); 914 cond_resched();
913 915
914 rc = unmap_and_move(get_new_page, private, 916 rc = unmap_and_move(get_new_page, private,
915 page, pass > 2, offlining); 917 page, pass > 2, offlining,
918 sync);
916 919
917 switch(rc) { 920 switch(rc) {
918 case -ENOMEM: 921 case -ENOMEM:
@@ -941,7 +944,8 @@ out:
941} 944}
942 945
943int migrate_huge_pages(struct list_head *from, 946int migrate_huge_pages(struct list_head *from,
944 new_page_t get_new_page, unsigned long private, int offlining) 947 new_page_t get_new_page, unsigned long private, bool offlining,
948 bool sync)
945{ 949{
946 int retry = 1; 950 int retry = 1;
947 int nr_failed = 0; 951 int nr_failed = 0;
@@ -957,7 +961,8 @@ int migrate_huge_pages(struct list_head *from,
957 cond_resched(); 961 cond_resched();
958 962
959 rc = unmap_and_move_huge_page(get_new_page, 963 rc = unmap_and_move_huge_page(get_new_page,
960 private, page, pass > 2, offlining); 964 private, page, pass > 2, offlining,
965 sync);
961 966
962 switch(rc) { 967 switch(rc) {
963 case -ENOMEM: 968 case -ENOMEM:
@@ -1042,7 +1047,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1042 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) 1047 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1043 goto set_status; 1048 goto set_status;
1044 1049
1045 page = follow_page(vma, pp->addr, FOLL_GET); 1050 page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
1046 1051
1047 err = PTR_ERR(page); 1052 err = PTR_ERR(page);
1048 if (IS_ERR(page)) 1053 if (IS_ERR(page))
@@ -1090,7 +1095,7 @@ set_status:
1090 err = 0; 1095 err = 0;
1091 if (!list_empty(&pagelist)) { 1096 if (!list_empty(&pagelist)) {
1092 err = migrate_pages(&pagelist, new_page_node, 1097 err = migrate_pages(&pagelist, new_page_node,
1093 (unsigned long)pm, 0); 1098 (unsigned long)pm, 0, true);
1094 if (err) 1099 if (err)
1095 putback_lru_pages(&pagelist); 1100 putback_lru_pages(&pagelist);
1096 } 1101 }
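The migrate.c changes widen migrate_pages() and migrate_huge_pages() to take two booleans: offlining keeps its previous meaning, while sync, as the unmap_and_move() hunks above show, decides whether migration may block on page writeback or on lock_page(). The callers in this section are updated mechanically; a hedged sketch of the resulting call shape, with new_page_cb and cb_arg standing in for the caller's allocator callback and its unsigned long argument:

/* Sketch: asynchronous (non-blocking) migration of an isolated page list. */
err = migrate_pages(&pagelist, new_page_cb, cb_arg,
		    false /* offlining */, false /* sync */);
if (err)
	putback_lru_pages(&pagelist);	/* return unmigrated pages to the LRU */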
diff --git a/mm/mincore.c b/mm/mincore.c
index 9ac42dc6d7b6..a4e6b9d75c76 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
154 pmd = pmd_offset(pud, addr); 154 pmd = pmd_offset(pud, addr);
155 do { 155 do {
156 next = pmd_addr_end(addr, end); 156 next = pmd_addr_end(addr, end);
157 if (pmd_trans_huge(*pmd)) {
158 if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
159 vec += (next - addr) >> PAGE_SHIFT;
160 continue;
161 }
162 /* fall through */
163 }
157 if (pmd_none_or_clear_bad(pmd)) 164 if (pmd_none_or_clear_bad(pmd))
158 mincore_unmapped_range(vma, addr, next, vec); 165 mincore_unmapped_range(vma, addr, next, vec);
159 else 166 else
diff --git a/mm/mlock.c b/mm/mlock.c
index b70919ce4f72..13e81ee8be9d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
155 * vma->vm_mm->mmap_sem must be held for at least read. 155 * vma->vm_mm->mmap_sem must be held for at least read.
156 */ 156 */
157static long __mlock_vma_pages_range(struct vm_area_struct *vma, 157static long __mlock_vma_pages_range(struct vm_area_struct *vma,
158 unsigned long start, unsigned long end) 158 unsigned long start, unsigned long end,
159 int *nonblocking)
159{ 160{
160 struct mm_struct *mm = vma->vm_mm; 161 struct mm_struct *mm = vma->vm_mm;
161 unsigned long addr = start; 162 unsigned long addr = start;
162 struct page *pages[16]; /* 16 gives a reasonable batch */
163 int nr_pages = (end - start) / PAGE_SIZE; 163 int nr_pages = (end - start) / PAGE_SIZE;
164 int ret = 0;
165 int gup_flags; 164 int gup_flags;
166 165
167 VM_BUG_ON(start & ~PAGE_MASK); 166 VM_BUG_ON(start & ~PAGE_MASK);
@@ -170,73 +169,26 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
170 VM_BUG_ON(end > vma->vm_end); 169 VM_BUG_ON(end > vma->vm_end);
171 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 170 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
172 171
173 gup_flags = FOLL_TOUCH | FOLL_GET; 172 gup_flags = FOLL_TOUCH;
174 if (vma->vm_flags & VM_WRITE) 173 /*
174 * We want to touch writable mappings with a write fault in order
175 * to break COW, except for shared mappings because these don't COW
176 * and we would not want to dirty them for nothing.
177 */
178 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
175 gup_flags |= FOLL_WRITE; 179 gup_flags |= FOLL_WRITE;
176 180
181 if (vma->vm_flags & VM_LOCKED)
182 gup_flags |= FOLL_MLOCK;
183
177 /* We don't try to access the guard page of a stack vma */ 184 /* We don't try to access the guard page of a stack vma */
178 if (stack_guard_page(vma, start)) { 185 if (stack_guard_page(vma, start)) {
179 addr += PAGE_SIZE; 186 addr += PAGE_SIZE;
180 nr_pages--; 187 nr_pages--;
181 } 188 }
182 189
183 while (nr_pages > 0) { 190 return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
184 int i; 191 NULL, NULL, nonblocking);
185
186 cond_resched();
187
188 /*
189 * get_user_pages makes pages present if we are
190 * setting mlock. and this extra reference count will
191 * disable migration of this page. However, page may
192 * still be truncated out from under us.
193 */
194 ret = __get_user_pages(current, mm, addr,
195 min_t(int, nr_pages, ARRAY_SIZE(pages)),
196 gup_flags, pages, NULL);
197 /*
198 * This can happen for, e.g., VM_NONLINEAR regions before
199 * a page has been allocated and mapped at a given offset,
200 * or for addresses that map beyond end of a file.
201 * We'll mlock the pages if/when they get faulted in.
202 */
203 if (ret < 0)
204 break;
205
206 lru_add_drain(); /* push cached pages to LRU */
207
208 for (i = 0; i < ret; i++) {
209 struct page *page = pages[i];
210
211 if (page->mapping) {
212 /*
213 * That preliminary check is mainly to avoid
214 * the pointless overhead of lock_page on the
215 * ZERO_PAGE: which might bounce very badly if
216 * there is contention. However, we're still
217 * dirtying its cacheline with get/put_page:
218 * we'll add another __get_user_pages flag to
219 * avoid it if that case turns out to matter.
220 */
221 lock_page(page);
222 /*
223 * Because we lock page here and migration is
224 * blocked by the elevated reference, we need
225 * only check for file-cache page truncation.
226 */
227 if (page->mapping)
228 mlock_vma_page(page);
229 unlock_page(page);
230 }
231 put_page(page); /* ref from get_user_pages() */
232 }
233
234 addr += ret * PAGE_SIZE;
235 nr_pages -= ret;
236 ret = 0;
237 }
238
239 return ret; /* 0 or negative error code */
240} 192}
241 193
242/* 194/*
@@ -280,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
280 is_vm_hugetlb_page(vma) || 232 is_vm_hugetlb_page(vma) ||
281 vma == get_gate_vma(current))) { 233 vma == get_gate_vma(current))) {
282 234
283 __mlock_vma_pages_range(vma, start, end); 235 __mlock_vma_pages_range(vma, start, end, NULL);
284 236
285 /* Hide errors from mmap() and other callers */ 237 /* Hide errors from mmap() and other callers */
286 return 0; 238 return 0;
@@ -372,18 +324,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
372 int ret = 0; 324 int ret = 0;
373 int lock = newflags & VM_LOCKED; 325 int lock = newflags & VM_LOCKED;
374 326
375 if (newflags == vma->vm_flags || 327 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
376 (vma->vm_flags & (VM_IO | VM_PFNMAP))) 328 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))
377 goto out; /* don't set VM_LOCKED, don't count */ 329 goto out; /* don't set VM_LOCKED, don't count */
378 330
379 if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
380 is_vm_hugetlb_page(vma) ||
381 vma == get_gate_vma(current)) {
382 if (lock)
383 make_pages_present(start, end);
384 goto out; /* don't set VM_LOCKED, don't count */
385 }
386
387 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 331 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
388 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, 332 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
389 vma->vm_file, pgoff, vma_policy(vma)); 333 vma->vm_file, pgoff, vma_policy(vma));
@@ -419,14 +363,10 @@ success:
419 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 363 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
420 */ 364 */
421 365
422 if (lock) { 366 if (lock)
423 vma->vm_flags = newflags; 367 vma->vm_flags = newflags;
424 ret = __mlock_vma_pages_range(vma, start, end); 368 else
425 if (ret < 0)
426 ret = __mlock_posix_error_return(ret);
427 } else {
428 munlock_vma_pages_range(vma, start, end); 369 munlock_vma_pages_range(vma, start, end);
429 }
430 370
431out: 371out:
432 *prev = vma; 372 *prev = vma;
@@ -439,7 +379,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
439 struct vm_area_struct * vma, * prev; 379 struct vm_area_struct * vma, * prev;
440 int error; 380 int error;
441 381
442 len = PAGE_ALIGN(len); 382 VM_BUG_ON(start & ~PAGE_MASK);
383 VM_BUG_ON(len != PAGE_ALIGN(len));
443 end = start + len; 384 end = start + len;
444 if (end < start) 385 if (end < start)
445 return -EINVAL; 386 return -EINVAL;
@@ -482,6 +423,62 @@ static int do_mlock(unsigned long start, size_t len, int on)
482 return error; 423 return error;
483} 424}
484 425
426static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
427{
428 struct mm_struct *mm = current->mm;
429 unsigned long end, nstart, nend;
430 struct vm_area_struct *vma = NULL;
431 int locked = 0;
432 int ret = 0;
433
434 VM_BUG_ON(start & ~PAGE_MASK);
435 VM_BUG_ON(len != PAGE_ALIGN(len));
436 end = start + len;
437
438 for (nstart = start; nstart < end; nstart = nend) {
439 /*
440 * We want to fault in pages for [nstart; end) address range.
441 * Find first corresponding VMA.
442 */
443 if (!locked) {
444 locked = 1;
445 down_read(&mm->mmap_sem);
446 vma = find_vma(mm, nstart);
447 } else if (nstart >= vma->vm_end)
448 vma = vma->vm_next;
449 if (!vma || vma->vm_start >= end)
450 break;
451 /*
452 * Set [nstart; nend) to intersection of desired address
453 * range with the first VMA. Also, skip undesirable VMA types.
454 */
455 nend = min(end, vma->vm_end);
456 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
457 continue;
458 if (nstart < vma->vm_start)
459 nstart = vma->vm_start;
460 /*
461 * Now fault in a range of pages. __mlock_vma_pages_range()
462 * double checks the vma flags, so that it won't mlock pages
463 * if the vma was already munlocked.
464 */
465 ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
466 if (ret < 0) {
467 if (ignore_errors) {
468 ret = 0;
469 continue; /* continue at next VMA */
470 }
471 ret = __mlock_posix_error_return(ret);
472 break;
473 }
474 nend = nstart + ret * PAGE_SIZE;
475 ret = 0;
476 }
477 if (locked)
478 up_read(&mm->mmap_sem);
479 return ret; /* 0 or negative error code */
480}
481
485SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) 482SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
486{ 483{
487 unsigned long locked; 484 unsigned long locked;
@@ -507,6 +504,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
507 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 504 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
508 error = do_mlock(start, len, 1); 505 error = do_mlock(start, len, 1);
509 up_write(&current->mm->mmap_sem); 506 up_write(&current->mm->mmap_sem);
507 if (!error)
508 error = do_mlock_pages(start, len, 0);
510 return error; 509 return error;
511} 510}
512 511
@@ -571,6 +570,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
571 capable(CAP_IPC_LOCK)) 570 capable(CAP_IPC_LOCK))
572 ret = do_mlockall(flags); 571 ret = do_mlockall(flags);
573 up_write(&current->mm->mmap_sem); 572 up_write(&current->mm->mmap_sem);
573 if (!ret && (flags & MCL_CURRENT)) {
574 /* Ignore errors */
575 do_mlock_pages(0, TASK_SIZE, 1);
576 }
574out: 577out:
575 return ret; 578 return ret;
576} 579}
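Taken together, the mlock.c hunks turn mlock() from making pages present inside mlock_fixup(), under the mmap_sem write lock, into a two-phase scheme: mark the VMAs first, then fault the pages in afterwards through __get_user_pages() under the read lock, with faults allowed to retry. Condensed to its control flow (error handling trimmed, names as in the diff):

/* Sketch of the new mlock() flow. */
down_write(&current->mm->mmap_sem);
error = do_mlock(start, len, 1);		/* mark the VMAs VM_LOCKED */
up_write(&current->mm->mmap_sem);
if (!error)
	error = do_mlock_pages(start, len, 0);	/* fault in under mmap_sem read */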
diff --git a/mm/mmap.c b/mm/mmap.c
index 50a4aa0255a0..2ec8eb5a9cdd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
29#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/cacheflush.h> 35#include <asm/cacheflush.h>
@@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
253 down_write(&mm->mmap_sem); 254 down_write(&mm->mmap_sem);
254 255
255#ifdef CONFIG_COMPAT_BRK 256#ifdef CONFIG_COMPAT_BRK
256 min_brk = mm->end_code; 257 /*
258 * CONFIG_COMPAT_BRK can still be overridden by setting
259 * randomize_va_space to 2, which will still cause mm->start_brk
260 * to be arbitrarily shifted
261 */
262 if (mm->start_brk > PAGE_ALIGN(mm->end_data))
263 min_brk = mm->start_brk;
264 else
265 min_brk = mm->end_data;
257#else 266#else
258 min_brk = mm->start_brk; 267 min_brk = mm->start_brk;
259#endif 268#endif
@@ -588,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end);
588 } 597 }
589 } 598 }
590 599
600 vma_adjust_trans_huge(vma, start, end, adjust_next);
601
591 /* 602 /*
592 * When changing only vma->vm_end, we don't really need anon_vma 603 * When changing only vma->vm_end, we don't really need anon_vma
593 * lock. This is a fairly rare case by itself, but the anon_vma 604 * lock. This is a fairly rare case by itself, but the anon_vma
@@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
815 end, prev->vm_pgoff, NULL); 826 end, prev->vm_pgoff, NULL);
816 if (err) 827 if (err)
817 return NULL; 828 return NULL;
829 khugepaged_enter_vma_merge(prev);
818 return prev; 830 return prev;
819 } 831 }
820 832
@@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
833 next->vm_pgoff - pglen, NULL); 845 next->vm_pgoff - pglen, NULL);
834 if (err) 846 if (err)
835 return NULL; 847 return NULL;
848 khugepaged_enter_vma_merge(area);
836 return area; 849 return area;
837 } 850 }
838 851
@@ -1761,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1761 } 1774 }
1762 } 1775 }
1763 vma_unlock_anon_vma(vma); 1776 vma_unlock_anon_vma(vma);
1777 khugepaged_enter_vma_merge(vma);
1764 return error; 1778 return error;
1765} 1779}
1766#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1780#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1808,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1808 } 1822 }
1809 } 1823 }
1810 vma_unlock_anon_vma(vma); 1824 vma_unlock_anon_vma(vma);
1825 khugepaged_enter_vma_merge(vma);
1811 return error; 1826 return error;
1812} 1827}
1813 1828
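
The khugepaged_enter_vma_merge() calls added above hand a VMA to khugepaged whenever a merge or stack expansion may have produced a range large enough to back with a huge page. A rough sketch of the underlying eligibility test follows; the 2 MiB unit size, spans_huge_unit() and enqueue_for_collapse() are assumptions for illustration, not the kernel interface.

/*
 * Rough sketch of the "worth handing to khugepaged?" test implied by the
 * hooks above: a grown region is a collapse candidate only if it contains
 * at least one fully aligned huge-page-sized unit.
 */
#include <stdbool.h>
#include <stdio.h>

#define HPAGE_SIZE	(2UL << 20)		/* assume 2 MiB huge pages */
#define HPAGE_MASK	(~(HPAGE_SIZE - 1))

static bool spans_huge_unit(unsigned long start, unsigned long end)
{
	unsigned long first = (start + HPAGE_SIZE - 1) & HPAGE_MASK;

	return first + HPAGE_SIZE <= end;
}

static void enqueue_for_collapse(unsigned long start, unsigned long end)
{
	printf("candidate: [%#lx, %#lx)\n", start, end);	/* placeholder action */
}

static void vma_grew(unsigned long start, unsigned long end)
{
	if (spans_huge_unit(start, end))
		enqueue_for_collapse(start, end);
}

int main(void)
{
	vma_grew(0x100000, 0x180000);	/* too small: ignored */
	vma_grew(0x100000, 0x500000);	/* contains [0x200000, 0x400000): queued */
	return 0;
}
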
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 438951d366f2..8d032de4088e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
100 return young; 100 return young;
101} 101}
102 102
103int __mmu_notifier_test_young(struct mm_struct *mm,
104 unsigned long address)
105{
106 struct mmu_notifier *mn;
107 struct hlist_node *n;
108 int young = 0;
109
110 rcu_read_lock();
111 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
112 if (mn->ops->test_young) {
113 young = mn->ops->test_young(mn, mm, address);
114 if (young)
115 break;
116 }
117 }
118 rcu_read_unlock();
119
120 return young;
121}
122
103void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, 123void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
104 pte_t pte) 124 pte_t pte)
105{ 125{
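
__mmu_notifier_test_young() walks the registered notifiers under RCU and returns as soon as one of them reports the address as recently referenced, without clearing the young state the way the clear_flush variant does. A much simplified sketch of that first-positive-wins walk is below; a plain array of callbacks stands in for the RCU-protected hlist and no locking is modelled.

/*
 * Simplified sketch of the "first positive answer wins" notifier walk.
 */
#include <stdio.h>

typedef int (*test_young_fn)(unsigned long address);

static int kvm_like_test_young(unsigned long address)
{
	return (address & 0x1000) != 0;	/* pretend every other page is young */
}

static int never_young(unsigned long address)
{
	(void)address;
	return 0;
}

static int test_young(test_young_fn *ops, int nr, unsigned long address)
{
	int young = 0;

	for (int i = 0; i < nr; i++) {
		if (!ops[i])
			continue;
		young = ops[i](address);
		if (young)
			break;		/* one "yes" is enough, stop early */
	}
	return young;
}

int main(void)
{
	test_young_fn ops[] = { never_young, kvm_like_test_young };

	printf("0x2000 young? %d\n", test_young(ops, 2, 0x2000));
	printf("0x3000 young? %d\n", test_young(ops, 2, 0x3000));
	return 0;
}
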
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
87 return 1; 87 return 1;
88} 88}
89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
90
91#ifdef CONFIG_SMP
92/* Called when a more accurate view of NR_FREE_PAGES is needed */
93unsigned long zone_nr_free_pages(struct zone *zone)
94{
95 unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
96
97 /*
98 * While kswapd is awake, it is considered the zone is under some
99 * memory pressure. Under pressure, there is a risk that
100 * per-cpu-counter-drift will allow the min watermark to be breached
101 * potentially causing a live-lock. While kswapd is awake and
102 * free pages are low, get a better estimate for free pages
103 */
104 if (nr_free_pages < zone->percpu_drift_mark &&
105 !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
106 return zone_page_state_snapshot(zone, NR_FREE_PAGES);
107
108 return nr_free_pages;
109}
110#endif /* CONFIG_SMP */
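
zone_nr_free_pages() is removed here because its logic (take an exact per-cpu snapshot only when the cheap estimate may have drifted near the watermark) moves into zone_watermark_ok_safe() further down in this series. The general pattern can be sketched in isolation; the counter structure and drift values below are toy stand-ins, not kernel state.

/*
 * Sketch of the approximate-vs-exact counter pattern: use the fast global
 * value normally, and only pay for folding in per-CPU deltas when the
 * value is close enough to a critical threshold for the error to matter.
 */
#include <stdio.h>

#define NR_CPUS	4

struct counter {
	long global;			/* cheap, possibly stale total */
	long percpu_delta[NR_CPUS];	/* not yet folded into global */
	long drift_mark;		/* below this, the error matters */
};

static long read_fast(const struct counter *c)
{
	return c->global;
}

/* Slow path: fold in every per-CPU delta for an exact snapshot. */
static long read_snapshot(const struct counter *c)
{
	long sum = c->global;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += c->percpu_delta[cpu];
	return sum < 0 ? 0 : sum;
}

static long read_safe(const struct counter *c)
{
	long fast = read_fast(c);

	return fast < c->drift_mark ? read_snapshot(c) : fast;
}

int main(void)
{
	struct counter c = {
		.global = 90, .percpu_delta = { 5, -3, 8, 2 }, .drift_mark = 100,
	};

	printf("fast=%ld safe=%ld\n", read_fast(&c), read_safe(&c));
	return 0;
}
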
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4c5133873097..5a688a2756be 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
78 pte_unmap_unlock(pte - 1, ptl); 78 pte_unmap_unlock(pte - 1, ptl);
79} 79}
80 80
81static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, 81static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
82 unsigned long addr, unsigned long end, pgprot_t newprot, 82 unsigned long addr, unsigned long end, pgprot_t newprot,
83 int dirty_accountable) 83 int dirty_accountable)
84{ 84{
@@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
88 pmd = pmd_offset(pud, addr); 88 pmd = pmd_offset(pud, addr);
89 do { 89 do {
90 next = pmd_addr_end(addr, end); 90 next = pmd_addr_end(addr, end);
91 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma->vm_mm, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot))
95 continue;
96 /* fall through */
97 }
91 if (pmd_none_or_clear_bad(pmd)) 98 if (pmd_none_or_clear_bad(pmd))
92 continue; 99 continue;
93 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); 100 change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
101 dirty_accountable);
94 } while (pmd++, addr = next, addr != end); 102 } while (pmd++, addr = next, addr != end);
95} 103}
96 104
97static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, 105static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
98 unsigned long addr, unsigned long end, pgprot_t newprot, 106 unsigned long addr, unsigned long end, pgprot_t newprot,
99 int dirty_accountable) 107 int dirty_accountable)
100{ 108{
@@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
106 next = pud_addr_end(addr, end); 114 next = pud_addr_end(addr, end);
107 if (pud_none_or_clear_bad(pud)) 115 if (pud_none_or_clear_bad(pud))
108 continue; 116 continue;
109 change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); 117 change_pmd_range(vma, pud, addr, next, newprot,
118 dirty_accountable);
110 } while (pud++, addr = next, addr != end); 119 } while (pud++, addr = next, addr != end);
111} 120}
112 121
@@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma,
126 next = pgd_addr_end(addr, end); 135 next = pgd_addr_end(addr, end);
127 if (pgd_none_or_clear_bad(pgd)) 136 if (pgd_none_or_clear_bad(pgd))
128 continue; 137 continue;
129 change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); 138 change_pud_range(vma, pgd, addr, next, newprot,
139 dirty_accountable);
130 } while (pgd++, addr = next, addr != end); 140 } while (pgd++, addr = next, addr != end);
131 flush_tlb_range(vma, start, end); 141 flush_tlb_range(vma, start, end);
132} 142}
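
change_pmd_range() now has to deal with a transparent huge pmd: if the mprotect range covers the whole huge mapping, the protection is changed in one step via change_huge_pmd(); otherwise the huge page is split first and the code falls through to the per-pte loop. A standalone sketch of that control flow follows; the unit size and the split/change helpers are local stand-ins.

/*
 * Sketch of the "change the whole huge unit, or split and fall through"
 * decision added to change_pmd_range() above.
 */
#include <stdbool.h>
#include <stdio.h>

#define UNIT_SIZE	(2UL << 20)	/* stands in for HPAGE_PMD_SIZE */

static void change_whole_unit(unsigned long addr)
{
	printf("changed huge unit at %#lx in one go\n", addr);
}

static void split_unit(unsigned long addr)
{
	printf("split huge unit at %#lx into base pages\n", addr);
}

static void change_base_pages(unsigned long addr, unsigned long next)
{
	printf("changed base pages [%#lx, %#lx)\n", addr, next);
}

static void change_range(unsigned long addr, unsigned long next, bool huge)
{
	if (huge) {
		if (next - addr != UNIT_SIZE) {
			split_unit(addr);	/* partial cover: demote first */
			/* fall through to the per-page path */
		} else {
			change_whole_unit(addr);
			return;			/* done, skip the pte loop */
		}
	}
	change_base_pages(addr, next);
}

int main(void)
{
	change_range(0x200000, 0x400000, true);		/* full unit */
	change_range(0x200000, 0x300000, true);		/* partial: split */
	change_range(0x200000, 0x300000, false);	/* regular mapping */
	return 0;
}
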
diff --git a/mm/mremap.c b/mm/mremap.c
index 563fbdd6293a..9925b6391b80 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
41 return NULL; 41 return NULL;
42 42
43 pmd = pmd_offset(pud, addr); 43 pmd = pmd_offset(pud, addr);
44 split_huge_page_pmd(mm, pmd);
44 if (pmd_none_or_clear_bad(pmd)) 45 if (pmd_none_or_clear_bad(pmd))
45 return NULL; 46 return NULL;
46 47
47 return pmd; 48 return pmd;
48} 49}
49 50
50static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) 51static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
52 unsigned long addr)
51{ 53{
52 pgd_t *pgd; 54 pgd_t *pgd;
53 pud_t *pud; 55 pud_t *pud;
@@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
62 if (!pmd) 64 if (!pmd)
63 return NULL; 65 return NULL;
64 66
65 if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) 67 VM_BUG_ON(pmd_trans_huge(*pmd));
68 if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
66 return NULL; 69 return NULL;
67 70
68 return pmd; 71 return pmd;
@@ -147,7 +150,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
147 old_pmd = get_old_pmd(vma->vm_mm, old_addr); 150 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
148 if (!old_pmd) 151 if (!old_pmd)
149 continue; 152 continue;
150 new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); 153 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
151 if (!new_pmd) 154 if (!new_pmd)
152 break; 155 break;
153 next = (new_addr + PMD_SIZE) & PMD_MASK; 156 next = (new_addr + PMD_SIZE) & PMD_MASK;
diff --git a/mm/nommu.c b/mm/nommu.c
index ef4045d010d5..f59e1424d3db 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp)
127 127
128int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 128int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
129 unsigned long start, int nr_pages, unsigned int foll_flags, 129 unsigned long start, int nr_pages, unsigned int foll_flags,
130 struct page **pages, struct vm_area_struct **vmas) 130 struct page **pages, struct vm_area_struct **vmas,
131 int *retry)
131{ 132{
132 struct vm_area_struct *vma; 133 struct vm_area_struct *vma;
133 unsigned long vm_flags; 134 unsigned long vm_flags;
@@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
185 if (force) 186 if (force)
186 flags |= FOLL_FORCE; 187 flags |= FOLL_FORCE;
187 188
188 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 189 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
190 NULL);
189} 191}
190EXPORT_SYMBOL(get_user_pages); 192EXPORT_SYMBOL(get_user_pages);
191 193
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b5d8a1f820a0..2cb01f6ec5d0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -410,9 +410,12 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
410{ 410{
411 unsigned long background; 411 unsigned long background;
412 unsigned long dirty; 412 unsigned long dirty;
413 unsigned long available_memory = determine_dirtyable_memory(); 413 unsigned long uninitialized_var(available_memory);
414 struct task_struct *tsk; 414 struct task_struct *tsk;
415 415
416 if (!vm_dirty_bytes || !dirty_background_bytes)
417 available_memory = determine_dirtyable_memory();
418
416 if (vm_dirty_bytes) 419 if (vm_dirty_bytes)
417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 420 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
418 else 421 else
@@ -1103,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page);
1103int __set_page_dirty_no_writeback(struct page *page) 1106int __set_page_dirty_no_writeback(struct page *page)
1104{ 1107{
1105 if (!PageDirty(page)) 1108 if (!PageDirty(page))
1106 SetPageDirty(page); 1109 return !TestSetPageDirty(page);
1107 return 0; 1110 return 0;
1108} 1111}
1109 1112
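
The __set_page_dirty_no_writeback() change makes the return value report whether this caller was the one that actually dirtied the page, by relying on the atomic test-and-set instead of a separate check followed by a set. The same return-whether-we-made-the-transition pattern, sketched with a C11 atomic as a stand-in for the page flag:

/*
 * Report whether the caller performed the clean->dirty transition, using
 * one atomic operation instead of a racy check followed by a set.
 */
#include <stdatomic.h>
#include <stdio.h>

struct toy_page {
	atomic_int dirty;	/* 0 = clean, 1 = dirty */
};

static int set_dirty_no_writeback(struct toy_page *page)
{
	if (atomic_load(&page->dirty))
		return 0;			/* already dirty, nothing to do */
	/* Exchange returns the previous value: 0 means we flipped it. */
	return atomic_exchange(&page->dirty, 1) == 0;
}

int main(void)
{
	struct toy_page page = { .dirty = 0 };

	printf("first caller:  %d\n", set_dirty_no_writeback(&page));	/* 1 */
	printf("second caller: %d\n", set_dirty_no_writeback(&page));	/* 0 */
	return 0;
}
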
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 826ba6922e84..90c1439549fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
357 } 357 }
358} 358}
359 359
360/* update __split_huge_page_refcount if you change this function */
360static int destroy_compound_page(struct page *page, unsigned long order) 361static int destroy_compound_page(struct page *page, unsigned long order)
361{ 362{
362 int i; 363 int i;
@@ -426,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
426 * 427 *
427 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 428 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
428 */ 429 */
429static inline struct page *
430__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
431{
432 unsigned long buddy_idx = page_idx ^ (1 << order);
433
434 return page + (buddy_idx - page_idx);
435}
436
437static inline unsigned long 430static inline unsigned long
438__find_combined_index(unsigned long page_idx, unsigned int order) 431__find_buddy_index(unsigned long page_idx, unsigned int order)
439{ 432{
440 return (page_idx & ~(1 << order)); 433 return page_idx ^ (1 << order);
441} 434}
442 435
443/* 436/*
@@ -448,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
448 * (c) a page and its buddy have the same order && 441 * (c) a page and its buddy have the same order &&
449 * (d) a page and its buddy are in the same zone. 442 * (d) a page and its buddy are in the same zone.
450 * 443 *
451 * For recording whether a page is in the buddy system, we use PG_buddy. 444 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
452 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 445 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
453 * 446 *
454 * For recording page's order, we use page_private(page). 447 * For recording page's order, we use page_private(page).
455 */ 448 */
@@ -482,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
482 * as necessary, plus some accounting needed to play nicely with other 475 * as necessary, plus some accounting needed to play nicely with other
483 * parts of the VM system. 476 * parts of the VM system.
484 * At each level, we keep a list of pages, which are heads of continuous 477 * At each level, we keep a list of pages, which are heads of continuous
485 * free pages of length of (1 << order) and marked with PG_buddy. Page's 478 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
486 * order is recorded in page_private(page) field. 479 * order is recorded in page_private(page) field.
487 * So when we are allocating or freeing one, we can derive the state of the 480 * So when we are allocating or freeing one, we can derive the state of the
488 * other. That is, if we allocate a small block, and both were 481 * other. That is, if we allocate a small block, and both were
@@ -499,6 +492,7 @@ static inline void __free_one_page(struct page *page,
499{ 492{
500 unsigned long page_idx; 493 unsigned long page_idx;
501 unsigned long combined_idx; 494 unsigned long combined_idx;
495 unsigned long uninitialized_var(buddy_idx);
502 struct page *buddy; 496 struct page *buddy;
503 497
504 if (unlikely(PageCompound(page))) 498 if (unlikely(PageCompound(page)))
@@ -513,7 +507,8 @@ static inline void __free_one_page(struct page *page,
513 VM_BUG_ON(bad_range(zone, page)); 507 VM_BUG_ON(bad_range(zone, page));
514 508
515 while (order < MAX_ORDER-1) { 509 while (order < MAX_ORDER-1) {
516 buddy = __page_find_buddy(page, page_idx, order); 510 buddy_idx = __find_buddy_index(page_idx, order);
511 buddy = page + (buddy_idx - page_idx);
517 if (!page_is_buddy(page, buddy, order)) 512 if (!page_is_buddy(page, buddy, order))
518 break; 513 break;
519 514
@@ -521,7 +516,7 @@ static inline void __free_one_page(struct page *page,
521 list_del(&buddy->lru); 516 list_del(&buddy->lru);
522 zone->free_area[order].nr_free--; 517 zone->free_area[order].nr_free--;
523 rmv_page_order(buddy); 518 rmv_page_order(buddy);
524 combined_idx = __find_combined_index(page_idx, order); 519 combined_idx = buddy_idx & page_idx;
525 page = page + (combined_idx - page_idx); 520 page = page + (combined_idx - page_idx);
526 page_idx = combined_idx; 521 page_idx = combined_idx;
527 order++; 522 order++;
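
The buddy bookkeeping above reduces to two bit operations: the buddy of the block at page_idx for a given order lives at page_idx ^ (1 << order), and once the pair is merged the combined block starts at buddy_idx & page_idx, which is exactly what the removed __find_combined_index() computed. A small demonstration:

/*
 * Demonstration of the buddy index arithmetic: XOR finds the buddy of a
 * block, AND of the pair gives the index of the merged block.
 */
#include <stdio.h>

static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

int main(void)
{
	unsigned long page_idx = 12;	/* block of 2^2 pages starting at index 12 */
	unsigned int order = 2;

	unsigned long buddy_idx = find_buddy_index(page_idx, order);
	unsigned long combined_idx = buddy_idx & page_idx;

	/* buddy of 12 at order 2 is 8; merging them yields an order-3 block at 8 */
	printf("page=%lu buddy=%lu combined=%lu (order %u -> %u)\n",
	       page_idx, buddy_idx, combined_idx, order, order + 1);
	return 0;
}
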
@@ -538,9 +533,10 @@ static inline void __free_one_page(struct page *page,
538 */ 533 */
539 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 534 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
540 struct page *higher_page, *higher_buddy; 535 struct page *higher_page, *higher_buddy;
541 combined_idx = __find_combined_index(page_idx, order); 536 combined_idx = buddy_idx & page_idx;
542 higher_page = page + combined_idx - page_idx; 537 higher_page = page + (combined_idx - page_idx);
543 higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); 538 buddy_idx = __find_buddy_index(combined_idx, order + 1);
539 higher_buddy = page + (buddy_idx - combined_idx);
544 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 540 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
545 list_add_tail(&page->lru, 541 list_add_tail(&page->lru,
546 &zone->free_area[order].free_list[migratetype]); 542 &zone->free_area[order].free_list[migratetype]);
@@ -651,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
651 trace_mm_page_free_direct(page, order); 647 trace_mm_page_free_direct(page, order);
652 kmemcheck_free_shadow(page, order); 648 kmemcheck_free_shadow(page, order);
653 649
654 for (i = 0; i < (1 << order); i++) { 650 if (PageAnon(page))
655 struct page *pg = page + i; 651 page->mapping = NULL;
656 652 for (i = 0; i < (1 << order); i++)
657 if (PageAnon(pg)) 653 bad += free_pages_check(page + i);
658 pg->mapping = NULL;
659 bad += free_pages_check(pg);
660 }
661 if (bad) 654 if (bad)
662 return false; 655 return false;
663 656
@@ -1460,24 +1453,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1460#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1453#endif /* CONFIG_FAIL_PAGE_ALLOC */
1461 1454
1462/* 1455/*
1463 * Return 1 if free pages are above 'mark'. This takes into account the order 1456 * Return true if free pages are above 'mark'. This takes into account the order
1464 * of the allocation. 1457 * of the allocation.
1465 */ 1458 */
1466int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1459static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1467 int classzone_idx, int alloc_flags) 1460 int classzone_idx, int alloc_flags, long free_pages)
1468{ 1461{
1469 /* free_pages my go negative - that's OK */ 1462 /* free_pages my go negative - that's OK */
1470 long min = mark; 1463 long min = mark;
1471 long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1472 int o; 1464 int o;
1473 1465
1466 free_pages -= (1 << order) - 1;
1474 if (alloc_flags & ALLOC_HIGH) 1467 if (alloc_flags & ALLOC_HIGH)
1475 min -= min / 2; 1468 min -= min / 2;
1476 if (alloc_flags & ALLOC_HARDER) 1469 if (alloc_flags & ALLOC_HARDER)
1477 min -= min / 4; 1470 min -= min / 4;
1478 1471
1479 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1472 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1480 return 0; 1473 return false;
1481 for (o = 0; o < order; o++) { 1474 for (o = 0; o < order; o++) {
1482 /* At the next order, this order's pages become unavailable */ 1475 /* At the next order, this order's pages become unavailable */
1483 free_pages -= z->free_area[o].nr_free << o; 1476 free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1479,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1486 min >>= 1; 1479 min >>= 1;
1487 1480
1488 if (free_pages <= min) 1481 if (free_pages <= min)
1489 return 0; 1482 return false;
1490 } 1483 }
1491 return 1; 1484 return true;
1485}
1486
1487bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1488 int classzone_idx, int alloc_flags)
1489{
1490 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1491 zone_page_state(z, NR_FREE_PAGES));
1492}
1493
1494bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1495 int classzone_idx, int alloc_flags)
1496{
1497 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1498
1499 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1500 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1501
1502 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1503 free_pages);
1492} 1504}
1493 1505
1494#ifdef CONFIG_NUMA 1506#ifdef CONFIG_NUMA
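
zone_watermark_ok() and the new zone_watermark_ok_safe() now share __zone_watermark_ok(), which takes the free-page count as a parameter so the safe variant can substitute the exact snapshot when the per-cpu estimate has drifted near the mark. A standalone sketch of the shared check itself is below; the zone layout and numbers are toy inputs, not kernel state.

/*
 * Standalone sketch of the shared watermark check: subtract the pages of
 * the requested order, then walk the lower orders, discounting blocks that
 * cannot satisfy the request and requiring less slack as the order rises.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER	11

struct toy_zone {
	long nr_free[MAX_ORDER];	/* free blocks per order */
	long lowmem_reserve;
};

static bool watermark_ok(const struct toy_zone *z, unsigned int order,
			 long mark, long free_pages)
{
	long min = mark;

	/* The pages we are about to take no longer count as free. */
	free_pages -= (1L << order) - 1;

	if (free_pages <= min + z->lowmem_reserve)
		return false;

	for (unsigned int o = 0; o < order; o++) {
		/* Blocks of order o cannot satisfy this request... */
		free_pages -= z->nr_free[o] << o;
		/* ...and we require less headroom at each higher order. */
		min >>= 1;
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	struct toy_zone z = {
		.nr_free = { 64, 32, 16, 8, 4, 2, 1 },
		.lowmem_reserve = 32,
	};
	long free_pages = 448;	/* total pages represented by nr_free above */

	printf("order 0 ok? %d\n", watermark_ok(&z, 0, 128, free_pages));
	printf("order 6 ok? %d\n", watermark_ok(&z, 6, 128, free_pages));
	return 0;
}
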
@@ -1793,15 +1805,18 @@ static struct page *
1793__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1805__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1794 struct zonelist *zonelist, enum zone_type high_zoneidx, 1806 struct zonelist *zonelist, enum zone_type high_zoneidx,
1795 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1807 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1796 int migratetype, unsigned long *did_some_progress) 1808 int migratetype, unsigned long *did_some_progress,
1809 bool sync_migration)
1797{ 1810{
1798 struct page *page; 1811 struct page *page;
1799 1812
1800 if (!order || compaction_deferred(preferred_zone)) 1813 if (!order || compaction_deferred(preferred_zone))
1801 return NULL; 1814 return NULL;
1802 1815
1816 current->flags |= PF_MEMALLOC;
1803 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1817 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1804 nodemask); 1818 nodemask, sync_migration);
1819 current->flags &= ~PF_MEMALLOC;
1805 if (*did_some_progress != COMPACT_SKIPPED) { 1820 if (*did_some_progress != COMPACT_SKIPPED) {
1806 1821
1807 /* Page migration frees to the PCP lists but we want merging */ 1822 /* Page migration frees to the PCP lists but we want merging */
@@ -1837,7 +1852,8 @@ static inline struct page *
1837__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1852__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1838 struct zonelist *zonelist, enum zone_type high_zoneidx, 1853 struct zonelist *zonelist, enum zone_type high_zoneidx,
1839 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1854 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1840 int migratetype, unsigned long *did_some_progress) 1855 int migratetype, unsigned long *did_some_progress,
1856 bool sync_migration)
1841{ 1857{
1842 return NULL; 1858 return NULL;
1843} 1859}
@@ -1852,23 +1868,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1852{ 1868{
1853 struct page *page = NULL; 1869 struct page *page = NULL;
1854 struct reclaim_state reclaim_state; 1870 struct reclaim_state reclaim_state;
1855 struct task_struct *p = current;
1856 bool drained = false; 1871 bool drained = false;
1857 1872
1858 cond_resched(); 1873 cond_resched();
1859 1874
1860 /* We now go into synchronous reclaim */ 1875 /* We now go into synchronous reclaim */
1861 cpuset_memory_pressure_bump(); 1876 cpuset_memory_pressure_bump();
1862 p->flags |= PF_MEMALLOC; 1877 current->flags |= PF_MEMALLOC;
1863 lockdep_set_current_reclaim_state(gfp_mask); 1878 lockdep_set_current_reclaim_state(gfp_mask);
1864 reclaim_state.reclaimed_slab = 0; 1879 reclaim_state.reclaimed_slab = 0;
1865 p->reclaim_state = &reclaim_state; 1880 current->reclaim_state = &reclaim_state;
1866 1881
1867 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 1882 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1868 1883
1869 p->reclaim_state = NULL; 1884 current->reclaim_state = NULL;
1870 lockdep_clear_current_reclaim_state(); 1885 lockdep_clear_current_reclaim_state();
1871 p->flags &= ~PF_MEMALLOC; 1886 current->flags &= ~PF_MEMALLOC;
1872 1887
1873 cond_resched(); 1888 cond_resched();
1874 1889
@@ -1920,19 +1935,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1920 1935
1921static inline 1936static inline
1922void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 1937void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1923 enum zone_type high_zoneidx) 1938 enum zone_type high_zoneidx,
1939 enum zone_type classzone_idx)
1924{ 1940{
1925 struct zoneref *z; 1941 struct zoneref *z;
1926 struct zone *zone; 1942 struct zone *zone;
1927 1943
1928 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1944 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1929 wakeup_kswapd(zone, order); 1945 wakeup_kswapd(zone, order, classzone_idx);
1930} 1946}
1931 1947
1932static inline int 1948static inline int
1933gfp_to_alloc_flags(gfp_t gfp_mask) 1949gfp_to_alloc_flags(gfp_t gfp_mask)
1934{ 1950{
1935 struct task_struct *p = current;
1936 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 1951 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1937 const gfp_t wait = gfp_mask & __GFP_WAIT; 1952 const gfp_t wait = gfp_mask & __GFP_WAIT;
1938 1953
@@ -1948,18 +1963,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1948 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 1963 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1949 1964
1950 if (!wait) { 1965 if (!wait) {
1951 alloc_flags |= ALLOC_HARDER; 1966 /*
1967 * Not worth trying to allocate harder for
1968 * __GFP_NOMEMALLOC even if it can't schedule.
1969 */
1970 if (!(gfp_mask & __GFP_NOMEMALLOC))
1971 alloc_flags |= ALLOC_HARDER;
1952 /* 1972 /*
1953 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1973 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1954 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1974 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1955 */ 1975 */
1956 alloc_flags &= ~ALLOC_CPUSET; 1976 alloc_flags &= ~ALLOC_CPUSET;
1957 } else if (unlikely(rt_task(p)) && !in_interrupt()) 1977 } else if (unlikely(rt_task(current)) && !in_interrupt())
1958 alloc_flags |= ALLOC_HARDER; 1978 alloc_flags |= ALLOC_HARDER;
1959 1979
1960 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 1980 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1961 if (!in_interrupt() && 1981 if (!in_interrupt() &&
1962 ((p->flags & PF_MEMALLOC) || 1982 ((current->flags & PF_MEMALLOC) ||
1963 unlikely(test_thread_flag(TIF_MEMDIE)))) 1983 unlikely(test_thread_flag(TIF_MEMDIE))))
1964 alloc_flags |= ALLOC_NO_WATERMARKS; 1984 alloc_flags |= ALLOC_NO_WATERMARKS;
1965 } 1985 }
@@ -1978,7 +1998,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1978 int alloc_flags; 1998 int alloc_flags;
1979 unsigned long pages_reclaimed = 0; 1999 unsigned long pages_reclaimed = 0;
1980 unsigned long did_some_progress; 2000 unsigned long did_some_progress;
1981 struct task_struct *p = current; 2001 bool sync_migration = false;
1982 2002
1983 /* 2003 /*
1984 * In the slowpath, we sanity check order to avoid ever trying to 2004 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2003,7 +2023,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2003 goto nopage; 2023 goto nopage;
2004 2024
2005restart: 2025restart:
2006 wake_all_kswapd(order, zonelist, high_zoneidx); 2026 if (!(gfp_mask & __GFP_NO_KSWAPD))
2027 wake_all_kswapd(order, zonelist, high_zoneidx,
2028 zone_idx(preferred_zone));
2007 2029
2008 /* 2030 /*
2009 * OK, we're below the kswapd watermark and have kicked background 2031 * OK, we're below the kswapd watermark and have kicked background
@@ -2034,21 +2056,26 @@ rebalance:
2034 goto nopage; 2056 goto nopage;
2035 2057
2036 /* Avoid recursion of direct reclaim */ 2058 /* Avoid recursion of direct reclaim */
2037 if (p->flags & PF_MEMALLOC) 2059 if (current->flags & PF_MEMALLOC)
2038 goto nopage; 2060 goto nopage;
2039 2061
2040 /* Avoid allocations with no watermarks from looping endlessly */ 2062 /* Avoid allocations with no watermarks from looping endlessly */
2041 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2063 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2042 goto nopage; 2064 goto nopage;
2043 2065
2044 /* Try direct compaction */ 2066 /*
2067 * Try direct compaction. The first pass is asynchronous. Subsequent
2068 * attempts after direct reclaim are synchronous
2069 */
2045 page = __alloc_pages_direct_compact(gfp_mask, order, 2070 page = __alloc_pages_direct_compact(gfp_mask, order,
2046 zonelist, high_zoneidx, 2071 zonelist, high_zoneidx,
2047 nodemask, 2072 nodemask,
2048 alloc_flags, preferred_zone, 2073 alloc_flags, preferred_zone,
2049 migratetype, &did_some_progress); 2074 migratetype, &did_some_progress,
2075 sync_migration);
2050 if (page) 2076 if (page)
2051 goto got_pg; 2077 goto got_pg;
2078 sync_migration = true;
2052 2079
2053 /* Try direct reclaim and then allocating */ 2080 /* Try direct reclaim and then allocating */
2054 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2081 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2102,13 +2129,27 @@ rebalance:
2102 /* Wait for some write requests to complete then retry */ 2129 /* Wait for some write requests to complete then retry */
2103 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2130 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2104 goto rebalance; 2131 goto rebalance;
2132 } else {
2133 /*
2134 * High-order allocations do not necessarily loop after
2135 * direct reclaim and reclaim/compaction depends on compaction
2136 * being called after reclaim so call directly if necessary
2137 */
2138 page = __alloc_pages_direct_compact(gfp_mask, order,
2139 zonelist, high_zoneidx,
2140 nodemask,
2141 alloc_flags, preferred_zone,
2142 migratetype, &did_some_progress,
2143 sync_migration);
2144 if (page)
2145 goto got_pg;
2105 } 2146 }
2106 2147
2107nopage: 2148nopage:
2108 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 2149 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
2109 printk(KERN_WARNING "%s: page allocation failure." 2150 printk(KERN_WARNING "%s: page allocation failure."
2110 " order:%d, mode:0x%x\n", 2151 " order:%d, mode:0x%x\n",
2111 p->comm, order, gfp_mask); 2152 current->comm, order, gfp_mask);
2112 dump_stack(); 2153 dump_stack();
2113 show_mem(); 2154 show_mem();
2114 } 2155 }
@@ -2442,7 +2483,7 @@ void show_free_areas(void)
2442 " all_unreclaimable? %s" 2483 " all_unreclaimable? %s"
2443 "\n", 2484 "\n",
2444 zone->name, 2485 zone->name,
2445 K(zone_nr_free_pages(zone)), 2486 K(zone_page_state(zone, NR_FREE_PAGES)),
2446 K(min_wmark_pages(zone)), 2487 K(min_wmark_pages(zone)),
2447 K(low_wmark_pages(zone)), 2488 K(low_wmark_pages(zone)),
2448 K(high_wmark_pages(zone)), 2489 K(high_wmark_pages(zone)),
@@ -2585,9 +2626,16 @@ static int __parse_numa_zonelist_order(char *s)
2585 2626
2586static __init int setup_numa_zonelist_order(char *s) 2627static __init int setup_numa_zonelist_order(char *s)
2587{ 2628{
2588 if (s) 2629 int ret;
2589 return __parse_numa_zonelist_order(s); 2630
2590 return 0; 2631 if (!s)
2632 return 0;
2633
2634 ret = __parse_numa_zonelist_order(s);
2635 if (ret == 0)
2636 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
2637
2638 return ret;
2591} 2639}
2592early_param("numa_zonelist_order", setup_numa_zonelist_order); 2640early_param("numa_zonelist_order", setup_numa_zonelist_order);
2593 2641
@@ -5517,7 +5565,6 @@ static struct trace_print_flags pageflag_names[] = {
5517 {1UL << PG_swapcache, "swapcache" }, 5565 {1UL << PG_swapcache, "swapcache" },
5518 {1UL << PG_mappedtodisk, "mappedtodisk" }, 5566 {1UL << PG_mappedtodisk, "mappedtodisk" },
5519 {1UL << PG_reclaim, "reclaim" }, 5567 {1UL << PG_reclaim, "reclaim" },
5520 {1UL << PG_buddy, "buddy" },
5521 {1UL << PG_swapbacked, "swapbacked" }, 5568 {1UL << PG_swapbacked, "swapbacked" },
5522 {1UL << PG_unevictable, "unevictable" }, 5569 {1UL << PG_unevictable, "unevictable" },
5523#ifdef CONFIG_MMU 5570#ifdef CONFIG_MMU
@@ -5565,7 +5612,7 @@ void dump_page(struct page *page)
5565{ 5612{
5566 printk(KERN_ALERT 5613 printk(KERN_ALERT
5567 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 5614 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5568 page, page_count(page), page_mapcount(page), 5615 page, atomic_read(&page->_count), page_mapcount(page),
5569 page->mapping, page->index); 5616 page->mapping, page->index);
5570 dump_page_flags(page->flags); 5617 dump_page_flags(page->flags);
5571} 5618}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 38cc58b8b2b0..7cfa6ae02303 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
34 pmd = pmd_offset(pud, addr); 34 pmd = pmd_offset(pud, addr);
35 do { 35 do {
36 next = pmd_addr_end(addr, end); 36 next = pmd_addr_end(addr, end);
37 split_huge_page_pmd(walk->mm, pmd);
37 if (pmd_none_or_clear_bad(pmd)) { 38 if (pmd_none_or_clear_bad(pmd)) {
38 if (walk->pte_hole) 39 if (walk->pte_hole)
39 err = walk->pte_hole(addr, next, walk); 40 err = walk->pte_hole(addr, next, walk);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 7d9c1d0ebd3f..ea534960a04b 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
421 return NULL; 421 return NULL;
422 422
423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, 423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
424 pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); 424 pcpu_nr_groups, pcpu_atom_size);
425 if (!vms) { 425 if (!vms) {
426 pcpu_free_chunk(chunk); 426 pcpu_free_chunk(chunk);
427 return NULL; 427 return NULL;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644
index 000000000000..d030548047e2
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,123 @@
1/*
2 * mm/pgtable-generic.c
3 *
4 * Generic pgtable methods declared in asm-generic/pgtable.h
5 *
6 * Copyright (C) 2010 Linus Torvalds
7 */
8
9#include <asm/tlb.h>
10#include <asm-generic/pgtable.h>
11
12#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
13/*
14 * Only sets the access flags (dirty, accessed, and
15 * writable). Furthermore, we know it always gets set to a "more
16 * permissive" setting, which allows most architectures to optimize
17 * this. We return whether the PTE actually changed, which in turn
18 * instructs the caller to do things like update__mmu_cache. This
19 * used to be done in the caller, but sparc needs minor faults to
20 * force that call on sun4c so we changed this macro slightly
21 */
22int ptep_set_access_flags(struct vm_area_struct *vma,
23 unsigned long address, pte_t *ptep,
24 pte_t entry, int dirty)
25{
26 int changed = !pte_same(*ptep, entry);
27 if (changed) {
28 set_pte_at(vma->vm_mm, address, ptep, entry);
29 flush_tlb_page(vma, address);
30 }
31 return changed;
32}
33#endif
34
35#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
36int pmdp_set_access_flags(struct vm_area_struct *vma,
37 unsigned long address, pmd_t *pmdp,
38 pmd_t entry, int dirty)
39{
40#ifdef CONFIG_TRANSPARENT_HUGEPAGE
41 int changed = !pmd_same(*pmdp, entry);
42 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
43 if (changed) {
44 set_pmd_at(vma->vm_mm, address, pmdp, entry);
45 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
46 }
47 return changed;
48#else /* CONFIG_TRANSPARENT_HUGEPAGE */
49 BUG();
50 return 0;
51#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
52}
53#endif
54
55#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
56int ptep_clear_flush_young(struct vm_area_struct *vma,
57 unsigned long address, pte_t *ptep)
58{
59 int young;
60 young = ptep_test_and_clear_young(vma, address, ptep);
61 if (young)
62 flush_tlb_page(vma, address);
63 return young;
64}
65#endif
66
67#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
68int pmdp_clear_flush_young(struct vm_area_struct *vma,
69 unsigned long address, pmd_t *pmdp)
70{
71 int young;
72#ifndef CONFIG_TRANSPARENT_HUGEPAGE
73 BUG();
74#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
75 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
76 young = pmdp_test_and_clear_young(vma, address, pmdp);
77 if (young)
78 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
79 return young;
80}
81#endif
82
83#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
84pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
85 pte_t *ptep)
86{
87 pte_t pte;
88 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
89 flush_tlb_page(vma, address);
90 return pte;
91}
92#endif
93
94#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
95pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
96 pmd_t *pmdp)
97{
98 pmd_t pmd;
99#ifndef CONFIG_TRANSPARENT_HUGEPAGE
100 BUG();
101#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
102 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
103 pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
104 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
105 return pmd;
106}
107#endif
108
109#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
110pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
111 pmd_t *pmdp)
112{
113#ifdef CONFIG_TRANSPARENT_HUGEPAGE
114 pmd_t pmd = pmd_mksplitting(*pmdp);
115 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
116 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
117 /* tlb flush only to serialize against gup-fast */
118 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
119#else /* CONFIG_TRANSPARENT_HUGEPAGE */
120 BUG();
121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
122}
123#endif
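
The new mm/pgtable-generic.c collects the default implementations that architectures may override, and they all follow the same shape: compare against the current entry, update only if it changed, and flush the TLB for exactly the affected range. A toy illustration of that shape, with the entry type and flush reduced to stand-ins rather than the kernel's types:

/*
 * Toy illustration of the common shape of the generic helpers above:
 * update the entry only when it actually changes, and flush only then.
 */
#include <stdbool.h>
#include <stdio.h>

typedef unsigned long toy_pte_t;

static void flush_tlb_toy(unsigned long address)
{
	printf("flush TLB for %#lx\n", address);
}

/* Returns whether the entry changed, so the caller can update caches. */
static bool set_access_flags(toy_pte_t *ptep, toy_pte_t entry,
			     unsigned long address)
{
	bool changed = (*ptep != entry);

	if (changed) {
		*ptep = entry;		/* set_pte_at() in the real code */
		flush_tlb_toy(address);
	}
	return changed;
}

int main(void)
{
	toy_pte_t pte = 0x1000 | 0x1;			/* present */

	printf("changed=%d\n", set_access_flags(&pte, 0x1000 | 0x3, 0x400000));
	printf("changed=%d\n", set_access_flags(&pte, 0x1000 | 0x3, 0x400000));
	return 0;
}
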
diff --git a/mm/rmap.c b/mm/rmap.c
index c95d2ba27a0b..f21f4a1d6a1c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
177 list_add(&avc->same_vma, &vma->anon_vma_chain); 177 list_add(&avc->same_vma, &vma->anon_vma_chain);
178 178
179 anon_vma_lock(anon_vma); 179 anon_vma_lock(anon_vma);
180 /*
181 * It's critical to add new vmas to the tail of the anon_vma,
182 * see comment in huge_memory.c:__split_huge_page().
183 */
180 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 184 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
181 anon_vma_unlock(anon_vma); 185 anon_vma_unlock(anon_vma);
182} 186}
@@ -360,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
360 * Returns virtual address or -EFAULT if page's index/offset is not 364 * Returns virtual address or -EFAULT if page's index/offset is not
361 * within the range mapped the @vma. 365 * within the range mapped the @vma.
362 */ 366 */
363static inline unsigned long 367inline unsigned long
364vma_address(struct page *page, struct vm_area_struct *vma) 368vma_address(struct page *page, struct vm_area_struct *vma)
365{ 369{
366 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 370 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -435,6 +439,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
435 pmd = pmd_offset(pud, address); 439 pmd = pmd_offset(pud, address);
436 if (!pmd_present(*pmd)) 440 if (!pmd_present(*pmd))
437 return NULL; 441 return NULL;
442 if (pmd_trans_huge(*pmd))
443 return NULL;
438 444
439 pte = pte_offset_map(pmd, address); 445 pte = pte_offset_map(pmd, address);
440 /* Make a quick check before getting the lock */ 446 /* Make a quick check before getting the lock */
@@ -489,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
489 unsigned long *vm_flags) 495 unsigned long *vm_flags)
490{ 496{
491 struct mm_struct *mm = vma->vm_mm; 497 struct mm_struct *mm = vma->vm_mm;
492 pte_t *pte;
493 spinlock_t *ptl;
494 int referenced = 0; 498 int referenced = 0;
495 499
496 pte = page_check_address(page, mm, address, &ptl, 0);
497 if (!pte)
498 goto out;
499
500 /* 500 /*
501 * Don't want to elevate referenced for mlocked page that gets this far, 501 * Don't want to elevate referenced for mlocked page that gets this far,
502 * in order that it progresses to try_to_unmap and is moved to the 502 * in order that it progresses to try_to_unmap and is moved to the
503 * unevictable list. 503 * unevictable list.
504 */ 504 */
505 if (vma->vm_flags & VM_LOCKED) { 505 if (vma->vm_flags & VM_LOCKED) {
506 *mapcount = 1; /* break early from loop */ 506 *mapcount = 0; /* break early from loop */
507 *vm_flags |= VM_LOCKED; 507 *vm_flags |= VM_LOCKED;
508 goto out_unmap; 508 goto out;
509 }
510
511 if (ptep_clear_flush_young_notify(vma, address, pte)) {
512 /*
513 * Don't treat a reference through a sequentially read
514 * mapping as such. If the page has been used in
515 * another mapping, we will catch it; if this other
516 * mapping is already gone, the unmap path will have
517 * set PG_referenced or activated the page.
518 */
519 if (likely(!VM_SequentialReadHint(vma)))
520 referenced++;
521 } 509 }
522 510
523 /* Pretend the page is referenced if the task has the 511 /* Pretend the page is referenced if the task has the
@@ -526,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
526 rwsem_is_locked(&mm->mmap_sem)) 514 rwsem_is_locked(&mm->mmap_sem))
527 referenced++; 515 referenced++;
528 516
529out_unmap: 517 if (unlikely(PageTransHuge(page))) {
518 pmd_t *pmd;
519
520 spin_lock(&mm->page_table_lock);
521 pmd = page_check_address_pmd(page, mm, address,
522 PAGE_CHECK_ADDRESS_PMD_FLAG);
523 if (pmd && !pmd_trans_splitting(*pmd) &&
524 pmdp_clear_flush_young_notify(vma, address, pmd))
525 referenced++;
526 spin_unlock(&mm->page_table_lock);
527 } else {
528 pte_t *pte;
529 spinlock_t *ptl;
530
531 pte = page_check_address(page, mm, address, &ptl, 0);
532 if (!pte)
533 goto out;
534
535 if (ptep_clear_flush_young_notify(vma, address, pte)) {
536 /*
537 * Don't treat a reference through a sequentially read
538 * mapping as such. If the page has been used in
539 * another mapping, we will catch it; if this other
540 * mapping is already gone, the unmap path will have
541 * set PG_referenced or activated the page.
542 */
543 if (likely(!VM_SequentialReadHint(vma)))
544 referenced++;
545 }
546 pte_unmap_unlock(pte, ptl);
547 }
548
530 (*mapcount)--; 549 (*mapcount)--;
531 pte_unmap_unlock(pte, ptl);
532 550
533 if (referenced) 551 if (referenced)
534 *vm_flags |= vma->vm_flags; 552 *vm_flags |= vma->vm_flags;
@@ -864,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page,
864 struct vm_area_struct *vma, unsigned long address, int exclusive) 882 struct vm_area_struct *vma, unsigned long address, int exclusive)
865{ 883{
866 int first = atomic_inc_and_test(&page->_mapcount); 884 int first = atomic_inc_and_test(&page->_mapcount);
867 if (first) 885 if (first) {
868 __inc_zone_page_state(page, NR_ANON_PAGES); 886 if (!PageTransHuge(page))
887 __inc_zone_page_state(page, NR_ANON_PAGES);
888 else
889 __inc_zone_page_state(page,
890 NR_ANON_TRANSPARENT_HUGEPAGES);
891 }
869 if (unlikely(PageKsm(page))) 892 if (unlikely(PageKsm(page)))
870 return; 893 return;
871 894
@@ -893,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page,
893 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 916 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
894 SetPageSwapBacked(page); 917 SetPageSwapBacked(page);
895 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 918 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
896 __inc_zone_page_state(page, NR_ANON_PAGES); 919 if (!PageTransHuge(page))
920 __inc_zone_page_state(page, NR_ANON_PAGES);
921 else
922 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
897 __page_set_anon_rmap(page, vma, address, 1); 923 __page_set_anon_rmap(page, vma, address, 1);
898 if (page_evictable(page, vma)) 924 if (page_evictable(page, vma))
899 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 925 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -911,7 +937,7 @@ void page_add_file_rmap(struct page *page)
911{ 937{
912 if (atomic_inc_and_test(&page->_mapcount)) { 938 if (atomic_inc_and_test(&page->_mapcount)) {
913 __inc_zone_page_state(page, NR_FILE_MAPPED); 939 __inc_zone_page_state(page, NR_FILE_MAPPED);
914 mem_cgroup_update_file_mapped(page, 1); 940 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
915 } 941 }
916} 942}
917 943
@@ -946,10 +972,14 @@ void page_remove_rmap(struct page *page)
946 return; 972 return;
947 if (PageAnon(page)) { 973 if (PageAnon(page)) {
948 mem_cgroup_uncharge_page(page); 974 mem_cgroup_uncharge_page(page);
949 __dec_zone_page_state(page, NR_ANON_PAGES); 975 if (!PageTransHuge(page))
976 __dec_zone_page_state(page, NR_ANON_PAGES);
977 else
978 __dec_zone_page_state(page,
979 NR_ANON_TRANSPARENT_HUGEPAGES);
950 } else { 980 } else {
951 __dec_zone_page_state(page, NR_FILE_MAPPED); 981 __dec_zone_page_state(page, NR_FILE_MAPPED);
952 mem_cgroup_update_file_mapped(page, -1); 982 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
953 } 983 }
954 /* 984 /*
955 * It would be tidy to reset the PageAnon mapping here, 985 * It would be tidy to reset the PageAnon mapping here,
@@ -1202,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1202 return ret; 1232 return ret;
1203} 1233}
1204 1234
1205static bool is_vma_temporary_stack(struct vm_area_struct *vma) 1235bool is_vma_temporary_stack(struct vm_area_struct *vma)
1206{ 1236{
1207 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1237 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1208 1238
@@ -1400,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1400 int ret; 1430 int ret;
1401 1431
1402 BUG_ON(!PageLocked(page)); 1432 BUG_ON(!PageLocked(page));
1433 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
1403 1434
1404 if (unlikely(PageKsm(page))) 1435 if (unlikely(PageKsm(page)))
1405 ret = try_to_unmap_ksm(page, flags); 1436 ret = try_to_unmap_ksm(page, flags);
diff --git a/mm/slub.c b/mm/slub.c
index 008cd743a36a..c7ef0070dd86 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3636,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3636 len += sprintf(buf + len, "%7ld ", l->count); 3636 len += sprintf(buf + len, "%7ld ", l->count);
3637 3637
3638 if (l->addr) 3638 if (l->addr)
3639 len += sprint_symbol(buf + len, (unsigned long)l->addr); 3639 len += sprintf(buf + len, "%pS", (void *)l->addr);
3640 else 3640 else
3641 len += sprintf(buf + len, "<not-available>"); 3641 len += sprintf(buf + len, "<not-available>");
3642 3642
@@ -3946,12 +3946,9 @@ SLAB_ATTR(min_partial);
3946 3946
3947static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3947static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3948{ 3948{
3949 if (s->ctor) { 3949 if (!s->ctor)
3950 int n = sprint_symbol(buf, (unsigned long)s->ctor); 3950 return 0;
3951 3951 return sprintf(buf, "%pS\n", s->ctor);
3952 return n + sprintf(buf + n, "\n");
3953 }
3954 return 0;
3955} 3952}
3956SLAB_ATTR_RO(ctor); 3953SLAB_ATTR_RO(ctor);
3957 3954
diff --git a/mm/sparse.c b/mm/sparse.c
index 95ac219af379..93250207c5cf 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
671static void free_map_bootmem(struct page *page, unsigned long nr_pages) 671static void free_map_bootmem(struct page *page, unsigned long nr_pages)
672{ 672{
673 unsigned long maps_section_nr, removing_section_nr, i; 673 unsigned long maps_section_nr, removing_section_nr, i;
674 int magic; 674 unsigned long magic;
675 675
676 for (i = 0; i < nr_pages; i++, page++) { 676 for (i = 0; i < nr_pages; i++, page++) {
677 magic = atomic_read(&page->_mapcount); 677 magic = (unsigned long) page->lru.next;
678 678
679 BUG_ON(magic == NODE_INFO); 679 BUG_ON(magic == NODE_INFO);
680 680
diff --git a/mm/swap.c b/mm/swap.c
index 3f4854205b16..bbc1ce9f9460 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page)
56 del_page_from_lru(zone, page); 56 del_page_from_lru(zone, page);
57 spin_unlock_irqrestore(&zone->lru_lock, flags); 57 spin_unlock_irqrestore(&zone->lru_lock, flags);
58 } 58 }
59}
60
61static void __put_single_page(struct page *page)
62{
63 __page_cache_release(page);
59 free_hot_cold_page(page, 0); 64 free_hot_cold_page(page, 0);
60} 65}
61 66
62static void put_compound_page(struct page *page) 67static void __put_compound_page(struct page *page)
63{ 68{
64 page = compound_head(page); 69 compound_page_dtor *dtor;
65 if (put_page_testzero(page)) {
66 compound_page_dtor *dtor;
67 70
68 dtor = get_compound_page_dtor(page); 71 __page_cache_release(page);
69 (*dtor)(page); 72 dtor = get_compound_page_dtor(page);
73 (*dtor)(page);
74}
75
76static void put_compound_page(struct page *page)
77{
78 if (unlikely(PageTail(page))) {
79 /* __split_huge_page_refcount can run under us */
80 struct page *page_head = page->first_page;
81 smp_rmb();
82 /*
83 * If PageTail is still set after smp_rmb() we can be sure
84 * that the page->first_page we read wasn't a dangling pointer.
85 * See __split_huge_page_refcount() smp_wmb().
86 */
87 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
88 unsigned long flags;
89 /*
90 * Verify that our page_head wasn't converted
91 * to a regular page before we got a
92 * reference on it.
93 */
94 if (unlikely(!PageHead(page_head))) {
95 /* PageHead is cleared after PageTail */
96 smp_rmb();
97 VM_BUG_ON(PageTail(page));
98 goto out_put_head;
99 }
100 /*
101 * Only run compound_lock on a valid PageHead,
102 * after having it pinned with
103 * get_page_unless_zero() above.
104 */
105 smp_mb();
106 /* page_head wasn't a dangling pointer */
107 flags = compound_lock_irqsave(page_head);
108 if (unlikely(!PageTail(page))) {
109 /* __split_huge_page_refcount run before us */
110 compound_unlock_irqrestore(page_head, flags);
111 VM_BUG_ON(PageHead(page_head));
112 out_put_head:
113 if (put_page_testzero(page_head))
114 __put_single_page(page_head);
115 out_put_single:
116 if (put_page_testzero(page))
117 __put_single_page(page);
118 return;
119 }
120 VM_BUG_ON(page_head != page->first_page);
121 /*
122 * We can release the refcount taken by
123 * get_page_unless_zero now that
124 * split_huge_page_refcount is blocked on the
125 * compound_lock.
126 */
127 if (put_page_testzero(page_head))
128 VM_BUG_ON(1);
129 /* __split_huge_page_refcount will wait now */
130 VM_BUG_ON(atomic_read(&page->_count) <= 0);
131 atomic_dec(&page->_count);
132 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
133 compound_unlock_irqrestore(page_head, flags);
134 if (put_page_testzero(page_head)) {
135 if (PageHead(page_head))
136 __put_compound_page(page_head);
137 else
138 __put_single_page(page_head);
139 }
140 } else {
141 /* page_head is a dangling pointer */
142 VM_BUG_ON(PageTail(page));
143 goto out_put_single;
144 }
145 } else if (put_page_testzero(page)) {
146 if (PageHead(page))
147 __put_compound_page(page);
148 else
149 __put_single_page(page);
70 } 150 }
71} 151}
72 152
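
The reworked put_compound_page() has to tolerate __split_huge_page_refcount() running concurrently: it pins the head with get_page_unless_zero(), re-checks PageTail under the compound lock, and only then transfers the tail's reference. The take-a-reference-only-if-still-nonzero primitive at the heart of that dance can be sketched with a compare-and-swap loop; the toy_page type below is an assumption, not the kernel structure.

/*
 * Sketch of the get_page_unless_zero() primitive: take a reference only
 * if the object is not already on its way to being freed.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_page {
	atomic_int count;
};

static bool get_unless_zero(struct toy_page *page)
{
	int old = atomic_load(&page->count);

	while (old != 0) {
		/* Try to move old -> old + 1; on failure, old is reloaded. */
		if (atomic_compare_exchange_weak(&page->count, &old, old + 1))
			return true;
	}
	return false;		/* already zero: the page may be being freed */
}

static bool put_testzero(struct toy_page *page)
{
	return atomic_fetch_sub(&page->count, 1) == 1;
}

int main(void)
{
	struct toy_page live = { .count = 2 };
	struct toy_page dying = { .count = 0 };

	printf("pin live:  %d\n", get_unless_zero(&live));	/* 1 */
	printf("pin dying: %d\n", get_unless_zero(&dying));	/* 0 */
	if (!put_testzero(&live))
		printf("live page still has references\n");
	return 0;
}
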
@@ -75,7 +155,7 @@ void put_page(struct page *page)
75 if (unlikely(PageCompound(page))) 155 if (unlikely(PageCompound(page)))
76 put_compound_page(page); 156 put_compound_page(page);
77 else if (put_page_testzero(page)) 157 else if (put_page_testzero(page))
78 __page_cache_release(page); 158 __put_single_page(page);
79} 159}
80EXPORT_SYMBOL(put_page); 160EXPORT_SYMBOL(put_page);
81 161
@@ -98,15 +178,13 @@ void put_pages_list(struct list_head *pages)
98} 178}
99EXPORT_SYMBOL(put_pages_list); 179EXPORT_SYMBOL(put_pages_list);
100 180
101/* 181static void pagevec_lru_move_fn(struct pagevec *pvec,
102 * pagevec_move_tail() must be called with IRQ disabled. 182 void (*move_fn)(struct page *page, void *arg),
103 * Otherwise this may cause nasty races. 183 void *arg)
104 */
105static void pagevec_move_tail(struct pagevec *pvec)
106{ 184{
107 int i; 185 int i;
108 int pgmoved = 0;
109 struct zone *zone = NULL; 186 struct zone *zone = NULL;
187 unsigned long flags = 0;
110 188
111 for (i = 0; i < pagevec_count(pvec); i++) { 189 for (i = 0; i < pagevec_count(pvec); i++) {
112 struct page *page = pvec->pages[i]; 190 struct page *page = pvec->pages[i];
@@ -114,29 +192,49 @@ static void pagevec_move_tail(struct pagevec *pvec)
114 192
115 if (pagezone != zone) { 193 if (pagezone != zone) {
116 if (zone) 194 if (zone)
117 spin_unlock(&zone->lru_lock); 195 spin_unlock_irqrestore(&zone->lru_lock, flags);
118 zone = pagezone; 196 zone = pagezone;
119 spin_lock(&zone->lru_lock); 197 spin_lock_irqsave(&zone->lru_lock, flags);
120 }
121 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
122 int lru = page_lru_base_type(page);
123 list_move_tail(&page->lru, &zone->lru[lru].list);
124 pgmoved++;
125 } 198 }
199
200 (*move_fn)(page, arg);
126 } 201 }
127 if (zone) 202 if (zone)
128 spin_unlock(&zone->lru_lock); 203 spin_unlock_irqrestore(&zone->lru_lock, flags);
129 __count_vm_events(PGROTATED, pgmoved); 204 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
130 release_pages(pvec->pages, pvec->nr, pvec->cold);
131 pagevec_reinit(pvec); 205 pagevec_reinit(pvec);
132} 206}
133 207
208static void pagevec_move_tail_fn(struct page *page, void *arg)
209{
210 int *pgmoved = arg;
211 struct zone *zone = page_zone(page);
212
213 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
214 int lru = page_lru_base_type(page);
215 list_move_tail(&page->lru, &zone->lru[lru].list);
216 (*pgmoved)++;
217 }
218}
219
220/*
221 * pagevec_move_tail() must be called with IRQ disabled.
222 * Otherwise this may cause nasty races.
223 */
224static void pagevec_move_tail(struct pagevec *pvec)
225{
226 int pgmoved = 0;
227
228 pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
229 __count_vm_events(PGROTATED, pgmoved);
230}
231
134/* 232/*
135 * Writeback is about to end against a page which has been marked for immediate 233 * Writeback is about to end against a page which has been marked for immediate
136 * reclaim. If it still appears to be reclaimable, move it to the tail of the 234 * reclaim. If it still appears to be reclaimable, move it to the tail of the
137 * inactive list. 235 * inactive list.
138 */ 236 */
139void rotate_reclaimable_page(struct page *page) 237void rotate_reclaimable_page(struct page *page)
140{ 238{
141 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && 239 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
142 !PageUnevictable(page) && PageLRU(page)) { 240 !PageUnevictable(page) && PageLRU(page)) {
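
pagevec_lru_move_fn() factors the batching pattern out of pagevec_move_tail() and, further down, ____pagevec_lru_add(): walk the batch, retake the per-zone lock only when the zone changes, and let a caller-supplied callback do the per-page work. A generic standalone version of that loop is sketched below; the group/item types and pthread mutexes are stand-ins for zones, pages and zone->lru_lock.

/*
 * Generic version of the batching loop: hold a group's lock only while
 * consecutive items belong to it, and let a callback do the per-item work.
 */
#include <pthread.h>
#include <stdio.h>

struct group {
	pthread_mutex_t lock;
	const char *name;
};

struct item {
	struct group *owner;
	int value;
};

static void batch_apply(struct item *items, int nr,
			void (*fn)(struct item *item, void *arg), void *arg)
{
	struct group *locked = NULL;

	for (int i = 0; i < nr; i++) {
		struct group *owner = items[i].owner;

		if (owner != locked) {	/* the lock changes only at group boundaries */
			if (locked)
				pthread_mutex_unlock(&locked->lock);
			locked = owner;
			pthread_mutex_lock(&locked->lock);
		}
		fn(&items[i], arg);
	}
	if (locked)
		pthread_mutex_unlock(&locked->lock);
}

static void count_item(struct item *item, void *arg)
{
	int *moved = arg;

	printf("processing %s:%d\n", item->owner->name, item->value);
	(*moved)++;
}

int main(void)
{
	struct group a = { PTHREAD_MUTEX_INITIALIZER, "zone-a" };
	struct group b = { PTHREAD_MUTEX_INITIALIZER, "zone-b" };
	struct item batch[] = { { &a, 1 }, { &a, 2 }, { &b, 3 }, { &a, 4 } };
	int moved = 0;

	batch_apply(batch, 4, count_item, &moved);
	printf("moved %d items\n", moved);
	return 0;
}
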
@@ -173,27 +271,94 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page,
173} 271}
174 272
175/* 273/*
176 * FIXME: speed this up? 274 * A page will go to active list either by activate_page or putback_lru_page.
275 * In the activate_page case, the page hasn't active bit set. The page might
276 * not in LRU list because it's isolated before it gets a chance to be moved to
277 * active list. The window is small because pagevec just stores several pages.
278 * For such case, we do nothing for such page.
279 * In the putback_lru_page case, the page isn't in lru list but has active
280 * bit set
177 */ 281 */
178void activate_page(struct page *page) 282static void __activate_page(struct page *page, void *arg)
179{ 283{
180 struct zone *zone = page_zone(page); 284 struct zone *zone = page_zone(page);
285 int file = page_is_file_cache(page);
286 int lru = page_lru_base_type(page);
287 bool putback = !PageLRU(page);
181 288
182 spin_lock_irq(&zone->lru_lock); 289 /* The page is isolated before it's moved to active list */
183 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 290 if (!PageLRU(page) && !PageActive(page))
184 int file = page_is_file_cache(page); 291 return;
185 int lru = page_lru_base_type(page); 292 if ((PageLRU(page) && PageActive(page)) || PageUnevictable(page))
293 return;
294
295 if (!putback)
186 del_page_from_lru_list(zone, page, lru); 296 del_page_from_lru_list(zone, page, lru);
297 else
298 SetPageLRU(page);
187 299
188 SetPageActive(page); 300 SetPageActive(page);
189 lru += LRU_ACTIVE; 301 lru += LRU_ACTIVE;
190 add_page_to_lru_list(zone, page, lru); 302 add_page_to_lru_list(zone, page, lru);
191 __count_vm_event(PGACTIVATE);
192 303
193 update_page_reclaim_stat(zone, page, file, 1); 304 if (putback)
305 return;
306 __count_vm_event(PGACTIVATE);
307 update_page_reclaim_stat(zone, page, file, 1);
308}
309
310#ifdef CONFIG_SMP
311static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
312
313static void activate_page_drain(int cpu)
314{
315 struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
316
317 if (pagevec_count(pvec))
318 pagevec_lru_move_fn(pvec, __activate_page, NULL);
319}
320
321void activate_page(struct page *page)
322{
323 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
324 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
325
326 page_cache_get(page);
327 if (!pagevec_add(pvec, page))
328 pagevec_lru_move_fn(pvec, __activate_page, NULL);
329 put_cpu_var(activate_page_pvecs);
330 }
331}
332
333/* Caller should hold zone->lru_lock */
334int putback_active_lru_page(struct zone *zone, struct page *page)
335{
336 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
337
338 if (!pagevec_add(pvec, page)) {
339 spin_unlock_irq(&zone->lru_lock);
340 pagevec_lru_move_fn(pvec, __activate_page, NULL);
341 spin_lock_irq(&zone->lru_lock);
194 } 342 }
343 put_cpu_var(activate_page_pvecs);
344 return 1;
345}
346
347#else
348static inline void activate_page_drain(int cpu)
349{
350}
351
352void activate_page(struct page *page)
353{
354 struct zone *zone = page_zone(page);
355
356 spin_lock_irq(&zone->lru_lock);
357 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page))
358 __activate_page(page, NULL);
195 spin_unlock_irq(&zone->lru_lock); 359 spin_unlock_irq(&zone->lru_lock);
196} 360}
361#endif
197 362
198/* 363/*
199 * Mark a page as having seen activity. 364 * Mark a page as having seen activity.
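
On SMP, activate_page() no longer takes zone->lru_lock for every single page: it stashes the page in a per-CPU pagevec and drains the whole batch through pagevec_lru_move_fn() only when the vector fills up or the CPU is drained. The same amortization, sketched with a per-thread buffer that flushes when full; the batch size and drain action below are assumptions for illustration:

/*
 * Sketch of per-CPU batching: accumulate work in a small thread-local
 * vector and pay the expensive flush only once per batch.
 */
#include <stdio.h>

#define BATCH_SIZE 14	/* PAGEVEC_SIZE is 14 in this kernel; reused here */

struct batch {
	int nr;
	unsigned long items[BATCH_SIZE];
};

static _Thread_local struct batch activate_batch;

/* Stands in for taking the LRU lock once and moving every batched page. */
static void drain(struct batch *b)
{
	printf("draining %d items under one lock acquisition\n", b->nr);
	b->nr = 0;
}

static void activate(unsigned long item)
{
	struct batch *b = &activate_batch;

	b->items[b->nr++] = item;
	if (b->nr == BATCH_SIZE)
		drain(b);		/* vector full: flush the whole batch */
}

int main(void)
{
	for (unsigned long i = 0; i < 30; i++)
		activate(i);
	if (activate_batch.nr)
		drain(&activate_batch);	/* final partial batch */
	return 0;
}
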
@@ -292,6 +457,7 @@ static void drain_cpu_pagevecs(int cpu)
292 pagevec_move_tail(pvec); 457 pagevec_move_tail(pvec);
293 local_irq_restore(flags); 458 local_irq_restore(flags);
294 } 459 }
460 activate_page_drain(cpu);
295} 461}
296 462
297void lru_add_drain(void) 463void lru_add_drain(void)
@@ -399,44 +565,70 @@ void __pagevec_release(struct pagevec *pvec)
399 565
400EXPORT_SYMBOL(__pagevec_release); 566EXPORT_SYMBOL(__pagevec_release);
401 567
568/* used by __split_huge_page_refcount() */
569void lru_add_page_tail(struct zone* zone,
570 struct page *page, struct page *page_tail)
571{
572 int active;
573 enum lru_list lru;
574 const int file = 0;
575 struct list_head *head;
576
577 VM_BUG_ON(!PageHead(page));
578 VM_BUG_ON(PageCompound(page_tail));
579 VM_BUG_ON(PageLRU(page_tail));
580 VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
581
582 SetPageLRU(page_tail);
583
584 if (page_evictable(page_tail, NULL)) {
585 if (PageActive(page)) {
586 SetPageActive(page_tail);
587 active = 1;
588 lru = LRU_ACTIVE_ANON;
589 } else {
590 active = 0;
591 lru = LRU_INACTIVE_ANON;
592 }
593 update_page_reclaim_stat(zone, page_tail, file, active);
594 if (likely(PageLRU(page)))
595 head = page->lru.prev;
596 else
597 head = &zone->lru[lru].list;
598 __add_page_to_lru_list(zone, page_tail, lru, head);
599 } else {
600 SetPageUnevictable(page_tail);
601 add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
602 }
603}
604
605static void ____pagevec_lru_add_fn(struct page *page, void *arg)
606{
607 enum lru_list lru = (enum lru_list)arg;
608 struct zone *zone = page_zone(page);
609 int file = is_file_lru(lru);
610 int active = is_active_lru(lru);
611
612 VM_BUG_ON(PageActive(page));
613 VM_BUG_ON(PageUnevictable(page));
614 VM_BUG_ON(PageLRU(page));
615
616 SetPageLRU(page);
617 if (active)
618 SetPageActive(page);
619 update_page_reclaim_stat(zone, page, file, active);
620 add_page_to_lru_list(zone, page, lru);
621}
622
402/* 623/*
403 * Add the passed pages to the LRU, then drop the caller's refcount 624 * Add the passed pages to the LRU, then drop the caller's refcount
404 * on them. Reinitialises the caller's pagevec. 625 * on them. Reinitialises the caller's pagevec.
405 */ 626 */
406void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 627void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
407{ 628{
408 int i;
409 struct zone *zone = NULL;
410
411 VM_BUG_ON(is_unevictable_lru(lru)); 629 VM_BUG_ON(is_unevictable_lru(lru));
412 630
413 for (i = 0; i < pagevec_count(pvec); i++) { 631 pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
414 struct page *page = pvec->pages[i];
415 struct zone *pagezone = page_zone(page);
416 int file;
417 int active;
418
419 if (pagezone != zone) {
420 if (zone)
421 spin_unlock_irq(&zone->lru_lock);
422 zone = pagezone;
423 spin_lock_irq(&zone->lru_lock);
424 }
425 VM_BUG_ON(PageActive(page));
426 VM_BUG_ON(PageUnevictable(page));
427 VM_BUG_ON(PageLRU(page));
428 SetPageLRU(page);
429 active = is_active_lru(lru);
430 file = is_file_lru(lru);
431 if (active)
432 SetPageActive(page);
433 update_page_reclaim_stat(zone, page, file, active);
434 add_page_to_lru_list(zone, page, lru);
435 }
436 if (zone)
437 spin_unlock_irq(&zone->lru_lock);
438 release_pages(pvec->pages, pvec->nr, pvec->cold);
439 pagevec_reinit(pvec);
440} 632}
441 633
442EXPORT_SYMBOL(____pagevec_lru_add); 634EXPORT_SYMBOL(____pagevec_lru_add);
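
____pagevec_lru_add() now delegates its per-zone lock juggling to pagevec_lru_move_fn() with ____pagevec_lru_add_fn() as the per-page callback, and lru_add_page_tail() reuses the same LRU helpers for THP tail pages. A rough sketch of that callback pattern, with toy types standing in for struct page and the zone lock (not the kernel API):

    #include <stdio.h>

    struct page { int zone; int id; };

    /* Stand-in for pagevec_lru_move_fn(): walk the batch and take each
     * zone's lock only when the zone changes between consecutive pages. */
    static void lru_move_fn(struct page *pages, int nr,
                            void (*move)(struct page *))
    {
        int locked_zone = -1;

        for (int i = 0; i < nr; i++) {
            if (pages[i].zone != locked_zone) {
                if (locked_zone != -1)
                    printf("unlock zone %d\n", locked_zone);
                locked_zone = pages[i].zone;
                printf("lock zone %d\n", locked_zone);
            }
            move(&pages[i]);        /* e.g. ____pagevec_lru_add_fn() */
        }
        if (locked_zone != -1)
            printf("unlock zone %d\n", locked_zone);
    }

    static void add_to_lru(struct page *p)
    {
        printf("  add page %d to zone %d LRU\n", p->id, p->zone);
    }

    int main(void)
    {
        struct page pages[] = { {0, 1}, {0, 2}, {1, 3}, {1, 4}, {0, 5} };

        lru_move_fn(pages, 5, add_to_lru);
        return 0;
    }
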
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e10f5833167f..5c8cfabbc9bc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -157,6 +157,12 @@ int add_to_swap(struct page *page)
157 if (!entry.val) 157 if (!entry.val)
158 return 0; 158 return 0;
159 159
160 if (unlikely(PageTransHuge(page)))
161 if (unlikely(split_huge_page(page))) {
162 swapcache_free(entry, NULL);
163 return 0;
164 }
165
160 /* 166 /*
161 * Radix-tree node allocations from PF_MEMALLOC contexts could 167 * Radix-tree node allocations from PF_MEMALLOC contexts could
162 * completely exhaust the page allocator. __GFP_NOMEMALLOC 168 * completely exhaust the page allocator. __GFP_NOMEMALLOC
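
The add_to_swap() change splits a transparent huge page before it can enter the swap cache and hands the freshly allocated swap entry back if the split fails. A toy sketch of that guard shape only; is_huge/split_huge/free_swap_entry are made-up stand-ins, and the kernel's split_huge_page() returns nonzero on failure rather than a boolean:

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative stand-ins: these are not the kernel helpers. */
    static bool is_huge(int page)      { return page % 4 == 0; }
    static bool split_huge(int page)   { return page != 8; }  /* pretend page 8 fails */
    static void free_swap_entry(int e) { printf("freed swap entry %d\n", e); }

    /* Shape of the new guard: a huge page must be split before swap-out;
     * on failure the already-allocated swap entry is released. */
    static int add_to_swap(int page, int entry)
    {
        if (is_huge(page) && !split_huge(page)) {
            free_swap_entry(entry);
            return 0;
        }
        printf("page %d added to swap with entry %d\n", page, entry);
        return 1;
    }

    int main(void)
    {
        add_to_swap(4, 100);
        add_to_swap(8, 101);
        return 0;
    }
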
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b6adcfbf6f48..07a458d72fa8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
964 pmd = pmd_offset(pud, addr); 964 pmd = pmd_offset(pud, addr);
965 do { 965 do {
966 next = pmd_addr_end(addr, end); 966 next = pmd_addr_end(addr, end);
967 if (unlikely(pmd_trans_huge(*pmd)))
968 continue;
967 if (pmd_none_or_clear_bad(pmd)) 969 if (pmd_none_or_clear_bad(pmd))
968 continue; 970 continue;
969 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 971 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index eb5cc7d00c5a..cac13b415635 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -748,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
748 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, 748 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
749 VMALLOC_START, VMALLOC_END, 749 VMALLOC_START, VMALLOC_END,
750 node, gfp_mask); 750 node, gfp_mask);
751 if (unlikely(IS_ERR(va))) { 751 if (IS_ERR(va)) {
752 kfree(vb); 752 kfree(vb);
753 return ERR_CAST(va); 753 return ERR_CAST(va);
754 } 754 }
@@ -1315,13 +1315,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1315 -1, GFP_KERNEL, caller); 1315 -1, GFP_KERNEL, caller);
1316} 1316}
1317 1317
1318struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1319 int node, gfp_t gfp_mask)
1320{
1321 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1322 node, gfp_mask, __builtin_return_address(0));
1323}
1324
1325static struct vm_struct *find_vm_area(const void *addr) 1318static struct vm_struct *find_vm_area(const void *addr)
1326{ 1319{
1327 struct vmap_area *va; 1320 struct vmap_area *va;
@@ -1537,25 +1530,12 @@ fail:
1537 return NULL; 1530 return NULL;
1538} 1531}
1539 1532
1540void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1541{
1542 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
1543 __builtin_return_address(0));
1544
1545 /*
1546 * A ref_count = 3 is needed because the vm_struct and vmap_area
1547 * structures allocated in the __get_vm_area_node() function contain
1548 * references to the virtual address of the vmalloc'ed block.
1549 */
1550 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
1551
1552 return addr;
1553}
1554
1555/** 1533/**
1556 * __vmalloc_node - allocate virtually contiguous memory 1534 * __vmalloc_node_range - allocate virtually contiguous memory
1557 * @size: allocation size 1535 * @size: allocation size
1558 * @align: desired alignment 1536 * @align: desired alignment
1537 * @start: vm area range start
1538 * @end: vm area range end
1559 * @gfp_mask: flags for the page level allocator 1539 * @gfp_mask: flags for the page level allocator
1560 * @prot: protection mask for the allocated pages 1540 * @prot: protection mask for the allocated pages
1561 * @node: node to use for allocation or -1 1541 * @node: node to use for allocation or -1
@@ -1565,9 +1545,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1565 * allocator with @gfp_mask flags. Map them into contiguous 1545 * allocator with @gfp_mask flags. Map them into contiguous
1566 * kernel virtual space, using a pagetable protection of @prot. 1546 * kernel virtual space, using a pagetable protection of @prot.
1567 */ 1547 */
1568static void *__vmalloc_node(unsigned long size, unsigned long align, 1548void *__vmalloc_node_range(unsigned long size, unsigned long align,
1569 gfp_t gfp_mask, pgprot_t prot, 1549 unsigned long start, unsigned long end, gfp_t gfp_mask,
1570 int node, void *caller) 1550 pgprot_t prot, int node, void *caller)
1571{ 1551{
1572 struct vm_struct *area; 1552 struct vm_struct *area;
1573 void *addr; 1553 void *addr;
@@ -1577,8 +1557,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1577 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1557 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1578 return NULL; 1558 return NULL;
1579 1559
1580 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, 1560 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
1581 VMALLOC_END, node, gfp_mask, caller); 1561 gfp_mask, caller);
1582 1562
1583 if (!area) 1563 if (!area)
1584 return NULL; 1564 return NULL;
@@ -1595,6 +1575,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1595 return addr; 1575 return addr;
1596} 1576}
1597 1577
1578/**
1579 * __vmalloc_node - allocate virtually contiguous memory
1580 * @size: allocation size
1581 * @align: desired alignment
1582 * @gfp_mask: flags for the page level allocator
1583 * @prot: protection mask for the allocated pages
1584 * @node: node to use for allocation or -1
1585 * @caller: caller's return address
1586 *
1587 * Allocate enough pages to cover @size from the page level
1588 * allocator with @gfp_mask flags. Map them into contiguous
1589 * kernel virtual space, using a pagetable protection of @prot.
1590 */
1591static void *__vmalloc_node(unsigned long size, unsigned long align,
1592 gfp_t gfp_mask, pgprot_t prot,
1593 int node, void *caller)
1594{
1595 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1596 gfp_mask, prot, node, caller);
1597}
1598
1598void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1599void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1599{ 1600{
1600 return __vmalloc_node(size, 1, gfp_mask, prot, -1, 1601 return __vmalloc_node(size, 1, gfp_mask, prot, -1,
@@ -2203,17 +2204,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2203 * @sizes: array containing size of each area 2204 * @sizes: array containing size of each area
2204 * @nr_vms: the number of areas to allocate 2205 * @nr_vms: the number of areas to allocate
2205 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 2206 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2206 * @gfp_mask: allocation mask
2207 * 2207 *
2208 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 2208 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2209 * vm_structs on success, %NULL on failure 2209 * vm_structs on success, %NULL on failure
2210 * 2210 *
2211 * Percpu allocator wants to use congruent vm areas so that it can 2211 * Percpu allocator wants to use congruent vm areas so that it can
2212 * maintain the offsets among percpu areas. This function allocates 2212 * maintain the offsets among percpu areas. This function allocates
2213 * congruent vmalloc areas for it. These areas tend to be scattered 2213 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
2214 * pretty far, distance between two areas easily going up to 2214 * be scattered pretty far, distance between two areas easily going up
2215 * gigabytes. To avoid interacting with regular vmallocs, these areas 2215 * to gigabytes. To avoid interacting with regular vmallocs, these
2216 * are allocated from top. 2216 * areas are allocated from top.
2217 * 2217 *
2218 * Despite its complicated look, this allocator is rather simple. It 2218 * Despite its complicated look, this allocator is rather simple. It
2219 * does everything top-down and scans areas from the end looking for 2219 * does everything top-down and scans areas from the end looking for
@@ -2224,7 +2224,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2224 */ 2224 */
2225struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 2225struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2226 const size_t *sizes, int nr_vms, 2226 const size_t *sizes, int nr_vms,
2227 size_t align, gfp_t gfp_mask) 2227 size_t align)
2228{ 2228{
2229 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 2229 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2230 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 2230 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2234,8 +2234,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2234 unsigned long base, start, end, last_end; 2234 unsigned long base, start, end, last_end;
2235 bool purged = false; 2235 bool purged = false;
2236 2236
2237 gfp_mask &= GFP_RECLAIM_MASK;
2238
2239 /* verify parameters and allocate data structures */ 2237 /* verify parameters and allocate data structures */
2240 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); 2238 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
2241 for (last_area = 0, area = 0; area < nr_vms; area++) { 2239 for (last_area = 0, area = 0; area < nr_vms; area++) {
@@ -2268,14 +2266,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2268 return NULL; 2266 return NULL;
2269 } 2267 }
2270 2268
2271 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); 2269 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
2272 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); 2270 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
2273 if (!vas || !vms) 2271 if (!vas || !vms)
2274 goto err_free; 2272 goto err_free;
2275 2273
2276 for (area = 0; area < nr_vms; area++) { 2274 for (area = 0; area < nr_vms; area++) {
2277 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); 2275 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
2278 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); 2276 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
2279 if (!vas[area] || !vms[area]) 2277 if (!vas[area] || !vms[area])
2280 goto err_free; 2278 goto err_free;
2281 } 2279 }
@@ -2456,13 +2454,8 @@ static int s_show(struct seq_file *m, void *p)
2456 seq_printf(m, "0x%p-0x%p %7ld", 2454 seq_printf(m, "0x%p-0x%p %7ld",
2457 v->addr, v->addr + v->size, v->size); 2455 v->addr, v->addr + v->size, v->size);
2458 2456
2459 if (v->caller) { 2457 if (v->caller)
2460 char buff[KSYM_SYMBOL_LEN]; 2458 seq_printf(m, " %pS", v->caller);
2461
2462 seq_putc(m, ' ');
2463 sprint_symbol(buff, (unsigned long)v->caller);
2464 seq_puts(m, buff);
2465 }
2466 2459
2467 if (v->nr_pages) 2460 if (v->nr_pages)
2468 seq_printf(m, " pages=%d", v->nr_pages); 2461 seq_printf(m, " pages=%d", v->nr_pages);
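
The vmalloc.c changes fold the removed helpers into __vmalloc_node_range(), which takes the virtual range as parameters, while __vmalloc_node() survives as a thin wrapper supplying VMALLOC_START/VMALLOC_END. A sketch of that wrapper-around-a-range-worker shape, using made-up names and no real mappings:

    #include <stdio.h>

    #define VSTART 0x1000UL         /* stands in for VMALLOC_START */
    #define VEND   0x9000UL         /* stands in for VMALLOC_END   */

    /* Generalised worker: the caller chooses the virtual range, the way
     * __vmalloc_node_range() now lets module space differ from vmalloc space. */
    static void *alloc_node_range(unsigned long size, unsigned long start,
                                  unsigned long end)
    {
        printf("map %lu bytes somewhere in [0x%lx, 0x%lx)\n", size, start, end);
        return (void *)start;       /* placeholder, no real mapping here */
    }

    /* The old entry point becomes a thin wrapper with the default range. */
    static void *alloc_node(unsigned long size)
    {
        return alloc_node_range(size, VSTART, VEND);
    }

    int main(void)
    {
        alloc_node(4096);                         /* ordinary vmalloc-style call */
        alloc_node_range(4096, 0x100UL, 0x800UL); /* caller-supplied range       */
        return 0;
    }
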
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9ca587c69274..99999a9b2b0b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
32#include <linux/topology.h> 32#include <linux/topology.h>
33#include <linux/cpu.h> 33#include <linux/cpu.h>
34#include <linux/cpuset.h> 34#include <linux/cpuset.h>
35#include <linux/compaction.h>
35#include <linux/notifier.h> 36#include <linux/notifier.h>
36#include <linux/rwsem.h> 37#include <linux/rwsem.h>
37#include <linux/delay.h> 38#include <linux/delay.h>
@@ -40,6 +41,7 @@
40#include <linux/memcontrol.h> 41#include <linux/memcontrol.h>
41#include <linux/delayacct.h> 42#include <linux/delayacct.h>
42#include <linux/sysctl.h> 43#include <linux/sysctl.h>
44#include <linux/compaction.h>
43 45
44#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
45#include <asm/div64.h> 47#include <asm/div64.h>
@@ -51,11 +53,23 @@
51#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h> 54#include <trace/events/vmscan.h>
53 55
54enum lumpy_mode { 56/*
55 LUMPY_MODE_NONE, 57 * reclaim_mode determines how the inactive list is shrunk
56 LUMPY_MODE_ASYNC, 58 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
57 LUMPY_MODE_SYNC, 59 * RECLAIM_MODE_ASYNC: Do not block
58}; 60 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
61 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
62 * page from the LRU and reclaim all pages within a
63 * naturally aligned range
64 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
65 * order-0 pages and then compact the zone
66 */
67typedef unsigned __bitwise__ reclaim_mode_t;
68#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
69#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
70#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
71#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
72#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
59 73
60struct scan_control { 74struct scan_control {
61 /* Incremented by the number of inactive pages that were scanned */ 75 /* Incremented by the number of inactive pages that were scanned */
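
The reclaim_mode_t above replaces the three-valued lumpy_mode enum with or-able flag bits, so one field can say both how the inactive list is shrunk and whether blocking is allowed. A plain-unsigned sketch of how such bits combine and are tested (the __bitwise__/__force annotations are sparse checking only):

    #include <stdio.h>

    /* Plain unsigned stand-ins for the __bitwise__ reclaim_mode_t flags. */
    #define MODE_SINGLE        0x01u
    #define MODE_ASYNC         0x02u
    #define MODE_SYNC          0x04u
    #define MODE_LUMPYRECLAIM  0x08u
    #define MODE_COMPACTION    0x10u

    int main(void)
    {
        /* A mode is now a combination of bits rather than a single enum
         * value, so one test can ask "is sync reclaim in effect?" no matter
         * whether lumpy reclaim or reclaim/compaction set it. */
        unsigned mode = MODE_COMPACTION | MODE_SYNC;

        if (mode & MODE_SYNC)
            printf("may block on writeback\n");
        if (mode & MODE_LUMPYRECLAIM)
            printf("lumpy reclaim: ignore references\n");
        else if (mode & MODE_COMPACTION)
            printf("reclaim order-0 pages, then compact\n");
        return 0;
    }
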
@@ -88,7 +102,7 @@ struct scan_control {
88 * Intend to reclaim enough continuous memory rather than reclaim 102 * Intend to reclaim enough continuous memory rather than reclaim
89 * enough amount of memory. i.e, mode for high order allocation. 103 * enough amount of memory. i.e, mode for high order allocation.
90 */ 104 */
91 enum lumpy_mode lumpy_reclaim_mode; 105 reclaim_mode_t reclaim_mode;
92 106
93 /* Which cgroup do we reclaim from */ 107 /* Which cgroup do we reclaim from */
94 struct mem_cgroup *mem_cgroup; 108 struct mem_cgroup *mem_cgroup;
@@ -271,34 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
271 return ret; 285 return ret;
272} 286}
273 287
274static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, 288static void set_reclaim_mode(int priority, struct scan_control *sc,
275 bool sync) 289 bool sync)
276{ 290{
277 enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; 291 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
278 292
279 /* 293 /*
280 * Some reclaim have alredy been failed. No worth to try synchronous 294 * Initially assume we are entering either lumpy reclaim or
281 * lumpy reclaim. 295 * reclaim/compaction. Depending on the order, we will either set the
296 * sync mode or just reclaim order-0 pages later.
282 */ 297 */
283 if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) 298 if (COMPACTION_BUILD)
284 return; 299 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
300 else
301 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
285 302
286 /* 303 /*
287 * If we need a large contiguous chunk of memory, or have 304 * Avoid using lumpy reclaim or reclaim/compaction if possible by
288 * trouble getting a small set of contiguous pages, we 305 * restricting when it's set to either costly allocations or when
289 * will reclaim both active and inactive pages. 306 * under memory pressure
290 */ 307 */
291 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 308 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
292 sc->lumpy_reclaim_mode = mode; 309 sc->reclaim_mode |= syncmode;
293 else if (sc->order && priority < DEF_PRIORITY - 2) 310 else if (sc->order && priority < DEF_PRIORITY - 2)
294 sc->lumpy_reclaim_mode = mode; 311 sc->reclaim_mode |= syncmode;
295 else 312 else
296 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; 313 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
297} 314}
298 315
299static void disable_lumpy_reclaim_mode(struct scan_control *sc) 316static void reset_reclaim_mode(struct scan_control *sc)
300{ 317{
301 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; 318 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
302} 319}
303 320
304static inline int is_page_cache_freeable(struct page *page) 321static inline int is_page_cache_freeable(struct page *page)
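
set_reclaim_mode() above picks reclaim/compaction when compaction is built in, lumpy reclaim otherwise, and keeps either only for costly orders or once several priority levels have already failed. A compact model of that decision, assuming the usual values PAGE_ALLOC_COSTLY_ORDER=3 and DEF_PRIORITY=12; the flag values mirror the patch, everything else is a stand-in:

    #include <stdio.h>

    #define MODE_SINGLE        0x01u
    #define MODE_ASYNC         0x02u
    #define MODE_SYNC          0x04u
    #define MODE_LUMPYRECLAIM  0x08u
    #define MODE_COMPACTION    0x10u

    #define COSTLY_ORDER 3          /* stands in for PAGE_ALLOC_COSTLY_ORDER */
    #define DEF_PRIORITY 12

    /* Same decision shape as set_reclaim_mode(): choose the base mode, then
     * keep it only for costly orders or under sustained pressure. */
    static unsigned pick_mode(int order, int priority, int sync, int compaction_built)
    {
        unsigned syncmode = sync ? MODE_SYNC : MODE_ASYNC;
        unsigned mode = compaction_built ? MODE_COMPACTION : MODE_LUMPYRECLAIM;

        if (order > COSTLY_ORDER)
            return mode | syncmode;
        if (order && priority < DEF_PRIORITY - 2)
            return mode | syncmode;
        return MODE_SINGLE | MODE_ASYNC;
    }

    int main(void)
    {
        printf("order-0, fresh scan:   0x%02x\n", pick_mode(0, DEF_PRIORITY, 0, 1));
        printf("order-9, fresh scan:   0x%02x\n", pick_mode(9, DEF_PRIORITY, 1, 1));
        printf("order-2, under stress: 0x%02x\n", pick_mode(2, DEF_PRIORITY - 3, 0, 1));
        return 0;
    }
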
@@ -429,7 +446,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
429 * first attempt to free a range of pages fails. 446 * first attempt to free a range of pages fails.
430 */ 447 */
431 if (PageWriteback(page) && 448 if (PageWriteback(page) &&
432 sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) 449 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
433 wait_on_page_writeback(page); 450 wait_on_page_writeback(page);
434 451
435 if (!PageWriteback(page)) { 452 if (!PageWriteback(page)) {
@@ -437,7 +454,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
437 ClearPageReclaim(page); 454 ClearPageReclaim(page);
438 } 455 }
439 trace_mm_vmscan_writepage(page, 456 trace_mm_vmscan_writepage(page,
440 trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); 457 trace_reclaim_flags(page, sc->reclaim_mode));
441 inc_zone_page_state(page, NR_VMSCAN_WRITE); 458 inc_zone_page_state(page, NR_VMSCAN_WRITE);
442 return PAGE_SUCCESS; 459 return PAGE_SUCCESS;
443 } 460 }
@@ -622,7 +639,7 @@ static enum page_references page_check_references(struct page *page,
622 referenced_page = TestClearPageReferenced(page); 639 referenced_page = TestClearPageReferenced(page);
623 640
624 /* Lumpy reclaim - ignore references */ 641 /* Lumpy reclaim - ignore references */
625 if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) 642 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
626 return PAGEREF_RECLAIM; 643 return PAGEREF_RECLAIM;
627 644
628 /* 645 /*
@@ -739,7 +756,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
739 * for any page for which writeback has already 756 * for any page for which writeback has already
740 * started. 757 * started.
741 */ 758 */
742 if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && 759 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
743 may_enter_fs) 760 may_enter_fs)
744 wait_on_page_writeback(page); 761 wait_on_page_writeback(page);
745 else { 762 else {
@@ -895,7 +912,7 @@ cull_mlocked:
895 try_to_free_swap(page); 912 try_to_free_swap(page);
896 unlock_page(page); 913 unlock_page(page);
897 putback_lru_page(page); 914 putback_lru_page(page);
898 disable_lumpy_reclaim_mode(sc); 915 reset_reclaim_mode(sc);
899 continue; 916 continue;
900 917
901activate_locked: 918activate_locked:
@@ -908,7 +925,7 @@ activate_locked:
908keep_locked: 925keep_locked:
909 unlock_page(page); 926 unlock_page(page);
910keep: 927keep:
911 disable_lumpy_reclaim_mode(sc); 928 reset_reclaim_mode(sc);
912keep_lumpy: 929keep_lumpy:
913 list_add(&page->lru, &ret_pages); 930 list_add(&page->lru, &ret_pages);
914 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 931 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
@@ -1028,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1028 case 0: 1045 case 0:
1029 list_move(&page->lru, dst); 1046 list_move(&page->lru, dst);
1030 mem_cgroup_del_lru(page); 1047 mem_cgroup_del_lru(page);
1031 nr_taken++; 1048 nr_taken += hpage_nr_pages(page);
1032 break; 1049 break;
1033 1050
1034 case -EBUSY: 1051 case -EBUSY:
@@ -1086,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1086 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1103 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1087 list_move(&cursor_page->lru, dst); 1104 list_move(&cursor_page->lru, dst);
1088 mem_cgroup_del_lru(cursor_page); 1105 mem_cgroup_del_lru(cursor_page);
1089 nr_taken++; 1106 nr_taken += hpage_nr_pages(page);
1090 nr_lumpy_taken++; 1107 nr_lumpy_taken++;
1091 if (PageDirty(cursor_page)) 1108 if (PageDirty(cursor_page))
1092 nr_lumpy_dirty++; 1109 nr_lumpy_dirty++;
@@ -1141,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1141 struct page *page; 1158 struct page *page;
1142 1159
1143 list_for_each_entry(page, page_list, lru) { 1160 list_for_each_entry(page, page_list, lru) {
1161 int numpages = hpage_nr_pages(page);
1144 lru = page_lru_base_type(page); 1162 lru = page_lru_base_type(page);
1145 if (PageActive(page)) { 1163 if (PageActive(page)) {
1146 lru += LRU_ACTIVE; 1164 lru += LRU_ACTIVE;
1147 ClearPageActive(page); 1165 ClearPageActive(page);
1148 nr_active++; 1166 nr_active += numpages;
1149 } 1167 }
1150 if (count) 1168 if (count)
1151 count[lru]++; 1169 count[lru] += numpages;
1152 } 1170 }
1153 1171
1154 return nr_active; 1172 return nr_active;
@@ -1253,13 +1271,16 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1253 spin_lock_irq(&zone->lru_lock); 1271 spin_lock_irq(&zone->lru_lock);
1254 continue; 1272 continue;
1255 } 1273 }
1256 SetPageLRU(page);
1257 lru = page_lru(page); 1274 lru = page_lru(page);
1258 add_page_to_lru_list(zone, page, lru);
1259 if (is_active_lru(lru)) { 1275 if (is_active_lru(lru)) {
1260 int file = is_file_lru(lru); 1276 int file = is_file_lru(lru);
1261 reclaim_stat->recent_rotated[file]++; 1277 int numpages = hpage_nr_pages(page);
1278 reclaim_stat->recent_rotated[file] += numpages;
1279 if (putback_active_lru_page(zone, page))
1280 continue;
1262 } 1281 }
1282 SetPageLRU(page);
1283 add_page_to_lru_list(zone, page, lru);
1263 if (!pagevec_add(&pvec, page)) { 1284 if (!pagevec_add(&pvec, page)) {
1264 spin_unlock_irq(&zone->lru_lock); 1285 spin_unlock_irq(&zone->lru_lock);
1265 __pagevec_release(&pvec); 1286 __pagevec_release(&pvec);
@@ -1324,7 +1345,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1324 return false; 1345 return false;
1325 1346
1326 /* Only stall on lumpy reclaim */ 1347 /* Only stall on lumpy reclaim */
1327 if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) 1348 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1328 return false; 1349 return false;
1329 1350
1330 /* If we have relaimed everything on the isolated list, no stall */ 1351 /* If we have relaimed everything on the isolated list, no stall */
@@ -1368,15 +1389,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1368 return SWAP_CLUSTER_MAX; 1389 return SWAP_CLUSTER_MAX;
1369 } 1390 }
1370 1391
1371 set_lumpy_reclaim_mode(priority, sc, false); 1392 set_reclaim_mode(priority, sc, false);
1372 lru_add_drain(); 1393 lru_add_drain();
1373 spin_lock_irq(&zone->lru_lock); 1394 spin_lock_irq(&zone->lru_lock);
1374 1395
1375 if (scanning_global_lru(sc)) { 1396 if (scanning_global_lru(sc)) {
1376 nr_taken = isolate_pages_global(nr_to_scan, 1397 nr_taken = isolate_pages_global(nr_to_scan,
1377 &page_list, &nr_scanned, sc->order, 1398 &page_list, &nr_scanned, sc->order,
1378 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 1399 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1379 ISOLATE_INACTIVE : ISOLATE_BOTH, 1400 ISOLATE_BOTH : ISOLATE_INACTIVE,
1380 zone, 0, file); 1401 zone, 0, file);
1381 zone->pages_scanned += nr_scanned; 1402 zone->pages_scanned += nr_scanned;
1382 if (current_is_kswapd()) 1403 if (current_is_kswapd())
@@ -1388,8 +1409,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1388 } else { 1409 } else {
1389 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1410 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1390 &page_list, &nr_scanned, sc->order, 1411 &page_list, &nr_scanned, sc->order,
1391 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 1412 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1392 ISOLATE_INACTIVE : ISOLATE_BOTH, 1413 ISOLATE_BOTH : ISOLATE_INACTIVE,
1393 zone, sc->mem_cgroup, 1414 zone, sc->mem_cgroup,
1394 0, file); 1415 0, file);
1395 /* 1416 /*
@@ -1411,7 +1432,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1411 1432
1412 /* Check if we should syncronously wait for writeback */ 1433 /* Check if we should syncronously wait for writeback */
1413 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1434 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1414 set_lumpy_reclaim_mode(priority, sc, true); 1435 set_reclaim_mode(priority, sc, true);
1415 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1436 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1416 } 1437 }
1417 1438
@@ -1426,7 +1447,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1426 zone_idx(zone), 1447 zone_idx(zone),
1427 nr_scanned, nr_reclaimed, 1448 nr_scanned, nr_reclaimed,
1428 priority, 1449 priority,
1429 trace_shrink_flags(file, sc->lumpy_reclaim_mode)); 1450 trace_shrink_flags(file, sc->reclaim_mode));
1430 return nr_reclaimed; 1451 return nr_reclaimed;
1431} 1452}
1432 1453
@@ -1466,7 +1487,7 @@ static void move_active_pages_to_lru(struct zone *zone,
1466 1487
1467 list_move(&page->lru, &zone->lru[lru].list); 1488 list_move(&page->lru, &zone->lru[lru].list);
1468 mem_cgroup_add_lru_list(page, lru); 1489 mem_cgroup_add_lru_list(page, lru);
1469 pgmoved++; 1490 pgmoved += hpage_nr_pages(page);
1470 1491
1471 if (!pagevec_add(&pvec, page) || list_empty(list)) { 1492 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1472 spin_unlock_irq(&zone->lru_lock); 1493 spin_unlock_irq(&zone->lru_lock);
@@ -1534,7 +1555,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1534 } 1555 }
1535 1556
1536 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1557 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1537 nr_rotated++; 1558 nr_rotated += hpage_nr_pages(page);
1538 /* 1559 /*
1539 * Identify referenced, file-backed active pages and 1560 * Identify referenced, file-backed active pages and
1540 * give them one more trip around the active list. So 1561 * give them one more trip around the active list. So
@@ -1805,6 +1826,57 @@ out:
1805} 1826}
1806 1827
1807/* 1828/*
1829 * Reclaim/compaction depends on a number of pages being freed. To avoid
1830 * disruption to the system, a small number of order-0 pages continue to be
1831 * rotated and reclaimed in the normal fashion. However, by the time we get
1832 * back to the allocator and call try_to_compact_zone(), we ensure that
1833 * there are enough free pages for it to be likely successful
1834 */
1835static inline bool should_continue_reclaim(struct zone *zone,
1836 unsigned long nr_reclaimed,
1837 unsigned long nr_scanned,
1838 struct scan_control *sc)
1839{
1840 unsigned long pages_for_compaction;
1841 unsigned long inactive_lru_pages;
1842
1843 /* If not in reclaim/compaction mode, stop */
1844 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1845 return false;
1846
1847 /*
1848 * If we failed to reclaim and have scanned the full list, stop.
1849 * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
1850 * faster but obviously would be less likely to succeed
1851 * allocation. If this is desirable, use GFP_REPEAT to decide
1852 * if both reclaimed and scanned should be checked or just
1853 * reclaimed
1854 */
1855 if (!nr_reclaimed && !nr_scanned)
1856 return false;
1857
1858 /*
1859 * If we have not reclaimed enough pages for compaction and the
1860 * inactive lists are large enough, continue reclaiming
1861 */
1862 pages_for_compaction = (2UL << sc->order);
1863 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
1864 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1865 if (sc->nr_reclaimed < pages_for_compaction &&
1866 inactive_lru_pages > pages_for_compaction)
1867 return true;
1868
1869 /* If compaction would go ahead or the allocation would succeed, stop */
1870 switch (compaction_suitable(zone, sc->order)) {
1871 case COMPACT_PARTIAL:
1872 case COMPACT_CONTINUE:
1873 return false;
1874 default:
1875 return true;
1876 }
1877}
1878
1879/*
1808 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1880 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1809 */ 1881 */
1810static void shrink_zone(int priority, struct zone *zone, 1882static void shrink_zone(int priority, struct zone *zone,
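
should_continue_reclaim() keeps shrink_zone() looping until roughly 2<<order pages have been reclaimed for the compactor, as long as the scan is still making progress. A simplified model of that test, with the compaction_suitable() escape hatch at the end of the real function left out:

    #include <stdbool.h>
    #include <stdio.h>

    #define MODE_COMPACTION 0x10u

    /* Simplified shape of should_continue_reclaim(): stay in the loop while
     * in reclaim/compaction mode, progress is being made, and the zone does
     * not yet hold about 2<<order reclaimed pages for the compactor. */
    static bool continue_reclaim(unsigned mode, unsigned long reclaimed_total,
                                 unsigned long reclaimed_this_pass,
                                 unsigned long scanned_this_pass,
                                 unsigned long inactive_pages, int order)
    {
        unsigned long pages_for_compaction = 2UL << order;

        if (!(mode & MODE_COMPACTION))
            return false;
        if (!reclaimed_this_pass && !scanned_this_pass)
            return false;           /* no progress at all: give up */
        return reclaimed_total < pages_for_compaction &&
               inactive_pages > pages_for_compaction;
    }

    int main(void)
    {
        /* order-9 (2MB huge page): wants about 1024 pages before compacting */
        printf("keep going:   %d\n",
               continue_reclaim(MODE_COMPACTION, 200, 50, 300, 5000, 9));
        printf("enough freed: %d\n",
               continue_reclaim(MODE_COMPACTION, 1500, 50, 300, 5000, 9));
        return 0;
    }
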
@@ -1813,9 +1885,12 @@ static void shrink_zone(int priority, struct zone *zone,
1813 unsigned long nr[NR_LRU_LISTS]; 1885 unsigned long nr[NR_LRU_LISTS];
1814 unsigned long nr_to_scan; 1886 unsigned long nr_to_scan;
1815 enum lru_list l; 1887 enum lru_list l;
1816 unsigned long nr_reclaimed = sc->nr_reclaimed; 1888 unsigned long nr_reclaimed;
1817 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1889 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1890 unsigned long nr_scanned = sc->nr_scanned;
1818 1891
1892restart:
1893 nr_reclaimed = 0;
1819 get_scan_count(zone, sc, nr, priority); 1894 get_scan_count(zone, sc, nr, priority);
1820 1895
1821 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1896 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1841,8 +1916,7 @@ static void shrink_zone(int priority, struct zone *zone,
1841 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 1916 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1842 break; 1917 break;
1843 } 1918 }
1844 1919 sc->nr_reclaimed += nr_reclaimed;
1845 sc->nr_reclaimed = nr_reclaimed;
1846 1920
1847 /* 1921 /*
1848 * Even if we did not try to evict anon pages at all, we want to 1922 * Even if we did not try to evict anon pages at all, we want to
@@ -1851,6 +1925,11 @@ static void shrink_zone(int priority, struct zone *zone,
1851 if (inactive_anon_is_low(zone, sc)) 1925 if (inactive_anon_is_low(zone, sc))
1852 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1926 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1853 1927
1928 /* reclaim/compaction might need reclaim to continue */
1929 if (should_continue_reclaim(zone, nr_reclaimed,
1930 sc->nr_scanned - nr_scanned, sc))
1931 goto restart;
1932
1854 throttle_vm_writeout(sc->gfp_mask); 1933 throttle_vm_writeout(sc->gfp_mask);
1855} 1934}
1856 1935
@@ -2124,38 +2203,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2124} 2203}
2125#endif 2204#endif
2126 2205
2206/*
2207 * pgdat_balanced is used when checking if a node is balanced for high-order
2208 * allocations. Only zones that meet watermarks and are in a zone allowed
2209 * by the caller's classzone_idx are added to balanced_pages. The total of
2210 * balanced pages must be at least 25% of the zones allowed by classzone_idx
2211 * for the node to be considered balanced. Forcing all zones to be balanced
2212 * for high orders can cause excessive reclaim when there are imbalanced zones.
2213 * The choice of 25% is due to
2214 * o a 16M DMA zone that is balanced will not balance a zone on any
2215 * reasonable sized machine
2216 * o On all other machines, the top zone must be at least a reasonable
2217 * percentage of the middle zones. For example, on 32-bit x86, highmem
2218 * would need to be at least 256M for it to balance a whole node.
2219 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2220 * to balance a node on its own. These seemed like reasonable ratios.
2221 */
2222static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2223 int classzone_idx)
2224{
2225 unsigned long present_pages = 0;
2226 int i;
2227
2228 for (i = 0; i <= classzone_idx; i++)
2229 present_pages += pgdat->node_zones[i].present_pages;
2230
2231 return balanced_pages > (present_pages >> 2);
2232}
2233
2127/* is kswapd sleeping prematurely? */ 2234/* is kswapd sleeping prematurely? */
2128static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) 2235static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2236 int classzone_idx)
2129{ 2237{
2130 int i; 2238 int i;
2239 unsigned long balanced = 0;
2240 bool all_zones_ok = true;
2131 2241
2132 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2242 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2133 if (remaining) 2243 if (remaining)
2134 return 1; 2244 return true;
2135 2245
2136 /* If after HZ/10, a zone is below the high mark, it's premature */ 2246 /* Check the watermark levels */
2137 for (i = 0; i < pgdat->nr_zones; i++) { 2247 for (i = 0; i < pgdat->nr_zones; i++) {
2138 struct zone *zone = pgdat->node_zones + i; 2248 struct zone *zone = pgdat->node_zones + i;
2139 2249
2140 if (!populated_zone(zone)) 2250 if (!populated_zone(zone))
2141 continue; 2251 continue;
2142 2252
2143 if (zone->all_unreclaimable) 2253 /*
2254 * balance_pgdat() skips over all_unreclaimable after
2255 * DEF_PRIORITY. Effectively, it considers them balanced so
2256 * they must be considered balanced here as well if kswapd
2257 * is to sleep
2258 */
2259 if (zone->all_unreclaimable) {
2260 balanced += zone->present_pages;
2144 continue; 2261 continue;
2262 }
2145 2263
2146 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 2264 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2147 0, 0)) 2265 classzone_idx, 0))
2148 return 1; 2266 all_zones_ok = false;
2267 else
2268 balanced += zone->present_pages;
2149 } 2269 }
2150 2270
2151 return 0; 2271 /*
2272 * For high-order requests, the balanced zones must contain at least
2273 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
2274 * must be balanced
2275 */
2276 if (order)
2277 return pgdat_balanced(pgdat, balanced, classzone_idx);
2278 else
2279 return !all_zones_ok;
2152} 2280}
2153 2281
2154/* 2282/*
2155 * For kswapd, balance_pgdat() will work across all this node's zones until 2283 * For kswapd, balance_pgdat() will work across all this node's zones until
2156 * they are all at high_wmark_pages(zone). 2284 * they are all at high_wmark_pages(zone).
2157 * 2285 *
2158 * Returns the number of pages which were actually freed. 2286 * Returns the final order kswapd was reclaiming at
2159 * 2287 *
2160 * There is special handling here for zones which are full of pinned pages. 2288 * There is special handling here for zones which are full of pinned pages.
2161 * This can happen if the pages are all mlocked, or if they are all used by 2289 * This can happen if the pages are all mlocked, or if they are all used by
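
pgdat_balanced() implements the 25% rule described in the comment: for a high-order wakeup, kswapd may sleep once the balanced zones cover more than a quarter of the pages allowed by classzone_idx. A small worked example with made-up zone sizes:

    #include <stdbool.h>
    #include <stdio.h>

    /* Same rule as pgdat_balanced(): for a high-order wakeup the zones that
     * meet their watermarks must cover more than a quarter of the pages in
     * the zones the caller's classzone_idx allows. */
    static bool pgdat_balanced(unsigned long balanced_pages,
                               const unsigned long *zone_pages, int classzone_idx)
    {
        unsigned long present = 0;

        for (int i = 0; i <= classzone_idx; i++)
            present += zone_pages[i];
        return balanced_pages > (present >> 2);
    }

    int main(void)
    {
        /* Hypothetical node: 4K-page counts for DMA, DMA32 and Normal. */
        unsigned long zones[] = { 4096, 262144, 786432 };

        /* Only the small DMA zone is balanced: nowhere near 25% of the node. */
        printf("DMA only:  %d\n", pgdat_balanced(4096, zones, 2));

        /* Normal is balanced: well over 25%, so kswapd may sleep. */
        printf("Normal ok: %d\n", pgdat_balanced(786432, zones, 2));
        return 0;
    }
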
@@ -2172,11 +2300,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
2172 * interoperates with the page allocator fallback scheme to ensure that aging 2300 * interoperates with the page allocator fallback scheme to ensure that aging
2173 * of pages is balanced across the zones. 2301 * of pages is balanced across the zones.
2174 */ 2302 */
2175static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 2303static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2304 int *classzone_idx)
2176{ 2305{
2177 int all_zones_ok; 2306 int all_zones_ok;
2307 unsigned long balanced;
2178 int priority; 2308 int priority;
2179 int i; 2309 int i;
2310 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2180 unsigned long total_scanned; 2311 unsigned long total_scanned;
2181 struct reclaim_state *reclaim_state = current->reclaim_state; 2312 struct reclaim_state *reclaim_state = current->reclaim_state;
2182 struct scan_control sc = { 2313 struct scan_control sc = {
@@ -2199,7 +2330,6 @@ loop_again:
2199 count_vm_event(PAGEOUTRUN); 2330 count_vm_event(PAGEOUTRUN);
2200 2331
2201 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2332 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2202 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2203 unsigned long lru_pages = 0; 2333 unsigned long lru_pages = 0;
2204 int has_under_min_watermark_zone = 0; 2334 int has_under_min_watermark_zone = 0;
2205 2335
@@ -2208,6 +2338,7 @@ loop_again:
2208 disable_swap_token(); 2338 disable_swap_token();
2209 2339
2210 all_zones_ok = 1; 2340 all_zones_ok = 1;
2341 balanced = 0;
2211 2342
2212 /* 2343 /*
2213 * Scan in the highmem->dma direction for the highest 2344 * Scan in the highmem->dma direction for the highest
@@ -2230,9 +2361,10 @@ loop_again:
2230 shrink_active_list(SWAP_CLUSTER_MAX, zone, 2361 shrink_active_list(SWAP_CLUSTER_MAX, zone,
2231 &sc, priority, 0); 2362 &sc, priority, 0);
2232 2363
2233 if (!zone_watermark_ok(zone, order, 2364 if (!zone_watermark_ok_safe(zone, order,
2234 high_wmark_pages(zone), 0, 0)) { 2365 high_wmark_pages(zone), 0, 0)) {
2235 end_zone = i; 2366 end_zone = i;
2367 *classzone_idx = i;
2236 break; 2368 break;
2237 } 2369 }
2238 } 2370 }
@@ -2255,6 +2387,7 @@ loop_again:
2255 * cause too much scanning of the lower zones. 2387 * cause too much scanning of the lower zones.
2256 */ 2388 */
2257 for (i = 0; i <= end_zone; i++) { 2389 for (i = 0; i <= end_zone; i++) {
2390 int compaction;
2258 struct zone *zone = pgdat->node_zones + i; 2391 struct zone *zone = pgdat->node_zones + i;
2259 int nr_slab; 2392 int nr_slab;
2260 2393
@@ -2276,7 +2409,7 @@ loop_again:
2276 * We put equal pressure on every zone, unless one 2409 * We put equal pressure on every zone, unless one
2277 * zone has way too many pages free already. 2410 * zone has way too many pages free already.
2278 */ 2411 */
2279 if (!zone_watermark_ok(zone, order, 2412 if (!zone_watermark_ok_safe(zone, order,
2280 8*high_wmark_pages(zone), end_zone, 0)) 2413 8*high_wmark_pages(zone), end_zone, 0))
2281 shrink_zone(priority, zone, &sc); 2414 shrink_zone(priority, zone, &sc);
2282 reclaim_state->reclaimed_slab = 0; 2415 reclaim_state->reclaimed_slab = 0;
@@ -2284,9 +2417,26 @@ loop_again:
2284 lru_pages); 2417 lru_pages);
2285 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2418 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2286 total_scanned += sc.nr_scanned; 2419 total_scanned += sc.nr_scanned;
2420
2421 compaction = 0;
2422 if (order &&
2423 zone_watermark_ok(zone, 0,
2424 high_wmark_pages(zone),
2425 end_zone, 0) &&
2426 !zone_watermark_ok(zone, order,
2427 high_wmark_pages(zone),
2428 end_zone, 0)) {
2429 compact_zone_order(zone,
2430 order,
2431 sc.gfp_mask, false,
2432 COMPACT_MODE_KSWAPD);
2433 compaction = 1;
2434 }
2435
2287 if (zone->all_unreclaimable) 2436 if (zone->all_unreclaimable)
2288 continue; 2437 continue;
2289 if (nr_slab == 0 && !zone_reclaimable(zone)) 2438 if (!compaction && nr_slab == 0 &&
2439 !zone_reclaimable(zone))
2290 zone->all_unreclaimable = 1; 2440 zone->all_unreclaimable = 1;
2291 /* 2441 /*
2292 * If we've done a decent amount of scanning and 2442 * If we've done a decent amount of scanning and
@@ -2297,7 +2447,7 @@ loop_again:
2297 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2447 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2298 sc.may_writepage = 1; 2448 sc.may_writepage = 1;
2299 2449
2300 if (!zone_watermark_ok(zone, order, 2450 if (!zone_watermark_ok_safe(zone, order,
2301 high_wmark_pages(zone), end_zone, 0)) { 2451 high_wmark_pages(zone), end_zone, 0)) {
2302 all_zones_ok = 0; 2452 all_zones_ok = 0;
2303 /* 2453 /*
@@ -2305,7 +2455,7 @@ loop_again:
2305 * means that we have a GFP_ATOMIC allocation 2455 * means that we have a GFP_ATOMIC allocation
2306 * failure risk. Hurry up! 2456 * failure risk. Hurry up!
2307 */ 2457 */
2308 if (!zone_watermark_ok(zone, order, 2458 if (!zone_watermark_ok_safe(zone, order,
2309 min_wmark_pages(zone), end_zone, 0)) 2459 min_wmark_pages(zone), end_zone, 0))
2310 has_under_min_watermark_zone = 1; 2460 has_under_min_watermark_zone = 1;
2311 } else { 2461 } else {
@@ -2317,10 +2467,12 @@ loop_again:
2317 * spectulatively avoid congestion waits 2467 * spectulatively avoid congestion waits
2318 */ 2468 */
2319 zone_clear_flag(zone, ZONE_CONGESTED); 2469 zone_clear_flag(zone, ZONE_CONGESTED);
2470 if (i <= *classzone_idx)
2471 balanced += zone->present_pages;
2320 } 2472 }
2321 2473
2322 } 2474 }
2323 if (all_zones_ok) 2475 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2324 break; /* kswapd: all done */ 2476 break; /* kswapd: all done */
2325 /* 2477 /*
2326 * OK, kswapd is getting into trouble. Take a nap, then take 2478 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2343,7 +2495,13 @@ loop_again:
2343 break; 2495 break;
2344 } 2496 }
2345out: 2497out:
2346 if (!all_zones_ok) { 2498
2499 /*
2500 * order-0: All zones must meet high watermark for a balanced node
2501 * high-order: Balanced zones must make up at least 25% of the node
2502 * for the node to be balanced
2503 */
2504 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2347 cond_resched(); 2505 cond_resched();
2348 2506
2349 try_to_freeze(); 2507 try_to_freeze();
@@ -2368,7 +2526,88 @@ out:
2368 goto loop_again; 2526 goto loop_again;
2369 } 2527 }
2370 2528
2371 return sc.nr_reclaimed; 2529 /*
2530 * If kswapd was reclaiming at a higher order, it has the option of
2531 * sleeping without all zones being balanced. Before it does, it must
2532 * ensure that the watermarks for order-0 on *all* zones are met and
2533 * that the congestion flags are cleared. The congestion flag must
2534 * be cleared as kswapd is the only mechanism that clears the flag
2535 * and it is potentially going to sleep here.
2536 */
2537 if (order) {
2538 for (i = 0; i <= end_zone; i++) {
2539 struct zone *zone = pgdat->node_zones + i;
2540
2541 if (!populated_zone(zone))
2542 continue;
2543
2544 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2545 continue;
2546
2547 /* Confirm the zone is balanced for order-0 */
2548 if (!zone_watermark_ok(zone, 0,
2549 high_wmark_pages(zone), 0, 0)) {
2550 order = sc.order = 0;
2551 goto loop_again;
2552 }
2553
2554 /* If balanced, clear the congested flag */
2555 zone_clear_flag(zone, ZONE_CONGESTED);
2556 }
2557 }
2558
2559 /*
2560 * Return the order we were reclaiming at so sleeping_prematurely()
2561 * makes a decision on the order we were last reclaiming at. However,
2562 * if another caller entered the allocator slow path while kswapd
2563 * was awake, order will remain at the higher level
2564 */
2565 *classzone_idx = end_zone;
2566 return order;
2567}
2568
2569static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2570{
2571 long remaining = 0;
2572 DEFINE_WAIT(wait);
2573
2574 if (freezing(current) || kthread_should_stop())
2575 return;
2576
2577 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2578
2579 /* Try to sleep for a short interval */
2580 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2581 remaining = schedule_timeout(HZ/10);
2582 finish_wait(&pgdat->kswapd_wait, &wait);
2583 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2584 }
2585
2586 /*
2587 * After a short sleep, check if it was a premature sleep. If not, then
2588 * go fully to sleep until explicitly woken up.
2589 */
2590 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2591 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2592
2593 /*
2594 * vmstat counters are not perfectly accurate and the estimated
2595 * value for counters such as NR_FREE_PAGES can deviate from the
2596 * true value by nr_online_cpus * threshold. To avoid the zone
2597 * watermarks being breached while under pressure, we reduce the
2598 * per-cpu vmstat threshold while kswapd is awake and restore
2599 * them before going back to sleep.
2600 */
2601 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2602 schedule();
2603 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2604 } else {
2605 if (remaining)
2606 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2607 else
2608 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2609 }
2610 finish_wait(&pgdat->kswapd_wait, &wait);
2372} 2611}
2373 2612
2374/* 2613/*
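
kswapd_try_to_sleep() naps for HZ/10 first and only commits to a full sleep if sleeping_prematurely() still says the node is balanced, switching the per-cpu vmstat thresholds around the sleep. A stubbed-out model of that control flow; sleeping_prematurely() here is a fake predicate keyed off order, not the real watermark check:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stubs standing in for the kernel pieces used by kswapd_try_to_sleep(). */
    static bool sleeping_prematurely(int order) { return order > 5; }
    static long short_nap(void)                 { return 0; }  /* full HZ/10 elapsed */
    static void set_threshold(const char *kind) { printf("vmstat threshold: %s\n", kind); }
    static void deep_sleep(void)                { printf("kswapd sleeping\n"); }

    /* Two-stage sleep: nap briefly, and only if the node still looks balanced
     * afterwards switch the vmstat thresholds and sleep until woken. The
     * premature case just records which watermark fired. */
    static void kswapd_try_to_sleep(int order)
    {
        long remaining = 0;

        if (!sleeping_prematurely(order))
            remaining = short_nap();

        if (!sleeping_prematurely(order)) {
            set_threshold("normal (cheaper, fine while asleep)");
            deep_sleep();
            set_threshold("pressure (reduced, kswapd awake)");
        } else {
            printf("premature: %s watermark hit quickly\n",
                   remaining ? "low" : "high");
        }
    }

    int main(void)
    {
        kswapd_try_to_sleep(0);     /* balanced: naps, then sleeps properly */
        kswapd_try_to_sleep(9);     /* still under pressure: stays awake    */
        return 0;
    }
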
@@ -2387,9 +2626,10 @@ out:
2387static int kswapd(void *p) 2626static int kswapd(void *p)
2388{ 2627{
2389 unsigned long order; 2628 unsigned long order;
2629 int classzone_idx;
2390 pg_data_t *pgdat = (pg_data_t*)p; 2630 pg_data_t *pgdat = (pg_data_t*)p;
2391 struct task_struct *tsk = current; 2631 struct task_struct *tsk = current;
2392 DEFINE_WAIT(wait); 2632
2393 struct reclaim_state reclaim_state = { 2633 struct reclaim_state reclaim_state = {
2394 .reclaimed_slab = 0, 2634 .reclaimed_slab = 0,
2395 }; 2635 };
@@ -2417,49 +2657,30 @@ static int kswapd(void *p)
2417 set_freezable(); 2657 set_freezable();
2418 2658
2419 order = 0; 2659 order = 0;
2660 classzone_idx = MAX_NR_ZONES - 1;
2420 for ( ; ; ) { 2661 for ( ; ; ) {
2421 unsigned long new_order; 2662 unsigned long new_order;
2663 int new_classzone_idx;
2422 int ret; 2664 int ret;
2423 2665
2424 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2425 new_order = pgdat->kswapd_max_order; 2666 new_order = pgdat->kswapd_max_order;
2667 new_classzone_idx = pgdat->classzone_idx;
2426 pgdat->kswapd_max_order = 0; 2668 pgdat->kswapd_max_order = 0;
2427 if (order < new_order) { 2669 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2670 if (order < new_order || classzone_idx > new_classzone_idx) {
2428 /* 2671 /*
2429 * Don't sleep if someone wants a larger 'order' 2672 * Don't sleep if someone wants a larger 'order'
2430 * allocation 2673 * allocation or has tigher zone constraints
2431 */ 2674 */
2432 order = new_order; 2675 order = new_order;
2676 classzone_idx = new_classzone_idx;
2433 } else { 2677 } else {
2434 if (!freezing(current) && !kthread_should_stop()) { 2678 kswapd_try_to_sleep(pgdat, order, classzone_idx);
2435 long remaining = 0;
2436
2437 /* Try to sleep for a short interval */
2438 if (!sleeping_prematurely(pgdat, order, remaining)) {
2439 remaining = schedule_timeout(HZ/10);
2440 finish_wait(&pgdat->kswapd_wait, &wait);
2441 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2442 }
2443
2444 /*
2445 * After a short sleep, check if it was a
2446 * premature sleep. If not, then go fully
2447 * to sleep until explicitly woken up
2448 */
2449 if (!sleeping_prematurely(pgdat, order, remaining)) {
2450 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2451 schedule();
2452 } else {
2453 if (remaining)
2454 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2455 else
2456 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2457 }
2458 }
2459
2460 order = pgdat->kswapd_max_order; 2679 order = pgdat->kswapd_max_order;
2680 classzone_idx = pgdat->classzone_idx;
2681 pgdat->kswapd_max_order = 0;
2682 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2461 } 2683 }
2462 finish_wait(&pgdat->kswapd_wait, &wait);
2463 2684
2464 ret = try_to_freeze(); 2685 ret = try_to_freeze();
2465 if (kthread_should_stop()) 2686 if (kthread_should_stop())
@@ -2471,7 +2692,7 @@ static int kswapd(void *p)
2471 */ 2692 */
2472 if (!ret) { 2693 if (!ret) {
2473 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2694 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2474 balance_pgdat(pgdat, order); 2695 order = balance_pgdat(pgdat, order, &classzone_idx);
2475 } 2696 }
2476 } 2697 }
2477 return 0; 2698 return 0;
@@ -2480,23 +2701,26 @@ static int kswapd(void *p)
2480/* 2701/*
2481 * A zone is low on free memory, so wake its kswapd task to service it. 2702 * A zone is low on free memory, so wake its kswapd task to service it.
2482 */ 2703 */
2483void wakeup_kswapd(struct zone *zone, int order) 2704void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
2484{ 2705{
2485 pg_data_t *pgdat; 2706 pg_data_t *pgdat;
2486 2707
2487 if (!populated_zone(zone)) 2708 if (!populated_zone(zone))
2488 return; 2709 return;
2489 2710
2490 pgdat = zone->zone_pgdat;
2491 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2492 return;
2493 if (pgdat->kswapd_max_order < order)
2494 pgdat->kswapd_max_order = order;
2495 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2496 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2711 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2497 return; 2712 return;
2713 pgdat = zone->zone_pgdat;
2714 if (pgdat->kswapd_max_order < order) {
2715 pgdat->kswapd_max_order = order;
2716 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
2717 }
2498 if (!waitqueue_active(&pgdat->kswapd_wait)) 2718 if (!waitqueue_active(&pgdat->kswapd_wait))
2499 return; 2719 return;
2720 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
2721 return;
2722
2723 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2500 wake_up_interruptible(&pgdat->kswapd_wait); 2724 wake_up_interruptible(&pgdat->kswapd_wait);
2501} 2725}
2502 2726
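
wakeup_kswapd() now records the most constrained request pending for the node: the remembered order only ever grows and, when it does, classzone_idx is clamped downwards, so a single balance_pgdat() pass can serve every waiter. A toy version of that bookkeeping with one global node standing in for the pgdat fields:

    #include <stdio.h>

    #define MAX_NR_ZONES 4

    /* Toy per-node state mirroring pgdat->kswapd_max_order / classzone_idx. */
    static int kswapd_max_order;
    static int kswapd_classzone_idx = MAX_NR_ZONES - 1;

    static int min_int(int a, int b) { return a < b ? a : b; }

    /* Shape of the new wakeup_kswapd() bookkeeping: a bigger order raises the
     * target and, at the same time, clamps classzone_idx to the most
     * restrictive zone requested so far. */
    static void wakeup_kswapd(int order, int classzone_idx)
    {
        if (kswapd_max_order < order) {
            kswapd_max_order = order;
            kswapd_classzone_idx = min_int(kswapd_classzone_idx, classzone_idx);
        }
        printf("pending wakeup: order=%d classzone_idx=%d\n",
               kswapd_max_order, kswapd_classzone_idx);
    }

    int main(void)
    {
        wakeup_kswapd(2, 2);    /* modest high-order request up to Normal */
        wakeup_kswapd(9, 1);    /* huge-page request restricted to DMA32  */
        wakeup_kswapd(0, 3);    /* order-0 request changes nothing        */
        return 0;
    }
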
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 312d728976f1..0c3b5048773e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
83 83
84#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
85 85
86static int calculate_threshold(struct zone *zone) 86int calculate_pressure_threshold(struct zone *zone)
87{
88 int threshold;
89 int watermark_distance;
90
91 /*
92 * As vmstats are not up to date, there is drift between the estimated
93 * and real values. For high thresholds and a high number of CPUs, it
94 * is possible for the min watermark to be breached while the estimated
95 * value looks fine. The pressure threshold is a reduced value such
96 * that even the maximum amount of drift will not accidentally breach
97 * the min watermark
98 */
99 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
100 threshold = max(1, (int)(watermark_distance / num_online_cpus()));
101
102 /*
103 * Maximum threshold is 125
104 */
105 threshold = min(125, threshold);
106
107 return threshold;
108}
109
110int calculate_normal_threshold(struct zone *zone)
87{ 111{
88 int threshold; 112 int threshold;
89 int mem; /* memory in 128 MB units */ 113 int mem; /* memory in 128 MB units */
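
calculate_pressure_threshold() divides the low-to-min watermark gap across the online CPUs and caps the result at 125, so cumulative per-cpu drift can no longer mask a min-watermark breach while kswapd is awake. The same arithmetic run with example numbers:

    #include <stdio.h>

    static int max_int(int a, int b) { return a > b ? a : b; }
    static int min_int(int a, int b) { return a < b ? a : b; }

    /* Same arithmetic as calculate_pressure_threshold(): spread the low-to-min
     * watermark distance across the online CPUs so that worst-case drift
     * (cpus * threshold) stays within the gap. */
    static int pressure_threshold(long low_wmark, long min_wmark, int online_cpus)
    {
        int distance = (int)(low_wmark - min_wmark);
        int threshold = max_int(1, distance / online_cpus);

        return min_int(125, threshold);     /* same cap as the normal threshold */
    }

    int main(void)
    {
        /* Hypothetical zone: min watermark 8000 pages, low watermark 10000. */
        printf("4 CPUs:  threshold %d\n", pressure_threshold(10000, 8000, 4));
        printf("64 CPUs: threshold %d\n", pressure_threshold(10000, 8000, 64));
        return 0;
    }
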
@@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
142 for_each_populated_zone(zone) { 166 for_each_populated_zone(zone) {
143 unsigned long max_drift, tolerate_drift; 167 unsigned long max_drift, tolerate_drift;
144 168
145 threshold = calculate_threshold(zone); 169 threshold = calculate_normal_threshold(zone);
146 170
147 for_each_online_cpu(cpu) 171 for_each_online_cpu(cpu)
148 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 172 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -161,6 +185,26 @@ static void refresh_zone_stat_thresholds(void)
161 } 185 }
162} 186}
163 187
188void set_pgdat_percpu_threshold(pg_data_t *pgdat,
189 int (*calculate_pressure)(struct zone *))
190{
191 struct zone *zone;
192 int cpu;
193 int threshold;
194 int i;
195
196 for (i = 0; i < pgdat->nr_zones; i++) {
197 zone = &pgdat->node_zones[i];
198 if (!zone->percpu_drift_mark)
199 continue;
200
201 threshold = (*calculate_pressure)(zone);
202 for_each_possible_cpu(cpu)
203 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
204 = threshold;
205 }
206}
207
164/* 208/*
165 * For use when we know that interrupts are disabled. 209 * For use when we know that interrupts are disabled.
166 */ 210 */
@@ -836,6 +880,7 @@ static const char * const vmstat_text[] = {
836 "numa_local", 880 "numa_local",
837 "numa_other", 881 "numa_other",
838#endif 882#endif
883 "nr_anon_transparent_hugepages",
839 "nr_dirty_threshold", 884 "nr_dirty_threshold",
840 "nr_dirty_background_threshold", 885 "nr_dirty_background_threshold",
841 886
@@ -911,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
911 "\n scanned %lu" 956 "\n scanned %lu"
912 "\n spanned %lu" 957 "\n spanned %lu"
913 "\n present %lu", 958 "\n present %lu",
914 zone_nr_free_pages(zone), 959 zone_page_state(zone, NR_FREE_PAGES),
915 min_wmark_pages(zone), 960 min_wmark_pages(zone),
916 low_wmark_pages(zone), 961 low_wmark_pages(zone),
917 high_wmark_pages(zone), 962 high_wmark_pages(zone),
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7f686251f711..f29abeb6a912 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -104,8 +104,26 @@ static pfn_t fault_pfn;
104inline int kvm_is_mmio_pfn(pfn_t pfn) 104inline int kvm_is_mmio_pfn(pfn_t pfn)
105{ 105{
106 if (pfn_valid(pfn)) { 106 if (pfn_valid(pfn)) {
107 struct page *page = compound_head(pfn_to_page(pfn)); 107 int reserved;
108 return PageReserved(page); 108 struct page *tail = pfn_to_page(pfn);
109 struct page *head = compound_trans_head(tail);
110 reserved = PageReserved(head);
111 if (head != tail) {
112 /*
113 * "head" is not a dangling pointer
114 * (compound_trans_head takes care of that)
115 * but the hugepage may have been split
116 * from under us (and we may not hold a
117 * reference count on the head page so it can
118 * be reused before we run PageReferenced), so
119 * we've to check PageTail before returning
120 * what we just read.
121 */
122 smp_rmb();
123 if (PageTail(tail))
124 return reserved;
125 }
126 return PageReserved(tail);
109 } 127 }
110 128
111 return true; 129 return true;
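
The kvm_is_mmio_pfn() change reads the flag through the compound head, issues a read barrier, and only trusts that value if the page still looks like a tail page, falling back to the page itself after a split. A loose userspace analogue of that read-barrier-recheck pattern, using C11 atomics in place of the kernel's page flags and smp_rmb(); it models the idea, not struct page:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Minimal model: a tail page's head pointer and its "PageTail" bit can
     * change underneath a lockless reader when a huge page is split, so the
     * value read through the head is only trusted if the page still looks
     * like a tail page afterwards. */
    struct page {
        atomic_bool tail;           /* "PageTail" */
        atomic_bool reserved;       /* the per-page flag we actually want */
        struct page *head;          /* meaningful only while tail is set */
    };

    static bool page_reserved(struct page *p)
    {
        if (atomic_load(&p->tail)) {
            bool r = atomic_load(&p->head->reserved);
            atomic_thread_fence(memory_order_acquire);   /* like smp_rmb() */
            if (atomic_load(&p->tail))
                return r;           /* still a tail page: head value is valid */
        }
        return atomic_load(&p->reserved);   /* split happened: use the page itself */
    }

    int main(void)
    {
        struct page head = { .reserved = true };
        struct page tail = { .tail = true, .reserved = false, .head = &head };

        printf("as tail:     %d\n", page_reserved(&tail));
        atomic_store(&tail.tail, false);    /* pretend the huge page split */
        printf("after split: %d\n", page_reserved(&tail));
        return 0;
    }
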
@@ -352,6 +370,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
352 return young; 370 return young;
353} 371}
354 372
373static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
374 struct mm_struct *mm,
375 unsigned long address)
376{
377 struct kvm *kvm = mmu_notifier_to_kvm(mn);
378 int young, idx;
379
380 idx = srcu_read_lock(&kvm->srcu);
381 spin_lock(&kvm->mmu_lock);
382 young = kvm_test_age_hva(kvm, address);
383 spin_unlock(&kvm->mmu_lock);
384 srcu_read_unlock(&kvm->srcu, idx);
385
386 return young;
387}
388
355static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 389static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
356 struct mm_struct *mm) 390 struct mm_struct *mm)
357{ 391{
@@ -368,6 +402,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
368 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 402 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
369 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 403 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
370 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 404 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
405 .test_young = kvm_mmu_notifier_test_young,
371 .change_pte = kvm_mmu_notifier_change_pte, 406 .change_pte = kvm_mmu_notifier_change_pte,
372 .release = kvm_mmu_notifier_release, 407 .release = kvm_mmu_notifier_release,
373}; 408};