author	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-17 15:58:52 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-17 15:58:52 -0500
commit	0cbeafb245ca568bc0765645aa64f0451b716657 (patch)
tree	663c09ff5a62a1b2b66a17c4dfe0413603530a36
parent	58cf279acac3080ce03eeea5ca268210b3165fe1 (diff)
parent	06b031de22d28ae76b2e5bfaf22c56a265a1e106 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton:

 - more MM stuff:

    - Kirill's page-flags rework

    - Kirill's now-allegedly-fixed THP rework

    - MADV_FREE implementation

    - DAX feature work (msync/fsync).  This isn't quite complete but DAX
      is new and it's good enough and the guys have a handle on what
      needs to be done - I expect this to be wrapped in the next week or
      two.

 - some vsprintf maintenance work

 - various other misc bits

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (145 commits)
  printk: change recursion_bug type to bool
  lib/vsprintf: factor out %pN[F] handler as netdev_bits()
  lib/vsprintf: refactor duplicate code to special_hex_number()
  printk-formats.txt: remove unimplemented %pT
  printk: help pr_debug and pr_devel to optimize out arguments
  lib/test_printf.c: test dentry printing
  lib/test_printf.c: add test for large bitmaps
  lib/test_printf.c: account for kvasprintf tests
  lib/test_printf.c: add a few number() tests
  lib/test_printf.c: test precision quirks
  lib/test_printf.c: check for out-of-bound writes
  lib/test_printf.c: don't BUG
  lib/kasprintf.c: add sanity check to kvasprintf
  lib/vsprintf.c: warn about too large precisions and field widths
  lib/vsprintf.c: help gcc make number() smaller
  lib/vsprintf.c: expand field_width to 24 bits
  lib/vsprintf.c: eliminate potential race in string()
  lib/vsprintf.c: move string() below widen_string()
  lib/vsprintf.c: pull out padding code from dentry_name()
  printk: do cond_resched() between lines while outputting to consoles
  ...
-rw-r--r--Documentation/features/vm/pmdp_splitting_flush/arch-support.txt40
-rw-r--r--Documentation/printk-formats.txt9
-rw-r--r--Documentation/vm/transhuge.txt151
-rw-r--r--arch/alpha/include/uapi/asm/mman.h2
-rw-r--r--arch/arc/Kconfig3
-rw-r--r--arch/arc/mm/cache.c4
-rw-r--r--arch/arm/Kconfig5
-rw-r--r--arch/arm/include/asm/kvm_mmu.h5
-rw-r--r--arch/arm/include/asm/pgtable-3level.h10
-rw-r--r--arch/arm/kvm/mmu.c10
-rw-r--r--arch/arm/lib/uaccess_with_memcpy.c5
-rw-r--r--arch/arm/mm/flush.c17
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h3
-rw-r--r--arch/arm64/include/asm/pgtable.h9
-rw-r--r--arch/arm64/mm/flush.c16
-rw-r--r--arch/avr32/include/asm/page.h8
-rw-r--r--arch/frv/include/asm/page.h2
-rw-r--r--arch/ia64/include/asm/page.h1
-rw-r--r--arch/metag/Kconfig3
-rw-r--r--arch/microblaze/Kconfig3
-rw-r--r--arch/mips/include/asm/kvm_host.h6
-rw-r--r--arch/mips/include/asm/pgtable-bits.h10
-rw-r--r--arch/mips/include/asm/pgtable.h18
-rw-r--r--arch/mips/include/uapi/asm/mman.h2
-rw-r--r--arch/mips/kvm/emulate.c2
-rw-r--r--arch/mips/kvm/tlb.c14
-rw-r--r--arch/mips/mm/c-r4k.c3
-rw-r--r--arch/mips/mm/cache.c2
-rw-r--r--arch/mips/mm/gup.c17
-rw-r--r--arch/mips/mm/init.c6
-rw-r--r--arch/mips/mm/pgtable-64.c14
-rw-r--r--arch/mips/mm/tlbex.c1
-rw-r--r--arch/mn10300/include/asm/page.h1
-rw-r--r--arch/parisc/Kconfig3
-rw-r--r--arch/parisc/include/uapi/asm/mman.h2
-rw-r--r--arch/powerpc/Kconfig3
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-64k.h12
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash.h10
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h6
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h4
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h2
-rw-r--r--arch/powerpc/kvm/book3s.c6
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu_host.c2
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_host.c2
-rw-r--r--arch/powerpc/kvm/e500.h2
-rw-r--r--arch/powerpc/kvm/e500_mmu_host.c8
-rw-r--r--arch/powerpc/kvm/trace_pr.h2
-rw-r--r--arch/powerpc/mm/hugepage-hash64.c3
-rw-r--r--arch/powerpc/mm/hugetlbpage.c17
-rw-r--r--arch/powerpc/mm/pgtable_64.c49
-rw-r--r--arch/powerpc/mm/subpage-prot.c2
-rw-r--r--arch/powerpc/sysdev/axonram.c9
-rw-r--r--arch/s390/Kconfig3
-rw-r--r--arch/s390/include/asm/pgtable.h16
-rw-r--r--arch/s390/mm/gup.c24
-rw-r--r--arch/s390/mm/pgtable.c47
-rw-r--r--arch/sh/Kconfig3
-rw-r--r--arch/sh/mm/cache-sh4.c2
-rw-r--r--arch/sh/mm/cache.c8
-rw-r--r--arch/sparc/Kconfig4
-rw-r--r--arch/sparc/include/asm/pgtable_64.h19
-rw-r--r--arch/sparc/mm/fault_64.c3
-rw-r--r--arch/sparc/mm/gup.c16
-rw-r--r--arch/tile/include/asm/pgtable.h10
-rw-r--r--arch/um/include/asm/page.h7
-rw-r--r--arch/um/include/asm/pgtable-3level.h4
-rw-r--r--arch/um/include/asm/pgtable.h2
-rw-r--r--arch/unicore32/Kconfig3
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/include/asm/pgtable.h40
-rw-r--r--arch/x86/include/asm/pgtable_types.h9
-rw-r--r--arch/x86/include/asm/pmem.h7
-rw-r--r--arch/x86/kernel/vm86_32.c6
-rw-r--r--arch/x86/kvm/iommu.c11
-rw-r--r--arch/x86/kvm/mmu.c37
-rw-r--r--arch/x86/kvm/mmu_audit.c2
-rw-r--r--arch/x86/kvm/paging_tmpl.h6
-rw-r--r--arch/x86/kvm/vmx.c2
-rw-r--r--arch/x86/kvm/x86.c2
-rw-r--r--arch/x86/mm/gup.c74
-rw-r--r--arch/x86/mm/init_64.c33
-rw-r--r--arch/x86/mm/pat.c5
-rw-r--r--arch/x86/mm/pgtable.c13
-rw-r--r--arch/xtensa/include/uapi/asm/mman.h2
-rw-r--r--arch/xtensa/mm/tlb.c2
-rw-r--r--drivers/base/memory.c13
-rw-r--r--drivers/block/brd.c7
-rw-r--r--drivers/block/zram/zram_drv.c7
-rw-r--r--drivers/gpu/drm/exynos/exynos_drm_gem.c4
-rw-r--r--drivers/gpu/drm/gma500/framebuffer.c4
-rw-r--r--drivers/gpu/drm/msm/msm_gem.c4
-rw-r--r--drivers/gpu/drm/omapdrm/omap_gem.c7
-rw-r--r--drivers/gpu/drm/ttm/ttm_bo_vm.c4
-rw-r--r--drivers/iio/industrialio-core.c9
-rw-r--r--drivers/net/wireless/intel/iwlwifi/dvm/calib.c2
-rw-r--r--drivers/nvdimm/pfn_devs.c3
-rw-r--r--drivers/nvdimm/pmem.c73
-rw-r--r--drivers/s390/block/dcssblk.c11
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/block_dev.c15
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/dax.c301
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/hugetlbfs/inode.c143
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/proc/task_mmu.c55
-rw-r--r--fs/stat.c2
-rw-r--r--include/asm-generic/pgtable.h15
-rw-r--r--include/asm-generic/sections.h65
-rw-r--r--include/linux/blkdev.h20
-rw-r--r--include/linux/console.h1
-rw-r--r--include/linux/err.h2
-rw-r--r--include/linux/huge_mm.h79
-rw-r--r--include/linux/hugetlb.h1
-rw-r--r--include/linux/io.h15
-rw-r--r--include/linux/kdev_t.h5
-rw-r--r--include/linux/kernel.h36
-rw-r--r--include/linux/kvm_host.h37
-rw-r--r--include/linux/kvm_types.h2
-rw-r--r--include/linux/list.h11
-rw-r--r--include/linux/memblock.h18
-rw-r--r--include/linux/memcontrol.h16
-rw-r--r--include/linux/memory_hotplug.h3
-rw-r--r--include/linux/memremap.h114
-rw-r--r--include/linux/mm.h199
-rw-r--r--include/linux/mm_types.h25
-rw-r--r--include/linux/mmdebug.h6
-rw-r--r--include/linux/page-flags.h286
-rw-r--r--include/linux/pagemap.h38
-rw-r--r--include/linux/pfn.h9
-rw-r--r--include/linux/pfn_t.h102
-rw-r--r--include/linux/poison.h6
-rw-r--r--include/linux/printk.h12
-rw-r--r--include/linux/rmap.h37
-rw-r--r--include/linux/swap.h4
-rw-r--r--include/linux/vm_event_item.h5
-rw-r--r--include/trace/events/huge_memory.h1
-rw-r--r--include/uapi/asm-generic/mman-common.h1
-rw-r--r--init/Kconfig2
-rw-r--r--kernel/events/uprobes.c11
-rw-r--r--kernel/futex.c65
-rw-r--r--kernel/memremap.c219
-rw-r--r--kernel/panic.c3
-rw-r--r--kernel/printk/printk.c67
-rw-r--r--kernel/stop_machine.c4
-rw-r--r--lib/Kconfig.debug9
-rw-r--r--lib/kasprintf.c10
-rw-r--r--lib/list_debug.c9
-rw-r--r--lib/test_printf.c121
-rw-r--r--lib/vsprintf.c252
-rw-r--r--mm/debug.c8
-rw-r--r--mm/filemap.c25
-rw-r--r--mm/gup.c172
-rw-r--r--mm/huge_memory.c1506
-rw-r--r--mm/hugetlb.c12
-rw-r--r--mm/internal.h70
-rw-r--r--mm/ksm.c69
-rw-r--r--mm/madvise.c201
-rw-r--r--mm/memcontrol.c106
-rw-r--r--mm/memory-failure.c125
-rw-r--r--mm/memory.c101
-rw-r--r--mm/memory_hotplug.c67
-rw-r--r--mm/mempolicy.c45
-rw-r--r--mm/migrate.c21
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mlock.c27
-rw-r--r--mm/mmap.c25
-rw-r--r--mm/mprotect.c7
-rw-r--r--mm/mremap.c15
-rw-r--r--mm/page_alloc.c47
-rw-r--r--mm/page_idle.c27
-rw-r--r--mm/page_isolation.c6
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/pgtable-generic.c14
-rw-r--r--mm/rmap.c369
-rw-r--r--mm/shmem.c25
-rw-r--r--mm/slub.c2
-rw-r--r--mm/sparse-vmemmap.c76
-rw-r--r--mm/sparse.c8
-rw-r--r--mm/swap.c319
-rw-r--r--mm/swap_state.c9
-rw-r--r--mm/swapfile.c34
-rw-r--r--mm/userfaultfd.c8
-rw-r--r--mm/util.c24
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmscan.c16
-rw-r--r--mm/vmstat.c5
-rwxr-xr-xscripts/tags.sh2
-rw-r--r--virt/kvm/kvm_main.c47
189 files changed, 4357 insertions, 2886 deletions
diff --git a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt b/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt
deleted file mode 100644
index 26f74b457e0b..000000000000
--- a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt
+++ /dev/null
@@ -1,40 +0,0 @@
1#
2# Feature name: pmdp_splitting_flush
3# Kconfig: __HAVE_ARCH_PMDP_SPLITTING_FLUSH
4# description: arch supports the pmdp_splitting_flush() VM API
5#
6 -----------------------
7 | arch |status|
8 -----------------------
9 | alpha: | TODO |
10 | arc: | TODO |
11 | arm: | ok |
12 | arm64: | ok |
13 | avr32: | TODO |
14 | blackfin: | TODO |
15 | c6x: | TODO |
16 | cris: | TODO |
17 | frv: | TODO |
18 | h8300: | TODO |
19 | hexagon: | TODO |
20 | ia64: | TODO |
21 | m32r: | TODO |
22 | m68k: | TODO |
23 | metag: | TODO |
24 | microblaze: | TODO |
25 | mips: | ok |
26 | mn10300: | TODO |
27 | nios2: | TODO |
28 | openrisc: | TODO |
29 | parisc: | TODO |
30 | powerpc: | ok |
31 | s390: | ok |
32 | score: | TODO |
33 | sh: | TODO |
34 | sparc: | TODO |
35 | tile: | TODO |
36 | um: | TODO |
37 | unicore32: | TODO |
38 | x86: | ok |
39 | xtensa: | TODO |
40 -----------------------
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 6389551bbad6..5d1128bf0282 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -306,15 +306,6 @@ Network device features:
306 306
307 Passed by reference. 307 Passed by reference.
308 308
309Command from struct task_struct
310
311 %pT ls
312
313 For printing executable name excluding path from struct
314 task_struct.
315
316 Passed by reference.
317
318If you add other %p extensions, please extend lib/test_printf.c with 309If you add other %p extensions, please extend lib/test_printf.c with
319one or more test cases, if at all feasible. 310one or more test cases, if at all feasible.
320 311
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 8a282687ee06..21cf34f3ddb2 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -35,10 +35,10 @@ miss is going to run faster.
35 35
36== Design == 36== Design ==
37 37
38- "graceful fallback": mm components which don't have transparent 38- "graceful fallback": mm components which don't have transparent hugepage
39 hugepage knowledge fall back to breaking a transparent hugepage and 39 knowledge fall back to breaking huge pmd mapping into table of ptes and,
40 working on the regular pages and their respective regular pmd/pte 40 if necessary, split a transparent hugepage. Therefore these components
41 mappings 41 can continue working on the regular pages or regular pte mappings.
42 42
43- if a hugepage allocation fails because of memory fragmentation, 43- if a hugepage allocation fails because of memory fragmentation,
44 regular pages should be gracefully allocated instead and mixed in 44 regular pages should be gracefully allocated instead and mixed in
@@ -221,9 +221,18 @@ thp_collapse_alloc_failed is incremented if khugepaged found a range
221 of pages that should be collapsed into one huge page but failed 221 of pages that should be collapsed into one huge page but failed
222 the allocation. 222 the allocation.
223 223
224thp_split is incremented every time a huge page is split into base 224thp_split_page is incremented every time a huge page is split into base
225 pages. This can happen for a variety of reasons but a common 225 pages. This can happen for a variety of reasons but a common
226 reason is that a huge page is old and is being reclaimed. 226 reason is that a huge page is old and is being reclaimed.
 227 This action implies splitting all PMDs the page is mapped with.
228
 229thp_split_page_failed is incremented if the kernel fails to split a huge
230 page. This can happen if the page was pinned by somebody.
231
 232thp_split_pmd is incremented every time a PMD is split into a table of PTEs.
 233 This can happen, for instance, when an application calls mprotect() or
 234 munmap() on part of a huge page. It doesn't split the huge page, only the
235 page table entry.
227 236
228thp_zero_page_alloc is incremented every time a huge zero page is 237thp_zero_page_alloc is incremented every time a huge zero page is
 229 successfully allocated. It includes allocations which were 238 successfully allocated. It includes allocations which were
@@ -274,10 +283,8 @@ is complete, so they won't ever notice the fact the page is huge. But
274if any driver is going to mangle over the page structure of the tail 283if any driver is going to mangle over the page structure of the tail
275page (like for checking page->mapping or other bits that are relevant 284page (like for checking page->mapping or other bits that are relevant
276for the head page and not the tail page), it should be updated to jump 285for the head page and not the tail page), it should be updated to jump
277to check head page instead (while serializing properly against 286to check head page instead. Taking reference on any head/tail page would
278split_huge_page() to avoid the head and tail pages to disappear from 287prevent page from being split by anyone.
279under it, see the futex code to see an example of that, hugetlbfs also
280needed special handling in futex code for similar reasons).
281 288
282NOTE: these aren't new constraints to the GUP API, and they match the 289NOTE: these aren't new constraints to the GUP API, and they match the
 283same constraints that apply to hugetlbfs too, so any driver capable 290same constraints that apply to hugetlbfs too, so any driver capable
@@ -312,9 +319,9 @@ unaffected. libhugetlbfs will also work fine as usual.
312== Graceful fallback == 319== Graceful fallback ==
313 320
 314Code walking pagetables but unaware about huge pmds can simply call 321Code walking pagetables but unaware about huge pmds can simply call
315split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by 322split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
316pmd_offset. It's trivial to make the code transparent hugepage aware 323pmd_offset. It's trivial to make the code transparent hugepage aware
317by just grepping for "pmd_offset" and adding split_huge_page_pmd where 324by just grepping for "pmd_offset" and adding split_huge_pmd where
318missing after pmd_offset returns the pmd. Thanks to the graceful 325missing after pmd_offset returns the pmd. Thanks to the graceful
319fallback design, with a one liner change, you can avoid to write 326fallback design, with a one liner change, you can avoid to write
320hundred if not thousand of lines of complex code to make your code 327hundred if not thousand of lines of complex code to make your code
@@ -323,7 +330,8 @@ hugepage aware.
323If you're not walking pagetables but you run into a physical hugepage 330If you're not walking pagetables but you run into a physical hugepage
324but you can't handle it natively in your code, you can split it by 331but you can't handle it natively in your code, you can split it by
325calling split_huge_page(page). This is what the Linux VM does before 332calling split_huge_page(page). This is what the Linux VM does before
326it tries to swapout the hugepage for example. 333it tries to swapout the hugepage for example. split_huge_page() can fail
334if the page is pinned and you must handle this correctly.
327 335
328Example to make mremap.c transparent hugepage aware with a one liner 336Example to make mremap.c transparent hugepage aware with a one liner
329change: 337change:
@@ -335,14 +343,14 @@ diff --git a/mm/mremap.c b/mm/mremap.c
335 return NULL; 343 return NULL;
336 344
337 pmd = pmd_offset(pud, addr); 345 pmd = pmd_offset(pud, addr);
338+ split_huge_page_pmd(vma, addr, pmd); 346+ split_huge_pmd(vma, pmd, addr);
339 if (pmd_none_or_clear_bad(pmd)) 347 if (pmd_none_or_clear_bad(pmd))
340 return NULL; 348 return NULL;
341 349
342== Locking in hugepage aware code == 350== Locking in hugepage aware code ==
343 351
344We want as much code as possible hugepage aware, as calling 352We want as much code as possible hugepage aware, as calling
345split_huge_page() or split_huge_page_pmd() has a cost. 353split_huge_page() or split_huge_pmd() has a cost.
346 354
347To make pagetable walks huge pmd aware, all you need to do is to call 355To make pagetable walks huge pmd aware, all you need to do is to call
348pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the 356pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
@@ -351,47 +359,80 @@ created from under you by khugepaged (khugepaged collapse_huge_page
351takes the mmap_sem in write mode in addition to the anon_vma lock). If 359takes the mmap_sem in write mode in addition to the anon_vma lock). If
352pmd_trans_huge returns false, you just fallback in the old code 360pmd_trans_huge returns false, you just fallback in the old code
353paths. If instead pmd_trans_huge returns true, you have to take the 361paths. If instead pmd_trans_huge returns true, you have to take the
354mm->page_table_lock and re-run pmd_trans_huge. Taking the 362page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
355page_table_lock will prevent the huge pmd to be converted into a 363page table lock will prevent the huge pmd to be converted into a
356regular pmd from under you (split_huge_page can run in parallel to the 364regular pmd from under you (split_huge_pmd can run in parallel to the
357pagetable walk). If the second pmd_trans_huge returns false, you 365pagetable walk). If the second pmd_trans_huge returns false, you
358should just drop the page_table_lock and fallback to the old code as 366should just drop the page table lock and fallback to the old code as
359before. Otherwise you should run pmd_trans_splitting on the pmd. In 367before. Otherwise you can proceed to process the huge pmd and the
360case pmd_trans_splitting returns true, it means split_huge_page is 368hugepage natively. Once finished you can drop the page table lock.
361already in the middle of splitting the page. So if pmd_trans_splitting 369
362returns true it's enough to drop the page_table_lock and call 370== Refcounts and transparent huge pages ==
363wait_split_huge_page and then fallback the old code paths. You are 371
364guaranteed by the time wait_split_huge_page returns, the pmd isn't 372Refcounting on THP is mostly consistent with refcounting on other compound
365huge anymore. If pmd_trans_splitting returns false, you can proceed to 373pages:
366process the huge pmd and the hugepage natively. Once finished you can 374
 367 drop the page_table_lock. 375 - get_page()/put_page() and GUP operate on the head page's ->_count.
368 376
369== compound_lock, get_user_pages and put_page == 377 - ->_count in tail pages is always zero: get_page_unless_zero() never
 378 succeeds on tail pages.
379
380 - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
381 on relevant sub-page of the compound page.
382
383 - map/unmap of the whole compound page accounted in compound_mapcount
384 (stored in first tail page).
385
386PageDoubleMap() indicates that ->_mapcount in all subpages is offset up by one.
387This additional reference is required to get race-free detection of unmap of
388subpages when we have them mapped with both PMDs and PTEs.
389
 390This optimization is required to lower the overhead of per-subpage mapcount
 391tracking. The alternative is to alter ->_mapcount in all subpages on each
392map/unmap of the whole compound page.
393
394We set PG_double_map when a PMD of the page got split for the first time,
 395but still have a PMD mapping. The additional references go away with the last
396compound_mapcount.
370 397
371split_huge_page internally has to distribute the refcounts in the head 398split_huge_page internally has to distribute the refcounts in the head
372page to the tail pages before clearing all PG_head/tail bits from the 399page to the tail pages before clearing all PG_head/tail bits from the page
373page structures. It can do that easily for refcounts taken by huge pmd 400structures. It can be done easily for refcounts taken by page table
374mappings. But the GUI API as created by hugetlbfs (that returns head 401entries. But we don't have enough information on how to distribute any
375and tail pages if running get_user_pages on an address backed by any 402additional pins (i.e. from get_user_pages). split_huge_page() fails any
376hugepage), requires the refcount to be accounted on the tail pages and 403requests to split pinned huge page: it expects page count to be equal to
377not only in the head pages, if we want to be able to run 404sum of mapcount of all sub-pages plus one (split_huge_page caller must
378split_huge_page while there are gup pins established on any tail 405have reference for head page).
379page. Failure to be able to run split_huge_page if there's any gup pin 406
380on any tail page, would mean having to split all hugepages upfront in 407split_huge_page uses migration entries to stabilize page->_count and
381get_user_pages which is unacceptable as too many gup users are 408page->_mapcount.
382performance critical and they must work natively on hugepages like 409
383they work natively on hugetlbfs already (hugetlbfs is simpler because 410We safe against physical memory scanners too: the only legitimate way
384hugetlbfs pages cannot be split so there wouldn't be requirement of 411scanner can get reference to a page is get_page_unless_zero().
385accounting the pins on the tail pages for hugetlbfs). If we wouldn't 412
386account the gup refcounts on the tail pages during gup, we won't know 413All tail pages has zero ->_count until atomic_add(). It prevent scanner
387anymore which tail page is pinned by gup and which is not while we run 414from geting reference to tail page up to the point. After the atomic_add()
388split_huge_page. But we still have to add the gup pin to the head page 415we don't care about ->_count value. We already known how many references
389too, to know when we can free the compound page in case it's never 416with should uncharge from head page.
390split during its lifetime. That requires changing not just 417
391get_page, but put_page as well so that when put_page runs on a tail 418For head page get_page_unless_zero() will succeed and we don't mind. It's
392page (and only on a tail page) it will find its respective head page, 419clear where reference should go after split: it will stay on head page.
393and then it will decrease the head page refcount in addition to the 420
394tail page refcount. To obtain a head page reliably and to decrease its 421Note that split_huge_pmd() doesn't have any limitation on refcounting:
395refcount without race conditions, put_page has to serialize against 422pmd can be split at any point and never fails.
396__split_huge_page_refcount using a special per-page lock called 423
397compound_lock. 424== Partial unmap and deferred_split_huge_page() ==
425
 426Unmapping part of a THP (with munmap() or another way) is not going to free
 427memory immediately. Instead, we detect that a subpage of the THP is not in use
428in page_remove_rmap() and queue the THP for splitting if memory pressure
429comes. Splitting will free up unused subpages.
430
431Splitting the page right away is not an option due to locking context in
 432the place where we can detect partial unmap. It also might be
 433counterproductive since in many cases partial unmap happens during
 434exit(2) if a THP crosses a VMA boundary.
435
436Function deferred_split_huge_page() is used to queue page for splitting.
437The splitting itself will happen when we get memory pressure via shrinker
438interface.
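For reference, the pagetable-walk pattern described in the updated "Graceful fallback" and "Locking in hugepage aware code" sections above can be sketched roughly as follows. This is illustrative only, assuming the 4.5-era three-level pgd/pud/pmd walk and a caller that already holds mmap_sem; the function name walk_one_address() is made up for the example, while pmd_trans_huge(), pmd_lock() and split_huge_pmd() are the helpers the document itself refers to.

	static void walk_one_address(struct vm_area_struct *vma, unsigned long addr)
	{
		struct mm_struct *mm = vma->vm_mm;
		spinlock_t *ptl;
		pgd_t *pgd;
		pud_t *pud;
		pmd_t *pmd;

		pgd = pgd_offset(mm, addr);
		if (pgd_none_or_clear_bad(pgd))
			return;
		pud = pud_offset(pgd, addr);
		if (pud_none_or_clear_bad(pud))
			return;
		pmd = pmd_offset(pud, addr);

		if (pmd_trans_huge(*pmd)) {
			ptl = pmd_lock(mm, pmd);
			if (pmd_trans_huge(*pmd)) {
				/* handle the huge pmd natively here ... */
				spin_unlock(ptl);
				return;
			}
			/* raced with a split: fall through to the pte path */
			spin_unlock(ptl);
		}

		/* graceful fallback: split the pmd, then work on regular ptes */
		split_huge_pmd(vma, pmd, addr);
		if (pmd_none_or_clear_bad(pmd))
			return;
		/* ... walk the pte table, e.g. under pte_offset_map_lock() ... */
	}

split_huge_page(page), by contrast, can fail when the page is pinned (see the hunk above), so code that splits whole pages rather than pmds must check its return value.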
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index f2f949671798..ab336c06153e 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -47,8 +47,10 @@
47#define MADV_WILLNEED 3 /* will need these pages */ 47#define MADV_WILLNEED 3 /* will need these pages */
48#define MADV_SPACEAVAIL 5 /* ensure resources are available */ 48#define MADV_SPACEAVAIL 5 /* ensure resources are available */
49#define MADV_DONTNEED 6 /* don't need these pages */ 49#define MADV_DONTNEED 6 /* don't need these pages */
50#define MADV_FREE 7 /* free pages only if memory pressure */
50 51
51/* common/generic parameters */ 52/* common/generic parameters */
53#define MADV_FREE 8 /* free pages only if memory pressure */
52#define MADV_REMOVE 9 /* remove these pages & resources */ 54#define MADV_REMOVE 9 /* remove these pages & resources */
53#define MADV_DONTFORK 10 /* don't inherit across fork */ 55#define MADV_DONTFORK 10 /* don't inherit across fork */
54#define MADV_DOFORK 11 /* do inherit across fork */ 56#define MADV_DOFORK 11 /* do inherit across fork */
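The MADV_FREE definitions being added to the per-arch mman.h headers (and to asm-generic/mman-common.h elsewhere in this merge) are consumed through the ordinary madvise(2) interface. A minimal, hypothetical userspace sketch, assuming a libc that already exposes MADV_FREE (otherwise the value comes from the uapi header) and 4 KiB pages:

	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 64 * 4096;
		char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;

		memset(buf, 0xaa, len);	/* touch the pages */

		/*
		 * Done with the contents: MADV_FREE lets the kernel reclaim
		 * these pages lazily under memory pressure instead of freeing
		 * them right away (kernels without MADV_FREE return EINVAL).
		 */
		if (madvise(buf, len, MADV_FREE) != 0)
			return 1;

		return 0;
	}

Unlike MADV_DONTNEED, the data stays intact until the kernel actually needs the memory, and a later write to a page takes it back out of the lazy-free state.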
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 6312f607932f..76dde9db7934 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -73,9 +73,6 @@ config STACKTRACE_SUPPORT
73 def_bool y 73 def_bool y
74 select STACKTRACE 74 select STACKTRACE
75 75
76config HAVE_LATENCYTOP_SUPPORT
77 def_bool y
78
79config HAVE_ARCH_TRANSPARENT_HUGEPAGE 76config HAVE_ARCH_TRANSPARENT_HUGEPAGE
80 def_bool y 77 def_bool y
81 depends on ARC_MMU_V4 78 depends on ARC_MMU_V4
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index ff7ff6cbb811..b65f797e9ad6 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -617,7 +617,7 @@ void flush_dcache_page(struct page *page)
617 */ 617 */
618 if (!mapping_mapped(mapping)) { 618 if (!mapping_mapped(mapping)) {
619 clear_bit(PG_dc_clean, &page->flags); 619 clear_bit(PG_dc_clean, &page->flags);
620 } else if (page_mapped(page)) { 620 } else if (page_mapcount(page)) {
621 621
622 /* kernel reading from page with U-mapping */ 622 /* kernel reading from page with U-mapping */
623 phys_addr_t paddr = (unsigned long)page_address(page); 623 phys_addr_t paddr = (unsigned long)page_address(page);
@@ -857,7 +857,7 @@ void copy_user_highpage(struct page *to, struct page *from,
857 * For !VIPT cache, all of this gets compiled out as 857 * For !VIPT cache, all of this gets compiled out as
858 * addr_not_cache_congruent() is 0 858 * addr_not_cache_congruent() is 0
859 */ 859 */
860 if (page_mapped(from) && addr_not_cache_congruent(kfrom, u_vaddr)) { 860 if (page_mapcount(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
861 __flush_dcache_page((unsigned long)kfrom, u_vaddr); 861 __flush_dcache_page((unsigned long)kfrom, u_vaddr);
862 clean_src_k_mappings = 1; 862 clean_src_k_mappings = 1;
863 } 863 }
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4e489cc5c45e..6a889afa6a2c 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -168,11 +168,6 @@ config STACKTRACE_SUPPORT
168 bool 168 bool
169 default y 169 default y
170 170
171config HAVE_LATENCYTOP_SUPPORT
172 bool
173 depends on !SMP
174 default y
175
176config LOCKDEP_SUPPORT 171config LOCKDEP_SUPPORT
177 bool 172 bool
178 default y 173 default y
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 9203c21b4673..a520b7987a29 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -182,7 +182,8 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
182 return (vcpu->arch.cp15[c1_SCTLR] & 0b101) == 0b101; 182 return (vcpu->arch.cp15[c1_SCTLR] & 0b101) == 0b101;
183} 183}
184 184
185static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn, 185static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
186 kvm_pfn_t pfn,
186 unsigned long size, 187 unsigned long size,
187 bool ipa_uncached) 188 bool ipa_uncached)
188{ 189{
@@ -246,7 +247,7 @@ static inline void __kvm_flush_dcache_pte(pte_t pte)
246static inline void __kvm_flush_dcache_pmd(pmd_t pmd) 247static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
247{ 248{
248 unsigned long size = PMD_SIZE; 249 unsigned long size = PMD_SIZE;
249 pfn_t pfn = pmd_pfn(pmd); 250 kvm_pfn_t pfn = pmd_pfn(pmd);
250 251
251 while (size) { 252 while (size) {
252 void *va = kmap_atomic_pfn(pfn); 253 void *va = kmap_atomic_pfn(pfn);
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index a745a2a53853..dc46398bc3a5 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -88,7 +88,6 @@
88 88
89#define L_PMD_SECT_VALID (_AT(pmdval_t, 1) << 0) 89#define L_PMD_SECT_VALID (_AT(pmdval_t, 1) << 0)
90#define L_PMD_SECT_DIRTY (_AT(pmdval_t, 1) << 55) 90#define L_PMD_SECT_DIRTY (_AT(pmdval_t, 1) << 55)
91#define L_PMD_SECT_SPLITTING (_AT(pmdval_t, 1) << 56)
92#define L_PMD_SECT_NONE (_AT(pmdval_t, 1) << 57) 91#define L_PMD_SECT_NONE (_AT(pmdval_t, 1) << 57)
93#define L_PMD_SECT_RDONLY (_AT(pteval_t, 1) << 58) 92#define L_PMD_SECT_RDONLY (_AT(pteval_t, 1) << 58)
94 93
@@ -232,13 +231,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
232 231
233#ifdef CONFIG_TRANSPARENT_HUGEPAGE 232#ifdef CONFIG_TRANSPARENT_HUGEPAGE
234#define pmd_trans_huge(pmd) (pmd_val(pmd) && !pmd_table(pmd)) 233#define pmd_trans_huge(pmd) (pmd_val(pmd) && !pmd_table(pmd))
235#define pmd_trans_splitting(pmd) (pmd_isset((pmd), L_PMD_SECT_SPLITTING))
236
237#ifdef CONFIG_HAVE_RCU_TABLE_FREE
238#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
239void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
240 pmd_t *pmdp);
241#endif
242#endif 234#endif
243 235
244#define PMD_BIT_FUNC(fn,op) \ 236#define PMD_BIT_FUNC(fn,op) \
@@ -246,9 +238,9 @@ static inline pmd_t pmd_##fn(pmd_t pmd) { pmd_val(pmd) op; return pmd; }
246 238
247PMD_BIT_FUNC(wrprotect, |= L_PMD_SECT_RDONLY); 239PMD_BIT_FUNC(wrprotect, |= L_PMD_SECT_RDONLY);
248PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF); 240PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF);
249PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING);
250PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY); 241PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY);
251PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY); 242PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY);
243PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY);
252PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); 244PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
253 245
254#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) 246#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 22f7fa0124ec..aba61fd3697a 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -992,9 +992,9 @@ out:
992 return ret; 992 return ret;
993} 993}
994 994
995static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) 995static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
996{ 996{
997 pfn_t pfn = *pfnp; 997 kvm_pfn_t pfn = *pfnp;
998 gfn_t gfn = *ipap >> PAGE_SHIFT; 998 gfn_t gfn = *ipap >> PAGE_SHIFT;
999 999
1000 if (PageTransCompound(pfn_to_page(pfn))) { 1000 if (PageTransCompound(pfn_to_page(pfn))) {
@@ -1201,7 +1201,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1201 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); 1201 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1202} 1202}
1203 1203
1204static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn, 1204static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
1205 unsigned long size, bool uncached) 1205 unsigned long size, bool uncached)
1206{ 1206{
1207 __coherent_cache_guest_page(vcpu, pfn, size, uncached); 1207 __coherent_cache_guest_page(vcpu, pfn, size, uncached);
@@ -1218,7 +1218,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1218 struct kvm *kvm = vcpu->kvm; 1218 struct kvm *kvm = vcpu->kvm;
1219 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; 1219 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1220 struct vm_area_struct *vma; 1220 struct vm_area_struct *vma;
1221 pfn_t pfn; 1221 kvm_pfn_t pfn;
1222 pgprot_t mem_type = PAGE_S2; 1222 pgprot_t mem_type = PAGE_S2;
1223 bool fault_ipa_uncached; 1223 bool fault_ipa_uncached;
1224 bool logging_active = memslot_is_logging(memslot); 1224 bool logging_active = memslot_is_logging(memslot);
@@ -1346,7 +1346,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1346{ 1346{
1347 pmd_t *pmd; 1347 pmd_t *pmd;
1348 pte_t *pte; 1348 pte_t *pte;
1349 pfn_t pfn; 1349 kvm_pfn_t pfn;
1350 bool pfn_valid = false; 1350 bool pfn_valid = false;
1351 1351
1352 trace_kvm_access_fault(fault_ipa); 1352 trace_kvm_access_fault(fault_ipa);
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 588bbc288396..6bd1089b07e0 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -52,14 +52,13 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
52 * 52 *
53 * Lock the page table for the destination and check 53 * Lock the page table for the destination and check
54 * to see that it's still huge and whether or not we will 54 * to see that it's still huge and whether or not we will
55 * need to fault on write, or if we have a splitting THP. 55 * need to fault on write.
56 */ 56 */
57 if (unlikely(pmd_thp_or_huge(*pmd))) { 57 if (unlikely(pmd_thp_or_huge(*pmd))) {
58 ptl = &current->mm->page_table_lock; 58 ptl = &current->mm->page_table_lock;
59 spin_lock(ptl); 59 spin_lock(ptl);
60 if (unlikely(!pmd_thp_or_huge(*pmd) 60 if (unlikely(!pmd_thp_or_huge(*pmd)
61 || pmd_hugewillfault(*pmd) 61 || pmd_hugewillfault(*pmd))) {
62 || pmd_trans_splitting(*pmd))) {
63 spin_unlock(ptl); 62 spin_unlock(ptl);
64 return 0; 63 return 0;
65 } 64 }
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 1ec8e7590fc6..d0ba3551d49a 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -330,7 +330,7 @@ void flush_dcache_page(struct page *page)
330 mapping = page_mapping(page); 330 mapping = page_mapping(page);
331 331
332 if (!cache_ops_need_broadcast() && 332 if (!cache_ops_need_broadcast() &&
333 mapping && !page_mapped(page)) 333 mapping && !page_mapcount(page))
334 clear_bit(PG_dcache_clean, &page->flags); 334 clear_bit(PG_dcache_clean, &page->flags);
335 else { 335 else {
336 __flush_dcache_page(mapping, page); 336 __flush_dcache_page(mapping, page);
@@ -415,18 +415,3 @@ void __flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned l
415 */ 415 */
416 __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE); 416 __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
417} 417}
418
419#ifdef CONFIG_TRANSPARENT_HUGEPAGE
420#ifdef CONFIG_HAVE_RCU_TABLE_FREE
421void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
422 pmd_t *pmdp)
423{
424 pmd_t pmd = pmd_mksplitting(*pmdp);
425 VM_BUG_ON(address & ~PMD_MASK);
426 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
427
428 /* dummy IPI to serialise against fast_gup */
429 kick_all_cpus_sync();
430}
431#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
432#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 0bf8b4320a91..736433912a1e 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -230,7 +230,8 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
230 return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; 230 return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
231} 231}
232 232
233static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn, 233static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
234 kvm_pfn_t pfn,
234 unsigned long size, 235 unsigned long size,
235 bool ipa_uncached) 236 bool ipa_uncached)
236{ 237{
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 69d2e2f86bce..2d545d7aa80b 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -353,21 +353,14 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot)
353 353
354#ifdef CONFIG_TRANSPARENT_HUGEPAGE 354#ifdef CONFIG_TRANSPARENT_HUGEPAGE
355#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT)) 355#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
356#define pmd_trans_splitting(pmd) pte_special(pmd_pte(pmd))
357#ifdef CONFIG_HAVE_RCU_TABLE_FREE
358#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
359struct vm_area_struct;
360void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
361 pmd_t *pmdp);
362#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
363#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 356#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
364 357
365#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) 358#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
366#define pmd_young(pmd) pte_young(pmd_pte(pmd)) 359#define pmd_young(pmd) pte_young(pmd_pte(pmd))
367#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) 360#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
368#define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd)))
369#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) 361#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
370#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) 362#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
363#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
371#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) 364#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
372#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) 365#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
373#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK)) 366#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index 46649d6e6c5a..60585bde1264 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -102,19 +102,3 @@ EXPORT_SYMBOL(flush_dcache_page);
102 * Additional functions defined in assembly. 102 * Additional functions defined in assembly.
103 */ 103 */
104EXPORT_SYMBOL(flush_icache_range); 104EXPORT_SYMBOL(flush_icache_range);
105
106#ifdef CONFIG_TRANSPARENT_HUGEPAGE
107#ifdef CONFIG_HAVE_RCU_TABLE_FREE
108void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
109 pmd_t *pmdp)
110{
111 pmd_t pmd = pmd_mksplitting(*pmdp);
112
113 VM_BUG_ON(address & ~PMD_MASK);
114 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
115
116 /* dummy IPI to serialise against fast_gup */
117 kick_all_cpus_sync();
118}
119#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
120#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/avr32/include/asm/page.h b/arch/avr32/include/asm/page.h
index f805d1cb11bc..c5d2a3e2c62f 100644
--- a/arch/avr32/include/asm/page.h
+++ b/arch/avr32/include/asm/page.h
@@ -83,11 +83,9 @@ static inline int get_order(unsigned long size)
83 83
84#ifndef CONFIG_NEED_MULTIPLE_NODES 84#ifndef CONFIG_NEED_MULTIPLE_NODES
85 85
86#define PHYS_PFN_OFFSET (CONFIG_PHYS_OFFSET >> PAGE_SHIFT) 86#define ARCH_PFN_OFFSET (CONFIG_PHYS_OFFSET >> PAGE_SHIFT)
87 87
88#define pfn_to_page(pfn) (mem_map + ((pfn) - PHYS_PFN_OFFSET)) 88#define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (ARCH_PFN_OFFSET + max_mapnr))
89#define page_to_pfn(page) ((unsigned long)((page) - mem_map) + PHYS_PFN_OFFSET)
90#define pfn_valid(pfn) ((pfn) >= PHYS_PFN_OFFSET && (pfn) < (PHYS_PFN_OFFSET + max_mapnr))
91#endif /* CONFIG_NEED_MULTIPLE_NODES */ 89#endif /* CONFIG_NEED_MULTIPLE_NODES */
92 90
93#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) 91#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
@@ -101,4 +99,6 @@ static inline int get_order(unsigned long size)
101 */ 99 */
102#define HIGHMEM_START 0x20000000UL 100#define HIGHMEM_START 0x20000000UL
103 101
102#include <asm-generic/memory_model.h>
103
104#endif /* __ASM_AVR32_PAGE_H */ 104#endif /* __ASM_AVR32_PAGE_H */
diff --git a/arch/frv/include/asm/page.h b/arch/frv/include/asm/page.h
index 8c97068ac8fc..688d8076a43a 100644
--- a/arch/frv/include/asm/page.h
+++ b/arch/frv/include/asm/page.h
@@ -34,7 +34,7 @@ typedef struct page *pgtable_t;
34#define pgprot_val(x) ((x).pgprot) 34#define pgprot_val(x) ((x).pgprot)
35 35
36#define __pte(x) ((pte_t) { (x) } ) 36#define __pte(x) ((pte_t) { (x) } )
37#define __pmd(x) ((pmd_t) { (x) } ) 37#define __pmd(x) ((pmd_t) { { (x) } } )
38#define __pud(x) ((pud_t) { (x) } ) 38#define __pud(x) ((pud_t) { (x) } )
39#define __pgd(x) ((pgd_t) { (x) } ) 39#define __pgd(x) ((pgd_t) { (x) } )
40#define __pgprot(x) ((pgprot_t) { (x) } ) 40#define __pgprot(x) ((pgprot_t) { (x) } )
diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h
index ec48bb9f95e1..e8c486ef0d76 100644
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -105,6 +105,7 @@ extern struct page *vmem_map;
105#ifdef CONFIG_DISCONTIGMEM 105#ifdef CONFIG_DISCONTIGMEM
106# define page_to_pfn(page) ((unsigned long) (page - vmem_map)) 106# define page_to_pfn(page) ((unsigned long) (page - vmem_map))
107# define pfn_to_page(pfn) (vmem_map + (pfn)) 107# define pfn_to_page(pfn) (vmem_map + (pfn))
108# define __pfn_to_phys(pfn) PFN_PHYS(pfn)
108#else 109#else
109# include <asm-generic/memory_model.h> 110# include <asm-generic/memory_model.h>
110#endif 111#endif
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index 0b389a81c43a..a0fa88da3e31 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -36,9 +36,6 @@ config STACKTRACE_SUPPORT
36config LOCKDEP_SUPPORT 36config LOCKDEP_SUPPORT
37 def_bool y 37 def_bool y
38 38
39config HAVE_LATENCYTOP_SUPPORT
40 def_bool y
41
42config RWSEM_GENERIC_SPINLOCK 39config RWSEM_GENERIC_SPINLOCK
43 def_bool y 40 def_bool y
44 41
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 0bce820428fc..5ecd0287a874 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -67,9 +67,6 @@ config STACKTRACE_SUPPORT
67config LOCKDEP_SUPPORT 67config LOCKDEP_SUPPORT
68 def_bool y 68 def_bool y
69 69
70config HAVE_LATENCYTOP_SUPPORT
71 def_bool y
72
73source "init/Kconfig" 70source "init/Kconfig"
74 71
75source "kernel/Kconfig.freezer" 72source "kernel/Kconfig.freezer"
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 6ded8d347af9..7c191443c7ea 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -101,9 +101,9 @@
101#define CAUSEF_DC (_ULCAST_(1) << 27) 101#define CAUSEF_DC (_ULCAST_(1) << 27)
102 102
103extern atomic_t kvm_mips_instance; 103extern atomic_t kvm_mips_instance;
104extern pfn_t(*kvm_mips_gfn_to_pfn) (struct kvm *kvm, gfn_t gfn); 104extern kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
105extern void (*kvm_mips_release_pfn_clean) (pfn_t pfn); 105extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
106extern bool(*kvm_mips_is_error_pfn) (pfn_t pfn); 106extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
107 107
108struct kvm_vm_stat { 108struct kvm_vm_stat {
109 u32 remote_tlb_flush; 109 u32 remote_tlb_flush;
diff --git a/arch/mips/include/asm/pgtable-bits.h b/arch/mips/include/asm/pgtable-bits.h
index ff7ad91c85db..97b313882678 100644
--- a/arch/mips/include/asm/pgtable-bits.h
+++ b/arch/mips/include/asm/pgtable-bits.h
@@ -131,14 +131,12 @@
131/* Huge TLB page */ 131/* Huge TLB page */
132#define _PAGE_HUGE_SHIFT (_PAGE_MODIFIED_SHIFT + 1) 132#define _PAGE_HUGE_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
133#define _PAGE_HUGE (1 << _PAGE_HUGE_SHIFT) 133#define _PAGE_HUGE (1 << _PAGE_HUGE_SHIFT)
134#define _PAGE_SPLITTING_SHIFT (_PAGE_HUGE_SHIFT + 1)
135#define _PAGE_SPLITTING (1 << _PAGE_SPLITTING_SHIFT)
136#endif /* CONFIG_64BIT && CONFIG_MIPS_HUGE_TLB_SUPPORT */ 134#endif /* CONFIG_64BIT && CONFIG_MIPS_HUGE_TLB_SUPPORT */
137 135
138#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6) 136#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
139/* XI - page cannot be executed */ 137/* XI - page cannot be executed */
140#ifdef _PAGE_SPLITTING_SHIFT 138#ifdef _PAGE_HUGE_SHIFT
141#define _PAGE_NO_EXEC_SHIFT (_PAGE_SPLITTING_SHIFT + 1) 139#define _PAGE_NO_EXEC_SHIFT (_PAGE_HUGE_SHIFT + 1)
142#else 140#else
143#define _PAGE_NO_EXEC_SHIFT (_PAGE_MODIFIED_SHIFT + 1) 141#define _PAGE_NO_EXEC_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
144#endif 142#endif
@@ -153,8 +151,8 @@
153 151
154#if defined(_PAGE_NO_READ_SHIFT) 152#if defined(_PAGE_NO_READ_SHIFT)
155#define _PAGE_GLOBAL_SHIFT (_PAGE_NO_READ_SHIFT + 1) 153#define _PAGE_GLOBAL_SHIFT (_PAGE_NO_READ_SHIFT + 1)
156#elif defined(_PAGE_SPLITTING_SHIFT) 154#elif defined(_PAGE_HUGE_SHIFT)
157#define _PAGE_GLOBAL_SHIFT (_PAGE_SPLITTING_SHIFT + 1) 155#define _PAGE_GLOBAL_SHIFT (_PAGE_HUGE_SHIFT + 1)
158#else 156#else
159#define _PAGE_GLOBAL_SHIFT (_PAGE_MODIFIED_SHIFT + 1) 157#define _PAGE_GLOBAL_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
160#endif 158#endif
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 8957f15e21ec..6995b4a02e23 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -482,27 +482,9 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
482 return pmd; 482 return pmd;
483} 483}
484 484
485static inline int pmd_trans_splitting(pmd_t pmd)
486{
487 return !!(pmd_val(pmd) & _PAGE_SPLITTING);
488}
489
490static inline pmd_t pmd_mksplitting(pmd_t pmd)
491{
492 pmd_val(pmd) |= _PAGE_SPLITTING;
493
494 return pmd;
495}
496
497extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, 485extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
498 pmd_t *pmdp, pmd_t pmd); 486 pmd_t *pmdp, pmd_t pmd);
499 487
500#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
501/* Extern to avoid header file madness */
502extern void pmdp_splitting_flush(struct vm_area_struct *vma,
503 unsigned long address,
504 pmd_t *pmdp);
505
506#define __HAVE_ARCH_PMD_WRITE 488#define __HAVE_ARCH_PMD_WRITE
507static inline int pmd_write(pmd_t pmd) 489static inline int pmd_write(pmd_t pmd)
508{ 490{
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 97c03f468924..b0ebe59f73fd 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -73,8 +73,10 @@
73#define MADV_SEQUENTIAL 2 /* expect sequential page references */ 73#define MADV_SEQUENTIAL 2 /* expect sequential page references */
74#define MADV_WILLNEED 3 /* will need these pages */ 74#define MADV_WILLNEED 3 /* will need these pages */
75#define MADV_DONTNEED 4 /* don't need these pages */ 75#define MADV_DONTNEED 4 /* don't need these pages */
76#define MADV_FREE 5 /* free pages only if memory pressure */
76 77
77/* common parameters: try to keep these consistent across architectures */ 78/* common parameters: try to keep these consistent across architectures */
79#define MADV_FREE 8 /* free pages only if memory pressure */
78#define MADV_REMOVE 9 /* remove these pages & resources */ 80#define MADV_REMOVE 9 /* remove these pages & resources */
79#define MADV_DONTFORK 10 /* don't inherit across fork */ 81#define MADV_DONTFORK 10 /* don't inherit across fork */
80#define MADV_DOFORK 11 /* do inherit across fork */ 82#define MADV_DOFORK 11 /* do inherit across fork */
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 41b1b090f56f..1b675c7ce89f 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1525,7 +1525,7 @@ int kvm_mips_sync_icache(unsigned long va, struct kvm_vcpu *vcpu)
1525 struct kvm *kvm = vcpu->kvm; 1525 struct kvm *kvm = vcpu->kvm;
1526 unsigned long pa; 1526 unsigned long pa;
1527 gfn_t gfn; 1527 gfn_t gfn;
1528 pfn_t pfn; 1528 kvm_pfn_t pfn;
1529 1529
1530 gfn = va >> PAGE_SHIFT; 1530 gfn = va >> PAGE_SHIFT;
1531 1531
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index aed0ac2a4972..570479c03bdc 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -38,13 +38,13 @@ atomic_t kvm_mips_instance;
38EXPORT_SYMBOL(kvm_mips_instance); 38EXPORT_SYMBOL(kvm_mips_instance);
39 39
40/* These function pointers are initialized once the KVM module is loaded */ 40/* These function pointers are initialized once the KVM module is loaded */
41pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn); 41kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
42EXPORT_SYMBOL(kvm_mips_gfn_to_pfn); 42EXPORT_SYMBOL(kvm_mips_gfn_to_pfn);
43 43
44void (*kvm_mips_release_pfn_clean)(pfn_t pfn); 44void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
45EXPORT_SYMBOL(kvm_mips_release_pfn_clean); 45EXPORT_SYMBOL(kvm_mips_release_pfn_clean);
46 46
47bool (*kvm_mips_is_error_pfn)(pfn_t pfn); 47bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
48EXPORT_SYMBOL(kvm_mips_is_error_pfn); 48EXPORT_SYMBOL(kvm_mips_is_error_pfn);
49 49
50uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) 50uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
@@ -144,7 +144,7 @@ EXPORT_SYMBOL(kvm_mips_dump_guest_tlbs);
144static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) 144static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
145{ 145{
146 int srcu_idx, err = 0; 146 int srcu_idx, err = 0;
147 pfn_t pfn; 147 kvm_pfn_t pfn;
148 148
149 if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE) 149 if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
150 return 0; 150 return 0;
@@ -262,7 +262,7 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
262 struct kvm_vcpu *vcpu) 262 struct kvm_vcpu *vcpu)
263{ 263{
264 gfn_t gfn; 264 gfn_t gfn;
265 pfn_t pfn0, pfn1; 265 kvm_pfn_t pfn0, pfn1;
266 unsigned long vaddr = 0; 266 unsigned long vaddr = 0;
267 unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; 267 unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
268 int even; 268 int even;
@@ -313,7 +313,7 @@ EXPORT_SYMBOL(kvm_mips_handle_kseg0_tlb_fault);
313int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, 313int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
314 struct kvm_vcpu *vcpu) 314 struct kvm_vcpu *vcpu)
315{ 315{
316 pfn_t pfn0, pfn1; 316 kvm_pfn_t pfn0, pfn1;
317 unsigned long flags, old_entryhi = 0, vaddr = 0; 317 unsigned long flags, old_entryhi = 0, vaddr = 0;
318 unsigned long entrylo0 = 0, entrylo1 = 0; 318 unsigned long entrylo0 = 0, entrylo1 = 0;
319 319
@@ -360,7 +360,7 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
360{ 360{
361 unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; 361 unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
362 struct kvm *kvm = vcpu->kvm; 362 struct kvm *kvm = vcpu->kvm;
363 pfn_t pfn0, pfn1; 363 kvm_pfn_t pfn0, pfn1;
364 364
365 if ((tlb->tlb_hi & VPN2_MASK) == 0) { 365 if ((tlb->tlb_hi & VPN2_MASK) == 0) {
366 pfn0 = 0; 366 pfn0 = 0;
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index 5d3a25e1cfae..caac3d747a90 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -587,7 +587,8 @@ static inline void local_r4k_flush_cache_page(void *args)
587 * another ASID than the current one. 587 * another ASID than the current one.
588 */ 588 */
589 map_coherent = (cpu_has_dc_aliases && 589 map_coherent = (cpu_has_dc_aliases &&
590 page_mapped(page) && !Page_dcache_dirty(page)); 590 page_mapcount(page) &&
591 !Page_dcache_dirty(page));
591 if (map_coherent) 592 if (map_coherent)
592 vaddr = kmap_coherent(page, addr); 593 vaddr = kmap_coherent(page, addr);
593 else 594 else
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index aab218c36e0d..3f159caf6dbc 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -106,7 +106,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
106 unsigned long addr = (unsigned long) page_address(page); 106 unsigned long addr = (unsigned long) page_address(page);
107 107
108 if (pages_do_alias(addr, vmaddr)) { 108 if (pages_do_alias(addr, vmaddr)) {
109 if (page_mapped(page) && !Page_dcache_dirty(page)) { 109 if (page_mapcount(page) && !Page_dcache_dirty(page)) {
110 void *kaddr; 110 void *kaddr;
111 111
112 kaddr = kmap_coherent(page, vmaddr); 112 kaddr = kmap_coherent(page, vmaddr);
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 349995d19c7f..1afd87c999b0 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -87,8 +87,6 @@ static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end,
87 do { 87 do {
88 VM_BUG_ON(compound_head(page) != head); 88 VM_BUG_ON(compound_head(page) != head);
89 pages[*nr] = page; 89 pages[*nr] = page;
90 if (PageTail(page))
91 get_huge_page_tail(page);
92 (*nr)++; 90 (*nr)++;
93 page++; 91 page++;
94 refs++; 92 refs++;
@@ -109,18 +107,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
109 pmd_t pmd = *pmdp; 107 pmd_t pmd = *pmdp;
110 108
111 next = pmd_addr_end(addr, end); 109 next = pmd_addr_end(addr, end);
112 /* 110 if (pmd_none(pmd))
113 * The pmd_trans_splitting() check below explains why
114 * pmdp_splitting_flush has to flush the tlb, to stop
115 * this gup-fast code from running while we set the
116 * splitting bit in the pmd. Returning zero will take
117 * the slow path that will call wait_split_huge_page()
118 * if the pmd is still in splitting state. gup-fast
119 * can't because it has irq disabled and
120 * wait_split_huge_page() would never return as the
121 * tlb flush IPI wouldn't run.
122 */
123 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
124 return 0; 111 return 0;
125 if (unlikely(pmd_huge(pmd))) { 112 if (unlikely(pmd_huge(pmd))) {
126 if (!gup_huge_pmd(pmd, addr, next, write, pages,nr)) 113 if (!gup_huge_pmd(pmd, addr, next, write, pages,nr))
@@ -153,8 +140,6 @@ static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end,
153 do { 140 do {
154 VM_BUG_ON(compound_head(page) != head); 141 VM_BUG_ON(compound_head(page) != head);
155 pages[*nr] = page; 142 pages[*nr] = page;
156 if (PageTail(page))
157 get_huge_page_tail(page);
158 (*nr)++; 143 (*nr)++;
159 page++; 144 page++;
160 refs++; 145 refs++;
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 8770e619185e..7e5fa0938c21 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -165,7 +165,7 @@ void copy_user_highpage(struct page *to, struct page *from,
165 165
166 vto = kmap_atomic(to); 166 vto = kmap_atomic(to);
167 if (cpu_has_dc_aliases && 167 if (cpu_has_dc_aliases &&
168 page_mapped(from) && !Page_dcache_dirty(from)) { 168 page_mapcount(from) && !Page_dcache_dirty(from)) {
169 vfrom = kmap_coherent(from, vaddr); 169 vfrom = kmap_coherent(from, vaddr);
170 copy_page(vto, vfrom); 170 copy_page(vto, vfrom);
171 kunmap_coherent(); 171 kunmap_coherent();
@@ -187,7 +187,7 @@ void copy_to_user_page(struct vm_area_struct *vma,
187 unsigned long len) 187 unsigned long len)
188{ 188{
189 if (cpu_has_dc_aliases && 189 if (cpu_has_dc_aliases &&
190 page_mapped(page) && !Page_dcache_dirty(page)) { 190 page_mapcount(page) && !Page_dcache_dirty(page)) {
191 void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); 191 void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
192 memcpy(vto, src, len); 192 memcpy(vto, src, len);
193 kunmap_coherent(); 193 kunmap_coherent();
@@ -205,7 +205,7 @@ void copy_from_user_page(struct vm_area_struct *vma,
205 unsigned long len) 205 unsigned long len)
206{ 206{
207 if (cpu_has_dc_aliases && 207 if (cpu_has_dc_aliases &&
208 page_mapped(page) && !Page_dcache_dirty(page)) { 208 page_mapcount(page) && !Page_dcache_dirty(page)) {
209 void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); 209 void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
210 memcpy(dst, vfrom, len); 210 memcpy(dst, vfrom, len);
211 kunmap_coherent(); 211 kunmap_coherent();
diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c
index e8adc0069d66..ce4473e7c0d2 100644
--- a/arch/mips/mm/pgtable-64.c
+++ b/arch/mips/mm/pgtable-64.c
@@ -62,20 +62,6 @@ void pmd_init(unsigned long addr, unsigned long pagetable)
62} 62}
63#endif 63#endif
64 64
65#ifdef CONFIG_TRANSPARENT_HUGEPAGE
66
67void pmdp_splitting_flush(struct vm_area_struct *vma,
68 unsigned long address,
69 pmd_t *pmdp)
70{
71 if (!pmd_trans_splitting(*pmdp)) {
72 pmd_t pmd = pmd_mksplitting(*pmdp);
73 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
74 }
75}
76
77#endif
78
79pmd_t mk_pmd(struct page *page, pgprot_t prot) 65pmd_t mk_pmd(struct page *page, pgprot_t prot)
80{ 66{
81 pmd_t pmd; 67 pmd_t pmd;
diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c
index 32e0be27673f..482192cc8f2b 100644
--- a/arch/mips/mm/tlbex.c
+++ b/arch/mips/mm/tlbex.c
@@ -240,7 +240,6 @@ static void output_pgtable_bits_defines(void)
240 pr_define("_PAGE_MODIFIED_SHIFT %d\n", _PAGE_MODIFIED_SHIFT); 240 pr_define("_PAGE_MODIFIED_SHIFT %d\n", _PAGE_MODIFIED_SHIFT);
241#ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT 241#ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
242 pr_define("_PAGE_HUGE_SHIFT %d\n", _PAGE_HUGE_SHIFT); 242 pr_define("_PAGE_HUGE_SHIFT %d\n", _PAGE_HUGE_SHIFT);
243 pr_define("_PAGE_SPLITTING_SHIFT %d\n", _PAGE_SPLITTING_SHIFT);
244#endif 243#endif
245#ifdef CONFIG_CPU_MIPSR2 244#ifdef CONFIG_CPU_MIPSR2
246 if (cpu_has_rixi) { 245 if (cpu_has_rixi) {
diff --git a/arch/mn10300/include/asm/page.h b/arch/mn10300/include/asm/page.h
index 8288e124165b..3810a6f740fd 100644
--- a/arch/mn10300/include/asm/page.h
+++ b/arch/mn10300/include/asm/page.h
@@ -107,6 +107,7 @@ static inline int get_order(unsigned long size)
107#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) 107#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
108#define pfn_to_page(pfn) (mem_map + ((pfn) - __pfn_disp)) 108#define pfn_to_page(pfn) (mem_map + ((pfn) - __pfn_disp))
109#define page_to_pfn(page) ((unsigned long)((page) - mem_map) + __pfn_disp) 109#define page_to_pfn(page) ((unsigned long)((page) - mem_map) + __pfn_disp)
110#define __pfn_to_phys(pfn) PFN_PHYS(pfn)
110 111
111#define pfn_valid(pfn) \ 112#define pfn_valid(pfn) \
112({ \ 113({ \
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 729f89163bc3..7c34cafdf301 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -79,9 +79,6 @@ config TIME_LOW_RES
79 depends on SMP 79 depends on SMP
80 default y 80 default y
81 81
82config HAVE_LATENCYTOP_SUPPORT
83 def_bool y
84
85# unless you want to implement ACPI on PA-RISC ... ;-) 82# unless you want to implement ACPI on PA-RISC ... ;-)
86config PM 83config PM
87 bool 84 bool
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index dd4d1876a020..cf830d465f75 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -43,8 +43,10 @@
43#define MADV_SPACEAVAIL 5 /* insure that resources are reserved */ 43#define MADV_SPACEAVAIL 5 /* insure that resources are reserved */
44#define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */ 44#define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */
45#define MADV_VPS_INHERIT 7 /* Inherit parents page size */ 45#define MADV_VPS_INHERIT 7 /* Inherit parents page size */
46#define MADV_FREE 8 /* free pages only if memory pressure */
46 47
47/* common/generic parameters */ 48/* common/generic parameters */
49#define MADV_FREE 8 /* free pages only if memory pressure */
48#define MADV_REMOVE 9 /* remove these pages & resources */ 50#define MADV_REMOVE 9 /* remove these pages & resources */
49#define MADV_DONTFORK 10 /* don't inherit across fork */ 51#define MADV_DONTFORK 10 /* don't inherit across fork */
50#define MADV_DOFORK 11 /* do inherit across fork */ 52#define MADV_DOFORK 11 /* do inherit across fork */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7d5a8350f913..94f6c5089e0c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -47,9 +47,6 @@ config STACKTRACE_SUPPORT
47 bool 47 bool
48 default y 48 default y
49 49
50config HAVE_LATENCYTOP_SUPPORT
51 def_bool y
52
53config TRACE_IRQFLAGS_SUPPORT 50config TRACE_IRQFLAGS_SUPPORT
54 bool 51 bool
55 default y 52 default y
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 9e55e3b1fef0..849bbec80f7b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -256,13 +256,6 @@ static inline int pmd_trans_huge(pmd_t pmd)
256 (_PAGE_PTE | _PAGE_THP_HUGE)); 256 (_PAGE_PTE | _PAGE_THP_HUGE));
257} 257}
258 258
259static inline int pmd_trans_splitting(pmd_t pmd)
260{
261 if (pmd_trans_huge(pmd))
262 return pmd_val(pmd) & _PAGE_SPLITTING;
263 return 0;
264}
265
266static inline int pmd_large(pmd_t pmd) 259static inline int pmd_large(pmd_t pmd)
267{ 260{
268 return !!(pmd_val(pmd) & _PAGE_PTE); 261 return !!(pmd_val(pmd) & _PAGE_PTE);
@@ -273,11 +266,6 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
273 return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); 266 return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
274} 267}
275 268
276static inline pmd_t pmd_mksplitting(pmd_t pmd)
277{
278 return __pmd(pmd_val(pmd) | _PAGE_SPLITTING);
279}
280
281#define __HAVE_ARCH_PMD_SAME 269#define __HAVE_ARCH_PMD_SAME
282static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) 270static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
283{ 271{
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 2ff8b3df553d..06f17e778c27 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -41,11 +41,6 @@
41#endif 41#endif
42 42
43/* 43/*
44 * THP pages can't be special. So use the _PAGE_SPECIAL
45 */
46#define _PAGE_SPLITTING _PAGE_SPECIAL
47
48/*
49 * We need to differentiate between explicit huge page and THP huge 44 * We need to differentiate between explicit huge page and THP huge
50 * page, since THP huge page also need to track real subpage details 45 * page, since THP huge page also need to track real subpage details
51 */ 46 */
@@ -54,9 +49,8 @@
54/* 49/*
55 * set of bits not changed in pmd_modify. 50 * set of bits not changed in pmd_modify.
56 */ 51 */
57#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | \ 52#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
58 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \ 53 _PAGE_ACCESSED | _PAGE_THP_HUGE)
59 _PAGE_THP_HUGE | _PAGE_PTE | _PAGE_SOFT_DIRTY)
60 54
61#ifdef CONFIG_PPC_64K_PAGES 55#ifdef CONFIG_PPC_64K_PAGES
62#include <asm/book3s/64/hash-64k.h> 56#include <asm/book3s/64/hash-64k.h>
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index b3a5badab69f..8204b0c393aa 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -223,9 +223,11 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
223#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) 223#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
224#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) 224#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
225#define pmd_young(pmd) pte_young(pmd_pte(pmd)) 225#define pmd_young(pmd) pte_young(pmd_pte(pmd))
226#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
226#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) 227#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
227#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) 228#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
228#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) 229#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
230#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
229#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) 231#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
230#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) 232#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
231 233
@@ -266,10 +268,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
266extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, 268extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
267 unsigned long addr, pmd_t *pmdp); 269 unsigned long addr, pmd_t *pmdp);
268 270
269#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
270extern void pmdp_splitting_flush(struct vm_area_struct *vma,
271 unsigned long address, pmd_t *pmdp);
272
273extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, 271extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
274 unsigned long address, pmd_t *pmdp); 272 unsigned long address, pmd_t *pmdp);
275#define pmdp_collapse_flush pmdp_collapse_flush 273#define pmdp_collapse_flush pmdp_collapse_flush
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 9fac01cb89c1..8f39796c9da8 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -154,8 +154,8 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
154 bool upper, u32 val); 154 bool upper, u32 val);
155extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); 155extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
156extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); 156extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
157extern pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, 157extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa,
158 bool *writable); 158 bool writing, bool *writable);
159extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, 159extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
160 unsigned long *rmap, long pte_index, int realmode); 160 unsigned long *rmap, long pte_index, int realmode);
161extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize); 161extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize);
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index c6ef05bd0765..2241d5357129 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -515,7 +515,7 @@ void kvmppc_claim_lpid(long lpid);
515void kvmppc_free_lpid(long lpid); 515void kvmppc_free_lpid(long lpid);
516void kvmppc_init_lpid(unsigned long nr_lpids); 516void kvmppc_init_lpid(unsigned long nr_lpids);
517 517
518static inline void kvmppc_mmu_flush_icache(pfn_t pfn) 518static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn)
519{ 519{
520 struct page *page; 520 struct page *page;
521 /* 521 /*
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 099c79d8c160..638c6d9be9e0 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -366,7 +366,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
366} 366}
367EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter); 367EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter);
368 368
369pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, 369kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
370 bool *writable) 370 bool *writable)
371{ 371{
372 ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM; 372 ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM;
@@ -379,9 +379,9 @@ pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
379 gpa &= ~0xFFFULL; 379 gpa &= ~0xFFFULL;
380 if (unlikely(mp_pa) && unlikely((gpa & KVM_PAM) == mp_pa)) { 380 if (unlikely(mp_pa) && unlikely((gpa & KVM_PAM) == mp_pa)) {
381 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; 381 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
382 pfn_t pfn; 382 kvm_pfn_t pfn;
383 383
384 pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT; 384 pfn = (kvm_pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT;
385 get_page(pfn_to_page(pfn)); 385 get_page(pfn_to_page(pfn));
386 if (writable) 386 if (writable)
387 *writable = true; 387 *writable = true;
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index d5c9bfeb0c9c..55c4d51ea3e2 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -142,7 +142,7 @@ extern char etext[];
142int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, 142int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
143 bool iswrite) 143 bool iswrite)
144{ 144{
145 pfn_t hpaddr; 145 kvm_pfn_t hpaddr;
146 u64 vpn; 146 u64 vpn;
147 u64 vsid; 147 u64 vsid;
148 struct kvmppc_sid_map *map; 148 struct kvmppc_sid_map *map;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 79ad35abd196..913cd2198fa6 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -83,7 +83,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
83 bool iswrite) 83 bool iswrite)
84{ 84{
85 unsigned long vpn; 85 unsigned long vpn;
86 pfn_t hpaddr; 86 kvm_pfn_t hpaddr;
87 ulong hash, hpteg; 87 ulong hash, hpteg;
88 u64 vsid; 88 u64 vsid;
89 int ret; 89 int ret;
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 72920bed3ac6..94f04fcb373e 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -41,7 +41,7 @@ enum vcpu_ftr {
41#define E500_TLB_MAS2_ATTR (0x7f) 41#define E500_TLB_MAS2_ATTR (0x7f)
42 42
43struct tlbe_ref { 43struct tlbe_ref {
44 pfn_t pfn; /* valid only for TLB0, except briefly */ 44 kvm_pfn_t pfn; /* valid only for TLB0, except briefly */
45 unsigned int flags; /* E500_TLB_* */ 45 unsigned int flags; /* E500_TLB_* */
46}; 46};
47 47
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 34c43fff4adb..b0333cc737dd 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -163,9 +163,9 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu)
163 struct kvm_book3e_206_tlb_entry magic; 163 struct kvm_book3e_206_tlb_entry magic;
164 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; 164 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
165 unsigned int stid; 165 unsigned int stid;
166 pfn_t pfn; 166 kvm_pfn_t pfn;
167 167
168 pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT; 168 pfn = (kvm_pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
169 get_page(pfn_to_page(pfn)); 169 get_page(pfn_to_page(pfn));
170 170
171 preempt_disable(); 171 preempt_disable();
@@ -246,7 +246,7 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
246 246
247static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, 247static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
248 struct kvm_book3e_206_tlb_entry *gtlbe, 248 struct kvm_book3e_206_tlb_entry *gtlbe,
249 pfn_t pfn, unsigned int wimg) 249 kvm_pfn_t pfn, unsigned int wimg)
250{ 250{
251 ref->pfn = pfn; 251 ref->pfn = pfn;
252 ref->flags = E500_TLB_VALID; 252 ref->flags = E500_TLB_VALID;
@@ -309,7 +309,7 @@ static void kvmppc_e500_setup_stlbe(
309 int tsize, struct tlbe_ref *ref, u64 gvaddr, 309 int tsize, struct tlbe_ref *ref, u64 gvaddr,
310 struct kvm_book3e_206_tlb_entry *stlbe) 310 struct kvm_book3e_206_tlb_entry *stlbe)
311{ 311{
312 pfn_t pfn = ref->pfn; 312 kvm_pfn_t pfn = ref->pfn;
313 u32 pr = vcpu->arch.shared->msr & MSR_PR; 313 u32 pr = vcpu->arch.shared->msr & MSR_PR;
314 314
315 BUG_ON(!(ref->flags & E500_TLB_VALID)); 315 BUG_ON(!(ref->flags & E500_TLB_VALID));
diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h
index 810507cb688a..d44f324184fb 100644
--- a/arch/powerpc/kvm/trace_pr.h
+++ b/arch/powerpc/kvm/trace_pr.h
@@ -30,7 +30,7 @@ TRACE_EVENT(kvm_book3s_reenter,
30#ifdef CONFIG_PPC_BOOK3S_64 30#ifdef CONFIG_PPC_BOOK3S_64
31 31
32TRACE_EVENT(kvm_book3s_64_mmu_map, 32TRACE_EVENT(kvm_book3s_64_mmu_map,
33 TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, 33 TP_PROTO(int rflags, ulong hpteg, ulong va, kvm_pfn_t hpaddr,
34 struct kvmppc_pte *orig_pte), 34 struct kvmppc_pte *orig_pte),
35 TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), 35 TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte),
36 36
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index baf1301ded0c..49b152b0f926 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -39,9 +39,6 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
39 /* If PMD busy, retry the access */ 39 /* If PMD busy, retry the access */
40 if (unlikely(old_pmd & _PAGE_BUSY)) 40 if (unlikely(old_pmd & _PAGE_BUSY))
41 return 0; 41 return 0;
42 /* If PMD is trans splitting retry the access */
43 if (unlikely(old_pmd & _PAGE_SPLITTING))
44 return 0;
45 /* If PMD permissions don't match, take page fault */ 42 /* If PMD permissions don't match, take page fault */
46 if (unlikely(access & ~old_pmd)) 43 if (unlikely(access & ~old_pmd))
47 return 1; 44 return 1;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 61b8b7ccea4f..744e24bcb85c 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -958,10 +958,6 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
958 /* 958 /*
959 * A hugepage collapse is captured by pmd_none, because 959 * A hugepage collapse is captured by pmd_none, because
960 * it mark the pmd none and do a hpte invalidate. 960 * it mark the pmd none and do a hpte invalidate.
961 *
962 * We don't worry about pmd_trans_splitting here, The
963 * caller if it needs to handle the splitting case
964 * should check for that.
965 */ 961 */
966 if (pmd_none(pmd)) 962 if (pmd_none(pmd))
967 return NULL; 963 return NULL;
@@ -999,7 +995,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
999{ 995{
1000 unsigned long mask; 996 unsigned long mask;
1001 unsigned long pte_end; 997 unsigned long pte_end;
1002 struct page *head, *page, *tail; 998 struct page *head, *page;
1003 pte_t pte; 999 pte_t pte;
1004 int refs; 1000 int refs;
1005 1001
@@ -1022,7 +1018,6 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
1022 head = pte_page(pte); 1018 head = pte_page(pte);
1023 1019
1024 page = head + ((addr & (sz-1)) >> PAGE_SHIFT); 1020 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
1025 tail = page;
1026 do { 1021 do {
1027 VM_BUG_ON(compound_head(page) != head); 1022 VM_BUG_ON(compound_head(page) != head);
1028 pages[*nr] = page; 1023 pages[*nr] = page;
@@ -1044,15 +1039,5 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
1044 return 0; 1039 return 0;
1045 } 1040 }
1046 1041
1047 /*
1048 * Any tail page need their mapcount reference taken before we
1049 * return.
1050 */
1051 while (refs--) {
1052 if (PageTail(tail))
1053 get_huge_page_tail(tail);
1054 tail++;
1055 }
1056
1057 return 1; 1042 return 1;
1058} 1043}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index ea6bc31debb0..3124a20d0fab 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -604,55 +604,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
604} 604}
605 605
606/* 606/*
607 * We mark the pmd splitting and invalidate all the hpte
608 * entries for this hugepage.
609 */
610void pmdp_splitting_flush(struct vm_area_struct *vma,
611 unsigned long address, pmd_t *pmdp)
612{
613 unsigned long old, tmp;
614
615 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
616
617#ifdef CONFIG_DEBUG_VM
618 WARN_ON(!pmd_trans_huge(*pmdp));
619 assert_spin_locked(&vma->vm_mm->page_table_lock);
620#endif
621
622#ifdef PTE_ATOMIC_UPDATES
623
624 __asm__ __volatile__(
625 "1: ldarx %0,0,%3\n\
626 andi. %1,%0,%6\n\
627 bne- 1b \n\
628 oris %1,%0,%4@h \n\
629 stdcx. %1,0,%3 \n\
630 bne- 1b"
631 : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
632 : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
633 : "cc" );
634#else
635 old = pmd_val(*pmdp);
636 *pmdp = __pmd(old | _PAGE_SPLITTING);
637#endif
638 /*
639 * If we didn't had the splitting flag set, go and flush the
640 * HPTE entries.
641 */
642 trace_hugepage_splitting(address, old);
643 if (!(old & _PAGE_SPLITTING)) {
644 /* We need to flush the hpte */
645 if (old & _PAGE_HASHPTE)
646 hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
647 }
648 /*
649 * This ensures that generic code that rely on IRQ disabling
650 * to prevent a parallel THP split work as expected.
651 */
652 kick_all_cpus_sync();
653}
654
655/*
656 * We want to put the pgtable in pmd and use pgtable for tracking 607 * We want to put the pgtable in pmd and use pgtable for tracking
657 * the base page size hptes 608 * the base page size hptes
658 */ 609 */
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index fa9fb5b4c66c..d5543514c1df 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -135,7 +135,7 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
135 unsigned long end, struct mm_walk *walk) 135 unsigned long end, struct mm_walk *walk)
136{ 136{
137 struct vm_area_struct *vma = walk->vma; 137 struct vm_area_struct *vma = walk->vma;
138 split_huge_page_pmd(vma, addr, pmd); 138 split_huge_pmd(vma, pmd, addr);
139 return 0; 139 return 0;
140} 140}
141 141
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index c713b349d967..0d112b94d91d 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -43,6 +43,7 @@
43#include <linux/types.h> 43#include <linux/types.h>
44#include <linux/of_device.h> 44#include <linux/of_device.h>
45#include <linux/of_platform.h> 45#include <linux/of_platform.h>
46#include <linux/pfn_t.h>
46 47
47#include <asm/page.h> 48#include <asm/page.h>
48#include <asm/prom.h> 49#include <asm/prom.h>
@@ -142,15 +143,13 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
142 */ 143 */
143static long 144static long
144axon_ram_direct_access(struct block_device *device, sector_t sector, 145axon_ram_direct_access(struct block_device *device, sector_t sector,
145 void __pmem **kaddr, unsigned long *pfn) 146 void __pmem **kaddr, pfn_t *pfn)
146{ 147{
147 struct axon_ram_bank *bank = device->bd_disk->private_data; 148 struct axon_ram_bank *bank = device->bd_disk->private_data;
148 loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; 149 loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
149 void *addr = (void *)(bank->ph_addr + offset);
150
151 *kaddr = (void __pmem *)addr;
152 *pfn = virt_to_phys(addr) >> PAGE_SHIFT;
153 150
151 *kaddr = (void __pmem __force *) bank->io_addr + offset;
152 *pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV);
154 return bank->size - offset; 153 return bank->size - offset;
155} 154}
156 155
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 24490344c30f..dbeeb3a049f2 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -10,9 +10,6 @@ config LOCKDEP_SUPPORT
10config STACKTRACE_SUPPORT 10config STACKTRACE_SUPPORT
11 def_bool y 11 def_bool y
12 12
13config HAVE_LATENCYTOP_SUPPORT
14 def_bool y
15
16config RWSEM_GENERIC_SPINLOCK 13config RWSEM_GENERIC_SPINLOCK
17 bool 14 bool
18 15
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 024f85f947ae..64ead8091248 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -286,7 +286,6 @@ static inline int is_module_addr(void *addr)
286 286
287#define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ 287#define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */
288#define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */ 288#define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */
289#define _SEGMENT_ENTRY_SPLIT 0x0800 /* THP splitting bit */
290#define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */ 289#define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */
291#define _SEGMENT_ENTRY_READ 0x0002 /* SW segment read bit */ 290#define _SEGMENT_ENTRY_READ 0x0002 /* SW segment read bit */
292#define _SEGMENT_ENTRY_WRITE 0x0001 /* SW segment write bit */ 291#define _SEGMENT_ENTRY_WRITE 0x0001 /* SW segment write bit */
@@ -318,8 +317,6 @@ static inline int is_module_addr(void *addr)
318 * SW-bits: y young, d dirty, r read, w write 317 * SW-bits: y young, d dirty, r read, w write
319 */ 318 */
320 319
321#define _SEGMENT_ENTRY_SPLIT_BIT 11 /* THP splitting bit number */
322
323/* Page status table bits for virtualization */ 320/* Page status table bits for virtualization */
324#define PGSTE_ACC_BITS 0xf000000000000000UL 321#define PGSTE_ACC_BITS 0xf000000000000000UL
325#define PGSTE_FP_BIT 0x0800000000000000UL 322#define PGSTE_FP_BIT 0x0800000000000000UL
@@ -523,10 +520,6 @@ static inline int pmd_bad(pmd_t pmd)
523 return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0; 520 return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
524} 521}
525 522
526#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
527extern void pmdp_splitting_flush(struct vm_area_struct *vma,
528 unsigned long addr, pmd_t *pmdp);
529
530#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS 523#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
531extern int pmdp_set_access_flags(struct vm_area_struct *vma, 524extern int pmdp_set_access_flags(struct vm_area_struct *vma,
532 unsigned long address, pmd_t *pmdp, 525 unsigned long address, pmd_t *pmdp,
@@ -1424,8 +1417,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
1424 if (pmd_large(pmd)) { 1417 if (pmd_large(pmd)) {
1425 pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE | 1418 pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE |
1426 _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG | 1419 _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG |
1427 _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SPLIT | 1420 _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY;
1428 _SEGMENT_ENTRY_SOFT_DIRTY;
1429 pmd_val(pmd) |= massage_pgprot_pmd(newprot); 1421 pmd_val(pmd) |= massage_pgprot_pmd(newprot);
1430 if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY)) 1422 if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY))
1431 pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT; 1423 pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
@@ -1533,12 +1525,6 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1533#define __HAVE_ARCH_PGTABLE_WITHDRAW 1525#define __HAVE_ARCH_PGTABLE_WITHDRAW
1534extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); 1526extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
1535 1527
1536static inline int pmd_trans_splitting(pmd_t pmd)
1537{
1538 return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) &&
1539 (pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT);
1540}
1541
1542static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, 1528static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
1543 pmd_t *pmdp, pmd_t entry) 1529 pmd_t *pmdp, pmd_t entry)
1544{ 1530{
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index 21c74a71e2ab..13dab0c1645c 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -55,7 +55,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
55 unsigned long end, int write, struct page **pages, int *nr) 55 unsigned long end, int write, struct page **pages, int *nr)
56{ 56{
57 unsigned long mask, result; 57 unsigned long mask, result;
58 struct page *head, *page, *tail; 58 struct page *head, *page;
59 int refs; 59 int refs;
60 60
61 result = write ? 0 : _SEGMENT_ENTRY_PROTECT; 61 result = write ? 0 : _SEGMENT_ENTRY_PROTECT;
@@ -67,7 +67,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
67 refs = 0; 67 refs = 0;
68 head = pmd_page(pmd); 68 head = pmd_page(pmd);
69 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 69 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
70 tail = page;
71 do { 70 do {
72 VM_BUG_ON(compound_head(page) != head); 71 VM_BUG_ON(compound_head(page) != head);
73 pages[*nr] = page; 72 pages[*nr] = page;
@@ -88,16 +87,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
88 return 0; 87 return 0;
89 } 88 }
90 89
91 /*
92 * Any tail page need their mapcount reference taken before we
93 * return.
94 */
95 while (refs--) {
96 if (PageTail(tail))
97 get_huge_page_tail(tail);
98 tail++;
99 }
100
101 return 1; 90 return 1;
102} 91}
103 92
@@ -116,16 +105,7 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
116 pmd = *pmdp; 105 pmd = *pmdp;
117 barrier(); 106 barrier();
118 next = pmd_addr_end(addr, end); 107 next = pmd_addr_end(addr, end);
119 /* 108 if (pmd_none(pmd))
120 * The pmd_trans_splitting() check below explains why
121 * pmdp_splitting_flush() has to serialize with
122 * smp_call_function() against our disabled IRQs, to stop
123 * this gup-fast code from running while we set the
124 * splitting bit in the pmd. Returning zero will take
125 * the slow path that will call wait_split_huge_page()
126 * if the pmd is still in splitting state.
127 */
128 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
129 return 0; 109 return 0;
130 if (unlikely(pmd_large(pmd))) { 110 if (unlikely(pmd_large(pmd))) {
131 /* 111 /*
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index aa34af0a0b26..a809fa8e6f8b 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -578,17 +578,29 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr,
578{ 578{
579 unsigned long vmaddr; 579 unsigned long vmaddr;
580 int rc; 580 int rc;
581 bool unlocked;
581 582
582 down_read(&gmap->mm->mmap_sem); 583 down_read(&gmap->mm->mmap_sem);
584
585retry:
586 unlocked = false;
583 vmaddr = __gmap_translate(gmap, gaddr); 587 vmaddr = __gmap_translate(gmap, gaddr);
584 if (IS_ERR_VALUE(vmaddr)) { 588 if (IS_ERR_VALUE(vmaddr)) {
585 rc = vmaddr; 589 rc = vmaddr;
586 goto out_up; 590 goto out_up;
587 } 591 }
588 if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) { 592 if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
593 &unlocked)) {
589 rc = -EFAULT; 594 rc = -EFAULT;
590 goto out_up; 595 goto out_up;
591 } 596 }
597 /*
598 * In the case that fixup_user_fault unlocked the mmap_sem during
599 * faultin redo __gmap_translate to not race with a map/unmap_segment.
600 */
601 if (unlocked)
602 goto retry;
603
592 rc = __gmap_link(gmap, gaddr, vmaddr); 604 rc = __gmap_link(gmap, gaddr, vmaddr);
593out_up: 605out_up:
594 up_read(&gmap->mm->mmap_sem); 606 up_read(&gmap->mm->mmap_sem);
@@ -714,12 +726,14 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
714 spinlock_t *ptl; 726 spinlock_t *ptl;
715 pte_t *ptep, entry; 727 pte_t *ptep, entry;
716 pgste_t pgste; 728 pgste_t pgste;
729 bool unlocked;
717 int rc = 0; 730 int rc = 0;
718 731
719 if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) 732 if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
720 return -EINVAL; 733 return -EINVAL;
721 down_read(&gmap->mm->mmap_sem); 734 down_read(&gmap->mm->mmap_sem);
722 while (len) { 735 while (len) {
736 unlocked = false;
723 /* Convert gmap address and connect the page tables */ 737 /* Convert gmap address and connect the page tables */
724 addr = __gmap_translate(gmap, gaddr); 738 addr = __gmap_translate(gmap, gaddr);
725 if (IS_ERR_VALUE(addr)) { 739 if (IS_ERR_VALUE(addr)) {
@@ -727,10 +741,14 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
727 break; 741 break;
728 } 742 }
729 /* Get the page mapped */ 743 /* Get the page mapped */
730 if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) { 744 if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
745 &unlocked)) {
731 rc = -EFAULT; 746 rc = -EFAULT;
732 break; 747 break;
733 } 748 }
749 /* While trying to map mmap_sem got unlocked. Let us retry */
750 if (unlocked)
751 continue;
734 rc = __gmap_link(gmap, gaddr, addr); 752 rc = __gmap_link(gmap, gaddr, addr);
735 if (rc) 753 if (rc)
736 break; 754 break;
@@ -791,9 +809,11 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
791 spinlock_t *ptl; 809 spinlock_t *ptl;
792 pgste_t old, new; 810 pgste_t old, new;
793 pte_t *ptep; 811 pte_t *ptep;
812 bool unlocked;
794 813
795 down_read(&mm->mmap_sem); 814 down_read(&mm->mmap_sem);
796retry: 815retry:
816 unlocked = false;
797 ptep = get_locked_pte(mm, addr, &ptl); 817 ptep = get_locked_pte(mm, addr, &ptl);
798 if (unlikely(!ptep)) { 818 if (unlikely(!ptep)) {
799 up_read(&mm->mmap_sem); 819 up_read(&mm->mmap_sem);
@@ -802,7 +822,12 @@ retry:
802 if (!(pte_val(*ptep) & _PAGE_INVALID) && 822 if (!(pte_val(*ptep) & _PAGE_INVALID) &&
803 (pte_val(*ptep) & _PAGE_PROTECT)) { 823 (pte_val(*ptep) & _PAGE_PROTECT)) {
804 pte_unmap_unlock(ptep, ptl); 824 pte_unmap_unlock(ptep, ptl);
805 if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) { 825 /*
826 * We do not really care about unlocked. We will retry either
827 * way. But this allows fixup_user_fault to enable userfaultfd.
828 */
829 if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE,
830 &unlocked)) {
806 up_read(&mm->mmap_sem); 831 up_read(&mm->mmap_sem);
807 return -EFAULT; 832 return -EFAULT;
808 } 833 }
@@ -1305,22 +1330,6 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
1305 return 1; 1330 return 1;
1306} 1331}
1307 1332
1308static void pmdp_splitting_flush_sync(void *arg)
1309{
1310 /* Simply deliver the interrupt */
1311}
1312
1313void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
1314 pmd_t *pmdp)
1315{
1316 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1317 if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
1318 (unsigned long *) pmdp)) {
1319 /* need to serialize against gup-fast (IRQ disabled) */
1320 smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
1321 }
1322}
1323
1324void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 1333void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1325 pgtable_t pgtable) 1334 pgtable_t pgtable)
1326{ 1335{
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index d514df7e04dd..6c391a5d3e5c 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -130,9 +130,6 @@ config STACKTRACE_SUPPORT
130config LOCKDEP_SUPPORT 130config LOCKDEP_SUPPORT
131 def_bool y 131 def_bool y
132 132
133config HAVE_LATENCYTOP_SUPPORT
134 def_bool y
135
136config ARCH_HAS_ILOG2_U32 133config ARCH_HAS_ILOG2_U32
137 def_bool n 134 def_bool n
138 135
diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c
index 51d8f7f31d1d..58aaa4f33b81 100644
--- a/arch/sh/mm/cache-sh4.c
+++ b/arch/sh/mm/cache-sh4.c
@@ -241,7 +241,7 @@ static void sh4_flush_cache_page(void *args)
241 */ 241 */
242 map_coherent = (current_cpu_data.dcache.n_aliases && 242 map_coherent = (current_cpu_data.dcache.n_aliases &&
243 test_bit(PG_dcache_clean, &page->flags) && 243 test_bit(PG_dcache_clean, &page->flags) &&
244 page_mapped(page)); 244 page_mapcount(page));
245 if (map_coherent) 245 if (map_coherent)
246 vaddr = kmap_coherent(page, address); 246 vaddr = kmap_coherent(page, address);
247 else 247 else
diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c
index f770e3992620..e58cfbf45150 100644
--- a/arch/sh/mm/cache.c
+++ b/arch/sh/mm/cache.c
@@ -59,7 +59,7 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
59 unsigned long vaddr, void *dst, const void *src, 59 unsigned long vaddr, void *dst, const void *src,
60 unsigned long len) 60 unsigned long len)
61{ 61{
62 if (boot_cpu_data.dcache.n_aliases && page_mapped(page) && 62 if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
63 test_bit(PG_dcache_clean, &page->flags)) { 63 test_bit(PG_dcache_clean, &page->flags)) {
64 void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); 64 void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
65 memcpy(vto, src, len); 65 memcpy(vto, src, len);
@@ -78,7 +78,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
78 unsigned long vaddr, void *dst, const void *src, 78 unsigned long vaddr, void *dst, const void *src,
79 unsigned long len) 79 unsigned long len)
80{ 80{
81 if (boot_cpu_data.dcache.n_aliases && page_mapped(page) && 81 if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
82 test_bit(PG_dcache_clean, &page->flags)) { 82 test_bit(PG_dcache_clean, &page->flags)) {
83 void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); 83 void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
84 memcpy(dst, vfrom, len); 84 memcpy(dst, vfrom, len);
@@ -97,7 +97,7 @@ void copy_user_highpage(struct page *to, struct page *from,
97 97
98 vto = kmap_atomic(to); 98 vto = kmap_atomic(to);
99 99
100 if (boot_cpu_data.dcache.n_aliases && page_mapped(from) && 100 if (boot_cpu_data.dcache.n_aliases && page_mapcount(from) &&
101 test_bit(PG_dcache_clean, &from->flags)) { 101 test_bit(PG_dcache_clean, &from->flags)) {
102 vfrom = kmap_coherent(from, vaddr); 102 vfrom = kmap_coherent(from, vaddr);
103 copy_page(vto, vfrom); 103 copy_page(vto, vfrom);
@@ -153,7 +153,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
153 unsigned long addr = (unsigned long) page_address(page); 153 unsigned long addr = (unsigned long) page_address(page);
154 154
155 if (pages_do_alias(addr, vmaddr)) { 155 if (pages_do_alias(addr, vmaddr)) {
156 if (boot_cpu_data.dcache.n_aliases && page_mapped(page) && 156 if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
157 test_bit(PG_dcache_clean, &page->flags)) { 157 test_bit(PG_dcache_clean, &page->flags)) {
158 void *kaddr; 158 void *kaddr;
159 159
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 56442d2d7bbc..3203e42190dd 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -101,10 +101,6 @@ config LOCKDEP_SUPPORT
101 bool 101 bool
102 default y if SPARC64 102 default y if SPARC64
103 103
104config HAVE_LATENCYTOP_SUPPORT
105 bool
106 default y if SPARC64
107
108config ARCH_HIBERNATION_POSSIBLE 104config ARCH_HIBERNATION_POSSIBLE
109 def_bool y if SPARC64 105 def_bool y if SPARC64
110 106
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 131d36fcd07a..7a38d6a576c5 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -681,13 +681,6 @@ static inline unsigned long pmd_trans_huge(pmd_t pmd)
681 return pte_val(pte) & _PAGE_PMD_HUGE; 681 return pte_val(pte) & _PAGE_PMD_HUGE;
682} 682}
683 683
684static inline unsigned long pmd_trans_splitting(pmd_t pmd)
685{
686 pte_t pte = __pte(pmd_val(pmd));
687
688 return pmd_trans_huge(pmd) && pte_special(pte);
689}
690
691#define has_transparent_hugepage() 1 684#define has_transparent_hugepage() 1
692 685
693static inline pmd_t pmd_mkold(pmd_t pmd) 686static inline pmd_t pmd_mkold(pmd_t pmd)
@@ -717,29 +710,29 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
717 return __pmd(pte_val(pte)); 710 return __pmd(pte_val(pte));
718} 711}
719 712
720static inline pmd_t pmd_mkyoung(pmd_t pmd) 713static inline pmd_t pmd_mkclean(pmd_t pmd)
721{ 714{
722 pte_t pte = __pte(pmd_val(pmd)); 715 pte_t pte = __pte(pmd_val(pmd));
723 716
724 pte = pte_mkyoung(pte); 717 pte = pte_mkclean(pte);
725 718
726 return __pmd(pte_val(pte)); 719 return __pmd(pte_val(pte));
727} 720}
728 721
729static inline pmd_t pmd_mkwrite(pmd_t pmd) 722static inline pmd_t pmd_mkyoung(pmd_t pmd)
730{ 723{
731 pte_t pte = __pte(pmd_val(pmd)); 724 pte_t pte = __pte(pmd_val(pmd));
732 725
733 pte = pte_mkwrite(pte); 726 pte = pte_mkyoung(pte);
734 727
735 return __pmd(pte_val(pte)); 728 return __pmd(pte_val(pte));
736} 729}
737 730
738static inline pmd_t pmd_mksplitting(pmd_t pmd) 731static inline pmd_t pmd_mkwrite(pmd_t pmd)
739{ 732{
740 pte_t pte = __pte(pmd_val(pmd)); 733 pte_t pte = __pte(pmd_val(pmd));
741 734
742 pte = pte_mkspecial(pte); 735 pte = pte_mkwrite(pte);
743 736
744 return __pmd(pte_val(pte)); 737 return __pmd(pte_val(pte));
745} 738}
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index dbabe5713a15..cb841a33da59 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -113,9 +113,6 @@ static unsigned int get_user_insn(unsigned long tpc)
113 113
114#ifdef CONFIG_TRANSPARENT_HUGEPAGE 114#ifdef CONFIG_TRANSPARENT_HUGEPAGE
115 if (pmd_trans_huge(*pmdp)) { 115 if (pmd_trans_huge(*pmdp)) {
116 if (pmd_trans_splitting(*pmdp))
117 goto out_irq_enable;
118
119 pa = pmd_pfn(*pmdp) << PAGE_SHIFT; 116 pa = pmd_pfn(*pmdp) << PAGE_SHIFT;
120 pa += tpc & ~HPAGE_MASK; 117 pa += tpc & ~HPAGE_MASK;
121 118
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index 2e5c4fc2daa9..eb3d8e8ebc6b 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -56,8 +56,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
56 put_page(head); 56 put_page(head);
57 return 0; 57 return 0;
58 } 58 }
59 if (head != page)
60 get_huge_page_tail(page);
61 59
62 pages[*nr] = page; 60 pages[*nr] = page;
63 (*nr)++; 61 (*nr)++;
@@ -70,7 +68,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
70 unsigned long end, int write, struct page **pages, 68 unsigned long end, int write, struct page **pages,
71 int *nr) 69 int *nr)
72{ 70{
73 struct page *head, *page, *tail; 71 struct page *head, *page;
74 int refs; 72 int refs;
75 73
76 if (!(pmd_val(pmd) & _PAGE_VALID)) 74 if (!(pmd_val(pmd) & _PAGE_VALID))
@@ -82,7 +80,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
82 refs = 0; 80 refs = 0;
83 head = pmd_page(pmd); 81 head = pmd_page(pmd);
84 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 82 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
85 tail = page;
86 do { 83 do {
87 VM_BUG_ON(compound_head(page) != head); 84 VM_BUG_ON(compound_head(page) != head);
88 pages[*nr] = page; 85 pages[*nr] = page;
@@ -103,15 +100,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
103 return 0; 100 return 0;
104 } 101 }
105 102
106 /* Any tail page need their mapcount reference taken before we
107 * return.
108 */
109 while (refs--) {
110 if (PageTail(tail))
111 get_huge_page_tail(tail);
112 tail++;
113 }
114
115 return 1; 103 return 1;
116} 104}
117 105
@@ -126,7 +114,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
126 pmd_t pmd = *pmdp; 114 pmd_t pmd = *pmdp;
127 115
128 next = pmd_addr_end(addr, end); 116 next = pmd_addr_end(addr, end);
129 if (pmd_none(pmd) || pmd_trans_splitting(pmd)) 117 if (pmd_none(pmd))
130 return 0; 118 return 0;
131 if (unlikely(pmd_large(pmd))) { 119 if (unlikely(pmd_large(pmd))) {
132 if (!gup_huge_pmd(pmdp, pmd, addr, next, 120 if (!gup_huge_pmd(pmdp, pmd, addr, next,
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 2b05ccbebed9..96cecf55522e 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -489,16 +489,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
489#ifdef CONFIG_TRANSPARENT_HUGEPAGE 489#ifdef CONFIG_TRANSPARENT_HUGEPAGE
490#define has_transparent_hugepage() 1 490#define has_transparent_hugepage() 1
491#define pmd_trans_huge pmd_huge_page 491#define pmd_trans_huge pmd_huge_page
492
493static inline pmd_t pmd_mksplitting(pmd_t pmd)
494{
495 return pte_pmd(hv_pte_set_client2(pmd_pte(pmd)));
496}
497
498static inline int pmd_trans_splitting(pmd_t pmd)
499{
500 return hv_pte_get_client2(pmd_pte(pmd));
501}
502#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 492#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
503 493
504/* 494/*
diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h
index 71c5d132062a..e13d41c392ae 100644
--- a/arch/um/include/asm/page.h
+++ b/arch/um/include/asm/page.h
@@ -18,6 +18,7 @@
18 18
19struct page; 19struct page;
20 20
21#include <linux/pfn.h>
21#include <linux/types.h> 22#include <linux/types.h>
22#include <asm/vm-flags.h> 23#include <asm/vm-flags.h>
23 24
@@ -52,7 +53,6 @@ typedef struct { unsigned long pgd; } pgd_t;
52#define pmd_val(x) ((x).pmd) 53#define pmd_val(x) ((x).pmd)
53#define __pmd(x) ((pmd_t) { (x) } ) 54#define __pmd(x) ((pmd_t) { (x) } )
54 55
55typedef unsigned long long pfn_t;
56typedef unsigned long long phys_t; 56typedef unsigned long long phys_t;
57 57
58#else 58#else
@@ -76,7 +76,6 @@ typedef struct { unsigned long pmd; } pmd_t;
76#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE)) 76#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE))
77#define pte_set_val(p, phys, prot) (p).pte = (phys | pgprot_val(prot)) 77#define pte_set_val(p, phys, prot) (p).pte = (phys | pgprot_val(prot))
78 78
79typedef unsigned long pfn_t;
80typedef unsigned long phys_t; 79typedef unsigned long phys_t;
81 80
82#endif 81#endif
@@ -109,8 +108,8 @@ extern unsigned long uml_physmem;
109#define __pa(virt) to_phys((void *) (unsigned long) (virt)) 108#define __pa(virt) to_phys((void *) (unsigned long) (virt))
110#define __va(phys) to_virt((unsigned long) (phys)) 109#define __va(phys) to_virt((unsigned long) (phys))
111 110
112#define phys_to_pfn(p) ((pfn_t) ((p) >> PAGE_SHIFT)) 111#define phys_to_pfn(p) ((p) >> PAGE_SHIFT)
113#define pfn_to_phys(pfn) ((phys_t) ((pfn) << PAGE_SHIFT)) 112#define pfn_to_phys(pfn) PFN_PHYS(pfn)
114 113
115#define pfn_valid(pfn) ((pfn) < max_mapnr) 114#define pfn_valid(pfn) ((pfn) < max_mapnr)
116#define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v))) 115#define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v)))
diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h
index 2b4274e7c095..bae8523a162f 100644
--- a/arch/um/include/asm/pgtable-3level.h
+++ b/arch/um/include/asm/pgtable-3level.h
@@ -98,7 +98,7 @@ static inline unsigned long pte_pfn(pte_t pte)
98 return phys_to_pfn(pte_val(pte)); 98 return phys_to_pfn(pte_val(pte));
99} 99}
100 100
101static inline pte_t pfn_pte(pfn_t page_nr, pgprot_t pgprot) 101static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
102{ 102{
103 pte_t pte; 103 pte_t pte;
104 phys_t phys = pfn_to_phys(page_nr); 104 phys_t phys = pfn_to_phys(page_nr);
@@ -107,7 +107,7 @@ static inline pte_t pfn_pte(pfn_t page_nr, pgprot_t pgprot)
107 return pte; 107 return pte;
108} 108}
109 109
110static inline pmd_t pfn_pmd(pfn_t page_nr, pgprot_t pgprot) 110static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
111{ 111{
112 return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); 112 return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot));
113} 113}
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index 18eb9924dda3..7485398d0737 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h
@@ -271,7 +271,7 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b)
271 271
272#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys)) 272#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys))
273#define __virt_to_page(virt) phys_to_page(__pa(virt)) 273#define __virt_to_page(virt) phys_to_page(__pa(virt))
274#define page_to_phys(page) pfn_to_phys((pfn_t) page_to_pfn(page)) 274#define page_to_phys(page) pfn_to_phys(page_to_pfn(page))
275#define virt_to_page(addr) __virt_to_page((const unsigned long) addr) 275#define virt_to_page(addr) __virt_to_page((const unsigned long) addr)
276 276
277#define mk_pte(page, pgprot) \ 277#define mk_pte(page, pgprot) \
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index 5dc4c0a43ccd..877342640b6e 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -34,9 +34,6 @@ config NO_IOPORT_MAP
34config STACKTRACE_SUPPORT 34config STACKTRACE_SUPPORT
35 def_bool y 35 def_bool y
36 36
37config HAVE_LATENCYTOP_SUPPORT
38 def_bool y
39
40config LOCKDEP_SUPPORT 37config LOCKDEP_SUPPORT
41 def_bool y 38 def_bool y
42 39
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 24f362bf3ec6..4a10ba9e95da 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -180,9 +180,6 @@ config LOCKDEP_SUPPORT
180config STACKTRACE_SUPPORT 180config STACKTRACE_SUPPORT
181 def_bool y 181 def_bool y
182 182
183config HAVE_LATENCYTOP_SUPPORT
184 def_bool y
185
186config MMU 183config MMU
187 def_bool y 184 def_bool y
188 185
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index d3eee663c41f..0687c4748b8f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -162,20 +162,22 @@ static inline int pmd_large(pmd_t pte)
162} 162}
163 163
164#ifdef CONFIG_TRANSPARENT_HUGEPAGE 164#ifdef CONFIG_TRANSPARENT_HUGEPAGE
165static inline int pmd_trans_splitting(pmd_t pmd)
166{
167 return pmd_val(pmd) & _PAGE_SPLITTING;
168}
169
170static inline int pmd_trans_huge(pmd_t pmd) 165static inline int pmd_trans_huge(pmd_t pmd)
171{ 166{
172 return pmd_val(pmd) & _PAGE_PSE; 167 return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
173} 168}
174 169
175static inline int has_transparent_hugepage(void) 170static inline int has_transparent_hugepage(void)
176{ 171{
177 return cpu_has_pse; 172 return cpu_has_pse;
178} 173}
174
175#ifdef __HAVE_ARCH_PTE_DEVMAP
176static inline int pmd_devmap(pmd_t pmd)
177{
178 return !!(pmd_val(pmd) & _PAGE_DEVMAP);
179}
180#endif
179#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 181#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
180 182
181static inline pte_t pte_set_flags(pte_t pte, pteval_t set) 183static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
@@ -252,6 +254,11 @@ static inline pte_t pte_mkspecial(pte_t pte)
252 return pte_set_flags(pte, _PAGE_SPECIAL); 254 return pte_set_flags(pte, _PAGE_SPECIAL);
253} 255}
254 256
257static inline pte_t pte_mkdevmap(pte_t pte)
258{
259 return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
260}
261
255static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) 262static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
256{ 263{
257 pmdval_t v = native_pmd_val(pmd); 264 pmdval_t v = native_pmd_val(pmd);
@@ -271,6 +278,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
271 return pmd_clear_flags(pmd, _PAGE_ACCESSED); 278 return pmd_clear_flags(pmd, _PAGE_ACCESSED);
272} 279}
273 280
281static inline pmd_t pmd_mkclean(pmd_t pmd)
282{
283 return pmd_clear_flags(pmd, _PAGE_DIRTY);
284}
285
274static inline pmd_t pmd_wrprotect(pmd_t pmd) 286static inline pmd_t pmd_wrprotect(pmd_t pmd)
275{ 287{
276 return pmd_clear_flags(pmd, _PAGE_RW); 288 return pmd_clear_flags(pmd, _PAGE_RW);
@@ -281,6 +293,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
281 return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); 293 return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
282} 294}
283 295
296static inline pmd_t pmd_mkdevmap(pmd_t pmd)
297{
298 return pmd_set_flags(pmd, _PAGE_DEVMAP);
299}
300
284static inline pmd_t pmd_mkhuge(pmd_t pmd) 301static inline pmd_t pmd_mkhuge(pmd_t pmd)
285{ 302{
286 return pmd_set_flags(pmd, _PAGE_PSE); 303 return pmd_set_flags(pmd, _PAGE_PSE);
@@ -462,6 +479,13 @@ static inline int pte_present(pte_t a)
462 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); 479 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
463} 480}
464 481
482#ifdef __HAVE_ARCH_PTE_DEVMAP
483static inline int pte_devmap(pte_t a)
484{
485 return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
486}
487#endif
488
465#define pte_accessible pte_accessible 489#define pte_accessible pte_accessible
466static inline bool pte_accessible(struct mm_struct *mm, pte_t a) 490static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
467{ 491{
@@ -808,10 +832,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
808 unsigned long address, pmd_t *pmdp); 832 unsigned long address, pmd_t *pmdp);
809 833
810 834
811#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
812extern void pmdp_splitting_flush(struct vm_area_struct *vma,
813 unsigned long addr, pmd_t *pmdp);
814
815#define __HAVE_ARCH_PMD_WRITE 835#define __HAVE_ARCH_PMD_WRITE
816static inline int pmd_write(pmd_t pmd) 836static inline int pmd_write(pmd_t pmd)
817{ 837{
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index a471cadb9630..04c27a013165 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,10 +22,11 @@
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
25#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
26#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ 25#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
27#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ 26#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
28#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 27#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
28#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
29#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
29 30
30/* If _PAGE_BIT_PRESENT is clear, we use these: */ 31/* If _PAGE_BIT_PRESENT is clear, we use these: */
31/* - if the user mapped it with PROT_NONE; pte_present gives true */ 32/* - if the user mapped it with PROT_NONE; pte_present gives true */
@@ -46,7 +47,6 @@
46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 47#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 48#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 49#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
50#define __HAVE_ARCH_PTE_SPECIAL 50#define __HAVE_ARCH_PTE_SPECIAL
51 51
52#ifdef CONFIG_KMEMCHECK 52#ifdef CONFIG_KMEMCHECK
@@ -85,8 +85,11 @@
85 85
86#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 86#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
87#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) 87#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
88#define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
89#define __HAVE_ARCH_PTE_DEVMAP
88#else 90#else
89#define _PAGE_NX (_AT(pteval_t, 0)) 91#define _PAGE_NX (_AT(pteval_t, 0))
92#define _PAGE_DEVMAP (_AT(pteval_t, 0))
90#endif 93#endif
91 94
92#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) 95#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index d8ce3ec816ab..1544fabcd7f9 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -132,12 +132,7 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
132{ 132{
133 void *vaddr = (void __force *)addr; 133 void *vaddr = (void __force *)addr;
134 134
135 /* TODO: implement the zeroing via non-temporal writes */ 135 memset(vaddr, 0, size);
136 if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0)
137 clear_page(vaddr);
138 else
139 memset(vaddr, 0, size);
140
141 __arch_wb_cache_pmem(vaddr, size); 136 __arch_wb_cache_pmem(vaddr, size);
142} 137}
143 138
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 483231ebbb0b..e574b8546518 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -175,7 +175,11 @@ static void mark_screen_rdonly(struct mm_struct *mm)
175 if (pud_none_or_clear_bad(pud)) 175 if (pud_none_or_clear_bad(pud))
176 goto out; 176 goto out;
177 pmd = pmd_offset(pud, 0xA0000); 177 pmd = pmd_offset(pud, 0xA0000);
178 split_huge_page_pmd_mm(mm, 0xA0000, pmd); 178
179 if (pmd_trans_huge(*pmd)) {
180 struct vm_area_struct *vma = find_vma(mm, 0xA0000);
181 split_huge_pmd(vma, pmd, 0xA0000);
182 }
179 if (pmd_none_or_clear_bad(pmd)) 183 if (pmd_none_or_clear_bad(pmd))
180 goto out; 184 goto out;
181 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); 185 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 5c520ebf6343..a22a488b4622 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -43,11 +43,11 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
43static void kvm_iommu_put_pages(struct kvm *kvm, 43static void kvm_iommu_put_pages(struct kvm *kvm,
44 gfn_t base_gfn, unsigned long npages); 44 gfn_t base_gfn, unsigned long npages);
45 45
46static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, 46static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
47 unsigned long npages) 47 unsigned long npages)
48{ 48{
49 gfn_t end_gfn; 49 gfn_t end_gfn;
50 pfn_t pfn; 50 kvm_pfn_t pfn;
51 51
52 pfn = gfn_to_pfn_memslot(slot, gfn); 52 pfn = gfn_to_pfn_memslot(slot, gfn);
53 end_gfn = gfn + npages; 53 end_gfn = gfn + npages;
@@ -62,7 +62,8 @@ static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
62 return pfn; 62 return pfn;
63} 63}
64 64
65static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) 65static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn,
66 unsigned long npages)
66{ 67{
67 unsigned long i; 68 unsigned long i;
68 69
@@ -73,7 +74,7 @@ static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
73int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) 74int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
74{ 75{
75 gfn_t gfn, end_gfn; 76 gfn_t gfn, end_gfn;
76 pfn_t pfn; 77 kvm_pfn_t pfn;
77 int r = 0; 78 int r = 0;
78 struct iommu_domain *domain = kvm->arch.iommu_domain; 79 struct iommu_domain *domain = kvm->arch.iommu_domain;
79 int flags; 80 int flags;
@@ -275,7 +276,7 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
275{ 276{
276 struct iommu_domain *domain; 277 struct iommu_domain *domain;
277 gfn_t end_gfn, gfn; 278 gfn_t end_gfn, gfn;
278 pfn_t pfn; 279 kvm_pfn_t pfn;
279 u64 phys; 280 u64 phys;
280 281
281 domain = kvm->arch.iommu_domain; 282 domain = kvm->arch.iommu_domain;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 420a5ca3c0ee..95a955de5964 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -259,7 +259,7 @@ static unsigned get_mmio_spte_access(u64 spte)
259} 259}
260 260
261static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, 261static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
262 pfn_t pfn, unsigned access) 262 kvm_pfn_t pfn, unsigned access)
263{ 263{
264 if (unlikely(is_noslot_pfn(pfn))) { 264 if (unlikely(is_noslot_pfn(pfn))) {
265 mark_mmio_spte(vcpu, sptep, gfn, access); 265 mark_mmio_spte(vcpu, sptep, gfn, access);
@@ -320,7 +320,7 @@ static int is_last_spte(u64 pte, int level)
320 return 0; 320 return 0;
321} 321}
322 322
323static pfn_t spte_to_pfn(u64 pte) 323static kvm_pfn_t spte_to_pfn(u64 pte)
324{ 324{
325 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 325 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
326} 326}
@@ -582,7 +582,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
582 */ 582 */
583static int mmu_spte_clear_track_bits(u64 *sptep) 583static int mmu_spte_clear_track_bits(u64 *sptep)
584{ 584{
585 pfn_t pfn; 585 kvm_pfn_t pfn;
586 u64 old_spte = *sptep; 586 u64 old_spte = *sptep;
587 587
588 if (!spte_has_volatile_bits(old_spte)) 588 if (!spte_has_volatile_bits(old_spte))
@@ -1372,7 +1372,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1372 int need_flush = 0; 1372 int need_flush = 0;
1373 u64 new_spte; 1373 u64 new_spte;
1374 pte_t *ptep = (pte_t *)data; 1374 pte_t *ptep = (pte_t *)data;
1375 pfn_t new_pfn; 1375 kvm_pfn_t new_pfn;
1376 1376
1377 WARN_ON(pte_huge(*ptep)); 1377 WARN_ON(pte_huge(*ptep));
1378 new_pfn = pte_pfn(*ptep); 1378 new_pfn = pte_pfn(*ptep);
@@ -2450,7 +2450,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2450 return 0; 2450 return 0;
2451} 2451}
2452 2452
2453static bool kvm_is_mmio_pfn(pfn_t pfn) 2453static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
2454{ 2454{
2455 if (pfn_valid(pfn)) 2455 if (pfn_valid(pfn))
2456 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); 2456 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
@@ -2460,7 +2460,7 @@ static bool kvm_is_mmio_pfn(pfn_t pfn)
2460 2460
2461static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2461static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2462 unsigned pte_access, int level, 2462 unsigned pte_access, int level,
2463 gfn_t gfn, pfn_t pfn, bool speculative, 2463 gfn_t gfn, kvm_pfn_t pfn, bool speculative,
2464 bool can_unsync, bool host_writable) 2464 bool can_unsync, bool host_writable)
2465{ 2465{
2466 u64 spte; 2466 u64 spte;
@@ -2539,7 +2539,7 @@ done:
2539} 2539}
2540 2540
2541static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, 2541static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
2542 int write_fault, int level, gfn_t gfn, pfn_t pfn, 2542 int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
2543 bool speculative, bool host_writable) 2543 bool speculative, bool host_writable)
2544{ 2544{
2545 int was_rmapped = 0; 2545 int was_rmapped = 0;
@@ -2602,7 +2602,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
2602 return emulate; 2602 return emulate;
2603} 2603}
2604 2604
2605static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2605static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2606 bool no_dirty_log) 2606 bool no_dirty_log)
2607{ 2607{
2608 struct kvm_memory_slot *slot; 2608 struct kvm_memory_slot *slot;
@@ -2684,7 +2684,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2684} 2684}
2685 2685
2686static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, 2686static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
2687 int level, gfn_t gfn, pfn_t pfn, bool prefault) 2687 int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
2688{ 2688{
2689 struct kvm_shadow_walk_iterator iterator; 2689 struct kvm_shadow_walk_iterator iterator;
2690 struct kvm_mmu_page *sp; 2690 struct kvm_mmu_page *sp;
@@ -2732,7 +2732,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
2732 send_sig_info(SIGBUS, &info, tsk); 2732 send_sig_info(SIGBUS, &info, tsk);
2733} 2733}
2734 2734
2735static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) 2735static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
2736{ 2736{
2737 /* 2737 /*
2738 * Do not cache the mmio info caused by writing the readonly gfn 2738 * Do not cache the mmio info caused by writing the readonly gfn
@@ -2752,9 +2752,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
2752} 2752}
2753 2753
2754static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, 2754static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2755 gfn_t *gfnp, pfn_t *pfnp, int *levelp) 2755 gfn_t *gfnp, kvm_pfn_t *pfnp,
2756 int *levelp)
2756{ 2757{
2757 pfn_t pfn = *pfnp; 2758 kvm_pfn_t pfn = *pfnp;
2758 gfn_t gfn = *gfnp; 2759 gfn_t gfn = *gfnp;
2759 int level = *levelp; 2760 int level = *levelp;
2760 2761
@@ -2793,7 +2794,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2793} 2794}
2794 2795
2795static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, 2796static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
2796 pfn_t pfn, unsigned access, int *ret_val) 2797 kvm_pfn_t pfn, unsigned access, int *ret_val)
2797{ 2798{
2798 bool ret = true; 2799 bool ret = true;
2799 2800
@@ -2947,7 +2948,7 @@ exit:
2947} 2948}
2948 2949
2949static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2950static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2950 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2951 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
2951static void make_mmu_pages_available(struct kvm_vcpu *vcpu); 2952static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
2952 2953
2953static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, 2954static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
@@ -2956,7 +2957,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2956 int r; 2957 int r;
2957 int level; 2958 int level;
2958 bool force_pt_level = false; 2959 bool force_pt_level = false;
2959 pfn_t pfn; 2960 kvm_pfn_t pfn;
2960 unsigned long mmu_seq; 2961 unsigned long mmu_seq;
2961 bool map_writable, write = error_code & PFERR_WRITE_MASK; 2962 bool map_writable, write = error_code & PFERR_WRITE_MASK;
2962 2963
@@ -3410,7 +3411,7 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
3410} 3411}
3411 3412
3412static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 3413static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3413 gva_t gva, pfn_t *pfn, bool write, bool *writable) 3414 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
3414{ 3415{
3415 struct kvm_memory_slot *slot; 3416 struct kvm_memory_slot *slot;
3416 bool async; 3417 bool async;
@@ -3448,7 +3449,7 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
3448static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, 3449static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3449 bool prefault) 3450 bool prefault)
3450{ 3451{
3451 pfn_t pfn; 3452 kvm_pfn_t pfn;
3452 int r; 3453 int r;
3453 int level; 3454 int level;
3454 bool force_pt_level; 3455 bool force_pt_level;
@@ -4601,7 +4602,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
4601 u64 *sptep; 4602 u64 *sptep;
4602 struct rmap_iterator iter; 4603 struct rmap_iterator iter;
4603 int need_tlb_flush = 0; 4604 int need_tlb_flush = 0;
4604 pfn_t pfn; 4605 kvm_pfn_t pfn;
4605 struct kvm_mmu_page *sp; 4606 struct kvm_mmu_page *sp;
4606 4607
4607restart: 4608restart:
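
All of the pfn_t -> kvm_pfn_t churn in the KVM hunks is a pure rename: the mm side of this series claims the pfn_t name for a flag-carrying type, so KVM's raw frame-number typedef moves out of the way. A simplified sketch of how the two types end up after the series (the real definitions live in include/linux/kvm_types.h and the new pfn_t headers):

        /* KVM: still just a number, only the name changes */
        typedef u64 kvm_pfn_t;

        /* mm: pfn plus flag bits (PFN_DEV, PFN_MAP, ...) in the high bits */
        typedef struct {
                u64 val;
        } pfn_t;

Keeping the flag-carrying type opaque pushes callers through helpers such as pfn_t_to_pfn()/pfn_t_to_phys() instead of open-coded shifts.
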
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 1cee3ec20dd2..dcce533d420c 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -97,7 +97,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
97{ 97{
98 struct kvm_mmu_page *sp; 98 struct kvm_mmu_page *sp;
99 gfn_t gfn; 99 gfn_t gfn;
100 pfn_t pfn; 100 kvm_pfn_t pfn;
101 hpa_t hpa; 101 hpa_t hpa;
102 102
103 sp = page_header(__pa(sptep)); 103 sp = page_header(__pa(sptep));
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 91e939b486d1..6c9fed957cce 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -456,7 +456,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
456{ 456{
457 unsigned pte_access; 457 unsigned pte_access;
458 gfn_t gfn; 458 gfn_t gfn;
459 pfn_t pfn; 459 kvm_pfn_t pfn;
460 460
461 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) 461 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
462 return false; 462 return false;
@@ -551,7 +551,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
551static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 551static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
552 struct guest_walker *gw, 552 struct guest_walker *gw,
553 int write_fault, int hlevel, 553 int write_fault, int hlevel,
554 pfn_t pfn, bool map_writable, bool prefault) 554 kvm_pfn_t pfn, bool map_writable, bool prefault)
555{ 555{
556 struct kvm_mmu_page *sp = NULL; 556 struct kvm_mmu_page *sp = NULL;
557 struct kvm_shadow_walk_iterator it; 557 struct kvm_shadow_walk_iterator it;
@@ -694,7 +694,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
694 int user_fault = error_code & PFERR_USER_MASK; 694 int user_fault = error_code & PFERR_USER_MASK;
695 struct guest_walker walker; 695 struct guest_walker walker;
696 int r; 696 int r;
697 pfn_t pfn; 697 kvm_pfn_t pfn;
698 int level = PT_PAGE_TABLE_LEVEL; 698 int level = PT_PAGE_TABLE_LEVEL;
699 bool force_pt_level = false; 699 bool force_pt_level = false;
700 unsigned long mmu_seq; 700 unsigned long mmu_seq;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 04d61d496b14..e2951b6edbbc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4251,7 +4251,7 @@ out:
4251static int init_rmode_identity_map(struct kvm *kvm) 4251static int init_rmode_identity_map(struct kvm *kvm)
4252{ 4252{
4253 int i, idx, r = 0; 4253 int i, idx, r = 0;
4254 pfn_t identity_map_pfn; 4254 kvm_pfn_t identity_map_pfn;
4255 u32 tmp; 4255 u32 tmp;
4256 4256
4257 if (!enable_ept) 4257 if (!enable_ept)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f53f5b13c677..4244c2baf57d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5148,7 +5148,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
5148 int emulation_type) 5148 int emulation_type)
5149{ 5149{
5150 gpa_t gpa = cr2; 5150 gpa_t gpa = cr2;
5151 pfn_t pfn; 5151 kvm_pfn_t pfn;
5152 5152
5153 if (emulation_type & EMULTYPE_NO_REEXECUTE) 5153 if (emulation_type & EMULTYPE_NO_REEXECUTE)
5154 return false; 5154 return false;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index ae9a37bf1371..6d5eb5900372 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -9,6 +9,7 @@
9#include <linux/vmstat.h> 9#include <linux/vmstat.h>
10#include <linux/highmem.h> 10#include <linux/highmem.h>
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/memremap.h>
12 13
13#include <asm/pgtable.h> 14#include <asm/pgtable.h>
14 15
@@ -63,6 +64,16 @@ retry:
63#endif 64#endif
64} 65}
65 66
67static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
68{
69 while ((*nr) - nr_start) {
70 struct page *page = pages[--(*nr)];
71
72 ClearPageReferenced(page);
73 put_page(page);
74 }
75}
76
66/* 77/*
67 * The performance critical leaf functions are made noinline otherwise gcc 78 * The performance critical leaf functions are made noinline otherwise gcc
68 * inlines everything into a single function which results in too much 79 * inlines everything into a single function which results in too much
@@ -71,7 +82,9 @@ retry:
71static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, 82static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
72 unsigned long end, int write, struct page **pages, int *nr) 83 unsigned long end, int write, struct page **pages, int *nr)
73{ 84{
85 struct dev_pagemap *pgmap = NULL;
74 unsigned long mask; 86 unsigned long mask;
87 int nr_start = *nr;
75 pte_t *ptep; 88 pte_t *ptep;
76 89
77 mask = _PAGE_PRESENT|_PAGE_USER; 90 mask = _PAGE_PRESENT|_PAGE_USER;
@@ -89,13 +102,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
89 return 0; 102 return 0;
90 } 103 }
91 104
92 if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { 105 page = pte_page(pte);
106 if (pte_devmap(pte)) {
107 pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
108 if (unlikely(!pgmap)) {
109 undo_dev_pagemap(nr, nr_start, pages);
110 pte_unmap(ptep);
111 return 0;
112 }
113 } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
93 pte_unmap(ptep); 114 pte_unmap(ptep);
94 return 0; 115 return 0;
95 } 116 }
96 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 117 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
97 page = pte_page(pte);
98 get_page(page); 118 get_page(page);
119 put_dev_pagemap(pgmap);
99 SetPageReferenced(page); 120 SetPageReferenced(page);
100 pages[*nr] = page; 121 pages[*nr] = page;
101 (*nr)++; 122 (*nr)++;
@@ -114,6 +135,32 @@ static inline void get_head_page_multiple(struct page *page, int nr)
114 SetPageReferenced(page); 135 SetPageReferenced(page);
115} 136}
116 137
138static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
139 unsigned long end, struct page **pages, int *nr)
140{
141 int nr_start = *nr;
142 unsigned long pfn = pmd_pfn(pmd);
143 struct dev_pagemap *pgmap = NULL;
144
145 pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
146 do {
147 struct page *page = pfn_to_page(pfn);
148
149 pgmap = get_dev_pagemap(pfn, pgmap);
150 if (unlikely(!pgmap)) {
151 undo_dev_pagemap(nr, nr_start, pages);
152 return 0;
153 }
154 SetPageReferenced(page);
155 pages[*nr] = page;
156 get_page(page);
157 put_dev_pagemap(pgmap);
158 (*nr)++;
159 pfn++;
160 } while (addr += PAGE_SIZE, addr != end);
161 return 1;
162}
163
117static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 164static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
118 unsigned long end, int write, struct page **pages, int *nr) 165 unsigned long end, int write, struct page **pages, int *nr)
119{ 166{
@@ -126,9 +173,13 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
126 mask |= _PAGE_RW; 173 mask |= _PAGE_RW;
127 if ((pmd_flags(pmd) & mask) != mask) 174 if ((pmd_flags(pmd) & mask) != mask)
128 return 0; 175 return 0;
176
177 VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
178 if (pmd_devmap(pmd))
179 return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
180
129 /* hugepages are never "special" */ 181 /* hugepages are never "special" */
130 VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); 182 VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
131 VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
132 183
133 refs = 0; 184 refs = 0;
134 head = pmd_page(pmd); 185 head = pmd_page(pmd);
@@ -136,8 +187,6 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
136 do { 187 do {
137 VM_BUG_ON_PAGE(compound_head(page) != head, page); 188 VM_BUG_ON_PAGE(compound_head(page) != head, page);
138 pages[*nr] = page; 189 pages[*nr] = page;
139 if (PageTail(page))
140 get_huge_page_tail(page);
141 (*nr)++; 190 (*nr)++;
142 page++; 191 page++;
143 refs++; 192 refs++;
@@ -158,18 +207,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
158 pmd_t pmd = *pmdp; 207 pmd_t pmd = *pmdp;
159 208
160 next = pmd_addr_end(addr, end); 209 next = pmd_addr_end(addr, end);
161 /* 210 if (pmd_none(pmd))
162 * The pmd_trans_splitting() check below explains why
163 * pmdp_splitting_flush has to flush the tlb, to stop
164 * this gup-fast code from running while we set the
165 * splitting bit in the pmd. Returning zero will take
166 * the slow path that will call wait_split_huge_page()
167 * if the pmd is still in splitting state. gup-fast
168 * can't because it has irq disabled and
169 * wait_split_huge_page() would never return as the
170 * tlb flush IPI wouldn't run.
171 */
172 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
173 return 0; 211 return 0;
174 if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { 212 if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
175 /* 213 /*
@@ -212,8 +250,6 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
212 do { 250 do {
213 VM_BUG_ON_PAGE(compound_head(page) != head, page); 251 VM_BUG_ON_PAGE(compound_head(page) != head, page);
214 pages[*nr] = page; 252 pages[*nr] = page;
215 if (PageTail(page))
216 get_huge_page_tail(page);
217 (*nr)++; 253 (*nr)++;
218 page++; 254 page++;
219 refs++; 255 refs++;
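
The pte/pmd devmap branches added above pin the backing device range, not just the page: gup-fast runs with IRQs disabled and cannot sleep, so it detects a disappearing pmem device by failing the reference grab inside get_dev_pagemap(). A hedged sketch of the two helpers these hunks lean on (the real versions live in include/linux/memremap.h and kernel/memremap.c):

        /* takes a reference on the ZONE_DEVICE range backing @pfn; passing the
         * previously returned @pgmap lets neighbouring pfns skip the lookup */
        struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
                                            struct dev_pagemap *pgmap);

        static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
        {
                if (pgmap)
                        percpu_ref_put(pgmap->ref);
        }

Once get_page() has succeeded, the elevated page count is expected to hold its own reference on the pagemap, which is why both loops call put_dev_pagemap() immediately after taking the page reference.
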
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 8829482d69ec..5488d21123bd 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -30,6 +30,7 @@
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/memory.h> 31#include <linux/memory.h>
32#include <linux/memory_hotplug.h> 32#include <linux/memory_hotplug.h>
33#include <linux/memremap.h>
33#include <linux/nmi.h> 34#include <linux/nmi.h>
34#include <linux/gfp.h> 35#include <linux/gfp.h>
35#include <linux/kcore.h> 36#include <linux/kcore.h>
@@ -714,6 +715,12 @@ static void __meminit free_pagetable(struct page *page, int order)
714{ 715{
715 unsigned long magic; 716 unsigned long magic;
716 unsigned int nr_pages = 1 << order; 717 unsigned int nr_pages = 1 << order;
718 struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page);
719
720 if (altmap) {
721 vmem_altmap_free(altmap, nr_pages);
722 return;
723 }
717 724
718 /* bootmem page has reserved flag */ 725 /* bootmem page has reserved flag */
719 if (PageReserved(page)) { 726 if (PageReserved(page)) {
@@ -1017,13 +1024,19 @@ int __ref arch_remove_memory(u64 start, u64 size)
1017{ 1024{
1018 unsigned long start_pfn = start >> PAGE_SHIFT; 1025 unsigned long start_pfn = start >> PAGE_SHIFT;
1019 unsigned long nr_pages = size >> PAGE_SHIFT; 1026 unsigned long nr_pages = size >> PAGE_SHIFT;
1027 struct page *page = pfn_to_page(start_pfn);
1028 struct vmem_altmap *altmap;
1020 struct zone *zone; 1029 struct zone *zone;
1021 int ret; 1030 int ret;
1022 1031
1023 zone = page_zone(pfn_to_page(start_pfn)); 1032 /* With altmap the first mapped page is offset from @start */
1024 kernel_physical_mapping_remove(start, start + size); 1033 altmap = to_vmem_altmap((unsigned long) page);
1034 if (altmap)
1035 page += vmem_altmap_offset(altmap);
1036 zone = page_zone(page);
1025 ret = __remove_pages(zone, start_pfn, nr_pages); 1037 ret = __remove_pages(zone, start_pfn, nr_pages);
1026 WARN_ON_ONCE(ret); 1038 WARN_ON_ONCE(ret);
1039 kernel_physical_mapping_remove(start, start + size);
1027 1040
1028 return ret; 1041 return ret;
1029} 1042}
@@ -1235,7 +1248,7 @@ static void __meminitdata *p_start, *p_end;
1235static int __meminitdata node_start; 1248static int __meminitdata node_start;
1236 1249
1237static int __meminit vmemmap_populate_hugepages(unsigned long start, 1250static int __meminit vmemmap_populate_hugepages(unsigned long start,
1238 unsigned long end, int node) 1251 unsigned long end, int node, struct vmem_altmap *altmap)
1239{ 1252{
1240 unsigned long addr; 1253 unsigned long addr;
1241 unsigned long next; 1254 unsigned long next;
@@ -1258,7 +1271,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
1258 if (pmd_none(*pmd)) { 1271 if (pmd_none(*pmd)) {
1259 void *p; 1272 void *p;
1260 1273
1261 p = vmemmap_alloc_block_buf(PMD_SIZE, node); 1274 p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
1262 if (p) { 1275 if (p) {
1263 pte_t entry; 1276 pte_t entry;
1264 1277
@@ -1279,7 +1292,8 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
1279 addr_end = addr + PMD_SIZE; 1292 addr_end = addr + PMD_SIZE;
1280 p_end = p + PMD_SIZE; 1293 p_end = p + PMD_SIZE;
1281 continue; 1294 continue;
1282 } 1295 } else if (altmap)
1296 return -ENOMEM; /* no fallback */
1283 } else if (pmd_large(*pmd)) { 1297 } else if (pmd_large(*pmd)) {
1284 vmemmap_verify((pte_t *)pmd, node, addr, next); 1298 vmemmap_verify((pte_t *)pmd, node, addr, next);
1285 continue; 1299 continue;
@@ -1293,11 +1307,16 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
1293 1307
1294int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) 1308int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
1295{ 1309{
1310 struct vmem_altmap *altmap = to_vmem_altmap(start);
1296 int err; 1311 int err;
1297 1312
1298 if (cpu_has_pse) 1313 if (cpu_has_pse)
1299 err = vmemmap_populate_hugepages(start, end, node); 1314 err = vmemmap_populate_hugepages(start, end, node, altmap);
1300 else 1315 else if (altmap) {
1316 pr_err_once("%s: no cpu support for altmap allocations\n",
1317 __func__);
1318 err = -ENOMEM;
1319 } else
1301 err = vmemmap_populate_basepages(start, end, node); 1320 err = vmemmap_populate_basepages(start, end, node);
1302 if (!err) 1321 if (!err)
1303 sync_global_pgds(start, end - 1, 0); 1322 sync_global_pgds(start, end - 1, 0);
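
The altmap plumbing above lets the struct page array for a pmem range be allocated out of the pmem capacity itself instead of regular RAM. The bookkeeping object threaded through vmemmap_populate() is small; roughly (per include/linux/memremap.h in this series):

        struct vmem_altmap {
                const unsigned long base_pfn;   /* first pfn of the device range */
                const unsigned long reserve;    /* pfns kept untouched (e.g. info block) */
                unsigned long free;             /* pfns available to back struct pages */
                unsigned long align;
                unsigned long alloc;            /* pfns handed out so far */
        };

        static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
        {
                /* pfns from base_pfn that are not themselves backed by struct pages */
                return altmap->reserve + altmap->free;
        }

This is why arch_remove_memory() above advances the page pointer by vmem_altmap_offset() before asking for the zone: with an altmap, the first mapped struct page no longer sits at @start.
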
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 031782e74231..f4ae536b0914 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,6 +12,7 @@
12#include <linux/debugfs.h> 12#include <linux/debugfs.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/pfn_t.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/mm.h> 17#include <linux/mm.h>
17#include <linux/fs.h> 18#include <linux/fs.h>
@@ -949,7 +950,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
949} 950}
950 951
951int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, 952int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
952 unsigned long pfn) 953 pfn_t pfn)
953{ 954{
954 enum page_cache_mode pcm; 955 enum page_cache_mode pcm;
955 956
@@ -957,7 +958,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
957 return 0; 958 return 0;
958 959
959 /* Set prot based on lookup */ 960 /* Set prot based on lookup */
960 pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); 961 pcm = lookup_memtype(pfn_t_to_phys(pfn));
961 *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | 962 *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
962 cachemode2protval(pcm)); 963 cachemode2protval(pcm));
963 964
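
track_pfn_insert() switching to pfn_t is typical of the whole series: callers stop passing bare unsigned longs and instead pass a pfn tagged with how it may be mapped. A simplified sketch of the helpers used throughout these hunks (close to, but not copied from, include/linux/pfn_t.h):

        #define PFN_FLAGS_MASK  (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT))
        #define PFN_DEV         (1ULL << (BITS_PER_LONG_LONG - 3))  /* device-backed frame */
        #define PFN_MAP         (1ULL << (BITS_PER_LONG_LONG - 4))  /* has a struct page  */

        static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags)
        {
                pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };

                return pfn_t;
        }

        static inline unsigned long pfn_t_to_pfn(pfn_t pfn)
        {
                return pfn.val & ~PFN_FLAGS_MASK;
        }

        static inline phys_addr_t pfn_t_to_phys(pfn_t pfn)
        {
                return PFN_PHYS(pfn_t_to_pfn(pfn));
        }

So lookup_memtype(pfn_t_to_phys(pfn)) above resolves to the same physical address the old (resource_size_t)pfn << PAGE_SHIFT computed, with the flag bits masked off.
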
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ee9c2e3a7199..4eb287e25043 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -505,19 +505,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
505 505
506 return young; 506 return young;
507} 507}
508
509void pmdp_splitting_flush(struct vm_area_struct *vma,
510 unsigned long address, pmd_t *pmdp)
511{
512 int set;
513 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
514 set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
515 (unsigned long *)pmdp);
516 if (set) {
517 /* need tlb flush only to serialize against gup-fast */
518 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
519 }
520}
521#endif 508#endif
522 509
523/** 510/**
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 360944e1da52..d030594ed22b 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -86,8 +86,10 @@
86#define MADV_SEQUENTIAL 2 /* expect sequential page references */ 86#define MADV_SEQUENTIAL 2 /* expect sequential page references */
87#define MADV_WILLNEED 3 /* will need these pages */ 87#define MADV_WILLNEED 3 /* will need these pages */
88#define MADV_DONTNEED 4 /* don't need these pages */ 88#define MADV_DONTNEED 4 /* don't need these pages */
89#define MADV_FREE 5 /* free pages only if memory pressure */
89 90
90/* common parameters: try to keep these consistent across architectures */ 91/* common parameters: try to keep these consistent across architectures */
92#define MADV_FREE 8 /* free pages only if memory pressure */
91#define MADV_REMOVE 9 /* remove these pages & resources */ 93#define MADV_REMOVE 9 /* remove these pages & resources */
92#define MADV_DONTFORK 10 /* don't inherit across fork */ 94#define MADV_DONTFORK 10 /* don't inherit across fork */
93#define MADV_DOFORK 11 /* do inherit across fork */ 95#define MADV_DOFORK 11 /* do inherit across fork */
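
MADV_FREE, added here for xtensa (the other architectures with a private mman.h get the same treatment elsewhere in the series), is the lazy-free hint this patch-bomb implements: pages are only reclaimed if memory pressure actually materialises, and a write before reclaim keeps the old contents, unlike MADV_DONTNEED which discards immediately. A minimal userspace usage sketch (illustrative only):

        #include <sys/mman.h>

        /* an allocator parking a hot-then-idle buffer on a free list:
         * tell the kernel it may reclaim the pages, but only if it must */
        if (madvise(buf, len, MADV_FREE) != 0)
                /* kernels without MADV_FREE: fall back to the eager variant */
                madvise(buf, len, MADV_DONTNEED);
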
diff --git a/arch/xtensa/mm/tlb.c b/arch/xtensa/mm/tlb.c
index 5ece856c5725..35c822286bbe 100644
--- a/arch/xtensa/mm/tlb.c
+++ b/arch/xtensa/mm/tlb.c
@@ -245,7 +245,7 @@ static int check_tlb_entry(unsigned w, unsigned e, bool dtlb)
245 page_mapcount(p)); 245 page_mapcount(p));
246 if (!page_count(p)) 246 if (!page_count(p))
247 rc |= TLB_INSANE; 247 rc |= TLB_INSANE;
248 else if (page_mapped(p)) 248 else if (page_mapcount(p))
249 rc |= TLB_SUSPICIOUS; 249 rc |= TLB_SUSPICIOUS;
250 } else { 250 } else {
251 rc |= TLB_INSANE; 251 rc |= TLB_INSANE;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 619fe584a44c..213456c2b123 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -647,6 +647,13 @@ static int add_memory_block(int base_section_nr)
647 return 0; 647 return 0;
648} 648}
649 649
650static bool is_zone_device_section(struct mem_section *ms)
651{
652 struct page *page;
653
654 page = sparse_decode_mem_map(ms->section_mem_map, __section_nr(ms));
655 return is_zone_device_page(page);
656}
650 657
651/* 658/*
652 * need an interface for the VM to add new memory regions, 659 * need an interface for the VM to add new memory regions,
@@ -657,6 +664,9 @@ int register_new_memory(int nid, struct mem_section *section)
657 int ret = 0; 664 int ret = 0;
658 struct memory_block *mem; 665 struct memory_block *mem;
659 666
667 if (is_zone_device_section(section))
668 return 0;
669
660 mutex_lock(&mem_sysfs_mutex); 670 mutex_lock(&mem_sysfs_mutex);
661 671
662 mem = find_memory_block(section); 672 mem = find_memory_block(section);
@@ -693,6 +703,9 @@ static int remove_memory_section(unsigned long node_id,
693{ 703{
694 struct memory_block *mem; 704 struct memory_block *mem;
695 705
706 if (is_zone_device_section(section))
707 return 0;
708
696 mutex_lock(&mem_sysfs_mutex); 709 mutex_lock(&mem_sysfs_mutex);
697 mem = find_memory_block(section); 710 mem = find_memory_block(section);
698 unregister_mem_sect_under_nodes(mem, __section_nr(section)); 711 unregister_mem_sect_under_nodes(mem, __section_nr(section));
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index a5880f4ab40e..cb27190e9f39 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -19,6 +19,9 @@
19#include <linux/radix-tree.h> 19#include <linux/radix-tree.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#ifdef CONFIG_BLK_DEV_RAM_DAX
23#include <linux/pfn_t.h>
24#endif
22 25
23#include <asm/uaccess.h> 26#include <asm/uaccess.h>
24 27
@@ -378,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
378 381
379#ifdef CONFIG_BLK_DEV_RAM_DAX 382#ifdef CONFIG_BLK_DEV_RAM_DAX
380static long brd_direct_access(struct block_device *bdev, sector_t sector, 383static long brd_direct_access(struct block_device *bdev, sector_t sector,
381 void __pmem **kaddr, unsigned long *pfn) 384 void __pmem **kaddr, pfn_t *pfn)
382{ 385{
383 struct brd_device *brd = bdev->bd_disk->private_data; 386 struct brd_device *brd = bdev->bd_disk->private_data;
384 struct page *page; 387 struct page *page;
@@ -389,7 +392,7 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector,
389 if (!page) 392 if (!page)
390 return -ENOSPC; 393 return -ENOSPC;
391 *kaddr = (void __pmem *)page_address(page); 394 *kaddr = (void __pmem *)page_address(page);
392 *pfn = page_to_pfn(page); 395 *pfn = page_to_pfn_t(page);
393 396
394 return PAGE_SIZE; 397 return PAGE_SIZE;
395} 398}
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 47915d736f8d..370c2f76016d 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1325,7 +1325,6 @@ static int zram_remove(struct zram *zram)
1325 1325
1326 pr_info("Removed device: %s\n", zram->disk->disk_name); 1326 pr_info("Removed device: %s\n", zram->disk->disk_name);
1327 1327
1328 idr_remove(&zram_index_idr, zram->disk->first_minor);
1329 blk_cleanup_queue(zram->disk->queue); 1328 blk_cleanup_queue(zram->disk->queue);
1330 del_gendisk(zram->disk); 1329 del_gendisk(zram->disk);
1331 put_disk(zram->disk); 1330 put_disk(zram->disk);
@@ -1367,10 +1366,12 @@ static ssize_t hot_remove_store(struct class *class,
1367 mutex_lock(&zram_index_mutex); 1366 mutex_lock(&zram_index_mutex);
1368 1367
1369 zram = idr_find(&zram_index_idr, dev_id); 1368 zram = idr_find(&zram_index_idr, dev_id);
1370 if (zram) 1369 if (zram) {
1371 ret = zram_remove(zram); 1370 ret = zram_remove(zram);
1372 else 1371 idr_remove(&zram_index_idr, dev_id);
1372 } else {
1373 ret = -ENODEV; 1373 ret = -ENODEV;
1374 }
1374 1375
1375 mutex_unlock(&zram_index_mutex); 1376 mutex_unlock(&zram_index_mutex);
1376 return ret ? ret : count; 1377 return ret ? ret : count;
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c
index 252eb301470c..32358c5e3db4 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_gem.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c
@@ -14,6 +14,7 @@
14 14
15#include <linux/shmem_fs.h> 15#include <linux/shmem_fs.h>
16#include <linux/dma-buf.h> 16#include <linux/dma-buf.h>
17#include <linux/pfn_t.h>
17#include <drm/exynos_drm.h> 18#include <drm/exynos_drm.h>
18 19
19#include "exynos_drm_drv.h" 20#include "exynos_drm_drv.h"
@@ -490,7 +491,8 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
490 } 491 }
491 492
492 pfn = page_to_pfn(exynos_gem->pages[page_offset]); 493 pfn = page_to_pfn(exynos_gem->pages[page_offset]);
493 ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn); 494 ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
495 __pfn_to_pfn_t(pfn, PFN_DEV));
494 496
495out: 497out:
496 switch (ret) { 498 switch (ret) {
diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c
index ee95c03a8c54..cb95765050cc 100644
--- a/drivers/gpu/drm/gma500/framebuffer.c
+++ b/drivers/gpu/drm/gma500/framebuffer.c
@@ -21,6 +21,7 @@
21#include <linux/kernel.h> 21#include <linux/kernel.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/pfn_t.h>
24#include <linux/mm.h> 25#include <linux/mm.h>
25#include <linux/tty.h> 26#include <linux/tty.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
@@ -132,7 +133,8 @@ static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
132 for (i = 0; i < page_num; i++) { 133 for (i = 0; i < page_num; i++) {
133 pfn = (phys_addr >> PAGE_SHIFT); 134 pfn = (phys_addr >> PAGE_SHIFT);
134 135
135 ret = vm_insert_mixed(vma, address, pfn); 136 ret = vm_insert_mixed(vma, address,
137 __pfn_to_pfn_t(pfn, PFN_DEV));
136 if (unlikely((ret == -EBUSY) || (ret != 0 && i > 0))) 138 if (unlikely((ret == -EBUSY) || (ret != 0 && i > 0)))
137 break; 139 break;
138 else if (unlikely(ret != 0)) { 140 else if (unlikely(ret != 0)) {
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index c76cc853b08a..3cedb8d5c855 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -18,6 +18,7 @@
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/shmem_fs.h> 19#include <linux/shmem_fs.h>
20#include <linux/dma-buf.h> 20#include <linux/dma-buf.h>
21#include <linux/pfn_t.h>
21 22
22#include "msm_drv.h" 23#include "msm_drv.h"
23#include "msm_gem.h" 24#include "msm_gem.h"
@@ -222,7 +223,8 @@ int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
222 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, 223 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
223 pfn, pfn << PAGE_SHIFT); 224 pfn, pfn << PAGE_SHIFT);
224 225
225 ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn); 226 ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
227 __pfn_to_pfn_t(pfn, PFN_DEV));
226 228
227out_unlock: 229out_unlock:
228 mutex_unlock(&dev->struct_mutex); 230 mutex_unlock(&dev->struct_mutex);
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 7ed08fdc4c42..ceba5459ceb7 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/shmem_fs.h> 20#include <linux/shmem_fs.h>
21#include <linux/spinlock.h> 21#include <linux/spinlock.h>
22#include <linux/pfn_t.h>
22 23
23#include <drm/drm_vma_manager.h> 24#include <drm/drm_vma_manager.h>
24 25
@@ -385,7 +386,8 @@ static int fault_1d(struct drm_gem_object *obj,
385 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, 386 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
386 pfn, pfn << PAGE_SHIFT); 387 pfn, pfn << PAGE_SHIFT);
387 388
388 return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn); 389 return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
390 __pfn_to_pfn_t(pfn, PFN_DEV));
389} 391}
390 392
391/* Special handling for the case of faulting in 2d tiled buffers */ 393/* Special handling for the case of faulting in 2d tiled buffers */
@@ -478,7 +480,8 @@ static int fault_2d(struct drm_gem_object *obj,
478 pfn, pfn << PAGE_SHIFT); 480 pfn, pfn << PAGE_SHIFT);
479 481
480 for (i = n; i > 0; i--) { 482 for (i = n; i > 0; i--) {
481 vm_insert_mixed(vma, (unsigned long)vaddr, pfn); 483 vm_insert_mixed(vma, (unsigned long)vaddr,
484 __pfn_to_pfn_t(pfn, PFN_DEV));
482 pfn += usergart[fmt].stride_pfn; 485 pfn += usergart[fmt].stride_pfn;
483 vaddr += PAGE_SIZE * m; 486 vaddr += PAGE_SIZE * m;
484 } 487 }
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 8fb7213277cc..06d26dc438b2 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -35,6 +35,7 @@
35#include <ttm/ttm_placement.h> 35#include <ttm/ttm_placement.h>
36#include <drm/drm_vma_manager.h> 36#include <drm/drm_vma_manager.h>
37#include <linux/mm.h> 37#include <linux/mm.h>
38#include <linux/pfn_t.h>
38#include <linux/rbtree.h> 39#include <linux/rbtree.h>
39#include <linux/module.h> 40#include <linux/module.h>
40#include <linux/uaccess.h> 41#include <linux/uaccess.h>
@@ -229,7 +230,8 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
229 } 230 }
230 231
231 if (vma->vm_flags & VM_MIXEDMAP) 232 if (vma->vm_flags & VM_MIXEDMAP)
232 ret = vm_insert_mixed(&cvma, address, pfn); 233 ret = vm_insert_mixed(&cvma, address,
234 __pfn_to_pfn_t(pfn, PFN_DEV));
233 else 235 else
234 ret = vm_insert_pfn(&cvma, address, pfn); 236 ret = vm_insert_pfn(&cvma, address, pfn);
235 237
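
The exynos, gma500, msm, omapdrm and ttm hunks are all the same mechanical conversion: vm_insert_mixed() now takes a pfn_t, so the drivers wrap their raw pfn with __pfn_to_pfn_t(pfn, PFN_DEV). Sketch of the changed prototype assumed by these call sites (the mm/memory.c side is outside this excerpt):

        int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                            pfn_t pfn);

The conversion is intended to be behaviour-neutral for these drivers; the extra flag bits only matter to callers that map ZONE_DEVICE frames.
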
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index fd01f3493fc7..af7cc1e65656 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -433,16 +433,15 @@ ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals)
433 scale_db = true; 433 scale_db = true;
434 case IIO_VAL_INT_PLUS_MICRO: 434 case IIO_VAL_INT_PLUS_MICRO:
435 if (vals[1] < 0) 435 if (vals[1] < 0)
436 return sprintf(buf, "-%ld.%06u%s\n", abs(vals[0]), 436 return sprintf(buf, "-%d.%06u%s\n", abs(vals[0]),
437 -vals[1], 437 -vals[1], scale_db ? " dB" : "");
438 scale_db ? " dB" : "");
439 else 438 else
440 return sprintf(buf, "%d.%06u%s\n", vals[0], vals[1], 439 return sprintf(buf, "%d.%06u%s\n", vals[0], vals[1],
441 scale_db ? " dB" : ""); 440 scale_db ? " dB" : "");
442 case IIO_VAL_INT_PLUS_NANO: 441 case IIO_VAL_INT_PLUS_NANO:
443 if (vals[1] < 0) 442 if (vals[1] < 0)
444 return sprintf(buf, "-%ld.%09u\n", abs(vals[0]), 443 return sprintf(buf, "-%d.%09u\n", abs(vals[0]),
445 -vals[1]); 444 -vals[1]);
446 else 445 else
447 return sprintf(buf, "%d.%09u\n", vals[0], vals[1]); 446 return sprintf(buf, "%d.%09u\n", vals[0], vals[1]);
448 case IIO_VAL_FRACTIONAL: 447 case IIO_VAL_FRACTIONAL:
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
index 07a4c644fb9b..e9cef9de9ed8 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
@@ -901,7 +901,7 @@ static void iwlagn_gain_computation(struct iwl_priv *priv,
901 /* bound gain by 2 bits value max, 3rd bit is sign */ 901 /* bound gain by 2 bits value max, 3rd bit is sign */
902 data->delta_gain_code[i] = 902 data->delta_gain_code[i] =
903 min(abs(delta_g), 903 min(abs(delta_g),
904 (long) CHAIN_NOISE_MAX_DELTA_GAIN_CODE); 904 (s32) CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
905 905
906 if (delta_g < 0) 906 if (delta_g < 0)
907 /* 907 /*
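
The iio %ld -> %d and iwlwifi (long) -> (s32) fixups both fall out of the abs() rework elsewhere in this patch series: abs() used to evaluate to long regardless of its argument, and is now type-generic, returning a value of the argument's own type. An illustrative sketch (kernel context assumed):

        int ival = -5;          /* like vals[0] in the IIO hunk, an int */
        s64 big  = -42;

        pr_info("%d\n",   abs(ival));   /* abs(int) is now an int  -> %d   */
        pr_info("%lld\n", abs(big));    /* abs(s64) stays 64-bit   -> %lld */

Hence min(abs(delta_g), (s32)CHAIN_NOISE_MAX_DELTA_GAIN_CODE): both sides of the type-checked min() must now agree on s32 rather than long.
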
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index f9b674bc49db..0cc9048b86e2 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -83,8 +83,7 @@ static ssize_t mode_store(struct device *dev,
83 83
84 if (strncmp(buf, "pmem\n", n) == 0 84 if (strncmp(buf, "pmem\n", n) == 0
85 || strncmp(buf, "pmem", n) == 0) { 85 || strncmp(buf, "pmem", n) == 0) {
86 /* TODO: allocate from PMEM support */ 86 nd_pfn->mode = PFN_MODE_PMEM;
87 rc = -ENOTTY;
88 } else if (strncmp(buf, "ram\n", n) == 0 87 } else if (strncmp(buf, "ram\n", n) == 0
89 || strncmp(buf, "ram", n) == 0) 88 || strncmp(buf, "ram", n) == 0)
90 nd_pfn->mode = PFN_MODE_RAM; 89 nd_pfn->mode = PFN_MODE_RAM;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index b493ff3fccb2..7edf31671dab 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -21,10 +21,11 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/platform_device.h> 22#include <linux/platform_device.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/memory_hotplug.h>
25#include <linux/moduleparam.h> 24#include <linux/moduleparam.h>
26#include <linux/badblocks.h> 25#include <linux/badblocks.h>
26#include <linux/memremap.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/pfn_t.h>
28#include <linux/slab.h> 29#include <linux/slab.h>
29#include <linux/pmem.h> 30#include <linux/pmem.h>
30#include <linux/nd.h> 31#include <linux/nd.h>
@@ -40,6 +41,7 @@ struct pmem_device {
40 phys_addr_t phys_addr; 41 phys_addr_t phys_addr;
41 /* when non-zero this device is hosting a 'pfn' instance */ 42 /* when non-zero this device is hosting a 'pfn' instance */
42 phys_addr_t data_offset; 43 phys_addr_t data_offset;
44 unsigned long pfn_flags;
43 void __pmem *virt_addr; 45 void __pmem *virt_addr;
44 size_t size; 46 size_t size;
45 struct badblocks bb; 47 struct badblocks bb;
@@ -135,13 +137,13 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
135} 137}
136 138
137static long pmem_direct_access(struct block_device *bdev, sector_t sector, 139static long pmem_direct_access(struct block_device *bdev, sector_t sector,
138 void __pmem **kaddr, unsigned long *pfn) 140 void __pmem **kaddr, pfn_t *pfn)
139{ 141{
140 struct pmem_device *pmem = bdev->bd_disk->private_data; 142 struct pmem_device *pmem = bdev->bd_disk->private_data;
141 resource_size_t offset = sector * 512 + pmem->data_offset; 143 resource_size_t offset = sector * 512 + pmem->data_offset;
142 144
143 *kaddr = pmem->virt_addr + offset; 145 *kaddr = pmem->virt_addr + offset;
144 *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; 146 *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
145 147
146 return pmem->size - offset; 148 return pmem->size - offset;
147} 149}
@@ -157,6 +159,7 @@ static struct pmem_device *pmem_alloc(struct device *dev,
157 struct resource *res, int id) 159 struct resource *res, int id)
158{ 160{
159 struct pmem_device *pmem; 161 struct pmem_device *pmem;
162 struct request_queue *q;
160 163
161 pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); 164 pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
162 if (!pmem) 165 if (!pmem)
@@ -174,16 +177,26 @@ static struct pmem_device *pmem_alloc(struct device *dev,
174 return ERR_PTR(-EBUSY); 177 return ERR_PTR(-EBUSY);
175 } 178 }
176 179
177 if (pmem_should_map_pages(dev)) 180 q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
178 pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res); 181 if (!q)
179 else 182 return ERR_PTR(-ENOMEM);
183
184 pmem->pfn_flags = PFN_DEV;
185 if (pmem_should_map_pages(dev)) {
186 pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res,
187 &q->q_usage_counter, NULL);
188 pmem->pfn_flags |= PFN_MAP;
189 } else
180 pmem->virt_addr = (void __pmem *) devm_memremap(dev, 190 pmem->virt_addr = (void __pmem *) devm_memremap(dev,
181 pmem->phys_addr, pmem->size, 191 pmem->phys_addr, pmem->size,
182 ARCH_MEMREMAP_PMEM); 192 ARCH_MEMREMAP_PMEM);
183 193
184 if (IS_ERR(pmem->virt_addr)) 194 if (IS_ERR(pmem->virt_addr)) {
195 blk_cleanup_queue(q);
185 return (void __force *) pmem->virt_addr; 196 return (void __force *) pmem->virt_addr;
197 }
186 198
199 pmem->pmem_queue = q;
187 return pmem; 200 return pmem;
188} 201}
189 202
@@ -203,10 +216,6 @@ static int pmem_attach_disk(struct device *dev,
203 int nid = dev_to_node(dev); 216 int nid = dev_to_node(dev);
204 struct gendisk *disk; 217 struct gendisk *disk;
205 218
206 pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid);
207 if (!pmem->pmem_queue)
208 return -ENOMEM;
209
210 blk_queue_make_request(pmem->pmem_queue, pmem_make_request); 219 blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
211 blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE); 220 blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE);
212 blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX); 221 blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
@@ -352,12 +361,17 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
352 struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); 361 struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
353 struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim); 362 struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
354 struct device *dev = &nd_pfn->dev; 363 struct device *dev = &nd_pfn->dev;
355 struct vmem_altmap *altmap;
356 struct nd_region *nd_region; 364 struct nd_region *nd_region;
365 struct vmem_altmap *altmap;
357 struct nd_pfn_sb *pfn_sb; 366 struct nd_pfn_sb *pfn_sb;
358 struct pmem_device *pmem; 367 struct pmem_device *pmem;
368 struct request_queue *q;
359 phys_addr_t offset; 369 phys_addr_t offset;
360 int rc; 370 int rc;
371 struct vmem_altmap __altmap = {
372 .base_pfn = __phys_to_pfn(nsio->res.start),
373 .reserve = __phys_to_pfn(SZ_8K),
374 };
361 375
362 if (!nd_pfn->uuid || !nd_pfn->ndns) 376 if (!nd_pfn->uuid || !nd_pfn->ndns)
363 return -ENODEV; 377 return -ENODEV;
@@ -375,6 +389,17 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
375 return -EINVAL; 389 return -EINVAL;
376 nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); 390 nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
377 altmap = NULL; 391 altmap = NULL;
392 } else if (nd_pfn->mode == PFN_MODE_PMEM) {
393 nd_pfn->npfns = (resource_size(&nsio->res) - offset)
394 / PAGE_SIZE;
395 if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
396 dev_info(&nd_pfn->dev,
397 "number of pfns truncated from %lld to %ld\n",
398 le64_to_cpu(nd_pfn->pfn_sb->npfns),
399 nd_pfn->npfns);
400 altmap = & __altmap;
401 altmap->free = __phys_to_pfn(offset - SZ_8K);
402 altmap->alloc = 0;
378 } else { 403 } else {
379 rc = -ENXIO; 404 rc = -ENXIO;
380 goto err; 405 goto err;
@@ -382,8 +407,11 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
382 407
383 /* establish pfn range for lookup, and switch to direct map */ 408 /* establish pfn range for lookup, and switch to direct map */
384 pmem = dev_get_drvdata(dev); 409 pmem = dev_get_drvdata(dev);
410 q = pmem->pmem_queue;
385 devm_memunmap(dev, (void __force *) pmem->virt_addr); 411 devm_memunmap(dev, (void __force *) pmem->virt_addr);
386 pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res); 412 pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res,
413 &q->q_usage_counter, altmap);
414 pmem->pfn_flags |= PFN_MAP;
387 if (IS_ERR(pmem->virt_addr)) { 415 if (IS_ERR(pmem->virt_addr)) {
388 rc = PTR_ERR(pmem->virt_addr); 416 rc = PTR_ERR(pmem->virt_addr);
389 goto err; 417 goto err;
@@ -424,19 +452,22 @@ static int nd_pmem_probe(struct device *dev)
424 return -ENOMEM; 452 return -ENOMEM;
425 nvdimm_namespace_add_poison(ndns, &pmem->bb, 0); 453 nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
426 454
427 if (is_nd_btt(dev)) 455 if (is_nd_btt(dev)) {
456 /* btt allocates its own request_queue */
457 blk_cleanup_queue(pmem->pmem_queue);
458 pmem->pmem_queue = NULL;
428 return nvdimm_namespace_attach_btt(ndns); 459 return nvdimm_namespace_attach_btt(ndns);
460 }
429 461
430 if (is_nd_pfn(dev)) 462 if (is_nd_pfn(dev))
431 return nvdimm_namespace_attach_pfn(ndns); 463 return nvdimm_namespace_attach_pfn(ndns);
432 464
433 if (nd_btt_probe(ndns, pmem) == 0) { 465 if (nd_btt_probe(ndns, pmem) == 0 || nd_pfn_probe(ndns, pmem) == 0) {
434 /* we'll come back as btt-pmem */ 466 /*
435 return -ENXIO; 467 * We'll come back as either btt-pmem, or pfn-pmem, so
436 } 468 * drop the queue allocation for now.
437 469 */
438 if (nd_pfn_probe(ndns, pmem) == 0) { 470 blk_cleanup_queue(pmem->pmem_queue);
439 /* we'll come back as pfn-pmem */
440 return -ENXIO; 471 return -ENXIO;
441 } 472 }
442 473
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 94a8f4ab57bc..ce7b70181740 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -17,6 +17,7 @@
17#include <linux/completion.h> 17#include <linux/completion.h>
18#include <linux/interrupt.h> 18#include <linux/interrupt.h>
19#include <linux/platform_device.h> 19#include <linux/platform_device.h>
20#include <linux/pfn_t.h>
20#include <asm/extmem.h> 21#include <asm/extmem.h>
21#include <asm/io.h> 22#include <asm/io.h>
22 23
@@ -30,7 +31,7 @@ static void dcssblk_release(struct gendisk *disk, fmode_t mode);
30static blk_qc_t dcssblk_make_request(struct request_queue *q, 31static blk_qc_t dcssblk_make_request(struct request_queue *q,
31 struct bio *bio); 32 struct bio *bio);
32static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, 33static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
33 void __pmem **kaddr, unsigned long *pfn); 34 void __pmem **kaddr, pfn_t *pfn);
34 35
35static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; 36static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
36 37
@@ -883,20 +884,18 @@ fail:
883 884
884static long 885static long
885dcssblk_direct_access (struct block_device *bdev, sector_t secnum, 886dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
886 void __pmem **kaddr, unsigned long *pfn) 887 void __pmem **kaddr, pfn_t *pfn)
887{ 888{
888 struct dcssblk_dev_info *dev_info; 889 struct dcssblk_dev_info *dev_info;
889 unsigned long offset, dev_sz; 890 unsigned long offset, dev_sz;
890 void *addr;
891 891
892 dev_info = bdev->bd_disk->private_data; 892 dev_info = bdev->bd_disk->private_data;
893 if (!dev_info) 893 if (!dev_info)
894 return -ENODEV; 894 return -ENODEV;
895 dev_sz = dev_info->end - dev_info->start; 895 dev_sz = dev_info->end - dev_info->start;
896 offset = secnum * 512; 896 offset = secnum * 512;
897 addr = (void *) (dev_info->start + offset); 897 *kaddr = (void __pmem *) (dev_info->start + offset);
898 *pfn = virt_to_phys(addr) >> PAGE_SHIFT; 898 *pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), PFN_DEV);
899 *kaddr = (void __pmem *) addr;
900 899
901 return dev_sz - offset; 900 return dev_sz - offset;
902} 901}
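
brd, pmem and dcssblk above all converge on the same ->direct_access() contract: report the kernel address plus a flagged pfn_t for a given sector. The fs/block_dev.c hunk below then bundles the in/out arguments into one control structure. A sketch of the two interfaces as the conversions assume them (compare include/linux/blkdev.h):

        struct blk_dax_ctl {
                sector_t sector;        /* in:  512-byte sector on the bdev */
                void __pmem *addr;      /* out: kernel mapping of that sector */
                long size;              /* in:  bytes wanted; result via return value */
                pfn_t pfn;              /* out: frame number plus PFN_DEV/PFN_MAP flags */
        };

        /* block_device_operations method: same shape, new pfn type */
        long (*direct_access)(struct block_device *bdev, sector_t sector,
                              void __pmem **kaddr, pfn_t *pfn);

dax_map_atomic() in the fs/dax.c hunk further down is the main consumer: it fills in sector/size, calls bdev_direct_access(), and keeps the queue pinned via blk_queue_enter() until dax_unmap_atomic().
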
diff --git a/fs/Kconfig b/fs/Kconfig
index 2bb1ef86c411..9adee0d7536e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -50,7 +50,8 @@ config FS_DAX_PMD
50 bool 50 bool
51 default FS_DAX 51 default FS_DAX
52 depends on FS_DAX 52 depends on FS_DAX
53 depends on BROKEN 53 depends on ZONE_DEVICE
54 depends on TRANSPARENT_HUGEPAGE
54 55
55endif # BLOCK 56endif # BLOCK
56 57
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 81c0705558be..530145b607c4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -455,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
455/** 455/**
456 * bdev_direct_access() - Get the address for directly-accessibly memory 456 * bdev_direct_access() - Get the address for directly-accessibly memory
457 * @bdev: The device containing the memory 457 * @bdev: The device containing the memory
458 * @sector: The offset within the device 458 * @dax: control and output parameters for ->direct_access
459 * @addr: Where to put the address of the memory
460 * @pfn: The Page Frame Number for the memory
461 * @size: The number of bytes requested
462 * 459 *
463 * If a block device is made up of directly addressable memory, this function 460 * If a block device is made up of directly addressable memory, this function
464 * will tell the caller the PFN and the address of the memory. The address 461 * will tell the caller the PFN and the address of the memory. The address
@@ -469,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
469 * Return: negative errno if an error occurs, otherwise the number of bytes 466 * Return: negative errno if an error occurs, otherwise the number of bytes
470 * accessible at this address. 467 * accessible at this address.
471 */ 468 */
472long bdev_direct_access(struct block_device *bdev, sector_t sector, 469long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
473 void __pmem **addr, unsigned long *pfn, long size)
474{ 470{
475 long avail; 471 sector_t sector = dax->sector;
472 long avail, size = dax->size;
476 const struct block_device_operations *ops = bdev->bd_disk->fops; 473 const struct block_device_operations *ops = bdev->bd_disk->fops;
477 474
478 /* 475 /*
@@ -491,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
491 sector += get_start_sect(bdev); 488 sector += get_start_sect(bdev);
492 if (sector % (PAGE_SIZE / 512)) 489 if (sector % (PAGE_SIZE / 512))
493 return -EINVAL; 490 return -EINVAL;
494 avail = ops->direct_access(bdev, sector, addr, pfn); 491 avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
495 if (!avail) 492 if (!avail)
496 return -ERANGE; 493 return -ERANGE;
494 if (avail > 0 && avail & ~PAGE_MASK)
495 return -ENXIO;
497 return min(avail, size); 496 return min(avail, size);
498} 497}
499EXPORT_SYMBOL_GPL(bdev_direct_access); 498EXPORT_SYMBOL_GPL(bdev_direct_access);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0068e82217c3..0a2752b79e72 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3391,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
3391 * should have access to this page, we're safe to simply set 3391 * should have access to this page, we're safe to simply set
3392 * PG_locked without checking it first. 3392 * PG_locked without checking it first.
3393 */ 3393 */
3394 __set_page_locked(page); 3394 __SetPageLocked(page);
3395 rc = add_to_page_cache_locked(page, mapping, 3395 rc = add_to_page_cache_locked(page, mapping,
3396 page->index, gfp); 3396 page->index, gfp);
3397 3397
3398 /* give up if we can't stick it in the cache */ 3398 /* give up if we can't stick it in the cache */
3399 if (rc) { 3399 if (rc) {
3400 __clear_page_locked(page); 3400 __ClearPageLocked(page);
3401 return rc; 3401 return rc;
3402 } 3402 }
3403 3403
@@ -3418,9 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
3418 if (*bytes + PAGE_CACHE_SIZE > rsize) 3418 if (*bytes + PAGE_CACHE_SIZE > rsize)
3419 break; 3419 break;
3420 3420
3421 __set_page_locked(page); 3421 __SetPageLocked(page);
3422 if (add_to_page_cache_locked(page, mapping, page->index, gfp)) { 3422 if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
3423 __clear_page_locked(page); 3423 __ClearPageLocked(page);
3424 break; 3424 break;
3425 } 3425 }
3426 list_move_tail(&page->lru, tmplist); 3426 list_move_tail(&page->lru, tmplist);
diff --git a/fs/dax.c b/fs/dax.c
index 43671b68220e..7af879759064 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -28,54 +28,68 @@
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/uio.h> 29#include <linux/uio.h>
30#include <linux/vmstat.h> 30#include <linux/vmstat.h>
31#include <linux/pfn_t.h>
32#include <linux/sizes.h>
33
34static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
35{
36 struct request_queue *q = bdev->bd_queue;
37 long rc = -EIO;
38
39 dax->addr = (void __pmem *) ERR_PTR(-EIO);
40 if (blk_queue_enter(q, true) != 0)
41 return rc;
42
43 rc = bdev_direct_access(bdev, dax);
44 if (rc < 0) {
45 dax->addr = (void __pmem *) ERR_PTR(rc);
46 blk_queue_exit(q);
47 return rc;
48 }
49 return rc;
50}
51
52static void dax_unmap_atomic(struct block_device *bdev,
53 const struct blk_dax_ctl *dax)
54{
55 if (IS_ERR(dax->addr))
56 return;
57 blk_queue_exit(bdev->bd_queue);
58}
31 59
32/* 60/*
33 * dax_clear_blocks() is called from within transaction context from XFS, 61 * dax_clear_blocks() is called from within transaction context from XFS,
34 * and hence this means the stack from this point must follow GFP_NOFS 62 * and hence this means the stack from this point must follow GFP_NOFS
35 * semantics for all operations. 63 * semantics for all operations.
36 */ 64 */
37int dax_clear_blocks(struct inode *inode, sector_t block, long size) 65int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
38{ 66{
39 struct block_device *bdev = inode->i_sb->s_bdev; 67 struct block_device *bdev = inode->i_sb->s_bdev;
40 sector_t sector = block << (inode->i_blkbits - 9); 68 struct blk_dax_ctl dax = {
69 .sector = block << (inode->i_blkbits - 9),
70 .size = _size,
71 };
41 72
42 might_sleep(); 73 might_sleep();
43 do { 74 do {
44 void __pmem *addr; 75 long count, sz;
45 unsigned long pfn;
46 long count;
47 76
48 count = bdev_direct_access(bdev, sector, &addr, &pfn, size); 77 count = dax_map_atomic(bdev, &dax);
49 if (count < 0) 78 if (count < 0)
50 return count; 79 return count;
51 BUG_ON(size < count); 80 sz = min_t(long, count, SZ_128K);
52 while (count > 0) { 81 clear_pmem(dax.addr, sz);
53 unsigned pgsz = PAGE_SIZE - offset_in_page(addr); 82 dax.size -= sz;
54 if (pgsz > count) 83 dax.sector += sz / 512;
55 pgsz = count; 84 dax_unmap_atomic(bdev, &dax);
56 clear_pmem(addr, pgsz); 85 cond_resched();
57 addr += pgsz; 86 } while (dax.size);
58 size -= pgsz;
59 count -= pgsz;
60 BUG_ON(pgsz & 511);
61 sector += pgsz / 512;
62 cond_resched();
63 }
64 } while (size);
65 87
66 wmb_pmem(); 88 wmb_pmem();
67 return 0; 89 return 0;
68} 90}
69EXPORT_SYMBOL_GPL(dax_clear_blocks); 91EXPORT_SYMBOL_GPL(dax_clear_blocks);
70 92
71static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
72 unsigned blkbits)
73{
74 unsigned long pfn;
75 sector_t sector = bh->b_blocknr << (blkbits - 9);
76 return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
77}
78
79/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ 93/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
80static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, 94static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
81 loff_t pos, loff_t end) 95 loff_t pos, loff_t end)
@@ -105,19 +119,29 @@ static bool buffer_size_valid(struct buffer_head *bh)
105 return bh->b_state != 0; 119 return bh->b_state != 0;
106} 120}
107 121
122
123static sector_t to_sector(const struct buffer_head *bh,
124 const struct inode *inode)
125{
126 sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
127
128 return sector;
129}
130
108static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, 131static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
109 loff_t start, loff_t end, get_block_t get_block, 132 loff_t start, loff_t end, get_block_t get_block,
110 struct buffer_head *bh) 133 struct buffer_head *bh)
111{ 134{
112 ssize_t retval = 0; 135 loff_t pos = start, max = start, bh_max = start;
113 loff_t pos = start; 136 bool hole = false, need_wmb = false;
114 loff_t max = start; 137 struct block_device *bdev = NULL;
115 loff_t bh_max = start; 138 int rw = iov_iter_rw(iter), rc;
116 void __pmem *addr; 139 long map_len = 0;
117 bool hole = false; 140 struct blk_dax_ctl dax = {
118 bool need_wmb = false; 141 .addr = (void __pmem *) ERR_PTR(-EIO),
119 142 };
120 if (iov_iter_rw(iter) != WRITE) 143
144 if (rw == READ)
121 end = min(end, i_size_read(inode)); 145 end = min(end, i_size_read(inode));
122 146
123 while (pos < end) { 147 while (pos < end) {
@@ -132,13 +156,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
132 if (pos == bh_max) { 156 if (pos == bh_max) {
133 bh->b_size = PAGE_ALIGN(end - pos); 157 bh->b_size = PAGE_ALIGN(end - pos);
134 bh->b_state = 0; 158 bh->b_state = 0;
135 retval = get_block(inode, block, bh, 159 rc = get_block(inode, block, bh, rw == WRITE);
136 iov_iter_rw(iter) == WRITE); 160 if (rc)
137 if (retval)
138 break; 161 break;
139 if (!buffer_size_valid(bh)) 162 if (!buffer_size_valid(bh))
140 bh->b_size = 1 << blkbits; 163 bh->b_size = 1 << blkbits;
141 bh_max = pos - first + bh->b_size; 164 bh_max = pos - first + bh->b_size;
165 bdev = bh->b_bdev;
142 } else { 166 } else {
143 unsigned done = bh->b_size - 167 unsigned done = bh->b_size -
144 (bh_max - (pos - first)); 168 (bh_max - (pos - first));
@@ -146,47 +170,53 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
146 bh->b_size -= done; 170 bh->b_size -= done;
147 } 171 }
148 172
149 hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh); 173 hole = rw == READ && !buffer_written(bh);
150 if (hole) { 174 if (hole) {
151 addr = NULL;
152 size = bh->b_size - first; 175 size = bh->b_size - first;
153 } else { 176 } else {
154 retval = dax_get_addr(bh, &addr, blkbits); 177 dax_unmap_atomic(bdev, &dax);
155 if (retval < 0) 178 dax.sector = to_sector(bh, inode);
179 dax.size = bh->b_size;
180 map_len = dax_map_atomic(bdev, &dax);
181 if (map_len < 0) {
182 rc = map_len;
156 break; 183 break;
184 }
157 if (buffer_unwritten(bh) || buffer_new(bh)) { 185 if (buffer_unwritten(bh) || buffer_new(bh)) {
158 dax_new_buf(addr, retval, first, pos, 186 dax_new_buf(dax.addr, map_len, first,
159 end); 187 pos, end);
160 need_wmb = true; 188 need_wmb = true;
161 } 189 }
162 addr += first; 190 dax.addr += first;
163 size = retval - first; 191 size = map_len - first;
164 } 192 }
165 max = min(pos + size, end); 193 max = min(pos + size, end);
166 } 194 }
167 195
168 if (iov_iter_rw(iter) == WRITE) { 196 if (iov_iter_rw(iter) == WRITE) {
169 len = copy_from_iter_pmem(addr, max - pos, iter); 197 len = copy_from_iter_pmem(dax.addr, max - pos, iter);
170 need_wmb = true; 198 need_wmb = true;
171 } else if (!hole) 199 } else if (!hole)
172 len = copy_to_iter((void __force *)addr, max - pos, 200 len = copy_to_iter((void __force *) dax.addr, max - pos,
173 iter); 201 iter);
174 else 202 else
175 len = iov_iter_zero(max - pos, iter); 203 len = iov_iter_zero(max - pos, iter);
176 204
177 if (!len) { 205 if (!len) {
178 retval = -EFAULT; 206 rc = -EFAULT;
179 break; 207 break;
180 } 208 }
181 209
182 pos += len; 210 pos += len;
183 addr += len; 211 if (!IS_ERR(dax.addr))
212 dax.addr += len;
184 } 213 }
185 214
186 if (need_wmb) 215 if (need_wmb)
187 wmb_pmem(); 216 wmb_pmem();
217 dax_unmap_atomic(bdev, &dax);
188 218
189 return (pos == start) ? retval : pos - start; 219 return (pos == start) ? rc : pos - start;
190} 220}
191 221
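The rename from 'retval' to 'rc' does not change dax_io()'s return convention: an error is only surfaced when no bytes were transferred, otherwise partial progress wins. A standalone sketch of that final expression (userspace C; -14 stands in for -EFAULT):

#include <stdio.h>

static long long dax_io_result(long long start, long long pos, int rc)
{
	return (pos == start) ? rc : pos - start;
}

int main(void)
{
	printf("%lld\n", dax_io_result(0, 0, -14));	/* no progress: the error (-EFAULT) */
	printf("%lld\n", dax_io_result(0, 4096, -14));	/* progress made: 4096 */
	return 0;
}
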
192/** 222/**
@@ -275,28 +305,35 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
275 return VM_FAULT_LOCKED; 305 return VM_FAULT_LOCKED;
276} 306}
277 307
278static int copy_user_bh(struct page *to, struct buffer_head *bh, 308static int copy_user_bh(struct page *to, struct inode *inode,
279 unsigned blkbits, unsigned long vaddr) 309 struct buffer_head *bh, unsigned long vaddr)
280{ 310{
281 void __pmem *vfrom; 311 struct blk_dax_ctl dax = {
312 .sector = to_sector(bh, inode),
313 .size = bh->b_size,
314 };
315 struct block_device *bdev = bh->b_bdev;
282 void *vto; 316 void *vto;
283 317
284 if (dax_get_addr(bh, &vfrom, blkbits) < 0) 318 if (dax_map_atomic(bdev, &dax) < 0)
285 return -EIO; 319 return PTR_ERR(dax.addr);
286 vto = kmap_atomic(to); 320 vto = kmap_atomic(to);
287 copy_user_page(vto, (void __force *)vfrom, vaddr, to); 321 copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
288 kunmap_atomic(vto); 322 kunmap_atomic(vto);
323 dax_unmap_atomic(bdev, &dax);
289 return 0; 324 return 0;
290} 325}
291 326
292static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, 327static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
293 struct vm_area_struct *vma, struct vm_fault *vmf) 328 struct vm_area_struct *vma, struct vm_fault *vmf)
294{ 329{
295 struct address_space *mapping = inode->i_mapping;
296 sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
297 unsigned long vaddr = (unsigned long)vmf->virtual_address; 330 unsigned long vaddr = (unsigned long)vmf->virtual_address;
298 void __pmem *addr; 331 struct address_space *mapping = inode->i_mapping;
299 unsigned long pfn; 332 struct block_device *bdev = bh->b_bdev;
333 struct blk_dax_ctl dax = {
334 .sector = to_sector(bh, inode),
335 .size = bh->b_size,
336 };
300 pgoff_t size; 337 pgoff_t size;
301 int error; 338 int error;
302 339
@@ -315,20 +352,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
315 goto out; 352 goto out;
316 } 353 }
317 354
318 error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); 355 if (dax_map_atomic(bdev, &dax) < 0) {
319 if (error < 0) 356 error = PTR_ERR(dax.addr);
320 goto out;
321 if (error < PAGE_SIZE) {
322 error = -EIO;
323 goto out; 357 goto out;
324 } 358 }
325 359
326 if (buffer_unwritten(bh) || buffer_new(bh)) { 360 if (buffer_unwritten(bh) || buffer_new(bh)) {
327 clear_pmem(addr, PAGE_SIZE); 361 clear_pmem(dax.addr, PAGE_SIZE);
328 wmb_pmem(); 362 wmb_pmem();
329 } 363 }
364 dax_unmap_atomic(bdev, &dax);
330 365
331 error = vm_insert_mixed(vma, vaddr, pfn); 366 error = vm_insert_mixed(vma, vaddr, dax.pfn);
332 367
333 out: 368 out:
334 i_mmap_unlock_read(mapping); 369 i_mmap_unlock_read(mapping);
@@ -422,7 +457,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
422 if (vmf->cow_page) { 457 if (vmf->cow_page) {
423 struct page *new_page = vmf->cow_page; 458 struct page *new_page = vmf->cow_page;
424 if (buffer_written(&bh)) 459 if (buffer_written(&bh))
425 error = copy_user_bh(new_page, &bh, blkbits, vaddr); 460 error = copy_user_bh(new_page, inode, &bh, vaddr);
426 else 461 else
427 clear_user_highpage(new_page, vaddr); 462 clear_user_highpage(new_page, vaddr);
428 if (error) 463 if (error)
@@ -523,6 +558,24 @@ EXPORT_SYMBOL_GPL(dax_fault);
523 */ 558 */
524#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) 559#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
525 560
561static void __dax_dbg(struct buffer_head *bh, unsigned long address,
562 const char *reason, const char *fn)
563{
564 if (bh) {
565 char bname[BDEVNAME_SIZE];
566 bdevname(bh->b_bdev, bname);
567 pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
568 "length %zd fallback: %s\n", fn, current->comm,
569 address, bname, bh->b_state, (u64)bh->b_blocknr,
570 bh->b_size, reason);
571 } else {
572 pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
573 current->comm, address, reason);
574 }
575}
576
577#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
578
526int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, 579int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
527 pmd_t *pmd, unsigned int flags, get_block_t get_block, 580 pmd_t *pmd, unsigned int flags, get_block_t get_block,
528 dax_iodone_t complete_unwritten) 581 dax_iodone_t complete_unwritten)
@@ -534,41 +587,49 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
534 unsigned blkbits = inode->i_blkbits; 587 unsigned blkbits = inode->i_blkbits;
535 unsigned long pmd_addr = address & PMD_MASK; 588 unsigned long pmd_addr = address & PMD_MASK;
536 bool write = flags & FAULT_FLAG_WRITE; 589 bool write = flags & FAULT_FLAG_WRITE;
537 long length; 590 struct block_device *bdev;
538 void __pmem *kaddr;
539 pgoff_t size, pgoff; 591 pgoff_t size, pgoff;
540 sector_t block, sector; 592 sector_t block;
541 unsigned long pfn;
542 int result = 0; 593 int result = 0;
543 594
544 /* dax pmd mappings are broken wrt gup and fork */ 595 /* dax pmd mappings require pfn_t_devmap() */
545 if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) 596 if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
546 return VM_FAULT_FALLBACK; 597 return VM_FAULT_FALLBACK;
547 598
548 /* Fall back to PTEs if we're going to COW */ 599 /* Fall back to PTEs if we're going to COW */
549 if (write && !(vma->vm_flags & VM_SHARED)) 600 if (write && !(vma->vm_flags & VM_SHARED)) {
601 split_huge_pmd(vma, pmd, address);
602 dax_pmd_dbg(NULL, address, "cow write");
550 return VM_FAULT_FALLBACK; 603 return VM_FAULT_FALLBACK;
604 }
551 /* If the PMD would extend outside the VMA */ 605 /* If the PMD would extend outside the VMA */
552 if (pmd_addr < vma->vm_start) 606 if (pmd_addr < vma->vm_start) {
607 dax_pmd_dbg(NULL, address, "vma start unaligned");
553 return VM_FAULT_FALLBACK; 608 return VM_FAULT_FALLBACK;
554 if ((pmd_addr + PMD_SIZE) > vma->vm_end) 609 }
610 if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
611 dax_pmd_dbg(NULL, address, "vma end unaligned");
555 return VM_FAULT_FALLBACK; 612 return VM_FAULT_FALLBACK;
613 }
556 614
557 pgoff = linear_page_index(vma, pmd_addr); 615 pgoff = linear_page_index(vma, pmd_addr);
558 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 616 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
559 if (pgoff >= size) 617 if (pgoff >= size)
560 return VM_FAULT_SIGBUS; 618 return VM_FAULT_SIGBUS;
561 /* If the PMD would cover blocks out of the file */ 619 /* If the PMD would cover blocks out of the file */
562 if ((pgoff | PG_PMD_COLOUR) >= size) 620 if ((pgoff | PG_PMD_COLOUR) >= size) {
621 dax_pmd_dbg(NULL, address,
622 "offset + huge page size > file size");
563 return VM_FAULT_FALLBACK; 623 return VM_FAULT_FALLBACK;
624 }
564 625
565 memset(&bh, 0, sizeof(bh)); 626 memset(&bh, 0, sizeof(bh));
566 block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); 627 block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
567 628
568 bh.b_size = PMD_SIZE; 629 bh.b_size = PMD_SIZE;
569 length = get_block(inode, block, &bh, write); 630 if (get_block(inode, block, &bh, write) != 0)
570 if (length)
571 return VM_FAULT_SIGBUS; 631 return VM_FAULT_SIGBUS;
632 bdev = bh.b_bdev;
572 i_mmap_lock_read(mapping); 633 i_mmap_lock_read(mapping);
573 634
574 /* 635 /*
@@ -576,8 +637,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
576 * just fall back to PTEs. Calling get_block 512 times in a loop 637 * just fall back to PTEs. Calling get_block 512 times in a loop
577 * would be silly. 638 * would be silly.
578 */ 639 */
579 if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) 640 if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
641 dax_pmd_dbg(&bh, address, "allocated block too small");
580 goto fallback; 642 goto fallback;
643 }
581 644
582 /* 645 /*
583 * If we allocated new storage, make sure no process has any 646 * If we allocated new storage, make sure no process has any
@@ -600,57 +663,82 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
600 result = VM_FAULT_SIGBUS; 663 result = VM_FAULT_SIGBUS;
601 goto out; 664 goto out;
602 } 665 }
603 if ((pgoff | PG_PMD_COLOUR) >= size) 666 if ((pgoff | PG_PMD_COLOUR) >= size) {
667 dax_pmd_dbg(&bh, address, "pgoff unaligned");
604 goto fallback; 668 goto fallback;
669 }
605 670
606 if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { 671 if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
607 spinlock_t *ptl; 672 spinlock_t *ptl;
608 pmd_t entry; 673 pmd_t entry;
609 struct page *zero_page = get_huge_zero_page(); 674 struct page *zero_page = get_huge_zero_page();
610 675
611 if (unlikely(!zero_page)) 676 if (unlikely(!zero_page)) {
677 dax_pmd_dbg(&bh, address, "no zero page");
612 goto fallback; 678 goto fallback;
679 }
613 680
614 ptl = pmd_lock(vma->vm_mm, pmd); 681 ptl = pmd_lock(vma->vm_mm, pmd);
615 if (!pmd_none(*pmd)) { 682 if (!pmd_none(*pmd)) {
616 spin_unlock(ptl); 683 spin_unlock(ptl);
684 dax_pmd_dbg(&bh, address, "pmd already present");
617 goto fallback; 685 goto fallback;
618 } 686 }
619 687
688 dev_dbg(part_to_dev(bdev->bd_part),
689 "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
690 __func__, current->comm, address,
691 (unsigned long long) to_sector(&bh, inode));
692
620 entry = mk_pmd(zero_page, vma->vm_page_prot); 693 entry = mk_pmd(zero_page, vma->vm_page_prot);
621 entry = pmd_mkhuge(entry); 694 entry = pmd_mkhuge(entry);
622 set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); 695 set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
623 result = VM_FAULT_NOPAGE; 696 result = VM_FAULT_NOPAGE;
624 spin_unlock(ptl); 697 spin_unlock(ptl);
625 } else { 698 } else {
626 sector = bh.b_blocknr << (blkbits - 9); 699 struct blk_dax_ctl dax = {
627 length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, 700 .sector = to_sector(&bh, inode),
628 bh.b_size); 701 .size = PMD_SIZE,
702 };
703 long length = dax_map_atomic(bdev, &dax);
704
629 if (length < 0) { 705 if (length < 0) {
630 result = VM_FAULT_SIGBUS; 706 result = VM_FAULT_SIGBUS;
631 goto out; 707 goto out;
632 } 708 }
633 if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) 709 if (length < PMD_SIZE) {
710 dax_pmd_dbg(&bh, address, "dax-length too small");
711 dax_unmap_atomic(bdev, &dax);
634 goto fallback; 712 goto fallback;
713 }
714 if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
715 dax_pmd_dbg(&bh, address, "pfn unaligned");
716 dax_unmap_atomic(bdev, &dax);
717 goto fallback;
718 }
635 719
636 /* 720 if (!pfn_t_devmap(dax.pfn)) {
637 * TODO: teach vmf_insert_pfn_pmd() to support 721 dax_unmap_atomic(bdev, &dax);
638 * 'pte_special' for pmds 722 dax_pmd_dbg(&bh, address, "pfn not in memmap");
639 */
640 if (pfn_valid(pfn))
641 goto fallback; 723 goto fallback;
724 }
642 725
643 if (buffer_unwritten(&bh) || buffer_new(&bh)) { 726 if (buffer_unwritten(&bh) || buffer_new(&bh)) {
644 int i; 727 clear_pmem(dax.addr, PMD_SIZE);
645 for (i = 0; i < PTRS_PER_PMD; i++)
646 clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
647 wmb_pmem(); 728 wmb_pmem();
648 count_vm_event(PGMAJFAULT); 729 count_vm_event(PGMAJFAULT);
649 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 730 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
650 result |= VM_FAULT_MAJOR; 731 result |= VM_FAULT_MAJOR;
651 } 732 }
652 733 dax_unmap_atomic(bdev, &dax);
653 result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write); 734
735 dev_dbg(part_to_dev(bdev->bd_part),
736 "%s: %s addr: %lx pfn: %lx sect: %llx\n",
737 __func__, current->comm, address,
738 pfn_t_to_pfn(dax.pfn),
739 (unsigned long long) dax.sector);
740 result |= vmf_insert_pfn_pmd(vma, address, pmd,
741 dax.pfn, write);
654 } 742 }
655 743
656 out: 744 out:
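Two of the fallback conditions that now log through dax_pmd_dbg() are pure arithmetic: the 2M-aligned chunk of the file around the faulting offset must lie entirely below the file size, and the pfn the driver hands back must itself be 2M-aligned. A standalone sketch of both tests (userspace C; 4K pages and 2M PMDs are assumed only for illustration):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21
#define PG_PMD_COLOUR	((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1)	/* 511 */

int main(void)
{
	unsigned long size = 4096;	/* file size in pages (16 MiB of 4K pages) */
	unsigned long pgoff = 1024;	/* faulting page offset, 2M chunk fits in the file */
	unsigned long pfn = 0x12345;	/* hypothetical pfn, not 2M-aligned */

	printf("chunk inside file: %d\n", (pgoff | PG_PMD_COLOUR) < size);
	printf("pfn aligned:       %d\n", (pfn & PG_PMD_COLOUR) == 0);
	return 0;
}
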
@@ -752,12 +840,17 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
752 if (err < 0) 840 if (err < 0)
753 return err; 841 return err;
754 if (buffer_written(&bh)) { 842 if (buffer_written(&bh)) {
755 void __pmem *addr; 843 struct block_device *bdev = bh.b_bdev;
756 err = dax_get_addr(&bh, &addr, inode->i_blkbits); 844 struct blk_dax_ctl dax = {
757 if (err < 0) 845 .sector = to_sector(&bh, inode),
758 return err; 846 .size = PAGE_CACHE_SIZE,
759 clear_pmem(addr + offset, length); 847 };
848
849 if (dax_map_atomic(bdev, &dax) < 0)
850 return PTR_ERR(dax.addr);
851 clear_pmem(dax.addr + offset, length);
760 wmb_pmem(); 852 wmb_pmem();
853 dax_unmap_atomic(bdev, &dax);
761 } 854 }
762 855
763 return 0; 856 return 0;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 023f6a1f23cd..6915c950e6e8 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -677,9 +677,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
677 if (!wbc->wb) 677 if (!wbc->wb)
678 return; 678 return;
679 679
680 rcu_read_lock();
681 id = mem_cgroup_css_from_page(page)->id; 680 id = mem_cgroup_css_from_page(page)->id;
682 rcu_read_unlock();
683 681
684 if (id == wbc->wb_id) { 682 if (id == wbc->wb_id) {
685 wbc->wb_bytes += bytes; 683 wbc->wb_bytes += bytes;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 47789292a582..8bbf7f3e2a27 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -324,11 +324,48 @@ static void remove_huge_page(struct page *page)
324 delete_from_page_cache(page); 324 delete_from_page_cache(page);
325} 325}
326 326
327static void
328hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
329{
330 struct vm_area_struct *vma;
331
332 /*
333 * end == 0 indicates that the entire range after
334 * start should be unmapped.
335 */
336 vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
337 unsigned long v_offset;
338 unsigned long v_end;
339
340 /*
341 * Can the expression below overflow on 32-bit arches?
342 * No, because the interval tree returns us only those vmas
343 * which overlap the truncated area starting at pgoff,
344 * and no vma on a 32-bit arch can span beyond the 4GB.
345 */
346 if (vma->vm_pgoff < start)
347 v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
348 else
349 v_offset = 0;
350
351 if (!end)
352 v_end = vma->vm_end;
353 else {
354 v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
355 + vma->vm_start;
356 if (v_end > vma->vm_end)
357 v_end = vma->vm_end;
358 }
359
360 unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
361 NULL);
362 }
363}
327 364
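hugetlb_vmdelete_list() above (moved up so the truncate and hole-punch paths can share it) converts a page-offset range into per-VMA virtual addresses; the rewritten version computes a local v_end based on vm_pgoff instead of reusing and modifying 'end' as the copy being removed further down did. A standalone sketch of that arithmetic (userspace C, hypothetical values, 4K pages assumed):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* hypothetical VMA mapping file pages [256, 768) */
	unsigned long vm_pgoff = 256;
	unsigned long vm_start = 0x70000000UL;
	unsigned long vm_end = vm_start + ((768 - 256) << PAGE_SHIFT);
	unsigned long start = 300, end = 500;	/* hole-punch range, in pages */
	unsigned long v_offset, v_end;

	v_offset = (vm_pgoff < start) ? (start - vm_pgoff) << PAGE_SHIFT : 0;

	if (!end)
		v_end = vm_end;			/* 0 means "to the end of the file" */
	else {
		v_end = ((end - vm_pgoff) << PAGE_SHIFT) + vm_start;
		if (v_end > vm_end)
			v_end = vm_end;
	}

	printf("unmap [%#lx, %#lx)\n", vm_start + v_offset, v_end);
	return 0;
}
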
328/* 365/*
329 * remove_inode_hugepages handles two distinct cases: truncation and hole 366 * remove_inode_hugepages handles two distinct cases: truncation and hole
330 * punch. There are subtle differences in operation for each case. 367 * punch. There are subtle differences in operation for each case.
331 368 *
332 * truncation is indicated by end of range being LLONG_MAX 369 * truncation is indicated by end of range being LLONG_MAX
333 * In this case, we first scan the range and release found pages. 370 * In this case, we first scan the range and release found pages.
334 * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv 371 * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
@@ -379,6 +416,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
379 416
380 for (i = 0; i < pagevec_count(&pvec); ++i) { 417 for (i = 0; i < pagevec_count(&pvec); ++i) {
381 struct page *page = pvec.pages[i]; 418 struct page *page = pvec.pages[i];
419 bool rsv_on_error;
382 u32 hash; 420 u32 hash;
383 421
384 /* 422 /*
@@ -395,37 +433,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
395 mapping, next, 0); 433 mapping, next, 0);
396 mutex_lock(&hugetlb_fault_mutex_table[hash]); 434 mutex_lock(&hugetlb_fault_mutex_table[hash]);
397 435
398 lock_page(page); 436 /*
399 if (likely(!page_mapped(page))) { 437 * If page is mapped, it was faulted in after being
400 bool rsv_on_error = !PagePrivate(page); 438 * unmapped in caller. Unmap (again) now after taking
401 /* 439 * the fault mutex. The mutex will prevent faults
402 * We must free the huge page and remove 440 * until we finish removing the page.
403 * from page cache (remove_huge_page) BEFORE 441 *
404 * removing the region/reserve map 442 * This race can only happen in the hole punch case.
405 * (hugetlb_unreserve_pages). In rare out 443 * Getting here in a truncate operation is a bug.
406 * of memory conditions, removal of the 444 */
407 * region/reserve map could fail. Before 445 if (unlikely(page_mapped(page))) {
408 * free'ing the page, note PagePrivate which
409 * is used in case of error.
410 */
411 remove_huge_page(page);
412 freed++;
413 if (!truncate_op) {
414 if (unlikely(hugetlb_unreserve_pages(
415 inode, next,
416 next + 1, 1)))
417 hugetlb_fix_reserve_counts(
418 inode, rsv_on_error);
419 }
420 } else {
421 /*
422 * If page is mapped, it was faulted in after
423 * being unmapped. It indicates a race between
424 * hole punch and page fault. Do nothing in
425 * this case. Getting here in a truncate
426 * operation is a bug.
427 */
428 BUG_ON(truncate_op); 446 BUG_ON(truncate_op);
447
448 i_mmap_lock_write(mapping);
449 hugetlb_vmdelete_list(&mapping->i_mmap,
450 next * pages_per_huge_page(h),
451 (next + 1) * pages_per_huge_page(h));
452 i_mmap_unlock_write(mapping);
453 }
454
455 lock_page(page);
456 /*
457 * We must free the huge page and remove from page
458 * cache (remove_huge_page) BEFORE removing the
459 * region/reserve map (hugetlb_unreserve_pages). In
460 * rare out of memory conditions, removal of the
461 * region/reserve map could fail. Before free'ing
462 * the page, note PagePrivate which is used in case
463 * of error.
464 */
465 rsv_on_error = !PagePrivate(page);
466 remove_huge_page(page);
467 freed++;
468 if (!truncate_op) {
469 if (unlikely(hugetlb_unreserve_pages(inode,
470 next, next + 1, 1)))
471 hugetlb_fix_reserve_counts(inode,
472 rsv_on_error);
429 } 473 }
430 474
431 unlock_page(page); 475 unlock_page(page);
@@ -452,41 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
452 clear_inode(inode); 496 clear_inode(inode);
453} 497}
454 498
455static inline void
456hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
457{
458 struct vm_area_struct *vma;
459
460 /*
461 * end == 0 indicates that the entire range after
462 * start should be unmapped.
463 */
464 vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
465 unsigned long v_offset;
466
467 /*
468 * Can the expression below overflow on 32-bit arches?
469 * No, because the interval tree returns us only those vmas
470 * which overlap the truncated area starting at pgoff,
471 * and no vma on a 32-bit arch can span beyond the 4GB.
472 */
473 if (vma->vm_pgoff < start)
474 v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
475 else
476 v_offset = 0;
477
478 if (end) {
479 end = ((end - start) << PAGE_SHIFT) +
480 vma->vm_start + v_offset;
481 if (end > vma->vm_end)
482 end = vma->vm_end;
483 } else
484 end = vma->vm_end;
485
486 unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
487 }
488}
489
490static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) 499static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
491{ 500{
492 pgoff_t pgoff; 501 pgoff_t pgoff;
@@ -708,7 +717,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
708/* 717/*
709 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never 718 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
710 * be taken from reclaim -- unlike regular filesystems. This needs an 719 * be taken from reclaim -- unlike regular filesystems. This needs an
711 * annotation because huge_pmd_share() does an allocation under 720 * annotation because huge_pmd_share() does an allocation under hugetlb's
712 * i_mmap_rwsem. 721 * i_mmap_rwsem.
713 */ 722 */
714static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; 723static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 93484034a03d..b2855eea5405 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
103 * pseudo flags for the well known (anonymous) memory mapped pages 103 * pseudo flags for the well known (anonymous) memory mapped pages
104 * 104 *
105 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the 105 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
106 * simple test in page_mapped() is not enough. 106 * simple test in page_mapcount() is not enough.
107 */ 107 */
108 if (!PageSlab(page) && page_mapped(page)) 108 if (!PageSlab(page) && page_mapcount(page))
109 u |= 1 << KPF_MMAP; 109 u |= 1 << KPF_MMAP;
110 if (PageAnon(page)) 110 if (PageAnon(page))
111 u |= 1 << KPF_ANON; 111 u |= 1 << KPF_ANON;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a353b4c6e86e..65a1b6c69c11 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -466,9 +466,10 @@ struct mem_size_stats {
466}; 466};
467 467
468static void smaps_account(struct mem_size_stats *mss, struct page *page, 468static void smaps_account(struct mem_size_stats *mss, struct page *page,
469 unsigned long size, bool young, bool dirty) 469 bool compound, bool young, bool dirty)
470{ 470{
471 int mapcount; 471 int i, nr = compound ? HPAGE_PMD_NR : 1;
472 unsigned long size = nr * PAGE_SIZE;
472 473
473 if (PageAnon(page)) 474 if (PageAnon(page))
474 mss->anonymous += size; 475 mss->anonymous += size;
@@ -477,23 +478,37 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
477 /* Accumulate the size in pages that have been accessed. */ 478 /* Accumulate the size in pages that have been accessed. */
478 if (young || page_is_young(page) || PageReferenced(page)) 479 if (young || page_is_young(page) || PageReferenced(page))
479 mss->referenced += size; 480 mss->referenced += size;
480 mapcount = page_mapcount(page);
481 if (mapcount >= 2) {
482 u64 pss_delta;
483 481
484 if (dirty || PageDirty(page)) 482 /*
485 mss->shared_dirty += size; 483 * page_count(page) == 1 guarantees the page is mapped exactly once.
486 else 484 * If any subpage of the compound page mapped with PTE it would elevate
487 mss->shared_clean += size; 485 * page_count().
488 pss_delta = (u64)size << PSS_SHIFT; 486 */
489 do_div(pss_delta, mapcount); 487 if (page_count(page) == 1) {
490 mss->pss += pss_delta;
491 } else {
492 if (dirty || PageDirty(page)) 488 if (dirty || PageDirty(page))
493 mss->private_dirty += size; 489 mss->private_dirty += size;
494 else 490 else
495 mss->private_clean += size; 491 mss->private_clean += size;
496 mss->pss += (u64)size << PSS_SHIFT; 492 mss->pss += (u64)size << PSS_SHIFT;
493 return;
494 }
495
496 for (i = 0; i < nr; i++, page++) {
497 int mapcount = page_mapcount(page);
498
499 if (mapcount >= 2) {
500 if (dirty || PageDirty(page))
501 mss->shared_dirty += PAGE_SIZE;
502 else
503 mss->shared_clean += PAGE_SIZE;
504 mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
505 } else {
506 if (dirty || PageDirty(page))
507 mss->private_dirty += PAGE_SIZE;
508 else
509 mss->private_clean += PAGE_SIZE;
510 mss->pss += PAGE_SIZE << PSS_SHIFT;
511 }
497 } 512 }
498} 513}
499 514
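With the THP rework a compound page no longer has one meaningful mapcount, so smaps_account() now walks the subpages: the page_count() == 1 case is charged entirely to the process, otherwise each 4K subpage adds PAGE_SIZE/mapcount of proportional set size. A standalone sketch of that accumulation (userspace C; PSS_SHIFT 12 matches the value defined earlier in fs/proc/task_mmu.c):

#include <stdio.h>

#define PAGE_SIZE	4096ULL
#define PSS_SHIFT	12

int main(void)
{
	/* hypothetical mapcounts for four subpages of a compound page */
	int mapcount[] = { 1, 2, 2, 4 };
	unsigned long long pss = 0;
	unsigned int i;

	for (i = 0; i < sizeof(mapcount) / sizeof(mapcount[0]); i++)
		pss += (PAGE_SIZE << PSS_SHIFT) / mapcount[i];

	/* 4096 + 2048 + 2048 + 1024 = 9216 bytes of proportional set size */
	printf("pss = %llu bytes\n", pss >> PSS_SHIFT);
	return 0;
}
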
@@ -554,7 +569,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
554 569
555 if (!page) 570 if (!page)
556 return; 571 return;
557 smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); 572
573 smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
558} 574}
559 575
560#ifdef CONFIG_TRANSPARENT_HUGEPAGE 576#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -570,8 +586,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
570 if (IS_ERR_OR_NULL(page)) 586 if (IS_ERR_OR_NULL(page))
571 return; 587 return;
572 mss->anonymous_thp += HPAGE_PMD_SIZE; 588 mss->anonymous_thp += HPAGE_PMD_SIZE;
573 smaps_account(mss, page, HPAGE_PMD_SIZE, 589 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
574 pmd_young(*pmd), pmd_dirty(*pmd));
575} 590}
576#else 591#else
577static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, 592static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -587,7 +602,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
587 pte_t *pte; 602 pte_t *pte;
588 spinlock_t *ptl; 603 spinlock_t *ptl;
589 604
590 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 605 if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
591 smaps_pmd_entry(pmd, addr, walk); 606 smaps_pmd_entry(pmd, addr, walk);
592 spin_unlock(ptl); 607 spin_unlock(ptl);
593 return 0; 608 return 0;
@@ -898,7 +913,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
898 spinlock_t *ptl; 913 spinlock_t *ptl;
899 struct page *page; 914 struct page *page;
900 915
901 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 916 if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
902 if (cp->type == CLEAR_REFS_SOFT_DIRTY) { 917 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
903 clear_soft_dirty_pmd(vma, addr, pmd); 918 clear_soft_dirty_pmd(vma, addr, pmd);
904 goto out; 919 goto out;
@@ -1172,7 +1187,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
1172 int err = 0; 1187 int err = 0;
1173 1188
1174#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1189#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1175 if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) { 1190 if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
1176 u64 flags = 0, frame = 0; 1191 u64 flags = 0, frame = 0;
1177 pmd_t pmd = *pmdp; 1192 pmd_t pmd = *pmdp;
1178 1193
@@ -1504,7 +1519,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1504 pte_t *orig_pte; 1519 pte_t *orig_pte;
1505 pte_t *pte; 1520 pte_t *pte;
1506 1521
1507 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1522 if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
1508 pte_t huge_pte = *(pte_t *)pmd; 1523 pte_t huge_pte = *(pte_t *)pmd;
1509 struct page *page; 1524 struct page *page;
1510 1525
diff --git a/fs/stat.c b/fs/stat.c
index d4a61d8dc021..bc045c7994e1 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -219,7 +219,7 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
219# define choose_32_64(a,b) b 219# define choose_32_64(a,b) b
220#endif 220#endif
221 221
222#define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x) 222#define valid_dev(x) choose_32_64(old_valid_dev(x),true)
223#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) 223#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
224 224
225#ifndef INIT_STRUCT_STAT_PADDING 225#ifndef INIT_STRUCT_STAT_PADDING
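valid_dev() changes because new_valid_dev() (removed from kdev_t.h later in this series) always returned true, so on 64-bit configurations the check collapses to a constant; 32-bit builds still go through old_valid_dev(). A standalone sketch of the collapsed macro (userspace C; the MAJOR/MINOR helpers mirror the kernel's 12:20 dev_t split and the __SIZEOF_LONG__ test stands in for the kernel's BITS_PER_LONG check, both reimplemented here only for illustration):

#include <stdio.h>
#include <stdbool.h>

#define MINORBITS	20
#define MAJOR(dev)	((unsigned int)((dev) >> MINORBITS))
#define MINOR(dev)	((unsigned int)((dev) & ((1U << MINORBITS) - 1)))

static bool old_valid_dev(unsigned int dev)
{
	return MAJOR(dev) < 256 && MINOR(dev) < 256;	/* fits the old 8:8 encoding */
}

#if __SIZEOF_LONG__ == 4
# define choose_32_64(a, b) a
#else
# define choose_32_64(a, b) b
#endif

#define valid_dev(x)	choose_32_64(old_valid_dev(x), true)

int main(void)
{
	unsigned int dev = (300U << MINORBITS) | 5;	/* major 300 does not fit 8 bits */

	printf("valid_dev: %d\n", valid_dev(dev));	/* 1 on 64-bit, 0 on 32-bit */
	return 0;
}
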
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 3a6803cb0ec9..0b3c0d39ef75 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_GENERIC_PGTABLE_H 1#ifndef _ASM_GENERIC_PGTABLE_H
2#define _ASM_GENERIC_PGTABLE_H 2#define _ASM_GENERIC_PGTABLE_H
3 3
4#include <linux/pfn.h>
5
4#ifndef __ASSEMBLY__ 6#ifndef __ASSEMBLY__
5#ifdef CONFIG_MMU 7#ifdef CONFIG_MMU
6 8
@@ -207,11 +209,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
207#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 209#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
208#endif 210#endif
209 211
210#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
211extern void pmdp_splitting_flush(struct vm_area_struct *vma,
212 unsigned long address, pmd_t *pmdp);
213#endif
214
215#ifndef pmdp_collapse_flush 212#ifndef pmdp_collapse_flush
216#ifdef CONFIG_TRANSPARENT_HUGEPAGE 213#ifdef CONFIG_TRANSPARENT_HUGEPAGE
217extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, 214extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
@@ -554,7 +551,7 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
554 * by vm_insert_pfn(). 551 * by vm_insert_pfn().
555 */ 552 */
556static inline int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, 553static inline int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
557 unsigned long pfn) 554 pfn_t pfn)
558{ 555{
559 return 0; 556 return 0;
560} 557}
@@ -589,7 +586,7 @@ extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
589 unsigned long pfn, unsigned long addr, 586 unsigned long pfn, unsigned long addr,
590 unsigned long size); 587 unsigned long size);
591extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, 588extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
592 unsigned long pfn); 589 pfn_t pfn);
593extern int track_pfn_copy(struct vm_area_struct *vma); 590extern int track_pfn_copy(struct vm_area_struct *vma);
594extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, 591extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
595 unsigned long size); 592 unsigned long size);
@@ -627,10 +624,6 @@ static inline int pmd_trans_huge(pmd_t pmd)
627{ 624{
628 return 0; 625 return 0;
629} 626}
630static inline int pmd_trans_splitting(pmd_t pmd)
631{
632 return 0;
633}
634#ifndef __HAVE_ARCH_PMD_WRITE 627#ifndef __HAVE_ARCH_PMD_WRITE
635static inline int pmd_write(pmd_t pmd) 628static inline int pmd_write(pmd_t pmd)
636{ 629{
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index b58fd667f87b..af0254c09424 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -4,6 +4,7 @@
4/* References to section boundaries */ 4/* References to section boundaries */
5 5
6#include <linux/compiler.h> 6#include <linux/compiler.h>
7#include <linux/types.h>
7 8
8/* 9/*
9 * Usage guidelines: 10 * Usage guidelines:
@@ -63,4 +64,68 @@ static inline int arch_is_kernel_data(unsigned long addr)
63} 64}
64#endif 65#endif
65 66
67/**
68 * memory_contains - checks if an object is contained within a memory region
69 * @begin: virtual address of the beginning of the memory region
70 * @end: virtual address of the end of the memory region
71 * @virt: virtual address of the memory object
72 * @size: size of the memory object
73 *
74 * Returns: true if the object specified by @virt and @size is entirely
75 * contained within the memory region defined by @begin and @end, false
76 * otherwise.
77 */
78static inline bool memory_contains(void *begin, void *end, void *virt,
79 size_t size)
80{
81 return virt >= begin && virt + size <= end;
82}
83
84/**
85 * memory_intersects - checks if the region occupied by an object intersects
86 * with another memory region
 87 * @begin: virtual address of the beginning of the memory region
88 * @end: virtual address of the end of the memory region
89 * @virt: virtual address of the memory object
90 * @size: size of the memory object
91 *
92 * Returns: true if an object's memory region, specified by @virt and @size,
93 * intersects with the region specified by @begin and @end, false otherwise.
94 */
95static inline bool memory_intersects(void *begin, void *end, void *virt,
96 size_t size)
97{
98 void *vend = virt + size;
99
100 return (virt >= begin && virt < end) || (vend >= begin && vend < end);
101}
102
103/**
104 * init_section_contains - checks if an object is contained within the init
105 * section
106 * @virt: virtual address of the memory object
107 * @size: size of the memory object
108 *
109 * Returns: true if the object specified by @virt and @size is entirely
110 * contained within the init section, false otherwise.
111 */
112static inline bool init_section_contains(void *virt, size_t size)
113{
114 return memory_contains(__init_begin, __init_end, virt, size);
115}
116
117/**
118 * init_section_intersects - checks if the region occupied by an object
119 * intersects with the init section
120 * @virt: virtual address of the memory object
121 * @size: size of the memory object
122 *
123 * Returns: true if an object's memory region, specified by @virt and @size,
124 * intersects with the init section, false otherwise.
125 */
126static inline bool init_section_intersects(void *virt, size_t size)
127{
128 return memory_intersects(__init_begin, __init_end, virt, size);
129}
130
66#endif /* _ASM_GENERIC_SECTIONS_H_ */ 131#endif /* _ASM_GENERIC_SECTIONS_H_ */
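The helpers above are plain pointer-range checks against the section bounds exported by the linker script. A standalone sketch of the memory_contains() logic exercised on an ordinary array instead of __init_begin/__init_end (userspace C; char * casts are added because arithmetic on void * is a GNU extension):

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

static bool memory_contains(void *begin, void *end, void *virt, size_t size)
{
	return (char *)virt >= (char *)begin &&
	       (char *)virt + size <= (char *)end;
}

static char region[64];	/* stands in for a kernel section */

int main(void)
{
	printf("%d\n", memory_contains(region, region + 64, region + 8, 16));	/* 1 */
	printf("%d\n", memory_contains(region, region + 64, region + 60, 16));	/* 0 */
	return 0;
}
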
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c70e3588a48c..bfb64d672e19 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -15,6 +15,7 @@
15#include <linux/backing-dev-defs.h> 15#include <linux/backing-dev-defs.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/mempool.h> 17#include <linux/mempool.h>
18#include <linux/pfn.h>
18#include <linux/bio.h> 19#include <linux/bio.h>
19#include <linux/stringify.h> 20#include <linux/stringify.h>
20#include <linux/gfp.h> 21#include <linux/gfp.h>
@@ -1617,6 +1618,20 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
1617 1618
1618#endif /* CONFIG_BLK_DEV_INTEGRITY */ 1619#endif /* CONFIG_BLK_DEV_INTEGRITY */
1619 1620
1621/**
1622 * struct blk_dax_ctl - control and output parameters for ->direct_access
1623 * @sector: (input) offset relative to a block_device
1624 * @addr: (output) kernel virtual address for @sector populated by driver
1625 * @pfn: (output) page frame number for @addr populated by driver
1626 * @size: (input) number of bytes requested
1627 */
1628struct blk_dax_ctl {
1629 sector_t sector;
1630 void __pmem *addr;
1631 long size;
1632 pfn_t pfn;
1633};
1634
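struct blk_dax_ctl bundles what used to be four loose bdev_direct_access() parameters; the driver fills in @addr and @pfn, and the return value says how many bytes are addressable starting at @sector. A sketch of the new calling convention (kernel context assumed, not standalone; 'sector' is a hypothetical input and error handling is reduced to the minimum):

static long example_direct_access(struct block_device *bdev, sector_t sector)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = PAGE_SIZE,
	};
	long avail = bdev_direct_access(bdev, &dax);

	if (avail < 0)
		return avail;
	/* dax.addr is the __pmem mapping, dax.pfn the pfn_t, valid for 'avail' bytes */
	return avail;
}
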
1620struct block_device_operations { 1635struct block_device_operations {
1621 int (*open) (struct block_device *, fmode_t); 1636 int (*open) (struct block_device *, fmode_t);
1622 void (*release) (struct gendisk *, fmode_t); 1637 void (*release) (struct gendisk *, fmode_t);
@@ -1624,7 +1639,7 @@ struct block_device_operations {
1624 int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1639 int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
1625 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1640 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
1626 long (*direct_access)(struct block_device *, sector_t, void __pmem **, 1641 long (*direct_access)(struct block_device *, sector_t, void __pmem **,
1627 unsigned long *pfn); 1642 pfn_t *);
1628 unsigned int (*check_events) (struct gendisk *disk, 1643 unsigned int (*check_events) (struct gendisk *disk,
1629 unsigned int clearing); 1644 unsigned int clearing);
1630 /* ->media_changed() is DEPRECATED, use ->check_events() instead */ 1645 /* ->media_changed() is DEPRECATED, use ->check_events() instead */
@@ -1643,8 +1658,7 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
1643extern int bdev_read_page(struct block_device *, sector_t, struct page *); 1658extern int bdev_read_page(struct block_device *, sector_t, struct page *);
1644extern int bdev_write_page(struct block_device *, sector_t, struct page *, 1659extern int bdev_write_page(struct block_device *, sector_t, struct page *,
1645 struct writeback_control *); 1660 struct writeback_control *);
1646extern long bdev_direct_access(struct block_device *, sector_t, 1661extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
1647 void __pmem **addr, unsigned long *pfn, long size);
1648#else /* CONFIG_BLOCK */ 1662#else /* CONFIG_BLOCK */
1649 1663
1650struct block_device; 1664struct block_device;
diff --git a/include/linux/console.h b/include/linux/console.h
index bd194343c346..ea731af2451e 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -150,6 +150,7 @@ extern int console_trylock(void);
150extern void console_unlock(void); 150extern void console_unlock(void);
151extern void console_conditional_schedule(void); 151extern void console_conditional_schedule(void);
152extern void console_unblank(void); 152extern void console_unblank(void);
153extern void console_flush_on_panic(void);
153extern struct tty_driver *console_device(int *); 154extern struct tty_driver *console_device(int *);
154extern void console_stop(struct console *); 155extern void console_stop(struct console *);
155extern void console_start(struct console *); 156extern void console_start(struct console *);
diff --git a/include/linux/err.h b/include/linux/err.h
index a729120644d5..56762ab41713 100644
--- a/include/linux/err.h
+++ b/include/linux/err.h
@@ -37,7 +37,7 @@ static inline bool __must_check IS_ERR(__force const void *ptr)
37 37
38static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) 38static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
39{ 39{
40 return !ptr || IS_ERR_VALUE((unsigned long)ptr); 40 return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr);
41} 41}
42 42
43/** 43/**
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ecb080d6ff42..cfe81e10bd54 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -19,13 +19,16 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
19 unsigned long addr, 19 unsigned long addr,
20 pmd_t *pmd, 20 pmd_t *pmd,
21 unsigned int flags); 21 unsigned int flags);
22extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
23 struct vm_area_struct *vma,
24 pmd_t *pmd, unsigned long addr, unsigned long next);
22extern int zap_huge_pmd(struct mmu_gather *tlb, 25extern int zap_huge_pmd(struct mmu_gather *tlb,
23 struct vm_area_struct *vma, 26 struct vm_area_struct *vma,
24 pmd_t *pmd, unsigned long addr); 27 pmd_t *pmd, unsigned long addr);
25extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 28extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
26 unsigned long addr, unsigned long end, 29 unsigned long addr, unsigned long end,
27 unsigned char *vec); 30 unsigned char *vec);
28extern int move_huge_pmd(struct vm_area_struct *vma, 31extern bool move_huge_pmd(struct vm_area_struct *vma,
29 struct vm_area_struct *new_vma, 32 struct vm_area_struct *new_vma,
30 unsigned long old_addr, 33 unsigned long old_addr,
31 unsigned long new_addr, unsigned long old_end, 34 unsigned long new_addr, unsigned long old_end,
@@ -34,8 +37,7 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
34 unsigned long addr, pgprot_t newprot, 37 unsigned long addr, pgprot_t newprot,
35 int prot_numa); 38 int prot_numa);
36int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, 39int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
37 unsigned long pfn, bool write); 40 pfn_t pfn, bool write);
38
39enum transparent_hugepage_flag { 41enum transparent_hugepage_flag {
40 TRANSPARENT_HUGEPAGE_FLAG, 42 TRANSPARENT_HUGEPAGE_FLAG,
41 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 43 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
@@ -48,21 +50,13 @@ enum transparent_hugepage_flag {
48#endif 50#endif
49}; 51};
50 52
51enum page_check_address_pmd_flag {
52 PAGE_CHECK_ADDRESS_PMD_FLAG,
53 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG,
54 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG,
55};
56extern pmd_t *page_check_address_pmd(struct page *page,
57 struct mm_struct *mm,
58 unsigned long address,
59 enum page_check_address_pmd_flag flag,
60 spinlock_t **ptl);
61
62#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) 53#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
63#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) 54#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
64 55
65#ifdef CONFIG_TRANSPARENT_HUGEPAGE 56#ifdef CONFIG_TRANSPARENT_HUGEPAGE
57struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
58 pmd_t *pmd, int flags);
59
66#define HPAGE_PMD_SHIFT PMD_SHIFT 60#define HPAGE_PMD_SHIFT PMD_SHIFT
67#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) 61#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
68#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) 62#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))
@@ -95,30 +89,28 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
95#endif /* CONFIG_DEBUG_VM */ 89#endif /* CONFIG_DEBUG_VM */
96 90
97extern unsigned long transparent_hugepage_flags; 91extern unsigned long transparent_hugepage_flags;
98extern int split_huge_page_to_list(struct page *page, struct list_head *list); 92
93extern void prep_transhuge_page(struct page *page);
94extern void free_transhuge_page(struct page *page);
95
96int split_huge_page_to_list(struct page *page, struct list_head *list);
99static inline int split_huge_page(struct page *page) 97static inline int split_huge_page(struct page *page)
100{ 98{
101 return split_huge_page_to_list(page, NULL); 99 return split_huge_page_to_list(page, NULL);
102} 100}
103extern void __split_huge_page_pmd(struct vm_area_struct *vma, 101void deferred_split_huge_page(struct page *page);
104 unsigned long address, pmd_t *pmd); 102
105#define split_huge_page_pmd(__vma, __address, __pmd) \ 103void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
104 unsigned long address);
105
106#define split_huge_pmd(__vma, __pmd, __address) \
106 do { \ 107 do { \
107 pmd_t *____pmd = (__pmd); \ 108 pmd_t *____pmd = (__pmd); \
108 if (unlikely(pmd_trans_huge(*____pmd))) \ 109 if (pmd_trans_huge(*____pmd) \
109 __split_huge_page_pmd(__vma, __address, \ 110 || pmd_devmap(*____pmd)) \
110 ____pmd); \ 111 __split_huge_pmd(__vma, __pmd, __address); \
111 } while (0) 112 } while (0)
112#define wait_split_huge_page(__anon_vma, __pmd) \ 113
113 do { \
114 pmd_t *____pmd = (__pmd); \
115 anon_vma_lock_write(__anon_vma); \
116 anon_vma_unlock_write(__anon_vma); \
117 BUG_ON(pmd_trans_splitting(*____pmd) || \
118 pmd_trans_huge(*____pmd)); \
119 } while (0)
120extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
121 pmd_t *pmd);
122#if HPAGE_PMD_ORDER >= MAX_ORDER 114#if HPAGE_PMD_ORDER >= MAX_ORDER
123#error "hugepages can't be allocated by the buddy allocator" 115#error "hugepages can't be allocated by the buddy allocator"
124#endif 116#endif
@@ -128,17 +120,17 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
128 unsigned long start, 120 unsigned long start,
129 unsigned long end, 121 unsigned long end,
130 long adjust_next); 122 long adjust_next);
131extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, 123extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
132 spinlock_t **ptl); 124 spinlock_t **ptl);
133/* mmap_sem must be held on entry */ 125/* mmap_sem must be held on entry */
134static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, 126static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
135 spinlock_t **ptl) 127 spinlock_t **ptl)
136{ 128{
137 VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); 129 VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
138 if (pmd_trans_huge(*pmd)) 130 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
139 return __pmd_trans_huge_lock(pmd, vma, ptl); 131 return __pmd_trans_huge_lock(pmd, vma, ptl);
140 else 132 else
141 return 0; 133 return false;
142} 134}
143static inline int hpage_nr_pages(struct page *page) 135static inline int hpage_nr_pages(struct page *page)
144{ 136{
@@ -183,11 +175,8 @@ static inline int split_huge_page(struct page *page)
183{ 175{
184 return 0; 176 return 0;
185} 177}
186#define split_huge_page_pmd(__vma, __address, __pmd) \ 178static inline void deferred_split_huge_page(struct page *page) {}
187 do { } while (0) 179#define split_huge_pmd(__vma, __pmd, __address) \
188#define wait_split_huge_page(__anon_vma, __pmd) \
189 do { } while (0)
190#define split_huge_page_pmd_mm(__mm, __address, __pmd) \
191 do { } while (0) 180 do { } while (0)
192static inline int hugepage_madvise(struct vm_area_struct *vma, 181static inline int hugepage_madvise(struct vm_area_struct *vma,
193 unsigned long *vm_flags, int advice) 182 unsigned long *vm_flags, int advice)
@@ -201,10 +190,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
201 long adjust_next) 190 long adjust_next)
202{ 191{
203} 192}
204static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, 193static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
205 spinlock_t **ptl) 194 spinlock_t **ptl)
206{ 195{
207 return 0; 196 return false;
208} 197}
209 198
210static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 199static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -218,6 +207,12 @@ static inline bool is_huge_zero_page(struct page *page)
218 return false; 207 return false;
219} 208}
220 209
210
211static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
212 unsigned long addr, pmd_t *pmd, int flags)
213{
214 return NULL;
215}
221#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 216#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
222 217
223#endif /* _LINUX_HUGE_MM_H */ 218#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e76574d8f9b5..7d953c2542a8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -8,6 +8,7 @@
8#include <linux/cgroup.h> 8#include <linux/cgroup.h>
9#include <linux/list.h> 9#include <linux/list.h>
10#include <linux/kref.h> 10#include <linux/kref.h>
11#include <asm/pgtable.h>
11 12
12struct ctl_table; 13struct ctl_table;
13struct user_struct; 14struct user_struct;
diff --git a/include/linux/io.h b/include/linux/io.h
index de64c1e53612..fffd88d7f426 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -89,21 +89,6 @@ void devm_memunmap(struct device *dev, void *addr);
89 89
90void *__devm_memremap_pages(struct device *dev, struct resource *res); 90void *__devm_memremap_pages(struct device *dev, struct resource *res);
91 91
92#ifdef CONFIG_ZONE_DEVICE
93void *devm_memremap_pages(struct device *dev, struct resource *res);
94#else
95static inline void *devm_memremap_pages(struct device *dev, struct resource *res)
96{
97 /*
98 * Fail attempts to call devm_memremap_pages() without
99 * ZONE_DEVICE support enabled, this requires callers to fall
100 * back to plain devm_memremap() based on config
101 */
102 WARN_ON_ONCE(1);
103 return ERR_PTR(-ENXIO);
104}
105#endif
106
107/* 92/*
108 * Some systems do not have legacy ISA devices. 93 * Some systems do not have legacy ISA devices.
109 * /dev/port is not a valid interface on these systems. 94 * /dev/port is not a valid interface on these systems.
diff --git a/include/linux/kdev_t.h b/include/linux/kdev_t.h
index 052c7b32cc91..8e9e288b08c1 100644
--- a/include/linux/kdev_t.h
+++ b/include/linux/kdev_t.h
@@ -35,11 +35,6 @@ static inline dev_t old_decode_dev(u16 val)
35 return MKDEV((val >> 8) & 255, val & 255); 35 return MKDEV((val >> 8) & 255, val & 255);
36} 36}
37 37
38static inline bool new_valid_dev(dev_t dev)
39{
40 return 1;
41}
42
43static inline u32 new_encode_dev(dev_t dev) 38static inline u32 new_encode_dev(dev_t dev)
44{ 39{
45 unsigned major = MAJOR(dev); 40 unsigned major = MAJOR(dev);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 7311c3294e25..f31638c6e873 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -202,26 +202,26 @@ extern int _cond_resched(void);
202 202
203/** 203/**
204 * abs - return absolute value of an argument 204 * abs - return absolute value of an argument
205 * @x: the value. If it is unsigned type, it is converted to signed type first 205 * @x: the value. If it is unsigned type, it is converted to signed type first.
206 * (s64, long or int depending on its size). 206 * char is treated as if it was signed (regardless of whether it really is)
207 * but the macro's return type is preserved as char.
207 * 208 *
208 * Return: an absolute value of x. If x is 64-bit, macro's return type is s64, 209 * Return: an absolute value of x.
209 * otherwise it is signed long.
210 */ 210 */
211#define abs(x) __builtin_choose_expr(sizeof(x) == sizeof(s64), ({ \ 211#define abs(x) __abs_choose_expr(x, long long, \
212 s64 __x = (x); \ 212 __abs_choose_expr(x, long, \
213 (__x < 0) ? -__x : __x; \ 213 __abs_choose_expr(x, int, \
214 }), ({ \ 214 __abs_choose_expr(x, short, \
215 long ret; \ 215 __abs_choose_expr(x, char, \
216 if (sizeof(x) == sizeof(long)) { \ 216 __builtin_choose_expr( \
217 long __x = (x); \ 217 __builtin_types_compatible_p(typeof(x), char), \
218 ret = (__x < 0) ? -__x : __x; \ 218 (char)({ signed char __x = (x); __x<0?-__x:__x; }), \
219 } else { \ 219 ((void)0)))))))
220 int __x = (x); \ 220
221 ret = (__x < 0) ? -__x : __x; \ 221#define __abs_choose_expr(x, type, other) __builtin_choose_expr( \
222 } \ 222 __builtin_types_compatible_p(typeof(x), signed type) || \
223 ret; \ 223 __builtin_types_compatible_p(typeof(x), unsigned type), \
224 })) 224 ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)
225 225
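The reworked abs() keeps the argument's own width instead of forcing everything through s64 or long; char stays char thanks to the explicit cast in the innermost branch. A standalone demo of the type-tracking behaviour (GNU C, gcc or clang; the macros are copied from the hunk above and renamed kabs()/__kabs_choose_expr so they do not clash with libc):

#include <stdio.h>

#define __kabs_choose_expr(x, type, other) __builtin_choose_expr(	\
	__builtin_types_compatible_p(typeof(x),   signed type) ||	\
	__builtin_types_compatible_p(typeof(x), unsigned type),	\
	({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)

#define kabs(x)	__kabs_choose_expr(x, long long,			\
		__kabs_choose_expr(x, long,				\
		__kabs_choose_expr(x, int,				\
		__kabs_choose_expr(x, short,				\
		__kabs_choose_expr(x, char,				\
		__builtin_choose_expr(					\
			__builtin_types_compatible_p(typeof(x), char),	\
			(char)({ signed char __x = (x); __x < 0 ? -__x : __x; }), \
			((void)0)))))))

int main(void)
{
	int i = -7;
	long long ll = -7;
	__auto_type ri = kabs(i);	/* result type follows the argument: int */
	__auto_type rll = kabs(ll);	/* long long */

	printf("sizeof int result:       %zu\n", sizeof(ri));
	printf("sizeof long long result: %zu\n", sizeof(rll));
	printf("kabs(-7) = %d\n", ri);
	return 0;
}
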
226/** 226/**
227 * reciprocal_scale - "scale" a value into range [0, ep_ro) 227 * reciprocal_scale - "scale" a value into range [0, ep_ro)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f707f74055c3..861f690aa791 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -66,7 +66,7 @@
 66 * error pfns indicate that the gfn is in slot but failed to 66
67 * translate it to pfn on host. 67 * translate it to pfn on host.
68 */ 68 */
69static inline bool is_error_pfn(pfn_t pfn) 69static inline bool is_error_pfn(kvm_pfn_t pfn)
70{ 70{
71 return !!(pfn & KVM_PFN_ERR_MASK); 71 return !!(pfn & KVM_PFN_ERR_MASK);
72} 72}
@@ -76,13 +76,13 @@ static inline bool is_error_pfn(pfn_t pfn)
76 * translated to pfn - it is not in slot or failed to 76 * translated to pfn - it is not in slot or failed to
77 * translate it to pfn. 77 * translate it to pfn.
78 */ 78 */
79static inline bool is_error_noslot_pfn(pfn_t pfn) 79static inline bool is_error_noslot_pfn(kvm_pfn_t pfn)
80{ 80{
81 return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK); 81 return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK);
82} 82}
83 83
84/* noslot pfn indicates that the gfn is not in slot. */ 84/* noslot pfn indicates that the gfn is not in slot. */
85static inline bool is_noslot_pfn(pfn_t pfn) 85static inline bool is_noslot_pfn(kvm_pfn_t pfn)
86{ 86{
87 return pfn == KVM_PFN_NOSLOT; 87 return pfn == KVM_PFN_NOSLOT;
88} 88}
@@ -591,19 +591,20 @@ void kvm_release_page_clean(struct page *page);
591void kvm_release_page_dirty(struct page *page); 591void kvm_release_page_dirty(struct page *page);
592void kvm_set_page_accessed(struct page *page); 592void kvm_set_page_accessed(struct page *page);
593 593
594pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); 594kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
595pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); 595kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
596pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 596kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
597 bool *writable); 597 bool *writable);
598pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); 598kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
599pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); 599kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
600pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, 600kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
601 bool *async, bool write_fault, bool *writable); 601 bool atomic, bool *async, bool write_fault,
602 bool *writable);
602 603
603void kvm_release_pfn_clean(pfn_t pfn); 604void kvm_release_pfn_clean(kvm_pfn_t pfn);
604void kvm_set_pfn_dirty(pfn_t pfn); 605void kvm_set_pfn_dirty(kvm_pfn_t pfn);
605void kvm_set_pfn_accessed(pfn_t pfn); 606void kvm_set_pfn_accessed(kvm_pfn_t pfn);
606void kvm_get_pfn(pfn_t pfn); 607void kvm_get_pfn(kvm_pfn_t pfn);
607 608
608int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 609int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
609 int len); 610 int len);
@@ -629,8 +630,8 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
629 630
630struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); 631struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
631struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); 632struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn);
632pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); 633kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
633pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); 634kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
634struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); 635struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
635unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); 636unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
636unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); 637unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
@@ -811,7 +812,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
811int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); 812int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
812void kvm_vcpu_kick(struct kvm_vcpu *vcpu); 813void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
813 814
814bool kvm_is_reserved_pfn(pfn_t pfn); 815bool kvm_is_reserved_pfn(kvm_pfn_t pfn);
815 816
816struct kvm_irq_ack_notifier { 817struct kvm_irq_ack_notifier {
817 struct hlist_node link; 818 struct hlist_node link;
@@ -965,7 +966,7 @@ static inline gfn_t gpa_to_gfn(gpa_t gpa)
965 return (gfn_t)(gpa >> PAGE_SHIFT); 966 return (gfn_t)(gpa >> PAGE_SHIFT);
966} 967}
967 968
968static inline hpa_t pfn_to_hpa(pfn_t pfn) 969static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn)
969{ 970{
970 return (hpa_t)pfn << PAGE_SHIFT; 971 return (hpa_t)pfn << PAGE_SHIFT;
971} 972}
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 1b47a185c2f0..8bf259dae9f6 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -53,7 +53,7 @@ typedef unsigned long hva_t;
53typedef u64 hpa_t; 53typedef u64 hpa_t;
54typedef u64 hfn_t; 54typedef u64 hfn_t;
55 55
56typedef hfn_t pfn_t; 56typedef hfn_t kvm_pfn_t;
57 57
58struct gfn_to_hva_cache { 58struct gfn_to_hva_cache {
59 u64 generation; 59 u64 generation;
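
The only change in this header is the rename itself: KVM's frame-number type becomes kvm_pfn_t so the bare name pfn_t is free for the new flag-carrying type added in <linux/pfn.h> further down. A minimal sketch of why the two can now coexist in one translation unit; the conversion helper is hypothetical and assumes the KVM frame is ordinary, memmap-backed memory:

#include <linux/kvm_types.h>    /* kvm_pfn_t: still a plain integer frame number */
#include <linux/pfn_t.h>        /* pfn_t: struct wrapper with flag bits */

/* hypothetical helper, for illustration only */
static pfn_t kvm_frame_to_pfn_t(kvm_pfn_t kpfn)
{
        /* kvm_pfn_t is an integer, so it can feed the generic constructor */
        return pfn_to_pfn_t((unsigned long)kpfn);
}
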
diff --git a/include/linux/list.h b/include/linux/list.h
index 5356f4d661a7..30cf4200ab40 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -113,6 +113,17 @@ extern void __list_del_entry(struct list_head *entry);
113extern void list_del(struct list_head *entry); 113extern void list_del(struct list_head *entry);
114#endif 114#endif
115 115
116#ifdef CONFIG_DEBUG_LIST
117/*
118 * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one
119 * of the pages it allocates is ever passed to list_add()
120 */
121extern void list_force_poison(struct list_head *entry);
122#else
123/* fallback to the less strict LIST_POISON* definitions */
124#define list_force_poison list_del
125#endif
126
116/** 127/**
117 * list_replace - replace old entry by new one 128 * list_replace - replace old entry by new one
118 * @old : the element to be replaced 129 * @old : the element to be replaced
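
A hedged sketch of the caller pattern the comment above describes: code that hands pages over for device use wants any later list_add() on those pages to trip the DEBUG_LIST assertion instead of silently corrupting a list. The function name is hypothetical:

#include <linux/list.h>
#include <linux/mm_types.h>

/* hypothetical: called for each page handed over to a device mapping */
static void quarantine_page_lru(struct page *page)
{
        /*
         * With CONFIG_DEBUG_LIST=y the entry stays poisoned and a later
         * list_add(&page->lru, ...) asserts; otherwise this is list_del().
         */
        list_force_poison(&page->lru);
}
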
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 173fb44e22f1..3106ac1c895e 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -61,6 +61,14 @@ extern int memblock_debug;
61extern bool movable_node_enabled; 61extern bool movable_node_enabled;
62#endif /* CONFIG_MOVABLE_NODE */ 62#endif /* CONFIG_MOVABLE_NODE */
63 63
64#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
65#define __init_memblock __meminit
66#define __initdata_memblock __meminitdata
67#else
68#define __init_memblock
69#define __initdata_memblock
70#endif
71
64#define memblock_dbg(fmt, ...) \ 72#define memblock_dbg(fmt, ...) \
65 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) 73 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
66 74
@@ -166,7 +174,7 @@ static inline bool memblock_is_hotpluggable(struct memblock_region *m)
166 return m->flags & MEMBLOCK_HOTPLUG; 174 return m->flags & MEMBLOCK_HOTPLUG;
167} 175}
168 176
169static inline bool movable_node_is_enabled(void) 177static inline bool __init_memblock movable_node_is_enabled(void)
170{ 178{
171 return movable_node_enabled; 179 return movable_node_enabled;
172} 180}
@@ -405,14 +413,6 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
405 for (idx = 0; idx < memblock_type->cnt; \ 413 for (idx = 0; idx < memblock_type->cnt; \
406 idx++,rgn = &memblock_type->regions[idx]) 414 idx++,rgn = &memblock_type->regions[idx])
407 415
408#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
409#define __init_memblock __meminit
410#define __initdata_memblock __meminitdata
411#else
412#define __init_memblock
413#define __initdata_memblock
414#endif
415
416#ifdef CONFIG_MEMTEST 416#ifdef CONFIG_MEMTEST
417extern void early_memtest(phys_addr_t start, phys_addr_t end); 417extern void early_memtest(phys_addr_t start, phys_addr_t end);
418#else 418#else
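
The #define block above is only moved, not changed: the __init_memblock annotation must be visible before the first inline function that carries it (movable_node_is_enabled()), otherwise the token is an unknown identifier at that point. A minimal illustration of the ordering rule; the section attribute is only an assumption of roughly what __meminit expands to:

/* must appear before any declaration that uses it */
#define __init_memblock_example __attribute__((__section__(".meminit.text")))

static inline bool __init_memblock_example movable_node_is_enabled_example(void)
{
        return true;
}
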
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2292468f2a30..189f04d4d2ec 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -280,10 +280,12 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg,
280bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); 280bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
281 281
282int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 282int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
283 gfp_t gfp_mask, struct mem_cgroup **memcgp); 283 gfp_t gfp_mask, struct mem_cgroup **memcgp,
284 bool compound);
284void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 285void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
285 bool lrucare); 286 bool lrucare, bool compound);
286void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); 287void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
288 bool compound);
287void mem_cgroup_uncharge(struct page *page); 289void mem_cgroup_uncharge(struct page *page);
288void mem_cgroup_uncharge_list(struct list_head *page_list); 290void mem_cgroup_uncharge_list(struct list_head *page_list);
289 291
@@ -515,7 +517,8 @@ static inline bool mem_cgroup_low(struct mem_cgroup *root,
515 517
516static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 518static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
517 gfp_t gfp_mask, 519 gfp_t gfp_mask,
518 struct mem_cgroup **memcgp) 520 struct mem_cgroup **memcgp,
521 bool compound)
519{ 522{
520 *memcgp = NULL; 523 *memcgp = NULL;
521 return 0; 524 return 0;
@@ -523,12 +526,13 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
523 526
524static inline void mem_cgroup_commit_charge(struct page *page, 527static inline void mem_cgroup_commit_charge(struct page *page,
525 struct mem_cgroup *memcg, 528 struct mem_cgroup *memcg,
526 bool lrucare) 529 bool lrucare, bool compound)
527{ 530{
528} 531}
529 532
530static inline void mem_cgroup_cancel_charge(struct page *page, 533static inline void mem_cgroup_cancel_charge(struct page *page,
531 struct mem_cgroup *memcg) 534 struct mem_cgroup *memcg,
535 bool compound)
532{ 536{
533} 537}
534 538
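
All three charge-path entry points grow a compound flag, so a caller charging a THP can say so once and have the whole huge page accounted. A hedged sketch of the resulting call pattern; the page-table step is a hypothetical stand-in for whatever the real caller does between try and commit:

#include <linux/memcontrol.h>
#include <linux/errno.h>

static int map_into_pagetable(struct page *page);      /* hypothetical step */

static int charge_and_map(struct page *page, struct mm_struct *mm,
                          gfp_t gfp, bool is_thp)
{
        struct mem_cgroup *memcg;

        if (mem_cgroup_try_charge(page, mm, gfp, &memcg, is_thp))
                return -ENOMEM;

        if (map_into_pagetable(page)) {
                mem_cgroup_cancel_charge(page, memcg, is_thp);
                return -EFAULT;
        }

        /* not an LRU-care path, hence lrucare == false */
        mem_cgroup_commit_charge(page, memcg, false, is_thp);
        return 0;
}
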
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 2ea574ff9714..43405992d027 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -275,7 +275,8 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
275extern bool is_memblock_offlined(struct memory_block *mem); 275extern bool is_memblock_offlined(struct memory_block *mem);
276extern void remove_memory(int nid, u64 start, u64 size); 276extern void remove_memory(int nid, u64 start, u64 size);
277extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); 277extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn);
278extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms); 278extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
279 unsigned long map_offset);
279extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, 280extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
280 unsigned long pnum); 281 unsigned long pnum);
281 282
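
The new map_offset argument appears to carry the number of memmap pages a vmem_altmap reserved at the start of the section, so teardown can skip freeing them. The sketch below assumes that reading and uses vmem_altmap_offset() from the new memremap.h further down:

#include <linux/memory_hotplug.h>
#include <linux/memremap.h>

static void remove_one_section_example(struct zone *zone, struct mem_section *ms,
                                       struct vmem_altmap *altmap)
{
        unsigned long map_offset = altmap ? vmem_altmap_offset(altmap) : 0;

        sparse_remove_one_section(zone, ms, map_offset);
}
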
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
new file mode 100644
index 000000000000..bcaa634139a9
--- /dev/null
+++ b/include/linux/memremap.h
@@ -0,0 +1,114 @@
1#ifndef _LINUX_MEMREMAP_H_
2#define _LINUX_MEMREMAP_H_
3#include <linux/mm.h>
4#include <linux/ioport.h>
5#include <linux/percpu-refcount.h>
6
7struct resource;
8struct device;
9
10/**
11 * struct vmem_altmap - pre-allocated storage for vmemmap_populate
12 * @base_pfn: base of the entire dev_pagemap mapping
13 * @reserve: pages mapped, but reserved for driver use (relative to @base)
14 * @free: free pages set aside in the mapping for memmap storage
15 * @align: pages reserved to meet allocation alignments
16 * @alloc: track pages consumed, private to vmemmap_populate()
17 */
18struct vmem_altmap {
19 const unsigned long base_pfn;
20 const unsigned long reserve;
21 unsigned long free;
22 unsigned long align;
23 unsigned long alloc;
24};
25
26unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
27void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
28
29#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE)
30struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start);
31#else
32static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
33{
34 return NULL;
35}
36#endif
37
38/**
39 * struct dev_pagemap - metadata for ZONE_DEVICE mappings
40 * @altmap: pre-allocated/reserved memory for vmemmap allocations
41 * @res: physical address range covered by @ref
42 * @ref: reference count that pins the devm_memremap_pages() mapping
43 * @dev: host device of the mapping for debug
44 */
45struct dev_pagemap {
46 struct vmem_altmap *altmap;
47 const struct resource *res;
48 struct percpu_ref *ref;
49 struct device *dev;
50};
51
52#ifdef CONFIG_ZONE_DEVICE
53void *devm_memremap_pages(struct device *dev, struct resource *res,
54 struct percpu_ref *ref, struct vmem_altmap *altmap);
55struct dev_pagemap *find_dev_pagemap(resource_size_t phys);
56#else
57static inline void *devm_memremap_pages(struct device *dev,
58 struct resource *res, struct percpu_ref *ref,
59 struct vmem_altmap *altmap)
60{
61 /*
62 * Fail attempts to call devm_memremap_pages() without
63 * ZONE_DEVICE support enabled, this requires callers to fall
64 * back to plain devm_memremap() based on config
65 */
66 WARN_ON_ONCE(1);
67 return ERR_PTR(-ENXIO);
68}
69
70static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
71{
72 return NULL;
73}
74#endif
75
76/**
77 * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
78 * @pfn: page frame number to lookup page_map
79 * @pgmap: optional known pgmap that already has a reference
80 *
81 * @pgmap allows the overhead of a lookup to be bypassed when @pfn lands in the
82 * same mapping.
83 */
84static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
85 struct dev_pagemap *pgmap)
86{
87 const struct resource *res = pgmap ? pgmap->res : NULL;
88 resource_size_t phys = PFN_PHYS(pfn);
89
90 /*
91 * In the cached case we're already holding a live reference so
92 * we can simply do a blind increment
93 */
94 if (res && phys >= res->start && phys <= res->end) {
95 percpu_ref_get(pgmap->ref);
96 return pgmap;
97 }
98
99 /* fall back to slow path lookup */
100 rcu_read_lock();
101 pgmap = find_dev_pagemap(phys);
102 if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
103 pgmap = NULL;
104 rcu_read_unlock();
105
106 return pgmap;
107}
108
109static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
110{
111 if (pgmap)
112 percpu_ref_put(pgmap->ref);
113}
114#endif /* _LINUX_MEMREMAP_H_ */
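
A hedged usage sketch for the new header: a driver maps its physical range with devm_memremap_pages() (the stub above shows failure comes back as an ERR_PTR), and hot paths can ask whether an arbitrary pfn belongs to some device mapping via get_dev_pagemap()/put_dev_pagemap(). The driver structure and function names are hypothetical:

#include <linux/memremap.h>
#include <linux/err.h>

struct my_pmem {                        /* hypothetical driver state */
        struct percpu_ref ref;
        struct resource *res;
        void *virt;
};

static int my_pmem_enable(struct device *dev, struct my_pmem *pmem)
{
        pmem->virt = devm_memremap_pages(dev, pmem->res, &pmem->ref, NULL);
        if (IS_ERR(pmem->virt))
                return PTR_ERR(pmem->virt);
        return 0;
}

static bool pfn_is_device_memory(unsigned long pfn)
{
        struct dev_pagemap *pgmap = get_dev_pagemap(pfn, NULL);

        if (!pgmap)
                return false;
        put_dev_pagemap(pgmap);         /* drop the reference taken above */
        return true;
}
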
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 839d9e9a1c38..f1cd22f2df1a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -16,6 +16,7 @@
16#include <linux/mm_types.h> 16#include <linux/mm_types.h>
17#include <linux/range.h> 17#include <linux/range.h>
18#include <linux/pfn.h> 18#include <linux/pfn.h>
19#include <linux/percpu-refcount.h>
19#include <linux/bit_spinlock.h> 20#include <linux/bit_spinlock.h>
20#include <linux/shrinker.h> 21#include <linux/shrinker.h>
21#include <linux/resource.h> 22#include <linux/resource.h>
@@ -329,6 +330,13 @@ struct inode;
329#define page_private(page) ((page)->private) 330#define page_private(page) ((page)->private)
330#define set_page_private(page, v) ((page)->private = (v)) 331#define set_page_private(page, v) ((page)->private = (v))
331 332
333#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
334static inline int pmd_devmap(pmd_t pmd)
335{
336 return 0;
337}
338#endif
339
332/* 340/*
333 * FIXME: take this include out, include page-flags.h in 341 * FIXME: take this include out, include page-flags.h in
334 * files which need it (119 of them) 342 * files which need it (119 of them)
@@ -410,39 +418,17 @@ static inline int is_vmalloc_or_module_addr(const void *x)
410 418
411extern void kvfree(const void *addr); 419extern void kvfree(const void *addr);
412 420
413static inline void compound_lock(struct page *page) 421static inline atomic_t *compound_mapcount_ptr(struct page *page)
414{ 422{
415#ifdef CONFIG_TRANSPARENT_HUGEPAGE 423 return &page[1].compound_mapcount;
416 VM_BUG_ON_PAGE(PageSlab(page), page);
417 bit_spin_lock(PG_compound_lock, &page->flags);
418#endif
419} 424}
420 425
421static inline void compound_unlock(struct page *page) 426static inline int compound_mapcount(struct page *page)
422{ 427{
423#ifdef CONFIG_TRANSPARENT_HUGEPAGE 428 if (!PageCompound(page))
424 VM_BUG_ON_PAGE(PageSlab(page), page); 429 return 0;
425 bit_spin_unlock(PG_compound_lock, &page->flags); 430 page = compound_head(page);
426#endif 431 return atomic_read(compound_mapcount_ptr(page)) + 1;
427}
428
429static inline unsigned long compound_lock_irqsave(struct page *page)
430{
431 unsigned long uninitialized_var(flags);
432#ifdef CONFIG_TRANSPARENT_HUGEPAGE
433 local_irq_save(flags);
434 compound_lock(page);
435#endif
436 return flags;
437}
438
439static inline void compound_unlock_irqrestore(struct page *page,
440 unsigned long flags)
441{
442#ifdef CONFIG_TRANSPARENT_HUGEPAGE
443 compound_unlock(page);
444 local_irq_restore(flags);
445#endif
446} 432}
447 433
448/* 434/*
@@ -455,61 +441,29 @@ static inline void page_mapcount_reset(struct page *page)
455 atomic_set(&(page)->_mapcount, -1); 441 atomic_set(&(page)->_mapcount, -1);
456} 442}
457 443
444int __page_mapcount(struct page *page);
445
458static inline int page_mapcount(struct page *page) 446static inline int page_mapcount(struct page *page)
459{ 447{
460 VM_BUG_ON_PAGE(PageSlab(page), page); 448 VM_BUG_ON_PAGE(PageSlab(page), page);
461 return atomic_read(&page->_mapcount) + 1;
462}
463 449
464static inline int page_count(struct page *page) 450 if (unlikely(PageCompound(page)))
465{ 451 return __page_mapcount(page);
466 return atomic_read(&compound_head(page)->_count); 452 return atomic_read(&page->_mapcount) + 1;
467}
468
469static inline bool __compound_tail_refcounted(struct page *page)
470{
471 return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
472}
473
474/*
475 * This takes a head page as parameter and tells if the
476 * tail page reference counting can be skipped.
477 *
478 * For this to be safe, PageSlab and PageHeadHuge must remain true on
479 * any given page where they return true here, until all tail pins
480 * have been released.
481 */
482static inline bool compound_tail_refcounted(struct page *page)
483{
484 VM_BUG_ON_PAGE(!PageHead(page), page);
485 return __compound_tail_refcounted(page);
486} 453}
487 454
488static inline void get_huge_page_tail(struct page *page) 455#ifdef CONFIG_TRANSPARENT_HUGEPAGE
456int total_mapcount(struct page *page);
457#else
458static inline int total_mapcount(struct page *page)
489{ 459{
490 /* 460 return page_mapcount(page);
491 * __split_huge_page_refcount() cannot run from under us.
492 */
493 VM_BUG_ON_PAGE(!PageTail(page), page);
494 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
495 VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
496 if (compound_tail_refcounted(compound_head(page)))
497 atomic_inc(&page->_mapcount);
498} 461}
462#endif
499 463
500extern bool __get_page_tail(struct page *page); 464static inline int page_count(struct page *page)
501
502static inline void get_page(struct page *page)
503{ 465{
504 if (unlikely(PageTail(page))) 466 return atomic_read(&compound_head(page)->_count);
505 if (likely(__get_page_tail(page)))
506 return;
507 /*
508 * Getting a normal page or the head of a compound page
509 * requires to already have an elevated page->_count.
510 */
511 VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
512 atomic_inc(&page->_count);
513} 467}
514 468
515static inline struct page *virt_to_head_page(const void *x) 469static inline struct page *virt_to_head_page(const void *x)
@@ -528,7 +482,8 @@ static inline void init_page_count(struct page *page)
528 atomic_set(&page->_count, 1); 482 atomic_set(&page->_count, 1);
529} 483}
530 484
531void put_page(struct page *page); 485void __put_page(struct page *page);
486
532void put_pages_list(struct list_head *pages); 487void put_pages_list(struct list_head *pages);
533 488
534void split_page(struct page *page, unsigned int order); 489void split_page(struct page *page, unsigned int order);
@@ -548,6 +503,9 @@ enum compound_dtor_id {
548#ifdef CONFIG_HUGETLB_PAGE 503#ifdef CONFIG_HUGETLB_PAGE
549 HUGETLB_PAGE_DTOR, 504 HUGETLB_PAGE_DTOR,
550#endif 505#endif
506#ifdef CONFIG_TRANSPARENT_HUGEPAGE
507 TRANSHUGE_PAGE_DTOR,
508#endif
551 NR_COMPOUND_DTORS, 509 NR_COMPOUND_DTORS,
552}; 510};
553extern compound_page_dtor * const compound_page_dtors[]; 511extern compound_page_dtor * const compound_page_dtors[];
@@ -577,6 +535,8 @@ static inline void set_compound_order(struct page *page, unsigned int order)
577 page[1].compound_order = order; 535 page[1].compound_order = order;
578} 536}
579 537
538void free_compound_page(struct page *page);
539
580#ifdef CONFIG_MMU 540#ifdef CONFIG_MMU
581/* 541/*
582 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when 542 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
@@ -704,6 +664,51 @@ static inline enum zone_type page_zonenum(const struct page *page)
704 return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; 664 return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
705} 665}
706 666
667#ifdef CONFIG_ZONE_DEVICE
668void get_zone_device_page(struct page *page);
669void put_zone_device_page(struct page *page);
670static inline bool is_zone_device_page(const struct page *page)
671{
672 return page_zonenum(page) == ZONE_DEVICE;
673}
674#else
675static inline void get_zone_device_page(struct page *page)
676{
677}
678static inline void put_zone_device_page(struct page *page)
679{
680}
681static inline bool is_zone_device_page(const struct page *page)
682{
683 return false;
684}
685#endif
686
687static inline void get_page(struct page *page)
688{
689 page = compound_head(page);
690 /*
691 * Getting a normal page or the head of a compound page
692 * requires to already have an elevated page->_count.
693 */
694 VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
695 atomic_inc(&page->_count);
696
697 if (unlikely(is_zone_device_page(page)))
698 get_zone_device_page(page);
699}
700
701static inline void put_page(struct page *page)
702{
703 page = compound_head(page);
704
705 if (put_page_testzero(page))
706 __put_page(page);
707
708 if (unlikely(is_zone_device_page(page)))
709 put_zone_device_page(page);
710}
711
707#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 712#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
708#define SECTION_IN_PAGE_FLAGS 713#define SECTION_IN_PAGE_FLAGS
709#endif 714#endif
@@ -993,10 +998,21 @@ static inline pgoff_t page_file_index(struct page *page)
993 998
994/* 999/*
995 * Return true if this page is mapped into pagetables. 1000 * Return true if this page is mapped into pagetables.
1001 * For compound page it returns true if any subpage of compound page is mapped.
996 */ 1002 */
997static inline int page_mapped(struct page *page) 1003static inline bool page_mapped(struct page *page)
998{ 1004{
999 return atomic_read(&(page)->_mapcount) >= 0; 1005 int i;
1006 if (likely(!PageCompound(page)))
1007 return atomic_read(&page->_mapcount) >= 0;
1008 page = compound_head(page);
1009 if (atomic_read(compound_mapcount_ptr(page)) >= 0)
1010 return true;
1011 for (i = 0; i < hpage_nr_pages(page); i++) {
1012 if (atomic_read(&page[i]._mapcount) >= 0)
1013 return true;
1014 }
1015 return false;
1000} 1016}
1001 1017
1002/* 1018/*
@@ -1084,7 +1100,7 @@ static inline bool shmem_mapping(struct address_space *mapping)
1084} 1100}
1085#endif 1101#endif
1086 1102
1087extern int can_do_mlock(void); 1103extern bool can_do_mlock(void);
1088extern int user_shm_lock(size_t, struct user_struct *); 1104extern int user_shm_lock(size_t, struct user_struct *);
1089extern void user_shm_unlock(size_t, struct user_struct *); 1105extern void user_shm_unlock(size_t, struct user_struct *);
1090 1106
@@ -1178,7 +1194,8 @@ int invalidate_inode_page(struct page *page);
1178extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 1194extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1179 unsigned long address, unsigned int flags); 1195 unsigned long address, unsigned int flags);
1180extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, 1196extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1181 unsigned long address, unsigned int fault_flags); 1197 unsigned long address, unsigned int fault_flags,
1198 bool *unlocked);
1182#else 1199#else
1183static inline int handle_mm_fault(struct mm_struct *mm, 1200static inline int handle_mm_fault(struct mm_struct *mm,
1184 struct vm_area_struct *vma, unsigned long address, 1201 struct vm_area_struct *vma, unsigned long address,
@@ -1190,7 +1207,7 @@ static inline int handle_mm_fault(struct mm_struct *mm,
1190} 1207}
1191static inline int fixup_user_fault(struct task_struct *tsk, 1208static inline int fixup_user_fault(struct task_struct *tsk,
1192 struct mm_struct *mm, unsigned long address, 1209 struct mm_struct *mm, unsigned long address,
1193 unsigned int fault_flags) 1210 unsigned int fault_flags, bool *unlocked)
1194{ 1211{
1195 /* should never happen if there's no MMU */ 1212 /* should never happen if there's no MMU */
1196 BUG(); 1213 BUG();
@@ -1444,6 +1461,13 @@ static inline void sync_mm_rss(struct mm_struct *mm)
1444} 1461}
1445#endif 1462#endif
1446 1463
1464#ifndef __HAVE_ARCH_PTE_DEVMAP
1465static inline int pte_devmap(pte_t pte)
1466{
1467 return 0;
1468}
1469#endif
1470
1447int vma_wants_writenotify(struct vm_area_struct *vma); 1471int vma_wants_writenotify(struct vm_area_struct *vma);
1448 1472
1449extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, 1473extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@ -2114,7 +2138,7 @@ int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
2114int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 2138int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2115 unsigned long pfn); 2139 unsigned long pfn);
2116int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 2140int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2117 unsigned long pfn); 2141 pfn_t pfn);
2118int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); 2142int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
2119 2143
2120 2144
@@ -2224,7 +2248,14 @@ pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node);
2224pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); 2248pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
2225pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); 2249pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
2226void *vmemmap_alloc_block(unsigned long size, int node); 2250void *vmemmap_alloc_block(unsigned long size, int node);
2227void *vmemmap_alloc_block_buf(unsigned long size, int node); 2251struct vmem_altmap;
2252void *__vmemmap_alloc_block_buf(unsigned long size, int node,
2253 struct vmem_altmap *altmap);
2254static inline void *vmemmap_alloc_block_buf(unsigned long size, int node)
2255{
2256 return __vmemmap_alloc_block_buf(size, node, NULL);
2257}
2258
2228void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); 2259void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
2229int vmemmap_populate_basepages(unsigned long start, unsigned long end, 2260int vmemmap_populate_basepages(unsigned long start, unsigned long end,
2230 int node); 2261 int node);
@@ -2246,7 +2277,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags);
2246extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); 2277extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
2247extern int unpoison_memory(unsigned long pfn); 2278extern int unpoison_memory(unsigned long pfn);
2248extern int get_hwpoison_page(struct page *page); 2279extern int get_hwpoison_page(struct page *page);
2249extern void put_hwpoison_page(struct page *page); 2280#define put_hwpoison_page(page) put_page(page)
2250extern int sysctl_memory_failure_early_kill; 2281extern int sysctl_memory_failure_early_kill;
2251extern int sysctl_memory_failure_recovery; 2282extern int sysctl_memory_failure_recovery;
2252extern void shake_page(struct page *p, int access); 2283extern void shake_page(struct page *p, int access);
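
The biggest functional change in this header is that get_page()/put_page() now resolve the compound head themselves (and pin or unpin the hosting dev_pagemap for ZONE_DEVICE pages), and that a THP carries two kinds of mapcount. A short sketch of both, illustrative only:

#include <linux/mm.h>
#include <linux/printk.h>

static void hold_briefly(struct page *page)
{
        get_page(page);         /* safe on a tail page: acts on the head */
        /* ... use the page ... */
        put_page(page);         /* final put goes through __put_page() */
}

static void show_mapcounts(struct page *page)
{
        pr_info("subpage=%d compound=%d total=%d\n",
                page_mapcount(page),       /* this subpage's view */
                compound_mapcount(page),   /* PMD-level maps of the whole THP */
                total_mapcount(page));     /* all mappings combined */
}
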
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6bc9a0ce2253..d3ebb9d21a53 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -54,6 +54,8 @@ struct page {
54 * see PAGE_MAPPING_ANON below. 54 * see PAGE_MAPPING_ANON below.
55 */ 55 */
56 void *s_mem; /* slab first object */ 56 void *s_mem; /* slab first object */
57 atomic_t compound_mapcount; /* first tail page */
58 /* page_deferred_list().next -- second tail page */
57 }; 59 };
58 60
59 /* Second double word */ 61 /* Second double word */
@@ -61,6 +63,7 @@ struct page {
61 union { 63 union {
62 pgoff_t index; /* Our offset within mapping. */ 64 pgoff_t index; /* Our offset within mapping. */
63 void *freelist; /* sl[aou]b first free object */ 65 void *freelist; /* sl[aou]b first free object */
66 /* page_deferred_list().prev -- second tail page */
64 }; 67 };
65 68
66 union { 69 union {
@@ -81,20 +84,9 @@ struct page {
81 84
82 union { 85 union {
83 /* 86 /*
84 * Count of ptes mapped in 87 * Count of ptes mapped in mms, to show
85 * mms, to show when page is 88 * when page is mapped & limit reverse
86 * mapped & limit reverse map 89 * map searches.
87 * searches.
88 *
89 * Used also for tail pages
90 * refcounting instead of
91 * _count. Tail pages cannot
92 * be mapped and keeping the
93 * tail page _count zero at
94 * all times guarantees
95 * get_page_unless_zero() will
96 * never succeed on tail
97 * pages.
98 */ 90 */
99 atomic_t _mapcount; 91 atomic_t _mapcount;
100 92
@@ -124,6 +116,11 @@ struct page {
124 * Can be used as a generic list 116 * Can be used as a generic list
125 * by the page owner. 117 * by the page owner.
126 */ 118 */
119 struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
120 * lru or handled by a slab
121 * allocator, this points to the
122 * hosting device page map.
123 */
127 struct { /* slub per cpu partial pages */ 124 struct { /* slub per cpu partial pages */
128 struct page *next; /* Next partial slab */ 125 struct page *next; /* Next partial slab */
129#ifdef CONFIG_64BIT 126#ifdef CONFIG_64BIT
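
The new fields are all parked in tail pages, so struct page itself does not grow: compound_mapcount and the deferred-split list live in the first and second tail page, and pgmap overlays the lru slot of ZONE_DEVICE pages, which are never on an LRU. A tiny illustration of the first of these, using the accessor added to mm.h above:

#include <linux/mm.h>

static int thp_pmd_mapcount(struct page *head)
{
        /* compound_mapcount_ptr() hides the "lives in head[1]" detail */
        return atomic_read(compound_mapcount_ptr(head)) + 1;
}
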
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 772362adf471..053824b0a412 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -56,4 +56,10 @@ void dump_mm(const struct mm_struct *mm);
56#define VIRTUAL_BUG_ON(cond) do { } while (0) 56#define VIRTUAL_BUG_ON(cond) do { } while (0)
57#endif 57#endif
58 58
59#ifdef CONFIG_DEBUG_VM_PGFLAGS
60#define VM_BUG_ON_PGFLAGS(cond, page) VM_BUG_ON_PAGE(cond, page)
61#else
62#define VM_BUG_ON_PGFLAGS(cond, page) BUILD_BUG_ON_INVALID(cond)
63#endif
64
59#endif 65#endif
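
VM_BUG_ON_PGFLAGS() is what the reworked page-flag policies below use for enforcement: with CONFIG_DEBUG_VM_PGFLAGS it is a real VM_BUG_ON_PAGE(), otherwise BUILD_BUG_ON_INVALID() still compiles the condition (so typos are caught) but emits no code. A hedged example of open-coded use:

#include <linux/mmdebug.h>
#include <linux/page-flags.h>

static void mark_head_referenced(struct page *page)
{
        VM_BUG_ON_PGFLAGS(PageTail(page), page);    /* callers must pass a head */
        SetPageReferenced(page);
}
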
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index bb53c7b86315..19724e6ebd26 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -101,9 +101,6 @@ enum pageflags {
101#ifdef CONFIG_MEMORY_FAILURE 101#ifdef CONFIG_MEMORY_FAILURE
102 PG_hwpoison, /* hardware poisoned page. Don't touch */ 102 PG_hwpoison, /* hardware poisoned page. Don't touch */
103#endif 103#endif
104#ifdef CONFIG_TRANSPARENT_HUGEPAGE
105 PG_compound_lock,
106#endif
107#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) 104#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
108 PG_young, 105 PG_young,
109 PG_idle, 106 PG_idle,
@@ -129,53 +126,104 @@ enum pageflags {
129 126
130 /* SLOB */ 127 /* SLOB */
131 PG_slob_free = PG_private, 128 PG_slob_free = PG_private,
129
130 /* Compound pages. Stored in first tail page's flags */
131 PG_double_map = PG_private_2,
132}; 132};
133 133
134#ifndef __GENERATING_BOUNDS_H 134#ifndef __GENERATING_BOUNDS_H
135 135
136struct page; /* forward declaration */
137
138static inline struct page *compound_head(struct page *page)
139{
140 unsigned long head = READ_ONCE(page->compound_head);
141
142 if (unlikely(head & 1))
143 return (struct page *) (head - 1);
144 return page;
145}
146
147static inline int PageTail(struct page *page)
148{
149 return READ_ONCE(page->compound_head) & 1;
150}
151
152static inline int PageCompound(struct page *page)
153{
154 return test_bit(PG_head, &page->flags) || PageTail(page);
155}
156
157/*
158 * Page flags policies wrt compound pages
159 *
160 * PF_ANY:
161 * the page flag is relevant for small, head and tail pages.
162 *
163 * PF_HEAD:
164 * for compound page all operations related to the page flag applied to
165 * head page.
166 *
167 * PF_NO_TAIL:
168 * modifications of the page flag must be done on small or head pages,
169 * checks can be done on tail pages too.
170 *
171 * PF_NO_COMPOUND:
172 * the page flag is not relevant for compound pages.
173 */
174#define PF_ANY(page, enforce) page
175#define PF_HEAD(page, enforce) compound_head(page)
176#define PF_NO_TAIL(page, enforce) ({ \
177 VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \
178 compound_head(page);})
179#define PF_NO_COMPOUND(page, enforce) ({ \
180 VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \
181 page;})
182
136/* 183/*
137 * Macros to create function definitions for page flags 184 * Macros to create function definitions for page flags
138 */ 185 */
139#define TESTPAGEFLAG(uname, lname) \ 186#define TESTPAGEFLAG(uname, lname, policy) \
140static inline int Page##uname(const struct page *page) \ 187static inline int Page##uname(struct page *page) \
141 { return test_bit(PG_##lname, &page->flags); } 188 { return test_bit(PG_##lname, &policy(page, 0)->flags); }
142 189
143#define SETPAGEFLAG(uname, lname) \ 190#define SETPAGEFLAG(uname, lname, policy) \
144static inline void SetPage##uname(struct page *page) \ 191static inline void SetPage##uname(struct page *page) \
145 { set_bit(PG_##lname, &page->flags); } 192 { set_bit(PG_##lname, &policy(page, 1)->flags); }
146 193
147#define CLEARPAGEFLAG(uname, lname) \ 194#define CLEARPAGEFLAG(uname, lname, policy) \
148static inline void ClearPage##uname(struct page *page) \ 195static inline void ClearPage##uname(struct page *page) \
149 { clear_bit(PG_##lname, &page->flags); } 196 { clear_bit(PG_##lname, &policy(page, 1)->flags); }
150 197
151#define __SETPAGEFLAG(uname, lname) \ 198#define __SETPAGEFLAG(uname, lname, policy) \
152static inline void __SetPage##uname(struct page *page) \ 199static inline void __SetPage##uname(struct page *page) \
153 { __set_bit(PG_##lname, &page->flags); } 200 { __set_bit(PG_##lname, &policy(page, 1)->flags); }
154 201
155#define __CLEARPAGEFLAG(uname, lname) \ 202#define __CLEARPAGEFLAG(uname, lname, policy) \
156static inline void __ClearPage##uname(struct page *page) \ 203static inline void __ClearPage##uname(struct page *page) \
157 { __clear_bit(PG_##lname, &page->flags); } 204 { __clear_bit(PG_##lname, &policy(page, 1)->flags); }
158 205
159#define TESTSETFLAG(uname, lname) \ 206#define TESTSETFLAG(uname, lname, policy) \
160static inline int TestSetPage##uname(struct page *page) \ 207static inline int TestSetPage##uname(struct page *page) \
161 { return test_and_set_bit(PG_##lname, &page->flags); } 208 { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
162 209
163#define TESTCLEARFLAG(uname, lname) \ 210#define TESTCLEARFLAG(uname, lname, policy) \
164static inline int TestClearPage##uname(struct page *page) \ 211static inline int TestClearPage##uname(struct page *page) \
165 { return test_and_clear_bit(PG_##lname, &page->flags); } 212 { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
166
167#define __TESTCLEARFLAG(uname, lname) \
168static inline int __TestClearPage##uname(struct page *page) \
169 { return __test_and_clear_bit(PG_##lname, &page->flags); }
170 213
171#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ 214#define PAGEFLAG(uname, lname, policy) \
172 SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname) 215 TESTPAGEFLAG(uname, lname, policy) \
216 SETPAGEFLAG(uname, lname, policy) \
217 CLEARPAGEFLAG(uname, lname, policy)
173 218
174#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ 219#define __PAGEFLAG(uname, lname, policy) \
175 __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname) 220 TESTPAGEFLAG(uname, lname, policy) \
221 __SETPAGEFLAG(uname, lname, policy) \
222 __CLEARPAGEFLAG(uname, lname, policy)
176 223
177#define TESTSCFLAG(uname, lname) \ 224#define TESTSCFLAG(uname, lname, policy) \
178 TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname) 225 TESTSETFLAG(uname, lname, policy) \
226 TESTCLEARFLAG(uname, lname, policy)
179 227
180#define TESTPAGEFLAG_FALSE(uname) \ 228#define TESTPAGEFLAG_FALSE(uname) \
181static inline int Page##uname(const struct page *page) { return 0; } 229static inline int Page##uname(const struct page *page) { return 0; }
@@ -195,56 +243,62 @@ static inline int TestSetPage##uname(struct page *page) { return 0; }
195#define TESTCLEARFLAG_FALSE(uname) \ 243#define TESTCLEARFLAG_FALSE(uname) \
196static inline int TestClearPage##uname(struct page *page) { return 0; } 244static inline int TestClearPage##uname(struct page *page) { return 0; }
197 245
198#define __TESTCLEARFLAG_FALSE(uname) \
199static inline int __TestClearPage##uname(struct page *page) { return 0; }
200
201#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \ 246#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \
202 SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname) 247 SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname)
203 248
204#define TESTSCFLAG_FALSE(uname) \ 249#define TESTSCFLAG_FALSE(uname) \
205 TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) 250 TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
206 251
207struct page; /* forward declaration */ 252__PAGEFLAG(Locked, locked, PF_NO_TAIL)
208 253PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
209TESTPAGEFLAG(Locked, locked) 254PAGEFLAG(Referenced, referenced, PF_HEAD)
210PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) 255 TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
211PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) 256 __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
212 __SETPAGEFLAG(Referenced, referenced) 257PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
213PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) 258 __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
214PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) 259PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
215PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) 260PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
216 TESTCLEARFLAG(Active, active) 261 TESTCLEARFLAG(Active, active, PF_HEAD)
217__PAGEFLAG(Slab, slab) 262__PAGEFLAG(Slab, slab, PF_NO_TAIL)
218PAGEFLAG(Checked, checked) /* Used by some filesystems */ 263__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
219PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ 264PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
220PAGEFLAG(SavePinned, savepinned); /* Xen */ 265
221PAGEFLAG(Foreign, foreign); /* Xen */ 266/* Xen */
222PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) 267PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
223PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) 268 TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
224 __SETPAGEFLAG(SwapBacked, swapbacked) 269PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND);
225 270PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
226__PAGEFLAG(SlobFree, slob_free) 271
272PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
273 __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
274PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
275 __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
276 __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
227 277
228/* 278/*
229 * Private page markings that may be used by the filesystem that owns the page 279 * Private page markings that may be used by the filesystem that owns the page
230 * for its own purposes. 280 * for its own purposes.
231 * - PG_private and PG_private_2 cause releasepage() and co to be invoked 281 * - PG_private and PG_private_2 cause releasepage() and co to be invoked
232 */ 282 */
233PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) 283PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
234 __CLEARPAGEFLAG(Private, private) 284 __CLEARPAGEFLAG(Private, private, PF_ANY)
235PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2) 285PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
236PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) 286PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
287 TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
237 288
238/* 289/*
239 * Only test-and-set exist for PG_writeback. The unconditional operators are 290 * Only test-and-set exist for PG_writeback. The unconditional operators are
240 * risky: they bypass page accounting. 291 * risky: they bypass page accounting.
241 */ 292 */
242TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) 293TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
243PAGEFLAG(MappedToDisk, mappedtodisk) 294 TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
295PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND)
244 296
245/* PG_readahead is only used for reads; PG_reclaim is only for writes */ 297/* PG_readahead is only used for reads; PG_reclaim is only for writes */
246PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) 298PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
247PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) 299 TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
300PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
301 TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
248 302
249#ifdef CONFIG_HIGHMEM 303#ifdef CONFIG_HIGHMEM
250/* 304/*
@@ -257,31 +311,33 @@ PAGEFLAG_FALSE(HighMem)
257#endif 311#endif
258 312
259#ifdef CONFIG_SWAP 313#ifdef CONFIG_SWAP
260PAGEFLAG(SwapCache, swapcache) 314PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
261#else 315#else
262PAGEFLAG_FALSE(SwapCache) 316PAGEFLAG_FALSE(SwapCache)
263#endif 317#endif
264 318
265PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) 319PAGEFLAG(Unevictable, unevictable, PF_HEAD)
266 TESTCLEARFLAG(Unevictable, unevictable) 320 __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
321 TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
267 322
268#ifdef CONFIG_MMU 323#ifdef CONFIG_MMU
269PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) 324PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
270 TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked) 325 __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
326 TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
271#else 327#else
272PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) 328PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
273 TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) 329 TESTSCFLAG_FALSE(Mlocked)
274#endif 330#endif
275 331
276#ifdef CONFIG_ARCH_USES_PG_UNCACHED 332#ifdef CONFIG_ARCH_USES_PG_UNCACHED
277PAGEFLAG(Uncached, uncached) 333PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
278#else 334#else
279PAGEFLAG_FALSE(Uncached) 335PAGEFLAG_FALSE(Uncached)
280#endif 336#endif
281 337
282#ifdef CONFIG_MEMORY_FAILURE 338#ifdef CONFIG_MEMORY_FAILURE
283PAGEFLAG(HWPoison, hwpoison) 339PAGEFLAG(HWPoison, hwpoison, PF_ANY)
284TESTSCFLAG(HWPoison, hwpoison) 340TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
285#define __PG_HWPOISON (1UL << PG_hwpoison) 341#define __PG_HWPOISON (1UL << PG_hwpoison)
286#else 342#else
287PAGEFLAG_FALSE(HWPoison) 343PAGEFLAG_FALSE(HWPoison)
@@ -289,10 +345,10 @@ PAGEFLAG_FALSE(HWPoison)
289#endif 345#endif
290 346
291#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) 347#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
292TESTPAGEFLAG(Young, young) 348TESTPAGEFLAG(Young, young, PF_ANY)
293SETPAGEFLAG(Young, young) 349SETPAGEFLAG(Young, young, PF_ANY)
294TESTCLEARFLAG(Young, young) 350TESTCLEARFLAG(Young, young, PF_ANY)
295PAGEFLAG(Idle, idle) 351PAGEFLAG(Idle, idle, PF_ANY)
296#endif 352#endif
297 353
298/* 354/*
@@ -317,6 +373,7 @@ PAGEFLAG(Idle, idle)
317 373
318static inline int PageAnon(struct page *page) 374static inline int PageAnon(struct page *page)
319{ 375{
376 page = compound_head(page);
320 return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; 377 return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
321} 378}
322 379
@@ -329,6 +386,7 @@ static inline int PageAnon(struct page *page)
329 */ 386 */
330static inline int PageKsm(struct page *page) 387static inline int PageKsm(struct page *page)
331{ 388{
389 page = compound_head(page);
332 return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == 390 return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
333 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 391 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
334} 392}
@@ -340,8 +398,9 @@ u64 stable_page_flags(struct page *page);
340 398
341static inline int PageUptodate(struct page *page) 399static inline int PageUptodate(struct page *page)
342{ 400{
343 int ret = test_bit(PG_uptodate, &(page)->flags); 401 int ret;
344 402 page = compound_head(page);
403 ret = test_bit(PG_uptodate, &(page)->flags);
345 /* 404 /*
346 * Must ensure that the data we read out of the page is loaded 405 * Must ensure that the data we read out of the page is loaded
347 * _after_ we've loaded page->flags to check for PageUptodate. 406 * _after_ we've loaded page->flags to check for PageUptodate.
@@ -358,22 +417,24 @@ static inline int PageUptodate(struct page *page)
358 417
359static inline void __SetPageUptodate(struct page *page) 418static inline void __SetPageUptodate(struct page *page)
360{ 419{
420 VM_BUG_ON_PAGE(PageTail(page), page);
361 smp_wmb(); 421 smp_wmb();
362 __set_bit(PG_uptodate, &(page)->flags); 422 __set_bit(PG_uptodate, &page->flags);
363} 423}
364 424
365static inline void SetPageUptodate(struct page *page) 425static inline void SetPageUptodate(struct page *page)
366{ 426{
427 VM_BUG_ON_PAGE(PageTail(page), page);
367 /* 428 /*
368 * Memory barrier must be issued before setting the PG_uptodate bit, 429 * Memory barrier must be issued before setting the PG_uptodate bit,
369 * so that all previous stores issued in order to bring the page 430 * so that all previous stores issued in order to bring the page
370 * uptodate are actually visible before PageUptodate becomes true. 431 * uptodate are actually visible before PageUptodate becomes true.
371 */ 432 */
372 smp_wmb(); 433 smp_wmb();
373 set_bit(PG_uptodate, &(page)->flags); 434 set_bit(PG_uptodate, &page->flags);
374} 435}
375 436
376CLEARPAGEFLAG(Uptodate, uptodate) 437CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
377 438
378int test_clear_page_writeback(struct page *page); 439int test_clear_page_writeback(struct page *page);
379int __test_set_page_writeback(struct page *page, bool keep_write); 440int __test_set_page_writeback(struct page *page, bool keep_write);
@@ -393,12 +454,7 @@ static inline void set_page_writeback_keepwrite(struct page *page)
393 test_set_page_writeback_keepwrite(page); 454 test_set_page_writeback_keepwrite(page);
394} 455}
395 456
396__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) 457__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
397
398static inline int PageTail(struct page *page)
399{
400 return READ_ONCE(page->compound_head) & 1;
401}
402 458
403static inline void set_compound_head(struct page *page, struct page *head) 459static inline void set_compound_head(struct page *page, struct page *head)
404{ 460{
@@ -410,20 +466,6 @@ static inline void clear_compound_head(struct page *page)
410 WRITE_ONCE(page->compound_head, 0); 466 WRITE_ONCE(page->compound_head, 0);
411} 467}
412 468
413static inline struct page *compound_head(struct page *page)
414{
415 unsigned long head = READ_ONCE(page->compound_head);
416
417 if (unlikely(head & 1))
418 return (struct page *) (head - 1);
419 return page;
420}
421
422static inline int PageCompound(struct page *page)
423{
424 return PageHead(page) || PageTail(page);
425
426}
427#ifdef CONFIG_TRANSPARENT_HUGEPAGE 469#ifdef CONFIG_TRANSPARENT_HUGEPAGE
428static inline void ClearPageCompound(struct page *page) 470static inline void ClearPageCompound(struct page *page)
429{ 471{
@@ -484,22 +526,43 @@ static inline int PageTransTail(struct page *page)
484 return PageTail(page); 526 return PageTail(page);
485} 527}
486 528
487#else 529/*
488 530 * PageDoubleMap indicates that the compound page is mapped with PTEs as well
489static inline int PageTransHuge(struct page *page) 531 * as PMDs.
532 *
533 * This is required for optimization of rmap operations for THP: we can postpone
534 * per small page mapcount accounting (and its overhead from atomic operations)
535 * until the first PMD split.
536 *
537 * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up
538 * by one. This reference will go away with last compound_mapcount.
539 *
540 * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap().
541 */
542static inline int PageDoubleMap(struct page *page)
490{ 543{
491 return 0; 544 return PageHead(page) && test_bit(PG_double_map, &page[1].flags);
492} 545}
493 546
494static inline int PageTransCompound(struct page *page) 547static inline int TestSetPageDoubleMap(struct page *page)
495{ 548{
496 return 0; 549 VM_BUG_ON_PAGE(!PageHead(page), page);
550 return test_and_set_bit(PG_double_map, &page[1].flags);
497} 551}
498 552
499static inline int PageTransTail(struct page *page) 553static inline int TestClearPageDoubleMap(struct page *page)
500{ 554{
501 return 0; 555 VM_BUG_ON_PAGE(!PageHead(page), page);
556 return test_and_clear_bit(PG_double_map, &page[1].flags);
502} 557}
558
559#else
560TESTPAGEFLAG_FALSE(TransHuge)
561TESTPAGEFLAG_FALSE(TransCompound)
562TESTPAGEFLAG_FALSE(TransTail)
563TESTPAGEFLAG_FALSE(DoubleMap)
564 TESTSETFLAG_FALSE(DoubleMap)
565 TESTCLEARFLAG_FALSE(DoubleMap)
503#endif 566#endif
504 567
505/* 568/*
@@ -583,12 +646,6 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
583#define __PG_MLOCKED 0 646#define __PG_MLOCKED 0
584#endif 647#endif
585 648
586#ifdef CONFIG_TRANSPARENT_HUGEPAGE
587#define __PG_COMPOUND_LOCK (1 << PG_compound_lock)
588#else
589#define __PG_COMPOUND_LOCK 0
590#endif
591
592/* 649/*
593 * Flags checked when a page is freed. Pages being freed should not have 650 * Flags checked when a page is freed. Pages being freed should not have
594 * these flags set. If they are, there is a problem. 651 * these flags set. If they are, there is a problem.
@@ -598,8 +655,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
598 1 << PG_private | 1 << PG_private_2 | \ 655 1 << PG_private | 1 << PG_private_2 | \
599 1 << PG_writeback | 1 << PG_reserved | \ 656 1 << PG_writeback | 1 << PG_reserved | \
600 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ 657 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
601 1 << PG_unevictable | __PG_MLOCKED | \ 658 1 << PG_unevictable | __PG_MLOCKED)
602 __PG_COMPOUND_LOCK)
603 659
604/* 660/*
605 * Flags checked when a page is prepped for return by the page allocator. 661 * Flags checked when a page is prepped for return by the page allocator.
@@ -626,6 +682,10 @@ static inline int page_has_private(struct page *page)
626 return !!(page->flags & PAGE_FLAGS_PRIVATE); 682 return !!(page->flags & PAGE_FLAGS_PRIVATE);
627} 683}
628 684
685#undef PF_ANY
686#undef PF_HEAD
687#undef PF_NO_TAIL
688#undef PF_NO_COMPOUND
629#endif /* !__GENERATING_BOUNDS_H */ 689#endif /* !__GENERATING_BOUNDS_H */
630 690
631#endif /* PAGE_FLAGS_H */ 691#endif /* PAGE_FLAGS_H */
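
Hand-expanding one of the new policy-based definitions makes the mechanism concrete: PAGEFLAG(Dirty, dirty, PF_HEAD) produces accessors that first redirect any tail page to its compound head, roughly as below. Written out manually for illustration; the generated names are the real PageDirty()/SetPageDirty():

#include <linux/page-flags.h>

static inline int PageDirty_expansion(struct page *page)
{
        return test_bit(PG_dirty, &compound_head(page)->flags);
}

static inline void SetPageDirty_expansion(struct page *page)
{
        set_bit(PG_dirty, &compound_head(page)->flags);
}
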
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 26eabf5ec718..4d08b6c33557 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -394,10 +394,21 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
394 */ 394 */
395static inline pgoff_t page_to_pgoff(struct page *page) 395static inline pgoff_t page_to_pgoff(struct page *page)
396{ 396{
397 pgoff_t pgoff;
398
397 if (unlikely(PageHeadHuge(page))) 399 if (unlikely(PageHeadHuge(page)))
398 return page->index << compound_order(page); 400 return page->index << compound_order(page);
399 else 401
402 if (likely(!PageTransTail(page)))
400 return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 403 return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
404
405 /*
406 * We don't initialize ->index for tail pages: calculate based on
407 * head page
408 */
409 pgoff = compound_head(page)->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
410 pgoff += page - compound_head(page);
411 return pgoff;
401} 412}
402 413
403/* 414/*
@@ -433,18 +444,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
433 unsigned int flags); 444 unsigned int flags);
434extern void unlock_page(struct page *page); 445extern void unlock_page(struct page *page);
435 446
436static inline void __set_page_locked(struct page *page)
437{
438 __set_bit(PG_locked, &page->flags);
439}
440
441static inline void __clear_page_locked(struct page *page)
442{
443 __clear_bit(PG_locked, &page->flags);
444}
445
446static inline int trylock_page(struct page *page) 447static inline int trylock_page(struct page *page)
447{ 448{
449 page = compound_head(page);
448 return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); 450 return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
449} 451}
450 452
@@ -497,9 +499,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page,
497 499
498static inline int wait_on_page_locked_killable(struct page *page) 500static inline int wait_on_page_locked_killable(struct page *page)
499{ 501{
500 if (PageLocked(page)) 502 if (!PageLocked(page))
501 return wait_on_page_bit_killable(page, PG_locked); 503 return 0;
502 return 0; 504 return wait_on_page_bit_killable(compound_head(page), PG_locked);
503} 505}
504 506
505extern wait_queue_head_t *page_waitqueue(struct page *page); 507extern wait_queue_head_t *page_waitqueue(struct page *page);
@@ -518,7 +520,7 @@ static inline void wake_up_page(struct page *page, int bit)
518static inline void wait_on_page_locked(struct page *page) 520static inline void wait_on_page_locked(struct page *page)
519{ 521{
520 if (PageLocked(page)) 522 if (PageLocked(page))
521 wait_on_page_bit(page, PG_locked); 523 wait_on_page_bit(compound_head(page), PG_locked);
522} 524}
523 525
524/* 526/*
@@ -664,17 +666,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
664 666
665/* 667/*
666 * Like add_to_page_cache_locked, but used to add newly allocated pages: 668 * Like add_to_page_cache_locked, but used to add newly allocated pages:
667 * the page is new, so we can just run __set_page_locked() against it. 669 * the page is new, so we can just run __SetPageLocked() against it.
668 */ 670 */
669static inline int add_to_page_cache(struct page *page, 671static inline int add_to_page_cache(struct page *page,
670 struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) 672 struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
671{ 673{
672 int error; 674 int error;
673 675
674 __set_page_locked(page); 676 __SetPageLocked(page);
675 error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); 677 error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
676 if (unlikely(error)) 678 if (unlikely(error))
677 __clear_page_locked(page); 679 __ClearPageLocked(page);
678 return error; 680 return error;
679} 681}
680 682
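
Tail pages never get ->index initialised, so the new page_to_pgoff() derives a tail's file offset from its head. A by-hand equivalent for a non-hugetlb compound page, illustrative only:

#include <linux/pagemap.h>

static pgoff_t tail_pgoff_by_hand(struct page *tail)
{
        struct page *head = compound_head(tail);
        pgoff_t pgoff = head->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

        /* same result as page_to_pgoff(tail) for a non-hugetlb THP */
        return pgoff + (tail - head);
}
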
diff --git a/include/linux/pfn.h b/include/linux/pfn.h
index 97f3e88aead4..2d8e49711b63 100644
--- a/include/linux/pfn.h
+++ b/include/linux/pfn.h
@@ -3,6 +3,15 @@
3 3
4#ifndef __ASSEMBLY__ 4#ifndef __ASSEMBLY__
5#include <linux/types.h> 5#include <linux/types.h>
6
7/*
8 * pfn_t: encapsulates a page-frame number that is optionally backed
9 * by memmap (struct page). Whether a pfn_t has a 'struct page'
10 * backing is indicated by flags in the high bits of the value.
11 */
12typedef struct {
13 unsigned long val;
14} pfn_t;
6#endif 15#endif
7 16
8#define PFN_ALIGN(x) (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK) 17#define PFN_ALIGN(x) (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
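
Wrapping the value in a one-member struct is deliberate: it lets the high bits carry flags (see the new pfn_t.h below) and makes the compiler reject accidental mixing of bare pfns with flagged ones. A minimal illustration; both helper functions are hypothetical:

#include <linux/pfn.h>

static void wants_pfn_t(pfn_t pfn) { (void)pfn.val; }           /* hypothetical */
static void wants_raw_pfn(unsigned long pfn) { (void)pfn; }     /* hypothetical */

static void type_safety_example(unsigned long raw)
{
        pfn_t pfn = { .val = raw };

        wants_pfn_t(pfn);       /* fine */
        wants_raw_pfn(raw);     /* fine */
        /* wants_pfn_t(raw);       rejected: incompatible types */
}
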
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
new file mode 100644
index 000000000000..0703b5360d31
--- /dev/null
+++ b/include/linux/pfn_t.h
@@ -0,0 +1,102 @@
1#ifndef _LINUX_PFN_T_H_
2#define _LINUX_PFN_T_H_
3#include <linux/mm.h>
4
5/*
6 * PFN_FLAGS_MASK - mask of all the possible valid pfn_t flags
7 * PFN_SG_CHAIN - pfn is a pointer to the next scatterlist entry
8 * PFN_SG_LAST - pfn references a page and is the last scatterlist entry
9 * PFN_DEV - pfn is not covered by system memmap by default
10 * PFN_MAP - pfn has a dynamic page mapping established by a device driver
11 */
12#define PFN_FLAGS_MASK (((unsigned long) ~PAGE_MASK) \
13 << (BITS_PER_LONG - PAGE_SHIFT))
14#define PFN_SG_CHAIN (1UL << (BITS_PER_LONG - 1))
15#define PFN_SG_LAST (1UL << (BITS_PER_LONG - 2))
16#define PFN_DEV (1UL << (BITS_PER_LONG - 3))
17#define PFN_MAP (1UL << (BITS_PER_LONG - 4))
18
19static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, unsigned long flags)
20{
21 pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };
22
23 return pfn_t;
24}
25
26/* a default pfn to pfn_t conversion assumes that @pfn is pfn_valid() */
27static inline pfn_t pfn_to_pfn_t(unsigned long pfn)
28{
29 return __pfn_to_pfn_t(pfn, 0);
30}
31
32extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags);
33
34static inline bool pfn_t_has_page(pfn_t pfn)
35{
36 return (pfn.val & PFN_MAP) == PFN_MAP || (pfn.val & PFN_DEV) == 0;
37}
38
39static inline unsigned long pfn_t_to_pfn(pfn_t pfn)
40{
41 return pfn.val & ~PFN_FLAGS_MASK;
42}
43
44static inline struct page *pfn_t_to_page(pfn_t pfn)
45{
46 if (pfn_t_has_page(pfn))
47 return pfn_to_page(pfn_t_to_pfn(pfn));
48 return NULL;
49}
50
51static inline dma_addr_t pfn_t_to_phys(pfn_t pfn)
52{
53 return PFN_PHYS(pfn_t_to_pfn(pfn));
54}
55
56static inline void *pfn_t_to_virt(pfn_t pfn)
57{
58 if (pfn_t_has_page(pfn))
59 return __va(pfn_t_to_phys(pfn));
60 return NULL;
61}
62
63static inline pfn_t page_to_pfn_t(struct page *page)
64{
65 return pfn_to_pfn_t(page_to_pfn(page));
66}
67
68static inline int pfn_t_valid(pfn_t pfn)
69{
70 return pfn_valid(pfn_t_to_pfn(pfn));
71}
72
73#ifdef CONFIG_MMU
74static inline pte_t pfn_t_pte(pfn_t pfn, pgprot_t pgprot)
75{
76 return pfn_pte(pfn_t_to_pfn(pfn), pgprot);
77}
78#endif
79
80#ifdef CONFIG_TRANSPARENT_HUGEPAGE
81static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot)
82{
83 return pfn_pmd(pfn_t_to_pfn(pfn), pgprot);
84}
85#endif
86
87#ifdef __HAVE_ARCH_PTE_DEVMAP
88static inline bool pfn_t_devmap(pfn_t pfn)
89{
90 const unsigned long flags = PFN_DEV|PFN_MAP;
91
92 return (pfn.val & flags) == flags;
93}
94#else
95static inline bool pfn_t_devmap(pfn_t pfn)
96{
97 return false;
98}
99pte_t pte_mkdevmap(pte_t pte);
100pmd_t pmd_mkdevmap(pmd_t pmd);
101#endif
102#endif /* _LINUX_PFN_T_H_ */
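
A hedged sketch of the intended consumer: a DAX-style fault path converts a device address into a pfn_t tagged PFN_DEV|PFN_MAP (so pfn_t_devmap() above is true) and hands it to vm_insert_mixed(), whose prototype in mm.h now takes a pfn_t. Everything except the pfn_t and mm helpers is hypothetical:

#include <linux/pfn_t.h>
#include <linux/mm.h>

static int dax_insert_example(struct vm_area_struct *vma, unsigned long vaddr,
                              dma_addr_t phys)
{
        pfn_t pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);

        return vm_insert_mixed(vma, vaddr, pfn);
}
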
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 317e16de09e5..4a27153574e2 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -27,11 +27,15 @@
27 * Magic number "tsta" to indicate a static timer initializer 27 * Magic number "tsta" to indicate a static timer initializer
28 * for the object debugging code. 28 * for the object debugging code.
29 */ 29 */
30#define TIMER_ENTRY_STATIC ((void *) 0x74737461) 30#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
31 31
32/********** mm/debug-pagealloc.c **********/ 32/********** mm/debug-pagealloc.c **********/
33#define PAGE_POISON 0xaa 33#define PAGE_POISON 0xaa
34 34
35/********** mm/page_alloc.c ************/
36
37#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA)
38
35/********** mm/slab.c **********/ 39/********** mm/slab.c **********/
36/* 40/*
37 * Magic nums for obj red zoning. 41 * Magic nums for obj red zoning.
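
TAIL_MAPPING is a new poison value for the ->mapping field of tail pages: a stray dereference faults (POISON_POINTER_DELTA keeps it in an unmapped range on the relevant architectures), and sanity checks can recognise a tail page whose head was freed under it. A hedged illustration of such a check, not the actual mm/page_alloc.c code:

#include <linux/poison.h>
#include <linux/mm_types.h>

static bool tail_mapping_intact(struct page *tail)
{
        return tail->mapping == TAIL_MAPPING;
}
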
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9729565c25ff..9ccbdf2c1453 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -106,13 +106,13 @@ struct va_format {
106 106
107/* 107/*
108 * Dummy printk for disabled debugging statements to use whilst maintaining 108 * Dummy printk for disabled debugging statements to use whilst maintaining
109 * gcc's format and side-effect checking. 109 * gcc's format checking.
110 */ 110 */
111static inline __printf(1, 2) 111#define no_printk(fmt, ...) \
112int no_printk(const char *fmt, ...) 112do { \
113{ 113 if (0) \
114 return 0; 114 printk(fmt, ##__VA_ARGS__); \
115} 115} while (0)
116 116
117#ifdef CONFIG_EARLY_PRINTK 117#ifdef CONFIG_EARLY_PRINTK
118extern asmlinkage __printf(1, 2) 118extern asmlinkage __printf(1, 2)
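The macro form keeps gcc's printf-format checking while letting the compiler discard the statically dead call, including evaluation of its arguments. A user-space illustration of the same if (0) pattern, with a hypothetical no_printf() standing in for no_printk():

#include <stdio.h>

#define no_printf(fmt, ...)			\
do {						\
	if (0)					\
		printf(fmt, ##__VA_ARGS__);	\
} while (0)

int main(void)
{
	/* Still format-checked: a type mismatch here warns at compile
	 * time, yet the branch is statically dead and can be dropped. */
	no_printf("value = %d\n", 42);
	return 0;
}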
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 29446aeef36e..bdf597c4f0be 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -85,6 +85,7 @@ enum ttu_flags {
85 TTU_UNMAP = 1, /* unmap mode */ 85 TTU_UNMAP = 1, /* unmap mode */
86 TTU_MIGRATION = 2, /* migration mode */ 86 TTU_MIGRATION = 2, /* migration mode */
87 TTU_MUNLOCK = 4, /* munlock mode */ 87 TTU_MUNLOCK = 4, /* munlock mode */
88 TTU_LZFREE = 8, /* lazy free mode */
88 89
89 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ 90 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
90 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ 91 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
@@ -161,25 +162,31 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
161 162
162struct anon_vma *page_get_anon_vma(struct page *page); 163struct anon_vma *page_get_anon_vma(struct page *page);
163 164
165/* bitflags for do_page_add_anon_rmap() */
166#define RMAP_EXCLUSIVE 0x01
167#define RMAP_COMPOUND 0x02
168
164/* 169/*
165 * rmap interfaces called when adding or removing pte of page 170 * rmap interfaces called when adding or removing pte of page
166 */ 171 */
167void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); 172void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
168void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); 173void page_add_anon_rmap(struct page *, struct vm_area_struct *,
174 unsigned long, bool);
169void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, 175void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
170 unsigned long, int); 176 unsigned long, int);
171void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); 177void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
178 unsigned long, bool);
172void page_add_file_rmap(struct page *); 179void page_add_file_rmap(struct page *);
173void page_remove_rmap(struct page *); 180void page_remove_rmap(struct page *, bool);
174 181
175void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, 182void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
176 unsigned long); 183 unsigned long);
177void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, 184void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
178 unsigned long); 185 unsigned long);
179 186
180static inline void page_dup_rmap(struct page *page) 187static inline void page_dup_rmap(struct page *page, bool compound)
181{ 188{
182 atomic_inc(&page->_mapcount); 189 atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
183} 190}
184 191
185/* 192/*
@@ -210,6 +217,25 @@ static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm,
210} 217}
211 218
212/* 219/*
220 * Used by idle page tracking to check if a page was referenced via page
221 * tables.
222 */
223#ifdef CONFIG_TRANSPARENT_HUGEPAGE
224bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
225 unsigned long address, pmd_t **pmdp,
226 pte_t **ptep, spinlock_t **ptlp);
227#else
228static inline bool page_check_address_transhuge(struct page *page,
229 struct mm_struct *mm, unsigned long address,
230 pmd_t **pmdp, pte_t **ptep, spinlock_t **ptlp)
231{
232 *ptep = page_check_address(page, mm, address, ptlp, 0);
233 *pmdp = NULL;
234 return !!*ptep;
235}
236#endif
237
238/*
213 * Used by swapoff to help locate where page is expected in vma. 239 * Used by swapoff to help locate where page is expected in vma.
214 */ 240 */
215unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); 241unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
@@ -286,5 +312,6 @@ static inline int page_mkclean(struct page *page)
286#define SWAP_AGAIN 1 312#define SWAP_AGAIN 1
287#define SWAP_FAIL 2 313#define SWAP_FAIL 2
288#define SWAP_MLOCK 3 314#define SWAP_MLOCK 3
315#define SWAP_LZFREE 4
289 316
290#endif /* _LINUX_RMAP_H */ 317#endif /* _LINUX_RMAP_H */
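A hedged caller sketch for the new page_check_address_transhuge() prototype, modelled on the idle-page-tracking use named in its comment; the young-bit handling and the lock/unmap conventions are assumptions drawn from the !THP fallback above, not from this hunk:

/* Sketch only: was @page recently referenced in @mm at @addr? */
static bool example_page_young(struct page *page, struct mm_struct *mm,
			       unsigned long addr)
{
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	bool young = false;

	if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl))
		return false;		/* not mapped here */
	if (pte) {			/* regular pte mapping */
		young = pte_young(*pte);
		pte_unmap(pte);
	} else if (pmd) {		/* PMD-mapped THP */
		young = pmd_young(*pmd);
	}
	spin_unlock(ptl);		/* assumed held on success, as in the
					 * page_check_address() fallback */
	return young;
}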
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 066bd21765ad..414e101cd061 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -307,6 +307,7 @@ extern void lru_add_drain_cpu(int cpu);
307extern void lru_add_drain_all(void); 307extern void lru_add_drain_all(void);
308extern void rotate_reclaimable_page(struct page *page); 308extern void rotate_reclaimable_page(struct page *page);
309extern void deactivate_file_page(struct page *page); 309extern void deactivate_file_page(struct page *page);
310extern void deactivate_page(struct page *page);
310extern void swap_setup(void); 311extern void swap_setup(void);
311 312
312extern void add_page_to_unevictable_list(struct page *page); 313extern void add_page_to_unevictable_list(struct page *page);
@@ -538,7 +539,8 @@ static inline int swp_swapcount(swp_entry_t entry)
538 return 0; 539 return 0;
539} 540}
540 541
541#define reuse_swap_page(page) (page_mapcount(page) == 1) 542#define reuse_swap_page(page) \
543 (!PageTransCompound(page) && page_mapcount(page) == 1)
542 544
543static inline int try_to_free_swap(struct page *page) 545static inline int try_to_free_swap(struct page *page)
544{ 546{
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index e623d392db0c..67c1dbd19c6d 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
25 FOR_ALL_ZONES(PGALLOC), 25 FOR_ALL_ZONES(PGALLOC),
26 PGFREE, PGACTIVATE, PGDEACTIVATE, 26 PGFREE, PGACTIVATE, PGDEACTIVATE,
27 PGFAULT, PGMAJFAULT, 27 PGFAULT, PGMAJFAULT,
28 PGLAZYFREED,
28 FOR_ALL_ZONES(PGREFILL), 29 FOR_ALL_ZONES(PGREFILL),
29 FOR_ALL_ZONES(PGSTEAL_KSWAPD), 30 FOR_ALL_ZONES(PGSTEAL_KSWAPD),
30 FOR_ALL_ZONES(PGSTEAL_DIRECT), 31 FOR_ALL_ZONES(PGSTEAL_DIRECT),
@@ -68,7 +69,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
68 THP_FAULT_FALLBACK, 69 THP_FAULT_FALLBACK,
69 THP_COLLAPSE_ALLOC, 70 THP_COLLAPSE_ALLOC,
70 THP_COLLAPSE_ALLOC_FAILED, 71 THP_COLLAPSE_ALLOC_FAILED,
71 THP_SPLIT, 72 THP_SPLIT_PAGE,
73 THP_SPLIT_PAGE_FAILED,
74 THP_SPLIT_PMD,
72 THP_ZERO_PAGE_ALLOC, 75 THP_ZERO_PAGE_ALLOC,
73 THP_ZERO_PAGE_ALLOC_FAILED, 76 THP_ZERO_PAGE_ALLOC_FAILED,
74#endif 77#endif
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 97d635cabac8..0f803d2783e3 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -22,6 +22,7 @@
22 EM( SCAN_PAGE_LRU, "page_not_in_lru") \ 22 EM( SCAN_PAGE_LRU, "page_not_in_lru") \
23 EM( SCAN_PAGE_LOCK, "page_locked") \ 23 EM( SCAN_PAGE_LOCK, "page_locked") \
24 EM( SCAN_PAGE_ANON, "page_not_anon") \ 24 EM( SCAN_PAGE_ANON, "page_not_anon") \
25 EM( SCAN_PAGE_COMPOUND, "page_compound") \
25 EM( SCAN_ANY_PROCESS, "no_process_for_page") \ 26 EM( SCAN_ANY_PROCESS, "no_process_for_page") \
26 EM( SCAN_VMA_NULL, "vma_null") \ 27 EM( SCAN_VMA_NULL, "vma_null") \
27 EM( SCAN_VMA_CHECK, "vma_check_failed") \ 28 EM( SCAN_VMA_CHECK, "vma_check_failed") \
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index a74dd84bbb6d..58274382a616 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -41,6 +41,7 @@
41#define MADV_DONTNEED 4 /* don't need these pages */ 41#define MADV_DONTNEED 4 /* don't need these pages */
42 42
43/* common parameters: try to keep these consistent across architectures */ 43/* common parameters: try to keep these consistent across architectures */
44#define MADV_FREE 8 /* free pages only if memory pressure */
44#define MADV_REMOVE 9 /* remove these pages & resources */ 45#define MADV_REMOVE 9 /* remove these pages & resources */
45#define MADV_DONTFORK 10 /* don't inherit across fork */ 46#define MADV_DONTFORK 10 /* don't inherit across fork */
46#define MADV_DOFORK 11 /* do inherit across fork */ 47#define MADV_DOFORK 11 /* do inherit across fork */
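A minimal user-space sketch of the new advice value: dirty an anonymous mapping, then tell the kernel the contents may be dropped lazily under memory pressure. The buffer size and the fallback define are illustrative:

#include <sys/mman.h>
#include <string.h>

#ifndef MADV_FREE
#define MADV_FREE 8	/* matches the value added above */
#endif

int main(void)
{
	size_t len = 1 << 20;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0xaa, len);		/* dirty the pages */
	/* Pages may now be reclaimed without writeback; reading them
	 * later returns either the old data or zero-filled pages. */
	madvise(buf, len, MADV_FREE);
	return 0;
}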
diff --git a/init/Kconfig b/init/Kconfig
index 5481b49e8c3f..4644217b2373 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -285,7 +285,7 @@ config FHANDLE
285 285
286config USELIB 286config USELIB
287 bool "uselib syscall" 287 bool "uselib syscall"
288 default y 288 def_bool ALPHA || M68K || SPARC || X86_32 || IA32_EMULATION
289 help 289 help
290 This option enables the uselib syscall, a system call used in the 290 This option enables the uselib syscall, a system call used in the
291 dynamic linker from libc5 and earlier. glibc does not use this 291 dynamic linker from libc5 and earlier. glibc does not use this
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index bb0669169716..0167679182c0 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
161 const unsigned long mmun_end = addr + PAGE_SIZE; 161 const unsigned long mmun_end = addr + PAGE_SIZE;
162 struct mem_cgroup *memcg; 162 struct mem_cgroup *memcg;
163 163
164 err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); 164 err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
165 false);
165 if (err) 166 if (err)
166 return err; 167 return err;
167 168
@@ -175,8 +176,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
175 goto unlock; 176 goto unlock;
176 177
177 get_page(kpage); 178 get_page(kpage);
178 page_add_new_anon_rmap(kpage, vma, addr); 179 page_add_new_anon_rmap(kpage, vma, addr, false);
179 mem_cgroup_commit_charge(kpage, memcg, false); 180 mem_cgroup_commit_charge(kpage, memcg, false, false);
180 lru_cache_add_active_or_unevictable(kpage, vma); 181 lru_cache_add_active_or_unevictable(kpage, vma);
181 182
182 if (!PageAnon(page)) { 183 if (!PageAnon(page)) {
@@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
188 ptep_clear_flush_notify(vma, addr, ptep); 189 ptep_clear_flush_notify(vma, addr, ptep);
189 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 190 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
190 191
191 page_remove_rmap(page); 192 page_remove_rmap(page, false);
192 if (!page_mapped(page)) 193 if (!page_mapped(page))
193 try_to_free_swap(page); 194 try_to_free_swap(page);
194 pte_unmap_unlock(ptep, ptl); 195 pte_unmap_unlock(ptep, ptl);
@@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
199 200
200 err = 0; 201 err = 0;
201 unlock: 202 unlock:
202 mem_cgroup_cancel_charge(kpage, memcg); 203 mem_cgroup_cancel_charge(kpage, memcg, false);
203 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 204 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
204 unlock_page(page); 205 unlock_page(page);
205 return err; 206 return err;
diff --git a/kernel/futex.c b/kernel/futex.c
index 8a310e240cda..c6f514573b28 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
469{ 469{
470 unsigned long address = (unsigned long)uaddr; 470 unsigned long address = (unsigned long)uaddr;
471 struct mm_struct *mm = current->mm; 471 struct mm_struct *mm = current->mm;
472 struct page *page, *page_head; 472 struct page *page;
473 struct address_space *mapping;
473 int err, ro = 0; 474 int err, ro = 0;
474 475
475 /* 476 /*
@@ -519,46 +520,9 @@ again:
519 else 520 else
520 err = 0; 521 err = 0;
521 522
522#ifdef CONFIG_TRANSPARENT_HUGEPAGE 523 lock_page(page);
523 page_head = page;
524 if (unlikely(PageTail(page))) {
525 put_page(page);
526 /* serialize against __split_huge_page_splitting() */
527 local_irq_disable();
528 if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
529 page_head = compound_head(page);
530 /*
531 * page_head is valid pointer but we must pin
532 * it before taking the PG_lock and/or
533 * PG_compound_lock. The moment we re-enable
534 * irqs __split_huge_page_splitting() can
535 * return and the head page can be freed from
536 * under us. We can't take the PG_lock and/or
537 * PG_compound_lock on a page that could be
538 * freed from under us.
539 */
540 if (page != page_head) {
541 get_page(page_head);
542 put_page(page);
543 }
544 local_irq_enable();
545 } else {
546 local_irq_enable();
547 goto again;
548 }
549 }
550#else
551 page_head = compound_head(page);
552 if (page != page_head) {
553 get_page(page_head);
554 put_page(page);
555 }
556#endif
557
558 lock_page(page_head);
559
560 /* 524 /*
561 * If page_head->mapping is NULL, then it cannot be a PageAnon 525 * If page->mapping is NULL, then it cannot be a PageAnon
562 * page; but it might be the ZERO_PAGE or in the gate area or 526 * page; but it might be the ZERO_PAGE or in the gate area or
563 * in a special mapping (all cases which we are happy to fail); 527 * in a special mapping (all cases which we are happy to fail);
564 * or it may have been a good file page when get_user_pages_fast 528 * or it may have been a good file page when get_user_pages_fast
@@ -570,12 +534,13 @@ again:
570 * 534 *
571 * The case we do have to guard against is when memory pressure made 535 * The case we do have to guard against is when memory pressure made
572 * shmem_writepage move it from filecache to swapcache beneath us: 536 * shmem_writepage move it from filecache to swapcache beneath us:
573 * an unlikely race, but we do need to retry for page_head->mapping. 537 * an unlikely race, but we do need to retry for page->mapping.
574 */ 538 */
575 if (!page_head->mapping) { 539 mapping = compound_head(page)->mapping;
576 int shmem_swizzled = PageSwapCache(page_head); 540 if (!mapping) {
577 unlock_page(page_head); 541 int shmem_swizzled = PageSwapCache(page);
578 put_page(page_head); 542 unlock_page(page);
543 put_page(page);
579 if (shmem_swizzled) 544 if (shmem_swizzled)
580 goto again; 545 goto again;
581 return -EFAULT; 546 return -EFAULT;
@@ -588,7 +553,7 @@ again:
588 * it's a read-only handle, it's expected that futexes attach to 553 * it's a read-only handle, it's expected that futexes attach to
589 * the object not the particular process. 554 * the object not the particular process.
590 */ 555 */
591 if (PageAnon(page_head)) { 556 if (PageAnon(page)) {
592 /* 557 /*
593 * A RO anonymous page will never change and thus doesn't make 558 * A RO anonymous page will never change and thus doesn't make
594 * sense for futex operations. 559 * sense for futex operations.
@@ -603,15 +568,15 @@ again:
603 key->private.address = address; 568 key->private.address = address;
604 } else { 569 } else {
605 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 570 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
606 key->shared.inode = page_head->mapping->host; 571 key->shared.inode = mapping->host;
607 key->shared.pgoff = basepage_index(page); 572 key->shared.pgoff = basepage_index(page);
608 } 573 }
609 574
610 get_futex_key_refs(key); /* implies MB (B) */ 575 get_futex_key_refs(key); /* implies MB (B) */
611 576
612out: 577out:
613 unlock_page(page_head); 578 unlock_page(page);
614 put_page(page_head); 579 put_page(page);
615 return err; 580 return err;
616} 581}
617 582
@@ -639,7 +604,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
639 604
640 down_read(&mm->mmap_sem); 605 down_read(&mm->mmap_sem);
641 ret = fixup_user_fault(current, mm, (unsigned long)uaddr, 606 ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
642 FAULT_FLAG_WRITE); 607 FAULT_FLAG_WRITE, NULL);
643 up_read(&mm->mmap_sem); 608 up_read(&mm->mmap_sem);
644 609
645 return ret < 0 ? ret : 0; 610 return ret < 0 ? ret : 0;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 7658d32c5c78..e517a16cb426 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -10,8 +10,11 @@
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details. 11 * General Public License for more details.
12 */ 12 */
13#include <linux/radix-tree.h>
14#include <linux/memremap.h>
13#include <linux/device.h> 15#include <linux/device.h>
14#include <linux/types.h> 16#include <linux/types.h>
17#include <linux/pfn_t.h>
15#include <linux/io.h> 18#include <linux/io.h>
16#include <linux/mm.h> 19#include <linux/mm.h>
17#include <linux/memory_hotplug.h> 20#include <linux/memory_hotplug.h>
@@ -147,24 +150,127 @@ void devm_memunmap(struct device *dev, void *addr)
147} 150}
148EXPORT_SYMBOL(devm_memunmap); 151EXPORT_SYMBOL(devm_memunmap);
149 152
153pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
154{
155 return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
156}
157EXPORT_SYMBOL(phys_to_pfn_t);
158
150#ifdef CONFIG_ZONE_DEVICE 159#ifdef CONFIG_ZONE_DEVICE
160static DEFINE_MUTEX(pgmap_lock);
161static RADIX_TREE(pgmap_radix, GFP_KERNEL);
162#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
163#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
164
151struct page_map { 165struct page_map {
152 struct resource res; 166 struct resource res;
167 struct percpu_ref *ref;
168 struct dev_pagemap pgmap;
169 struct vmem_altmap altmap;
153}; 170};
154 171
155static void devm_memremap_pages_release(struct device *dev, void *res) 172void get_zone_device_page(struct page *page)
173{
174 percpu_ref_get(page->pgmap->ref);
175}
176EXPORT_SYMBOL(get_zone_device_page);
177
178void put_zone_device_page(struct page *page)
179{
180 put_dev_pagemap(page->pgmap);
181}
182EXPORT_SYMBOL(put_zone_device_page);
183
184static void pgmap_radix_release(struct resource *res)
185{
186 resource_size_t key;
187
188 mutex_lock(&pgmap_lock);
189 for (key = res->start; key <= res->end; key += SECTION_SIZE)
190 radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
191 mutex_unlock(&pgmap_lock);
192}
193
194static unsigned long pfn_first(struct page_map *page_map)
195{
196 struct dev_pagemap *pgmap = &page_map->pgmap;
197 const struct resource *res = &page_map->res;
198 struct vmem_altmap *altmap = pgmap->altmap;
199 unsigned long pfn;
200
201 pfn = res->start >> PAGE_SHIFT;
202 if (altmap)
203 pfn += vmem_altmap_offset(altmap);
204 return pfn;
205}
206
207static unsigned long pfn_end(struct page_map *page_map)
208{
209 const struct resource *res = &page_map->res;
210
211 return (res->start + resource_size(res)) >> PAGE_SHIFT;
212}
213
214#define for_each_device_pfn(pfn, map) \
215 for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++)
216
217static void devm_memremap_pages_release(struct device *dev, void *data)
156{ 218{
157 struct page_map *page_map = res; 219 struct page_map *page_map = data;
220 struct resource *res = &page_map->res;
221 resource_size_t align_start, align_size;
222 struct dev_pagemap *pgmap = &page_map->pgmap;
223
224 if (percpu_ref_tryget_live(pgmap->ref)) {
225 dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
226 percpu_ref_put(pgmap->ref);
227 }
228
229 pgmap_radix_release(res);
158 230
159 /* pages are dead and unused, undo the arch mapping */ 231 /* pages are dead and unused, undo the arch mapping */
160 arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); 232 align_start = res->start & ~(SECTION_SIZE - 1);
233 align_size = ALIGN(resource_size(res), SECTION_SIZE);
234 arch_remove_memory(align_start, align_size);
235 dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
236 "%s: failed to free all reserved pages\n", __func__);
237}
238
239/* assumes rcu_read_lock() held at entry */
240struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
241{
242 struct page_map *page_map;
243
244 WARN_ON_ONCE(!rcu_read_lock_held());
245
246 page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
247 return page_map ? &page_map->pgmap : NULL;
161} 248}
162 249
163void *devm_memremap_pages(struct device *dev, struct resource *res) 250/**
251 * devm_memremap_pages - remap and provide memmap backing for the given resource
252 * @dev: hosting device for @res
253 * @res: "host memory" address range
254 * @ref: a live per-cpu reference count
255 * @altmap: optional descriptor for allocating the memmap from @res
256 *
257 * Notes:
258 * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
259 * (or devm release event).
260 *
261 * 2/ @res is expected to be a host memory range that could feasibly be
262 * treated as a "System RAM" range, i.e. not a device mmio range, but
263 * this is not enforced.
264 */
265void *devm_memremap_pages(struct device *dev, struct resource *res,
266 struct percpu_ref *ref, struct vmem_altmap *altmap)
164{ 267{
165 int is_ram = region_intersects(res->start, resource_size(res), 268 int is_ram = region_intersects(res->start, resource_size(res),
166 "System RAM"); 269 "System RAM");
270 resource_size_t key, align_start, align_size;
271 struct dev_pagemap *pgmap;
167 struct page_map *page_map; 272 struct page_map *page_map;
273 unsigned long pfn;
168 int error, nid; 274 int error, nid;
169 275
170 if (is_ram == REGION_MIXED) { 276 if (is_ram == REGION_MIXED) {
@@ -176,25 +282,120 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
176 if (is_ram == REGION_INTERSECTS) 282 if (is_ram == REGION_INTERSECTS)
177 return __va(res->start); 283 return __va(res->start);
178 284
285 if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
286 dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
287 __func__);
288 return ERR_PTR(-ENXIO);
289 }
290
291 if (!ref)
292 return ERR_PTR(-EINVAL);
293
179 page_map = devres_alloc_node(devm_memremap_pages_release, 294 page_map = devres_alloc_node(devm_memremap_pages_release,
180 sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); 295 sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
181 if (!page_map) 296 if (!page_map)
182 return ERR_PTR(-ENOMEM); 297 return ERR_PTR(-ENOMEM);
298 pgmap = &page_map->pgmap;
183 299
184 memcpy(&page_map->res, res, sizeof(*res)); 300 memcpy(&page_map->res, res, sizeof(*res));
185 301
302 pgmap->dev = dev;
303 if (altmap) {
304 memcpy(&page_map->altmap, altmap, sizeof(*altmap));
305 pgmap->altmap = &page_map->altmap;
306 }
307 pgmap->ref = ref;
308 pgmap->res = &page_map->res;
309
310 mutex_lock(&pgmap_lock);
311 error = 0;
312 for (key = res->start; key <= res->end; key += SECTION_SIZE) {
313 struct dev_pagemap *dup;
314
315 rcu_read_lock();
316 dup = find_dev_pagemap(key);
317 rcu_read_unlock();
318 if (dup) {
319 dev_err(dev, "%s: %pr collides with mapping for %s\n",
320 __func__, res, dev_name(dup->dev));
321 error = -EBUSY;
322 break;
323 }
324 error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
325 page_map);
326 if (error) {
327 dev_err(dev, "%s: failed: %d\n", __func__, error);
328 break;
329 }
330 }
331 mutex_unlock(&pgmap_lock);
332 if (error)
333 goto err_radix;
334
186 nid = dev_to_node(dev); 335 nid = dev_to_node(dev);
187 if (nid < 0) 336 if (nid < 0)
188 nid = numa_mem_id(); 337 nid = numa_mem_id();
189 338
190 error = arch_add_memory(nid, res->start, resource_size(res), true); 339 align_start = res->start & ~(SECTION_SIZE - 1);
191 if (error) { 340 align_size = ALIGN(resource_size(res), SECTION_SIZE);
192 devres_free(page_map); 341 error = arch_add_memory(nid, align_start, align_size, true);
193 return ERR_PTR(error); 342 if (error)
194 } 343 goto err_add_memory;
195 344
345 for_each_device_pfn(pfn, page_map) {
346 struct page *page = pfn_to_page(pfn);
347
348 /* ZONE_DEVICE pages must never appear on a slab lru */
349 list_force_poison(&page->lru);
350 page->pgmap = pgmap;
351 }
196 devres_add(dev, page_map); 352 devres_add(dev, page_map);
197 return __va(res->start); 353 return __va(res->start);
354
355 err_add_memory:
356 err_radix:
357 pgmap_radix_release(res);
358 devres_free(page_map);
359 return ERR_PTR(error);
198} 360}
199EXPORT_SYMBOL(devm_memremap_pages); 361EXPORT_SYMBOL(devm_memremap_pages);
362
363unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
364{
365 /* number of pfns from base where pfn_to_page() is valid */
366 return altmap->reserve + altmap->free;
367}
368
369void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
370{
371 altmap->alloc -= nr_pfns;
372}
373
374#ifdef CONFIG_SPARSEMEM_VMEMMAP
375struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
376{
377 /*
378 * 'memmap_start' is the virtual address for the first "struct
379 * page" in this range of the vmemmap array. In the case of
 380 * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
381 * pointer arithmetic, so we can perform this to_vmem_altmap()
382 * conversion without concern for the initialization state of
383 * the struct page fields.
384 */
385 struct page *page = (struct page *) memmap_start;
386 struct dev_pagemap *pgmap;
387
388 /*
 389 * Unconditionally retrieve a dev_pagemap associated with the
390 * given physical address, this is only for use in the
391 * arch_{add|remove}_memory() for setting up and tearing down
392 * the memmap.
393 */
394 rcu_read_lock();
395 pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page)));
396 rcu_read_unlock();
397
398 return pgmap ? pgmap->altmap : NULL;
399}
400#endif /* CONFIG_SPARSEMEM_VMEMMAP */
200#endif /* CONFIG_ZONE_DEVICE */ 401#endif /* CONFIG_ZONE_DEVICE */
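A hedged driver-side sketch of the reworked devm_memremap_pages() signature documented above; the wrapper name and the way @res and @ref are obtained are assumptions, not part of this patch:

#include <linux/memremap.h>
#include <linux/err.h>

/* Sketch: remap a device range and get ZONE_DEVICE struct pages for it. */
static void *example_remap(struct device *dev, struct resource *res,
			   struct percpu_ref *ref)
{
	void *addr;

	/* NULL altmap: the memmap is allocated from regular memory */
	addr = devm_memremap_pages(dev, res, ref, NULL);
	if (IS_ERR(addr))
		return addr;
	/*
	 * Every pfn in @res now has a struct page with ->pgmap pointing
	 * at the registered dev_pagemap; @ref must stay live until the
	 * devres release runs, per note 1/ in the kernel-doc above.
	 */
	return addr;
}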
diff --git a/kernel/panic.c b/kernel/panic.c
index b333380c6bb2..d96469de72dc 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -180,8 +180,7 @@ void panic(const char *fmt, ...)
 180 * panic() is not being called from OOPS. 180 * panic() is not being called from OOPS.
181 */ 181 */
182 debug_locks_off(); 182 debug_locks_off();
183 console_trylock(); 183 console_flush_on_panic();
184 console_unlock();
185 184
186 if (!panic_blink) 185 if (!panic_blink)
187 panic_blink = no_blink; 186 panic_blink = no_blink;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 2ce8826f1053..e79439134978 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -48,6 +48,7 @@
48#include <linux/uio.h> 48#include <linux/uio.h>
49 49
50#include <asm/uaccess.h> 50#include <asm/uaccess.h>
51#include <asm-generic/sections.h>
51 52
52#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
53#include <trace/events/printk.h> 54#include <trace/events/printk.h>
@@ -1660,7 +1661,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1660 const char *dict, size_t dictlen, 1661 const char *dict, size_t dictlen,
1661 const char *fmt, va_list args) 1662 const char *fmt, va_list args)
1662{ 1663{
1663 static int recursion_bug; 1664 static bool recursion_bug;
1664 static char textbuf[LOG_LINE_MAX]; 1665 static char textbuf[LOG_LINE_MAX];
1665 char *text = textbuf; 1666 char *text = textbuf;
1666 size_t text_len = 0; 1667 size_t text_len = 0;
@@ -1696,7 +1697,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1696 * it can be printed at the next appropriate moment: 1697 * it can be printed at the next appropriate moment:
1697 */ 1698 */
1698 if (!oops_in_progress && !lockdep_recursing(current)) { 1699 if (!oops_in_progress && !lockdep_recursing(current)) {
1699 recursion_bug = 1; 1700 recursion_bug = true;
1700 local_irq_restore(flags); 1701 local_irq_restore(flags);
1701 return 0; 1702 return 0;
1702 } 1703 }
@@ -1711,7 +1712,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1711 static const char recursion_msg[] = 1712 static const char recursion_msg[] =
1712 "BUG: recent printk recursion!"; 1713 "BUG: recent printk recursion!";
1713 1714
1714 recursion_bug = 0; 1715 recursion_bug = false;
1715 /* emit KERN_CRIT message */ 1716 /* emit KERN_CRIT message */
1716 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, 1717 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1717 NULL, 0, recursion_msg, 1718 NULL, 0, recursion_msg,
@@ -2233,13 +2234,24 @@ void console_unlock(void)
2233 static u64 seen_seq; 2234 static u64 seen_seq;
2234 unsigned long flags; 2235 unsigned long flags;
2235 bool wake_klogd = false; 2236 bool wake_klogd = false;
2236 bool retry; 2237 bool do_cond_resched, retry;
2237 2238
2238 if (console_suspended) { 2239 if (console_suspended) {
2239 up_console_sem(); 2240 up_console_sem();
2240 return; 2241 return;
2241 } 2242 }
2242 2243
2244 /*
2245 * Console drivers are called under logbuf_lock, so
2246 * @console_may_schedule should be cleared before; however, we may
2247 * end up dumping a lot of lines, for example, if called from
2248 * console registration path, and should invoke cond_resched()
2249 * between lines if allowable. Not doing so can cause a very long
2250 * scheduling stall on a slow console leading to RCU stall and
 2251 * softlockup warnings, which exacerbate the issue with more
 2252 * messages, practically incapacitating the system.
2253 */
2254 do_cond_resched = console_may_schedule;
2243 console_may_schedule = 0; 2255 console_may_schedule = 0;
2244 2256
2245 /* flush buffered message fragment immediately to console */ 2257 /* flush buffered message fragment immediately to console */
@@ -2311,6 +2323,9 @@ skip:
2311 call_console_drivers(level, ext_text, ext_len, text, len); 2323 call_console_drivers(level, ext_text, ext_len, text, len);
2312 start_critical_timings(); 2324 start_critical_timings();
2313 local_irq_restore(flags); 2325 local_irq_restore(flags);
2326
2327 if (do_cond_resched)
2328 cond_resched();
2314 } 2329 }
2315 console_locked = 0; 2330 console_locked = 0;
2316 2331
@@ -2378,6 +2393,25 @@ void console_unblank(void)
2378 console_unlock(); 2393 console_unlock();
2379} 2394}
2380 2395
2396/**
2397 * console_flush_on_panic - flush console content on panic
2398 *
2399 * Immediately output all pending messages no matter what.
2400 */
2401void console_flush_on_panic(void)
2402{
2403 /*
2404 * If someone else is holding the console lock, trylock will fail
2405 * and may_schedule may be set. Ignore and proceed to unlock so
2406 * that messages are flushed out. As this can be called from any
2407 * context and we don't want to get preempted while flushing,
2408 * ensure may_schedule is cleared.
2409 */
2410 console_trylock();
2411 console_may_schedule = 0;
2412 console_unlock();
2413}
2414
2381/* 2415/*
2382 * Return the console tty driver structure and its associated index 2416 * Return the console tty driver structure and its associated index
2383 */ 2417 */
@@ -2658,13 +2692,36 @@ int unregister_console(struct console *console)
2658} 2692}
2659EXPORT_SYMBOL(unregister_console); 2693EXPORT_SYMBOL(unregister_console);
2660 2694
2695/*
2696 * Some boot consoles access data that is in the init section and which will
2697 * be discarded after the initcalls have been run. To make sure that no code
2698 * will access this data, unregister the boot consoles in a late initcall.
2699 *
2700 * If for some reason, such as deferred probe or the driver being a loadable
2701 * module, the real console hasn't registered yet at this point, there will
2702 * be a brief interval in which no messages are logged to the console, which
2703 * makes it difficult to diagnose problems that occur during this time.
2704 *
2705 * To mitigate this problem somewhat, only unregister consoles whose memory
2706 * intersects with the init section. Note that code exists elsewhere to get
2707 * rid of the boot console as soon as the proper console shows up, so there
2708 * won't be side-effects from postponing the removal.
2709 */
2661static int __init printk_late_init(void) 2710static int __init printk_late_init(void)
2662{ 2711{
2663 struct console *con; 2712 struct console *con;
2664 2713
2665 for_each_console(con) { 2714 for_each_console(con) {
2666 if (!keep_bootcon && con->flags & CON_BOOT) { 2715 if (!keep_bootcon && con->flags & CON_BOOT) {
2667 unregister_console(con); 2716 /*
2717 * Make sure to unregister boot consoles whose data
2718 * resides in the init section before the init section
2719 * is discarded. Boot consoles whose data will stick
2720 * around will automatically be unregistered when the
2721 * proper console replaces them.
2722 */
2723 if (init_section_intersects(con, sizeof(*con)))
2724 unregister_console(con);
2668 } 2725 }
2669 } 2726 }
2670 hotcpu_notifier(console_cpu_notify, 0); 2727 hotcpu_notifier(console_cpu_notify, 0);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index edb6de4f5908..a467e6c28a3b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -529,8 +529,6 @@ static int __init cpu_stop_init(void)
529} 529}
530early_initcall(cpu_stop_init); 530early_initcall(cpu_stop_init);
531 531
532#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU)
533
534static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) 532static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
535{ 533{
536 struct multi_stop_data msdata = { 534 struct multi_stop_data msdata = {
@@ -628,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
628 mutex_unlock(&stop_cpus_mutex); 626 mutex_unlock(&stop_cpus_mutex);
629 return ret ?: done.ret; 627 return ret ?: done.ret;
630} 628}
631
632#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ee1ac1cc082c..f75a33f29f6e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -580,6 +580,14 @@ config DEBUG_VM_RB
580 580
581 If unsure, say N. 581 If unsure, say N.
582 582
583config DEBUG_VM_PGFLAGS
584 bool "Debug page-flags operations"
585 depends on DEBUG_VM
586 help
587 Enables extra validation on page flags operations.
588
589 If unsure, say N.
590
583config DEBUG_VIRTUAL 591config DEBUG_VIRTUAL
584 bool "Debug VM translations" 592 bool "Debug VM translations"
585 depends on DEBUG_KERNEL && X86 593 depends on DEBUG_KERNEL && X86
@@ -1589,7 +1597,6 @@ config FAULT_INJECTION_STACKTRACE_FILTER
1589 1597
1590config LATENCYTOP 1598config LATENCYTOP
1591 bool "Latency measuring infrastructure" 1599 bool "Latency measuring infrastructure"
1592 depends on HAVE_LATENCYTOP_SUPPORT
1593 depends on DEBUG_KERNEL 1600 depends on DEBUG_KERNEL
1594 depends on STACKTRACE_SUPPORT 1601 depends on STACKTRACE_SUPPORT
1595 depends on PROC_FS 1602 depends on PROC_FS
diff --git a/lib/kasprintf.c b/lib/kasprintf.c
index f194e6e593e1..7f6c506a4942 100644
--- a/lib/kasprintf.c
+++ b/lib/kasprintf.c
@@ -13,19 +13,21 @@
13/* Simplified asprintf. */ 13/* Simplified asprintf. */
14char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap) 14char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
15{ 15{
16 unsigned int len; 16 unsigned int first, second;
17 char *p; 17 char *p;
18 va_list aq; 18 va_list aq;
19 19
20 va_copy(aq, ap); 20 va_copy(aq, ap);
21 len = vsnprintf(NULL, 0, fmt, aq); 21 first = vsnprintf(NULL, 0, fmt, aq);
22 va_end(aq); 22 va_end(aq);
23 23
24 p = kmalloc_track_caller(len+1, gfp); 24 p = kmalloc_track_caller(first+1, gfp);
25 if (!p) 25 if (!p)
26 return NULL; 26 return NULL;
27 27
28 vsnprintf(p, len+1, fmt, ap); 28 second = vsnprintf(p, first+1, fmt, ap);
29 WARN(first != second, "different return values (%u and %u) from vsnprintf(\"%s\", ...)",
30 first, second, fmt);
29 31
30 return p; 32 return p;
31} 33}
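The new WARN guards the classic measure/allocate/format pattern against the two vsnprintf() passes disagreeing (for instance when an argument string changes underneath the caller). A user-space illustration of the same two-pass idea, with a hypothetical xasprintf():

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static char *xasprintf(const char *fmt, ...)
{
	va_list ap;
	int first, second;
	char *p;

	va_start(ap, fmt);
	first = vsnprintf(NULL, 0, fmt, ap);		/* measuring pass */
	va_end(ap);

	p = malloc(first + 1);
	if (!p)
		return NULL;

	va_start(ap, fmt);
	second = vsnprintf(p, first + 1, fmt, ap);	/* formatting pass */
	va_end(ap);

	if (first != second)	/* something changed between the passes */
		fprintf(stderr, "length mismatch: %d vs %d\n", first, second);
	return p;
}

int main(void)
{
	char *s = xasprintf("pid=%d", 1234);

	puts(s ? s : "(alloc failed)");
	free(s);
	return 0;
}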
diff --git a/lib/list_debug.c b/lib/list_debug.c
index 3859bf63561c..3345a089ef7b 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -12,6 +12,13 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/rculist.h> 13#include <linux/rculist.h>
14 14
15static struct list_head force_poison;
16void list_force_poison(struct list_head *entry)
17{
18 entry->next = &force_poison;
19 entry->prev = &force_poison;
20}
21
15/* 22/*
16 * Insert a new entry between two known consecutive entries. 23 * Insert a new entry between two known consecutive entries.
17 * 24 *
@@ -23,6 +30,8 @@ void __list_add(struct list_head *new,
23 struct list_head *prev, 30 struct list_head *prev,
24 struct list_head *next) 31 struct list_head *next)
25{ 32{
33 WARN(new->next == &force_poison || new->prev == &force_poison,
34 "list_add attempted on force-poisoned entry\n");
26 WARN(next->prev != prev, 35 WARN(next->prev != prev,
27 "list_add corruption. next->prev should be " 36 "list_add corruption. next->prev should be "
28 "prev (%p), but was %p. (next=%p).\n", 37 "prev (%p), but was %p. (next=%p).\n",
diff --git a/lib/test_printf.c b/lib/test_printf.c
index c5a666af9ba5..4f6ae60433bc 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -12,10 +12,13 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/string.h> 13#include <linux/string.h>
14 14
15#include <linux/bitmap.h>
16#include <linux/dcache.h>
15#include <linux/socket.h> 17#include <linux/socket.h>
16#include <linux/in.h> 18#include <linux/in.h>
17 19
18#define BUF_SIZE 256 20#define BUF_SIZE 256
21#define PAD_SIZE 16
19#define FILL_CHAR '$' 22#define FILL_CHAR '$'
20 23
21#define PTR1 ((void*)0x01234567) 24#define PTR1 ((void*)0x01234567)
@@ -39,6 +42,7 @@
39static unsigned total_tests __initdata; 42static unsigned total_tests __initdata;
40static unsigned failed_tests __initdata; 43static unsigned failed_tests __initdata;
41static char *test_buffer __initdata; 44static char *test_buffer __initdata;
45static char *alloced_buffer __initdata;
42 46
43static int __printf(4, 0) __init 47static int __printf(4, 0) __init
44do_test(int bufsize, const char *expect, int elen, 48do_test(int bufsize, const char *expect, int elen,
@@ -49,7 +53,7 @@ do_test(int bufsize, const char *expect, int elen,
49 53
50 total_tests++; 54 total_tests++;
51 55
52 memset(test_buffer, FILL_CHAR, BUF_SIZE); 56 memset(alloced_buffer, FILL_CHAR, BUF_SIZE + 2*PAD_SIZE);
53 va_copy(aq, ap); 57 va_copy(aq, ap);
54 ret = vsnprintf(test_buffer, bufsize, fmt, aq); 58 ret = vsnprintf(test_buffer, bufsize, fmt, aq);
55 va_end(aq); 59 va_end(aq);
@@ -60,8 +64,13 @@ do_test(int bufsize, const char *expect, int elen,
60 return 1; 64 return 1;
61 } 65 }
62 66
67 if (memchr_inv(alloced_buffer, FILL_CHAR, PAD_SIZE)) {
68 pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote before buffer\n", bufsize, fmt);
69 return 1;
70 }
71
63 if (!bufsize) { 72 if (!bufsize) {
64 if (memchr_inv(test_buffer, FILL_CHAR, BUF_SIZE)) { 73 if (memchr_inv(test_buffer, FILL_CHAR, BUF_SIZE + PAD_SIZE)) {
65 pr_warn("vsnprintf(buf, 0, \"%s\", ...) wrote to buffer\n", 74 pr_warn("vsnprintf(buf, 0, \"%s\", ...) wrote to buffer\n",
66 fmt); 75 fmt);
67 return 1; 76 return 1;
@@ -76,6 +85,12 @@ do_test(int bufsize, const char *expect, int elen,
76 return 1; 85 return 1;
77 } 86 }
78 87
88 if (memchr_inv(test_buffer + written + 1, FILL_CHAR, BUF_SIZE + PAD_SIZE - (written + 1))) {
89 pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote beyond the nul-terminator\n",
90 bufsize, fmt);
91 return 1;
92 }
93
79 if (memcmp(test_buffer, expect, written)) { 94 if (memcmp(test_buffer, expect, written)) {
80 pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n", 95 pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n",
81 bufsize, fmt, test_buffer, written, expect); 96 bufsize, fmt, test_buffer, written, expect);
@@ -91,7 +106,12 @@ __test(const char *expect, int elen, const char *fmt, ...)
91 int rand; 106 int rand;
92 char *p; 107 char *p;
93 108
94 BUG_ON(elen >= BUF_SIZE); 109 if (elen >= BUF_SIZE) {
110 pr_err("error in test suite: expected output length %d too long. Format was '%s'.\n",
111 elen, fmt);
112 failed_tests++;
113 return;
114 }
95 115
96 va_start(ap, fmt); 116 va_start(ap, fmt);
97 117
@@ -109,6 +129,7 @@ __test(const char *expect, int elen, const char *fmt, ...)
109 129
110 p = kvasprintf(GFP_KERNEL, fmt, ap); 130 p = kvasprintf(GFP_KERNEL, fmt, ap);
111 if (p) { 131 if (p) {
132 total_tests++;
112 if (memcmp(p, expect, elen+1)) { 133 if (memcmp(p, expect, elen+1)) {
113 pr_warn("kvasprintf(..., \"%s\", ...) returned '%s', expected '%s'\n", 134 pr_warn("kvasprintf(..., \"%s\", ...) returned '%s', expected '%s'\n",
114 fmt, p, expect); 135 fmt, p, expect);
@@ -140,6 +161,30 @@ test_number(void)
140 test("0x1234abcd ", "%#-12x", 0x1234abcd); 161 test("0x1234abcd ", "%#-12x", 0x1234abcd);
141 test(" 0x1234abcd", "%#12x", 0x1234abcd); 162 test(" 0x1234abcd", "%#12x", 0x1234abcd);
142 test("0|001| 12|+123| 1234|-123|-1234", "%d|%03d|%3d|%+d|% d|%+d|% d", 0, 1, 12, 123, 1234, -123, -1234); 163 test("0|001| 12|+123| 1234|-123|-1234", "%d|%03d|%3d|%+d|% d|%+d|% d", 0, 1, 12, 123, 1234, -123, -1234);
164 test("0|1|1|128|255", "%hhu|%hhu|%hhu|%hhu|%hhu", 0, 1, 257, 128, -1);
165 test("0|1|1|-128|-1", "%hhd|%hhd|%hhd|%hhd|%hhd", 0, 1, 257, 128, -1);
166 test("2015122420151225", "%ho%ho%#ho", 1037, 5282, -11627);
167 /*
168 * POSIX/C99: »The result of converting zero with an explicit
169 * precision of zero shall be no characters.« Hence the output
170 * from the below test should really be "00|0||| ". However,
171 * the kernel's printf also produces a single 0 in that
172 * case. This test case simply documents the current
173 * behaviour.
174 */
175 test("00|0|0|0|0", "%.2d|%.1d|%.0d|%.*d|%1.0d", 0, 0, 0, 0, 0, 0);
176#ifndef __CHAR_UNSIGNED__
177 {
178 /*
179 * Passing a 'char' to a %02x specifier doesn't do
180 * what was presumably the intention when char is
181 * signed and the value is negative. One must either &
182 * with 0xff or cast to u8.
183 */
184 char val = -16;
185 test("0xfffffff0|0xf0|0xf0", "%#02x|%#02x|%#02x", val, val & 0xff, (u8)val);
186 }
187#endif
143} 188}
144 189
145static void __init 190static void __init
@@ -148,14 +193,23 @@ test_string(void)
148 test("", "%s%.0s", "", "123"); 193 test("", "%s%.0s", "", "123");
149 test("ABCD|abc|123", "%s|%.3s|%.*s", "ABCD", "abcdef", 3, "123456"); 194 test("ABCD|abc|123", "%s|%.3s|%.*s", "ABCD", "abcdef", 3, "123456");
150 test("1 | 2|3 | 4|5 ", "%-3s|%3s|%-*s|%*s|%*s", "1", "2", 3, "3", 3, "4", -3, "5"); 195 test("1 | 2|3 | 4|5 ", "%-3s|%3s|%-*s|%*s|%*s", "1", "2", 3, "3", 3, "4", -3, "5");
196 test("1234 ", "%-10.4s", "123456");
197 test(" 1234", "%10.4s", "123456");
151 /* 198 /*
152 * POSIX and C99 say that a missing precision should be 199 * POSIX and C99 say that a negative precision (which is only
153 * treated as a precision of 0. However, the kernel's printf 200 * possible to pass via a * argument) should be treated as if
154 * implementation treats this case as if the . wasn't 201 * the precision wasn't present, and that if the precision is
155 * present. Let's add a test case documenting the current 202 * omitted (as in %.s), the precision should be taken to be
156 * behaviour; should anyone ever feel the need to follow the 203 * 0. However, the kernel's printf behave exactly opposite,
157 * standards more closely, this can be revisited. 204 * treating a negative precision as 0 and treating an omitted
205 * precision specifier as if no precision was given.
206 *
207 * These test cases document the current behaviour; should
208 * anyone ever feel the need to follow the standards more
209 * closely, this can be revisited.
158 */ 210 */
211 test(" ", "%4.*s", -5, "123456");
212 test("123456", "%.s", "123456");
159 test("a||", "%.s|%.0s|%.*s", "a", "b", 0, "c"); 213 test("a||", "%.s|%.0s|%.*s", "a", "b", 0, "c");
160 test("a | | ", "%-3.s|%-3.0s|%-3.*s", "a", "b", 0, "c"); 214 test("a | | ", "%-3.s|%-3.0s|%-3.*s", "a", "b", 0, "c");
161} 215}
@@ -273,9 +327,35 @@ uuid(void)
273 test("03020100-0504-0706-0809-0A0B0C0D0E0F", "%pUL", uuid); 327 test("03020100-0504-0706-0809-0A0B0C0D0E0F", "%pUL", uuid);
274} 328}
275 329
330static struct dentry test_dentry[4] __initdata = {
331 { .d_parent = &test_dentry[0],
332 .d_name = QSTR_INIT(test_dentry[0].d_iname, 3),
333 .d_iname = "foo" },
334 { .d_parent = &test_dentry[0],
335 .d_name = QSTR_INIT(test_dentry[1].d_iname, 5),
336 .d_iname = "bravo" },
337 { .d_parent = &test_dentry[1],
338 .d_name = QSTR_INIT(test_dentry[2].d_iname, 4),
339 .d_iname = "alfa" },
340 { .d_parent = &test_dentry[2],
341 .d_name = QSTR_INIT(test_dentry[3].d_iname, 5),
342 .d_iname = "romeo" },
343};
344
276static void __init 345static void __init
277dentry(void) 346dentry(void)
278{ 347{
348 test("foo", "%pd", &test_dentry[0]);
349 test("foo", "%pd2", &test_dentry[0]);
350
351 test("romeo", "%pd", &test_dentry[3]);
352 test("alfa/romeo", "%pd2", &test_dentry[3]);
353 test("bravo/alfa/romeo", "%pd3", &test_dentry[3]);
354 test("/bravo/alfa/romeo", "%pd4", &test_dentry[3]);
355 test("/bravo/alfa", "%pd4", &test_dentry[2]);
356
357 test("bravo/alfa |bravo/alfa ", "%-12pd2|%*pd2", &test_dentry[2], -12, &test_dentry[2]);
358 test(" bravo/alfa| bravo/alfa", "%12pd2|%*pd2", &test_dentry[2], 12, &test_dentry[2]);
279} 359}
280 360
281static void __init 361static void __init
@@ -289,6 +369,20 @@ struct_clk(void)
289} 369}
290 370
291static void __init 371static void __init
372large_bitmap(void)
373{
374 const int nbits = 1 << 16;
375 unsigned long *bits = kcalloc(BITS_TO_LONGS(nbits), sizeof(long), GFP_KERNEL);
376 if (!bits)
377 return;
378
379 bitmap_set(bits, 1, 20);
380 bitmap_set(bits, 60000, 15);
381 test("1-20,60000-60014", "%*pbl", nbits, bits);
382 kfree(bits);
383}
384
385static void __init
292bitmap(void) 386bitmap(void)
293{ 387{
294 DECLARE_BITMAP(bits, 20); 388 DECLARE_BITMAP(bits, 20);
@@ -307,6 +401,8 @@ bitmap(void)
307 bitmap_fill(bits, 20); 401 bitmap_fill(bits, 20);
308 test("fffff|fffff", "%20pb|%*pb", bits, 20, bits); 402 test("fffff|fffff", "%20pb|%*pb", bits, 20, bits);
309 test("0-19|0-19", "%20pbl|%*pbl", bits, 20, bits); 403 test("0-19|0-19", "%20pbl|%*pbl", bits, 20, bits);
404
405 large_bitmap();
310} 406}
311 407
312static void __init 408static void __init
@@ -337,16 +433,17 @@ test_pointer(void)
337static int __init 433static int __init
338test_printf_init(void) 434test_printf_init(void)
339{ 435{
340 test_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); 436 alloced_buffer = kmalloc(BUF_SIZE + 2*PAD_SIZE, GFP_KERNEL);
341 if (!test_buffer) 437 if (!alloced_buffer)
342 return -ENOMEM; 438 return -ENOMEM;
439 test_buffer = alloced_buffer + PAD_SIZE;
343 440
344 test_basic(); 441 test_basic();
345 test_number(); 442 test_number();
346 test_string(); 443 test_string();
347 test_pointer(); 444 test_pointer();
348 445
349 kfree(test_buffer); 446 kfree(alloced_buffer);
350 447
351 if (failed_tests == 0) 448 if (failed_tests == 0)
352 pr_info("all %u tests passed\n", total_tests); 449 pr_info("all %u tests passed\n", total_tests);
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index ac3f9476b776..48ff9c36644d 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -383,13 +383,14 @@ enum format_type {
383}; 383};
384 384
385struct printf_spec { 385struct printf_spec {
386 u8 type; /* format_type enum */ 386 unsigned int type:8; /* format_type enum */
387 u8 flags; /* flags to number() */ 387 signed int field_width:24; /* width of output field */
388 u8 base; /* number base, 8, 10 or 16 only */ 388 unsigned int flags:8; /* flags to number() */
389 u8 qualifier; /* number qualifier, one of 'hHlLtzZ' */ 389 unsigned int base:8; /* number base, 8, 10 or 16 only */
390 s16 field_width; /* width of output field */ 390 signed int precision:16; /* # of digits/chars */
391 s16 precision; /* # of digits/chars */ 391} __packed;
392}; 392#define FIELD_WIDTH_MAX ((1 << 23) - 1)
393#define PRECISION_MAX ((1 << 15) - 1)
393 394
394static noinline_for_stack 395static noinline_for_stack
395char *number(char *buf, char *end, unsigned long long num, 396char *number(char *buf, char *end, unsigned long long num,
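A user-space check of the 8-byte packing claim behind FIELD_WIDTH_MAX and PRECISION_MAX, mirroring the BUILD_BUG_ON() added in the next hunk; the struct is copied here purely for illustration, with the kernel-only __packed spelled out:

#include <stdio.h>

struct printf_spec {
	unsigned int type:8;		/* format_type enum */
	signed int field_width:24;	/* width of output field */
	unsigned int flags:8;		/* flags to number() */
	unsigned int base:8;		/* number base, 8, 10 or 16 only */
	signed int precision:16;	/* # of digits/chars */
} __attribute__((packed));

_Static_assert(sizeof(struct printf_spec) == 8, "printf_spec no longer fits in 8 bytes");

int main(void)
{
	printf("sizeof(struct printf_spec) = %zu\n", sizeof(struct printf_spec));
	return 0;
}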
@@ -402,6 +403,10 @@ char *number(char *buf, char *end, unsigned long long num,
402 int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); 403 int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
403 int i; 404 int i;
404 bool is_zero = num == 0LL; 405 bool is_zero = num == 0LL;
406 int field_width = spec.field_width;
407 int precision = spec.precision;
408
409 BUILD_BUG_ON(sizeof(struct printf_spec) != 8);
405 410
406 /* locase = 0 or 0x20. ORing digits or letters with 'locase' 411 /* locase = 0 or 0x20. ORing digits or letters with 'locase'
407 * produces same digits or (maybe lowercased) letters */ 412 * produces same digits or (maybe lowercased) letters */
@@ -413,20 +418,20 @@ char *number(char *buf, char *end, unsigned long long num,
413 if ((signed long long)num < 0) { 418 if ((signed long long)num < 0) {
414 sign = '-'; 419 sign = '-';
415 num = -(signed long long)num; 420 num = -(signed long long)num;
416 spec.field_width--; 421 field_width--;
417 } else if (spec.flags & PLUS) { 422 } else if (spec.flags & PLUS) {
418 sign = '+'; 423 sign = '+';
419 spec.field_width--; 424 field_width--;
420 } else if (spec.flags & SPACE) { 425 } else if (spec.flags & SPACE) {
421 sign = ' '; 426 sign = ' ';
422 spec.field_width--; 427 field_width--;
423 } 428 }
424 } 429 }
425 if (need_pfx) { 430 if (need_pfx) {
426 if (spec.base == 16) 431 if (spec.base == 16)
427 spec.field_width -= 2; 432 field_width -= 2;
428 else if (!is_zero) 433 else if (!is_zero)
429 spec.field_width--; 434 field_width--;
430 } 435 }
431 436
432 /* generate full string in tmp[], in reverse order */ 437 /* generate full string in tmp[], in reverse order */
@@ -448,12 +453,12 @@ char *number(char *buf, char *end, unsigned long long num,
448 } 453 }
449 454
450 /* printing 100 using %2d gives "100", not "00" */ 455 /* printing 100 using %2d gives "100", not "00" */
451 if (i > spec.precision) 456 if (i > precision)
452 spec.precision = i; 457 precision = i;
453 /* leading space padding */ 458 /* leading space padding */
454 spec.field_width -= spec.precision; 459 field_width -= precision;
455 if (!(spec.flags & (ZEROPAD | LEFT))) { 460 if (!(spec.flags & (ZEROPAD | LEFT))) {
456 while (--spec.field_width >= 0) { 461 while (--field_width >= 0) {
457 if (buf < end) 462 if (buf < end)
458 *buf = ' '; 463 *buf = ' ';
459 ++buf; 464 ++buf;
@@ -482,14 +487,14 @@ char *number(char *buf, char *end, unsigned long long num,
482 if (!(spec.flags & LEFT)) { 487 if (!(spec.flags & LEFT)) {
483 char c = ' ' + (spec.flags & ZEROPAD); 488 char c = ' ' + (spec.flags & ZEROPAD);
484 BUILD_BUG_ON(' ' + ZEROPAD != '0'); 489 BUILD_BUG_ON(' ' + ZEROPAD != '0');
485 while (--spec.field_width >= 0) { 490 while (--field_width >= 0) {
486 if (buf < end) 491 if (buf < end)
487 *buf = c; 492 *buf = c;
488 ++buf; 493 ++buf;
489 } 494 }
490 } 495 }
491 /* hmm even more zero padding? */ 496 /* hmm even more zero padding? */
492 while (i <= --spec.precision) { 497 while (i <= --precision) {
493 if (buf < end) 498 if (buf < end)
494 *buf = '0'; 499 *buf = '0';
495 ++buf; 500 ++buf;
@@ -501,7 +506,7 @@ char *number(char *buf, char *end, unsigned long long num,
501 ++buf; 506 ++buf;
502 } 507 }
503 /* trailing space padding */ 508 /* trailing space padding */
504 while (--spec.field_width >= 0) { 509 while (--field_width >= 0) {
505 if (buf < end) 510 if (buf < end)
506 *buf = ' '; 511 *buf = ' ';
507 ++buf; 512 ++buf;
@@ -511,37 +516,20 @@ char *number(char *buf, char *end, unsigned long long num,
511} 516}
512 517
513static noinline_for_stack 518static noinline_for_stack
514char *string(char *buf, char *end, const char *s, struct printf_spec spec) 519char *special_hex_number(char *buf, char *end, unsigned long long num, int size)
515{ 520{
516 int len, i; 521 struct printf_spec spec;
517
518 if ((unsigned long)s < PAGE_SIZE)
519 s = "(null)";
520 522
521 len = strnlen(s, spec.precision); 523 spec.type = FORMAT_TYPE_PTR;
522 524 spec.field_width = 2 + 2 * size; /* 0x + hex */
523 if (!(spec.flags & LEFT)) { 525 spec.flags = SPECIAL | SMALL | ZEROPAD;
524 while (len < spec.field_width--) { 526 spec.base = 16;
525 if (buf < end) 527 spec.precision = -1;
526 *buf = ' ';
527 ++buf;
528 }
529 }
530 for (i = 0; i < len; ++i) {
531 if (buf < end)
532 *buf = *s;
533 ++buf; ++s;
534 }
535 while (len < spec.field_width--) {
536 if (buf < end)
537 *buf = ' ';
538 ++buf;
539 }
540 528
541 return buf; 529 return number(buf, end, num, spec);
542} 530}
543 531
544static void widen(char *buf, char *end, unsigned len, unsigned spaces) 532static void move_right(char *buf, char *end, unsigned len, unsigned spaces)
545{ 533{
546 size_t size; 534 size_t size;
547 if (buf >= end) /* nowhere to put anything */ 535 if (buf >= end) /* nowhere to put anything */
@@ -559,6 +547,56 @@ static void widen(char *buf, char *end, unsigned len, unsigned spaces)
559 memset(buf, ' ', spaces); 547 memset(buf, ' ', spaces);
560} 548}
561 549
550/*
551 * Handle field width padding for a string.
552 * @buf: current buffer position
553 * @n: length of string
554 * @end: end of output buffer
555 * @spec: for field width and flags
556 * Returns: new buffer position after padding.
557 */
558static noinline_for_stack
559char *widen_string(char *buf, int n, char *end, struct printf_spec spec)
560{
561 unsigned spaces;
562
563 if (likely(n >= spec.field_width))
564 return buf;
565 /* we want to pad the sucker */
566 spaces = spec.field_width - n;
567 if (!(spec.flags & LEFT)) {
568 move_right(buf - n, end, n, spaces);
569 return buf + spaces;
570 }
571 while (spaces--) {
572 if (buf < end)
573 *buf = ' ';
574 ++buf;
575 }
576 return buf;
577}
578
579static noinline_for_stack
580char *string(char *buf, char *end, const char *s, struct printf_spec spec)
581{
582 int len = 0;
583 size_t lim = spec.precision;
584
585 if ((unsigned long)s < PAGE_SIZE)
586 s = "(null)";
587
588 while (lim--) {
589 char c = *s++;
590 if (!c)
591 break;
592 if (buf < end)
593 *buf = c;
594 ++buf;
595 ++len;
596 }
597 return widen_string(buf, len, end, spec);
598}
599
562static noinline_for_stack 600static noinline_for_stack
563char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec, 601char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec,
564 const char *fmt) 602 const char *fmt)
@@ -600,20 +638,7 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp
600 *buf = c; 638 *buf = c;
601 } 639 }
602 rcu_read_unlock(); 640 rcu_read_unlock();
603 if (n < spec.field_width) { 641 return widen_string(buf, n, end, spec);
604 /* we want to pad the sucker */
605 unsigned spaces = spec.field_width - n;
606 if (!(spec.flags & LEFT)) {
607 widen(buf - n, end, n, spaces);
608 return buf + spaces;
609 }
610 while (spaces--) {
611 if (buf < end)
612 *buf = ' ';
613 ++buf;
614 }
615 }
616 return buf;
617} 642}
618 643
619#ifdef CONFIG_BLOCK 644#ifdef CONFIG_BLOCK
@@ -659,11 +684,7 @@ char *symbol_string(char *buf, char *end, void *ptr,
659 684
660 return string(buf, end, sym, spec); 685 return string(buf, end, sym, spec);
661#else 686#else
662 spec.field_width = 2 * sizeof(void *); 687 return special_hex_number(buf, end, value, sizeof(void *));
663 spec.flags |= SPECIAL | SMALL | ZEROPAD;
664 spec.base = 16;
665
666 return number(buf, end, value, spec);
667#endif 688#endif
668} 689}
669 690
@@ -1324,40 +1345,45 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
1324 return string(buf, end, uuid, spec); 1345 return string(buf, end, uuid, spec);
1325} 1346}
1326 1347
1327static 1348static noinline_for_stack
1328char *netdev_feature_string(char *buf, char *end, const u8 *addr, 1349char *netdev_bits(char *buf, char *end, const void *addr, const char *fmt)
1329 struct printf_spec spec)
1330{ 1350{
1331 spec.flags |= SPECIAL | SMALL | ZEROPAD; 1351 unsigned long long num;
1332 if (spec.field_width == -1) 1352 int size;
1333 spec.field_width = 2 + 2 * sizeof(netdev_features_t);
1334 spec.base = 16;
1335 1353
1336 return number(buf, end, *(const netdev_features_t *)addr, spec); 1354 switch (fmt[1]) {
1355 case 'F':
1356 num = *(const netdev_features_t *)addr;
1357 size = sizeof(netdev_features_t);
1358 break;
1359 default:
1360 num = (unsigned long)addr;
1361 size = sizeof(unsigned long);
1362 break;
1363 }
1364
1365 return special_hex_number(buf, end, num, size);
1337} 1366}
1338 1367
1339static noinline_for_stack 1368static noinline_for_stack
1340char *address_val(char *buf, char *end, const void *addr, 1369char *address_val(char *buf, char *end, const void *addr, const char *fmt)
1341 struct printf_spec spec, const char *fmt)
1342{ 1370{
1343 unsigned long long num; 1371 unsigned long long num;
1344 1372 int size;
1345 spec.flags |= SPECIAL | SMALL | ZEROPAD;
1346 spec.base = 16;
1347 1373
1348 switch (fmt[1]) { 1374 switch (fmt[1]) {
1349 case 'd': 1375 case 'd':
1350 num = *(const dma_addr_t *)addr; 1376 num = *(const dma_addr_t *)addr;
1351 spec.field_width = sizeof(dma_addr_t) * 2 + 2; 1377 size = sizeof(dma_addr_t);
1352 break; 1378 break;
1353 case 'p': 1379 case 'p':
1354 default: 1380 default:
1355 num = *(const phys_addr_t *)addr; 1381 num = *(const phys_addr_t *)addr;
1356 spec.field_width = sizeof(phys_addr_t) * 2 + 2; 1382 size = sizeof(phys_addr_t);
1357 break; 1383 break;
1358 } 1384 }
1359 1385
1360 return number(buf, end, num, spec); 1386 return special_hex_number(buf, end, num, size);
1361} 1387}
1362 1388
1363static noinline_for_stack 1389static noinline_for_stack
@@ -1376,10 +1402,7 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
1376#ifdef CONFIG_COMMON_CLK 1402#ifdef CONFIG_COMMON_CLK
1377 return string(buf, end, __clk_get_name(clk), spec); 1403 return string(buf, end, __clk_get_name(clk), spec);
1378#else 1404#else
1379 spec.base = 16; 1405 return special_hex_number(buf, end, (unsigned long)clk, sizeof(unsigned long));
1380 spec.field_width = sizeof(unsigned long) * 2 + 2;
1381 spec.flags |= SPECIAL | SMALL | ZEROPAD;
1382 return number(buf, end, (unsigned long)clk, spec);
1383#endif 1406#endif
1384 } 1407 }
1385} 1408}
@@ -1609,13 +1632,9 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
1609 break; 1632 break;
1610 1633
1611 case 'N': 1634 case 'N':
1612 switch (fmt[1]) { 1635 return netdev_bits(buf, end, ptr, fmt);
1613 case 'F':
1614 return netdev_feature_string(buf, end, ptr, spec);
1615 }
1616 break;
1617 case 'a': 1636 case 'a':
1618 return address_val(buf, end, ptr, spec, fmt); 1637 return address_val(buf, end, ptr, fmt);
1619 case 'd': 1638 case 'd':
1620 return dentry_name(buf, end, ptr, spec, fmt); 1639 return dentry_name(buf, end, ptr, spec, fmt);
1621 case 'C': 1640 case 'C':
@@ -1664,6 +1683,7 @@ static noinline_for_stack
1664int format_decode(const char *fmt, struct printf_spec *spec) 1683int format_decode(const char *fmt, struct printf_spec *spec)
1665{ 1684{
1666 const char *start = fmt; 1685 const char *start = fmt;
1686 char qualifier;
1667 1687
1668 /* we finished early by reading the field width */ 1688 /* we finished early by reading the field width */
1669 if (spec->type == FORMAT_TYPE_WIDTH) { 1689 if (spec->type == FORMAT_TYPE_WIDTH) {
@@ -1746,16 +1766,16 @@ precision:
1746 1766
1747qualifier: 1767qualifier:
1748 /* get the conversion qualifier */ 1768 /* get the conversion qualifier */
1749 spec->qualifier = -1; 1769 qualifier = 0;
1750 if (*fmt == 'h' || _tolower(*fmt) == 'l' || 1770 if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
1751 _tolower(*fmt) == 'z' || *fmt == 't') { 1771 _tolower(*fmt) == 'z' || *fmt == 't') {
1752 spec->qualifier = *fmt++; 1772 qualifier = *fmt++;
1753 if (unlikely(spec->qualifier == *fmt)) { 1773 if (unlikely(qualifier == *fmt)) {
1754 if (spec->qualifier == 'l') { 1774 if (qualifier == 'l') {
1755 spec->qualifier = 'L'; 1775 qualifier = 'L';
1756 ++fmt; 1776 ++fmt;
1757 } else if (spec->qualifier == 'h') { 1777 } else if (qualifier == 'h') {
1758 spec->qualifier = 'H'; 1778 qualifier = 'H';
1759 ++fmt; 1779 ++fmt;
1760 } 1780 }
1761 } 1781 }
@@ -1812,19 +1832,19 @@ qualifier:
1812 return fmt - start; 1832 return fmt - start;
1813 } 1833 }
1814 1834
1815 if (spec->qualifier == 'L') 1835 if (qualifier == 'L')
1816 spec->type = FORMAT_TYPE_LONG_LONG; 1836 spec->type = FORMAT_TYPE_LONG_LONG;
1817 else if (spec->qualifier == 'l') { 1837 else if (qualifier == 'l') {
1818 BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG); 1838 BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG);
1819 spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN); 1839 spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN);
1820 } else if (_tolower(spec->qualifier) == 'z') { 1840 } else if (_tolower(qualifier) == 'z') {
1821 spec->type = FORMAT_TYPE_SIZE_T; 1841 spec->type = FORMAT_TYPE_SIZE_T;
1822 } else if (spec->qualifier == 't') { 1842 } else if (qualifier == 't') {
1823 spec->type = FORMAT_TYPE_PTRDIFF; 1843 spec->type = FORMAT_TYPE_PTRDIFF;
1824 } else if (spec->qualifier == 'H') { 1844 } else if (qualifier == 'H') {
1825 BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE); 1845 BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE);
1826 spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN); 1846 spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN);
1827 } else if (spec->qualifier == 'h') { 1847 } else if (qualifier == 'h') {
1828 BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT); 1848 BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT);
1829 spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN); 1849 spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN);
1830 } else { 1850 } else {
@@ -1835,6 +1855,24 @@ qualifier:
1835 return ++fmt - start; 1855 return ++fmt - start;
1836} 1856}
1837 1857
1858static void
1859set_field_width(struct printf_spec *spec, int width)
1860{
1861 spec->field_width = width;
1862 if (WARN_ONCE(spec->field_width != width, "field width %d too large", width)) {
1863 spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX);
1864 }
1865}
1866
1867static void
1868set_precision(struct printf_spec *spec, int prec)
1869{
1870 spec->precision = prec;
1871 if (WARN_ONCE(spec->precision != prec, "precision %d too large", prec)) {
1872 spec->precision = clamp(prec, 0, PRECISION_MAX);
1873 }
1874}
1875
1838/** 1876/**
1839 * vsnprintf - Format a string and place it in a buffer 1877 * vsnprintf - Format a string and place it in a buffer
1840 * @buf: The buffer to place the result into 1878 * @buf: The buffer to place the result into
@@ -1902,11 +1940,11 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
1902 } 1940 }
1903 1941
1904 case FORMAT_TYPE_WIDTH: 1942 case FORMAT_TYPE_WIDTH:
1905 spec.field_width = va_arg(args, int); 1943 set_field_width(&spec, va_arg(args, int));
1906 break; 1944 break;
1907 1945
1908 case FORMAT_TYPE_PRECISION: 1946 case FORMAT_TYPE_PRECISION:
1909 spec.precision = va_arg(args, int); 1947 set_precision(&spec, va_arg(args, int));
1910 break; 1948 break;
1911 1949
1912 case FORMAT_TYPE_CHAR: { 1950 case FORMAT_TYPE_CHAR: {
@@ -2346,11 +2384,11 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
2346 } 2384 }
2347 2385
2348 case FORMAT_TYPE_WIDTH: 2386 case FORMAT_TYPE_WIDTH:
2349 spec.field_width = get_arg(int); 2387 set_field_width(&spec, get_arg(int));
2350 break; 2388 break;
2351 2389
2352 case FORMAT_TYPE_PRECISION: 2390 case FORMAT_TYPE_PRECISION:
2353 spec.precision = get_arg(int); 2391 set_precision(&spec, get_arg(int));
2354 break; 2392 break;
2355 2393
2356 case FORMAT_TYPE_CHAR: { 2394 case FORMAT_TYPE_CHAR: {
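
The lib/vsprintf.c hunks above share two themes. The repeated "base 16, SPECIAL | SMALL | ZEROPAD, field_width = 2 * size + 2" boilerplate is factored into special_hex_number(), and field width and precision now go through set_field_width()/set_precision(), which warn and clamp when a value no longer fits the narrowed struct printf_spec members. Below is a rough userspace model of that clamping check; the bitfield widths and the limit are assumptions made for the sketch, not the real lib/vsprintf.c layout, and it warns on every overflow rather than once.

#include <stdio.h>

/* Stand-in for struct printf_spec with narrow members; the widths here are
 * assumed for the sketch, not the kernel's exact layout.
 */
struct printf_spec {
    int field_width : 24;
    int precision   : 16;
};

#define FIELD_WIDTH_MAX ((1 << 23) - 1)   /* assumed limit for the model */

static int clamp_int(int v, int lo, int hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

/* Mirrors the shape of the new set_field_width(): store, detect that the
 * value did not survive the narrow field, then warn and clamp.
 */
static void set_field_width(struct printf_spec *spec, int width)
{
    spec->field_width = width;
    if (spec->field_width != width) {
        fprintf(stderr, "field width %d too large\n", width);
        spec->field_width = clamp_int(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX);
    }
}

int main(void)
{
    struct printf_spec spec = { 0 };

    set_field_width(&spec, 40);       /* fits, kept as-is */
    set_field_width(&spec, 1 << 25);  /* overflows the bitfield, clamped */
    printf("field_width = %d\n", spec.field_width);
    return 0;
}
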
diff --git a/mm/debug.c b/mm/debug.c
index 5d2072ed8d5e..f05b2d5d6481 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -40,9 +40,6 @@ static const struct trace_print_flags pageflag_names[] = {
40#ifdef CONFIG_MEMORY_FAILURE 40#ifdef CONFIG_MEMORY_FAILURE
41 {1UL << PG_hwpoison, "hwpoison" }, 41 {1UL << PG_hwpoison, "hwpoison" },
42#endif 42#endif
43#ifdef CONFIG_TRANSPARENT_HUGEPAGE
44 {1UL << PG_compound_lock, "compound_lock" },
45#endif
46#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) 43#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
47 {1UL << PG_young, "young" }, 44 {1UL << PG_young, "young" },
48 {1UL << PG_idle, "idle" }, 45 {1UL << PG_idle, "idle" },
@@ -82,9 +79,12 @@ static void dump_flags(unsigned long flags,
82void dump_page_badflags(struct page *page, const char *reason, 79void dump_page_badflags(struct page *page, const char *reason,
83 unsigned long badflags) 80 unsigned long badflags)
84{ 81{
85 pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 82 pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
86 page, atomic_read(&page->_count), page_mapcount(page), 83 page, atomic_read(&page->_count), page_mapcount(page),
87 page->mapping, page->index); 84 page->mapping, page->index);
85 if (PageCompound(page))
86 pr_cont(" compound_mapcount: %d", compound_mapcount(page));
87 pr_cont("\n");
88 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 88 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
89 dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); 89 dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
90 if (reason) 90 if (reason)
diff --git a/mm/filemap.c b/mm/filemap.c
index ff42d31c891a..847ee43c2806 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -204,7 +204,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
204 __dec_zone_page_state(page, NR_FILE_PAGES); 204 __dec_zone_page_state(page, NR_FILE_PAGES);
205 if (PageSwapBacked(page)) 205 if (PageSwapBacked(page))
206 __dec_zone_page_state(page, NR_SHMEM); 206 __dec_zone_page_state(page, NR_SHMEM);
207 BUG_ON(page_mapped(page)); 207 VM_BUG_ON_PAGE(page_mapped(page), page);
208 208
209 /* 209 /*
210 * At this point page must be either written or cleaned by truncate. 210 * At this point page must be either written or cleaned by truncate.
@@ -618,7 +618,7 @@ static int __add_to_page_cache_locked(struct page *page,
618 618
619 if (!huge) { 619 if (!huge) {
620 error = mem_cgroup_try_charge(page, current->mm, 620 error = mem_cgroup_try_charge(page, current->mm,
621 gfp_mask, &memcg); 621 gfp_mask, &memcg, false);
622 if (error) 622 if (error)
623 return error; 623 return error;
624 } 624 }
@@ -626,7 +626,7 @@ static int __add_to_page_cache_locked(struct page *page,
626 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); 626 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
627 if (error) { 627 if (error) {
628 if (!huge) 628 if (!huge)
629 mem_cgroup_cancel_charge(page, memcg); 629 mem_cgroup_cancel_charge(page, memcg, false);
630 return error; 630 return error;
631 } 631 }
632 632
@@ -645,7 +645,7 @@ static int __add_to_page_cache_locked(struct page *page,
645 __inc_zone_page_state(page, NR_FILE_PAGES); 645 __inc_zone_page_state(page, NR_FILE_PAGES);
646 spin_unlock_irq(&mapping->tree_lock); 646 spin_unlock_irq(&mapping->tree_lock);
647 if (!huge) 647 if (!huge)
648 mem_cgroup_commit_charge(page, memcg, false); 648 mem_cgroup_commit_charge(page, memcg, false, false);
649 trace_mm_filemap_add_to_page_cache(page); 649 trace_mm_filemap_add_to_page_cache(page);
650 return 0; 650 return 0;
651err_insert: 651err_insert:
@@ -653,7 +653,7 @@ err_insert:
653 /* Leave page->index set: truncation relies upon it */ 653 /* Leave page->index set: truncation relies upon it */
654 spin_unlock_irq(&mapping->tree_lock); 654 spin_unlock_irq(&mapping->tree_lock);
655 if (!huge) 655 if (!huge)
656 mem_cgroup_cancel_charge(page, memcg); 656 mem_cgroup_cancel_charge(page, memcg, false);
657 page_cache_release(page); 657 page_cache_release(page);
658 return error; 658 return error;
659} 659}
@@ -682,11 +682,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
682 void *shadow = NULL; 682 void *shadow = NULL;
683 int ret; 683 int ret;
684 684
685 __set_page_locked(page); 685 __SetPageLocked(page);
686 ret = __add_to_page_cache_locked(page, mapping, offset, 686 ret = __add_to_page_cache_locked(page, mapping, offset,
687 gfp_mask, &shadow); 687 gfp_mask, &shadow);
688 if (unlikely(ret)) 688 if (unlikely(ret))
689 __clear_page_locked(page); 689 __ClearPageLocked(page);
690 else { 690 else {
691 /* 691 /*
692 * The page might have been evicted from cache only 692 * The page might have been evicted from cache only
@@ -809,6 +809,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
809 */ 809 */
810void unlock_page(struct page *page) 810void unlock_page(struct page *page)
811{ 811{
812 page = compound_head(page);
812 VM_BUG_ON_PAGE(!PageLocked(page), page); 813 VM_BUG_ON_PAGE(!PageLocked(page), page);
813 clear_bit_unlock(PG_locked, &page->flags); 814 clear_bit_unlock(PG_locked, &page->flags);
814 smp_mb__after_atomic(); 815 smp_mb__after_atomic();
@@ -873,18 +874,20 @@ EXPORT_SYMBOL_GPL(page_endio);
873 */ 874 */
874void __lock_page(struct page *page) 875void __lock_page(struct page *page)
875{ 876{
876 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 877 struct page *page_head = compound_head(page);
878 DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
877 879
878 __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, 880 __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
879 TASK_UNINTERRUPTIBLE); 881 TASK_UNINTERRUPTIBLE);
880} 882}
881EXPORT_SYMBOL(__lock_page); 883EXPORT_SYMBOL(__lock_page);
882 884
883int __lock_page_killable(struct page *page) 885int __lock_page_killable(struct page *page)
884{ 886{
885 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 887 struct page *page_head = compound_head(page);
888 DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
886 889
887 return __wait_on_bit_lock(page_waitqueue(page), &wait, 890 return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
888 bit_wait_io, TASK_KILLABLE); 891 bit_wait_io, TASK_KILLABLE);
889} 892}
890EXPORT_SYMBOL_GPL(__lock_page_killable); 893EXPORT_SYMBOL_GPL(__lock_page_killable);
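
Several of the mm/filemap.c changes above follow one pattern from the THP rework: the page-lock bit now lives only on the head page of a compound page, so unlock_page() and __lock_page() first translate whatever page they were handed through compound_head(). The userspace sketch below models only that redirection; struct fake_page and its boolean lock are stand-ins, not struct page.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for struct page: 'locked' plays the role of PG_locked on the
 * head page, and 'head' is NULL for a head page or points to its head for
 * a tail page.
 */
struct fake_page {
    bool locked;
    struct fake_page *head;
};

static struct fake_page *compound_head(struct fake_page *page)
{
    return page->head ? page->head : page;
}

static void lock_page(struct fake_page *page)
{
    compound_head(page)->locked = true;
}

static void unlock_page(struct fake_page *page)
{
    struct fake_page *head = compound_head(page);

    assert(head->locked);   /* mirrors VM_BUG_ON_PAGE(!PageLocked(page)) */
    head->locked = false;
}

int main(void)
{
    struct fake_page head = { .locked = false, .head = NULL };
    struct fake_page tail = { .locked = false, .head = &head };

    lock_page(&tail);       /* locking through a tail locks the head */
    printf("head locked: %d\n", head.locked);
    unlock_page(&tail);
    printf("head locked: %d\n", head.locked);
    return 0;
}
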
diff --git a/mm/gup.c b/mm/gup.c
index deafa2c91b36..b64a36175884 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -4,6 +4,7 @@
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/memremap.h>
7#include <linux/pagemap.h> 8#include <linux/pagemap.h>
8#include <linux/rmap.h> 9#include <linux/rmap.h>
9#include <linux/swap.h> 10#include <linux/swap.h>
@@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
62 unsigned long address, pmd_t *pmd, unsigned int flags) 63 unsigned long address, pmd_t *pmd, unsigned int flags)
63{ 64{
64 struct mm_struct *mm = vma->vm_mm; 65 struct mm_struct *mm = vma->vm_mm;
66 struct dev_pagemap *pgmap = NULL;
65 struct page *page; 67 struct page *page;
66 spinlock_t *ptl; 68 spinlock_t *ptl;
67 pte_t *ptep, pte; 69 pte_t *ptep, pte;
@@ -98,7 +100,17 @@ retry:
98 } 100 }
99 101
100 page = vm_normal_page(vma, address, pte); 102 page = vm_normal_page(vma, address, pte);
101 if (unlikely(!page)) { 103 if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
104 /*
105 * Only return device mapping pages in the FOLL_GET case since
106 * they are only valid while holding the pgmap reference.
107 */
108 pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
109 if (pgmap)
110 page = pte_page(pte);
111 else
112 goto no_page;
113 } else if (unlikely(!page)) {
102 if (flags & FOLL_DUMP) { 114 if (flags & FOLL_DUMP) {
103 /* Avoid special (like zero) pages in core dumps */ 115 /* Avoid special (like zero) pages in core dumps */
104 page = ERR_PTR(-EFAULT); 116 page = ERR_PTR(-EFAULT);
@@ -116,8 +128,28 @@ retry:
116 } 128 }
117 } 129 }
118 130
119 if (flags & FOLL_GET) 131 if (flags & FOLL_SPLIT && PageTransCompound(page)) {
120 get_page_foll(page); 132 int ret;
133 get_page(page);
134 pte_unmap_unlock(ptep, ptl);
135 lock_page(page);
136 ret = split_huge_page(page);
137 unlock_page(page);
138 put_page(page);
139 if (ret)
140 return ERR_PTR(ret);
141 goto retry;
142 }
143
144 if (flags & FOLL_GET) {
145 get_page(page);
146
147 /* drop the pgmap reference now that we hold the page */
148 if (pgmap) {
149 put_dev_pagemap(pgmap);
150 pgmap = NULL;
151 }
152 }
121 if (flags & FOLL_TOUCH) { 153 if (flags & FOLL_TOUCH) {
122 if ((flags & FOLL_WRITE) && 154 if ((flags & FOLL_WRITE) &&
123 !pte_dirty(pte) && !PageDirty(page)) 155 !pte_dirty(pte) && !PageDirty(page))
@@ -130,6 +162,10 @@ retry:
130 mark_page_accessed(page); 162 mark_page_accessed(page);
131 } 163 }
132 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 164 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
165 /* Do not mlock pte-mapped THP */
166 if (PageTransCompound(page))
167 goto out;
168
133 /* 169 /*
134 * The preliminary mapping check is mainly to avoid the 170 * The preliminary mapping check is mainly to avoid the
135 * pointless overhead of lock_page on the ZERO_PAGE 171 * pointless overhead of lock_page on the ZERO_PAGE
@@ -220,27 +256,45 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
220 } 256 }
221 if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) 257 if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
222 return no_page_table(vma, flags); 258 return no_page_table(vma, flags);
223 if (pmd_trans_huge(*pmd)) { 259 if (pmd_devmap(*pmd)) {
224 if (flags & FOLL_SPLIT) {
225 split_huge_page_pmd(vma, address, pmd);
226 return follow_page_pte(vma, address, pmd, flags);
227 }
228 ptl = pmd_lock(mm, pmd); 260 ptl = pmd_lock(mm, pmd);
229 if (likely(pmd_trans_huge(*pmd))) { 261 page = follow_devmap_pmd(vma, address, pmd, flags);
230 if (unlikely(pmd_trans_splitting(*pmd))) { 262 spin_unlock(ptl);
231 spin_unlock(ptl); 263 if (page)
232 wait_split_huge_page(vma->anon_vma, pmd); 264 return page;
233 } else { 265 }
234 page = follow_trans_huge_pmd(vma, address, 266 if (likely(!pmd_trans_huge(*pmd)))
235 pmd, flags); 267 return follow_page_pte(vma, address, pmd, flags);
236 spin_unlock(ptl); 268
237 *page_mask = HPAGE_PMD_NR - 1; 269 ptl = pmd_lock(mm, pmd);
238 return page; 270 if (unlikely(!pmd_trans_huge(*pmd))) {
239 } 271 spin_unlock(ptl);
240 } else 272 return follow_page_pte(vma, address, pmd, flags);
273 }
274 if (flags & FOLL_SPLIT) {
275 int ret;
276 page = pmd_page(*pmd);
277 if (is_huge_zero_page(page)) {
278 spin_unlock(ptl);
279 ret = 0;
280 split_huge_pmd(vma, pmd, address);
281 } else {
282 get_page(page);
241 spin_unlock(ptl); 283 spin_unlock(ptl);
284 lock_page(page);
285 ret = split_huge_page(page);
286 unlock_page(page);
287 put_page(page);
288 }
289
290 return ret ? ERR_PTR(ret) :
291 follow_page_pte(vma, address, pmd, flags);
242 } 292 }
243 return follow_page_pte(vma, address, pmd, flags); 293
294 page = follow_trans_huge_pmd(vma, address, pmd, flags);
295 spin_unlock(ptl);
296 *page_mask = HPAGE_PMD_NR - 1;
297 return page;
244} 298}
245 299
246static int get_gate_page(struct mm_struct *mm, unsigned long address, 300static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -564,6 +618,8 @@ EXPORT_SYMBOL(__get_user_pages);
564 * @mm: mm_struct of target mm 618 * @mm: mm_struct of target mm
565 * @address: user address 619 * @address: user address
566 * @fault_flags:flags to pass down to handle_mm_fault() 620 * @fault_flags:flags to pass down to handle_mm_fault()
 621 * @unlocked: did we unlock the mmap_sem while retrying; may be NULL if the
 622 * caller does not allow retry
567 * 623 *
568 * This is meant to be called in the specific scenario where for locking reasons 624 * This is meant to be called in the specific scenario where for locking reasons
569 * we try to access user memory in atomic context (within a pagefault_disable() 625 * we try to access user memory in atomic context (within a pagefault_disable()
@@ -575,22 +631,28 @@ EXPORT_SYMBOL(__get_user_pages);
575 * The main difference with get_user_pages() is that this function will 631 * The main difference with get_user_pages() is that this function will
576 * unconditionally call handle_mm_fault() which will in turn perform all the 632 * unconditionally call handle_mm_fault() which will in turn perform all the
577 * necessary SW fixup of the dirty and young bits in the PTE, while 633 * necessary SW fixup of the dirty and young bits in the PTE, while
578 * handle_mm_fault() only guarantees to update these in the struct page. 634 * get_user_pages() only guarantees to update these in the struct page.
579 * 635 *
580 * This is important for some architectures where those bits also gate the 636 * This is important for some architectures where those bits also gate the
581 * access permission to the page because they are maintained in software. On 637 * access permission to the page because they are maintained in software. On
582 * such architectures, gup() will not be enough to make a subsequent access 638 * such architectures, gup() will not be enough to make a subsequent access
583 * succeed. 639 * succeed.
584 * 640 *
 585 * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). 641 * This function will not return with an unlocked mmap_sem, so it does not
 642 * have the same semantics wrt the @mm->mmap_sem as does filemap_fault().
586 */ 643 */
587int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, 644int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
588 unsigned long address, unsigned int fault_flags) 645 unsigned long address, unsigned int fault_flags,
646 bool *unlocked)
589{ 647{
590 struct vm_area_struct *vma; 648 struct vm_area_struct *vma;
591 vm_flags_t vm_flags; 649 vm_flags_t vm_flags;
592 int ret; 650 int ret, major = 0;
593 651
652 if (unlocked)
653 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
654
655retry:
594 vma = find_extend_vma(mm, address); 656 vma = find_extend_vma(mm, address);
595 if (!vma || address < vma->vm_start) 657 if (!vma || address < vma->vm_start)
596 return -EFAULT; 658 return -EFAULT;
@@ -600,6 +662,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
600 return -EFAULT; 662 return -EFAULT;
601 663
602 ret = handle_mm_fault(mm, vma, address, fault_flags); 664 ret = handle_mm_fault(mm, vma, address, fault_flags);
665 major |= ret & VM_FAULT_MAJOR;
603 if (ret & VM_FAULT_ERROR) { 666 if (ret & VM_FAULT_ERROR) {
604 if (ret & VM_FAULT_OOM) 667 if (ret & VM_FAULT_OOM)
605 return -ENOMEM; 668 return -ENOMEM;
@@ -609,8 +672,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
609 return -EFAULT; 672 return -EFAULT;
610 BUG(); 673 BUG();
611 } 674 }
675
676 if (ret & VM_FAULT_RETRY) {
677 down_read(&mm->mmap_sem);
678 if (!(fault_flags & FAULT_FLAG_TRIED)) {
679 *unlocked = true;
680 fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
681 fault_flags |= FAULT_FLAG_TRIED;
682 goto retry;
683 }
684 }
685
612 if (tsk) { 686 if (tsk) {
613 if (ret & VM_FAULT_MAJOR) 687 if (major)
614 tsk->maj_flt++; 688 tsk->maj_flt++;
615 else 689 else
616 tsk->min_flt++; 690 tsk->min_flt++;
@@ -896,7 +970,6 @@ long populate_vma_page_range(struct vm_area_struct *vma,
896 gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; 970 gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
897 if (vma->vm_flags & VM_LOCKONFAULT) 971 if (vma->vm_flags & VM_LOCKONFAULT)
898 gup_flags &= ~FOLL_POPULATE; 972 gup_flags &= ~FOLL_POPULATE;
899
900 /* 973 /*
901 * We want to touch writable mappings with a write fault in order 974 * We want to touch writable mappings with a write fault in order
902 * to break COW, except for shared mappings because these don't COW 975 * to break COW, except for shared mappings because these don't COW
@@ -1036,9 +1109,6 @@ struct page *get_dump_page(unsigned long addr)
1036 * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free 1109 * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
1037 * pages containing page tables. 1110 * pages containing page tables.
1038 * 1111 *
1039 * *) THP splits will broadcast an IPI, this can be achieved by overriding
1040 * pmdp_splitting_flush.
1041 *
1042 * *) ptes can be read atomically by the architecture. 1112 * *) ptes can be read atomically by the architecture.
1043 * 1113 *
1044 * *) access_ok is sufficient to validate userspace address ranges. 1114 * *) access_ok is sufficient to validate userspace address ranges.
@@ -1066,7 +1136,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1066 * for an example see gup_get_pte in arch/x86/mm/gup.c 1136 * for an example see gup_get_pte in arch/x86/mm/gup.c
1067 */ 1137 */
1068 pte_t pte = READ_ONCE(*ptep); 1138 pte_t pte = READ_ONCE(*ptep);
1069 struct page *page; 1139 struct page *head, *page;
1070 1140
1071 /* 1141 /*
1072 * Similar to the PMD case below, NUMA hinting must take slow 1142 * Similar to the PMD case below, NUMA hinting must take slow
@@ -1078,15 +1148,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1078 1148
1079 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 1149 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1080 page = pte_page(pte); 1150 page = pte_page(pte);
1151 head = compound_head(page);
1081 1152
1082 if (!page_cache_get_speculative(page)) 1153 if (!page_cache_get_speculative(head))
1083 goto pte_unmap; 1154 goto pte_unmap;
1084 1155
1085 if (unlikely(pte_val(pte) != pte_val(*ptep))) { 1156 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1086 put_page(page); 1157 put_page(head);
1087 goto pte_unmap; 1158 goto pte_unmap;
1088 } 1159 }
1089 1160
1161 VM_BUG_ON_PAGE(compound_head(page) != head, page);
1090 pages[*nr] = page; 1162 pages[*nr] = page;
1091 (*nr)++; 1163 (*nr)++;
1092 1164
@@ -1119,7 +1191,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1119static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, 1191static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1120 unsigned long end, int write, struct page **pages, int *nr) 1192 unsigned long end, int write, struct page **pages, int *nr)
1121{ 1193{
1122 struct page *head, *page, *tail; 1194 struct page *head, *page;
1123 int refs; 1195 int refs;
1124 1196
1125 if (write && !pmd_write(orig)) 1197 if (write && !pmd_write(orig))
@@ -1128,7 +1200,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1128 refs = 0; 1200 refs = 0;
1129 head = pmd_page(orig); 1201 head = pmd_page(orig);
1130 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 1202 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1131 tail = page;
1132 do { 1203 do {
1133 VM_BUG_ON_PAGE(compound_head(page) != head, page); 1204 VM_BUG_ON_PAGE(compound_head(page) != head, page);
1134 pages[*nr] = page; 1205 pages[*nr] = page;
@@ -1149,24 +1220,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1149 return 0; 1220 return 0;
1150 } 1221 }
1151 1222
1152 /*
1153 * Any tail pages need their mapcount reference taken before we
1154 * return. (This allows the THP code to bump their ref count when
1155 * they are split into base pages).
1156 */
1157 while (refs--) {
1158 if (PageTail(tail))
1159 get_huge_page_tail(tail);
1160 tail++;
1161 }
1162
1163 return 1; 1223 return 1;
1164} 1224}
1165 1225
1166static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, 1226static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1167 unsigned long end, int write, struct page **pages, int *nr) 1227 unsigned long end, int write, struct page **pages, int *nr)
1168{ 1228{
1169 struct page *head, *page, *tail; 1229 struct page *head, *page;
1170 int refs; 1230 int refs;
1171 1231
1172 if (write && !pud_write(orig)) 1232 if (write && !pud_write(orig))
@@ -1175,7 +1235,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1175 refs = 0; 1235 refs = 0;
1176 head = pud_page(orig); 1236 head = pud_page(orig);
1177 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 1237 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1178 tail = page;
1179 do { 1238 do {
1180 VM_BUG_ON_PAGE(compound_head(page) != head, page); 1239 VM_BUG_ON_PAGE(compound_head(page) != head, page);
1181 pages[*nr] = page; 1240 pages[*nr] = page;
@@ -1196,12 +1255,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1196 return 0; 1255 return 0;
1197 } 1256 }
1198 1257
1199 while (refs--) {
1200 if (PageTail(tail))
1201 get_huge_page_tail(tail);
1202 tail++;
1203 }
1204
1205 return 1; 1258 return 1;
1206} 1259}
1207 1260
@@ -1210,7 +1263,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1210 struct page **pages, int *nr) 1263 struct page **pages, int *nr)
1211{ 1264{
1212 int refs; 1265 int refs;
1213 struct page *head, *page, *tail; 1266 struct page *head, *page;
1214 1267
1215 if (write && !pgd_write(orig)) 1268 if (write && !pgd_write(orig))
1216 return 0; 1269 return 0;
@@ -1218,7 +1271,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1218 refs = 0; 1271 refs = 0;
1219 head = pgd_page(orig); 1272 head = pgd_page(orig);
1220 page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); 1273 page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
1221 tail = page;
1222 do { 1274 do {
1223 VM_BUG_ON_PAGE(compound_head(page) != head, page); 1275 VM_BUG_ON_PAGE(compound_head(page) != head, page);
1224 pages[*nr] = page; 1276 pages[*nr] = page;
@@ -1239,12 +1291,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1239 return 0; 1291 return 0;
1240 } 1292 }
1241 1293
1242 while (refs--) {
1243 if (PageTail(tail))
1244 get_huge_page_tail(tail);
1245 tail++;
1246 }
1247
1248 return 1; 1294 return 1;
1249} 1295}
1250 1296
@@ -1259,7 +1305,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
1259 pmd_t pmd = READ_ONCE(*pmdp); 1305 pmd_t pmd = READ_ONCE(*pmdp);
1260 1306
1261 next = pmd_addr_end(addr, end); 1307 next = pmd_addr_end(addr, end);
1262 if (pmd_none(pmd) || pmd_trans_splitting(pmd)) 1308 if (pmd_none(pmd))
1263 return 0; 1309 return 0;
1264 1310
1265 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { 1311 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
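
Two ideas recur in the mm/gup.c diff above. FOLL_SPLIT now splits the compound page itself (take a reference, drop the page-table lock, lock the page, call split_huge_page(), retry), and the lockless fast-GUP path pins the compound head and then re-reads the PTE to detect a concurrent change. The sketch below models the second, speculative-pin-then-recheck step in plain userspace C; the types and the unconditional atomic increment are simplifications, not the kernel's page_cache_get_speculative().

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins: a refcounted "page" and a PTE modelled as an atomic word. */
struct fake_page {
    atomic_int refcount;
};

typedef atomic_ulong fake_pte_t;

static bool try_get_page_speculative(struct fake_page *head)
{
    /* the kernel uses page_cache_get_speculative(); model it as a plain
     * increment that always succeeds */
    atomic_fetch_add(&head->refcount, 1);
    return true;
}

/* Speculative pin, then re-read the PTE and back off if it changed. */
static bool gup_one_pte(fake_pte_t *ptep, struct fake_page *head)
{
    unsigned long pte = atomic_load(ptep);      /* READ_ONCE(*ptep) */

    if (!try_get_page_speculative(head))
        return false;

    if (atomic_load(ptep) != pte) {             /* PTE changed under us */
        atomic_fetch_sub(&head->refcount, 1);   /* put_page(head) */
        return false;
    }
    return true;                                /* head is safely pinned */
}

int main(void)
{
    struct fake_page head = { .refcount = 1 };
    fake_pte_t pte = 0x1000;

    printf("pinned: %d, refcount: %d\n",
           gup_one_pte(&pte, &head), atomic_load(&head.refcount));
    return 0;
}
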
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f952f055fdcf..b2db98136af9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,12 +16,16 @@
16#include <linux/swap.h> 16#include <linux/swap.h>
17#include <linux/shrinker.h> 17#include <linux/shrinker.h>
18#include <linux/mm_inline.h> 18#include <linux/mm_inline.h>
19#include <linux/swapops.h>
19#include <linux/dax.h> 20#include <linux/dax.h>
20#include <linux/kthread.h> 21#include <linux/kthread.h>
21#include <linux/khugepaged.h> 22#include <linux/khugepaged.h>
22#include <linux/freezer.h> 23#include <linux/freezer.h>
24#include <linux/pfn_t.h>
23#include <linux/mman.h> 25#include <linux/mman.h>
26#include <linux/memremap.h>
24#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/debugfs.h>
25#include <linux/migrate.h> 29#include <linux/migrate.h>
26#include <linux/hashtable.h> 30#include <linux/hashtable.h>
27#include <linux/userfaultfd_k.h> 31#include <linux/userfaultfd_k.h>
@@ -45,6 +49,7 @@ enum scan_result {
45 SCAN_PAGE_LRU, 49 SCAN_PAGE_LRU,
46 SCAN_PAGE_LOCK, 50 SCAN_PAGE_LOCK,
47 SCAN_PAGE_ANON, 51 SCAN_PAGE_ANON,
52 SCAN_PAGE_COMPOUND,
48 SCAN_ANY_PROCESS, 53 SCAN_ANY_PROCESS,
49 SCAN_VMA_NULL, 54 SCAN_VMA_NULL,
50 SCAN_VMA_CHECK, 55 SCAN_VMA_CHECK,
@@ -133,6 +138,10 @@ static struct khugepaged_scan khugepaged_scan = {
133 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), 138 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
134}; 139};
135 140
141static DEFINE_SPINLOCK(split_queue_lock);
142static LIST_HEAD(split_queue);
143static unsigned long split_queue_len;
144static struct shrinker deferred_split_shrinker;
136 145
137static void set_recommended_min_free_kbytes(void) 146static void set_recommended_min_free_kbytes(void)
138{ 147{
@@ -665,6 +674,9 @@ static int __init hugepage_init(void)
665 err = register_shrinker(&huge_zero_page_shrinker); 674 err = register_shrinker(&huge_zero_page_shrinker);
666 if (err) 675 if (err)
667 goto err_hzp_shrinker; 676 goto err_hzp_shrinker;
677 err = register_shrinker(&deferred_split_shrinker);
678 if (err)
679 goto err_split_shrinker;
668 680
669 /* 681 /*
670 * By default disable transparent hugepages on smaller systems, 682 * By default disable transparent hugepages on smaller systems,
@@ -682,6 +694,8 @@ static int __init hugepage_init(void)
682 694
683 return 0; 695 return 0;
684err_khugepaged: 696err_khugepaged:
697 unregister_shrinker(&deferred_split_shrinker);
698err_split_shrinker:
685 unregister_shrinker(&huge_zero_page_shrinker); 699 unregister_shrinker(&huge_zero_page_shrinker);
686err_hzp_shrinker: 700err_hzp_shrinker:
687 khugepaged_slab_exit(); 701 khugepaged_slab_exit();
@@ -738,6 +752,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
738 return entry; 752 return entry;
739} 753}
740 754
755static inline struct list_head *page_deferred_list(struct page *page)
756{
757 /*
758 * ->lru in the tail pages is occupied by compound_head.
759 * Let's use ->mapping + ->index in the second tail page as list_head.
760 */
761 return (struct list_head *)&page[2].mapping;
762}
763
764void prep_transhuge_page(struct page *page)
765{
766 /*
767 * we use page->mapping and page->indexlru in second tail page
768 * as list_head: assuming THP order >= 2
769 */
770 BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
771
772 INIT_LIST_HEAD(page_deferred_list(page));
773 set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
774}
775
741static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 776static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
742 struct vm_area_struct *vma, 777 struct vm_area_struct *vma,
743 unsigned long address, pmd_t *pmd, 778 unsigned long address, pmd_t *pmd,
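
prep_transhuge_page() and page_deferred_list() above rely on a layout trick: for a THP of order >= 2 the second tail page's ->mapping and ->index words are unused, so those two adjacent words are reinterpreted as the list_head that queues the page for deferred splitting (the tail's ->lru is already taken by compound_head). The userspace sketch below reproduces the same cast on a stand-in structure whose two fields are pointer-sized on a typical 64-bit build; it illustrates the aliasing only and is not kernel code.

#include <stdio.h>

struct list_head {
    struct list_head *next, *prev;
};

/* Stand-in for struct page with two adjacent pointer-sized words in the
 * positions of ->mapping and ->index.
 */
struct fake_page {
    void *mapping;          /* reused as list_head.next */
    unsigned long index;    /* reused as list_head.prev */
};

/* Same shape as the kernel helper: view &page[2].mapping as a list_head. */
static struct list_head *page_deferred_list(struct fake_page *pages)
{
    return (struct list_head *)&pages[2].mapping;
}

static void INIT_LIST_HEAD(struct list_head *list)
{
    list->next = list;
    list->prev = list;
}

int main(void)
{
    struct fake_page thp[4] = { 0 };    /* head page plus three tails */
    struct list_head *node = page_deferred_list(thp);

    INIT_LIST_HEAD(node);
    printf("deferred list empty: %d\n",
           node->next == node && node->prev == node);
    return 0;
}
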
@@ -751,7 +786,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
751 786
752 VM_BUG_ON_PAGE(!PageCompound(page), page); 787 VM_BUG_ON_PAGE(!PageCompound(page), page);
753 788
754 if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { 789 if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
755 put_page(page); 790 put_page(page);
756 count_vm_event(THP_FAULT_FALLBACK); 791 count_vm_event(THP_FAULT_FALLBACK);
757 return VM_FAULT_FALLBACK; 792 return VM_FAULT_FALLBACK;
@@ -759,7 +794,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
759 794
760 pgtable = pte_alloc_one(mm, haddr); 795 pgtable = pte_alloc_one(mm, haddr);
761 if (unlikely(!pgtable)) { 796 if (unlikely(!pgtable)) {
762 mem_cgroup_cancel_charge(page, memcg); 797 mem_cgroup_cancel_charge(page, memcg, true);
763 put_page(page); 798 put_page(page);
764 return VM_FAULT_OOM; 799 return VM_FAULT_OOM;
765 } 800 }
@@ -775,7 +810,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
775 ptl = pmd_lock(mm, pmd); 810 ptl = pmd_lock(mm, pmd);
776 if (unlikely(!pmd_none(*pmd))) { 811 if (unlikely(!pmd_none(*pmd))) {
777 spin_unlock(ptl); 812 spin_unlock(ptl);
778 mem_cgroup_cancel_charge(page, memcg); 813 mem_cgroup_cancel_charge(page, memcg, true);
779 put_page(page); 814 put_page(page);
780 pte_free(mm, pgtable); 815 pte_free(mm, pgtable);
781 } else { 816 } else {
@@ -786,7 +821,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
786 int ret; 821 int ret;
787 822
788 spin_unlock(ptl); 823 spin_unlock(ptl);
789 mem_cgroup_cancel_charge(page, memcg); 824 mem_cgroup_cancel_charge(page, memcg, true);
790 put_page(page); 825 put_page(page);
791 pte_free(mm, pgtable); 826 pte_free(mm, pgtable);
792 ret = handle_userfault(vma, address, flags, 827 ret = handle_userfault(vma, address, flags,
@@ -797,8 +832,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
797 832
798 entry = mk_huge_pmd(page, vma->vm_page_prot); 833 entry = mk_huge_pmd(page, vma->vm_page_prot);
799 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 834 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
800 page_add_new_anon_rmap(page, vma, haddr); 835 page_add_new_anon_rmap(page, vma, haddr, true);
801 mem_cgroup_commit_charge(page, memcg, false); 836 mem_cgroup_commit_charge(page, memcg, false, true);
802 lru_cache_add_active_or_unevictable(page, vma); 837 lru_cache_add_active_or_unevictable(page, vma);
803 pgtable_trans_huge_deposit(mm, pmd, pgtable); 838 pgtable_trans_huge_deposit(mm, pmd, pgtable);
804 set_pmd_at(mm, haddr, pmd, entry); 839 set_pmd_at(mm, haddr, pmd, entry);
@@ -892,32 +927,33 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
892 count_vm_event(THP_FAULT_FALLBACK); 927 count_vm_event(THP_FAULT_FALLBACK);
893 return VM_FAULT_FALLBACK; 928 return VM_FAULT_FALLBACK;
894 } 929 }
930 prep_transhuge_page(page);
895 return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, 931 return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
896 flags); 932 flags);
897} 933}
898 934
899static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 935static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
900 pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write) 936 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
901{ 937{
902 struct mm_struct *mm = vma->vm_mm; 938 struct mm_struct *mm = vma->vm_mm;
903 pmd_t entry; 939 pmd_t entry;
904 spinlock_t *ptl; 940 spinlock_t *ptl;
905 941
906 ptl = pmd_lock(mm, pmd); 942 ptl = pmd_lock(mm, pmd);
907 if (pmd_none(*pmd)) { 943 entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
908 entry = pmd_mkhuge(pfn_pmd(pfn, prot)); 944 if (pfn_t_devmap(pfn))
909 if (write) { 945 entry = pmd_mkdevmap(entry);
910 entry = pmd_mkyoung(pmd_mkdirty(entry)); 946 if (write) {
911 entry = maybe_pmd_mkwrite(entry, vma); 947 entry = pmd_mkyoung(pmd_mkdirty(entry));
912 } 948 entry = maybe_pmd_mkwrite(entry, vma);
913 set_pmd_at(mm, addr, pmd, entry); 949 }
914 update_mmu_cache_pmd(vma, addr, pmd); 950 set_pmd_at(mm, addr, pmd, entry);
915 } 951 update_mmu_cache_pmd(vma, addr, pmd);
916 spin_unlock(ptl); 952 spin_unlock(ptl);
917} 953}
918 954
919int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 955int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
920 pmd_t *pmd, unsigned long pfn, bool write) 956 pmd_t *pmd, pfn_t pfn, bool write)
921{ 957{
922 pgprot_t pgprot = vma->vm_page_prot; 958 pgprot_t pgprot = vma->vm_page_prot;
923 /* 959 /*
@@ -929,7 +965,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
929 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 965 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
930 (VM_PFNMAP|VM_MIXEDMAP)); 966 (VM_PFNMAP|VM_MIXEDMAP));
931 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 967 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
932 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); 968 BUG_ON(!pfn_t_devmap(pfn));
933 969
934 if (addr < vma->vm_start || addr >= vma->vm_end) 970 if (addr < vma->vm_start || addr >= vma->vm_end)
935 return VM_FAULT_SIGBUS; 971 return VM_FAULT_SIGBUS;
@@ -939,6 +975,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
939 return VM_FAULT_NOPAGE; 975 return VM_FAULT_NOPAGE;
940} 976}
941 977
978static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
979 pmd_t *pmd)
980{
981 pmd_t _pmd;
982
983 /*
984 * We should set the dirty bit only for FOLL_WRITE but for now
985 * the dirty bit in the pmd is meaningless. And if the dirty
986 * bit will become meaningful and we'll only set it with
987 * FOLL_WRITE, an atomic set_bit will be required on the pmd to
988 * set the young bit, instead of the current set_pmd_at.
989 */
990 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
991 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
992 pmd, _pmd, 1))
993 update_mmu_cache_pmd(vma, addr, pmd);
994}
995
996struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
997 pmd_t *pmd, int flags)
998{
999 unsigned long pfn = pmd_pfn(*pmd);
1000 struct mm_struct *mm = vma->vm_mm;
1001 struct dev_pagemap *pgmap;
1002 struct page *page;
1003
1004 assert_spin_locked(pmd_lockptr(mm, pmd));
1005
1006 if (flags & FOLL_WRITE && !pmd_write(*pmd))
1007 return NULL;
1008
1009 if (pmd_present(*pmd) && pmd_devmap(*pmd))
1010 /* pass */;
1011 else
1012 return NULL;
1013
1014 if (flags & FOLL_TOUCH)
1015 touch_pmd(vma, addr, pmd);
1016
1017 /*
1018 * device mapped pages can only be returned if the
1019 * caller will manage the page reference count.
1020 */
1021 if (!(flags & FOLL_GET))
1022 return ERR_PTR(-EEXIST);
1023
1024 pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
1025 pgmap = get_dev_pagemap(pfn, NULL);
1026 if (!pgmap)
1027 return ERR_PTR(-EFAULT);
1028 page = pfn_to_page(pfn);
1029 get_page(page);
1030 put_dev_pagemap(pgmap);
1031
1032 return page;
1033}
1034
942int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1035int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
943 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 1036 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
944 struct vm_area_struct *vma) 1037 struct vm_area_struct *vma)
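
follow_devmap_pmd() above orders its references carefully: the dev_pagemap is what keeps device pages valid, so the pgmap reference is taken first, the page is pinned under it, and only then is the pgmap reference dropped, with the page pin keeping the mapping alive from then on (which is also why device pages are handed out only under FOLL_GET). A toy userspace model of that ordering follows; both structures and their counters are invented for the sketch.

#include <stdio.h>

/* Invented stand-ins for dev_pagemap and struct page. */
struct fake_pagemap { int refs; };
struct fake_page    { int refs; struct fake_pagemap *pgmap; };

static struct fake_pagemap *get_dev_pagemap(struct fake_page *page)
{
    page->pgmap->refs++;
    return page->pgmap;
}

static void put_dev_pagemap(struct fake_pagemap *pgmap) { pgmap->refs--; }
static void get_page(struct fake_page *page)            { page->refs++;  }

/* Take the pgmap reference, pin the page, then drop the pgmap reference:
 * the page pin is what keeps things alive for the caller afterwards.
 */
static struct fake_page *follow_devmap(struct fake_page *page)
{
    struct fake_pagemap *pgmap = get_dev_pagemap(page);

    if (!pgmap)
        return NULL;        /* never happens in this toy, kept for shape */
    get_page(page);
    put_dev_pagemap(pgmap);
    return page;
}

int main(void)
{
    struct fake_pagemap pgmap = { .refs = 1 };
    struct fake_page page = { .refs = 1, .pgmap = &pgmap };

    follow_devmap(&page);
    printf("page refs %d, pgmap refs %d\n", page.refs, pgmap.refs);
    return 0;
}
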
@@ -960,7 +1053,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
960 1053
961 ret = -EAGAIN; 1054 ret = -EAGAIN;
962 pmd = *src_pmd; 1055 pmd = *src_pmd;
963 if (unlikely(!pmd_trans_huge(pmd))) { 1056 if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) {
964 pte_free(dst_mm, pgtable); 1057 pte_free(dst_mm, pgtable);
965 goto out_unlock; 1058 goto out_unlock;
966 } 1059 }
@@ -983,26 +1076,20 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
983 goto out_unlock; 1076 goto out_unlock;
984 } 1077 }
985 1078
986 if (unlikely(pmd_trans_splitting(pmd))) { 1079 if (pmd_trans_huge(pmd)) {
987 /* split huge page running from under us */ 1080 /* thp accounting separate from pmd_devmap accounting */
988 spin_unlock(src_ptl); 1081 src_page = pmd_page(pmd);
989 spin_unlock(dst_ptl); 1082 VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
990 pte_free(dst_mm, pgtable); 1083 get_page(src_page);
991 1084 page_dup_rmap(src_page, true);
992 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ 1085 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
993 goto out; 1086 atomic_long_inc(&dst_mm->nr_ptes);
1087 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
994 } 1088 }
995 src_page = pmd_page(pmd);
996 VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
997 get_page(src_page);
998 page_dup_rmap(src_page);
999 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1000 1089
1001 pmdp_set_wrprotect(src_mm, addr, src_pmd); 1090 pmdp_set_wrprotect(src_mm, addr, src_pmd);
1002 pmd = pmd_mkold(pmd_wrprotect(pmd)); 1091 pmd = pmd_mkold(pmd_wrprotect(pmd));
1003 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1004 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 1092 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1005 atomic_long_inc(&dst_mm->nr_ptes);
1006 1093
1007 ret = 0; 1094 ret = 0;
1008out_unlock: 1095out_unlock:
@@ -1035,37 +1122,6 @@ unlock:
1035 spin_unlock(ptl); 1122 spin_unlock(ptl);
1036} 1123}
1037 1124
1038/*
1039 * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
1040 * during copy_user_huge_page()'s copy_page_rep(): in the case when
1041 * the source page gets split and a tail freed before copy completes.
1042 * Called under pmd_lock of checked pmd, so safe from splitting itself.
1043 */
1044static void get_user_huge_page(struct page *page)
1045{
1046 if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
1047 struct page *endpage = page + HPAGE_PMD_NR;
1048
1049 atomic_add(HPAGE_PMD_NR, &page->_count);
1050 while (++page < endpage)
1051 get_huge_page_tail(page);
1052 } else {
1053 get_page(page);
1054 }
1055}
1056
1057static void put_user_huge_page(struct page *page)
1058{
1059 if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
1060 struct page *endpage = page + HPAGE_PMD_NR;
1061
1062 while (page < endpage)
1063 put_page(page++);
1064 } else {
1065 put_page(page);
1066 }
1067}
1068
1069static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 1125static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1070 struct vm_area_struct *vma, 1126 struct vm_area_struct *vma,
1071 unsigned long address, 1127 unsigned long address,
@@ -1095,13 +1151,14 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1095 vma, address, page_to_nid(page)); 1151 vma, address, page_to_nid(page));
1096 if (unlikely(!pages[i] || 1152 if (unlikely(!pages[i] ||
1097 mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, 1153 mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
1098 &memcg))) { 1154 &memcg, false))) {
1099 if (pages[i]) 1155 if (pages[i])
1100 put_page(pages[i]); 1156 put_page(pages[i]);
1101 while (--i >= 0) { 1157 while (--i >= 0) {
1102 memcg = (void *)page_private(pages[i]); 1158 memcg = (void *)page_private(pages[i]);
1103 set_page_private(pages[i], 0); 1159 set_page_private(pages[i], 0);
1104 mem_cgroup_cancel_charge(pages[i], memcg); 1160 mem_cgroup_cancel_charge(pages[i], memcg,
1161 false);
1105 put_page(pages[i]); 1162 put_page(pages[i]);
1106 } 1163 }
1107 kfree(pages); 1164 kfree(pages);
@@ -1139,8 +1196,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1139 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1196 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1140 memcg = (void *)page_private(pages[i]); 1197 memcg = (void *)page_private(pages[i]);
1141 set_page_private(pages[i], 0); 1198 set_page_private(pages[i], 0);
1142 page_add_new_anon_rmap(pages[i], vma, haddr); 1199 page_add_new_anon_rmap(pages[i], vma, haddr, false);
1143 mem_cgroup_commit_charge(pages[i], memcg, false); 1200 mem_cgroup_commit_charge(pages[i], memcg, false, false);
1144 lru_cache_add_active_or_unevictable(pages[i], vma); 1201 lru_cache_add_active_or_unevictable(pages[i], vma);
1145 pte = pte_offset_map(&_pmd, haddr); 1202 pte = pte_offset_map(&_pmd, haddr);
1146 VM_BUG_ON(!pte_none(*pte)); 1203 VM_BUG_ON(!pte_none(*pte));
@@ -1151,7 +1208,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1151 1208
1152 smp_wmb(); /* make pte visible before pmd */ 1209 smp_wmb(); /* make pte visible before pmd */
1153 pmd_populate(mm, pmd, pgtable); 1210 pmd_populate(mm, pmd, pgtable);
1154 page_remove_rmap(page); 1211 page_remove_rmap(page, true);
1155 spin_unlock(ptl); 1212 spin_unlock(ptl);
1156 1213
1157 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1214 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1168,7 +1225,7 @@ out_free_pages:
1168 for (i = 0; i < HPAGE_PMD_NR; i++) { 1225 for (i = 0; i < HPAGE_PMD_NR; i++) {
1169 memcg = (void *)page_private(pages[i]); 1226 memcg = (void *)page_private(pages[i]);
1170 set_page_private(pages[i], 0); 1227 set_page_private(pages[i], 0);
1171 mem_cgroup_cancel_charge(pages[i], memcg); 1228 mem_cgroup_cancel_charge(pages[i], memcg, false);
1172 put_page(pages[i]); 1229 put_page(pages[i]);
1173 } 1230 }
1174 kfree(pages); 1231 kfree(pages);
@@ -1198,7 +1255,17 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1198 1255
1199 page = pmd_page(orig_pmd); 1256 page = pmd_page(orig_pmd);
1200 VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); 1257 VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
1201 if (page_mapcount(page) == 1) { 1258 /*
1259 * We can only reuse the page if nobody else maps the huge page or it's
1260 * part. We can do it by checking page_mapcount() on each sub-page, but
1261 * it's expensive.
1262 * The cheaper way is to check page_count() to be equal 1: every
1263 * mapcount takes page reference reference, so this way we can
1264 * guarantee, that the PMD is the only mapping.
1265 * This can give false negative if somebody pinned the page, but that's
1266 * fine.
1267 */
1268 if (page_mapcount(page) == 1 && page_count(page) == 1) {
1202 pmd_t entry; 1269 pmd_t entry;
1203 entry = pmd_mkyoung(orig_pmd); 1270 entry = pmd_mkyoung(orig_pmd);
1204 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1271 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1207,7 +1274,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1207 ret |= VM_FAULT_WRITE; 1274 ret |= VM_FAULT_WRITE;
1208 goto out_unlock; 1275 goto out_unlock;
1209 } 1276 }
1210 get_user_huge_page(page); 1277 get_page(page);
1211 spin_unlock(ptl); 1278 spin_unlock(ptl);
1212alloc: 1279alloc:
1213 if (transparent_hugepage_enabled(vma) && 1280 if (transparent_hugepage_enabled(vma) &&
@@ -1217,30 +1284,33 @@ alloc:
1217 } else 1284 } else
1218 new_page = NULL; 1285 new_page = NULL;
1219 1286
1220 if (unlikely(!new_page)) { 1287 if (likely(new_page)) {
1288 prep_transhuge_page(new_page);
1289 } else {
1221 if (!page) { 1290 if (!page) {
1222 split_huge_page_pmd(vma, address, pmd); 1291 split_huge_pmd(vma, pmd, address);
1223 ret |= VM_FAULT_FALLBACK; 1292 ret |= VM_FAULT_FALLBACK;
1224 } else { 1293 } else {
1225 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 1294 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
1226 pmd, orig_pmd, page, haddr); 1295 pmd, orig_pmd, page, haddr);
1227 if (ret & VM_FAULT_OOM) { 1296 if (ret & VM_FAULT_OOM) {
1228 split_huge_page(page); 1297 split_huge_pmd(vma, pmd, address);
1229 ret |= VM_FAULT_FALLBACK; 1298 ret |= VM_FAULT_FALLBACK;
1230 } 1299 }
1231 put_user_huge_page(page); 1300 put_page(page);
1232 } 1301 }
1233 count_vm_event(THP_FAULT_FALLBACK); 1302 count_vm_event(THP_FAULT_FALLBACK);
1234 goto out; 1303 goto out;
1235 } 1304 }
1236 1305
1237 if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) { 1306 if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
1307 true))) {
1238 put_page(new_page); 1308 put_page(new_page);
1239 if (page) { 1309 if (page) {
1240 split_huge_page(page); 1310 split_huge_pmd(vma, pmd, address);
1241 put_user_huge_page(page); 1311 put_page(page);
1242 } else 1312 } else
1243 split_huge_page_pmd(vma, address, pmd); 1313 split_huge_pmd(vma, pmd, address);
1244 ret |= VM_FAULT_FALLBACK; 1314 ret |= VM_FAULT_FALLBACK;
1245 count_vm_event(THP_FAULT_FALLBACK); 1315 count_vm_event(THP_FAULT_FALLBACK);
1246 goto out; 1316 goto out;
@@ -1260,10 +1330,10 @@ alloc:
1260 1330
1261 spin_lock(ptl); 1331 spin_lock(ptl);
1262 if (page) 1332 if (page)
1263 put_user_huge_page(page); 1333 put_page(page);
1264 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1334 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
1265 spin_unlock(ptl); 1335 spin_unlock(ptl);
1266 mem_cgroup_cancel_charge(new_page, memcg); 1336 mem_cgroup_cancel_charge(new_page, memcg, true);
1267 put_page(new_page); 1337 put_page(new_page);
1268 goto out_mn; 1338 goto out_mn;
1269 } else { 1339 } else {
@@ -1271,8 +1341,8 @@ alloc:
1271 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 1341 entry = mk_huge_pmd(new_page, vma->vm_page_prot);
1272 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1342 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1273 pmdp_huge_clear_flush_notify(vma, haddr, pmd); 1343 pmdp_huge_clear_flush_notify(vma, haddr, pmd);
1274 page_add_new_anon_rmap(new_page, vma, haddr); 1344 page_add_new_anon_rmap(new_page, vma, haddr, true);
1275 mem_cgroup_commit_charge(new_page, memcg, false); 1345 mem_cgroup_commit_charge(new_page, memcg, false, true);
1276 lru_cache_add_active_or_unevictable(new_page, vma); 1346 lru_cache_add_active_or_unevictable(new_page, vma);
1277 set_pmd_at(mm, haddr, pmd, entry); 1347 set_pmd_at(mm, haddr, pmd, entry);
1278 update_mmu_cache_pmd(vma, address, pmd); 1348 update_mmu_cache_pmd(vma, address, pmd);
@@ -1281,7 +1351,7 @@ alloc:
1281 put_huge_zero_page(); 1351 put_huge_zero_page();
1282 } else { 1352 } else {
1283 VM_BUG_ON_PAGE(!PageHead(page), page); 1353 VM_BUG_ON_PAGE(!PageHead(page), page);
1284 page_remove_rmap(page); 1354 page_remove_rmap(page, true);
1285 put_page(page); 1355 put_page(page);
1286 } 1356 }
1287 ret |= VM_FAULT_WRITE; 1357 ret |= VM_FAULT_WRITE;
@@ -1319,23 +1389,23 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1319 1389
1320 page = pmd_page(*pmd); 1390 page = pmd_page(*pmd);
1321 VM_BUG_ON_PAGE(!PageHead(page), page); 1391 VM_BUG_ON_PAGE(!PageHead(page), page);
1322 if (flags & FOLL_TOUCH) { 1392 if (flags & FOLL_TOUCH)
1323 pmd_t _pmd; 1393 touch_pmd(vma, addr, pmd);
1394 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1324 /* 1395 /*
1325 * We should set the dirty bit only for FOLL_WRITE but 1396 * We don't mlock() pte-mapped THPs. This way we can avoid
1326 * for now the dirty bit in the pmd is meaningless. 1397 * leaking mlocked pages into non-VM_LOCKED VMAs.
1327 * And if the dirty bit will become meaningful and 1398 *
1328 * we'll only set it with FOLL_WRITE, an atomic 1399 * In most cases the pmd is the only mapping of the page as we
1329 * set_bit will be required on the pmd to set the 1400 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
1330 * young bit, instead of the current set_pmd_at. 1401 * writable private mappings in populate_vma_page_range().
1402 *
1403 * The only scenario when we have the page shared here is if we
1404 * mlocking read-only mapping shared over fork(). We skip
1405 * mlocking such pages.
1331 */ 1406 */
1332 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); 1407 if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
1333 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, 1408 page->mapping && trylock_page(page)) {
1334 pmd, _pmd, 1))
1335 update_mmu_cache_pmd(vma, addr, pmd);
1336 }
1337 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1338 if (page->mapping && trylock_page(page)) {
1339 lru_add_drain(); 1409 lru_add_drain();
1340 if (page->mapping) 1410 if (page->mapping)
1341 mlock_vma_page(page); 1411 mlock_vma_page(page);
@@ -1345,7 +1415,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1345 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1415 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1346 VM_BUG_ON_PAGE(!PageCompound(page), page); 1416 VM_BUG_ON_PAGE(!PageCompound(page), page);
1347 if (flags & FOLL_GET) 1417 if (flags & FOLL_GET)
1348 get_page_foll(page); 1418 get_page(page);
1349 1419
1350out: 1420out:
1351 return page; 1421 return page;
@@ -1480,13 +1550,84 @@ out:
1480 return 0; 1550 return 0;
1481} 1551}
1482 1552
1553int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1554 pmd_t *pmd, unsigned long addr, unsigned long next)
1555
1556{
1557 spinlock_t *ptl;
1558 pmd_t orig_pmd;
1559 struct page *page;
1560 struct mm_struct *mm = tlb->mm;
1561 int ret = 0;
1562
1563 if (!pmd_trans_huge_lock(pmd, vma, &ptl))
1564 goto out;
1565
1566 orig_pmd = *pmd;
1567 if (is_huge_zero_pmd(orig_pmd)) {
1568 ret = 1;
1569 goto out;
1570 }
1571
1572 page = pmd_page(orig_pmd);
1573 /*
 1574 * If other processes are mapping this page, we can't discard
 1575 * the page unless they all do MADV_FREE, so let's skip the page.
1576 */
1577 if (page_mapcount(page) != 1)
1578 goto out;
1579
1580 if (!trylock_page(page))
1581 goto out;
1582
1583 /*
 1584 * If the user wants to discard only part of the THP, split it so
 1585 * MADV_FREE will deactivate only those pages.
1586 */
1587 if (next - addr != HPAGE_PMD_SIZE) {
1588 get_page(page);
1589 spin_unlock(ptl);
1590 if (split_huge_page(page)) {
1591 put_page(page);
1592 unlock_page(page);
1593 goto out_unlocked;
1594 }
1595 put_page(page);
1596 unlock_page(page);
1597 ret = 1;
1598 goto out_unlocked;
1599 }
1600
1601 if (PageDirty(page))
1602 ClearPageDirty(page);
1603 unlock_page(page);
1604
1605 if (PageActive(page))
1606 deactivate_page(page);
1607
1608 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
1609 orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
1610 tlb->fullmm);
1611 orig_pmd = pmd_mkold(orig_pmd);
1612 orig_pmd = pmd_mkclean(orig_pmd);
1613
1614 set_pmd_at(mm, addr, pmd, orig_pmd);
1615 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1616 }
1617 ret = 1;
1618out:
1619 spin_unlock(ptl);
1620out_unlocked:
1621 return ret;
1622}
1623
1483int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1624int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1484 pmd_t *pmd, unsigned long addr) 1625 pmd_t *pmd, unsigned long addr)
1485{ 1626{
1486 pmd_t orig_pmd; 1627 pmd_t orig_pmd;
1487 spinlock_t *ptl; 1628 spinlock_t *ptl;
1488 1629
1489 if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1) 1630 if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
1490 return 0; 1631 return 0;
1491 /* 1632 /*
1492 * For architectures like ppc64 we look at deposited pgtable 1633 * For architectures like ppc64 we look at deposited pgtable
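
madvise_free_huge_pmd() above implements MADV_FREE for a whole PMD: it skips the page if other processes map it, splits the THP when only part of the range is covered, and otherwise cleans and deactivates the page while marking the PMD old and clean so reclaim can later discard the memory instead of swapping it. The sketch below models only the final PMD bit manipulation in userspace; the bit positions are invented for the model and no page-table locking or TLB flushing is shown.

#include <stdio.h>

typedef unsigned long fake_pmd_t;

#define PMD_DIRTY  (1UL << 0)   /* invented bit positions for the model */
#define PMD_YOUNG  (1UL << 1)

static int        pmd_dirty(fake_pmd_t pmd)   { return !!(pmd & PMD_DIRTY); }
static int        pmd_young(fake_pmd_t pmd)   { return !!(pmd & PMD_YOUNG); }
static fake_pmd_t pmd_mkclean(fake_pmd_t pmd) { return pmd & ~PMD_DIRTY; }
static fake_pmd_t pmd_mkold(fake_pmd_t pmd)   { return pmd & ~PMD_YOUNG; }

/* Mirrors the tail of madvise_free_huge_pmd(): only touch the entry if it
 * is young or dirty, then age and clean it (the kernel also clears and
 * re-installs the PMD and flushes the TLB entry).
 */
static void madvise_free_pmd(fake_pmd_t *pmdp)
{
    fake_pmd_t pmd = *pmdp;

    if (pmd_young(pmd) || pmd_dirty(pmd)) {
        pmd = pmd_mkold(pmd);
        pmd = pmd_mkclean(pmd);
        *pmdp = pmd;
    }
}

int main(void)
{
    fake_pmd_t pmd = PMD_DIRTY | PMD_YOUNG | (0xabcUL << 12);

    madvise_free_pmd(&pmd);
    printf("young=%d dirty=%d pfn bits=%#lx\n",
           pmd_young(pmd), pmd_dirty(pmd), pmd >> 12);
    return 0;
}
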
@@ -1508,7 +1649,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1508 put_huge_zero_page(); 1649 put_huge_zero_page();
1509 } else { 1650 } else {
1510 struct page *page = pmd_page(orig_pmd); 1651 struct page *page = pmd_page(orig_pmd);
1511 page_remove_rmap(page); 1652 page_remove_rmap(page, true);
1512 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); 1653 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1513 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1654 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1514 VM_BUG_ON_PAGE(!PageHead(page), page); 1655 VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -1520,13 +1661,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1520 return 1; 1661 return 1;
1521} 1662}
1522 1663
1523int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, 1664bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1524 unsigned long old_addr, 1665 unsigned long old_addr,
1525 unsigned long new_addr, unsigned long old_end, 1666 unsigned long new_addr, unsigned long old_end,
1526 pmd_t *old_pmd, pmd_t *new_pmd) 1667 pmd_t *old_pmd, pmd_t *new_pmd)
1527{ 1668{
1528 spinlock_t *old_ptl, *new_ptl; 1669 spinlock_t *old_ptl, *new_ptl;
1529 int ret = 0;
1530 pmd_t pmd; 1670 pmd_t pmd;
1531 1671
1532 struct mm_struct *mm = vma->vm_mm; 1672 struct mm_struct *mm = vma->vm_mm;
@@ -1535,7 +1675,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1535 (new_addr & ~HPAGE_PMD_MASK) || 1675 (new_addr & ~HPAGE_PMD_MASK) ||
1536 old_end - old_addr < HPAGE_PMD_SIZE || 1676 old_end - old_addr < HPAGE_PMD_SIZE ||
1537 (new_vma->vm_flags & VM_NOHUGEPAGE)) 1677 (new_vma->vm_flags & VM_NOHUGEPAGE))
1538 goto out; 1678 return false;
1539 1679
1540 /* 1680 /*
1541 * The destination pmd shouldn't be established, free_pgtables() 1681 * The destination pmd shouldn't be established, free_pgtables()
@@ -1543,15 +1683,14 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1543 */ 1683 */
1544 if (WARN_ON(!pmd_none(*new_pmd))) { 1684 if (WARN_ON(!pmd_none(*new_pmd))) {
1545 VM_BUG_ON(pmd_trans_huge(*new_pmd)); 1685 VM_BUG_ON(pmd_trans_huge(*new_pmd));
1546 goto out; 1686 return false;
1547 } 1687 }
1548 1688
1549 /* 1689 /*
1550 * We don't have to worry about the ordering of src and dst 1690 * We don't have to worry about the ordering of src and dst
1551 * ptlocks because exclusive mmap_sem prevents deadlock. 1691 * ptlocks because exclusive mmap_sem prevents deadlock.
1552 */ 1692 */
1553 ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); 1693 if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
1554 if (ret == 1) {
1555 new_ptl = pmd_lockptr(mm, new_pmd); 1694 new_ptl = pmd_lockptr(mm, new_pmd);
1556 if (new_ptl != old_ptl) 1695 if (new_ptl != old_ptl)
1557 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 1696 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -1567,9 +1706,9 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1567 if (new_ptl != old_ptl) 1706 if (new_ptl != old_ptl)
1568 spin_unlock(new_ptl); 1707 spin_unlock(new_ptl);
1569 spin_unlock(old_ptl); 1708 spin_unlock(old_ptl);
1709 return true;
1570 } 1710 }
1571out: 1711 return false;
1572 return ret;
1573} 1712}
1574 1713
1575/* 1714/*
@@ -1585,7 +1724,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1585 spinlock_t *ptl; 1724 spinlock_t *ptl;
1586 int ret = 0; 1725 int ret = 0;
1587 1726
1588 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1727 if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
1589 pmd_t entry; 1728 pmd_t entry;
1590 bool preserve_write = prot_numa && pmd_write(*pmd); 1729 bool preserve_write = prot_numa && pmd_write(*pmd);
1591 ret = 1; 1730 ret = 1;
@@ -1616,405 +1755,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1616} 1755}
1617 1756
1618/* 1757/*
1619 * Returns 1 if a given pmd maps a stable (not under splitting) thp. 1758 * Returns true if a given pmd maps a thp, false otherwise.
1620 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
1621 * 1759 *
1622 * Note that if it returns 1, this routine returns without unlocking page 1760 * Note that if it returns true, this routine returns without unlocking page
1623 * table locks. So callers must unlock them. 1761 * table lock. So callers must unlock it.
1624 */ 1762 */
1625int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, 1763bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
1626 spinlock_t **ptl) 1764 spinlock_t **ptl)
1627{ 1765{
1628 *ptl = pmd_lock(vma->vm_mm, pmd); 1766 *ptl = pmd_lock(vma->vm_mm, pmd);
1629 if (likely(pmd_trans_huge(*pmd))) { 1767 if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
1630 if (unlikely(pmd_trans_splitting(*pmd))) { 1768 return true;
1631 spin_unlock(*ptl);
1632 wait_split_huge_page(vma->anon_vma, pmd);
1633 return -1;
1634 } else {
1635 /* Thp mapped by 'pmd' is stable, so we can
1636 * handle it as it is. */
1637 return 1;
1638 }
1639 }
1640 spin_unlock(*ptl);
1641 return 0;
1642}
1643
1644/*
1645 * This function returns whether a given @page is mapped onto the @address
1646 * in the virtual space of @mm.
1647 *
1648 * When it's true, this function returns *pmd with holding the page table lock
1649 * and passing it back to the caller via @ptl.
1650 * If it's false, returns NULL without holding the page table lock.
1651 */
1652pmd_t *page_check_address_pmd(struct page *page,
1653 struct mm_struct *mm,
1654 unsigned long address,
1655 enum page_check_address_pmd_flag flag,
1656 spinlock_t **ptl)
1657{
1658 pgd_t *pgd;
1659 pud_t *pud;
1660 pmd_t *pmd;
1661
1662 if (address & ~HPAGE_PMD_MASK)
1663 return NULL;
1664
1665 pgd = pgd_offset(mm, address);
1666 if (!pgd_present(*pgd))
1667 return NULL;
1668 pud = pud_offset(pgd, address);
1669 if (!pud_present(*pud))
1670 return NULL;
1671 pmd = pmd_offset(pud, address);
1672
1673 *ptl = pmd_lock(mm, pmd);
1674 if (!pmd_present(*pmd))
1675 goto unlock;
1676 if (pmd_page(*pmd) != page)
1677 goto unlock;
1678 /*
1679 * split_vma() may create temporary aliased mappings. There is
1680 * no risk as long as all huge pmd are found and have their
1681 * splitting bit set before __split_huge_page_refcount
1682 * runs. Finding the same huge pmd more than once during the
1683 * same rmap walk is not a problem.
1684 */
1685 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1686 pmd_trans_splitting(*pmd))
1687 goto unlock;
1688 if (pmd_trans_huge(*pmd)) {
1689 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1690 !pmd_trans_splitting(*pmd));
1691 return pmd;
1692 }
1693unlock:
1694 spin_unlock(*ptl); 1769 spin_unlock(*ptl);
1695 return NULL; 1770 return false;
1696}
1697
1698static int __split_huge_page_splitting(struct page *page,
1699 struct vm_area_struct *vma,
1700 unsigned long address)
1701{
1702 struct mm_struct *mm = vma->vm_mm;
1703 spinlock_t *ptl;
1704 pmd_t *pmd;
1705 int ret = 0;
1706 /* For mmu_notifiers */
1707 const unsigned long mmun_start = address;
1708 const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
1709
1710 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1711 pmd = page_check_address_pmd(page, mm, address,
1712 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
1713 if (pmd) {
1714 /*
1715 * We can't temporarily set the pmd to null in order
1716 * to split it, the pmd must remain marked huge at all
1717 * times or the VM won't take the pmd_trans_huge paths
1718 * and it won't wait on the anon_vma->root->rwsem to
1719 * serialize against split_huge_page*.
1720 */
1721 pmdp_splitting_flush(vma, address, pmd);
1722
1723 ret = 1;
1724 spin_unlock(ptl);
1725 }
1726 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1727
1728 return ret;
1729}
1730
1731static void __split_huge_page_refcount(struct page *page,
1732 struct list_head *list)
1733{
1734 int i;
1735 struct zone *zone = page_zone(page);
1736 struct lruvec *lruvec;
1737 int tail_count = 0;
1738
1739 /* prevent PageLRU to go away from under us, and freeze lru stats */
1740 spin_lock_irq(&zone->lru_lock);
1741 lruvec = mem_cgroup_page_lruvec(page, zone);
1742
1743 compound_lock(page);
1744 /* complete memcg works before add pages to LRU */
1745 mem_cgroup_split_huge_fixup(page);
1746
1747 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
1748 struct page *page_tail = page + i;
1749
1750 /* tail_page->_mapcount cannot change */
1751 BUG_ON(page_mapcount(page_tail) < 0);
1752 tail_count += page_mapcount(page_tail);
1753 /* check for overflow */
1754 BUG_ON(tail_count < 0);
1755 BUG_ON(atomic_read(&page_tail->_count) != 0);
1756 /*
1757 * tail_page->_count is zero and not changing from
1758 * under us. But get_page_unless_zero() may be running
1759 * from under us on the tail_page. If we used
1760 * atomic_set() below instead of atomic_add(), we
1761 * would then run atomic_set() concurrently with
1762 * get_page_unless_zero(), and atomic_set() is
1763 * implemented in C not using locked ops. spin_unlock
1764 * on x86 sometime uses locked ops because of PPro
1765 * errata 66, 92, so unless somebody can guarantee
1766 * atomic_set() here would be safe on all archs (and
1767 * not only on x86), it's safer to use atomic_add().
1768 */
1769 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
1770 &page_tail->_count);
1771
1772 /* after clearing PageTail the gup refcount can be released */
1773 smp_mb__after_atomic();
1774
1775 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1776 page_tail->flags |= (page->flags &
1777 ((1L << PG_referenced) |
1778 (1L << PG_swapbacked) |
1779 (1L << PG_mlocked) |
1780 (1L << PG_uptodate) |
1781 (1L << PG_active) |
1782 (1L << PG_unevictable)));
1783 page_tail->flags |= (1L << PG_dirty);
1784
1785 clear_compound_head(page_tail);
1786
1787 if (page_is_young(page))
1788 set_page_young(page_tail);
1789 if (page_is_idle(page))
1790 set_page_idle(page_tail);
1791
1792 /*
1793 * __split_huge_page_splitting() already set the
1794 * splitting bit in all pmd that could map this
1795 * hugepage, that will ensure no CPU can alter the
1796 * mapcount on the head page. The mapcount is only
1797 * accounted in the head page and it has to be
1798 * transferred to all tail pages in the below code. So
1799 * for this code to be safe, the mapcount can't change during
1800 * the split. But that doesn't mean userland can't
1801 * keep changing and reading the page contents while
1802 * we transfer the mapcount, so the pmd splitting
1803 * status is achieved setting a reserved bit in the
1804 * pmd, not by clearing the present bit.
1805 */
1806 page_tail->_mapcount = page->_mapcount;
1807
1808 BUG_ON(page_tail->mapping);
1809 page_tail->mapping = page->mapping;
1810
1811 page_tail->index = page->index + i;
1812 page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
1813
1814 BUG_ON(!PageAnon(page_tail));
1815 BUG_ON(!PageUptodate(page_tail));
1816 BUG_ON(!PageDirty(page_tail));
1817 BUG_ON(!PageSwapBacked(page_tail));
1818
1819 lru_add_page_tail(page, page_tail, lruvec, list);
1820 }
1821 atomic_sub(tail_count, &page->_count);
1822 BUG_ON(atomic_read(&page->_count) <= 0);
1823
1824 __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
1825
1826 ClearPageCompound(page);
1827 compound_unlock(page);
1828 spin_unlock_irq(&zone->lru_lock);
1829
1830 for (i = 1; i < HPAGE_PMD_NR; i++) {
1831 struct page *page_tail = page + i;
1832 BUG_ON(page_count(page_tail) <= 0);
1833 /*
1834 * Tail pages may be freed if there wasn't any mapping
1835 * like if add_to_swap() is running on a lru page that
1836 * had its mapping zapped. And freeing these pages
1837 * requires taking the lru_lock so we do the put_page
1838 * of the tail pages after the split is complete.
1839 */
1840 put_page(page_tail);
1841 }
1842
1843 /*
1844 * Only the head page (now become a regular page) is required
1845 * to be pinned by the caller.
1846 */
1847 BUG_ON(page_count(page) <= 0);
1848}
1849
1850static int __split_huge_page_map(struct page *page,
1851 struct vm_area_struct *vma,
1852 unsigned long address)
1853{
1854 struct mm_struct *mm = vma->vm_mm;
1855 spinlock_t *ptl;
1856 pmd_t *pmd, _pmd;
1857 int ret = 0, i;
1858 pgtable_t pgtable;
1859 unsigned long haddr;
1860
1861 pmd = page_check_address_pmd(page, mm, address,
1862 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
1863 if (pmd) {
1864 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1865 pmd_populate(mm, &_pmd, pgtable);
1866 if (pmd_write(*pmd))
1867 BUG_ON(page_mapcount(page) != 1);
1868
1869 haddr = address;
1870 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1871 pte_t *pte, entry;
1872 BUG_ON(PageCompound(page+i));
1873 /*
1874 * Note that NUMA hinting access restrictions are not
1875 * transferred to avoid any possibility of altering
1876 * permissions across VMAs.
1877 */
1878 entry = mk_pte(page + i, vma->vm_page_prot);
1879 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1880 if (!pmd_write(*pmd))
1881 entry = pte_wrprotect(entry);
1882 if (!pmd_young(*pmd))
1883 entry = pte_mkold(entry);
1884 pte = pte_offset_map(&_pmd, haddr);
1885 BUG_ON(!pte_none(*pte));
1886 set_pte_at(mm, haddr, pte, entry);
1887 pte_unmap(pte);
1888 }
1889
1890 smp_wmb(); /* make pte visible before pmd */
1891 /*
1892 * Up to this point the pmd is present and huge and
1893 * userland has the whole access to the hugepage
1894 * during the split (which happens in place). If we
1895 * overwrite the pmd with the not-huge version
1896 * pointing to the pte here (which of course we could
1897 * if all CPUs were bug free), userland could trigger
1898 * a small page size TLB miss on the small sized TLB
1899 * while the hugepage TLB entry is still established
1900 * in the huge TLB. Some CPU doesn't like that. See
1901 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
1902 * Erratum 383 on page 93. Intel should be safe but is
1903 * also warns that it's only safe if the permission
1904 * and cache attributes of the two entries loaded in
1905 * the two TLB is identical (which should be the case
1906 * here). But it is generally safer to never allow
1907 * small and huge TLB entries for the same virtual
1908 * address to be loaded simultaneously. So instead of
1909 * doing "pmd_populate(); flush_pmd_tlb_range();" we first
1910 * mark the current pmd notpresent (atomically because
1911 * here the pmd_trans_huge and pmd_trans_splitting
1912 * must remain set at all times on the pmd until the
1913 * split is complete for this pmd), then we flush the
1914 * SMP TLB and finally we write the non-huge version
1915 * of the pmd entry with pmd_populate.
1916 */
1917 pmdp_invalidate(vma, address, pmd);
1918 pmd_populate(mm, pmd, pgtable);
1919 ret = 1;
1920 spin_unlock(ptl);
1921 }
1922
1923 return ret;
1924}
1925
1926/* must be called with anon_vma->root->rwsem held */
1927static void __split_huge_page(struct page *page,
1928 struct anon_vma *anon_vma,
1929 struct list_head *list)
1930{
1931 int mapcount, mapcount2;
1932 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1933 struct anon_vma_chain *avc;
1934
1935 BUG_ON(!PageHead(page));
1936 BUG_ON(PageTail(page));
1937
1938 mapcount = 0;
1939 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1940 struct vm_area_struct *vma = avc->vma;
1941 unsigned long addr = vma_address(page, vma);
1942 BUG_ON(is_vma_temporary_stack(vma));
1943 mapcount += __split_huge_page_splitting(page, vma, addr);
1944 }
1945 /*
1946 * It is critical that new vmas are added to the tail of the
1947 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1948 * and establishes a child pmd before
1949 * __split_huge_page_splitting() freezes the parent pmd (so if
1950 * we fail to prevent copy_huge_pmd() from running until the
1951 * whole __split_huge_page() is complete), we will still see
1952 * the newly established pmd of the child later during the
1953 * walk, to be able to set it as pmd_trans_splitting too.
1954 */
1955 if (mapcount != page_mapcount(page)) {
1956 pr_err("mapcount %d page_mapcount %d\n",
1957 mapcount, page_mapcount(page));
1958 BUG();
1959 }
1960
1961 __split_huge_page_refcount(page, list);
1962
1963 mapcount2 = 0;
1964 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1965 struct vm_area_struct *vma = avc->vma;
1966 unsigned long addr = vma_address(page, vma);
1967 BUG_ON(is_vma_temporary_stack(vma));
1968 mapcount2 += __split_huge_page_map(page, vma, addr);
1969 }
1970 if (mapcount != mapcount2) {
1971 pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
1972 mapcount, mapcount2, page_mapcount(page));
1973 BUG();
1974 }
1975}
1976
1977/*
1978 * Split a hugepage into normal pages. This doesn't change the position of head
1979 * page. If @list is null, tail pages will be added to LRU list, otherwise, to
1980 * @list. Both head page and tail pages will inherit mapping, flags, and so on
1981 * from the hugepage.
1982 * Return 0 if the hugepage is split successfully otherwise return 1.
1983 */
1984int split_huge_page_to_list(struct page *page, struct list_head *list)
1985{
1986 struct anon_vma *anon_vma;
1987 int ret = 1;
1988
1989 BUG_ON(is_huge_zero_page(page));
1990 BUG_ON(!PageAnon(page));
1991
1992 /*
1993 * The caller does not necessarily hold an mmap_sem that would prevent
1994 * the anon_vma disappearing so we first we take a reference to it
1995 * and then lock the anon_vma for write. This is similar to
1996 * page_lock_anon_vma_read except the write lock is taken to serialise
1997 * against parallel split or collapse operations.
1998 */
1999 anon_vma = page_get_anon_vma(page);
2000 if (!anon_vma)
2001 goto out;
2002 anon_vma_lock_write(anon_vma);
2003
2004 ret = 0;
2005 if (!PageCompound(page))
2006 goto out_unlock;
2007
2008 BUG_ON(!PageSwapBacked(page));
2009 __split_huge_page(page, anon_vma, list);
2010 count_vm_event(THP_SPLIT);
2011
2012 BUG_ON(PageCompound(page));
2013out_unlock:
2014 anon_vma_unlock_write(anon_vma);
2015 put_anon_vma(anon_vma);
2016out:
2017 return ret;
2018} 1771}
2019 1772
2020#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) 1773#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
@@ -2371,7 +2124,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2371 * superfluous. 2124 * superfluous.
2372 */ 2125 */
2373 pte_clear(vma->vm_mm, address, _pte); 2126 pte_clear(vma->vm_mm, address, _pte);
2374 page_remove_rmap(src_page); 2127 page_remove_rmap(src_page, false);
2375 spin_unlock(ptl); 2128 spin_unlock(ptl);
2376 free_page_and_swap_cache(src_page); 2129 free_page_and_swap_cache(src_page);
2377 } 2130 }
@@ -2481,6 +2234,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2481 return NULL; 2234 return NULL;
2482 } 2235 }
2483 2236
2237 prep_transhuge_page(*hpage);
2484 count_vm_event(THP_COLLAPSE_ALLOC); 2238 count_vm_event(THP_COLLAPSE_ALLOC);
2485 return *hpage; 2239 return *hpage;
2486} 2240}
@@ -2492,8 +2246,12 @@ static int khugepaged_find_target_node(void)
2492 2246
2493static inline struct page *alloc_hugepage(int defrag) 2247static inline struct page *alloc_hugepage(int defrag)
2494{ 2248{
2495 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), 2249 struct page *page;
2496 HPAGE_PMD_ORDER); 2250
2251 page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
2252 if (page)
2253 prep_transhuge_page(page);
2254 return page;
2497} 2255}
2498 2256
2499static struct page *khugepaged_alloc_hugepage(bool *wait) 2257static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -2543,7 +2301,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
2543 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || 2301 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
2544 (vma->vm_flags & VM_NOHUGEPAGE)) 2302 (vma->vm_flags & VM_NOHUGEPAGE))
2545 return false; 2303 return false;
2546
2547 if (!vma->anon_vma || vma->vm_ops) 2304 if (!vma->anon_vma || vma->vm_ops)
2548 return false; 2305 return false;
2549 if (is_vma_temporary_stack(vma)) 2306 if (is_vma_temporary_stack(vma))
@@ -2583,7 +2340,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2583 goto out_nolock; 2340 goto out_nolock;
2584 } 2341 }
2585 2342
2586 if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) { 2343 if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
2587 result = SCAN_CGROUP_CHARGE_FAIL; 2344 result = SCAN_CGROUP_CHARGE_FAIL;
2588 goto out_nolock; 2345 goto out_nolock;
2589 } 2346 }
@@ -2682,8 +2439,8 @@ static void collapse_huge_page(struct mm_struct *mm,
2682 2439
2683 spin_lock(pmd_ptl); 2440 spin_lock(pmd_ptl);
2684 BUG_ON(!pmd_none(*pmd)); 2441 BUG_ON(!pmd_none(*pmd));
2685 page_add_new_anon_rmap(new_page, vma, address); 2442 page_add_new_anon_rmap(new_page, vma, address, true);
2686 mem_cgroup_commit_charge(new_page, memcg, false); 2443 mem_cgroup_commit_charge(new_page, memcg, false, true);
2687 lru_cache_add_active_or_unevictable(new_page, vma); 2444 lru_cache_add_active_or_unevictable(new_page, vma);
2688 pgtable_trans_huge_deposit(mm, pmd, pgtable); 2445 pgtable_trans_huge_deposit(mm, pmd, pgtable);
2689 set_pmd_at(mm, address, pmd, _pmd); 2446 set_pmd_at(mm, address, pmd, _pmd);
@@ -2703,7 +2460,7 @@ out_nolock:
2703 trace_mm_collapse_huge_page(mm, isolated, result); 2460 trace_mm_collapse_huge_page(mm, isolated, result);
2704 return; 2461 return;
2705out: 2462out:
2706 mem_cgroup_cancel_charge(new_page, memcg); 2463 mem_cgroup_cancel_charge(new_page, memcg, true);
2707 goto out_up_write; 2464 goto out_up_write;
2708} 2465}
2709 2466
@@ -2755,6 +2512,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2755 result = SCAN_PAGE_NULL; 2512 result = SCAN_PAGE_NULL;
2756 goto out_unmap; 2513 goto out_unmap;
2757 } 2514 }
2515
2516 /* TODO: teach khugepaged to collapse THP mapped with pte */
2517 if (PageCompound(page)) {
2518 result = SCAN_PAGE_COMPOUND;
2519 goto out_unmap;
2520 }
2521
2758 /* 2522 /*
2759 * Record which node the original page is from and save this 2523 * Record which node the original page is from and save this
2760 * information to khugepaged_node_load[]. 2524 * information to khugepaged_node_load[].
@@ -2767,7 +2531,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2767 goto out_unmap; 2531 goto out_unmap;
2768 } 2532 }
2769 khugepaged_node_load[node]++; 2533 khugepaged_node_load[node]++;
2770 VM_BUG_ON_PAGE(PageCompound(page), page);
2771 if (!PageLRU(page)) { 2534 if (!PageLRU(page)) {
2772 result = SCAN_SCAN_ABORT; 2535 result = SCAN_SCAN_ABORT;
2773 goto out_unmap; 2536 goto out_unmap;
@@ -3040,8 +2803,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
3040 pmd_t _pmd; 2803 pmd_t _pmd;
3041 int i; 2804 int i;
3042 2805
3043 pmdp_huge_clear_flush_notify(vma, haddr, pmd);
3044 /* leave pmd empty until pte is filled */ 2806 /* leave pmd empty until pte is filled */
2807 pmdp_huge_clear_flush_notify(vma, haddr, pmd);
3045 2808
3046 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2809 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
3047 pmd_populate(mm, &_pmd, pgtable); 2810 pmd_populate(mm, &_pmd, pgtable);
@@ -3060,66 +2823,153 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
3060 put_huge_zero_page(); 2823 put_huge_zero_page();
3061} 2824}
3062 2825
3063void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, 2826static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
3064 pmd_t *pmd) 2827 unsigned long haddr, bool freeze)
3065{ 2828{
3066 spinlock_t *ptl;
3067 struct page *page = NULL;
3068 struct mm_struct *mm = vma->vm_mm; 2829 struct mm_struct *mm = vma->vm_mm;
3069 unsigned long haddr = address & HPAGE_PMD_MASK; 2830 struct page *page;
3070 unsigned long mmun_start; /* For mmu_notifiers */ 2831 pgtable_t pgtable;
3071 unsigned long mmun_end; /* For mmu_notifiers */ 2832 pmd_t _pmd;
2833 bool young, write, dirty;
2834 int i;
3072 2835
3073 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); 2836 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2837 VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2838 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2839 VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
2840
2841 count_vm_event(THP_SPLIT_PMD);
3074 2842
3075 mmun_start = haddr;
3076 mmun_end = haddr + HPAGE_PMD_SIZE;
3077again:
3078 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3079 ptl = pmd_lock(mm, pmd);
3080 if (unlikely(!pmd_trans_huge(*pmd)))
3081 goto unlock;
3082 if (vma_is_dax(vma)) { 2843 if (vma_is_dax(vma)) {
3083 pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); 2844 pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
3084 if (is_huge_zero_pmd(_pmd)) 2845 if (is_huge_zero_pmd(_pmd))
3085 put_huge_zero_page(); 2846 put_huge_zero_page();
2847 return;
3086 } else if (is_huge_zero_pmd(*pmd)) { 2848 } else if (is_huge_zero_pmd(*pmd)) {
3087 __split_huge_zero_page_pmd(vma, haddr, pmd); 2849 return __split_huge_zero_page_pmd(vma, haddr, pmd);
3088 } else {
3089 page = pmd_page(*pmd);
3090 VM_BUG_ON_PAGE(!page_count(page), page);
3091 get_page(page);
3092 } 2850 }
3093 unlock:
3094 spin_unlock(ptl);
3095 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
3096 2851
3097 if (!page) 2852 page = pmd_page(*pmd);
3098 return; 2853 VM_BUG_ON_PAGE(!page_count(page), page);
2854 atomic_add(HPAGE_PMD_NR - 1, &page->_count);
2855 write = pmd_write(*pmd);
2856 young = pmd_young(*pmd);
2857 dirty = pmd_dirty(*pmd);
3099 2858
3100 split_huge_page(page); 2859 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
3101 put_page(page); 2860 pmd_populate(mm, &_pmd, pgtable);
3102 2861
2862 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2863 pte_t entry, *pte;
2864 /*
2865 * Note that NUMA hinting access restrictions are not
2866 * transferred to avoid any possibility of altering
2867 * permissions across VMAs.
2868 */
2869 if (freeze) {
2870 swp_entry_t swp_entry;
2871 swp_entry = make_migration_entry(page + i, write);
2872 entry = swp_entry_to_pte(swp_entry);
2873 } else {
2874 entry = mk_pte(page + i, vma->vm_page_prot);
2875 entry = maybe_mkwrite(entry, vma);
2876 if (!write)
2877 entry = pte_wrprotect(entry);
2878 if (!young)
2879 entry = pte_mkold(entry);
2880 }
2881 if (dirty)
2882 SetPageDirty(page + i);
2883 pte = pte_offset_map(&_pmd, haddr);
2884 BUG_ON(!pte_none(*pte));
2885 set_pte_at(mm, haddr, pte, entry);
2886 atomic_inc(&page[i]._mapcount);
2887 pte_unmap(pte);
2888 }
2889
2890 /*
2891 * Set PG_double_map before dropping compound_mapcount to avoid
2892 * false-negative page_mapped().
2893 */
2894 if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
2895 for (i = 0; i < HPAGE_PMD_NR; i++)
2896 atomic_inc(&page[i]._mapcount);
2897 }
2898
2899 if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
2900 /* Last compound_mapcount is gone. */
2901 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
2902 if (TestClearPageDoubleMap(page)) {
2903 /* No need in mapcount reference anymore */
2904 for (i = 0; i < HPAGE_PMD_NR; i++)
2905 atomic_dec(&page[i]._mapcount);
2906 }
2907 }
2908
2909 smp_wmb(); /* make pte visible before pmd */
3103 /* 2910 /*
3104 * We don't always have down_write of mmap_sem here: a racing 2911 * Up to this point the pmd is present and huge and userland has the
3105 * do_huge_pmd_wp_page() might have copied-on-write to another 2912 * whole access to the hugepage during the split (which happens in
3106 * huge page before our split_huge_page() got the anon_vma lock. 2913 * place). If we overwrite the pmd with the not-huge version pointing
2914 * to the pte here (which of course we could if all CPUs were bug
2915 * free), userland could trigger a small page size TLB miss on the
2916 * small sized TLB while the hugepage TLB entry is still established in
2917 * the huge TLB. Some CPU doesn't like that.
2918 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
2919 * 383 on page 93. Intel should be safe but it also warns that it's
2920 * only safe if the permission and cache attributes of the two entries
2921 * loaded in the TLB are identical (which should be the case here).
2922 * But it is generally safer to never allow small and huge TLB entries
2923 * for the same virtual address to be loaded simultaneously. So instead
2924 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
2925 * current pmd notpresent (atomically because here the pmd_trans_huge
2926 * and pmd_trans_splitting must remain set at all times on the pmd
2927 * until the split is complete for this pmd), then we flush the SMP TLB
2928 * and finally we write the non-huge version of the pmd entry with
2929 * pmd_populate.
3107 */ 2930 */
3108 if (unlikely(pmd_trans_huge(*pmd))) 2931 pmdp_invalidate(vma, haddr, pmd);
3109 goto again; 2932 pmd_populate(mm, pmd, pgtable);
2933
2934 if (freeze) {
2935 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2936 page_remove_rmap(page + i, false);
2937 put_page(page + i);
2938 }
2939 }
3110} 2940}
3111 2941
3112void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, 2942void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
3113 pmd_t *pmd) 2943 unsigned long address)
3114{ 2944{
3115 struct vm_area_struct *vma; 2945 spinlock_t *ptl;
2946 struct mm_struct *mm = vma->vm_mm;
2947 struct page *page = NULL;
2948 unsigned long haddr = address & HPAGE_PMD_MASK;
3116 2949
3117 vma = find_vma(mm, address); 2950 mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
3118 BUG_ON(vma == NULL); 2951 ptl = pmd_lock(mm, pmd);
3119 split_huge_page_pmd(vma, address, pmd); 2952 if (pmd_trans_huge(*pmd)) {
2953 page = pmd_page(*pmd);
2954 if (PageMlocked(page))
2955 get_page(page);
2956 else
2957 page = NULL;
2958 } else if (!pmd_devmap(*pmd))
2959 goto out;
2960 __split_huge_pmd_locked(vma, pmd, haddr, false);
2961out:
2962 spin_unlock(ptl);
2963 mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
2964 if (page) {
2965 lock_page(page);
2966 munlock_vma_page(page);
2967 unlock_page(page);
2968 put_page(page);
2969 }
3120} 2970}
3121 2971
3122static void split_huge_page_address(struct mm_struct *mm, 2972static void split_huge_pmd_address(struct vm_area_struct *vma,
3123 unsigned long address) 2973 unsigned long address)
3124{ 2974{
3125 pgd_t *pgd; 2975 pgd_t *pgd;
@@ -3128,7 +2978,7 @@ static void split_huge_page_address(struct mm_struct *mm,
3128 2978
3129 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2979 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
3130 2980
3131 pgd = pgd_offset(mm, address); 2981 pgd = pgd_offset(vma->vm_mm, address);
3132 if (!pgd_present(*pgd)) 2982 if (!pgd_present(*pgd))
3133 return; 2983 return;
3134 2984
@@ -3137,13 +2987,13 @@ static void split_huge_page_address(struct mm_struct *mm,
3137 return; 2987 return;
3138 2988
3139 pmd = pmd_offset(pud, address); 2989 pmd = pmd_offset(pud, address);
3140 if (!pmd_present(*pmd)) 2990 if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
3141 return; 2991 return;
3142 /* 2992 /*
3143 * Caller holds the mmap_sem write mode, so a huge pmd cannot 2993 * Caller holds the mmap_sem write mode, so a huge pmd cannot
3144 * materialize from under us. 2994 * materialize from under us.
3145 */ 2995 */
3146 split_huge_page_pmd_mm(mm, address, pmd); 2996 split_huge_pmd(vma, pmd, address);
3147} 2997}
3148 2998
3149void vma_adjust_trans_huge(struct vm_area_struct *vma, 2999void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3159,7 +3009,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
3159 if (start & ~HPAGE_PMD_MASK && 3009 if (start & ~HPAGE_PMD_MASK &&
3160 (start & HPAGE_PMD_MASK) >= vma->vm_start && 3010 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
3161 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 3011 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
3162 split_huge_page_address(vma->vm_mm, start); 3012 split_huge_pmd_address(vma, start);
3163 3013
3164 /* 3014 /*
3165 * If the new end address isn't hpage aligned and it could 3015 * If the new end address isn't hpage aligned and it could
@@ -3169,7 +3019,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
3169 if (end & ~HPAGE_PMD_MASK && 3019 if (end & ~HPAGE_PMD_MASK &&
3170 (end & HPAGE_PMD_MASK) >= vma->vm_start && 3020 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
3171 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 3021 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
3172 split_huge_page_address(vma->vm_mm, end); 3022 split_huge_pmd_address(vma, end);
3173 3023
3174 /* 3024 /*
3175 * If we're also updating the vma->vm_next->vm_start, if the new 3025 * If we're also updating the vma->vm_next->vm_start, if the new
@@ -3183,6 +3033,540 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
3183 if (nstart & ~HPAGE_PMD_MASK && 3033 if (nstart & ~HPAGE_PMD_MASK &&
3184 (nstart & HPAGE_PMD_MASK) >= next->vm_start && 3034 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
3185 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) 3035 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
3186 split_huge_page_address(next->vm_mm, nstart); 3036 split_huge_pmd_address(next, nstart);
3037 }
3038}
3039
3040static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
3041 unsigned long address)
3042{
3043 unsigned long haddr = address & HPAGE_PMD_MASK;
3044 spinlock_t *ptl;
3045 pgd_t *pgd;
3046 pud_t *pud;
3047 pmd_t *pmd;
3048 pte_t *pte;
3049 int i, nr = HPAGE_PMD_NR;
3050
3051	 /* Skip pages which don't belong to the VMA */
3052 if (address < vma->vm_start) {
3053 int off = (vma->vm_start - address) >> PAGE_SHIFT;
3054 page += off;
3055 nr -= off;
3056 address = vma->vm_start;
3057 }
3058
3059 pgd = pgd_offset(vma->vm_mm, address);
3060 if (!pgd_present(*pgd))
3061 return;
3062 pud = pud_offset(pgd, address);
3063 if (!pud_present(*pud))
3064 return;
3065 pmd = pmd_offset(pud, address);
3066 ptl = pmd_lock(vma->vm_mm, pmd);
3067 if (!pmd_present(*pmd)) {
3068 spin_unlock(ptl);
3069 return;
3070 }
3071 if (pmd_trans_huge(*pmd)) {
3072 if (page == pmd_page(*pmd))
3073 __split_huge_pmd_locked(vma, pmd, haddr, true);
3074 spin_unlock(ptl);
3075 return;
3076 }
3077 spin_unlock(ptl);
3078
3079 pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
3080 for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
3081 pte_t entry, swp_pte;
3082 swp_entry_t swp_entry;
3083
3084 /*
3085		 * We've just crossed a page table boundary: need to map the next one.
3086		 * It can happen if the THP was mremapped to a non-PMD-aligned address.
3087 */
3088 if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
3089 pte_unmap_unlock(pte - 1, ptl);
3090 pmd = mm_find_pmd(vma->vm_mm, address);
3091 if (!pmd)
3092 return;
3093 pte = pte_offset_map_lock(vma->vm_mm, pmd,
3094 address, &ptl);
3095 }
3096
3097 if (!pte_present(*pte))
3098 continue;
3099 if (page_to_pfn(page) != pte_pfn(*pte))
3100 continue;
3101 flush_cache_page(vma, address, page_to_pfn(page));
3102 entry = ptep_clear_flush(vma, address, pte);
3103 if (pte_dirty(entry))
3104 SetPageDirty(page);
3105 swp_entry = make_migration_entry(page, pte_write(entry));
3106 swp_pte = swp_entry_to_pte(swp_entry);
3107 if (pte_soft_dirty(entry))
3108 swp_pte = pte_swp_mksoft_dirty(swp_pte);
3109 set_pte_at(vma->vm_mm, address, pte, swp_pte);
3110 page_remove_rmap(page, false);
3111 put_page(page);
3112 }
3113 pte_unmap_unlock(pte - 1, ptl);
3114}
3115
3116static void freeze_page(struct anon_vma *anon_vma, struct page *page)
3117{
3118 struct anon_vma_chain *avc;
3119 pgoff_t pgoff = page_to_pgoff(page);
3120
3121 VM_BUG_ON_PAGE(!PageHead(page), page);
3122
3123 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
3124 pgoff + HPAGE_PMD_NR - 1) {
3125 unsigned long address = __vma_address(page, avc->vma);
3126
3127 mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
3128 address, address + HPAGE_PMD_SIZE);
3129 freeze_page_vma(avc->vma, page, address);
3130 mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
3131 address, address + HPAGE_PMD_SIZE);
3132 }
3133}
3134
3135static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
3136 unsigned long address)
3137{
3138 spinlock_t *ptl;
3139 pmd_t *pmd;
3140 pte_t *pte, entry;
3141 swp_entry_t swp_entry;
3142 unsigned long haddr = address & HPAGE_PMD_MASK;
3143 int i, nr = HPAGE_PMD_NR;
3144
3145	 /* Skip pages which don't belong to the VMA */
3146 if (address < vma->vm_start) {
3147 int off = (vma->vm_start - address) >> PAGE_SHIFT;
3148 page += off;
3149 nr -= off;
3150 address = vma->vm_start;
3151 }
3152
3153 pmd = mm_find_pmd(vma->vm_mm, address);
3154 if (!pmd)
3155 return;
3156
3157 pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
3158 for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
3159 /*
3160		 * We've just crossed a page table boundary: need to map the next one.
3161		 * It can happen if the THP was mremapped to a non-PMD-aligned address.
3162 */
3163 if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
3164 pte_unmap_unlock(pte - 1, ptl);
3165 pmd = mm_find_pmd(vma->vm_mm, address);
3166 if (!pmd)
3167 return;
3168 pte = pte_offset_map_lock(vma->vm_mm, pmd,
3169 address, &ptl);
3170 }
3171
3172 if (!is_swap_pte(*pte))
3173 continue;
3174
3175 swp_entry = pte_to_swp_entry(*pte);
3176 if (!is_migration_entry(swp_entry))
3177 continue;
3178 if (migration_entry_to_page(swp_entry) != page)
3179 continue;
3180
3181 get_page(page);
3182 page_add_anon_rmap(page, vma, address, false);
3183
3184 entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
3185 if (PageDirty(page))
3186 entry = pte_mkdirty(entry);
3187 if (is_write_migration_entry(swp_entry))
3188 entry = maybe_mkwrite(entry, vma);
3189
3190 flush_dcache_page(page);
3191 set_pte_at(vma->vm_mm, address, pte, entry);
3192
3193 /* No need to invalidate - it was non-present before */
3194 update_mmu_cache(vma, address, pte);
3195 }
3196 pte_unmap_unlock(pte - 1, ptl);
3197}
3198
3199static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
3200{
3201 struct anon_vma_chain *avc;
3202 pgoff_t pgoff = page_to_pgoff(page);
3203
3204 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
3205 pgoff, pgoff + HPAGE_PMD_NR - 1) {
3206 unsigned long address = __vma_address(page, avc->vma);
3207
3208 mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
3209 address, address + HPAGE_PMD_SIZE);
3210 unfreeze_page_vma(avc->vma, page, address);
3211 mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
3212 address, address + HPAGE_PMD_SIZE);
3213 }
3214}
3215
3216static int __split_huge_page_tail(struct page *head, int tail,
3217 struct lruvec *lruvec, struct list_head *list)
3218{
3219 int mapcount;
3220 struct page *page_tail = head + tail;
3221
3222 mapcount = atomic_read(&page_tail->_mapcount) + 1;
3223 VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
3224
3225 /*
3226 * tail_page->_count is zero and not changing from under us. But
3227 * get_page_unless_zero() may be running from under us on the
3228 * tail_page. If we used atomic_set() below instead of atomic_add(), we
3229 * would then run atomic_set() concurrently with
3230 * get_page_unless_zero(), and atomic_set() is implemented in C not
3231	 * using locked ops. spin_unlock on x86 sometimes uses locked ops
3232 * because of PPro errata 66, 92, so unless somebody can guarantee
3233 * atomic_set() here would be safe on all archs (and not only on x86),
3234 * it's safer to use atomic_add().
3235 */
3236 atomic_add(mapcount + 1, &page_tail->_count);
3237
3238
3239 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
3240 page_tail->flags |= (head->flags &
3241 ((1L << PG_referenced) |
3242 (1L << PG_swapbacked) |
3243 (1L << PG_mlocked) |
3244 (1L << PG_uptodate) |
3245 (1L << PG_active) |
3246 (1L << PG_locked) |
3247 (1L << PG_unevictable) |
3248 (1L << PG_dirty)));
3249
3250 /*
3251 * After clearing PageTail the gup refcount can be released.
3252 * Page flags also must be visible before we make the page non-compound.
3253 */
3254 smp_wmb();
3255
3256 clear_compound_head(page_tail);
3257
3258 if (page_is_young(head))
3259 set_page_young(page_tail);
3260 if (page_is_idle(head))
3261 set_page_idle(page_tail);
3262
3263 /* ->mapping in first tail page is compound_mapcount */
3264 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
3265 page_tail);
3266 page_tail->mapping = head->mapping;
3267
3268 page_tail->index = head->index + tail;
3269 page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
3270 lru_add_page_tail(head, page_tail, lruvec, list);
3271
3272 return mapcount;
3273}
3274
3275static void __split_huge_page(struct page *page, struct list_head *list)
3276{
3277 struct page *head = compound_head(page);
3278 struct zone *zone = page_zone(head);
3279 struct lruvec *lruvec;
3280 int i, tail_mapcount;
3281
3282	 /* prevent PageLRU from going away from under us, and freeze lru stats */
3283 spin_lock_irq(&zone->lru_lock);
3284 lruvec = mem_cgroup_page_lruvec(head, zone);
3285
3286	 /* complete memcg work before adding pages to LRU */
3287 mem_cgroup_split_huge_fixup(head);
3288
3289 tail_mapcount = 0;
3290 for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
3291 tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
3292 atomic_sub(tail_mapcount, &head->_count);
3293
3294 ClearPageCompound(head);
3295 spin_unlock_irq(&zone->lru_lock);
3296
3297 unfreeze_page(page_anon_vma(head), head);
3298
3299 for (i = 0; i < HPAGE_PMD_NR; i++) {
3300 struct page *subpage = head + i;
3301 if (subpage == page)
3302 continue;
3303 unlock_page(subpage);
3304
3305 /*
3306		 * Subpages may be freed if there wasn't any mapping left,
3307		 * e.g. if add_to_swap() is running on an LRU page that
3308		 * had its mapping zapped. Freeing these pages
3309		 * requires taking the lru_lock, so we do the put_page
3310		 * of the tail pages after the split is complete.
3311 */
3312 put_page(subpage);
3187 } 3313 }
3188} 3314}
3315
3316int total_mapcount(struct page *page)
3317{
3318 int i, ret;
3319
3320 VM_BUG_ON_PAGE(PageTail(page), page);
3321
3322 if (likely(!PageCompound(page)))
3323 return atomic_read(&page->_mapcount) + 1;
3324
3325 ret = compound_mapcount(page);
3326 if (PageHuge(page))
3327 return ret;
3328 for (i = 0; i < HPAGE_PMD_NR; i++)
3329 ret += atomic_read(&page[i]._mapcount) + 1;
3330 if (PageDoubleMap(page))
3331 ret -= HPAGE_PMD_NR;
3332 return ret;
3333}
3334
3335/*
3336 * This function splits a huge page into normal pages. @page can point to any
3337 * subpage of the huge page to split. The split doesn't change the position of @page.
3338 *
3339 * The caller must hold the only pin on @page, otherwise the split fails with -EBUSY.
3340 * The huge page must be locked.
3341 *
3342 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
3343 *
3344 * Both head page and tail pages will inherit mapping, flags, and so on from
3345 * the hugepage.
3346 *
3347 * The GUP pin and PG_locked are transferred to @page. The remaining subpages
3348 * can be freed if they are not mapped.
3349 *
3350 * Returns 0 if the hugepage is split successfully.
3351 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
3352 * us.
3353 */
3354int split_huge_page_to_list(struct page *page, struct list_head *list)
3355{
3356 struct page *head = compound_head(page);
3357 struct anon_vma *anon_vma;
3358 int count, mapcount, ret;
3359 bool mlocked;
3360
3361 VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
3362 VM_BUG_ON_PAGE(!PageAnon(page), page);
3363 VM_BUG_ON_PAGE(!PageLocked(page), page);
3364 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
3365 VM_BUG_ON_PAGE(!PageCompound(page), page);
3366
3367 /*
3368 * The caller does not necessarily hold an mmap_sem that would prevent
3369	 * the anon_vma disappearing, so we first take a reference to it
3370 * and then lock the anon_vma for write. This is similar to
3371 * page_lock_anon_vma_read except the write lock is taken to serialise
3372 * against parallel split or collapse operations.
3373 */
3374 anon_vma = page_get_anon_vma(head);
3375 if (!anon_vma) {
3376 ret = -EBUSY;
3377 goto out;
3378 }
3379 anon_vma_lock_write(anon_vma);
3380
3381 /*
3382	 * Racy check whether we can split the page, before freeze_page()
3383	 * splits the PMDs
3384 */
3385 if (total_mapcount(head) != page_count(head) - 1) {
3386 ret = -EBUSY;
3387 goto out_unlock;
3388 }
3389
3390 mlocked = PageMlocked(page);
3391 freeze_page(anon_vma, head);
3392 VM_BUG_ON_PAGE(compound_mapcount(head), head);
3393
3394	 /* Make sure the page is not on a per-CPU pagevec, as that holds an extra pin */
3395 if (mlocked)
3396 lru_add_drain();
3397
3398 /* Prevent deferred_split_scan() touching ->_count */
3399 spin_lock(&split_queue_lock);
3400 count = page_count(head);
3401 mapcount = total_mapcount(head);
3402 if (!mapcount && count == 1) {
3403 if (!list_empty(page_deferred_list(head))) {
3404 split_queue_len--;
3405 list_del(page_deferred_list(head));
3406 }
3407 spin_unlock(&split_queue_lock);
3408 __split_huge_page(page, list);
3409 ret = 0;
3410 } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
3411 spin_unlock(&split_queue_lock);
3412 pr_alert("total_mapcount: %u, page_count(): %u\n",
3413 mapcount, count);
3414 if (PageTail(page))
3415 dump_page(head, NULL);
3416 dump_page(page, "total_mapcount(head) > 0");
3417 BUG();
3418 } else {
3419 spin_unlock(&split_queue_lock);
3420 unfreeze_page(anon_vma, head);
3421 ret = -EBUSY;
3422 }
3423
3424out_unlock:
3425 anon_vma_unlock_write(anon_vma);
3426 put_anon_vma(anon_vma);
3427out:
3428 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
3429 return ret;
3430}
3431
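The split events counted in the functions above (THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, and THP_SPLIT_PMD in __split_huge_pmd_locked()) are exported through /proc/vmstat; the counter names below are assumed to be the lower-cased event names, e.g. thp_split_page. A small sketch for watching them while testing:

#include <stdio.h>
#include <string.h>

/* Dump the THP split counters from /proc/vmstat. */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* e.g. thp_split_page, thp_split_page_failed, thp_split_pmd */
		if (!strncmp(line, "thp_split", 9))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}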
3432void free_transhuge_page(struct page *page)
3433{
3434 unsigned long flags;
3435
3436 spin_lock_irqsave(&split_queue_lock, flags);
3437 if (!list_empty(page_deferred_list(page))) {
3438 split_queue_len--;
3439 list_del(page_deferred_list(page));
3440 }
3441 spin_unlock_irqrestore(&split_queue_lock, flags);
3442 free_compound_page(page);
3443}
3444
3445void deferred_split_huge_page(struct page *page)
3446{
3447 unsigned long flags;
3448
3449 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3450
3451 spin_lock_irqsave(&split_queue_lock, flags);
3452 if (list_empty(page_deferred_list(page))) {
3453 list_add_tail(page_deferred_list(page), &split_queue);
3454 split_queue_len++;
3455 }
3456 spin_unlock_irqrestore(&split_queue_lock, flags);
3457}
3458
3459static unsigned long deferred_split_count(struct shrinker *shrink,
3460 struct shrink_control *sc)
3461{
3462 /*
3463	 * Splitting a page from split_queue will free up at least one page,
3464	 * at most HPAGE_PMD_NR - 1. We don't track the exact number,
3465	 * so let's use HPAGE_PMD_NR / 2 as a ballpark.
3466 */
3467 return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
3468}
3469
3470static unsigned long deferred_split_scan(struct shrinker *shrink,
3471 struct shrink_control *sc)
3472{
3473 unsigned long flags;
3474 LIST_HEAD(list), *pos, *next;
3475 struct page *page;
3476 int split = 0;
3477
3478 spin_lock_irqsave(&split_queue_lock, flags);
3479 list_splice_init(&split_queue, &list);
3480
3481 /* Take pin on all head pages to avoid freeing them under us */
3482 list_for_each_safe(pos, next, &list) {
3483 page = list_entry((void *)pos, struct page, mapping);
3484 page = compound_head(page);
3485 /* race with put_compound_page() */
3486 if (!get_page_unless_zero(page)) {
3487 list_del_init(page_deferred_list(page));
3488 split_queue_len--;
3489 }
3490 }
3491 spin_unlock_irqrestore(&split_queue_lock, flags);
3492
3493 list_for_each_safe(pos, next, &list) {
3494 page = list_entry((void *)pos, struct page, mapping);
3495 lock_page(page);
3496 /* split_huge_page() removes page from list on success */
3497 if (!split_huge_page(page))
3498 split++;
3499 unlock_page(page);
3500 put_page(page);
3501 }
3502
3503 spin_lock_irqsave(&split_queue_lock, flags);
3504 list_splice_tail(&list, &split_queue);
3505 spin_unlock_irqrestore(&split_queue_lock, flags);
3506
3507 return split * HPAGE_PMD_NR / 2;
3508}
3509
3510static struct shrinker deferred_split_shrinker = {
3511 .count_objects = deferred_split_count,
3512 .scan_objects = deferred_split_scan,
3513 .seeks = DEFAULT_SEEKS,
3514};
3515
3516#ifdef CONFIG_DEBUG_FS
3517static int split_huge_pages_set(void *data, u64 val)
3518{
3519 struct zone *zone;
3520 struct page *page;
3521 unsigned long pfn, max_zone_pfn;
3522 unsigned long total = 0, split = 0;
3523
3524 if (val != 1)
3525 return -EINVAL;
3526
3527 for_each_populated_zone(zone) {
3528 max_zone_pfn = zone_end_pfn(zone);
3529 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
3530 if (!pfn_valid(pfn))
3531 continue;
3532
3533 page = pfn_to_page(pfn);
3534 if (!get_page_unless_zero(page))
3535 continue;
3536
3537 if (zone != page_zone(page))
3538 goto next;
3539
3540 if (!PageHead(page) || !PageAnon(page) ||
3541 PageHuge(page))
3542 goto next;
3543
3544 total++;
3545 lock_page(page);
3546 if (!split_huge_page(page))
3547 split++;
3548 unlock_page(page);
3549next:
3550 put_page(page);
3551 }
3552 }
3553
3554 pr_info("%lu of %lu THP split", split, total);
3555
3556 return 0;
3557}
3558DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
3559 "%llu\n");
3560
3561static int __init split_huge_pages_debugfs(void)
3562{
3563 void *ret;
3564
3565 ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL,
3566 &split_huge_pages_fops);
3567 if (!ret)
3568 pr_warn("Failed to create split_huge_pages in debugfs");
3569 return 0;
3570}
3571late_initcall(split_huge_pages_debugfs);
3572#endif
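The debugfs knob registered above accepts only the value 1; writing it walks every populated zone and tries to split each anonymous THP head page it finds. A hedged userspace sketch of driving it (assumes debugfs is mounted at /sys/kernel/debug and the caller may write there):

#include <stdio.h>

int main(void)
{
	/* File created by split_huge_pages_debugfs() at the debugfs root. */
	FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");

	if (!f) {
		perror("split_huge_pages");
		return 1;
	}
	/* split_huge_pages_set() returns -EINVAL for any value other than 1. */
	fputs("1\n", f);
	fclose(f);	/* flush the write; the kernel then logs "N of M THP split" */
	return 0;
}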
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index be934df69b85..12908dcf5831 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1267,8 +1267,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
1267 1267
1268 /* we rely on prep_new_huge_page to set the destructor */ 1268 /* we rely on prep_new_huge_page to set the destructor */
1269 set_compound_order(page, order); 1269 set_compound_order(page, order);
1270 __SetPageHead(page);
1271 __ClearPageReserved(page); 1270 __ClearPageReserved(page);
1271 __SetPageHead(page);
1272 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 1272 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1273 /* 1273 /*
1274 * For gigantic hugepages allocated through bootmem at 1274 * For gigantic hugepages allocated through bootmem at
@@ -3102,7 +3102,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3102 entry = huge_ptep_get(src_pte); 3102 entry = huge_ptep_get(src_pte);
3103 ptepage = pte_page(entry); 3103 ptepage = pte_page(entry);
3104 get_page(ptepage); 3104 get_page(ptepage);
3105 page_dup_rmap(ptepage); 3105 page_dup_rmap(ptepage, true);
3106 set_huge_pte_at(dst, addr, dst_pte, entry); 3106 set_huge_pte_at(dst, addr, dst_pte, entry);
3107 hugetlb_count_add(pages_per_huge_page(h), dst); 3107 hugetlb_count_add(pages_per_huge_page(h), dst);
3108 } 3108 }
@@ -3186,7 +3186,7 @@ again:
3186 set_page_dirty(page); 3186 set_page_dirty(page);
3187 3187
3188 hugetlb_count_sub(pages_per_huge_page(h), mm); 3188 hugetlb_count_sub(pages_per_huge_page(h), mm);
3189 page_remove_rmap(page); 3189 page_remove_rmap(page, true);
3190 force_flush = !__tlb_remove_page(tlb, page); 3190 force_flush = !__tlb_remove_page(tlb, page);
3191 if (force_flush) { 3191 if (force_flush) {
3192 address += sz; 3192 address += sz;
@@ -3415,7 +3415,7 @@ retry_avoidcopy:
3415 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); 3415 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
3416 set_huge_pte_at(mm, address, ptep, 3416 set_huge_pte_at(mm, address, ptep,
3417 make_huge_pte(vma, new_page, 1)); 3417 make_huge_pte(vma, new_page, 1));
3418 page_remove_rmap(old_page); 3418 page_remove_rmap(old_page, true);
3419 hugepage_add_new_anon_rmap(new_page, vma, address); 3419 hugepage_add_new_anon_rmap(new_page, vma, address);
3420 /* Make the old page be freed below */ 3420 /* Make the old page be freed below */
3421 new_page = old_page; 3421 new_page = old_page;
@@ -3585,7 +3585,7 @@ retry:
3585 ClearPagePrivate(page); 3585 ClearPagePrivate(page);
3586 hugepage_add_new_anon_rmap(page, vma, address); 3586 hugepage_add_new_anon_rmap(page, vma, address);
3587 } else 3587 } else
3588 page_dup_rmap(page); 3588 page_dup_rmap(page, true);
3589 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 3589 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
3590 && (vma->vm_flags & VM_SHARED))); 3590 && (vma->vm_flags & VM_SHARED)));
3591 set_huge_pte_at(mm, address, ptep, new_pte); 3591 set_huge_pte_at(mm, address, ptep, new_pte);
@@ -3865,7 +3865,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3865same_page: 3865same_page:
3866 if (pages) { 3866 if (pages) {
3867 pages[i] = mem_map_offset(page, pfn_offset); 3867 pages[i] = mem_map_offset(page, pfn_offset);
3868 get_page_foll(pages[i]); 3868 get_page(pages[i]);
3869 } 3869 }
3870 3870
3871 if (vmas) 3871 if (vmas)
diff --git a/mm/internal.h b/mm/internal.h
index 38e24b89e4c4..ed8b5ffcf9b1 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,7 @@
13 13
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/pagemap.h>
16 17
17/* 18/*
18 * The set of flags that only affect watermark checking and reclaim 19 * The set of flags that only affect watermark checking and reclaim
@@ -66,50 +67,6 @@ static inline void set_page_refcounted(struct page *page)
66 set_page_count(page, 1); 67 set_page_count(page, 1);
67} 68}
68 69
69static inline void __get_page_tail_foll(struct page *page,
70 bool get_page_head)
71{
72 /*
73 * If we're getting a tail page, the elevated page->_count is
74 * required only in the head page and we will elevate the head
75 * page->_count and tail page->_mapcount.
76 *
77 * We elevate page_tail->_mapcount for tail pages to force
78 * page_tail->_count to be zero at all times to avoid getting
79 * false positives from get_page_unless_zero() with
80 * speculative page access (like in
81 * page_cache_get_speculative()) on tail pages.
82 */
83 VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page);
84 if (get_page_head)
85 atomic_inc(&compound_head(page)->_count);
86 get_huge_page_tail(page);
87}
88
89/*
90 * This is meant to be called as the FOLL_GET operation of
91 * follow_page() and it must be called while holding the proper PT
92 * lock while the pte (or pmd_trans_huge) is still mapping the page.
93 */
94static inline void get_page_foll(struct page *page)
95{
96 if (unlikely(PageTail(page)))
97 /*
98 * This is safe only because
99 * __split_huge_page_refcount() can't run under
100 * get_page_foll() because we hold the proper PT lock.
101 */
102 __get_page_tail_foll(page, true);
103 else {
104 /*
105 * Getting a normal page or the head of a compound page
106 * requires to already have an elevated page->_count.
107 */
108 VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
109 atomic_inc(&page->_count);
110 }
111}
112
113extern unsigned long highest_memmap_pfn; 70extern unsigned long highest_memmap_pfn;
114 71
115/* 72/*
@@ -309,10 +266,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
309 266
310extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); 267extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
311 268
312#ifdef CONFIG_TRANSPARENT_HUGEPAGE 269/*
313extern unsigned long vma_address(struct page *page, 270 * At what user virtual address is page expected in @vma?
314 struct vm_area_struct *vma); 271 */
315#endif 272static inline unsigned long
273__vma_address(struct page *page, struct vm_area_struct *vma)
274{
275 pgoff_t pgoff = page_to_pgoff(page);
276 return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
277}
278
279static inline unsigned long
280vma_address(struct page *page, struct vm_area_struct *vma)
281{
282 unsigned long address = __vma_address(page, vma);
283
284 /* page should be within @vma mapping range */
285 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
286
287 return address;
288}
289
316#else /* !CONFIG_MMU */ 290#else /* !CONFIG_MMU */
317static inline void clear_page_mlock(struct page *page) { } 291static inline void clear_page_mlock(struct page *page) { }
318static inline void mlock_vma_page(struct page *page) { } 292static inline void mlock_vma_page(struct page *page) { }
diff --git a/mm/ksm.c b/mm/ksm.c
index 2d162c5625f6..ca6d2a06a615 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -441,20 +441,6 @@ static void break_cow(struct rmap_item *rmap_item)
441 up_read(&mm->mmap_sem); 441 up_read(&mm->mmap_sem);
442} 442}
443 443
444static struct page *page_trans_compound_anon(struct page *page)
445{
446 if (PageTransCompound(page)) {
447 struct page *head = compound_head(page);
448 /*
449 * head may actually be splitted and freed from under
450 * us but it's ok here.
451 */
452 if (PageAnon(head))
453 return head;
454 }
455 return NULL;
456}
457
458static struct page *get_mergeable_page(struct rmap_item *rmap_item) 444static struct page *get_mergeable_page(struct rmap_item *rmap_item)
459{ 445{
460 struct mm_struct *mm = rmap_item->mm; 446 struct mm_struct *mm = rmap_item->mm;
@@ -470,7 +456,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
470 page = follow_page(vma, addr, FOLL_GET); 456 page = follow_page(vma, addr, FOLL_GET);
471 if (IS_ERR_OR_NULL(page)) 457 if (IS_ERR_OR_NULL(page))
472 goto out; 458 goto out;
473 if (PageAnon(page) || page_trans_compound_anon(page)) { 459 if (PageAnon(page)) {
474 flush_anon_page(vma, page, addr); 460 flush_anon_page(vma, page, addr);
475 flush_dcache_page(page); 461 flush_dcache_page(page);
476 } else { 462 } else {
@@ -956,13 +942,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
956 } 942 }
957 943
958 get_page(kpage); 944 get_page(kpage);
959 page_add_anon_rmap(kpage, vma, addr); 945 page_add_anon_rmap(kpage, vma, addr, false);
960 946
961 flush_cache_page(vma, addr, pte_pfn(*ptep)); 947 flush_cache_page(vma, addr, pte_pfn(*ptep));
962 ptep_clear_flush_notify(vma, addr, ptep); 948 ptep_clear_flush_notify(vma, addr, ptep);
963 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 949 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
964 950
965 page_remove_rmap(page); 951 page_remove_rmap(page, false);
966 if (!page_mapped(page)) 952 if (!page_mapped(page))
967 try_to_free_swap(page); 953 try_to_free_swap(page);
968 put_page(page); 954 put_page(page);
@@ -975,33 +961,6 @@ out:
975 return err; 961 return err;
976} 962}
977 963
978static int page_trans_compound_anon_split(struct page *page)
979{
980 int ret = 0;
981 struct page *transhuge_head = page_trans_compound_anon(page);
982 if (transhuge_head) {
983 /* Get the reference on the head to split it. */
984 if (get_page_unless_zero(transhuge_head)) {
985 /*
986 * Recheck we got the reference while the head
987 * was still anonymous.
988 */
989 if (PageAnon(transhuge_head))
990 ret = split_huge_page(transhuge_head);
991 else
992 /*
993 * Retry later if split_huge_page run
994 * from under us.
995 */
996 ret = 1;
997 put_page(transhuge_head);
998 } else
999 /* Retry later if split_huge_page run from under us. */
1000 ret = 1;
1001 }
1002 return ret;
1003}
1004
1005/* 964/*
1006 * try_to_merge_one_page - take two pages and merge them into one 965 * try_to_merge_one_page - take two pages and merge them into one
1007 * @vma: the vma that holds the pte pointing to page 966 * @vma: the vma that holds the pte pointing to page
@@ -1020,9 +979,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
1020 if (page == kpage) /* ksm page forked */ 979 if (page == kpage) /* ksm page forked */
1021 return 0; 980 return 0;
1022 981
1023 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
1024 goto out;
1025 BUG_ON(PageTransCompound(page));
1026 if (!PageAnon(page)) 982 if (!PageAnon(page))
1027 goto out; 983 goto out;
1028 984
@@ -1035,6 +991,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
1035 */ 991 */
1036 if (!trylock_page(page)) 992 if (!trylock_page(page))
1037 goto out; 993 goto out;
994
995 if (PageTransCompound(page)) {
996 err = split_huge_page(page);
997 if (err)
998 goto out_unlock;
999 }
1000
1038 /* 1001 /*
1039 * If this anonymous page is mapped only here, its pte may need 1002 * If this anonymous page is mapped only here, its pte may need
1040 * to be write-protected. If it's mapped elsewhere, all of its 1003 * to be write-protected. If it's mapped elsewhere, all of its
@@ -1050,6 +1013,12 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
1050 */ 1013 */
1051 set_page_stable_node(page, NULL); 1014 set_page_stable_node(page, NULL);
1052 mark_page_accessed(page); 1015 mark_page_accessed(page);
1016 /*
1017 * Page reclaim just frees a clean page with no dirty
1018 * ptes: make sure that the ksm page would be swapped.
1019 */
1020 if (!PageDirty(page))
1021 SetPageDirty(page);
1053 err = 0; 1022 err = 0;
1054 } else if (pages_identical(page, kpage)) 1023 } else if (pages_identical(page, kpage))
1055 err = replace_page(vma, page, kpage, orig_pte); 1024 err = replace_page(vma, page, kpage, orig_pte);
@@ -1065,6 +1034,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
1065 } 1034 }
1066 } 1035 }
1067 1036
1037out_unlock:
1068 unlock_page(page); 1038 unlock_page(page);
1069out: 1039out:
1070 return err; 1040 return err;
@@ -1635,8 +1605,7 @@ next_mm:
1635 cond_resched(); 1605 cond_resched();
1636 continue; 1606 continue;
1637 } 1607 }
1638 if (PageAnon(*page) || 1608 if (PageAnon(*page)) {
1639 page_trans_compound_anon(*page)) {
1640 flush_anon_page(vma, *page, ksm_scan.address); 1609 flush_anon_page(vma, *page, ksm_scan.address);
1641 flush_dcache_page(*page); 1610 flush_dcache_page(*page);
1642 rmap_item = get_next_rmap_item(slot, 1611 rmap_item = get_next_rmap_item(slot,
@@ -1899,7 +1868,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
1899 1868
1900 SetPageDirty(new_page); 1869 SetPageDirty(new_page);
1901 __SetPageUptodate(new_page); 1870 __SetPageUptodate(new_page);
1902 __set_page_locked(new_page); 1871 __SetPageLocked(new_page);
1903 } 1872 }
1904 1873
1905 return new_page; 1874 return new_page;
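
With the ksm.c changes above, a compound page met during scanning or merging is split under the page lock via split_huge_page() instead of being filtered through page_trans_compound_anon(). The pages that reach this path still come from ranges registered with madvise(MADV_MERGEABLE); a minimal user-space sketch of such a registration, assuming CONFIG_KSM and a running ksmd (merging happens asynchronously):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE 12	/* uapi value; may be missing from older libc headers */
#endif

int main(void)
{
	size_t len = 64 << 20;	/* 64 MiB of anonymous memory */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* identical content across many pages gives ksmd something to merge */
	memset(buf, 0x5a, len);

	/* mark the range as a merge candidate for the KSM scanner */
	if (madvise(buf, len, MADV_MERGEABLE))
		perror("madvise(MADV_MERGEABLE)");

	/* ksmd works in the background; progress is visible under /sys/kernel/mm/ksm/ */
	pause();
	return 0;
}
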
diff --git a/mm/madvise.c b/mm/madvise.c
index c889fcbb530e..f56825b6d2e1 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -20,6 +20,9 @@
20#include <linux/backing-dev.h> 20#include <linux/backing-dev.h>
21#include <linux/swap.h> 21#include <linux/swap.h>
22#include <linux/swapops.h> 22#include <linux/swapops.h>
23#include <linux/mmu_notifier.h>
24
25#include <asm/tlb.h>
23 26
24/* 27/*
25 * Any behaviour which results in changes to the vma->vm_flags needs to 28 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -32,6 +35,7 @@ static int madvise_need_mmap_write(int behavior)
32 case MADV_REMOVE: 35 case MADV_REMOVE:
33 case MADV_WILLNEED: 36 case MADV_WILLNEED:
34 case MADV_DONTNEED: 37 case MADV_DONTNEED:
38 case MADV_FREE:
35 return 0; 39 return 0;
36 default: 40 default:
37 /* be safe, default to 1. list exceptions explicitly */ 41 /* be safe, default to 1. list exceptions explicitly */
@@ -256,6 +260,194 @@ static long madvise_willneed(struct vm_area_struct *vma,
256 return 0; 260 return 0;
257} 261}
258 262
263static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
264 unsigned long end, struct mm_walk *walk)
265
266{
267 struct mmu_gather *tlb = walk->private;
268 struct mm_struct *mm = tlb->mm;
269 struct vm_area_struct *vma = walk->vma;
270 spinlock_t *ptl;
271 pte_t *orig_pte, *pte, ptent;
272 struct page *page;
273 int nr_swap = 0;
274 unsigned long next;
275
276 next = pmd_addr_end(addr, end);
277 if (pmd_trans_huge(*pmd))
278 if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
279 goto next;
280
281 if (pmd_trans_unstable(pmd))
282 return 0;
283
284 orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
285 arch_enter_lazy_mmu_mode();
286 for (; addr != end; pte++, addr += PAGE_SIZE) {
287 ptent = *pte;
288
289 if (pte_none(ptent))
290 continue;
291 /*
292 * If the pte holds a swp_entry, just clear the page table
293 * entry to prevent a swap-in, which is more expensive than
294 * (page allocation + zeroing).
295 */
296 if (!pte_present(ptent)) {
297 swp_entry_t entry;
298
299 entry = pte_to_swp_entry(ptent);
300 if (non_swap_entry(entry))
301 continue;
302 nr_swap--;
303 free_swap_and_cache(entry);
304 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
305 continue;
306 }
307
308 page = vm_normal_page(vma, addr, ptent);
309 if (!page)
310 continue;
311
312 /*
313 * If the pmd isn't transhuge but the page is a THP
314 * owned by only this process, split it and
315 * deactivate all of its pages.
316 */
317 if (PageTransCompound(page)) {
318 if (page_mapcount(page) != 1)
319 goto out;
320 get_page(page);
321 if (!trylock_page(page)) {
322 put_page(page);
323 goto out;
324 }
325 pte_unmap_unlock(orig_pte, ptl);
326 if (split_huge_page(page)) {
327 unlock_page(page);
328 put_page(page);
329 pte_offset_map_lock(mm, pmd, addr, &ptl);
330 goto out;
331 }
332 put_page(page);
333 unlock_page(page);
334 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
335 pte--;
336 addr -= PAGE_SIZE;
337 continue;
338 }
339
340 VM_BUG_ON_PAGE(PageTransCompound(page), page);
341
342 if (PageSwapCache(page) || PageDirty(page)) {
343 if (!trylock_page(page))
344 continue;
345 /*
346 * If the page is shared with others, we can't clear
347 * PG_dirty of the page.
348 */
349 if (page_mapcount(page) != 1) {
350 unlock_page(page);
351 continue;
352 }
353
354 if (PageSwapCache(page) && !try_to_free_swap(page)) {
355 unlock_page(page);
356 continue;
357 }
358
359 ClearPageDirty(page);
360 unlock_page(page);
361 }
362
363 if (pte_young(ptent) || pte_dirty(ptent)) {
364 /*
365 * Some architectures (e.g. PPC) don't update the TLB
366 * with set_pte_at() and tlb_remove_tlb_entry(), so for
367 * portability, remap the pte as old and clean
368 * after clearing it.
369 */
370 ptent = ptep_get_and_clear_full(mm, addr, pte,
371 tlb->fullmm);
372
373 ptent = pte_mkold(ptent);
374 ptent = pte_mkclean(ptent);
375 set_pte_at(mm, addr, pte, ptent);
376 if (PageActive(page))
377 deactivate_page(page);
378 tlb_remove_tlb_entry(tlb, pte, addr);
379 }
380 }
381out:
382 if (nr_swap) {
383 if (current->mm == mm)
384 sync_mm_rss(mm);
385
386 add_mm_counter(mm, MM_SWAPENTS, nr_swap);
387 }
388 arch_leave_lazy_mmu_mode();
389 pte_unmap_unlock(orig_pte, ptl);
390 cond_resched();
391next:
392 return 0;
393}
394
395static void madvise_free_page_range(struct mmu_gather *tlb,
396 struct vm_area_struct *vma,
397 unsigned long addr, unsigned long end)
398{
399 struct mm_walk free_walk = {
400 .pmd_entry = madvise_free_pte_range,
401 .mm = vma->vm_mm,
402 .private = tlb,
403 };
404
405 tlb_start_vma(tlb, vma);
406 walk_page_range(addr, end, &free_walk);
407 tlb_end_vma(tlb, vma);
408}
409
410static int madvise_free_single_vma(struct vm_area_struct *vma,
411 unsigned long start_addr, unsigned long end_addr)
412{
413 unsigned long start, end;
414 struct mm_struct *mm = vma->vm_mm;
415 struct mmu_gather tlb;
416
417 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
418 return -EINVAL;
419
420 /* MADV_FREE works for only anon vma at the moment */
421 if (!vma_is_anonymous(vma))
422 return -EINVAL;
423
424 start = max(vma->vm_start, start_addr);
425 if (start >= vma->vm_end)
426 return -EINVAL;
427 end = min(vma->vm_end, end_addr);
428 if (end <= vma->vm_start)
429 return -EINVAL;
430
431 lru_add_drain();
432 tlb_gather_mmu(&tlb, mm, start, end);
433 update_hiwater_rss(mm);
434
435 mmu_notifier_invalidate_range_start(mm, start, end);
436 madvise_free_page_range(&tlb, vma, start, end);
437 mmu_notifier_invalidate_range_end(mm, start, end);
438 tlb_finish_mmu(&tlb, start, end);
439
440 return 0;
441}
442
443static long madvise_free(struct vm_area_struct *vma,
444 struct vm_area_struct **prev,
445 unsigned long start, unsigned long end)
446{
447 *prev = vma;
448 return madvise_free_single_vma(vma, start, end);
449}
450
259/* 451/*
260 * Application no longer needs these pages. If the pages are dirty, 452 * Application no longer needs these pages. If the pages are dirty,
261 * it's OK to just throw them away. The app will be more careful about 453 * it's OK to just throw them away. The app will be more careful about
@@ -379,6 +571,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
379 return madvise_remove(vma, prev, start, end); 571 return madvise_remove(vma, prev, start, end);
380 case MADV_WILLNEED: 572 case MADV_WILLNEED:
381 return madvise_willneed(vma, prev, start, end); 573 return madvise_willneed(vma, prev, start, end);
574 case MADV_FREE:
575 /*
576 * XXX: In this implementation, MADV_FREE works like
577 * MADV_DONTNEED on a swapless system or when swap is full.
578 */
579 if (get_nr_swap_pages() > 0)
580 return madvise_free(vma, prev, start, end);
581 /* passthrough */
382 case MADV_DONTNEED: 582 case MADV_DONTNEED:
383 return madvise_dontneed(vma, prev, start, end); 583 return madvise_dontneed(vma, prev, start, end);
384 default: 584 default:
@@ -398,6 +598,7 @@ madvise_behavior_valid(int behavior)
398 case MADV_REMOVE: 598 case MADV_REMOVE:
399 case MADV_WILLNEED: 599 case MADV_WILLNEED:
400 case MADV_DONTNEED: 600 case MADV_DONTNEED:
601 case MADV_FREE:
401#ifdef CONFIG_KSM 602#ifdef CONFIG_KSM
402 case MADV_MERGEABLE: 603 case MADV_MERGEABLE:
403 case MADV_UNMERGEABLE: 604 case MADV_UNMERGEABLE:
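
A short user-space sketch of the new MADV_FREE hint follows, under the caveat spelled out in the XXX comment above: with no free swap the call falls through to MADV_DONTNEED behaviour. The fallback #define is assumed to match the generic uapi value introduced by this series:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE 8	/* asm-generic uapi value from this series (assumed) */
#endif

int main(void)
{
	size_t len = 16 << 20;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 1, len);		/* dirty the anonymous pages */

	/*
	 * Tell the kernel the contents are disposable.  The mapping stays
	 * valid; clean pages are reclaimed lazily under memory pressure,
	 * and a later access either finds the old data still there or
	 * gets a fresh zero-filled page.
	 */
	if (madvise(buf, len, MADV_FREE)) {
		perror("madvise(MADV_FREE)");	/* e.g. EINVAL on kernels without it */
		return 1;
	}

	buf[0] = 2;	/* touching the range again after MADV_FREE is legal */
	return 0;
}
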
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 54eae4f19d80..0eda67376df4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -382,14 +382,11 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
382{ 382{
383 struct mem_cgroup *memcg; 383 struct mem_cgroup *memcg;
384 384
385 rcu_read_lock();
386
387 memcg = page->mem_cgroup; 385 memcg = page->mem_cgroup;
388 386
389 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 387 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
390 memcg = root_mem_cgroup; 388 memcg = root_mem_cgroup;
391 389
392 rcu_read_unlock();
393 return &memcg->css; 390 return &memcg->css;
394} 391}
395 392
@@ -647,7 +644,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
647 644
648static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 645static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
649 struct page *page, 646 struct page *page,
650 int nr_pages) 647 bool compound, int nr_pages)
651{ 648{
652 /* 649 /*
653 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 650 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
@@ -660,9 +657,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
660 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 657 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
661 nr_pages); 658 nr_pages);
662 659
663 if (PageTransHuge(page)) 660 if (compound) {
661 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
664 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 662 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
665 nr_pages); 663 nr_pages);
664 }
666 665
667 /* pagein of a big page is an event. So, ignore page size */ 666 /* pagein of a big page is an event. So, ignore page size */
668 if (nr_pages > 0) 667 if (nr_pages > 0)
@@ -2431,9 +2430,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
2431 2430
2432/* 2431/*
2433 * Because tail pages are not marked as "used", set it. We're under 2432 * Because tail pages are not marked as "used", set it. We're under
2434 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2433 * zone->lru_lock and migration entries setup in all page mappings.
2435 * charge/uncharge will be never happen and move_account() is done under
2436 * compound_lock(), so we don't have to take care of races.
2437 */ 2434 */
2438void mem_cgroup_split_huge_fixup(struct page *head) 2435void mem_cgroup_split_huge_fixup(struct page *head)
2439{ 2436{
@@ -3494,16 +3491,17 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3494swap_buffers: 3491swap_buffers:
3495 /* Swap primary and spare array */ 3492 /* Swap primary and spare array */
3496 thresholds->spare = thresholds->primary; 3493 thresholds->spare = thresholds->primary;
3497 /* If all events are unregistered, free the spare array */
3498 if (!new) {
3499 kfree(thresholds->spare);
3500 thresholds->spare = NULL;
3501 }
3502 3494
3503 rcu_assign_pointer(thresholds->primary, new); 3495 rcu_assign_pointer(thresholds->primary, new);
3504 3496
3505 /* To be sure that nobody uses thresholds */ 3497 /* To be sure that nobody uses thresholds */
3506 synchronize_rcu(); 3498 synchronize_rcu();
3499
3500 /* If all events are unregistered, free the spare array */
3501 if (!new) {
3502 kfree(thresholds->spare);
3503 thresholds->spare = NULL;
3504 }
3507unlock: 3505unlock:
3508 mutex_unlock(&memcg->thresholds_lock); 3506 mutex_unlock(&memcg->thresholds_lock);
3509} 3507}
@@ -4505,38 +4503,30 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4505 * @from: mem_cgroup which the page is moved from. 4503 * @from: mem_cgroup which the page is moved from.
4506 * @to: mem_cgroup which the page is moved to. @from != @to. 4504 * @to: mem_cgroup which the page is moved to. @from != @to.
4507 * 4505 *
4508 * The caller must confirm following. 4506 * The caller must make sure the page is not on LRU (isolate_page() is useful.)
4509 * - page is not on LRU (isolate_page() is useful.)
4510 * - compound_lock is held when nr_pages > 1
4511 * 4507 *
4512 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 4508 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
4513 * from old cgroup. 4509 * from old cgroup.
4514 */ 4510 */
4515static int mem_cgroup_move_account(struct page *page, 4511static int mem_cgroup_move_account(struct page *page,
4516 unsigned int nr_pages, 4512 bool compound,
4517 struct mem_cgroup *from, 4513 struct mem_cgroup *from,
4518 struct mem_cgroup *to) 4514 struct mem_cgroup *to)
4519{ 4515{
4520 unsigned long flags; 4516 unsigned long flags;
4517 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
4521 int ret; 4518 int ret;
4522 bool anon; 4519 bool anon;
4523 4520
4524 VM_BUG_ON(from == to); 4521 VM_BUG_ON(from == to);
4525 VM_BUG_ON_PAGE(PageLRU(page), page); 4522 VM_BUG_ON_PAGE(PageLRU(page), page);
4526 /* 4523 VM_BUG_ON(compound && !PageTransHuge(page));
4527 * The page is isolated from LRU. So, collapse function
4528 * will not handle this page. But page splitting can happen.
4529 * Do this check under compound_page_lock(). The caller should
4530 * hold it.
4531 */
4532 ret = -EBUSY;
4533 if (nr_pages > 1 && !PageTransHuge(page))
4534 goto out;
4535 4524
4536 /* 4525 /*
4537 * Prevent mem_cgroup_replace_page() from looking at 4526 * Prevent mem_cgroup_replace_page() from looking at
4538 * page->mem_cgroup of its source page while we change it. 4527 * page->mem_cgroup of its source page while we change it.
4539 */ 4528 */
4529 ret = -EBUSY;
4540 if (!trylock_page(page)) 4530 if (!trylock_page(page))
4541 goto out; 4531 goto out;
4542 4532
@@ -4591,9 +4581,9 @@ static int mem_cgroup_move_account(struct page *page,
4591 ret = 0; 4581 ret = 0;
4592 4582
4593 local_irq_disable(); 4583 local_irq_disable();
4594 mem_cgroup_charge_statistics(to, page, nr_pages); 4584 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
4595 memcg_check_events(to, page); 4585 memcg_check_events(to, page);
4596 mem_cgroup_charge_statistics(from, page, -nr_pages); 4586 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
4597 memcg_check_events(from, page); 4587 memcg_check_events(from, page);
4598 local_irq_enable(); 4588 local_irq_enable();
4599out_unlock: 4589out_unlock:
@@ -4683,7 +4673,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4683 pte_t *pte; 4673 pte_t *pte;
4684 spinlock_t *ptl; 4674 spinlock_t *ptl;
4685 4675
4686 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 4676 if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
4687 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 4677 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
4688 mc.precharge += HPAGE_PMD_NR; 4678 mc.precharge += HPAGE_PMD_NR;
4689 spin_unlock(ptl); 4679 spin_unlock(ptl);
@@ -4871,17 +4861,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4871 union mc_target target; 4861 union mc_target target;
4872 struct page *page; 4862 struct page *page;
4873 4863
4874 /* 4864 if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
4875 * We don't take compound_lock() here but no race with splitting thp
4876 * happens because:
4877 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
4878 * under splitting, which means there's no concurrent thp split,
4879 * - if another thread runs into split_huge_page() just after we
4880 * entered this if-block, the thread must wait for page table lock
4881 * to be unlocked in __split_huge_page_splitting(), where the main
4882 * part of thp split is not executed yet.
4883 */
4884 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
4885 if (mc.precharge < HPAGE_PMD_NR) { 4865 if (mc.precharge < HPAGE_PMD_NR) {
4886 spin_unlock(ptl); 4866 spin_unlock(ptl);
4887 return 0; 4867 return 0;
@@ -4890,7 +4870,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4890 if (target_type == MC_TARGET_PAGE) { 4870 if (target_type == MC_TARGET_PAGE) {
4891 page = target.page; 4871 page = target.page;
4892 if (!isolate_lru_page(page)) { 4872 if (!isolate_lru_page(page)) {
4893 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 4873 if (!mem_cgroup_move_account(page, true,
4894 mc.from, mc.to)) { 4874 mc.from, mc.to)) {
4895 mc.precharge -= HPAGE_PMD_NR; 4875 mc.precharge -= HPAGE_PMD_NR;
4896 mc.moved_charge += HPAGE_PMD_NR; 4876 mc.moved_charge += HPAGE_PMD_NR;
@@ -4917,9 +4897,18 @@ retry:
4917 switch (get_mctgt_type(vma, addr, ptent, &target)) { 4897 switch (get_mctgt_type(vma, addr, ptent, &target)) {
4918 case MC_TARGET_PAGE: 4898 case MC_TARGET_PAGE:
4919 page = target.page; 4899 page = target.page;
4900 /*
4901 * We can have a part of the split pmd here. Moving it
4902 * can be done but it would be too convoluted so simply
4903 * ignore such a partial THP and keep it in original
4904 * memcg. There should be somebody mapping the head.
4905 */
4906 if (PageTransCompound(page))
4907 goto put;
4920 if (isolate_lru_page(page)) 4908 if (isolate_lru_page(page))
4921 goto put; 4909 goto put;
4922 if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { 4910 if (!mem_cgroup_move_account(page, false,
4911 mc.from, mc.to)) {
4923 mc.precharge--; 4912 mc.precharge--;
4924 /* we uncharge from mc.from later. */ 4913 /* we uncharge from mc.from later. */
4925 mc.moved_charge++; 4914 mc.moved_charge++;
@@ -5258,10 +5247,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
5258 * with mem_cgroup_cancel_charge() in case page instantiation fails. 5247 * with mem_cgroup_cancel_charge() in case page instantiation fails.
5259 */ 5248 */
5260int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 5249int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5261 gfp_t gfp_mask, struct mem_cgroup **memcgp) 5250 gfp_t gfp_mask, struct mem_cgroup **memcgp,
5251 bool compound)
5262{ 5252{
5263 struct mem_cgroup *memcg = NULL; 5253 struct mem_cgroup *memcg = NULL;
5264 unsigned int nr_pages = 1; 5254 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5265 int ret = 0; 5255 int ret = 0;
5266 5256
5267 if (mem_cgroup_disabled()) 5257 if (mem_cgroup_disabled())
@@ -5291,11 +5281,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5291 } 5281 }
5292 } 5282 }
5293 5283
5294 if (PageTransHuge(page)) {
5295 nr_pages <<= compound_order(page);
5296 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
5297 }
5298
5299 if (!memcg) 5284 if (!memcg)
5300 memcg = get_mem_cgroup_from_mm(mm); 5285 memcg = get_mem_cgroup_from_mm(mm);
5301 5286
@@ -5324,9 +5309,9 @@ out:
5324 * Use mem_cgroup_cancel_charge() to cancel the transaction instead. 5309 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
5325 */ 5310 */
5326void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 5311void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5327 bool lrucare) 5312 bool lrucare, bool compound)
5328{ 5313{
5329 unsigned int nr_pages = 1; 5314 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5330 5315
5331 VM_BUG_ON_PAGE(!page->mapping, page); 5316 VM_BUG_ON_PAGE(!page->mapping, page);
5332 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); 5317 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
@@ -5343,13 +5328,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5343 5328
5344 commit_charge(page, memcg, lrucare); 5329 commit_charge(page, memcg, lrucare);
5345 5330
5346 if (PageTransHuge(page)) {
5347 nr_pages <<= compound_order(page);
5348 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
5349 }
5350
5351 local_irq_disable(); 5331 local_irq_disable();
5352 mem_cgroup_charge_statistics(memcg, page, nr_pages); 5332 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
5353 memcg_check_events(memcg, page); 5333 memcg_check_events(memcg, page);
5354 local_irq_enable(); 5334 local_irq_enable();
5355 5335
@@ -5371,9 +5351,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5371 * 5351 *
5372 * Cancel a charge transaction started by mem_cgroup_try_charge(). 5352 * Cancel a charge transaction started by mem_cgroup_try_charge().
5373 */ 5353 */
5374void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) 5354void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
5355 bool compound)
5375{ 5356{
5376 unsigned int nr_pages = 1; 5357 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5377 5358
5378 if (mem_cgroup_disabled()) 5359 if (mem_cgroup_disabled())
5379 return; 5360 return;
@@ -5385,11 +5366,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
5385 if (!memcg) 5366 if (!memcg)
5386 return; 5367 return;
5387 5368
5388 if (PageTransHuge(page)) {
5389 nr_pages <<= compound_order(page);
5390 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
5391 }
5392
5393 cancel_charge(memcg, nr_pages); 5369 cancel_charge(memcg, nr_pages);
5394} 5370}
5395 5371
@@ -5750,7 +5726,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5750 * only synchronisation we have for updating the per-CPU variables. 5726 * only synchronisation we have for updating the per-CPU variables.
5751 */ 5727 */
5752 VM_BUG_ON(!irqs_disabled()); 5728 VM_BUG_ON(!irqs_disabled());
5753 mem_cgroup_charge_statistics(memcg, page, -1); 5729 mem_cgroup_charge_statistics(memcg, page, false, -1);
5754 memcg_check_events(memcg, page); 5730 memcg_check_events(memcg, page);
5755} 5731}
5756 5732
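
The memcontrol.c changes above stop guessing hugeness with PageTransHuge() inside the charge paths and instead take an explicit 'compound' flag from the caller, with nr_pages derived once as hpage_nr_pages(page) or 1. A toy model of just that accounting convention (not the kernel API; HPAGE_NR stands in for hpage_nr_pages() with 2 MiB THP on 4 KiB pages):

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_NR 512	/* stand-in for hpage_nr_pages(): 2 MiB THP / 4 KiB pages */

static long stat_rss;		/* models MEM_CGROUP_STAT_RSS */
static long stat_rss_huge;	/* models MEM_CGROUP_STAT_RSS_HUGE */

/* the caller states compound-ness explicitly; the helper no longer sniffs the page */
static void toy_charge_statistics(bool compound, int nr_pages)
{
	stat_rss += nr_pages;
	if (compound)
		stat_rss_huge += nr_pages;
}

int main(void)
{
	toy_charge_statistics(false, 1);		/* ordinary page charged */
	toy_charge_statistics(true, HPAGE_NR);		/* THP charged as one unit */
	toy_charge_statistics(true, -HPAGE_NR);		/* ...and later uncharged */

	printf("rss=%ld rss_huge=%ld\n", stat_rss, stat_rss_huge);
	return 0;
}
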
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8424b64711ac..ac595e7a3a95 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -882,15 +882,7 @@ int get_hwpoison_page(struct page *page)
882{ 882{
883 struct page *head = compound_head(page); 883 struct page *head = compound_head(page);
884 884
885 if (PageHuge(head)) 885 if (!PageHuge(head) && PageTransHuge(head)) {
886 return get_page_unless_zero(head);
887
888 /*
889 * Thp tail page has special refcounting rule (refcount of tail pages
890 * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
891 * directly for tail pages.
892 */
893 if (PageTransHuge(head)) {
894 /* 886 /*
895 * Non anonymous thp exists only in allocation/free time. We 887 * Non anonymous thp exists only in allocation/free time. We
896 * can't handle such a case correctly, so let's give it up. 888 * can't handle such a case correctly, so let's give it up.
@@ -902,41 +894,12 @@ int get_hwpoison_page(struct page *page)
902 page_to_pfn(page)); 894 page_to_pfn(page));
903 return 0; 895 return 0;
904 } 896 }
905
906 if (get_page_unless_zero(head)) {
907 if (PageTail(page))
908 get_page(page);
909 return 1;
910 } else {
911 return 0;
912 }
913 } 897 }
914 898
915 return get_page_unless_zero(page); 899 return get_page_unless_zero(head);
916} 900}
917EXPORT_SYMBOL_GPL(get_hwpoison_page); 901EXPORT_SYMBOL_GPL(get_hwpoison_page);
918 902
919/**
920 * put_hwpoison_page() - Put refcount for memory error handling:
921 * @page: raw error page (hit by memory error)
922 */
923void put_hwpoison_page(struct page *page)
924{
925 struct page *head = compound_head(page);
926
927 if (PageHuge(head)) {
928 put_page(head);
929 return;
930 }
931
932 if (PageTransHuge(head))
933 if (page != head)
934 put_page(head);
935
936 put_page(page);
937}
938EXPORT_SYMBOL_GPL(put_hwpoison_page);
939
940/* 903/*
941 * Do all that is necessary to remove user space mappings. Unmap 904 * Do all that is necessary to remove user space mappings. Unmap
942 * the pages and send SIGBUS to the processes if the data was dirty. 905 * the pages and send SIGBUS to the processes if the data was dirty.
@@ -1149,7 +1112,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1149 } 1112 }
1150 1113
1151 if (!PageHuge(p) && PageTransHuge(hpage)) { 1114 if (!PageHuge(p) && PageTransHuge(hpage)) {
1115 lock_page(hpage);
1152 if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { 1116 if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
1117 unlock_page(hpage);
1153 if (!PageAnon(hpage)) 1118 if (!PageAnon(hpage))
1154 pr_err("MCE: %#lx: non anonymous thp\n", pfn); 1119 pr_err("MCE: %#lx: non anonymous thp\n", pfn);
1155 else 1120 else
@@ -1159,6 +1124,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1159 put_hwpoison_page(p); 1124 put_hwpoison_page(p);
1160 return -EBUSY; 1125 return -EBUSY;
1161 } 1126 }
1127 unlock_page(hpage);
1128 get_hwpoison_page(p);
1129 put_hwpoison_page(hpage);
1162 VM_BUG_ON_PAGE(!page_count(p), p); 1130 VM_BUG_ON_PAGE(!page_count(p), p);
1163 hpage = compound_head(p); 1131 hpage = compound_head(p);
1164 } 1132 }
@@ -1166,7 +1134,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1166 /* 1134 /*
1167 * We ignore non-LRU pages for good reasons. 1135 * We ignore non-LRU pages for good reasons.
1168 * - PG_locked is only well defined for LRU pages and a few others 1136 * - PG_locked is only well defined for LRU pages and a few others
1169 * - to avoid races with __set_page_locked() 1137 * - to avoid races with __SetPageLocked()
1170 * - to avoid races with __SetPageSlab*() (and more non-atomic ops) 1138 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
1171 * The check (unnecessarily) ignores LRU pages being isolated and 1139 * The check (unnecessarily) ignores LRU pages being isolated and
1172 * walked by the page reclaim code, however that's not a big loss. 1140 * walked by the page reclaim code, however that's not a big loss.
@@ -1572,7 +1540,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
1572 * Did it turn free? 1540 * Did it turn free?
1573 */ 1541 */
1574 ret = __get_any_page(page, pfn, 0); 1542 ret = __get_any_page(page, pfn, 0);
1575 if (!PageLRU(page)) { 1543 if (ret == 1 && !PageLRU(page)) {
1576 /* Drop page reference which is from __get_any_page() */ 1544 /* Drop page reference which is from __get_any_page() */
1577 put_hwpoison_page(page); 1545 put_hwpoison_page(page);
1578 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", 1546 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
@@ -1716,6 +1684,49 @@ static int __soft_offline_page(struct page *page, int flags)
1716 return ret; 1684 return ret;
1717} 1685}
1718 1686
1687static int soft_offline_in_use_page(struct page *page, int flags)
1688{
1689 int ret;
1690 struct page *hpage = compound_head(page);
1691
1692 if (!PageHuge(page) && PageTransHuge(hpage)) {
1693 lock_page(hpage);
1694 if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
1695 unlock_page(hpage);
1696 if (!PageAnon(hpage))
1697 pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
1698 else
1699 pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
1700 put_hwpoison_page(hpage);
1701 return -EBUSY;
1702 }
1703 unlock_page(hpage);
1704 get_hwpoison_page(page);
1705 put_hwpoison_page(hpage);
1706 }
1707
1708 if (PageHuge(page))
1709 ret = soft_offline_huge_page(page, flags);
1710 else
1711 ret = __soft_offline_page(page, flags);
1712
1713 return ret;
1714}
1715
1716static void soft_offline_free_page(struct page *page)
1717{
1718 if (PageHuge(page)) {
1719 struct page *hpage = compound_head(page);
1720
1721 set_page_hwpoison_huge_page(hpage);
1722 if (!dequeue_hwpoisoned_huge_page(hpage))
1723 num_poisoned_pages_add(1 << compound_order(hpage));
1724 } else {
1725 if (!TestSetPageHWPoison(page))
1726 num_poisoned_pages_inc();
1727 }
1728}
1729
1719/** 1730/**
1720 * soft_offline_page - Soft offline a page. 1731 * soft_offline_page - Soft offline a page.
1721 * @page: page to offline 1732 * @page: page to offline
@@ -1742,7 +1753,6 @@ int soft_offline_page(struct page *page, int flags)
1742{ 1753{
1743 int ret; 1754 int ret;
1744 unsigned long pfn = page_to_pfn(page); 1755 unsigned long pfn = page_to_pfn(page);
1745 struct page *hpage = compound_head(page);
1746 1756
1747 if (PageHWPoison(page)) { 1757 if (PageHWPoison(page)) {
1748 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1758 pr_info("soft offline: %#lx page already poisoned\n", pfn);
@@ -1750,34 +1760,15 @@ int soft_offline_page(struct page *page, int flags)
1750 put_hwpoison_page(page); 1760 put_hwpoison_page(page);
1751 return -EBUSY; 1761 return -EBUSY;
1752 } 1762 }
1753 if (!PageHuge(page) && PageTransHuge(hpage)) {
1754 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1755 pr_info("soft offline: %#lx: failed to split THP\n",
1756 pfn);
1757 if (flags & MF_COUNT_INCREASED)
1758 put_hwpoison_page(page);
1759 return -EBUSY;
1760 }
1761 }
1762 1763
1763 get_online_mems(); 1764 get_online_mems();
1764
1765 ret = get_any_page(page, pfn, flags); 1765 ret = get_any_page(page, pfn, flags);
1766 put_online_mems(); 1766 put_online_mems();
1767 if (ret > 0) { /* for in-use pages */ 1767
1768 if (PageHuge(page)) 1768 if (ret > 0)
1769 ret = soft_offline_huge_page(page, flags); 1769 ret = soft_offline_in_use_page(page, flags);
1770 else 1770 else if (ret == 0)
1771 ret = __soft_offline_page(page, flags); 1771 soft_offline_free_page(page);
1772 } else if (ret == 0) { /* for free pages */ 1772
1773 if (PageHuge(page)) {
1774 set_page_hwpoison_huge_page(hpage);
1775 if (!dequeue_hwpoisoned_huge_page(hpage))
1776 num_poisoned_pages_add(1 << compound_order(hpage));
1777 } else {
1778 if (!TestSetPageHWPoison(page))
1779 num_poisoned_pages_inc();
1780 }
1781 }
1782 return ret; 1773 return ret;
1783} 1774}
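
The reworked soft-offline path above, now split into soft_offline_in_use_page() and soft_offline_free_page(), is still driven from the same user-visible entry points; one of them is the memory sysfs interface. A hedged sketch of poking it (root and CONFIG_MEMORY_FAILURE are assumed, and the physical address below is a placeholder, not a meaningful value):

#include <stdio.h>

int main(int argc, char **argv)
{
	/* physical address of the page to soft-offline, e.g. "0x1234000" (placeholder) */
	const char *paddr = argc > 1 ? argv[1] : "0x1234000";
	FILE *f = fopen("/sys/devices/system/memory/soft_offline_page", "w");

	if (!f) {
		perror("open soft_offline_page");
		return 1;
	}
	/* the kernel resolves the pfn and runs soft_offline_page() on it */
	if (fprintf(f, "%s\n", paddr) < 0)
		perror("write soft_offline_page");
	if (fclose(f))		/* sysfs often reports the error at flush time */
		perror("close soft_offline_page");
	return 0;
}
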
diff --git a/mm/memory.c b/mm/memory.c
index d4e4d37c1989..ff17850a52d9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -50,6 +50,7 @@
50#include <linux/export.h> 50#include <linux/export.h>
51#include <linux/delayacct.h> 51#include <linux/delayacct.h>
52#include <linux/init.h> 52#include <linux/init.h>
53#include <linux/pfn_t.h>
53#include <linux/writeback.h> 54#include <linux/writeback.h>
54#include <linux/memcontrol.h> 55#include <linux/memcontrol.h>
55#include <linux/mmu_notifier.h> 56#include <linux/mmu_notifier.h>
@@ -566,7 +567,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
566{ 567{
567 spinlock_t *ptl; 568 spinlock_t *ptl;
568 pgtable_t new = pte_alloc_one(mm, address); 569 pgtable_t new = pte_alloc_one(mm, address);
569 int wait_split_huge_page;
570 if (!new) 570 if (!new)
571 return -ENOMEM; 571 return -ENOMEM;
572 572
@@ -586,18 +586,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
586 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 586 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
587 587
588 ptl = pmd_lock(mm, pmd); 588 ptl = pmd_lock(mm, pmd);
589 wait_split_huge_page = 0;
590 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 589 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
591 atomic_long_inc(&mm->nr_ptes); 590 atomic_long_inc(&mm->nr_ptes);
592 pmd_populate(mm, pmd, new); 591 pmd_populate(mm, pmd, new);
593 new = NULL; 592 new = NULL;
594 } else if (unlikely(pmd_trans_splitting(*pmd))) 593 }
595 wait_split_huge_page = 1;
596 spin_unlock(ptl); 594 spin_unlock(ptl);
597 if (new) 595 if (new)
598 pte_free(mm, new); 596 pte_free(mm, new);
599 if (wait_split_huge_page)
600 wait_split_huge_page(vma->anon_vma, pmd);
601 return 0; 597 return 0;
602} 598}
603 599
@@ -613,8 +609,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
613 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 609 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
614 pmd_populate_kernel(&init_mm, pmd, new); 610 pmd_populate_kernel(&init_mm, pmd, new);
615 new = NULL; 611 new = NULL;
616 } else 612 }
617 VM_BUG_ON(pmd_trans_splitting(*pmd));
618 spin_unlock(&init_mm.page_table_lock); 613 spin_unlock(&init_mm.page_table_lock);
619 if (new) 614 if (new)
620 pte_free_kernel(&init_mm, new); 615 pte_free_kernel(&init_mm, new);
@@ -870,7 +865,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
870 page = vm_normal_page(vma, addr, pte); 865 page = vm_normal_page(vma, addr, pte);
871 if (page) { 866 if (page) {
872 get_page(page); 867 get_page(page);
873 page_dup_rmap(page); 868 page_dup_rmap(page, false);
874 rss[mm_counter(page)]++; 869 rss[mm_counter(page)]++;
875 } 870 }
876 871
@@ -955,7 +950,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
955 src_pmd = pmd_offset(src_pud, addr); 950 src_pmd = pmd_offset(src_pud, addr);
956 do { 951 do {
957 next = pmd_addr_end(addr, end); 952 next = pmd_addr_end(addr, end);
958 if (pmd_trans_huge(*src_pmd)) { 953 if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
959 int err; 954 int err;
960 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); 955 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
961 err = copy_huge_pmd(dst_mm, src_mm, 956 err = copy_huge_pmd(dst_mm, src_mm,
@@ -1118,7 +1113,7 @@ again:
1118 mark_page_accessed(page); 1113 mark_page_accessed(page);
1119 } 1114 }
1120 rss[mm_counter(page)]--; 1115 rss[mm_counter(page)]--;
1121 page_remove_rmap(page); 1116 page_remove_rmap(page, false);
1122 if (unlikely(page_mapcount(page) < 0)) 1117 if (unlikely(page_mapcount(page) < 0))
1123 print_bad_pte(vma, addr, ptent, page); 1118 print_bad_pte(vma, addr, ptent, page);
1124 if (unlikely(!__tlb_remove_page(tlb, page))) { 1119 if (unlikely(!__tlb_remove_page(tlb, page))) {
@@ -1182,7 +1177,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1182 pmd = pmd_offset(pud, addr); 1177 pmd = pmd_offset(pud, addr);
1183 do { 1178 do {
1184 next = pmd_addr_end(addr, end); 1179 next = pmd_addr_end(addr, end);
1185 if (pmd_trans_huge(*pmd)) { 1180 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1186 if (next - addr != HPAGE_PMD_SIZE) { 1181 if (next - addr != HPAGE_PMD_SIZE) {
1187#ifdef CONFIG_DEBUG_VM 1182#ifdef CONFIG_DEBUG_VM
1188 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { 1183 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
@@ -1193,7 +1188,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1193 BUG(); 1188 BUG();
1194 } 1189 }
1195#endif 1190#endif
1196 split_huge_page_pmd(vma, addr, pmd); 1191 split_huge_pmd(vma, pmd, addr);
1197 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1192 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1198 goto next; 1193 goto next;
1199 /* fall through */ 1194 /* fall through */
@@ -1506,7 +1501,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1506EXPORT_SYMBOL(vm_insert_page); 1501EXPORT_SYMBOL(vm_insert_page);
1507 1502
1508static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1503static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1509 unsigned long pfn, pgprot_t prot) 1504 pfn_t pfn, pgprot_t prot)
1510{ 1505{
1511 struct mm_struct *mm = vma->vm_mm; 1506 struct mm_struct *mm = vma->vm_mm;
1512 int retval; 1507 int retval;
@@ -1522,7 +1517,10 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1522 goto out_unlock; 1517 goto out_unlock;
1523 1518
1524 /* Ok, finally just insert the thing.. */ 1519 /* Ok, finally just insert the thing.. */
1525 entry = pte_mkspecial(pfn_pte(pfn, prot)); 1520 if (pfn_t_devmap(pfn))
1521 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
1522 else
1523 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
1526 set_pte_at(mm, addr, pte, entry); 1524 set_pte_at(mm, addr, pte, entry);
1527 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ 1525 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1528 1526
@@ -1569,17 +1567,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1569 1567
1570 if (addr < vma->vm_start || addr >= vma->vm_end) 1568 if (addr < vma->vm_start || addr >= vma->vm_end)
1571 return -EFAULT; 1569 return -EFAULT;
1572 if (track_pfn_insert(vma, &pgprot, pfn)) 1570 if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
1573 return -EINVAL; 1571 return -EINVAL;
1574 1572
1575 ret = insert_pfn(vma, addr, pfn, pgprot); 1573 ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
1576 1574
1577 return ret; 1575 return ret;
1578} 1576}
1579EXPORT_SYMBOL(vm_insert_pfn); 1577EXPORT_SYMBOL(vm_insert_pfn);
1580 1578
1581int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1579int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1582 unsigned long pfn) 1580 pfn_t pfn)
1583{ 1581{
1584 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); 1582 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1585 1583
@@ -1593,10 +1591,10 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1593 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP 1591 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
1594 * without pte special, it would there be refcounted as a normal page. 1592 * without pte special, it would there be refcounted as a normal page.
1595 */ 1593 */
1596 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { 1594 if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) {
1597 struct page *page; 1595 struct page *page;
1598 1596
1599 page = pfn_to_page(pfn); 1597 page = pfn_t_to_page(pfn);
1600 return insert_page(vma, addr, page, vma->vm_page_prot); 1598 return insert_page(vma, addr, page, vma->vm_page_prot);
1601 } 1599 }
1602 return insert_pfn(vma, addr, pfn, vma->vm_page_prot); 1600 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
@@ -2087,7 +2085,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2087 cow_user_page(new_page, old_page, address, vma); 2085 cow_user_page(new_page, old_page, address, vma);
2088 } 2086 }
2089 2087
2090 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) 2088 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
2091 goto oom_free_new; 2089 goto oom_free_new;
2092 2090
2093 __SetPageUptodate(new_page); 2091 __SetPageUptodate(new_page);
@@ -2118,8 +2116,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2118 * thread doing COW. 2116 * thread doing COW.
2119 */ 2117 */
2120 ptep_clear_flush_notify(vma, address, page_table); 2118 ptep_clear_flush_notify(vma, address, page_table);
2121 page_add_new_anon_rmap(new_page, vma, address); 2119 page_add_new_anon_rmap(new_page, vma, address, false);
2122 mem_cgroup_commit_charge(new_page, memcg, false); 2120 mem_cgroup_commit_charge(new_page, memcg, false, false);
2123 lru_cache_add_active_or_unevictable(new_page, vma); 2121 lru_cache_add_active_or_unevictable(new_page, vma);
2124 /* 2122 /*
2125 * We call the notify macro here because, when using secondary 2123 * We call the notify macro here because, when using secondary
@@ -2151,14 +2149,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2151 * mapcount is visible. So transitively, TLBs to 2149 * mapcount is visible. So transitively, TLBs to
2152 * old page will be flushed before it can be reused. 2150 * old page will be flushed before it can be reused.
2153 */ 2151 */
2154 page_remove_rmap(old_page); 2152 page_remove_rmap(old_page, false);
2155 } 2153 }
2156 2154
2157 /* Free the old page.. */ 2155 /* Free the old page.. */
2158 new_page = old_page; 2156 new_page = old_page;
2159 page_copied = 1; 2157 page_copied = 1;
2160 } else { 2158 } else {
2161 mem_cgroup_cancel_charge(new_page, memcg); 2159 mem_cgroup_cancel_charge(new_page, memcg, false);
2162 } 2160 }
2163 2161
2164 if (new_page) 2162 if (new_page)
@@ -2173,7 +2171,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2173 */ 2171 */
2174 if (page_copied && (vma->vm_flags & VM_LOCKED)) { 2172 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2175 lock_page(old_page); /* LRU manipulation */ 2173 lock_page(old_page); /* LRU manipulation */
2176 munlock_vma_page(old_page); 2174 if (PageMlocked(old_page))
2175 munlock_vma_page(old_page);
2177 unlock_page(old_page); 2176 unlock_page(old_page);
2178 } 2177 }
2179 page_cache_release(old_page); 2178 page_cache_release(old_page);
@@ -2533,7 +2532,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2533 goto out_page; 2532 goto out_page;
2534 } 2533 }
2535 2534
2536 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { 2535 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
2537 ret = VM_FAULT_OOM; 2536 ret = VM_FAULT_OOM;
2538 goto out_page; 2537 goto out_page;
2539 } 2538 }
@@ -2567,7 +2566,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2567 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2566 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2568 flags &= ~FAULT_FLAG_WRITE; 2567 flags &= ~FAULT_FLAG_WRITE;
2569 ret |= VM_FAULT_WRITE; 2568 ret |= VM_FAULT_WRITE;
2570 exclusive = 1; 2569 exclusive = RMAP_EXCLUSIVE;
2571 } 2570 }
2572 flush_icache_page(vma, page); 2571 flush_icache_page(vma, page);
2573 if (pte_swp_soft_dirty(orig_pte)) 2572 if (pte_swp_soft_dirty(orig_pte))
@@ -2575,10 +2574,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2575 set_pte_at(mm, address, page_table, pte); 2574 set_pte_at(mm, address, page_table, pte);
2576 if (page == swapcache) { 2575 if (page == swapcache) {
2577 do_page_add_anon_rmap(page, vma, address, exclusive); 2576 do_page_add_anon_rmap(page, vma, address, exclusive);
2578 mem_cgroup_commit_charge(page, memcg, true); 2577 mem_cgroup_commit_charge(page, memcg, true, false);
2579 } else { /* ksm created a completely new copy */ 2578 } else { /* ksm created a completely new copy */
2580 page_add_new_anon_rmap(page, vma, address); 2579 page_add_new_anon_rmap(page, vma, address, false);
2581 mem_cgroup_commit_charge(page, memcg, false); 2580 mem_cgroup_commit_charge(page, memcg, false, false);
2582 lru_cache_add_active_or_unevictable(page, vma); 2581 lru_cache_add_active_or_unevictable(page, vma);
2583 } 2582 }
2584 2583
@@ -2613,7 +2612,7 @@ unlock:
2613out: 2612out:
2614 return ret; 2613 return ret;
2615out_nomap: 2614out_nomap:
2616 mem_cgroup_cancel_charge(page, memcg); 2615 mem_cgroup_cancel_charge(page, memcg, false);
2617 pte_unmap_unlock(page_table, ptl); 2616 pte_unmap_unlock(page_table, ptl);
2618out_page: 2617out_page:
2619 unlock_page(page); 2618 unlock_page(page);
@@ -2707,7 +2706,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2707 if (!page) 2706 if (!page)
2708 goto oom; 2707 goto oom;
2709 2708
2710 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) 2709 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
2711 goto oom_free_page; 2710 goto oom_free_page;
2712 2711
2713 /* 2712 /*
@@ -2728,15 +2727,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2728 /* Deliver the page fault to userland, check inside PT lock */ 2727 /* Deliver the page fault to userland, check inside PT lock */
2729 if (userfaultfd_missing(vma)) { 2728 if (userfaultfd_missing(vma)) {
2730 pte_unmap_unlock(page_table, ptl); 2729 pte_unmap_unlock(page_table, ptl);
2731 mem_cgroup_cancel_charge(page, memcg); 2730 mem_cgroup_cancel_charge(page, memcg, false);
2732 page_cache_release(page); 2731 page_cache_release(page);
2733 return handle_userfault(vma, address, flags, 2732 return handle_userfault(vma, address, flags,
2734 VM_UFFD_MISSING); 2733 VM_UFFD_MISSING);
2735 } 2734 }
2736 2735
2737 inc_mm_counter_fast(mm, MM_ANONPAGES); 2736 inc_mm_counter_fast(mm, MM_ANONPAGES);
2738 page_add_new_anon_rmap(page, vma, address); 2737 page_add_new_anon_rmap(page, vma, address, false);
2739 mem_cgroup_commit_charge(page, memcg, false); 2738 mem_cgroup_commit_charge(page, memcg, false, false);
2740 lru_cache_add_active_or_unevictable(page, vma); 2739 lru_cache_add_active_or_unevictable(page, vma);
2741setpte: 2740setpte:
2742 set_pte_at(mm, address, page_table, entry); 2741 set_pte_at(mm, address, page_table, entry);
@@ -2747,7 +2746,7 @@ unlock:
2747 pte_unmap_unlock(page_table, ptl); 2746 pte_unmap_unlock(page_table, ptl);
2748 return 0; 2747 return 0;
2749release: 2748release:
2750 mem_cgroup_cancel_charge(page, memcg); 2749 mem_cgroup_cancel_charge(page, memcg, false);
2751 page_cache_release(page); 2750 page_cache_release(page);
2752 goto unlock; 2751 goto unlock;
2753oom_free_page: 2752oom_free_page:
@@ -2824,7 +2823,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
2824 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2823 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2825 if (anon) { 2824 if (anon) {
2826 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2825 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2827 page_add_new_anon_rmap(page, vma, address); 2826 page_add_new_anon_rmap(page, vma, address, false);
2828 } else { 2827 } else {
2829 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); 2828 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
2830 page_add_file_rmap(page); 2829 page_add_file_rmap(page);
@@ -3000,7 +2999,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3000 if (!new_page) 2999 if (!new_page)
3001 return VM_FAULT_OOM; 3000 return VM_FAULT_OOM;
3002 3001
3003 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { 3002 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
3004 page_cache_release(new_page); 3003 page_cache_release(new_page);
3005 return VM_FAULT_OOM; 3004 return VM_FAULT_OOM;
3006 } 3005 }
@@ -3029,7 +3028,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3029 goto uncharge_out; 3028 goto uncharge_out;
3030 } 3029 }
3031 do_set_pte(vma, address, new_page, pte, true, true); 3030 do_set_pte(vma, address, new_page, pte, true, true);
3032 mem_cgroup_commit_charge(new_page, memcg, false); 3031 mem_cgroup_commit_charge(new_page, memcg, false, false);
3033 lru_cache_add_active_or_unevictable(new_page, vma); 3032 lru_cache_add_active_or_unevictable(new_page, vma);
3034 pte_unmap_unlock(pte, ptl); 3033 pte_unmap_unlock(pte, ptl);
3035 if (fault_page) { 3034 if (fault_page) {
@@ -3044,7 +3043,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3044 } 3043 }
3045 return ret; 3044 return ret;
3046uncharge_out: 3045uncharge_out:
3047 mem_cgroup_cancel_charge(new_page, memcg); 3046 mem_cgroup_cancel_charge(new_page, memcg, false);
3048 page_cache_release(new_page); 3047 page_cache_release(new_page);
3049 return ret; 3048 return ret;
3050} 3049}
@@ -3096,7 +3095,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3096 * pinned by vma->vm_file's reference. We rely on unlock_page()'s 3095 * pinned by vma->vm_file's reference. We rely on unlock_page()'s
3097 * release semantics to prevent the compiler from undoing this copying. 3096 * release semantics to prevent the compiler from undoing this copying.
3098 */ 3097 */
3099 mapping = fault_page->mapping; 3098 mapping = page_rmapping(fault_page);
3100 unlock_page(fault_page); 3099 unlock_page(fault_page);
3101 if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { 3100 if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
3102 /* 3101 /*
@@ -3198,6 +3197,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3198 return 0; 3197 return 0;
3199 } 3198 }
3200 3199
3200 /* TODO: handle PTE-mapped THP */
3201 if (PageCompound(page)) {
3202 pte_unmap_unlock(ptep, ptl);
3203 return 0;
3204 }
3205
3201 /* 3206 /*
3202 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as 3207 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
3203 * much anyway since they can be in shared cache state. This misses 3208 * much anyway since they can be in shared cache state. This misses
@@ -3370,17 +3375,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3370 int ret; 3375 int ret;
3371 3376
3372 barrier(); 3377 barrier();
3373 if (pmd_trans_huge(orig_pmd)) { 3378 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
3374 unsigned int dirty = flags & FAULT_FLAG_WRITE; 3379 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3375 3380
3376 /*
3377 * If the pmd is splitting, return and retry the
3378 * the fault. Alternative: wait until the split
3379 * is done, and goto retry.
3380 */
3381 if (pmd_trans_splitting(orig_pmd))
3382 return 0;
3383
3384 if (pmd_protnone(orig_pmd)) 3381 if (pmd_protnone(orig_pmd))
3385 return do_huge_pmd_numa_page(mm, vma, address, 3382 return do_huge_pmd_numa_page(mm, vma, address,
3386 orig_pmd, pmd); 3383 orig_pmd, pmd);
@@ -3407,7 +3404,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3407 unlikely(__pte_alloc(mm, vma, pmd, address))) 3404 unlikely(__pte_alloc(mm, vma, pmd, address)))
3408 return VM_FAULT_OOM; 3405 return VM_FAULT_OOM;
3409 /* if an huge pmd materialized from under us just retry later */ 3406 /* if an huge pmd materialized from under us just retry later */
3410 if (unlikely(pmd_trans_huge(*pmd))) 3407 if (unlikely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
3411 return 0; 3408 return 0;
3412 /* 3409 /*
3413 * A regular pmd is established and it can't morph into a huge pmd 3410 * A regular pmd is established and it can't morph into a huge pmd
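
In the memory.c hunks above, insert_pfn() and vm_insert_mixed() now take a pfn_t, a pfn with type flags folded into one word, so the devmap case can be told apart from a plain special mapping by value alone. A toy model of that encoding (the flag bit positions are illustrative, not the kernel's exact layout):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* toy pfn_t: a pfn with type flags packed into the top bits (illustrative positions) */
#define TOY_PFN_DEV	(1ULL << 62)
#define TOY_PFN_MAP	(1ULL << 61)

typedef struct { uint64_t val; } toy_pfn_t;

static toy_pfn_t toy_pfn_to_pfn_t(uint64_t pfn, uint64_t flags)
{
	toy_pfn_t p = { pfn | flags };
	return p;
}

/* a devmap pfn carries both the DEV and MAP flags, mirroring pfn_t_devmap() */
static bool toy_pfn_t_devmap(toy_pfn_t p)
{
	return (p.val & TOY_PFN_DEV) && (p.val & TOY_PFN_MAP);
}

int main(void)
{
	toy_pfn_t dev = toy_pfn_to_pfn_t(0x12345, TOY_PFN_DEV | TOY_PFN_MAP);
	toy_pfn_t mem = toy_pfn_to_pfn_t(0x12345, 0);

	/* insert_pfn() picks pte_mkdevmap() for the first kind, pte_mkspecial() for the second */
	printf("devmap? %d %d\n", toy_pfn_t_devmap(dev), toy_pfn_t_devmap(mem));
	return 0;
}
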
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 92f95952692b..4af58a3a8ffa 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -17,6 +17,7 @@
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18#include <linux/cpu.h> 18#include <linux/cpu.h>
19#include <linux/memory.h> 19#include <linux/memory.h>
20#include <linux/memremap.h>
20#include <linux/memory_hotplug.h> 21#include <linux/memory_hotplug.h>
21#include <linux/highmem.h> 22#include <linux/highmem.h>
22#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
@@ -506,10 +507,25 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
506 unsigned long i; 507 unsigned long i;
507 int err = 0; 508 int err = 0;
508 int start_sec, end_sec; 509 int start_sec, end_sec;
510 struct vmem_altmap *altmap;
511
509 /* during initialize mem_map, align hot-added range to section */ 512 /* during initialize mem_map, align hot-added range to section */
510 start_sec = pfn_to_section_nr(phys_start_pfn); 513 start_sec = pfn_to_section_nr(phys_start_pfn);
511 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 514 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
512 515
516 altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
517 if (altmap) {
518 /*
519 * Validate altmap is within bounds of the total request
520 */
521 if (altmap->base_pfn != phys_start_pfn
522 || vmem_altmap_offset(altmap) > nr_pages) {
523 pr_warn_once("memory add fail, invalid altmap\n");
524 return -EINVAL;
525 }
526 altmap->alloc = 0;
527 }
528
513 for (i = start_sec; i <= end_sec; i++) { 529 for (i = start_sec; i <= end_sec; i++) {
514 err = __add_section(nid, zone, section_nr_to_pfn(i)); 530 err = __add_section(nid, zone, section_nr_to_pfn(i));
515 531
@@ -731,7 +747,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
731 pgdat_resize_unlock(zone->zone_pgdat, &flags); 747 pgdat_resize_unlock(zone->zone_pgdat, &flags);
732} 748}
733 749
734static int __remove_section(struct zone *zone, struct mem_section *ms) 750static int __remove_section(struct zone *zone, struct mem_section *ms,
751 unsigned long map_offset)
735{ 752{
736 unsigned long start_pfn; 753 unsigned long start_pfn;
737 int scn_nr; 754 int scn_nr;
@@ -748,7 +765,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
748 start_pfn = section_nr_to_pfn(scn_nr); 765 start_pfn = section_nr_to_pfn(scn_nr);
749 __remove_zone(zone, start_pfn); 766 __remove_zone(zone, start_pfn);
750 767
751 sparse_remove_one_section(zone, ms); 768 sparse_remove_one_section(zone, ms, map_offset);
752 return 0; 769 return 0;
753} 770}
754 771
@@ -767,9 +784,32 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
767 unsigned long nr_pages) 784 unsigned long nr_pages)
768{ 785{
769 unsigned long i; 786 unsigned long i;
770 int sections_to_remove; 787 unsigned long map_offset = 0;
771 resource_size_t start, size; 788 int sections_to_remove, ret = 0;
772 int ret = 0; 789
790 /* In the ZONE_DEVICE case device driver owns the memory region */
791 if (is_dev_zone(zone)) {
792 struct page *page = pfn_to_page(phys_start_pfn);
793 struct vmem_altmap *altmap;
794
795 altmap = to_vmem_altmap((unsigned long) page);
796 if (altmap)
797 map_offset = vmem_altmap_offset(altmap);
798 } else {
799 resource_size_t start, size;
800
801 start = phys_start_pfn << PAGE_SHIFT;
802 size = nr_pages * PAGE_SIZE;
803
804 ret = release_mem_region_adjustable(&iomem_resource, start,
805 size);
806 if (ret) {
807 resource_size_t endres = start + size - 1;
808
809 pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
810 &start, &endres, ret);
811 }
812 }
773 813
774 /* 814 /*
775 * We can only remove entire sections 815 * We can only remove entire sections
@@ -777,23 +817,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
777 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 817 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
778 BUG_ON(nr_pages % PAGES_PER_SECTION); 818 BUG_ON(nr_pages % PAGES_PER_SECTION);
779 819
780 start = phys_start_pfn << PAGE_SHIFT;
781 size = nr_pages * PAGE_SIZE;
782
783 /* in the ZONE_DEVICE case device driver owns the memory region */
784 if (!is_dev_zone(zone))
785 ret = release_mem_region_adjustable(&iomem_resource, start, size);
786 if (ret) {
787 resource_size_t endres = start + size - 1;
788
789 pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
790 &start, &endres, ret);
791 }
792
793 sections_to_remove = nr_pages / PAGES_PER_SECTION; 820 sections_to_remove = nr_pages / PAGES_PER_SECTION;
794 for (i = 0; i < sections_to_remove; i++) { 821 for (i = 0; i < sections_to_remove; i++) {
795 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 822 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
796 ret = __remove_section(zone, __pfn_to_section(pfn)); 823
824 ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
825 map_offset = 0;
797 if (ret) 826 if (ret)
798 break; 827 break;
799 } 828 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d8caff071a30..27d135408a22 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -489,14 +489,33 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
489 struct page *page; 489 struct page *page;
490 struct queue_pages *qp = walk->private; 490 struct queue_pages *qp = walk->private;
491 unsigned long flags = qp->flags; 491 unsigned long flags = qp->flags;
492 int nid; 492 int nid, ret;
493 pte_t *pte; 493 pte_t *pte;
494 spinlock_t *ptl; 494 spinlock_t *ptl;
495 495
496 split_huge_page_pmd(vma, addr, pmd); 496 if (pmd_trans_huge(*pmd)) {
497 if (pmd_trans_unstable(pmd)) 497 ptl = pmd_lock(walk->mm, pmd);
498 return 0; 498 if (pmd_trans_huge(*pmd)) {
499 page = pmd_page(*pmd);
500 if (is_huge_zero_page(page)) {
501 spin_unlock(ptl);
502 split_huge_pmd(vma, pmd, addr);
503 } else {
504 get_page(page);
505 spin_unlock(ptl);
506 lock_page(page);
507 ret = split_huge_page(page);
508 unlock_page(page);
509 put_page(page);
510 if (ret)
511 return 0;
512 }
513 } else {
514 spin_unlock(ptl);
515 }
516 }
499 517
518retry:
500 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 519 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
501 for (; addr != end; pte++, addr += PAGE_SIZE) { 520 for (; addr != end; pte++, addr += PAGE_SIZE) {
502 if (!pte_present(*pte)) 521 if (!pte_present(*pte))
@@ -513,6 +532,21 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
513 nid = page_to_nid(page); 532 nid = page_to_nid(page);
514 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) 533 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
515 continue; 534 continue;
535 if (PageTail(page) && PageAnon(page)) {
536 get_page(page);
537 pte_unmap_unlock(pte, ptl);
538 lock_page(page);
539 ret = split_huge_page(page);
540 unlock_page(page);
541 put_page(page);
542 /* Failed to split -- skip. */
543 if (ret) {
544 pte = pte_offset_map_lock(walk->mm, pmd,
545 addr, &ptl);
546 continue;
547 }
548 goto retry;
549 }
516 550
517 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 551 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
518 migrate_page_add(page, qp->pagelist, flags); 552 migrate_page_add(page, qp->pagelist, flags);
@@ -610,7 +644,8 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
610 644
611 if (flags & MPOL_MF_LAZY) { 645 if (flags & MPOL_MF_LAZY) {
612 /* Similar to task_numa_work, skip inaccessible VMAs */ 646 /* Similar to task_numa_work, skip inaccessible VMAs */
613 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) 647 if (vma_migratable(vma) &&
648 vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
614 change_prot_numa(vma, start, endvma); 649 change_prot_numa(vma, start, endvma);
615 return 1; 650 return 1;
616 } 651 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 7890d0bb5e23..b1034f9c77e7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -165,9 +165,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
165 if (PageAnon(new)) 165 if (PageAnon(new))
166 hugepage_add_anon_rmap(new, vma, addr); 166 hugepage_add_anon_rmap(new, vma, addr);
167 else 167 else
168 page_dup_rmap(new); 168 page_dup_rmap(new, true);
169 } else if (PageAnon(new)) 169 } else if (PageAnon(new))
170 page_add_anon_rmap(new, vma, addr); 170 page_add_anon_rmap(new, vma, addr, false);
171 else 171 else
172 page_add_file_rmap(new); 172 page_add_file_rmap(new);
173 173
@@ -943,9 +943,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
943 goto out; 943 goto out;
944 } 944 }
945 945
946 if (unlikely(PageTransHuge(page))) 946 if (unlikely(PageTransHuge(page))) {
947 if (unlikely(split_huge_page(page))) 947 lock_page(page);
948 rc = split_huge_page(page);
949 unlock_page(page);
950 if (rc)
948 goto out; 951 goto out;
952 }
949 953
950 rc = __unmap_and_move(page, newpage, force, mode); 954 rc = __unmap_and_move(page, newpage, force, mode);
951 if (rc == MIGRATEPAGE_SUCCESS) 955 if (rc == MIGRATEPAGE_SUCCESS)
@@ -1756,6 +1760,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1756 HPAGE_PMD_ORDER); 1760 HPAGE_PMD_ORDER);
1757 if (!new_page) 1761 if (!new_page)
1758 goto out_fail; 1762 goto out_fail;
1763 prep_transhuge_page(new_page);
1759 1764
1760 isolated = numamigrate_isolate_page(pgdat, page); 1765 isolated = numamigrate_isolate_page(pgdat, page);
1761 if (!isolated) { 1766 if (!isolated) {
@@ -1767,7 +1772,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1767 flush_tlb_range(vma, mmun_start, mmun_end); 1772 flush_tlb_range(vma, mmun_start, mmun_end);
1768 1773
1769 /* Prepare a page as a migration target */ 1774 /* Prepare a page as a migration target */
1770 __set_page_locked(new_page); 1775 __SetPageLocked(new_page);
1771 SetPageSwapBacked(new_page); 1776 SetPageSwapBacked(new_page);
1772 1777
1773 /* anon mapping, we can simply copy page->mapping to the new page: */ 1778 /* anon mapping, we can simply copy page->mapping to the new page: */
@@ -1815,7 +1820,7 @@ fail_putback:
1815 * guarantee the copy is visible before the pagetable update. 1820 * guarantee the copy is visible before the pagetable update.
1816 */ 1821 */
1817 flush_cache_range(vma, mmun_start, mmun_end); 1822 flush_cache_range(vma, mmun_start, mmun_end);
1818 page_add_anon_rmap(new_page, vma, mmun_start); 1823 page_add_anon_rmap(new_page, vma, mmun_start, true);
1819 pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); 1824 pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
1820 set_pmd_at(mm, mmun_start, pmd, entry); 1825 set_pmd_at(mm, mmun_start, pmd, entry);
1821 flush_tlb_range(vma, mmun_start, mmun_end); 1826 flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1826,14 +1831,14 @@ fail_putback:
1826 flush_tlb_range(vma, mmun_start, mmun_end); 1831 flush_tlb_range(vma, mmun_start, mmun_end);
1827 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); 1832 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
1828 update_mmu_cache_pmd(vma, address, &entry); 1833 update_mmu_cache_pmd(vma, address, &entry);
1829 page_remove_rmap(new_page); 1834 page_remove_rmap(new_page, true);
1830 goto fail_putback; 1835 goto fail_putback;
1831 } 1836 }
1832 1837
1833 mlock_migrate_page(new_page, page); 1838 mlock_migrate_page(new_page, page);
1834 set_page_memcg(new_page, page_memcg(page)); 1839 set_page_memcg(new_page, page_memcg(page));
1835 set_page_memcg(page, NULL); 1840 set_page_memcg(page, NULL);
1836 page_remove_rmap(page); 1841 page_remove_rmap(page, true);
1837 1842
1838 spin_unlock(ptl); 1843 spin_unlock(ptl);
1839 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1844 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/mincore.c b/mm/mincore.c
index 14bb9fb37f0c..2a565ed8bb49 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -117,7 +117,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
117 unsigned char *vec = walk->private; 117 unsigned char *vec = walk->private;
118 int nr = (end - addr) >> PAGE_SHIFT; 118 int nr = (end - addr) >> PAGE_SHIFT;
119 119
120 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 120 if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
121 memset(vec, 1, nr); 121 memset(vec, 1, nr);
122 spin_unlock(ptl); 122 spin_unlock(ptl);
123 goto out; 123 goto out;
diff --git a/mm/mlock.c b/mm/mlock.c
index 9cb87cbc4071..e1e2b1207bf2 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -24,13 +24,13 @@
24 24
25#include "internal.h" 25#include "internal.h"
26 26
27int can_do_mlock(void) 27bool can_do_mlock(void)
28{ 28{
29 if (rlimit(RLIMIT_MEMLOCK) != 0) 29 if (rlimit(RLIMIT_MEMLOCK) != 0)
30 return 1; 30 return true;
31 if (capable(CAP_IPC_LOCK)) 31 if (capable(CAP_IPC_LOCK))
32 return 1; 32 return true;
33 return 0; 33 return false;
34} 34}
35EXPORT_SYMBOL(can_do_mlock); 35EXPORT_SYMBOL(can_do_mlock);
36 36
@@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page)
82 /* Serialize with page migration */ 82 /* Serialize with page migration */
83 BUG_ON(!PageLocked(page)); 83 BUG_ON(!PageLocked(page));
84 84
85 VM_BUG_ON_PAGE(PageTail(page), page);
86 VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
87
85 if (!TestSetPageMlocked(page)) { 88 if (!TestSetPageMlocked(page)) {
86 mod_zone_page_state(page_zone(page), NR_MLOCK, 89 mod_zone_page_state(page_zone(page), NR_MLOCK,
87 hpage_nr_pages(page)); 90 hpage_nr_pages(page));
@@ -178,6 +181,8 @@ unsigned int munlock_vma_page(struct page *page)
178 /* For try_to_munlock() and to serialize with page migration */ 181 /* For try_to_munlock() and to serialize with page migration */
179 BUG_ON(!PageLocked(page)); 182 BUG_ON(!PageLocked(page));
180 183
184 VM_BUG_ON_PAGE(PageTail(page), page);
185
181 /* 186 /*
182 * Serialize with any parallel __split_huge_page_refcount() which 187 * Serialize with any parallel __split_huge_page_refcount() which
183 * might otherwise copy PageMlocked to part of the tail pages before 188 * might otherwise copy PageMlocked to part of the tail pages before
@@ -388,6 +393,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
388 if (!page || page_zone_id(page) != zoneid) 393 if (!page || page_zone_id(page) != zoneid)
389 break; 394 break;
390 395
396 /*
397 * Do not use pagevec for PTE-mapped THP,
398 * munlock_vma_pages_range() will handle them.
399 */
400 if (PageTransCompound(page))
401 break;
402
391 get_page(page); 403 get_page(page);
392 /* 404 /*
393 * Increase the address that will be returned *before* the 405 * Increase the address that will be returned *before* the
@@ -444,7 +456,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
444 &page_mask); 456 &page_mask);
445 457
446 if (page && !IS_ERR(page)) { 458 if (page && !IS_ERR(page)) {
447 if (PageTransHuge(page)) { 459 if (PageTransTail(page)) {
460 VM_BUG_ON_PAGE(PageMlocked(page), page);
461 put_page(page); /* follow_page_mask() */
462 } else if (PageTransHuge(page)) {
448 lock_page(page); 463 lock_page(page);
449 /* 464 /*
450 * Any THP page found by follow_page_mask() may 465 * Any THP page found by follow_page_mask() may
@@ -477,8 +492,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
477 goto next; 492 goto next;
478 } 493 }
479 } 494 }
480 /* It's a bug to munlock in the middle of a THP page */
481 VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
482 page_increm = 1 + page_mask; 495 page_increm = 1 + page_mask;
483 start += page_increm * PAGE_SIZE; 496 start += page_increm * PAGE_SIZE;
484next: 497next:
diff --git a/mm/mmap.c b/mm/mmap.c
index b3f00b616b81..84b12624ceb0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3184,10 +3184,16 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3184 * mapping->flags avoid to take the same lock twice, if more than one 3184 * mapping->flags avoid to take the same lock twice, if more than one
3185 * vma in this mm is backed by the same anon_vma or address_space. 3185 * vma in this mm is backed by the same anon_vma or address_space.
3186 * 3186 *
3187 * We can take all the locks in random order because the VM code 3187 * We take locks in the following order, according to the comment at the beginning
3188 * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never 3188 * of mm/rmap.c:
3189 * takes more than one of them in a row. Secondly we're protected 3189 * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
3190 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 3190 * hugetlb mapping);
3191 * - all i_mmap_rwsem locks;
3192 * - all anon_vma->rwsem
3193 *
3194 * We can take all locks within these types randomly because the VM code
3195 * doesn't nest them and we are protected from parallel mm_take_all_locks() by
3196 * mm_all_locks_mutex.
3191 * 3197 *
3192 * mm_take_all_locks() and mm_drop_all_locks are expensive operations 3198 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
3193 * that may have to take thousands of locks. 3199 * that may have to take thousands of locks.
@@ -3206,7 +3212,16 @@ int mm_take_all_locks(struct mm_struct *mm)
3206 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3212 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3207 if (signal_pending(current)) 3213 if (signal_pending(current))
3208 goto out_unlock; 3214 goto out_unlock;
3209 if (vma->vm_file && vma->vm_file->f_mapping) 3215 if (vma->vm_file && vma->vm_file->f_mapping &&
3216 is_vm_hugetlb_page(vma))
3217 vm_lock_mapping(mm, vma->vm_file->f_mapping);
3218 }
3219
3220 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3221 if (signal_pending(current))
3222 goto out_unlock;
3223 if (vma->vm_file && vma->vm_file->f_mapping &&
3224 !is_vm_hugetlb_page(vma))
3210 vm_lock_mapping(mm, vma->vm_file->f_mapping); 3225 vm_lock_mapping(mm, vma->vm_file->f_mapping);
3211 } 3226 }
3212 3227
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c764402c464f..8eb7bb40dc40 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -149,7 +149,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
149 unsigned long this_pages; 149 unsigned long this_pages;
150 150
151 next = pmd_addr_end(addr, end); 151 next = pmd_addr_end(addr, end);
152 if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) 152 if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
153 && pmd_none_or_clear_bad(pmd))
153 continue; 154 continue;
154 155
155 /* invoke the mmu notifier if the pmd is populated */ 156 /* invoke the mmu notifier if the pmd is populated */
@@ -158,9 +159,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
158 mmu_notifier_invalidate_range_start(mm, mni_start, end); 159 mmu_notifier_invalidate_range_start(mm, mni_start, end);
159 } 160 }
160 161
161 if (pmd_trans_huge(*pmd)) { 162 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
162 if (next - addr != HPAGE_PMD_SIZE) 163 if (next - addr != HPAGE_PMD_SIZE)
163 split_huge_page_pmd(vma, addr, pmd); 164 split_huge_pmd(vma, pmd, addr);
164 else { 165 else {
165 int nr_ptes = change_huge_pmd(vma, pmd, addr, 166 int nr_ptes = change_huge_pmd(vma, pmd, addr,
166 newprot, prot_numa); 167 newprot, prot_numa);
diff --git a/mm/mremap.c b/mm/mremap.c
index e55b157865d5..d77946a997f7 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -192,25 +192,24 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
192 if (!new_pmd) 192 if (!new_pmd)
193 break; 193 break;
194 if (pmd_trans_huge(*old_pmd)) { 194 if (pmd_trans_huge(*old_pmd)) {
195 int err = 0;
196 if (extent == HPAGE_PMD_SIZE) { 195 if (extent == HPAGE_PMD_SIZE) {
196 bool moved;
197 VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, 197 VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
198 vma); 198 vma);
199 /* See comment in move_ptes() */ 199 /* See comment in move_ptes() */
200 if (need_rmap_locks) 200 if (need_rmap_locks)
201 anon_vma_lock_write(vma->anon_vma); 201 anon_vma_lock_write(vma->anon_vma);
202 err = move_huge_pmd(vma, new_vma, old_addr, 202 moved = move_huge_pmd(vma, new_vma, old_addr,
203 new_addr, old_end, 203 new_addr, old_end,
204 old_pmd, new_pmd); 204 old_pmd, new_pmd);
205 if (need_rmap_locks) 205 if (need_rmap_locks)
206 anon_vma_unlock_write(vma->anon_vma); 206 anon_vma_unlock_write(vma->anon_vma);
207 if (moved) {
208 need_flush = true;
209 continue;
210 }
207 } 211 }
208 if (err > 0) { 212 split_huge_pmd(vma, old_pmd, old_addr);
209 need_flush = true;
210 continue;
211 } else if (!err) {
212 split_huge_page_pmd(vma, old_addr, old_pmd);
213 }
214 VM_BUG_ON(pmd_trans_huge(*old_pmd)); 213 VM_BUG_ON(pmd_trans_huge(*old_pmd));
215 } 214 }
216 if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, 215 if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ce63d603820f..63358d9f9aa9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/vmstat.h> 44#include <linux/vmstat.h>
45#include <linux/mempolicy.h> 45#include <linux/mempolicy.h>
46#include <linux/memremap.h>
46#include <linux/stop_machine.h> 47#include <linux/stop_machine.h>
47#include <linux/sort.h> 48#include <linux/sort.h>
48#include <linux/pfn.h> 49#include <linux/pfn.h>
@@ -222,13 +223,15 @@ static char * const zone_names[MAX_NR_ZONES] = {
222#endif 223#endif
223}; 224};
224 225
225static void free_compound_page(struct page *page);
226compound_page_dtor * const compound_page_dtors[] = { 226compound_page_dtor * const compound_page_dtors[] = {
227 NULL, 227 NULL,
228 free_compound_page, 228 free_compound_page,
229#ifdef CONFIG_HUGETLB_PAGE 229#ifdef CONFIG_HUGETLB_PAGE
230 free_huge_page, 230 free_huge_page,
231#endif 231#endif
232#ifdef CONFIG_TRANSPARENT_HUGEPAGE
233 free_transhuge_page,
234#endif
232}; 235};
233 236
234int min_free_kbytes = 1024; 237int min_free_kbytes = 1024;
@@ -450,7 +453,7 @@ out:
450 * This usage means that zero-order pages may not be compound. 453 * This usage means that zero-order pages may not be compound.
451 */ 454 */
452 455
453static void free_compound_page(struct page *page) 456void free_compound_page(struct page *page)
454{ 457{
455 __free_pages_ok(page, compound_order(page)); 458 __free_pages_ok(page, compound_order(page));
456} 459}
@@ -466,8 +469,10 @@ void prep_compound_page(struct page *page, unsigned int order)
466 for (i = 1; i < nr_pages; i++) { 469 for (i = 1; i < nr_pages; i++) {
467 struct page *p = page + i; 470 struct page *p = page + i;
468 set_page_count(p, 0); 471 set_page_count(p, 0);
472 p->mapping = TAIL_MAPPING;
469 set_compound_head(p, page); 473 set_compound_head(p, page);
470 } 474 }
475 atomic_set(compound_mapcount_ptr(page), -1);
471} 476}
472 477
473#ifdef CONFIG_DEBUG_PAGEALLOC 478#ifdef CONFIG_DEBUG_PAGEALLOC
@@ -732,7 +737,7 @@ static inline int free_pages_check(struct page *page)
732 const char *bad_reason = NULL; 737 const char *bad_reason = NULL;
733 unsigned long bad_flags = 0; 738 unsigned long bad_flags = 0;
734 739
735 if (unlikely(page_mapcount(page))) 740 if (unlikely(atomic_read(&page->_mapcount) != -1))
736 bad_reason = "nonzero mapcount"; 741 bad_reason = "nonzero mapcount";
737 if (unlikely(page->mapping != NULL)) 742 if (unlikely(page->mapping != NULL))
738 bad_reason = "non-NULL mapping"; 743 bad_reason = "non-NULL mapping";
@@ -856,6 +861,27 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
856 ret = 0; 861 ret = 0;
857 goto out; 862 goto out;
858 } 863 }
864 switch (page - head_page) {
865 case 1:
866 /* the first tail page: ->mapping is compound_mapcount() */
867 if (unlikely(compound_mapcount(page))) {
868 bad_page(page, "nonzero compound_mapcount", 0);
869 goto out;
870 }
871 break;
872 case 2:
873 /*
874 * the second tail page: ->mapping is
875 * page_deferred_list().next -- ignore value.
876 */
877 break;
878 default:
879 if (page->mapping != TAIL_MAPPING) {
880 bad_page(page, "corrupted mapping in tail page", 0);
881 goto out;
882 }
883 break;
884 }
859 if (unlikely(!PageTail(page))) { 885 if (unlikely(!PageTail(page))) {
860 bad_page(page, "PageTail not set", 0); 886 bad_page(page, "PageTail not set", 0);
861 goto out; 887 goto out;
@@ -866,6 +892,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
866 } 892 }
867 ret = 0; 893 ret = 0;
868out: 894out:
895 page->mapping = NULL;
869 clear_compound_head(page); 896 clear_compound_head(page);
870 return ret; 897 return ret;
871} 898}
@@ -1329,7 +1356,7 @@ static inline int check_new_page(struct page *page)
1329 const char *bad_reason = NULL; 1356 const char *bad_reason = NULL;
1330 unsigned long bad_flags = 0; 1357 unsigned long bad_flags = 0;
1331 1358
1332 if (unlikely(page_mapcount(page))) 1359 if (unlikely(atomic_read(&page->_mapcount) != -1))
1333 bad_reason = "nonzero mapcount"; 1360 bad_reason = "nonzero mapcount";
1334 if (unlikely(page->mapping != NULL)) 1361 if (unlikely(page->mapping != NULL))
1335 bad_reason = "non-NULL mapping"; 1362 bad_reason = "non-NULL mapping";
@@ -4459,16 +4486,22 @@ static inline unsigned long wait_table_bits(unsigned long size)
4459void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 4486void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4460 unsigned long start_pfn, enum memmap_context context) 4487 unsigned long start_pfn, enum memmap_context context)
4461{ 4488{
4462 pg_data_t *pgdat = NODE_DATA(nid); 4489 struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
4463 unsigned long end_pfn = start_pfn + size; 4490 unsigned long end_pfn = start_pfn + size;
4491 pg_data_t *pgdat = NODE_DATA(nid);
4464 unsigned long pfn; 4492 unsigned long pfn;
4465 struct zone *z;
4466 unsigned long nr_initialised = 0; 4493 unsigned long nr_initialised = 0;
4467 4494
4468 if (highest_memmap_pfn < end_pfn - 1) 4495 if (highest_memmap_pfn < end_pfn - 1)
4469 highest_memmap_pfn = end_pfn - 1; 4496 highest_memmap_pfn = end_pfn - 1;
4470 4497
4471 z = &pgdat->node_zones[zone]; 4498 /*
4499 * Honor reservation requested by the driver for this ZONE_DEVICE
4500 * memory
4501 */
4502 if (altmap && start_pfn == altmap->base_pfn)
4503 start_pfn += altmap->reserve;
4504
4472 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4505 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4473 /* 4506 /*
4474 * There can be holes in boot-time mem_map[]s 4507 * There can be holes in boot-time mem_map[]s
diff --git a/mm/page_idle.c b/mm/page_idle.c
index d5dd79041484..4ea9c4ef5146 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -55,25 +55,26 @@ static int page_idle_clear_pte_refs_one(struct page *page,
55 unsigned long addr, void *arg) 55 unsigned long addr, void *arg)
56{ 56{
57 struct mm_struct *mm = vma->vm_mm; 57 struct mm_struct *mm = vma->vm_mm;
58 spinlock_t *ptl;
59 pmd_t *pmd; 58 pmd_t *pmd;
60 pte_t *pte; 59 pte_t *pte;
60 spinlock_t *ptl;
61 bool referenced = false; 61 bool referenced = false;
62 62
63 if (unlikely(PageTransHuge(page))) { 63 if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl))
64 pmd = page_check_address_pmd(page, mm, addr, 64 return SWAP_AGAIN;
65 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); 65
66 if (pmd) { 66 if (pte) {
67 referenced = pmdp_clear_young_notify(vma, addr, pmd); 67 referenced = ptep_clear_young_notify(vma, addr, pte);
68 spin_unlock(ptl); 68 pte_unmap(pte);
69 } 69 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
70 referenced = pmdp_clear_young_notify(vma, addr, pmd);
70 } else { 71 } else {
71 pte = page_check_address(page, mm, addr, &ptl, 0); 72 /* unexpected pmd-mapped page? */
72 if (pte) { 73 WARN_ON_ONCE(1);
73 referenced = ptep_clear_young_notify(vma, addr, pte);
74 pte_unmap_unlock(pte, ptl);
75 }
76 } 74 }
75
76 spin_unlock(ptl);
77
77 if (referenced) { 78 if (referenced) {
78 clear_page_idle(page); 79 clear_page_idle(page);
79 /* 80 /*
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5e139fec6c6c..92c4c36501e7 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -196,8 +196,10 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
196{ 196{
197 unsigned long pfn; 197 unsigned long pfn;
198 struct page *page; 198 struct page *page;
199 BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); 199
200 BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); 200 BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
201 BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
202
201 for (pfn = start_pfn; 203 for (pfn = start_pfn;
202 pfn < end_pfn; 204 pfn < end_pfn;
203 pfn += pageblock_nr_pages) { 205 pfn += pageblock_nr_pages) {
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 29f2f8b853ae..207244489a68 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
58 if (!walk->pte_entry) 58 if (!walk->pte_entry)
59 continue; 59 continue;
60 60
61 split_huge_page_pmd_mm(walk->mm, addr, pmd); 61 split_huge_pmd(walk->vma, pmd, addr);
62 if (pmd_trans_unstable(pmd)) 62 if (pmd_trans_unstable(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4c681baff363..9d4767698a1c 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -132,25 +132,13 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
132{ 132{
133 pmd_t pmd; 133 pmd_t pmd;
134 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 134 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
135 VM_BUG_ON(!pmd_trans_huge(*pmdp)); 135 VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
136 pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); 136 pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
137 flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 137 flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
138 return pmd; 138 return pmd;
139} 139}
140#endif 140#endif
141 141
142#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
143void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
144 pmd_t *pmdp)
145{
146 pmd_t pmd = pmd_mksplitting(*pmdp);
147 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
148 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
149 /* tlb flush only to serialize against gup-fast */
150 flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
151}
152#endif
153
154#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT 142#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
155void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 143void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
156 pgtable_t pgtable) 144 pgtable_t pgtable)
diff --git a/mm/rmap.c b/mm/rmap.c
index 622756c16ac8..79f3bf047f38 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,21 +23,22 @@
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * mm->mmap_sem 24 * mm->mmap_sem
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * mapping->i_mmap_rwsem 26 * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
27 * anon_vma->rwsem 27 * mapping->i_mmap_rwsem
28 * mm->page_table_lock or pte_lock 28 * anon_vma->rwsem
29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 29 * mm->page_table_lock or pte_lock
30 * swap_lock (in swap_duplicate, swap_info_get) 30 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
31 * mmlist_lock (in mmput, drain_mmlist and others) 31 * swap_lock (in swap_duplicate, swap_info_get)
32 * mapping->private_lock (in __set_page_dirty_buffers) 32 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) 33 * mapping->private_lock (in __set_page_dirty_buffers)
34 * mapping->tree_lock (widely used) 34 * mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
35 * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 35 * mapping->tree_lock (widely used)
36 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) 36 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
37 * sb_lock (within inode_lock in fs/fs-writeback.c) 37 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
38 * mapping->tree_lock (widely used, in set_page_dirty, 38 * sb_lock (within inode_lock in fs/fs-writeback.c)
39 * in arch-dependent flush_dcache_mmap_lock, 39 * mapping->tree_lock (widely used, in set_page_dirty,
40 * within bdi.wb->list_lock in __sync_single_inode) 40 * in arch-dependent flush_dcache_mmap_lock,
41 * within bdi.wb->list_lock in __sync_single_inode)
41 * 42 *
42 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) 43 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
43 * ->tasklist_lock 44 * ->tasklist_lock
@@ -567,27 +568,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
567 anon_vma_unlock_read(anon_vma); 568 anon_vma_unlock_read(anon_vma);
568} 569}
569 570
570/*
571 * At what user virtual address is page expected in @vma?
572 */
573static inline unsigned long
574__vma_address(struct page *page, struct vm_area_struct *vma)
575{
576 pgoff_t pgoff = page_to_pgoff(page);
577 return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
578}
579
580inline unsigned long
581vma_address(struct page *page, struct vm_area_struct *vma)
582{
583 unsigned long address = __vma_address(page, vma);
584
585 /* page should be within @vma mapping range */
586 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
587
588 return address;
589}
590
591#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 571#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
592static void percpu_flush_tlb_batch_pages(void *data) 572static void percpu_flush_tlb_batch_pages(void *data)
593{ 573{
@@ -819,6 +799,96 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
819 return 1; 799 return 1;
820} 800}
821 801
802#ifdef CONFIG_TRANSPARENT_HUGEPAGE
803/*
804 * Check that @page is mapped at @address into @mm. In contrast to
805 * page_check_address(), this function can handle transparent huge pages.
806 *
807 * On success returns true with pte mapped and locked. For PMD-mapped
808 * transparent huge pages *@ptep is set to NULL.
809 */
810bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
811 unsigned long address, pmd_t **pmdp,
812 pte_t **ptep, spinlock_t **ptlp)
813{
814 pgd_t *pgd;
815 pud_t *pud;
816 pmd_t *pmd;
817 pte_t *pte;
818 spinlock_t *ptl;
819
820 if (unlikely(PageHuge(page))) {
821 /* when pud is not present, pte will be NULL */
822 pte = huge_pte_offset(mm, address);
823 if (!pte)
824 return false;
825
826 ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
827 pmd = NULL;
828 goto check_pte;
829 }
830
831 pgd = pgd_offset(mm, address);
832 if (!pgd_present(*pgd))
833 return false;
834 pud = pud_offset(pgd, address);
835 if (!pud_present(*pud))
836 return false;
837 pmd = pmd_offset(pud, address);
838
839 if (pmd_trans_huge(*pmd)) {
840 ptl = pmd_lock(mm, pmd);
841 if (!pmd_present(*pmd))
842 goto unlock_pmd;
843 if (unlikely(!pmd_trans_huge(*pmd))) {
844 spin_unlock(ptl);
845 goto map_pte;
846 }
847
848 if (pmd_page(*pmd) != page)
849 goto unlock_pmd;
850
851 pte = NULL;
852 goto found;
853unlock_pmd:
854 spin_unlock(ptl);
855 return false;
856 } else {
857 pmd_t pmde = *pmd;
858
859 barrier();
860 if (!pmd_present(pmde) || pmd_trans_huge(pmde))
861 return false;
862 }
863map_pte:
864 pte = pte_offset_map(pmd, address);
865 if (!pte_present(*pte)) {
866 pte_unmap(pte);
867 return false;
868 }
869
870 ptl = pte_lockptr(mm, pmd);
871check_pte:
872 spin_lock(ptl);
873
874 if (!pte_present(*pte)) {
875 pte_unmap_unlock(pte, ptl);
876 return false;
877 }
878
879 /* THP can be referenced by any subpage */
880 if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
881 pte_unmap_unlock(pte, ptl);
882 return false;
883 }
884found:
885 *ptep = pte;
886 *pmdp = pmd;
887 *ptlp = ptl;
888 return true;
889}
890#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
891
822struct page_referenced_arg { 892struct page_referenced_arg {
823 int mapcount; 893 int mapcount;
824 int referenced; 894 int referenced;
@@ -832,49 +902,24 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
832 unsigned long address, void *arg) 902 unsigned long address, void *arg)
833{ 903{
834 struct mm_struct *mm = vma->vm_mm; 904 struct mm_struct *mm = vma->vm_mm;
905 struct page_referenced_arg *pra = arg;
906 pmd_t *pmd;
907 pte_t *pte;
835 spinlock_t *ptl; 908 spinlock_t *ptl;
836 int referenced = 0; 909 int referenced = 0;
837 struct page_referenced_arg *pra = arg;
838
839 if (unlikely(PageTransHuge(page))) {
840 pmd_t *pmd;
841 910
842 /* 911 if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl))
843 * rmap might return false positives; we must filter 912 return SWAP_AGAIN;
844 * these out using page_check_address_pmd().
845 */
846 pmd = page_check_address_pmd(page, mm, address,
847 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
848 if (!pmd)
849 return SWAP_AGAIN;
850
851 if (vma->vm_flags & VM_LOCKED) {
852 spin_unlock(ptl);
853 pra->vm_flags |= VM_LOCKED;
854 return SWAP_FAIL; /* To break the loop */
855 }
856 913
857 /* go ahead even if the pmd is pmd_trans_splitting() */ 914 if (vma->vm_flags & VM_LOCKED) {
858 if (pmdp_clear_flush_young_notify(vma, address, pmd)) 915 if (pte)
859 referenced++; 916 pte_unmap(pte);
860 spin_unlock(ptl); 917 spin_unlock(ptl);
861 } else { 918 pra->vm_flags |= VM_LOCKED;
862 pte_t *pte; 919 return SWAP_FAIL; /* To break the loop */
863 920 }
864 /*
865 * rmap might return false positives; we must filter
866 * these out using page_check_address().
867 */
868 pte = page_check_address(page, mm, address, &ptl, 0);
869 if (!pte)
870 return SWAP_AGAIN;
871
872 if (vma->vm_flags & VM_LOCKED) {
873 pte_unmap_unlock(pte, ptl);
874 pra->vm_flags |= VM_LOCKED;
875 return SWAP_FAIL; /* To break the loop */
876 }
877 921
922 if (pte) {
878 if (ptep_clear_flush_young_notify(vma, address, pte)) { 923 if (ptep_clear_flush_young_notify(vma, address, pte)) {
879 /* 924 /*
880 * Don't treat a reference through a sequentially read 925 * Don't treat a reference through a sequentially read
@@ -886,8 +931,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
886 if (likely(!(vma->vm_flags & VM_SEQ_READ))) 931 if (likely(!(vma->vm_flags & VM_SEQ_READ)))
887 referenced++; 932 referenced++;
888 } 933 }
889 pte_unmap_unlock(pte, ptl); 934 pte_unmap(pte);
935 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
936 if (pmdp_clear_flush_young_notify(vma, address, pmd))
937 referenced++;
938 } else {
939 /* unexpected pmd-mapped page? */
940 WARN_ON_ONCE(1);
890 } 941 }
942 spin_unlock(ptl);
891 943
892 if (referenced) 944 if (referenced)
893 clear_page_idle(page); 945 clear_page_idle(page);
@@ -935,7 +987,7 @@ int page_referenced(struct page *page,
935 int ret; 987 int ret;
936 int we_locked = 0; 988 int we_locked = 0;
937 struct page_referenced_arg pra = { 989 struct page_referenced_arg pra = {
938 .mapcount = page_mapcount(page), 990 .mapcount = total_mapcount(page),
939 .memcg = memcg, 991 .memcg = memcg,
940 }; 992 };
941 struct rmap_walk_control rwc = { 993 struct rmap_walk_control rwc = {
@@ -1124,7 +1176,7 @@ static void __page_check_anon_rmap(struct page *page,
1124 * over the call to page_add_new_anon_rmap. 1176 * over the call to page_add_new_anon_rmap.
1125 */ 1177 */
1126 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); 1178 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
1127 BUG_ON(page->index != linear_page_index(vma, address)); 1179 BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
1128#endif 1180#endif
1129} 1181}
1130 1182
@@ -1133,6 +1185,7 @@ static void __page_check_anon_rmap(struct page *page,
1133 * @page: the page to add the mapping to 1185 * @page: the page to add the mapping to
1134 * @vma: the vm area in which the mapping is added 1186 * @vma: the vm area in which the mapping is added
1135 * @address: the user virtual address mapped 1187 * @address: the user virtual address mapped
1188 * @compound: charge the page as compound or small page
1136 * 1189 *
1137 * The caller needs to hold the pte lock, and the page must be locked in 1190 * The caller needs to hold the pte lock, and the page must be locked in
1138 * the anon_vma case: to serialize mapping,index checking after setting, 1191 * the anon_vma case: to serialize mapping,index checking after setting,
@@ -1140,9 +1193,9 @@ static void __page_check_anon_rmap(struct page *page,
1140 * (but PageKsm is never downgraded to PageAnon). 1193 * (but PageKsm is never downgraded to PageAnon).
1141 */ 1194 */
1142void page_add_anon_rmap(struct page *page, 1195void page_add_anon_rmap(struct page *page,
1143 struct vm_area_struct *vma, unsigned long address) 1196 struct vm_area_struct *vma, unsigned long address, bool compound)
1144{ 1197{
1145 do_page_add_anon_rmap(page, vma, address, 0); 1198 do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
1146} 1199}
1147 1200
1148/* 1201/*
@@ -1151,29 +1204,44 @@ void page_add_anon_rmap(struct page *page,
1151 * Everybody else should continue to use page_add_anon_rmap above. 1204 * Everybody else should continue to use page_add_anon_rmap above.
1152 */ 1205 */
1153void do_page_add_anon_rmap(struct page *page, 1206void do_page_add_anon_rmap(struct page *page,
1154 struct vm_area_struct *vma, unsigned long address, int exclusive) 1207 struct vm_area_struct *vma, unsigned long address, int flags)
1155{ 1208{
1156 int first = atomic_inc_and_test(&page->_mapcount); 1209 bool compound = flags & RMAP_COMPOUND;
1210 bool first;
1211
1212 if (compound) {
1213 atomic_t *mapcount;
1214 VM_BUG_ON_PAGE(!PageLocked(page), page);
1215 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1216 mapcount = compound_mapcount_ptr(page);
1217 first = atomic_inc_and_test(mapcount);
1218 } else {
1219 first = atomic_inc_and_test(&page->_mapcount);
1220 }
1221
1157 if (first) { 1222 if (first) {
1223 int nr = compound ? hpage_nr_pages(page) : 1;
1158 /* 1224 /*
1159 * We use the irq-unsafe __{inc|mod}_zone_page_stat because 1225 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1160 * these counters are not modified in interrupt context, and 1226 * these counters are not modified in interrupt context, and
1161 * pte lock(a spinlock) is held, which implies preemption 1227 * pte lock(a spinlock) is held, which implies preemption
1162 * disabled. 1228 * disabled.
1163 */ 1229 */
1164 if (PageTransHuge(page)) 1230 if (compound) {
1165 __inc_zone_page_state(page, 1231 __inc_zone_page_state(page,
1166 NR_ANON_TRANSPARENT_HUGEPAGES); 1232 NR_ANON_TRANSPARENT_HUGEPAGES);
1167 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, 1233 }
1168 hpage_nr_pages(page)); 1234 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
1169 } 1235 }
1170 if (unlikely(PageKsm(page))) 1236 if (unlikely(PageKsm(page)))
1171 return; 1237 return;
1172 1238
1173 VM_BUG_ON_PAGE(!PageLocked(page), page); 1239 VM_BUG_ON_PAGE(!PageLocked(page), page);
1240
1174 /* address might be in next vma when migration races vma_adjust */ 1241 /* address might be in next vma when migration races vma_adjust */
1175 if (first) 1242 if (first)
1176 __page_set_anon_rmap(page, vma, address, exclusive); 1243 __page_set_anon_rmap(page, vma, address,
1244 flags & RMAP_EXCLUSIVE);
1177 else 1245 else
1178 __page_check_anon_rmap(page, vma, address); 1246 __page_check_anon_rmap(page, vma, address);
1179} 1247}
@@ -1183,21 +1251,31 @@ void do_page_add_anon_rmap(struct page *page,
1183 * @page: the page to add the mapping to 1251 * @page: the page to add the mapping to
1184 * @vma: the vm area in which the mapping is added 1252 * @vma: the vm area in which the mapping is added
1185 * @address: the user virtual address mapped 1253 * @address: the user virtual address mapped
1254 * @compound: charge the page as compound or small page
1186 * 1255 *
1187 * Same as page_add_anon_rmap but must only be called on *new* pages. 1256 * Same as page_add_anon_rmap but must only be called on *new* pages.
1188 * This means the inc-and-test can be bypassed. 1257 * This means the inc-and-test can be bypassed.
1189 * Page does not have to be locked. 1258 * Page does not have to be locked.
1190 */ 1259 */
1191void page_add_new_anon_rmap(struct page *page, 1260void page_add_new_anon_rmap(struct page *page,
1192 struct vm_area_struct *vma, unsigned long address) 1261 struct vm_area_struct *vma, unsigned long address, bool compound)
1193{ 1262{
1263 int nr = compound ? hpage_nr_pages(page) : 1;
1264
1194 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); 1265 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
1195 SetPageSwapBacked(page); 1266 SetPageSwapBacked(page);
1196 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 1267 if (compound) {
1197 if (PageTransHuge(page)) 1268 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1269 /* increment count (starts at -1) */
1270 atomic_set(compound_mapcount_ptr(page), 0);
1198 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1271 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1199 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, 1272 } else {
1200 hpage_nr_pages(page)); 1273 /* Anon THP always mapped first with PMD */
1274 VM_BUG_ON_PAGE(PageTransCompound(page), page);
1275 /* increment count (starts at -1) */
1276 atomic_set(&page->_mapcount, 0);
1277 }
1278 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
1201 __page_set_anon_rmap(page, vma, address, 1); 1279 __page_set_anon_rmap(page, vma, address, 1);
1202} 1280}
1203 1281
@@ -1225,12 +1303,15 @@ static void page_remove_file_rmap(struct page *page)
1225 1303
1226 memcg = mem_cgroup_begin_page_stat(page); 1304 memcg = mem_cgroup_begin_page_stat(page);
1227 1305
1228 /* page still mapped by someone else? */ 1306 /* Hugepages are not counted in NR_FILE_MAPPED for now. */
1229 if (!atomic_add_negative(-1, &page->_mapcount)) 1307 if (unlikely(PageHuge(page))) {
1308 /* hugetlb pages are always mapped with pmds */
1309 atomic_dec(compound_mapcount_ptr(page));
1230 goto out; 1310 goto out;
1311 }
1231 1312
1232 /* Hugepages are not counted in NR_FILE_MAPPED for now. */ 1313 /* page still mapped by someone else? */
1233 if (unlikely(PageHuge(page))) 1314 if (!atomic_add_negative(-1, &page->_mapcount))
1234 goto out; 1315 goto out;
1235 1316
1236 /* 1317 /*
@@ -1247,41 +1328,79 @@ out:
1247 mem_cgroup_end_page_stat(memcg); 1328 mem_cgroup_end_page_stat(memcg);
1248} 1329}
1249 1330
1331static void page_remove_anon_compound_rmap(struct page *page)
1332{
1333 int i, nr;
1334
1335 if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1336 return;
1337
1338 /* Hugepages are not counted in NR_ANON_PAGES for now. */
1339 if (unlikely(PageHuge(page)))
1340 return;
1341
1342 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1343 return;
1344
1345 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1346
1347 if (TestClearPageDoubleMap(page)) {
1348 /*
1349 * Subpages can be mapped with PTEs too. Check how many of
1350 * them are still mapped.
1351 */
1352 for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
1353 if (atomic_add_negative(-1, &page[i]._mapcount))
1354 nr++;
1355 }
1356 } else {
1357 nr = HPAGE_PMD_NR;
1358 }
1359
1360 if (unlikely(PageMlocked(page)))
1361 clear_page_mlock(page);
1362
1363 if (nr) {
1364 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
1365 deferred_split_huge_page(page);
1366 }
1367}
1368
1250/** 1369/**
1251 * page_remove_rmap - take down pte mapping from a page 1370 * page_remove_rmap - take down pte mapping from a page
1252 * @page: page to remove mapping from 1371 * @page: page to remove mapping from
1372 * @compound: uncharge the page as compound or small page
1253 * 1373 *
1254 * The caller needs to hold the pte lock. 1374 * The caller needs to hold the pte lock.
1255 */ 1375 */
1256void page_remove_rmap(struct page *page) 1376void page_remove_rmap(struct page *page, bool compound)
1257{ 1377{
1258 if (!PageAnon(page)) { 1378 if (!PageAnon(page)) {
1379 VM_BUG_ON_PAGE(compound && !PageHuge(page), page);
1259 page_remove_file_rmap(page); 1380 page_remove_file_rmap(page);
1260 return; 1381 return;
1261 } 1382 }
1262 1383
1384 if (compound)
1385 return page_remove_anon_compound_rmap(page);
1386
1263 /* page still mapped by someone else? */ 1387 /* page still mapped by someone else? */
1264 if (!atomic_add_negative(-1, &page->_mapcount)) 1388 if (!atomic_add_negative(-1, &page->_mapcount))
1265 return; 1389 return;
1266 1390
1267 /* Hugepages are not counted in NR_ANON_PAGES for now. */
1268 if (unlikely(PageHuge(page)))
1269 return;
1270
1271 /* 1391 /*
1272 * We use the irq-unsafe __{inc|mod}_zone_page_stat because 1392 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1273 * these counters are not modified in interrupt context, and 1393 * these counters are not modified in interrupt context, and
1274 * pte lock(a spinlock) is held, which implies preemption disabled. 1394 * pte lock(a spinlock) is held, which implies preemption disabled.
1275 */ 1395 */
1276 if (PageTransHuge(page)) 1396 __dec_zone_page_state(page, NR_ANON_PAGES);
1277 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1278
1279 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
1280 -hpage_nr_pages(page));
1281 1397
1282 if (unlikely(PageMlocked(page))) 1398 if (unlikely(PageMlocked(page)))
1283 clear_page_mlock(page); 1399 clear_page_mlock(page);
1284 1400
1401 if (PageTransCompound(page))
1402 deferred_split_huge_page(compound_head(page));
1403
1285 /* 1404 /*
1286 * It would be tidy to reset the PageAnon mapping here, 1405 * It would be tidy to reset the PageAnon mapping here,
1287 * but that might overwrite a racing page_add_anon_rmap 1406 * but that might overwrite a racing page_add_anon_rmap
@@ -1293,6 +1412,11 @@ void page_remove_rmap(struct page *page)
1293 */ 1412 */
1294} 1413}
1295 1414
1415struct rmap_private {
1416 enum ttu_flags flags;
1417 int lazyfreed;
1418};
1419
1296/* 1420/*
1297 * @arg: enum ttu_flags will be passed to this argument 1421 * @arg: enum ttu_flags will be passed to this argument
1298 */ 1422 */
@@ -1304,7 +1428,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1304 pte_t pteval; 1428 pte_t pteval;
1305 spinlock_t *ptl; 1429 spinlock_t *ptl;
1306 int ret = SWAP_AGAIN; 1430 int ret = SWAP_AGAIN;
1307 enum ttu_flags flags = (enum ttu_flags)arg; 1431 struct rmap_private *rp = arg;
1432 enum ttu_flags flags = rp->flags;
1308 1433
1309 /* munlock has nothing to gain from examining un-locked vmas */ 1434 /* munlock has nothing to gain from examining un-locked vmas */
1310 if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) 1435 if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
@@ -1396,6 +1521,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1396 * See handle_pte_fault() ... 1521 * See handle_pte_fault() ...
1397 */ 1522 */
1398 VM_BUG_ON_PAGE(!PageSwapCache(page), page); 1523 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
1524
1525 if (!PageDirty(page) && (flags & TTU_LZFREE)) {
1526 /* It's a freeable page by MADV_FREE */
1527 dec_mm_counter(mm, MM_ANONPAGES);
1528 rp->lazyfreed++;
1529 goto discard;
1530 }
1531
1399 if (swap_duplicate(entry) < 0) { 1532 if (swap_duplicate(entry) < 0) {
1400 set_pte_at(mm, address, pte, pteval); 1533 set_pte_at(mm, address, pte, pteval);
1401 ret = SWAP_FAIL; 1534 ret = SWAP_FAIL;
@@ -1416,7 +1549,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1416 } else 1549 } else
1417 dec_mm_counter(mm, mm_counter_file(page)); 1550 dec_mm_counter(mm, mm_counter_file(page));
1418 1551
1419 page_remove_rmap(page); 1552discard:
1553 page_remove_rmap(page, PageHuge(page));
1420 page_cache_release(page); 1554 page_cache_release(page);
1421 1555
1422out_unmap: 1556out_unmap:
@@ -1468,9 +1602,14 @@ static int page_not_mapped(struct page *page)
1468int try_to_unmap(struct page *page, enum ttu_flags flags) 1602int try_to_unmap(struct page *page, enum ttu_flags flags)
1469{ 1603{
1470 int ret; 1604 int ret;
1605 struct rmap_private rp = {
1606 .flags = flags,
1607 .lazyfreed = 0,
1608 };
1609
1471 struct rmap_walk_control rwc = { 1610 struct rmap_walk_control rwc = {
1472 .rmap_one = try_to_unmap_one, 1611 .rmap_one = try_to_unmap_one,
1473 .arg = (void *)flags, 1612 .arg = &rp,
1474 .done = page_not_mapped, 1613 .done = page_not_mapped,
1475 .anon_lock = page_lock_anon_vma_read, 1614 .anon_lock = page_lock_anon_vma_read,
1476 }; 1615 };
@@ -1490,8 +1629,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1490 1629
1491 ret = rmap_walk(page, &rwc); 1630 ret = rmap_walk(page, &rwc);
1492 1631
1493 if (ret != SWAP_MLOCK && !page_mapped(page)) 1632 if (ret != SWAP_MLOCK && !page_mapped(page)) {
1494 ret = SWAP_SUCCESS; 1633 ret = SWAP_SUCCESS;
1634 if (rp.lazyfreed && !PageDirty(page))
1635 ret = SWAP_LZFREE;
1636 }
1495 return ret; 1637 return ret;
1496} 1638}
1497 1639
@@ -1513,9 +1655,14 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1513int try_to_munlock(struct page *page) 1655int try_to_munlock(struct page *page)
1514{ 1656{
1515 int ret; 1657 int ret;
1658 struct rmap_private rp = {
1659 .flags = TTU_MUNLOCK,
1660 .lazyfreed = 0,
1661 };
1662
1516 struct rmap_walk_control rwc = { 1663 struct rmap_walk_control rwc = {
1517 .rmap_one = try_to_unmap_one, 1664 .rmap_one = try_to_unmap_one,
1518 .arg = (void *)TTU_MUNLOCK, 1665 .arg = &rp,
1519 .done = page_not_mapped, 1666 .done = page_not_mapped,
1520 .anon_lock = page_lock_anon_vma_read, 1667 .anon_lock = page_lock_anon_vma_read,
1521 1668
@@ -1698,7 +1845,7 @@ void hugepage_add_anon_rmap(struct page *page,
1698 BUG_ON(!PageLocked(page)); 1845 BUG_ON(!PageLocked(page));
1699 BUG_ON(!anon_vma); 1846 BUG_ON(!anon_vma);
1700 /* address might be in next vma when migration races vma_adjust */ 1847 /* address might be in next vma when migration races vma_adjust */
1701 first = atomic_inc_and_test(&page->_mapcount); 1848 first = atomic_inc_and_test(compound_mapcount_ptr(page));
1702 if (first) 1849 if (first)
1703 __hugepage_set_anon_rmap(page, vma, address, 0); 1850 __hugepage_set_anon_rmap(page, vma, address, 0);
1704} 1851}
@@ -1707,7 +1854,7 @@ void hugepage_add_new_anon_rmap(struct page *page,
1707 struct vm_area_struct *vma, unsigned long address) 1854 struct vm_area_struct *vma, unsigned long address)
1708{ 1855{
1709 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1856 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1710 atomic_set(&page->_mapcount, 0); 1857 atomic_set(compound_mapcount_ptr(page), 0);
1711 __hugepage_set_anon_rmap(page, vma, address, 1); 1858 __hugepage_set_anon_rmap(page, vma, address, 1);
1712} 1859}
1713#endif /* CONFIG_HUGETLB_PAGE */ 1860#endif /* CONFIG_HUGETLB_PAGE */
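As a reading aid only (not part of the patch): the page_check_address_transhuge() helper added above gives callers such as page_referenced_one() and page_idle_clear_pte_refs_one() one common calling convention. A minimal, hypothetical caller sketch, assuming the kernel-internal API exactly as introduced in these hunks:

/* Hypothetical caller sketch -- not part of the patch. On success the
 * lock returned in *ptl is held; *pte is valid for a PTE mapping and
 * NULL for a PMD-mapped THP.
 */
static int example_rmap_one(struct page *page, struct vm_area_struct *vma,
			    unsigned long address, void *arg)
{
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;

	if (!page_check_address_transhuge(page, vma->vm_mm, address,
					  &pmd, &pte, &ptl))
		return SWAP_AGAIN;	/* page not mapped at this address */

	if (pte)
		pte_unmap(pte);		/* PTE-mapped subpage */
	/* else: PMD-mapped THP, only *pmd is meaningful */

	spin_unlock(ptl);		/* drop the pte or pmd lock */
	return SWAP_AGAIN;
}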
diff --git a/mm/shmem.c b/mm/shmem.c
index 970ff5b80853..b98e1011858c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -810,7 +810,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
810 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 810 * the shmem_swaplist_mutex which might hold up shmem_writepage().
811 * Charged back to the user (not to caller) when swap account is used. 811 * Charged back to the user (not to caller) when swap account is used.
812 */ 812 */
813 error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg); 813 error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
814 false);
814 if (error) 815 if (error)
815 goto out; 816 goto out;
816 /* No radix_tree_preload: swap entry keeps a place for page in tree */ 817 /* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -833,9 +834,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
833 if (error) { 834 if (error) {
834 if (error != -ENOMEM) 835 if (error != -ENOMEM)
835 error = 0; 836 error = 0;
836 mem_cgroup_cancel_charge(page, memcg); 837 mem_cgroup_cancel_charge(page, memcg, false);
837 } else 838 } else
838 mem_cgroup_commit_charge(page, memcg, true); 839 mem_cgroup_commit_charge(page, memcg, true, false);
839out: 840out:
840 unlock_page(page); 841 unlock_page(page);
841 page_cache_release(page); 842 page_cache_release(page);
@@ -1085,7 +1086,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1085 copy_highpage(newpage, oldpage); 1086 copy_highpage(newpage, oldpage);
1086 flush_dcache_page(newpage); 1087 flush_dcache_page(newpage);
1087 1088
1088 __set_page_locked(newpage); 1089 __SetPageLocked(newpage);
1089 SetPageUptodate(newpage); 1090 SetPageUptodate(newpage);
1090 SetPageSwapBacked(newpage); 1091 SetPageSwapBacked(newpage);
1091 set_page_private(newpage, swap_index); 1092 set_page_private(newpage, swap_index);
@@ -1218,7 +1219,8 @@ repeat:
1218 goto failed; 1219 goto failed;
1219 } 1220 }
1220 1221
1221 error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); 1222 error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
1223 false);
1222 if (!error) { 1224 if (!error) {
1223 error = shmem_add_to_page_cache(page, mapping, index, 1225 error = shmem_add_to_page_cache(page, mapping, index,
1224 swp_to_radix_entry(swap)); 1226 swp_to_radix_entry(swap));
@@ -1235,14 +1237,14 @@ repeat:
1235 * "repeat": reading a hole and writing should succeed. 1237 * "repeat": reading a hole and writing should succeed.
1236 */ 1238 */
1237 if (error) { 1239 if (error) {
1238 mem_cgroup_cancel_charge(page, memcg); 1240 mem_cgroup_cancel_charge(page, memcg, false);
1239 delete_from_swap_cache(page); 1241 delete_from_swap_cache(page);
1240 } 1242 }
1241 } 1243 }
1242 if (error) 1244 if (error)
1243 goto failed; 1245 goto failed;
1244 1246
1245 mem_cgroup_commit_charge(page, memcg, true); 1247 mem_cgroup_commit_charge(page, memcg, true, false);
1246 1248
1247 spin_lock(&info->lock); 1249 spin_lock(&info->lock);
1248 info->swapped--; 1250 info->swapped--;
@@ -1277,11 +1279,12 @@ repeat:
1277 } 1279 }
1278 1280
1279 __SetPageSwapBacked(page); 1281 __SetPageSwapBacked(page);
1280 __set_page_locked(page); 1282 __SetPageLocked(page);
1281 if (sgp == SGP_WRITE) 1283 if (sgp == SGP_WRITE)
1282 __SetPageReferenced(page); 1284 __SetPageReferenced(page);
1283 1285
1284 error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); 1286 error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
1287 false);
1285 if (error) 1288 if (error)
1286 goto decused; 1289 goto decused;
1287 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); 1290 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
@@ -1291,10 +1294,10 @@ repeat:
1291 radix_tree_preload_end(); 1294 radix_tree_preload_end();
1292 } 1295 }
1293 if (error) { 1296 if (error) {
1294 mem_cgroup_cancel_charge(page, memcg); 1297 mem_cgroup_cancel_charge(page, memcg, false);
1295 goto decused; 1298 goto decused;
1296 } 1299 }
1297 mem_cgroup_commit_charge(page, memcg, false); 1300 mem_cgroup_commit_charge(page, memcg, false, false);
1298 lru_cache_add_anon(page); 1301 lru_cache_add_anon(page);
1299 1302
1300 spin_lock(&info->lock); 1303 spin_lock(&info->lock);
diff --git a/mm/slub.c b/mm/slub.c
index 2d0e610d195a..b21fd24b08b1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
338 */ 338 */
339static __always_inline void slab_lock(struct page *page) 339static __always_inline void slab_lock(struct page *page)
340{ 340{
341 VM_BUG_ON_PAGE(PageTail(page), page);
341 bit_spin_lock(PG_locked, &page->flags); 342 bit_spin_lock(PG_locked, &page->flags);
342} 343}
343 344
344static __always_inline void slab_unlock(struct page *page) 345static __always_inline void slab_unlock(struct page *page)
345{ 346{
347 VM_BUG_ON_PAGE(PageTail(page), page);
346 __bit_spin_unlock(PG_locked, &page->flags); 348 __bit_spin_unlock(PG_locked, &page->flags);
347} 349}
348 350
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 4cba9c2783a1..b60802b3e5ea 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -20,6 +20,7 @@
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/mmzone.h> 21#include <linux/mmzone.h>
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/memremap.h>
23#include <linux/highmem.h> 24#include <linux/highmem.h>
24#include <linux/slab.h> 25#include <linux/slab.h>
25#include <linux/spinlock.h> 26#include <linux/spinlock.h>
@@ -70,7 +71,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
70} 71}
71 72
72/* need to make sure size is all the same during early stage */ 73/* need to make sure size is all the same during early stage */
73void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) 74static void * __meminit alloc_block_buf(unsigned long size, int node)
74{ 75{
75 void *ptr; 76 void *ptr;
76 77
@@ -87,6 +88,77 @@ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
87 return ptr; 88 return ptr;
88} 89}
89 90
91static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
92{
93 return altmap->base_pfn + altmap->reserve + altmap->alloc
94 + altmap->align;
95}
96
97static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
98{
99 unsigned long allocated = altmap->alloc + altmap->align;
100
101 if (altmap->free > allocated)
102 return altmap->free - allocated;
103 return 0;
104}
105
106/**
107 * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation
108 * @altmap - reserved page pool for the allocation
109 * @nr_pfns - size (in pages) of the allocation
110 *
111 * Allocations are aligned to the size of the request
112 */
113static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap,
114 unsigned long nr_pfns)
115{
116 unsigned long pfn = vmem_altmap_next_pfn(altmap);
117 unsigned long nr_align;
118
119 nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
120 nr_align = ALIGN(pfn, nr_align) - pfn;
121
122 if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
123 return ULONG_MAX;
124 altmap->alloc += nr_pfns;
125 altmap->align += nr_align;
126 return pfn + nr_align;
127}
128
129static void * __meminit altmap_alloc_block_buf(unsigned long size,
130 struct vmem_altmap *altmap)
131{
132 unsigned long pfn, nr_pfns;
133 void *ptr;
134
135 if (size & ~PAGE_MASK) {
136 pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
137 __func__, size);
138 return NULL;
139 }
140
141 nr_pfns = size >> PAGE_SHIFT;
142 pfn = vmem_altmap_alloc(altmap, nr_pfns);
143 if (pfn < ULONG_MAX)
144 ptr = __va(__pfn_to_phys(pfn));
145 else
146 ptr = NULL;
147 pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
148 __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
149
150 return ptr;
151}
152
153/* need to make sure size is all the same during early stage */
154void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node,
155 struct vmem_altmap *altmap)
156{
157 if (altmap)
158 return altmap_alloc_block_buf(size, altmap);
159 return alloc_block_buf(size, node);
160}
161
90void __meminit vmemmap_verify(pte_t *pte, int node, 162void __meminit vmemmap_verify(pte_t *pte, int node,
91 unsigned long start, unsigned long end) 163 unsigned long start, unsigned long end)
92{ 164{
@@ -103,7 +175,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
103 pte_t *pte = pte_offset_kernel(pmd, addr); 175 pte_t *pte = pte_offset_kernel(pmd, addr);
104 if (pte_none(*pte)) { 176 if (pte_none(*pte)) {
105 pte_t entry; 177 pte_t entry;
106 void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); 178 void *p = alloc_block_buf(PAGE_SIZE, node);
107 if (!p) 179 if (!p)
108 return NULL; 180 return NULL;
109 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); 181 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
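
The new vmem_altmap path above lets the struct-page backing for a hot-added range be carved out of a reserved portion of the device itself instead of regular memory. The interesting piece is vmem_altmap_alloc(): each request is bumped up to the next boundary of the largest power of two dividing its size, and the padding pfns are charged to ->align so the free-space check stays honest. Below is a minimal userspace model of just that arithmetic; the struct and helper names are local stand-ins for illustration, not the kernel API.

#include <limits.h>
#include <stdio.h>

struct altmap_model {
	unsigned long base_pfn;	/* first pfn of the device reservation */
	unsigned long reserve;	/* pfns set aside, never handed out */
	unsigned long free;	/* pfns available for vmemmap backing */
	unsigned long align;	/* pfns lost to alignment so far */
	unsigned long alloc;	/* pfns handed out so far */
};

static unsigned long model_nr_free(const struct altmap_model *m)
{
	unsigned long allocated = m->alloc + m->align;

	return m->free > allocated ? m->free - allocated : 0;
}

static unsigned long model_alloc(struct altmap_model *m, unsigned long nr_pfns)
{
	unsigned long pfn = m->base_pfn + m->reserve + m->alloc + m->align;
	unsigned long align = nr_pfns & -nr_pfns;	/* largest power-of-two divisor of the request */
	unsigned long nr_align = ((pfn + align - 1) & ~(align - 1)) - pfn;

	if (nr_pfns + nr_align > model_nr_free(m))
		return ULONG_MAX;	/* pool exhausted, mirrors the kernel's failure value */
	m->alloc += nr_pfns;
	m->align += nr_align;
	return pfn + nr_align;
}

int main(void)
{
	struct altmap_model m = { .base_pfn = 0x100001, .free = 2048 };

	printf("first:  %#lx\n", model_alloc(&m, 512));	/* rounded up to a 512-pfn boundary */
	printf("second: %#lx\n", model_alloc(&m, 8));	/* follows immediately, already 8-pfn aligned */
	return 0;
}
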
diff --git a/mm/sparse.c b/mm/sparse.c
index d1b48b691ac8..3717ceed4177 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -748,7 +748,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
748 if (!memmap) 748 if (!memmap)
749 return; 749 return;
750 750
751 for (i = 0; i < PAGES_PER_SECTION; i++) { 751 for (i = 0; i < nr_pages; i++) {
752 if (PageHWPoison(&memmap[i])) { 752 if (PageHWPoison(&memmap[i])) {
753 atomic_long_sub(1, &num_poisoned_pages); 753 atomic_long_sub(1, &num_poisoned_pages);
754 ClearPageHWPoison(&memmap[i]); 754 ClearPageHWPoison(&memmap[i]);
@@ -788,7 +788,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
788 free_map_bootmem(memmap); 788 free_map_bootmem(memmap);
789} 789}
790 790
791void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 791void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
792 unsigned long map_offset)
792{ 793{
793 struct page *memmap = NULL; 794 struct page *memmap = NULL;
794 unsigned long *usemap = NULL, flags; 795 unsigned long *usemap = NULL, flags;
@@ -804,7 +805,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
804 } 805 }
805 pgdat_resize_unlock(pgdat, &flags); 806 pgdat_resize_unlock(pgdat, &flags);
806 807
807 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); 808 clear_hwpoisoned_pages(memmap + map_offset,
809 PAGES_PER_SECTION - map_offset);
808 free_section_usemap(memmap, usemap); 810 free_section_usemap(memmap, usemap);
809} 811}
810#endif /* CONFIG_MEMORY_HOTREMOVE */ 812#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/swap.c b/mm/swap.c
index 39395fb549c0..09fe5e97714a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -24,6 +24,7 @@
24#include <linux/export.h> 24#include <linux/export.h>
25#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
26#include <linux/percpu_counter.h> 26#include <linux/percpu_counter.h>
27#include <linux/memremap.h>
27#include <linux/percpu.h> 28#include <linux/percpu.h>
28#include <linux/cpu.h> 29#include <linux/cpu.h>
29#include <linux/notifier.h> 30#include <linux/notifier.h>
@@ -45,6 +46,7 @@ int page_cluster;
45static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); 46static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
46static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 47static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
47static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); 48static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
49static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
48 50
49/* 51/*
50 * This path almost never happens for VM activity - pages are normally 52 * This path almost never happens for VM activity - pages are normally
@@ -89,260 +91,14 @@ static void __put_compound_page(struct page *page)
89 (*dtor)(page); 91 (*dtor)(page);
90} 92}
91 93
92/** 94void __put_page(struct page *page)
93 * Two special cases here: we could avoid taking compound_lock_irqsave
 94 * and could skip the tail refcounting (in _mapcount).
95 *
96 * 1. Hugetlbfs page:
97 *
98 * PageHeadHuge will remain true until the compound page
99 * is released and enters the buddy allocator, and it could
100 * not be split by __split_huge_page_refcount().
101 *
102 * So if we see PageHeadHuge set, and we have the tail page pin,
103 * then we could safely put head page.
104 *
105 * 2. Slab THP page:
106 *
107 * PG_slab is cleared before the slab frees the head page, and
108 * tail pin cannot be the last reference left on the head page,
109 * because the slab code is free to reuse the compound page
110 * after a kfree/kmem_cache_free without having to check if
 111 * there's any tail pin left. In turn all tail pins must always be
 112 * released while the head is still pinned by the slab code
 113 * and so we know PG_slab will still be set too.
114 *
115 * So if we see PageSlab set, and we have the tail page pin,
116 * then we could safely put head page.
117 */
118static __always_inline
119void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
120{
121 /*
122 * If @page is a THP tail, we must read the tail page
123 * flags after the head page flags. The
124 * __split_huge_page_refcount side enforces write memory barriers
125 * between clearing PageTail and before the head page
126 * can be freed and reallocated.
127 */
128 smp_rmb();
129 if (likely(PageTail(page))) {
130 /*
131 * __split_huge_page_refcount cannot race
132 * here, see the comment above this function.
133 */
134 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
135 if (put_page_testzero(page_head)) {
136 /*
137 * If this is the tail of a slab THP page,
138 * the tail pin must not be the last reference
139 * held on the page, because the PG_slab cannot
140 * be cleared before all tail pins (which skips
141 * the _mapcount tail refcounting) have been
142 * released.
143 *
144 * If this is the tail of a hugetlbfs page,
145 * the tail pin may be the last reference on
146 * the page instead, because PageHeadHuge will
147 * not go away until the compound page enters
148 * the buddy allocator.
149 */
150 VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
151 __put_compound_page(page_head);
152 }
153 } else
154 /*
155 * __split_huge_page_refcount run before us,
156 * @page was a THP tail. The split @page_head
157 * has been freed and reallocated as slab or
158 * hugetlbfs page of smaller order (only
159 * possible if reallocated as slab on x86).
160 */
161 if (put_page_testzero(page))
162 __put_single_page(page);
163}
164
165static __always_inline
166void put_refcounted_compound_page(struct page *page_head, struct page *page)
167{
168 if (likely(page != page_head && get_page_unless_zero(page_head))) {
169 unsigned long flags;
170
171 /*
172 * @page_head wasn't a dangling pointer but it may not
173 * be a head page anymore by the time we obtain the
174 * lock. That is ok as long as it can't be freed from
175 * under us.
176 */
177 flags = compound_lock_irqsave(page_head);
178 if (unlikely(!PageTail(page))) {
179 /* __split_huge_page_refcount run before us */
180 compound_unlock_irqrestore(page_head, flags);
181 if (put_page_testzero(page_head)) {
182 /*
183 * The @page_head may have been freed
184 * and reallocated as a compound page
185 * of smaller order and then freed
186 * again. All we know is that it
187 * cannot have become: a THP page, a
188 * compound page of higher order, a
189 * tail page. That is because we
190 * still hold the refcount of the
191 * split THP tail and page_head was
192 * the THP head before the split.
193 */
194 if (PageHead(page_head))
195 __put_compound_page(page_head);
196 else
197 __put_single_page(page_head);
198 }
199out_put_single:
200 if (put_page_testzero(page))
201 __put_single_page(page);
202 return;
203 }
204 VM_BUG_ON_PAGE(page_head != compound_head(page), page);
205 /*
206 * We can release the refcount taken by
207 * get_page_unless_zero() now that
208 * __split_huge_page_refcount() is blocked on the
209 * compound_lock.
210 */
211 if (put_page_testzero(page_head))
212 VM_BUG_ON_PAGE(1, page_head);
213 /* __split_huge_page_refcount will wait now */
214 VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
215 atomic_dec(&page->_mapcount);
216 VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
217 VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
218 compound_unlock_irqrestore(page_head, flags);
219
220 if (put_page_testzero(page_head)) {
221 if (PageHead(page_head))
222 __put_compound_page(page_head);
223 else
224 __put_single_page(page_head);
225 }
226 } else {
227 /* @page_head is a dangling pointer */
228 VM_BUG_ON_PAGE(PageTail(page), page);
229 goto out_put_single;
230 }
231}
232
233static void put_compound_page(struct page *page)
234{
235 struct page *page_head;
236
237 /*
238 * We see the PageCompound set and PageTail not set, so @page maybe:
239 * 1. hugetlbfs head page, or
240 * 2. THP head page.
241 */
242 if (likely(!PageTail(page))) {
243 if (put_page_testzero(page)) {
244 /*
245 * By the time all refcounts have been released
246 * split_huge_page cannot run anymore from under us.
247 */
248 if (PageHead(page))
249 __put_compound_page(page);
250 else
251 __put_single_page(page);
252 }
253 return;
254 }
255
256 /*
257 * We see the PageCompound set and PageTail set, so @page maybe:
258 * 1. a tail hugetlbfs page, or
259 * 2. a tail THP page, or
260 * 3. a split THP page.
261 *
262 * Case 3 is possible, as we may race with
263 * __split_huge_page_refcount tearing down a THP page.
264 */
265 page_head = compound_head(page);
266 if (!__compound_tail_refcounted(page_head))
267 put_unrefcounted_compound_page(page_head, page);
268 else
269 put_refcounted_compound_page(page_head, page);
270}
271
272void put_page(struct page *page)
273{ 95{
274 if (unlikely(PageCompound(page))) 96 if (unlikely(PageCompound(page)))
275 put_compound_page(page); 97 __put_compound_page(page);
276 else if (put_page_testzero(page)) 98 else
277 __put_single_page(page); 99 __put_single_page(page);
278} 100}
279EXPORT_SYMBOL(put_page); 101EXPORT_SYMBOL(__put_page);
280
281/*
282 * This function is exported but must not be called by anything other
283 * than get_page(). It implements the slow path of get_page().
284 */
285bool __get_page_tail(struct page *page)
286{
287 /*
288 * This takes care of get_page() if run on a tail page
289 * returned by one of the get_user_pages/follow_page variants.
290 * get_user_pages/follow_page itself doesn't need the compound
291 * lock because it runs __get_page_tail_foll() under the
292 * proper PT lock that already serializes against
293 * split_huge_page().
294 */
295 unsigned long flags;
296 bool got;
297 struct page *page_head = compound_head(page);
298
299 /* Ref to put_compound_page() comment. */
300 if (!__compound_tail_refcounted(page_head)) {
301 smp_rmb();
302 if (likely(PageTail(page))) {
303 /*
304 * This is a hugetlbfs page or a slab
305 * page. __split_huge_page_refcount
306 * cannot race here.
307 */
308 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
309 __get_page_tail_foll(page, true);
310 return true;
311 } else {
312 /*
313 * __split_huge_page_refcount run
314 * before us, "page" was a THP
315 * tail. The split page_head has been
316 * freed and reallocated as slab or
317 * hugetlbfs page of smaller order
318 * (only possible if reallocated as
319 * slab on x86).
320 */
321 return false;
322 }
323 }
324
325 got = false;
326 if (likely(page != page_head && get_page_unless_zero(page_head))) {
327 /*
328 * page_head wasn't a dangling pointer but it
329 * may not be a head page anymore by the time
330 * we obtain the lock. That is ok as long as it
331 * can't be freed from under us.
332 */
333 flags = compound_lock_irqsave(page_head);
334 /* here __split_huge_page_refcount won't run anymore */
335 if (likely(PageTail(page))) {
336 __get_page_tail_foll(page, false);
337 got = true;
338 }
339 compound_unlock_irqrestore(page_head, flags);
340 if (unlikely(!got))
341 put_page(page_head);
342 }
343 return got;
344}
345EXPORT_SYMBOL(__get_page_tail);
346 102
347/** 103/**
348 * put_pages_list() - release a list of pages 104 * put_pages_list() - release a list of pages
@@ -604,6 +360,7 @@ static void __lru_cache_activate_page(struct page *page)
604 */ 360 */
605void mark_page_accessed(struct page *page) 361void mark_page_accessed(struct page *page)
606{ 362{
363 page = compound_head(page);
607 if (!PageActive(page) && !PageUnevictable(page) && 364 if (!PageActive(page) && !PageUnevictable(page) &&
608 PageReferenced(page)) { 365 PageReferenced(page)) {
609 366
@@ -799,6 +556,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
799 update_page_reclaim_stat(lruvec, file, 0); 556 update_page_reclaim_stat(lruvec, file, 0);
800} 557}
801 558
559
560static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
561 void *arg)
562{
563 if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
564 int file = page_is_file_cache(page);
565 int lru = page_lru_base_type(page);
566
567 del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
568 ClearPageActive(page);
569 ClearPageReferenced(page);
570 add_page_to_lru_list(page, lruvec, lru);
571
572 __count_vm_event(PGDEACTIVATE);
573 update_page_reclaim_stat(lruvec, file, 0);
574 }
575}
576
802/* 577/*
803 * Drain pages out of the cpu's pagevecs. 578 * Drain pages out of the cpu's pagevecs.
804 * Either "cpu" is the current CPU, and preemption has already been 579 * Either "cpu" is the current CPU, and preemption has already been
@@ -825,6 +600,10 @@ void lru_add_drain_cpu(int cpu)
825 if (pagevec_count(pvec)) 600 if (pagevec_count(pvec))
826 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); 601 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
827 602
603 pvec = &per_cpu(lru_deactivate_pvecs, cpu);
604 if (pagevec_count(pvec))
605 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
606
828 activate_page_drain(cpu); 607 activate_page_drain(cpu);
829} 608}
830 609
@@ -854,6 +633,26 @@ void deactivate_file_page(struct page *page)
854 } 633 }
855} 634}
856 635
636/**
637 * deactivate_page - deactivate a page
638 * @page: page to deactivate
639 *
640 * deactivate_page() moves @page to the inactive list if @page was on the active
641 * list and was not an unevictable page. This is done to accelerate the reclaim
642 * of @page.
643 */
644void deactivate_page(struct page *page)
645{
646 if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
647 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
648
649 page_cache_get(page);
650 if (!pagevec_add(pvec, page))
651 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
652 put_cpu_var(lru_deactivate_pvecs);
653 }
654}
655
857void lru_add_drain(void) 656void lru_add_drain(void)
858{ 657{
859 lru_add_drain_cpu(get_cpu()); 658 lru_add_drain_cpu(get_cpu());
@@ -883,6 +682,7 @@ void lru_add_drain_all(void)
883 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || 682 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
884 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || 683 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
885 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || 684 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
685 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
886 need_activate_page_drain(cpu)) { 686 need_activate_page_drain(cpu)) {
887 INIT_WORK(work, lru_add_drain_per_cpu); 687 INIT_WORK(work, lru_add_drain_per_cpu);
888 schedule_work_on(cpu, work); 688 schedule_work_on(cpu, work);
@@ -918,15 +718,6 @@ void release_pages(struct page **pages, int nr, bool cold)
918 for (i = 0; i < nr; i++) { 718 for (i = 0; i < nr; i++) {
919 struct page *page = pages[i]; 719 struct page *page = pages[i];
920 720
921 if (unlikely(PageCompound(page))) {
922 if (zone) {
923 spin_unlock_irqrestore(&zone->lru_lock, flags);
924 zone = NULL;
925 }
926 put_compound_page(page);
927 continue;
928 }
929
930 /* 721 /*
931 * Make sure the IRQ-safe lock-holding time does not get 722 * Make sure the IRQ-safe lock-holding time does not get
932 * excessive with a continuous string of pages from the 723 * excessive with a continuous string of pages from the
@@ -937,9 +728,19 @@ void release_pages(struct page **pages, int nr, bool cold)
937 zone = NULL; 728 zone = NULL;
938 } 729 }
939 730
731 page = compound_head(page);
940 if (!put_page_testzero(page)) 732 if (!put_page_testzero(page))
941 continue; 733 continue;
942 734
735 if (PageCompound(page)) {
736 if (zone) {
737 spin_unlock_irqrestore(&zone->lru_lock, flags);
738 zone = NULL;
739 }
740 __put_compound_page(page);
741 continue;
742 }
743
943 if (PageLRU(page)) { 744 if (PageLRU(page)) {
944 struct zone *pagezone = page_zone(page); 745 struct zone *pagezone = page_zone(page);
945 746
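
Most of the mm/swap.c diff above is deletion: with compound pages now refcounted only through their head, the tail-pin special cases (put_unrefcounted_compound_page(), __get_page_tail() and friends) disappear and the release path collapses to "find the head, drop one reference, then dispatch on the final put". The toy below models only that shape in plain C; toy_page and its helpers are invented for the illustration and are not the kernel structures.

#include <stdbool.h>
#include <stdio.h>

struct toy_page {
	int refcount;
	bool compound;		/* head of a compound page? */
	struct toy_page *head;	/* points to self for non-tail pages */
};

static void toy_free_single(struct toy_page *p)   { printf("free single %p\n", (void *)p); }
static void toy_free_compound(struct toy_page *p) { printf("free compound %p\n", (void *)p); }

static void toy_put_page(struct toy_page *page)
{
	struct toy_page *head = page->head;	/* compound_head() */

	if (--head->refcount)			/* put_page_testzero() */
		return;
	if (head->compound)			/* PageCompound() -> __put_compound_page() */
		toy_free_compound(head);
	else
		toy_free_single(head);
}

int main(void)
{
	struct toy_page head = { .refcount = 2, .compound = true };
	struct toy_page tail = { .head = &head };

	head.head = &head;
	toy_put_page(&tail);	/* drops the head's reference, nothing freed yet */
	toy_put_page(&head);	/* last reference gone, compound destructor runs */
	return 0;
}
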
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d504adb7fa5f..676ff2991380 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -185,13 +185,12 @@ int add_to_swap(struct page *page, struct list_head *list)
185 * deadlock in the swap out path. 185 * deadlock in the swap out path.
186 */ 186 */
187 /* 187 /*
188 * Add it to the swap cache and mark it dirty 188 * Add it to the swap cache.
189 */ 189 */
190 err = add_to_swap_cache(page, entry, 190 err = add_to_swap_cache(page, entry,
191 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); 191 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
192 192
193 if (!err) { /* Success */ 193 if (!err) {
194 SetPageDirty(page);
195 return 1; 194 return 1;
196 } else { /* -ENOMEM radix-tree allocation failure */ 195 } else { /* -ENOMEM radix-tree allocation failure */
197 /* 196 /*
@@ -353,7 +352,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
353 } 352 }
354 353
355 /* May fail (-ENOMEM) if radix-tree node allocation failed. */ 354 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
356 __set_page_locked(new_page); 355 __SetPageLocked(new_page);
357 SetPageSwapBacked(new_page); 356 SetPageSwapBacked(new_page);
358 err = __add_to_swap_cache(new_page, entry); 357 err = __add_to_swap_cache(new_page, entry);
359 if (likely(!err)) { 358 if (likely(!err)) {
@@ -367,7 +366,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
367 } 366 }
368 radix_tree_preload_end(); 367 radix_tree_preload_end();
369 ClearPageSwapBacked(new_page); 368 ClearPageSwapBacked(new_page);
370 __clear_page_locked(new_page); 369 __ClearPageLocked(new_page);
371 /* 370 /*
372 * add_to_swap_cache() doesn't return -EEXIST, so we can safely 371 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
373 * clear SWAP_HAS_CACHE flag. 372 * clear SWAP_HAS_CACHE flag.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e6b8591a3ed2..2bb30aa3a412 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -926,6 +926,9 @@ int reuse_swap_page(struct page *page)
926 VM_BUG_ON_PAGE(!PageLocked(page), page); 926 VM_BUG_ON_PAGE(!PageLocked(page), page);
927 if (unlikely(PageKsm(page))) 927 if (unlikely(PageKsm(page)))
928 return 0; 928 return 0;
929 /* The page is part of THP and cannot be reused */
930 if (PageTransCompound(page))
931 return 0;
929 count = page_mapcount(page); 932 count = page_mapcount(page);
930 if (count <= 1 && PageSwapCache(page)) { 933 if (count <= 1 && PageSwapCache(page)) {
931 count += page_swapcount(page); 934 count += page_swapcount(page);
@@ -1108,19 +1111,9 @@ unsigned int count_swap_pages(int type, int free)
1108} 1111}
1109#endif /* CONFIG_HIBERNATION */ 1112#endif /* CONFIG_HIBERNATION */
1110 1113
1111static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) 1114static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
1112{ 1115{
1113#ifdef CONFIG_MEM_SOFT_DIRTY 1116 return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
1114 /*
1115 * When pte keeps soft dirty bit the pte generated
1116 * from swap entry does not has it, still it's same
1117 * pte from logical point of view.
1118 */
1119 pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte);
1120 return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty);
1121#else
1122 return pte_same(pte, swp_pte);
1123#endif
1124} 1117}
1125 1118
1126/* 1119/*
@@ -1142,14 +1135,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1142 if (unlikely(!page)) 1135 if (unlikely(!page))
1143 return -ENOMEM; 1136 return -ENOMEM;
1144 1137
1145 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { 1138 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
1139 &memcg, false)) {
1146 ret = -ENOMEM; 1140 ret = -ENOMEM;
1147 goto out_nolock; 1141 goto out_nolock;
1148 } 1142 }
1149 1143
1150 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 1144 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1151 if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { 1145 if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
1152 mem_cgroup_cancel_charge(page, memcg); 1146 mem_cgroup_cancel_charge(page, memcg, false);
1153 ret = 0; 1147 ret = 0;
1154 goto out; 1148 goto out;
1155 } 1149 }
@@ -1160,11 +1154,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1160 set_pte_at(vma->vm_mm, addr, pte, 1154 set_pte_at(vma->vm_mm, addr, pte,
1161 pte_mkold(mk_pte(page, vma->vm_page_prot))); 1155 pte_mkold(mk_pte(page, vma->vm_page_prot)));
1162 if (page == swapcache) { 1156 if (page == swapcache) {
1163 page_add_anon_rmap(page, vma, addr); 1157 page_add_anon_rmap(page, vma, addr, false);
1164 mem_cgroup_commit_charge(page, memcg, true); 1158 mem_cgroup_commit_charge(page, memcg, true, false);
1165 } else { /* ksm created a completely new copy */ 1159 } else { /* ksm created a completely new copy */
1166 page_add_new_anon_rmap(page, vma, addr); 1160 page_add_new_anon_rmap(page, vma, addr, false);
1167 mem_cgroup_commit_charge(page, memcg, false); 1161 mem_cgroup_commit_charge(page, memcg, false, false);
1168 lru_cache_add_active_or_unevictable(page, vma); 1162 lru_cache_add_active_or_unevictable(page, vma);
1169 } 1163 }
1170 swap_free(entry); 1164 swap_free(entry);
@@ -1206,7 +1200,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1206 * swapoff spends a _lot_ of time in this loop! 1200 * swapoff spends a _lot_ of time in this loop!
1207 * Test inline before going to call unuse_pte. 1201 * Test inline before going to call unuse_pte.
1208 */ 1202 */
1209 if (unlikely(maybe_same_pte(*pte, swp_pte))) { 1203 if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
1210 pte_unmap(pte); 1204 pte_unmap(pte);
1211 ret = unuse_pte(vma, pmd, addr, entry, page); 1205 ret = unuse_pte(vma, pmd, addr, entry, page);
1212 if (ret) 1206 if (ret)
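
In mm/swapfile.c the CONFIG_MEM_SOFT_DIRTY #ifdef is folded into a single comparison: strip the soft-dirty bit from the pte that is actually in the page table, then compare it against the swap pte we expect. A standalone sketch of that idea follows, with a made-up flag bit standing in for the architecture-specific one.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_SWP_SOFT_DIRTY (1ULL << 55)	/* hypothetical soft-dirty bit for the example */

static bool pte_same_as_swp_model(uint64_t pte, uint64_t swp_pte)
{
	/* Clear the soft-dirty bit before comparing, instead of testing
	 * the expected value both with and without it. */
	return (pte & ~FAKE_SWP_SOFT_DIRTY) == swp_pte;
}

int main(void)
{
	uint64_t expected = 0x1234000ULL;

	printf("%d\n", pte_same_as_swp_model(expected, expected));			/* 1 */
	printf("%d\n", pte_same_as_swp_model(expected | FAKE_SWP_SOFT_DIRTY, expected));	/* 1 */
	printf("%d\n", pte_same_as_swp_model(expected | 1, expected));			/* 0 */
	return 0;
}
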
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 77fee9325a57..806b0c758c5b 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
63 __SetPageUptodate(page); 63 __SetPageUptodate(page);
64 64
65 ret = -ENOMEM; 65 ret = -ENOMEM;
66 if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) 66 if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
67 goto out_release; 67 goto out_release;
68 68
69 _dst_pte = mk_pte(page, dst_vma->vm_page_prot); 69 _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
@@ -76,8 +76,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
76 goto out_release_uncharge_unlock; 76 goto out_release_uncharge_unlock;
77 77
78 inc_mm_counter(dst_mm, MM_ANONPAGES); 78 inc_mm_counter(dst_mm, MM_ANONPAGES);
79 page_add_new_anon_rmap(page, dst_vma, dst_addr); 79 page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
80 mem_cgroup_commit_charge(page, memcg, false); 80 mem_cgroup_commit_charge(page, memcg, false, false);
81 lru_cache_add_active_or_unevictable(page, dst_vma); 81 lru_cache_add_active_or_unevictable(page, dst_vma);
82 82
83 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 83 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -91,7 +91,7 @@ out:
91 return ret; 91 return ret;
92out_release_uncharge_unlock: 92out_release_uncharge_unlock:
93 pte_unmap_unlock(dst_pte, ptl); 93 pte_unmap_unlock(dst_pte, ptl);
94 mem_cgroup_cancel_charge(page, memcg); 94 mem_cgroup_cancel_charge(page, memcg, false);
95out_release: 95out_release:
96 page_cache_release(page); 96 page_cache_release(page);
97 goto out; 97 goto out;
diff --git a/mm/util.c b/mm/util.c
index 2d28f7930043..6d1f9200f74e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -386,7 +386,9 @@ struct anon_vma *page_anon_vma(struct page *page)
386 386
387struct address_space *page_mapping(struct page *page) 387struct address_space *page_mapping(struct page *page)
388{ 388{
389 unsigned long mapping; 389 struct address_space *mapping;
390
391 page = compound_head(page);
390 392
391 /* This happens if someone calls flush_dcache_page on slab page */ 393 /* This happens if someone calls flush_dcache_page on slab page */
392 if (unlikely(PageSlab(page))) 394 if (unlikely(PageSlab(page)))
@@ -399,11 +401,25 @@ struct address_space *page_mapping(struct page *page)
399 return swap_address_space(entry); 401 return swap_address_space(entry);
400 } 402 }
401 403
402 mapping = (unsigned long)page->mapping; 404 mapping = page->mapping;
403 if (mapping & PAGE_MAPPING_FLAGS) 405 if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
404 return NULL; 406 return NULL;
405 return page->mapping; 407 return mapping;
408}
409
410/* Slow path of page_mapcount() for compound pages */
411int __page_mapcount(struct page *page)
412{
413 int ret;
414
415 ret = atomic_read(&page->_mapcount) + 1;
416 page = compound_head(page);
417 ret += atomic_read(compound_mapcount_ptr(page)) + 1;
418 if (PageDoubleMap(page))
419 ret--;
420 return ret;
406} 421}
422EXPORT_SYMBOL_GPL(__page_mapcount);
407 423
408int overcommit_ratio_handler(struct ctl_table *table, int write, 424int overcommit_ratio_handler(struct ctl_table *table, int write,
409 void __user *buffer, size_t *lenp, 425 void __user *buffer, size_t *lenp,
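
The new __page_mapcount() in mm/util.c is the slow path for compound pages under the reworked THP refcounting: a subpage's mapcount is its own _mapcount plus the head's compound mapcount, minus one when PageDoubleMap is set so a mapping reflected in both counters is not counted twice. A small model of that bookkeeping; the names and sample values below are local to the example.

#include <stdbool.h>
#include <stdio.h>

struct mapcount_model {
	int tail_mapcount;	/* the subpage's own _mapcount, -1 based */
	int compound_mapcount;	/* compound_mapcount_ptr() value on the head, -1 based */
	bool double_map;	/* PageDoubleMap() on the head */
};

static int total_mapcount_model(const struct mapcount_model *m)
{
	int ret = m->tail_mapcount + 1;

	ret += m->compound_mapcount + 1;
	if (m->double_map)
		ret--;	/* one mapping is reflected in both counters */
	return ret;
}

int main(void)
{
	/* PMD-mapped only: subpage _mapcount still -1, one compound mapping. */
	struct mapcount_model pmd_only = { -1, 0, false };
	/* Subpage PTE-mapped once while the THP also keeps a PMD mapping;
	 * the overlap is recorded via double_map. */
	struct mapcount_model mixed = { 1, 0, true };

	printf("pmd only: %d\n", total_mapcount_model(&pmd_only));	/* 1 */
	printf("mixed:    %d\n", total_mapcount_model(&mixed));	/* 2 */
	return 0;
}
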
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 58ceeb107960..fb42a5bffe47 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -455,7 +455,7 @@ found:
455 free_vmap_cache = &va->rb_node; 455 free_vmap_cache = &va->rb_node;
456 spin_unlock(&vmap_area_lock); 456 spin_unlock(&vmap_area_lock);
457 457
458 BUG_ON(va->va_start & (align-1)); 458 BUG_ON(!IS_ALIGNED(va->va_start, align));
459 BUG_ON(va->va_start < vstart); 459 BUG_ON(va->va_start < vstart);
460 BUG_ON(va->va_end > vend); 460 BUG_ON(va->va_end > vend);
461 461
@@ -1086,7 +1086,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
1086 BUG_ON(!addr); 1086 BUG_ON(!addr);
1087 BUG_ON(addr < VMALLOC_START); 1087 BUG_ON(addr < VMALLOC_START);
1088 BUG_ON(addr > VMALLOC_END); 1088 BUG_ON(addr > VMALLOC_END);
1089 BUG_ON(addr & (PAGE_SIZE-1)); 1089 BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE));
1090 1090
1091 debug_check_no_locks_freed(mem, size); 1091 debug_check_no_locks_freed(mem, size);
1092 vmap_debug_free_range(addr, addr+size); 1092 vmap_debug_free_range(addr, addr+size);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 108bd119f2f6..5ac86956ff9d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -906,6 +906,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
906 int may_enter_fs; 906 int may_enter_fs;
907 enum page_references references = PAGEREF_RECLAIM_CLEAN; 907 enum page_references references = PAGEREF_RECLAIM_CLEAN;
908 bool dirty, writeback; 908 bool dirty, writeback;
909 bool lazyfree = false;
910 int ret = SWAP_SUCCESS;
909 911
910 cond_resched(); 912 cond_resched();
911 913
@@ -1049,6 +1051,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1049 goto keep_locked; 1051 goto keep_locked;
1050 if (!add_to_swap(page, page_list)) 1052 if (!add_to_swap(page, page_list))
1051 goto activate_locked; 1053 goto activate_locked;
1054 lazyfree = true;
1052 may_enter_fs = 1; 1055 may_enter_fs = 1;
1053 1056
1054 /* Adding to swap updated mapping */ 1057 /* Adding to swap updated mapping */
@@ -1060,14 +1063,17 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1060 * processes. Try to unmap it here. 1063 * processes. Try to unmap it here.
1061 */ 1064 */
1062 if (page_mapped(page) && mapping) { 1065 if (page_mapped(page) && mapping) {
1063 switch (try_to_unmap(page, 1066 switch (ret = try_to_unmap(page, lazyfree ?
1064 ttu_flags|TTU_BATCH_FLUSH)) { 1067 (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
1068 (ttu_flags | TTU_BATCH_FLUSH))) {
1065 case SWAP_FAIL: 1069 case SWAP_FAIL:
1066 goto activate_locked; 1070 goto activate_locked;
1067 case SWAP_AGAIN: 1071 case SWAP_AGAIN:
1068 goto keep_locked; 1072 goto keep_locked;
1069 case SWAP_MLOCK: 1073 case SWAP_MLOCK:
1070 goto cull_mlocked; 1074 goto cull_mlocked;
1075 case SWAP_LZFREE:
1076 goto lazyfree;
1071 case SWAP_SUCCESS: 1077 case SWAP_SUCCESS:
1072 ; /* try to free the page below */ 1078 ; /* try to free the page below */
1073 } 1079 }
@@ -1174,6 +1180,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1174 } 1180 }
1175 } 1181 }
1176 1182
1183lazyfree:
1177 if (!mapping || !__remove_mapping(mapping, page, true)) 1184 if (!mapping || !__remove_mapping(mapping, page, true))
1178 goto keep_locked; 1185 goto keep_locked;
1179 1186
@@ -1184,8 +1191,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1184 * we obviously don't have to worry about waking up a process 1191 * we obviously don't have to worry about waking up a process
1185 * waiting on the page lock, because there are no references. 1192 * waiting on the page lock, because there are no references.
1186 */ 1193 */
1187 __clear_page_locked(page); 1194 __ClearPageLocked(page);
1188free_it: 1195free_it:
1196 if (ret == SWAP_LZFREE)
1197 count_vm_event(PGLAZYFREED);
1198
1189 nr_reclaimed++; 1199 nr_reclaimed++;
1190 1200
1191 /* 1201 /*
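
The mm/vmscan.c hunks above wire MADV_FREE into reclaim: anonymous pages that were just added to swap are unmapped with the TTU_LZFREE hint, and when try_to_unmap() reports SWAP_LZFREE (the page was never dirtied after MADV_FREE) reclaim skips pageout, jumps to the new lazyfree: label and bumps the new PGLAZYFREED counter. The sketch below only models the dispatch shape of that switch; the enum and helper names are local to the example.

#include <stdio.h>

enum unmap_result { MODEL_SWAP_SUCCESS, MODEL_SWAP_AGAIN, MODEL_SWAP_FAIL,
		    MODEL_SWAP_MLOCK, MODEL_SWAP_LZFREE };

static const char *reclaim_one(enum unmap_result r)
{
	switch (r) {
	case MODEL_SWAP_FAIL:
		return "activate";			/* goto activate_locked */
	case MODEL_SWAP_AGAIN:
	case MODEL_SWAP_MLOCK:
		return "keep for now";			/* retry later or cull mlocked */
	case MODEL_SWAP_LZFREE:
		return "free, count PGLAZYFREED";	/* goto lazyfree, no pageout */
	case MODEL_SWAP_SUCCESS:
	default:
		return "try pageout, then free";
	}
}

int main(void)
{
	printf("%s\n", reclaim_one(MODEL_SWAP_LZFREE));
	printf("%s\n", reclaim_one(MODEL_SWAP_SUCCESS));
	return 0;
}
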
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 83a003bc3cae..64bd0aa13f75 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -783,6 +783,7 @@ const char * const vmstat_text[] = {
783 783
784 "pgfault", 784 "pgfault",
785 "pgmajfault", 785 "pgmajfault",
786 "pglazyfreed",
786 787
787 TEXTS_FOR_ZONES("pgrefill") 788 TEXTS_FOR_ZONES("pgrefill")
788 TEXTS_FOR_ZONES("pgsteal_kswapd") 789 TEXTS_FOR_ZONES("pgsteal_kswapd")
@@ -844,7 +845,9 @@ const char * const vmstat_text[] = {
844 "thp_fault_fallback", 845 "thp_fault_fallback",
845 "thp_collapse_alloc", 846 "thp_collapse_alloc",
846 "thp_collapse_alloc_failed", 847 "thp_collapse_alloc_failed",
847 "thp_split", 848 "thp_split_page",
849 "thp_split_page_failed",
850 "thp_split_pmd",
848 "thp_zero_page_alloc", 851 "thp_zero_page_alloc",
849 "thp_zero_page_alloc_failed", 852 "thp_zero_page_alloc_failed",
850#endif 853#endif
diff --git a/scripts/tags.sh b/scripts/tags.sh
index 262889046703..76f131ebc192 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -193,7 +193,6 @@ exuberant()
193 --regex-c++='/CLEARPAGEFLAG_NOOP\(([^,)]*).*/ClearPage\1/' \ 193 --regex-c++='/CLEARPAGEFLAG_NOOP\(([^,)]*).*/ClearPage\1/' \
194 --regex-c++='/__CLEARPAGEFLAG_NOOP\(([^,)]*).*/__ClearPage\1/' \ 194 --regex-c++='/__CLEARPAGEFLAG_NOOP\(([^,)]*).*/__ClearPage\1/' \
195 --regex-c++='/TESTCLEARFLAG_FALSE\(([^,)]*).*/TestClearPage\1/' \ 195 --regex-c++='/TESTCLEARFLAG_FALSE\(([^,)]*).*/TestClearPage\1/' \
196 --regex-c++='/__TESTCLEARFLAG_FALSE\(([^,)]*).*/__TestClearPage\1/' \
197 --regex-c++='/_PE\(([^,)]*).*/PEVENT_ERRNO__\1/' \ 196 --regex-c++='/_PE\(([^,)]*).*/PEVENT_ERRNO__\1/' \
198 --regex-c++='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \ 197 --regex-c++='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \
199 --regex-c++='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \ 198 --regex-c++='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \
@@ -260,7 +259,6 @@ emacs()
260 --regex='/CLEARPAGEFLAG_NOOP(\([^,)]*\).*/ClearPage\1/' \ 259 --regex='/CLEARPAGEFLAG_NOOP(\([^,)]*\).*/ClearPage\1/' \
261 --regex='/__CLEARPAGEFLAG_NOOP(\([^,)]*\).*/__ClearPage\1/' \ 260 --regex='/__CLEARPAGEFLAG_NOOP(\([^,)]*\).*/__ClearPage\1/' \
262 --regex='/TESTCLEARFLAG_FALSE(\([^,)]*\).*/TestClearPage\1/' \ 261 --regex='/TESTCLEARFLAG_FALSE(\([^,)]*\).*/TestClearPage\1/' \
263 --regex='/__TESTCLEARFLAG_FALSE(\([^,)]*\).*/__TestClearPage\1/' \
264 --regex='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \ 262 --regex='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \
265 --regex='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \ 263 --regex='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \
266 --regex='/TASK_PFA_CLEAR\([^,]*,\s*([^)]*)\)/task_clear_\1/' \ 264 --regex='/TASK_PFA_CLEAR\([^,]*,\s*([^)]*)\)/task_clear_\1/' \
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 314c7774652e..a11cfd20a6a0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -111,7 +111,7 @@ static void hardware_disable_all(void);
111 111
112static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 112static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
113 113
114static void kvm_release_pfn_dirty(pfn_t pfn); 114static void kvm_release_pfn_dirty(kvm_pfn_t pfn);
115static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); 115static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
116 116
117__visible bool kvm_rebooting; 117__visible bool kvm_rebooting;
@@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
119 119
120static bool largepages_enabled = true; 120static bool largepages_enabled = true;
121 121
122bool kvm_is_reserved_pfn(pfn_t pfn) 122bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
123{ 123{
124 if (pfn_valid(pfn)) 124 if (pfn_valid(pfn))
125 return PageReserved(pfn_to_page(pfn)); 125 return PageReserved(pfn_to_page(pfn));
@@ -1289,7 +1289,7 @@ static inline int check_user_page_hwpoison(unsigned long addr)
1289 * true indicates success, otherwise false is returned. 1289 * true indicates success, otherwise false is returned.
1290 */ 1290 */
1291static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, 1291static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
1292 bool write_fault, bool *writable, pfn_t *pfn) 1292 bool write_fault, bool *writable, kvm_pfn_t *pfn)
1293{ 1293{
1294 struct page *page[1]; 1294 struct page *page[1];
1295 int npages; 1295 int npages;
@@ -1322,7 +1322,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
1322 * 1 indicates success, -errno is returned if error is detected. 1322 * 1 indicates success, -errno is returned if error is detected.
1323 */ 1323 */
1324static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 1324static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1325 bool *writable, pfn_t *pfn) 1325 bool *writable, kvm_pfn_t *pfn)
1326{ 1326{
1327 struct page *page[1]; 1327 struct page *page[1];
1328 int npages = 0; 1328 int npages = 0;
@@ -1386,11 +1386,11 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
1386 * 2): @write_fault = false && @writable, @writable will tell the caller 1386 * 2): @write_fault = false && @writable, @writable will tell the caller
1387 * whether the mapping is writable. 1387 * whether the mapping is writable.
1388 */ 1388 */
1389static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, 1389static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1390 bool write_fault, bool *writable) 1390 bool write_fault, bool *writable)
1391{ 1391{
1392 struct vm_area_struct *vma; 1392 struct vm_area_struct *vma;
1393 pfn_t pfn = 0; 1393 kvm_pfn_t pfn = 0;
1394 int npages; 1394 int npages;
1395 1395
1396 /* we can do it either atomically or asynchronously, not both */ 1396 /* we can do it either atomically or asynchronously, not both */
@@ -1431,8 +1431,9 @@ exit:
1431 return pfn; 1431 return pfn;
1432} 1432}
1433 1433
1434pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, 1434kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
1435 bool *async, bool write_fault, bool *writable) 1435 bool atomic, bool *async, bool write_fault,
1436 bool *writable)
1436{ 1437{
1437 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 1438 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
1438 1439
@@ -1453,7 +1454,7 @@ pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
1453} 1454}
1454EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 1455EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
1455 1456
1456pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1457kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1457 bool *writable) 1458 bool *writable)
1458{ 1459{
1459 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 1460 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
@@ -1461,37 +1462,37 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1461} 1462}
1462EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1463EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1463 1464
1464pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1465kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
1465{ 1466{
1466 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1467 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
1467} 1468}
1468EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 1469EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
1469 1470
1470pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1471kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
1471{ 1472{
1472 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1473 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
1473} 1474}
1474EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1475EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
1475 1476
1476pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1477kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1477{ 1478{
1478 return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); 1479 return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
1479} 1480}
1480EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1481EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1481 1482
1482pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 1483kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
1483{ 1484{
1484 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1485 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
1485} 1486}
1486EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 1487EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
1487 1488
1488pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1489kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1489{ 1490{
1490 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 1491 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
1491} 1492}
1492EXPORT_SYMBOL_GPL(gfn_to_pfn); 1493EXPORT_SYMBOL_GPL(gfn_to_pfn);
1493 1494
1494pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 1495kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
1495{ 1496{
1496 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1497 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
1497} 1498}
@@ -1514,7 +1515,7 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
1514} 1515}
1515EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1516EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1516 1517
1517static struct page *kvm_pfn_to_page(pfn_t pfn) 1518static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
1518{ 1519{
1519 if (is_error_noslot_pfn(pfn)) 1520 if (is_error_noslot_pfn(pfn))
1520 return KVM_ERR_PTR_BAD_PAGE; 1521 return KVM_ERR_PTR_BAD_PAGE;
@@ -1529,7 +1530,7 @@ static struct page *kvm_pfn_to_page(pfn_t pfn)
1529 1530
1530struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1531struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1531{ 1532{
1532 pfn_t pfn; 1533 kvm_pfn_t pfn;
1533 1534
1534 pfn = gfn_to_pfn(kvm, gfn); 1535 pfn = gfn_to_pfn(kvm, gfn);
1535 1536
@@ -1539,7 +1540,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page);
1539 1540
1540struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 1541struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
1541{ 1542{
1542 pfn_t pfn; 1543 kvm_pfn_t pfn;
1543 1544
1544 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 1545 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
1545 1546
@@ -1555,7 +1556,7 @@ void kvm_release_page_clean(struct page *page)
1555} 1556}
1556EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1557EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1557 1558
1558void kvm_release_pfn_clean(pfn_t pfn) 1559void kvm_release_pfn_clean(kvm_pfn_t pfn)
1559{ 1560{
1560 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 1561 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
1561 put_page(pfn_to_page(pfn)); 1562 put_page(pfn_to_page(pfn));
@@ -1570,13 +1571,13 @@ void kvm_release_page_dirty(struct page *page)
1570} 1571}
1571EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1572EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
1572 1573
1573static void kvm_release_pfn_dirty(pfn_t pfn) 1574static void kvm_release_pfn_dirty(kvm_pfn_t pfn)
1574{ 1575{
1575 kvm_set_pfn_dirty(pfn); 1576 kvm_set_pfn_dirty(pfn);
1576 kvm_release_pfn_clean(pfn); 1577 kvm_release_pfn_clean(pfn);
1577} 1578}
1578 1579
1579void kvm_set_pfn_dirty(pfn_t pfn) 1580void kvm_set_pfn_dirty(kvm_pfn_t pfn)
1580{ 1581{
1581 if (!kvm_is_reserved_pfn(pfn)) { 1582 if (!kvm_is_reserved_pfn(pfn)) {
1582 struct page *page = pfn_to_page(pfn); 1583 struct page *page = pfn_to_page(pfn);
@@ -1587,14 +1588,14 @@ void kvm_set_pfn_dirty(pfn_t pfn)
1587} 1588}
1588EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1589EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
1589 1590
1590void kvm_set_pfn_accessed(pfn_t pfn) 1591void kvm_set_pfn_accessed(kvm_pfn_t pfn)
1591{ 1592{
1592 if (!kvm_is_reserved_pfn(pfn)) 1593 if (!kvm_is_reserved_pfn(pfn))
1593 mark_page_accessed(pfn_to_page(pfn)); 1594 mark_page_accessed(pfn_to_page(pfn));
1594} 1595}
1595EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1596EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
1596 1597
1597void kvm_get_pfn(pfn_t pfn) 1598void kvm_get_pfn(kvm_pfn_t pfn)
1598{ 1599{
1599 if (!kvm_is_reserved_pfn(pfn)) 1600 if (!kvm_is_reserved_pfn(pfn))
1600 get_page(pfn_to_page(pfn)); 1601 get_page(pfn_to_page(pfn));