author    Linus Torvalds <torvalds@linux-foundation.org>  2016-01-15 14:41:44 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-01-15 14:41:44 -0500
commit    875fc4f5ddf35605581f9a5900c14afef48611f2 (patch)
tree      e237a28a71a5d1e72eaf0ecda737eb5c8614c72c
parent    7d1fc01afc5af35e5197e0e75abe900f6bd279b8 (diff)
parent    7dfa4612204b511c934ca2a0e4f306f9981bd9aa (diff)
Merge branch 'akpm' (patches from Andrew)
Merge first patch-bomb from Andrew Morton:

 - A few hotfixes which missed 4.4 because I was asleep.  cc'ed to -stable

 - A few misc fixes

 - OCFS2 updates

 - Part of MM.  Including pretty large changes to page-flags handling
   and to thp management which have been buffered up for 2-3 cycles now.

   I have a lot of MM material this time.

[ It turns out the THP part wasn't quite ready, so that got dropped from
  this series  - Linus ]

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (117 commits)
  zsmalloc: reorganize struct size_class to pack 4 bytes hole
  mm/zbud.c: use list_last_entry() instead of list_tail_entry()
  zram/zcomp: do not zero out zcomp private pages
  zram: pass gfp from zcomp frontend to backend
  zram: try vmalloc() after kmalloc()
  zram/zcomp: use GFP_NOIO to allocate streams
  mm: add tracepoint for scanning pages
  drivers/base/memory.c: fix kernel warning during memory hotplug on ppc64
  mm/page_isolation: use macro to judge the alignment
  mm: fix noisy sparse warning in LIBCFS_ALLOC_PRE()
  mm: rework virtual memory accounting
  include/linux/memblock.h: fix ordering of 'flags' argument in comments
  mm: move lru_to_page to mm_inline.h
  Documentation/filesystems: describe the shared memory usage/accounting
  memory-hotplug: don't BUG() in register_memory_resource()
  hugetlb: make mm and fs code explicitly non-modular
  mm/swapfile.c: use list_for_each_entry_safe in free_swap_count_continuations
  mm: /proc/pid/clear_refs: no need to clear VM_SOFTDIRTY in clear_soft_dirty_pmd()
  mm: make sure isolate_lru_page() is never called for tail page
  vmstat: make vmstat_updater deferrable again and shut down on idle
  ...
-rw-r--r--  Documentation/filesystems/proc.txt | 23
-rw-r--r--  Documentation/filesystems/tmpfs.txt | 8
-rw-r--r--  Documentation/kernel-parameters.txt | 4
-rw-r--r--  Documentation/sysctl/vm.txt | 29
-rw-r--r--  arch/Kconfig | 68
-rw-r--r--  arch/arm/Kconfig | 9
-rw-r--r--  arch/arm/mm/mmap.c | 3
-rw-r--r--  arch/arm64/Kconfig | 29
-rw-r--r--  arch/arm64/mm/mmap.c | 8
-rw-r--r--  arch/ia64/kernel/perfmon.c | 3
-rw-r--r--  arch/m32r/kernel/setup.c | 3
-rw-r--r--  arch/powerpc/platforms/cell/spufs/inode.c | 2
-rw-r--r--  arch/s390/mm/pgtable.c | 5
-rw-r--r--  arch/x86/Kconfig | 16
-rw-r--r--  arch/x86/mm/mmap.c | 12
-rw-r--r--  drivers/base/memory.c | 22
-rw-r--r--  drivers/block/zram/zcomp.c | 24
-rw-r--r--  drivers/block/zram/zcomp.h | 2
-rw-r--r--  drivers/block/zram/zcomp_lz4.c | 15
-rw-r--r--  drivers/block/zram/zcomp_lzo.c | 15
-rw-r--r--  drivers/staging/lustre/lustre/llite/super25.c | 3
-rw-r--r--  fs/9p/v9fs.c | 2
-rw-r--r--  fs/adfs/super.c | 2
-rw-r--r--  fs/affs/super.c | 2
-rw-r--r--  fs/afs/super.c | 2
-rw-r--r--  fs/befs/linuxvfs.c | 2
-rw-r--r--  fs/bfs/inode.c | 2
-rw-r--r--  fs/block_dev.c | 4
-rw-r--r--  fs/btrfs/inode.c | 3
-rw-r--r--  fs/ceph/super.c | 4
-rw-r--r--  fs/cifs/cifsfs.c | 2
-rw-r--r--  fs/coda/inode.c | 6
-rw-r--r--  fs/dcache.c | 5
-rw-r--r--  fs/ecryptfs/main.c | 6
-rw-r--r--  fs/efs/super.c | 6
-rw-r--r--  fs/exofs/super.c | 4
-rw-r--r--  fs/ext2/super.c | 2
-rw-r--r--  fs/ext4/super.c | 2
-rw-r--r--  fs/f2fs/super.c | 5
-rw-r--r--  fs/fat/inode.c | 2
-rw-r--r--  fs/file.c | 7
-rw-r--r--  fs/fuse/inode.c | 4
-rw-r--r--  fs/gfs2/main.c | 3
-rw-r--r--  fs/hfs/super.c | 4
-rw-r--r--  fs/hfsplus/super.c | 2
-rw-r--r--  fs/hostfs/hostfs_kern.c | 2
-rw-r--r--  fs/hpfs/super.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 31
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/isofs/inode.c | 2
-rw-r--r--  fs/jffs2/super.c | 2
-rw-r--r--  fs/jfs/super.c | 2
-rw-r--r--  fs/kernfs/dir.c | 9
-rw-r--r--  fs/logfs/Kconfig | 2
-rw-r--r--  fs/logfs/inode.c | 3
-rw-r--r--  fs/minix/inode.c | 2
-rw-r--r--  fs/ncpfs/inode.c | 2
-rw-r--r--  fs/nfs/inode.c | 2
-rw-r--r--  fs/nilfs2/super.c | 3
-rw-r--r--  fs/notify/inode_mark.c | 3
-rw-r--r--  fs/notify/mark.c | 66
-rw-r--r--  fs/ntfs/super.c | 4
-rw-r--r--  fs/ocfs2/alloc.c | 15
-rw-r--r--  fs/ocfs2/alloc.h | 2
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 11
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 35
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 15
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 2
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 2
-rw-r--r--  fs/ocfs2/dlmglue.c | 8
-rw-r--r--  fs/ocfs2/file.c | 8
-rw-r--r--  fs/ocfs2/ioctl.c | 4
-rw-r--r--  fs/ocfs2/journal.c | 10
-rw-r--r--  fs/ocfs2/localalloc.c | 10
-rw-r--r--  fs/ocfs2/namei.c | 21
-rw-r--r--  fs/ocfs2/slot_map.c | 14
-rw-r--r--  fs/ocfs2/super.c | 13
-rw-r--r--  fs/openpromfs/inode.c | 2
-rw-r--r--  fs/proc/inode.c | 3
-rw-r--r--  fs/proc/meminfo.c | 5
-rw-r--r--  fs/proc/task_mmu.c | 78
-rw-r--r--  fs/qnx4/inode.c | 2
-rw-r--r--  fs/qnx6/inode.c | 2
-rw-r--r--  fs/reiserfs/super.c | 3
-rw-r--r--  fs/romfs/super.c | 4
-rw-r--r--  fs/squashfs/super.c | 3
-rw-r--r--  fs/sysv/inode.c | 2
-rw-r--r--  fs/ubifs/super.c | 4
-rw-r--r--  fs/udf/super.c | 3
-rw-r--r--  fs/ufs/super.c | 2
-rw-r--r--  fs/xfs/kmem.h | 1
-rw-r--r--  fs/xfs/xfs_super.c | 4
-rw-r--r--  include/asm-generic/memory_model.h | 4
-rw-r--r--  include/linux/dcache.h | 4
-rw-r--r--  include/linux/fsnotify_backend.h | 5
-rw-r--r--  include/linux/gfp.h | 22
-rw-r--r--  include/linux/hugetlb.h | 10
-rw-r--r--  include/linux/memblock.h | 13
-rw-r--r--  include/linux/memcontrol.h | 102
-rw-r--r--  include/linux/mempolicy.h | 2
-rw-r--r--  include/linux/mm.h | 46
-rw-r--r--  include/linux/mm_inline.h | 2
-rw-r--r--  include/linux/mm_types.h | 9
-rw-r--r--  include/linux/mmzone.h | 37
-rw-r--r--  include/linux/pfn.h | 1
-rw-r--r--  include/linux/shmem_fs.h | 4
-rw-r--r--  include/linux/slab.h | 5
-rw-r--r--  include/linux/swap.h | 1
-rw-r--r--  include/linux/thread_info.h | 5
-rw-r--r--  include/linux/vmalloc.h | 1
-rw-r--r--  include/linux/vmpressure.h | 7
-rw-r--r--  include/linux/vmstat.h | 2
-rw-r--r--  include/net/sock.h | 138
-rw-r--r--  include/net/tcp.h | 5
-rw-r--r--  include/net/tcp_memcontrol.h | 1
-rw-r--r--  include/trace/events/huge_memory.h | 136
-rw-r--r--  include/trace/events/page_isolation.h | 38
-rw-r--r--  include/trace/events/vmscan.h | 21
-rw-r--r--  ipc/mqueue.c | 2
-rw-r--r--  kernel/cred.c | 4
-rw-r--r--  kernel/delayacct.c | 2
-rw-r--r--  kernel/events/uprobes.c | 2
-rw-r--r--  kernel/fork.c | 27
-rw-r--r--  kernel/pid.c | 2
-rw-r--r--  kernel/sched/idle.c | 1
-rw-r--r--  kernel/sysctl.c | 22
-rw-r--r--  lib/dma-debug.c | 2
-rw-r--r--  mm/backing-dev.c | 2
-rw-r--r--  mm/compaction.c | 18
-rw-r--r--  mm/debug.c | 4
-rw-r--r--  mm/filemap.c | 9
-rw-r--r--  mm/huge_memory.c | 166
-rw-r--r--  mm/hugetlb.c | 39
-rw-r--r--  mm/kmemleak.c | 3
-rw-r--r--  mm/ksm.c | 20
-rw-r--r--  mm/memblock.c | 45
-rw-r--r--  mm/memcontrol.c | 310
-rw-r--r--  mm/memory.c | 47
-rw-r--r--  mm/memory_hotplug.c | 9
-rw-r--r--  mm/mempolicy.c | 30
-rw-r--r--  mm/mlock.c | 2
-rw-r--r--  mm/mmap.c | 81
-rw-r--r--  mm/mmzone.c | 8
-rw-r--r--  mm/mprotect.c | 8
-rw-r--r--  mm/mremap.c | 7
-rw-r--r--  mm/nommu.c | 2
-rw-r--r--  mm/oom_kill.c | 5
-rw-r--r--  mm/page-writeback.c | 14
-rw-r--r--  mm/page_alloc.c | 158
-rw-r--r--  mm/page_isolation.c | 22
-rw-r--r--  mm/pgtable-generic.c | 9
-rw-r--r--  mm/readahead.c | 9
-rw-r--r--  mm/rmap.c | 18
-rw-r--r--  mm/shmem.c | 83
-rw-r--r--  mm/slab.c | 48
-rw-r--r--  mm/slab.h | 5
-rw-r--r--  mm/slab_common.c | 3
-rw-r--r--  mm/slub.c | 2
-rw-r--r--  mm/swapfile.c | 23
-rw-r--r--  mm/vmalloc.c | 23
-rw-r--r--  mm/vmpressure.c | 78
-rw-r--r--  mm/vmscan.c | 40
-rw-r--r--  mm/vmstat.c | 69
-rw-r--r--  mm/zbud.c | 5
-rw-r--r--  mm/zsmalloc.c | 4
-rw-r--r--  net/core/sock.c | 78
-rw-r--r--  net/ipv4/tcp.c | 3
-rw-r--r--  net/ipv4/tcp_ipv4.c | 9
-rw-r--r--  net/ipv4/tcp_memcontrol.c | 91
-rw-r--r--  net/ipv4/tcp_output.c | 7
-rw-r--r--  net/ipv6/tcp_ipv6.c | 3
-rw-r--r--  net/socket.c | 2
-rw-r--r--  net/sunrpc/rpc_pipe.c | 2
-rwxr-xr-x  scripts/bloat-o-meter | 8
-rw-r--r--  scripts/mod/file2alias.c | 3
176 files changed, 1852 insertions, 1284 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 402ab99e409f..e95aa1c6eadf 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -169,6 +169,9 @@ read the file /proc/PID/status:
 VmLck:         0 kB
 VmHWM:       476 kB
 VmRSS:       476 kB
+RssAnon:     352 kB
+RssFile:     120 kB
+RssShmem:      4 kB
 VmData:      156 kB
 VmStk:        88 kB
 VmExe:        68 kB
@@ -231,14 +234,20 @@ Table 1-2: Contents of the status files (as of 4.1)
 VmSize                      total program size
 VmLck                       locked memory size
 VmHWM                       peak resident set size ("high water mark")
-VmRSS                       size of memory portions
+VmRSS                       size of memory portions. It contains the three
+                            following parts (VmRSS = RssAnon + RssFile + RssShmem)
+RssAnon                     size of resident anonymous memory
+RssFile                     size of resident file mappings
+RssShmem                    size of resident shmem memory (includes SysV shm,
+                            mapping of tmpfs and shared anonymous mappings)
 VmData                      size of data, stack, and text segments
 VmStk                       size of data, stack, and text segments
 VmExe                       size of text segment
 VmLib                       size of shared library code
 VmPTE                       size of page table entries
 VmPMD                       size of second level page tables
-VmSwap                      size of swap usage (the number of referred swapents)
+VmSwap                      amount of swap used by anonymous private data
+                            (shmem swap usage is not included)
 HugetlbPages                size of hugetlb memory portions
 Threads                     number of threads
 SigQ                        number of signals queued/max. number for queue
@@ -265,7 +274,8 @@ Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
 Field    Content
 size     total program size (pages)          (same as VmSize in status)
 resident size of memory portions (pages)     (same as VmRSS in status)
-shared   number of pages that are shared     (i.e. backed by a file)
+shared   number of pages that are shared     (i.e. backed by a file, same
+                                              as RssFile+RssShmem in status)
 trs      number of pages that are 'code'     (not including libs; broken,
                                               includes data segment)
 lrs      number of pages of library          (always 0 on 2.6)
@@ -459,7 +469,10 @@ and a page is modified, the file page is replaced by a private anonymous copy.
 hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical
 reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field.
 "Swap" shows how much would-be-anonymous memory is also used, but out on swap.
-"SwapPss" shows proportional swap share of this mapping.
+For shmem mappings, "Swap" includes also the size of the mapped (and not
+replaced by copy-on-write) part of the underlying shmem object out on swap.
+"SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this
+does not take into account swapped out page of underlying shmem objects.
 "Locked" indicates whether the mapping is locked in memory or not.
 
 "VmFlags" field deserves a separate description. This member represents the kernel
@@ -842,6 +855,7 @@ Dirty: 968 kB
 Writeback:           0 kB
 AnonPages:      861800 kB
 Mapped:         280372 kB
+Shmem:             644 kB
 Slab:           284364 kB
 SReclaimable:   159856 kB
 SUnreclaim:     124508 kB
@@ -898,6 +912,7 @@ MemAvailable: An estimate of how much memory is available for starting new
     AnonPages: Non-file backed pages mapped into userspace page tables
 AnonHugePages: Non-file backed huge pages mapped into userspace page tables
        Mapped: files which have been mmaped, such as libraries
+        Shmem: Total memory used by shared memory (shmem) and tmpfs
          Slab: in-kernel data structures cache
 SReclaimable: Part of Slab, that might be reclaimed, such as caches
   SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure
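The RssAnon/RssFile/RssShmem breakdown documented above can be read straight out of /proc/<pid>/status on a kernel that carries this series. A minimal userspace sketch (plain C, not part of the patch; it assumes the kernel exposes the new fields):

/* rss_breakdown.c - print the RSS breakdown described above.
 * Build: cc -o rss_breakdown rss_breakdown.c
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/self/status", "r");
	char line[256];

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* VmRSS should equal RssAnon + RssFile + RssShmem */
		if (!strncmp(line, "VmRSS:", 6) ||
		    !strncmp(line, "RssAnon:", 8) ||
		    !strncmp(line, "RssFile:", 8) ||
		    !strncmp(line, "RssShmem:", 9))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}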
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index 98ef55124158..d392e1505f17 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -17,10 +17,10 @@ RAM, where you have to create an ordinary filesystem on top. Ramdisks
 cannot swap and you do not have the possibility to resize them.
 
 Since tmpfs lives completely in the page cache and on swap, all tmpfs
-pages currently in memory will show up as cached. It will not show up
-as shared or something like that. Further on you can check the actual
-RAM+swap use of a tmpfs instance with df(1) and du(1).
-
+pages will be shown as "Shmem" in /proc/meminfo and "Shared" in
+free(1). Notice that these counters also include shared memory
+(shmem, see ipcs(1)). The most reliable way to get the count is
+using df(1) and du(1).
 
 tmpfs has the following uses:
 
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index b7d44871effc..168fd79dc697 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -608,6 +608,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			cut the overhead, others just disable the usage. So
 			only cgroup_disable=memory is actually worthy}
 
+	cgroup.memory=	[KNL] Pass options to the cgroup memory controller.
+			Format: <string>
+			nosocket -- Disable socket memory accounting.
+
 	checkreqprot	[SELINUX] Set initial checkreqprot flag value.
 			Format: { "0" | "1" }
 			See security/selinux/Kconfig help text.
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 8ee925c046aa..89a887c76629 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -42,6 +42,8 @@ Currently, these files are in /proc/sys/vm:
 - min_slab_ratio
 - min_unmapped_ratio
 - mmap_min_addr
+- mmap_rnd_bits
+- mmap_rnd_compat_bits
 - nr_hugepages
 - nr_overcommit_hugepages
 - nr_trim_pages         (only if CONFIG_MMU=n)
@@ -485,6 +487,33 @@ against future potential kernel bugs.
 
 ==============================================================
 
+mmap_rnd_bits:
+
+This value can be used to select the number of bits to use to
+determine the random offset to the base address of vma regions
+resulting from mmap allocations on architectures which support
+tuning address space randomization.  This value will be bounded
+by the architecture's minimum and maximum supported values.
+
+This value can be changed after boot using the
+/proc/sys/vm/mmap_rnd_bits tunable
+
+==============================================================
+
+mmap_rnd_compat_bits:
+
+This value can be used to select the number of bits to use to
+determine the random offset to the base address of vma regions
+resulting from mmap allocations for applications run in
+compatibility mode on architectures which support tuning address
+space randomization.  This value will be bounded by the
+architecture's minimum and maximum supported values.
+
+This value can be changed after boot using the
+/proc/sys/vm/mmap_rnd_compat_bits tunable
+
+==============================================================
+
 nr_hugepages
 
 Change the minimum size of the hugepage pool.
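The two sysctls documented above only bound how many low page-number bits get randomized; the offset the kernel adds is (random & ((1 << bits) - 1)) << PAGE_SHIFT. A small userspace sketch of that arithmetic (illustrative only; /proc/sys/vm/mmap_rnd_bits is present only on kernels with this series and an architecture selecting HAVE_ARCH_MMAP_RND_BITS):

/* mmap_rnd_span.c - show the span implied by the current mmap_rnd_bits. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/mmap_rnd_bits", "r");
	long page = sysconf(_SC_PAGESIZE);
	int bits = 0;

	if (!f || fscanf(f, "%d", &bits) != 1) {
		fprintf(stderr, "mmap_rnd_bits not available on this kernel\n");
		return 1;
	}
	fclose(f);
	printf("mmap_rnd_bits=%d -> mmap base offset up to %lu bytes\n",
	       bits, ((1UL << bits) - 1) * (unsigned long)page);
	return 0;
}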
diff --git a/arch/Kconfig b/arch/Kconfig
index 4e949e58b192..ba1b626bca00 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -511,6 +511,74 @@ config ARCH_HAS_ELF_RANDOMIZE
 	  - arch_mmap_rnd()
 	  - arch_randomize_brk()
 
+config HAVE_ARCH_MMAP_RND_BITS
+	bool
+	help
+	  An arch should select this symbol if it supports setting a variable
+	  number of bits for use in establishing the base address for mmap
+	  allocations, has MMU enabled and provides values for both:
+	  - ARCH_MMAP_RND_BITS_MIN
+	  - ARCH_MMAP_RND_BITS_MAX
+
+config ARCH_MMAP_RND_BITS_MIN
+	int
+
+config ARCH_MMAP_RND_BITS_MAX
+	int
+
+config ARCH_MMAP_RND_BITS_DEFAULT
+	int
+
+config ARCH_MMAP_RND_BITS
+	int "Number of bits to use for ASLR of mmap base address" if EXPERT
+	range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX
+	default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT
+	default ARCH_MMAP_RND_BITS_MIN
+	depends on HAVE_ARCH_MMAP_RND_BITS
+	help
+	  This value can be used to select the number of bits to use to
+	  determine the random offset to the base address of vma regions
+	  resulting from mmap allocations. This value will be bounded
+	  by the architecture's minimum and maximum supported values.
+
+	  This value can be changed after boot using the
+	  /proc/sys/vm/mmap_rnd_bits tunable
+
+config HAVE_ARCH_MMAP_RND_COMPAT_BITS
+	bool
+	help
+	  An arch should select this symbol if it supports running applications
+	  in compatibility mode, supports setting a variable number of bits for
+	  use in establishing the base address for mmap allocations, has MMU
+	  enabled and provides values for both:
+	  - ARCH_MMAP_RND_COMPAT_BITS_MIN
+	  - ARCH_MMAP_RND_COMPAT_BITS_MAX
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+	int
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+	int
+
+config ARCH_MMAP_RND_COMPAT_BITS_DEFAULT
+	int
+
+config ARCH_MMAP_RND_COMPAT_BITS
+	int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT
+	range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX
+	default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT
+	default ARCH_MMAP_RND_COMPAT_BITS_MIN
+	depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS
+	help
+	  This value can be used to select the number of bits to use to
+	  determine the random offset to the base address of vma regions
+	  resulting from mmap allocations for compatible applications This
+	  value will be bounded by the architecture's minimum and maximum
+	  supported values.
+
+	  This value can be changed after boot using the
+	  /proc/sys/vm/mmap_rnd_compat_bits tunable
+
 config HAVE_COPY_THREAD_TLS
 	bool
 	help
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 84b1b21b08ae..4e489cc5c45e 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -37,6 +37,7 @@ config ARM
 	select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
 	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
 	select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
+	select HAVE_ARCH_MMAP_RND_BITS if MMU
 	select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARM_SMCCC if CPU_V7
@@ -311,6 +312,14 @@ config MMU
 	  Select if you want MMU-based virtualised addressing space
 	  support by paged memory management. If unsure, say 'Y'.
 
+config ARCH_MMAP_RND_BITS_MIN
+	default 8
+
+config ARCH_MMAP_RND_BITS_MAX
+	default 14 if PAGE_OFFSET=0x40000000
+	default 15 if PAGE_OFFSET=0x80000000
+	default 16
+
 #
 # The "ARM system type" choice list is ordered alphabetically by option
 # text. Please add new entries in the option alphabetic order.
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 407dc786583a..4b4058db0781 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -173,8 +173,7 @@ unsigned long arch_mmap_rnd(void)
 {
 	unsigned long rnd;
 
-	/* 8 bits of randomness in 20 address space bits */
-	rnd = (unsigned long)get_random_int() % (1 << 8);
+	rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
 
 	return rnd << PAGE_SHIFT;
 }
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 4d5b416e2e4b..6be3fa2310ee 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -52,6 +52,8 @@ config ARM64
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
 	select HAVE_ARCH_KGDB
+	select HAVE_ARCH_MMAP_RND_BITS
+	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_BPF_JIT
@@ -107,6 +109,33 @@ config ARCH_PHYS_ADDR_T_64BIT
 config MMU
 	def_bool y
 
+config ARCH_MMAP_RND_BITS_MIN
+	default 14 if ARM64_64K_PAGES
+	default 16 if ARM64_16K_PAGES
+	default 18
+
+# max bits determined by the following formula:
+#  VA_BITS - PAGE_SHIFT - 3
+config ARCH_MMAP_RND_BITS_MAX
+	default 19 if ARM64_VA_BITS=36
+	default 24 if ARM64_VA_BITS=39
+	default 27 if ARM64_VA_BITS=42
+	default 30 if ARM64_VA_BITS=47
+	default 29 if ARM64_VA_BITS=48 && ARM64_64K_PAGES
+	default 31 if ARM64_VA_BITS=48 && ARM64_16K_PAGES
+	default 33 if ARM64_VA_BITS=48
+	default 14 if ARM64_64K_PAGES
+	default 16 if ARM64_16K_PAGES
+	default 18
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+	default 7 if ARM64_64K_PAGES
+	default 9 if ARM64_16K_PAGES
+	default 11
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+	default 16
+
 config NO_IOPORT_MAP
 	def_bool y if !PCI
 
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index ed177475dd8c..4c893b5189dd 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -51,8 +51,12 @@ unsigned long arch_mmap_rnd(void)
 {
 	unsigned long rnd;
 
-	rnd = (unsigned long)get_random_int() & STACK_RND_MASK;
-
+#ifdef CONFIG_COMPAT
+	if (test_thread_flag(TIF_32BIT))
+		rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1);
+	else
+#endif
+		rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
 	return rnd << PAGE_SHIFT;
 }
 
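One way to observe the effect of these per-architecture knobs is simply to watch where fresh mappings land across runs; with ASLR enabled the base moves, and the number of bits that vary is bounded by mmap_rnd_bits. A tiny probe (userspace C, behaviour depends on the running kernel's ASLR configuration):

/* mmap_probe.c - print the address of a fresh anonymous mapping.
 * Run it several times; with ASLR on, the address changes run to run. */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("mapping at %p\n", p);
	munmap(p, 4096);
	return 0;
}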
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 60e02f7747ff..9cd607b06964 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2332,8 +2332,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
 	 */
 	insert_vm_struct(mm, vma);
 
-	vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-							vma_pages(vma));
+	vm_stat_account(vma->vm_mm, vma->vm_flags, vma_pages(vma));
 	up_write(&task->mm->mmap_sem);
 
 	/*
diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c
index 0392112a5d70..a5ecef7188ba 100644
--- a/arch/m32r/kernel/setup.c
+++ b/arch/m32r/kernel/setup.c
@@ -81,7 +81,10 @@ static struct resource code_resource = {
 };
 
 unsigned long memory_start;
+EXPORT_SYMBOL(memory_start);
+
 unsigned long memory_end;
+EXPORT_SYMBOL(memory_end);
 
 void __init setup_arch(char **);
 int get_cpuinfo(char *);
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 11634fa7ab3c..ad4840f86be1 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -767,7 +767,7 @@ static int __init spufs_init(void)
 	ret = -ENOMEM;
 	spufs_inode_cache = kmem_cache_create("spufs_inode_cache",
 			sizeof(struct spufs_inode_info), 0,
-			SLAB_HWCACHE_ALIGN, spufs_init_once);
+			SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, spufs_init_once);
 
 	if (!spufs_inode_cache)
 		goto out;
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 63b039899a5e..aa34af0a0b26 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -603,10 +603,7 @@ static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
 	else if (is_migration_entry(entry)) {
 		struct page *page = migration_entry_to_page(entry);
 
-		if (PageAnon(page))
-			dec_mm_counter(mm, MM_ANONPAGES);
-		else
-			dec_mm_counter(mm, MM_FILEPAGES);
+		dec_mm_counter(mm, mm_counter(page));
 	}
 	free_swap_and_cache(entry);
 }
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5d2293417946..24f362bf3ec6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -83,6 +83,8 @@ config X86
 	select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_ARCH_MMAP_RND_BITS if MMU
+	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_SOFT_DIRTY if X86_64
 	select HAVE_ARCH_TRACEHOOK
@@ -184,6 +186,20 @@ config HAVE_LATENCYTOP_SUPPORT
 config MMU
 	def_bool y
 
+config ARCH_MMAP_RND_BITS_MIN
+	default 28 if 64BIT
+	default 8
+
+config ARCH_MMAP_RND_BITS_MAX
+	default 32 if 64BIT
+	default 16
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+	default 8
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+	default 16
+
 config SBUS
 	bool
 
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 844b06d67df4..96bd1e2bffaf 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -69,14 +69,14 @@ unsigned long arch_mmap_rnd(void)
 {
 	unsigned long rnd;
 
-	/*
-	 * 8 bits of randomness in 32bit mmaps, 20 address space bits
-	 * 28 bits of randomness in 64bit mmaps, 40 address space bits
-	 */
 	if (mmap_is_ia32())
-		rnd = (unsigned long)get_random_int() % (1<<8);
+#ifdef CONFIG_COMPAT
+		rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1);
+#else
+		rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
+#endif
 	else
-		rnd = (unsigned long)get_random_int() % (1<<28);
+		rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
 
 	return rnd << PAGE_SHIFT;
 }
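The x86 change above replaces the hard-coded "% (1 << 8)" and "% (1 << 28)" with a mask of the low mmap_rnd_bits bits. For a power-of-two range the two forms are equivalent; the mask form just generalizes to a width chosen at runtime. A standalone illustration of that equivalence (plain C; rand() stands in for get_random_int()):

/* rnd_mask_demo.c - modulo vs. mask for power-of-two ranges. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned int bits = 8;	/* the old fixed value for 32-bit mmaps */

	for (int i = 0; i < 5; i++) {
		unsigned long r = (unsigned long)rand();
		unsigned long by_mod  = r % (1UL << bits);
		unsigned long by_mask = r & ((1UL << bits) - 1);

		/* identical results: n % 2^k == n & (2^k - 1) */
		printf("r=%lu  mod=%lu  mask=%lu\n", r, by_mod, by_mask);
	}
	return 0;
}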
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 25425d3f2575..619fe584a44c 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -450,8 +450,7 @@ memory_probe_store(struct device *dev, struct device_attribute *attr,
 			  const char *buf, size_t count)
 {
 	u64 phys_addr;
-	int nid;
-	int i, ret;
+	int nid, ret;
 	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
 
 	ret = kstrtoull(buf, 0, &phys_addr);
@@ -461,15 +460,12 @@ memory_probe_store(struct device *dev, struct device_attribute *attr,
 	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
 		return -EINVAL;
 
-	for (i = 0; i < sections_per_block; i++) {
-		nid = memory_add_physaddr_to_nid(phys_addr);
-		ret = add_memory(nid, phys_addr,
-				 PAGES_PER_SECTION << PAGE_SHIFT);
-		if (ret)
-			goto out;
+	nid = memory_add_physaddr_to_nid(phys_addr);
+	ret = add_memory(nid, phys_addr,
+			 MIN_MEMORY_BLOCK_SIZE * sections_per_block);
 
-		phys_addr += MIN_MEMORY_BLOCK_SIZE;
-	}
+	if (ret)
+		goto out;
 
 	ret = count;
 out:
@@ -618,7 +614,6 @@ static int init_memory_block(struct memory_block **memory,
 			base_memory_block_id(scn_nr) * sections_per_block;
 	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
 	mem->state = state;
-	mem->section_count++;
 	start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	mem->phys_device = arch_get_memory_phys_device(start_pfn);
 
@@ -672,6 +667,7 @@ int register_new_memory(int nid, struct mem_section *section)
 		ret = init_memory_block(&mem, section, MEM_OFFLINE);
 		if (ret)
 			goto out;
+		mem->section_count++;
 	}
 
 	if (mem->section_count == sections_per_block)
@@ -692,7 +688,7 @@ unregister_memory(struct memory_block *memory)
 	device_unregister(&memory->dev);
 }
 
-static int remove_memory_block(unsigned long node_id,
+static int remove_memory_section(unsigned long node_id,
 			       struct mem_section *section, int phys_device)
 {
 	struct memory_block *mem;
@@ -716,7 +712,7 @@ int unregister_memory_section(struct mem_section *section)
 	if (!present_section(section))
 		return -EINVAL;
 
-	return remove_memory_block(0, section, 0);
+	return remove_memory_section(0, section, 0);
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index 5cb13ca3a3ac..3ef42e563bb5 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -74,18 +74,18 @@ static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm)
  * allocate new zcomp_strm structure with ->private initialized by
  * backend, return NULL on error
  */
-static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
+static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags)
 {
-	struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL);
+	struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), flags);
 	if (!zstrm)
 		return NULL;
 
-	zstrm->private = comp->backend->create();
+	zstrm->private = comp->backend->create(flags);
 	/*
 	 * allocate 2 pages. 1 for compressed data, plus 1 extra for the
 	 * case when compressed size is larger than the original one
 	 */
-	zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
+	zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1);
 	if (!zstrm->private || !zstrm->buffer) {
 		zcomp_strm_free(comp, zstrm);
 		zstrm = NULL;
@@ -120,8 +120,16 @@ static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp)
 	/* allocate new zstrm stream */
 	zs->avail_strm++;
 	spin_unlock(&zs->strm_lock);
-
-	zstrm = zcomp_strm_alloc(comp);
+	/*
+	 * This function can be called in swapout/fs write path
+	 * so we can't use GFP_FS|IO. And it assumes we already
+	 * have at least one stream in zram initialization so we
+	 * don't do best effort to allocate more stream in here.
+	 * A default stream will work well without further multiple
+	 * streams. That's why we use NORETRY | NOWARN.
+	 */
+	zstrm = zcomp_strm_alloc(comp, GFP_NOIO | __GFP_NORETRY |
+					__GFP_NOWARN);
 	if (!zstrm) {
 		spin_lock(&zs->strm_lock);
 		zs->avail_strm--;
@@ -209,7 +217,7 @@ static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm)
 	zs->max_strm = max_strm;
 	zs->avail_strm = 1;
 
-	zstrm = zcomp_strm_alloc(comp);
+	zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
 	if (!zstrm) {
 		kfree(zs);
 		return -ENOMEM;
@@ -259,7 +267,7 @@ static int zcomp_strm_single_create(struct zcomp *comp)
 
 	comp->stream = zs;
 	mutex_init(&zs->strm_lock);
-	zs->zstrm = zcomp_strm_alloc(comp);
+	zs->zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
 	if (!zs->zstrm) {
 		kfree(zs);
 		return -ENOMEM;
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
index 46e2b9f8f1f0..b7d2a4bcae54 100644
--- a/drivers/block/zram/zcomp.h
+++ b/drivers/block/zram/zcomp.h
@@ -33,7 +33,7 @@ struct zcomp_backend {
 	int (*decompress)(const unsigned char *src, size_t src_len,
 			unsigned char *dst);
 
-	void *(*create)(void);
+	void *(*create)(gfp_t flags);
 	void (*destroy)(void *private);
 
 	const char *name;
diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c
index f2afb7e988c3..0110086accba 100644
--- a/drivers/block/zram/zcomp_lz4.c
+++ b/drivers/block/zram/zcomp_lz4.c
@@ -10,17 +10,26 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/lz4.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
 
 #include "zcomp_lz4.h"
 
-static void *zcomp_lz4_create(void)
+static void *zcomp_lz4_create(gfp_t flags)
 {
-	return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
+	void *ret;
+
+	ret = kmalloc(LZ4_MEM_COMPRESS, flags);
+	if (!ret)
+		ret = __vmalloc(LZ4_MEM_COMPRESS,
+				flags | __GFP_HIGHMEM,
+				PAGE_KERNEL);
+	return ret;
 }
 
 static void zcomp_lz4_destroy(void *private)
 {
-	kfree(private);
+	kvfree(private);
 }
 
 static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst,
diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c
index da1bc47d588e..ed7a1f0549ec 100644
--- a/drivers/block/zram/zcomp_lzo.c
+++ b/drivers/block/zram/zcomp_lzo.c
@@ -10,17 +10,26 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/lzo.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
 
 #include "zcomp_lzo.h"
 
-static void *lzo_create(void)
+static void *lzo_create(gfp_t flags)
 {
-	return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+	void *ret;
+
+	ret = kmalloc(LZO1X_MEM_COMPRESS, flags);
+	if (!ret)
+		ret = __vmalloc(LZO1X_MEM_COMPRESS,
+				flags | __GFP_HIGHMEM,
+				PAGE_KERNEL);
+	return ret;
 }
 
 static void lzo_destroy(void *private)
 {
-	kfree(private);
+	kvfree(private);
 }
 
 static int lzo_compress(const unsigned char *src, unsigned char *dst,
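Both zcomp backends above now try kmalloc() first and, only if that fails, fall back to __vmalloc() with the caller-supplied gfp flags, so a fragmented system can still bring up a compression stream (the same idea the kernel later packaged as kvmalloc()). A rough userspace analogy of that try-the-cheap-path-then-fall-back pattern (illustrative only; malloc()/mmap() are not the kernel allocators):

/* fallback_alloc.c - sketch of a two-step allocation fallback. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

static void *fallback_alloc(size_t size)
{
	void *p = malloc(size);		/* stands in for kmalloc() */

	if (p)
		return p;
	/* stands in for __vmalloc(): ask the OS for pages directly */
	p = mmap(NULL, size, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	return p == MAP_FAILED ? NULL : p;
}

int main(void)
{
	void *buf = fallback_alloc(1 << 20);

	if (!buf)
		return 1;
	memset(buf, 0, 1 << 20);
	puts("allocated 1 MiB");
	/* real code must remember which path succeeded before freeing */
	return 0;
}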
diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c
index 7a9fafc67693..86c371ef71ea 100644
--- a/drivers/staging/lustre/lustre/llite/super25.c
+++ b/drivers/staging/lustre/lustre/llite/super25.c
@@ -106,7 +106,8 @@ static int __init init_lustre_lite(void)
 	rc = -ENOMEM;
 	ll_inode_cachep = kmem_cache_create("lustre_inode_cache",
 					    sizeof(struct ll_inode_info),
-					    0, SLAB_HWCACHE_ALIGN, NULL);
+					    0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
+					    NULL);
 	if (ll_inode_cachep == NULL)
 		goto out_cache;
 
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6caca025019d..072e7599583a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -575,7 +575,7 @@ static int v9fs_init_inode_cache(void)
 	v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
 					  sizeof(struct v9fs_inode),
 					  0, (SLAB_RECLAIM_ACCOUNT|
-					      SLAB_MEM_SPREAD),
+					      SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					  v9fs_inode_init_once);
 	if (!v9fs_inode_cache)
 		return -ENOMEM;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4d4a0df8344f..c9fdfb112933 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -271,7 +271,7 @@ static int __init init_inodecache(void)
 	adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
 					     sizeof(struct adfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (adfs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 8836df5f1e11..2a6713b6b9f4 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -132,7 +132,7 @@ static int __init init_inodecache(void)
 	affs_inode_cachep = kmem_cache_create("affs_inode_cache",
 					     sizeof(struct affs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (affs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 1fb4a5129f7d..81afefe7d8a6 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -91,7 +91,7 @@ int __init afs_fs_init(void)
 	afs_inode_cachep = kmem_cache_create("afs_inode_cache",
 					     sizeof(struct afs_vnode),
 					     0,
-					     SLAB_HWCACHE_ALIGN,
+					     SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
 					     afs_i_init_once);
 	if (!afs_inode_cachep) {
 		printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n");
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 25250fa87086..cc0e08252913 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -434,7 +434,7 @@ befs_init_inodecache(void)
 	befs_inode_cachep = kmem_cache_create("befs_inode_cache",
 					      sizeof (struct befs_inode_info),
 					      0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					      init_once);
 	if (befs_inode_cachep == NULL) {
 		pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index fdcb4d69f430..1e5c896f6b79 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -270,7 +270,7 @@ static int __init init_inodecache(void)
 	bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
 					     sizeof(struct bfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (bfs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index d878e4860fb7..81c0705558be 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -437,7 +437,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 
 	if (!ops->rw_page || bdev_get_integrity(bdev))
 		return -EOPNOTSUPP;
-	result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL);
+	result = blk_queue_enter(bdev->bd_queue, GFP_NOIO);
 	if (result)
 		return result;
 
@@ -595,7 +595,7 @@ void __init bdev_cache_init(void)
 
 	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
 			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-				SLAB_MEM_SPREAD|SLAB_PANIC),
+				SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
 			init_once);
 	err = register_filesystem(&bd_type);
 	if (err)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b8856e182ae..394017831692 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9161,7 +9161,8 @@ int btrfs_init_cachep(void)
 {
 	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
 			sizeof(struct btrfs_inode), 0,
-			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+			init_once);
 	if (!btrfs_inode_cachep)
 		goto fail;
 
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f446afada328..ca4d5e8457f1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -639,8 +639,8 @@ static int __init init_caches(void)
 	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
 				      sizeof(struct ceph_inode_info),
 				      __alignof__(struct ceph_inode_info),
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      ceph_inode_init_once);
+				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+				      SLAB_ACCOUNT, ceph_inode_init_once);
 	if (ceph_inode_cachep == NULL)
 		return -ENOMEM;
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b7fcb3151103..c4c1169814b2 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1092,7 +1092,7 @@ cifs_init_inodecache(void)
 	cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
 					      sizeof(struct cifsInodeInfo),
 					      0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					      cifs_init_once);
 	if (cifs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index cac1390b87a3..57e81cbba0fa 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -74,9 +74,9 @@ static void init_once(void *foo)
 int __init coda_init_inodecache(void)
 {
 	coda_inode_cachep = kmem_cache_create("coda_inode_cache",
-				sizeof(struct coda_inode_info),
-				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-				init_once);
+				sizeof(struct coda_inode_info), 0,
+				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+				SLAB_ACCOUNT, init_once);
 	if (coda_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/dcache.c b/fs/dcache.c
index 8d38cd07b207..b4539e84e577 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1571,7 +1571,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
 	if (name->len > DNAME_INLINE_LEN-1) {
 		size_t size = offsetof(struct external_name, name[1]);
-		struct external_name *p = kmalloc(size + name->len, GFP_KERNEL);
+		struct external_name *p = kmalloc(size + name->len,
+						  GFP_KERNEL_ACCOUNT);
 		if (!p) {
 			kmem_cache_free(dentry_cache, dentry);
 			return NULL;
@@ -3415,7 +3416,7 @@ static void __init dcache_init(void)
 	 * of the dcache.
 	 */
 	dentry_cache = KMEM_CACHE(dentry,
-		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
+		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT);
 
 	/* Hash may have been set up in dcache_init_early */
 	if (!hashdist)
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 4f4d0474bee9..e25b6b06bacf 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -663,6 +663,7 @@ static struct ecryptfs_cache_info {
 	struct kmem_cache **cache;
 	const char *name;
 	size_t size;
+	unsigned long flags;
 	void (*ctor)(void *obj);
 } ecryptfs_cache_infos[] = {
 	{
@@ -684,6 +685,7 @@ static struct ecryptfs_cache_info {
 		.cache = &ecryptfs_inode_info_cache,
 		.name = "ecryptfs_inode_cache",
 		.size = sizeof(struct ecryptfs_inode_info),
+		.flags = SLAB_ACCOUNT,
 		.ctor = inode_info_init_once,
 	},
 	{
@@ -755,8 +757,8 @@ static int ecryptfs_init_kmem_caches(void)
 		struct ecryptfs_cache_info *info;
 
 		info = &ecryptfs_cache_infos[i];
-		*(info->cache) = kmem_cache_create(info->name, info->size,
-				0, SLAB_HWCACHE_ALIGN, info->ctor);
+		*(info->cache) = kmem_cache_create(info->name, info->size, 0,
+				SLAB_HWCACHE_ALIGN | info->flags, info->ctor);
 		if (!*(info->cache)) {
 			ecryptfs_free_kmem_caches();
 			ecryptfs_printk(KERN_WARNING, "%s: "
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c8411a30f7da..cb68dac4f9d3 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -94,9 +94,9 @@ static void init_once(void *foo)
 static int __init init_inodecache(void)
 {
 	efs_inode_cachep = kmem_cache_create("efs_inode_cache",
-				sizeof(struct efs_inode_info),
-				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-				init_once);
+				sizeof(struct efs_inode_info), 0,
+				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+				SLAB_ACCOUNT, init_once);
 	if (efs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index b795c567b5e1..6658a50530a0 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -194,8 +194,8 @@ static int init_inodecache(void)
 {
 	exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
 				sizeof(struct exofs_i_info), 0,
-				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
-				exofs_init_once);
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+				SLAB_ACCOUNT, exofs_init_once);
 	if (exofs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 748d35afc902..2a188413a2b0 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -203,7 +203,7 @@ static int __init init_inodecache(void)
 	ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
 					     sizeof(struct ext2_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ext2_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c9ab67da6e5a..f1b56ff01208 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -966,7 +966,7 @@ static int __init init_inodecache(void)
 	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
 					     sizeof(struct ext4_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ext4_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3bf990b80026..6134832baaaf 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1541,8 +1541,9 @@ MODULE_ALIAS_FS("f2fs");
 
 static int __init init_inodecache(void)
 {
-	f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
-			sizeof(struct f2fs_inode_info));
+	f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache",
+			sizeof(struct f2fs_inode_info), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
 	if (!f2fs_inode_cachep)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 509411dd3698..6aece96df19f 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -677,7 +677,7 @@ static int __init fat_init_inodecache(void)
 	fat_inode_cachep = kmem_cache_create("fat_inode_cache",
 					     sizeof(struct msdos_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (fat_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/file.c b/fs/file.c
index 1aed0add16a2..1fbc5c0555a9 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -37,11 +37,12 @@ static void *alloc_fdmem(size_t size)
  * vmalloc() if the allocation size will be considered "large" by the VM.
 	 */
 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
+		void *data = kmalloc(size, GFP_KERNEL_ACCOUNT |
+				     __GFP_NOWARN | __GFP_NORETRY);
 		if (data != NULL)
 			return data;
 	}
-	return vmalloc(size);
+	return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
 }
 
 static void __free_fdtable(struct fdtable *fdt)
@@ -126,7 +127,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
 	if (unlikely(nr > sysctl_nr_open))
 		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
 
-	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
+	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
 	if (!fdt)
 		goto out;
 	fdt->max_fds = nr;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2913db2a5b99..4d69d5c0bedc 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1255,8 +1255,8 @@ static int __init fuse_fs_init(void)
 	int err;
 
 	fuse_inode_cachep = kmem_cache_create("fuse_inode",
-					      sizeof(struct fuse_inode),
-					      0, SLAB_HWCACHE_ALIGN,
+					      sizeof(struct fuse_inode), 0,
+					      SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
 					      fuse_inode_init_once);
 	err = -ENOMEM;
 	if (!fuse_inode_cachep)
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 1d709d496364..f99f8e94de3f 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -114,7 +114,8 @@ static int __init init_gfs2_fs(void)
 	gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
 					      sizeof(struct gfs2_inode),
 					      0, SLAB_RECLAIM_ACCOUNT|
-					          SLAB_MEM_SPREAD,
+						  SLAB_MEM_SPREAD|
+						  SLAB_ACCOUNT,
 					      gfs2_init_inode_once);
 	if (!gfs2_inode_cachep)
 		goto fail;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4574fdd3d421..1ca95c232bb5 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -483,8 +483,8 @@ static int __init init_hfs_fs(void)
 	int err;
 
 	hfs_inode_cachep = kmem_cache_create("hfs_inode_cache",
-		sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN,
-		hfs_init_once);
+		sizeof(struct hfs_inode_info), 0,
+		SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once);
 	if (!hfs_inode_cachep)
 		return -ENOMEM;
 	err = register_filesystem(&hfs_fs_type);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 7302d96ae8bf..5d54490a136d 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -663,7 +663,7 @@ static int __init init_hfsplus_fs(void)
663 int err; 663 int err;
664 664
665 hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache", 665 hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache",
666 HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN, 666 HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
667 hfsplus_init_once); 667 hfsplus_init_once);
668 if (!hfsplus_inode_cachep) 668 if (!hfsplus_inode_cachep)
669 return -ENOMEM; 669 return -ENOMEM;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index f49be23e78aa..cfaa18c7a337 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -223,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
223{ 223{
224 struct hostfs_inode_info *hi; 224 struct hostfs_inode_info *hi;
225 225
226 hi = kmalloc(sizeof(*hi), GFP_KERNEL); 226 hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT);
227 if (hi == NULL) 227 if (hi == NULL)
228 return NULL; 228 return NULL;
229 hi->fd = -1; 229 hi->fd = -1;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a561591896bd..458cf463047b 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -261,7 +261,7 @@ static int init_inodecache(void)
261 hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", 261 hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache",
262 sizeof(struct hpfs_inode_info), 262 sizeof(struct hpfs_inode_info),
263 0, (SLAB_RECLAIM_ACCOUNT| 263 0, (SLAB_RECLAIM_ACCOUNT|
264 SLAB_MEM_SPREAD), 264 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
265 init_once); 265 init_once);
266 if (hpfs_inode_cachep == NULL) 266 if (hpfs_inode_cachep == NULL)
267 return -ENOMEM; 267 return -ENOMEM;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d8f51ee8126b..47789292a582 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -4,11 +4,11 @@
4 * Nadia Yvette Chambers, 2002 4 * Nadia Yvette Chambers, 2002
5 * 5 *
6 * Copyright (C) 2002 Linus Torvalds. 6 * Copyright (C) 2002 Linus Torvalds.
7 * License: GPL
7 */ 8 */
8 9
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10 11
11#include <linux/module.h>
12#include <linux/thread_info.h> 12#include <linux/thread_info.h>
13#include <asm/current.h> 13#include <asm/current.h>
14#include <linux/sched.h> /* remove ASAP */ 14#include <linux/sched.h> /* remove ASAP */
@@ -738,7 +738,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
738 /* 738 /*
739 * The policy is initialized here even if we are creating a 739 * The policy is initialized here even if we are creating a
740 * private inode because initialization simply creates an 740 * private inode because initialization simply creates an
741 * an empty rb tree and calls spin_lock_init(), later when we 741 * an empty rb tree and calls rwlock_init(), later when we
742 * call mpol_free_shared_policy() it will just return because 742 * call mpol_free_shared_policy() it will just return because
743 * the rb tree will still be empty. 743 * the rb tree will still be empty.
744 */ 744 */
@@ -1202,7 +1202,6 @@ static struct file_system_type hugetlbfs_fs_type = {
1202 .mount = hugetlbfs_mount, 1202 .mount = hugetlbfs_mount,
1203 .kill_sb = kill_litter_super, 1203 .kill_sb = kill_litter_super,
1204}; 1204};
1205MODULE_ALIAS_FS("hugetlbfs");
1206 1205
1207static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; 1206static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
1208 1207
@@ -1322,7 +1321,7 @@ static int __init init_hugetlbfs_fs(void)
1322 error = -ENOMEM; 1321 error = -ENOMEM;
1323 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", 1322 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
1324 sizeof(struct hugetlbfs_inode_info), 1323 sizeof(struct hugetlbfs_inode_info),
1325 0, 0, init_once); 1324 0, SLAB_ACCOUNT, init_once);
1326 if (hugetlbfs_inode_cachep == NULL) 1325 if (hugetlbfs_inode_cachep == NULL)
1327 goto out2; 1326 goto out2;
1328 1327
@@ -1356,26 +1355,4 @@ static int __init init_hugetlbfs_fs(void)
1356 out2: 1355 out2:
1357 return error; 1356 return error;
1358} 1357}
1359 1358fs_initcall(init_hugetlbfs_fs)
1360static void __exit exit_hugetlbfs_fs(void)
1361{
1362 struct hstate *h;
1363 int i;
1364
1365
1366 /*
1367 * Make sure all delayed rcu free inodes are flushed before we
1368 * destroy cache.
1369 */
1370 rcu_barrier();
1371 kmem_cache_destroy(hugetlbfs_inode_cachep);
1372 i = 0;
1373 for_each_hstate(h)
1374 kern_unmount(hugetlbfs_vfsmount[i++]);
1375 unregister_filesystem(&hugetlbfs_fs_type);
1376}
1377
1378module_init(init_hugetlbfs_fs)
1379module_exit(exit_hugetlbfs_fs)
1380
1381MODULE_LICENSE("GPL");
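
The hugetlbfs hunks above make the filesystem explicitly non-modular: the module.h include, MODULE_ALIAS_FS()/MODULE_LICENSE(), and the __exit teardown path are removed, and registration moves from module_init() to fs_initcall(). For code that can only ever be built in, the shape is roughly the following sketch (demo_* names are hypothetical, and the file_system_type is assumed to be defined elsewhere):

#include <linux/fs.h>
#include <linux/init.h>

extern struct file_system_type demo_fs_type;	/* defined by the filesystem */

static int __init demo_init_fs(void)
{
	/* fs_initcall() runs at a slightly earlier initcall level than the
	 * device_initcall() level that module_init() maps to for built-in
	 * code, and it makes the "never a module" intent explicit. */
	return register_filesystem(&demo_fs_type);
}
fs_initcall(demo_init_fs);
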
diff --git a/fs/inode.c b/fs/inode.c
index 4230f66b7410..e491e54d2430 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1883,7 +1883,7 @@ void __init inode_init(void)
1883 sizeof(struct inode), 1883 sizeof(struct inode),
1884 0, 1884 0,
1885 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 1885 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
1886 SLAB_MEM_SPREAD), 1886 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
1887 init_once); 1887 init_once);
1888 1888
1889 /* Hash may have been set up in inode_init_early */ 1889 /* Hash may have been set up in inode_init_early */
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 61abdc4920da..bcd2d41b318a 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -94,7 +94,7 @@ static int __init init_inodecache(void)
94 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", 94 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
95 sizeof(struct iso_inode_info), 95 sizeof(struct iso_inode_info),
96 0, (SLAB_RECLAIM_ACCOUNT| 96 0, (SLAB_RECLAIM_ACCOUNT|
97 SLAB_MEM_SPREAD), 97 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
98 init_once); 98 init_once);
99 if (isofs_inode_cachep == NULL) 99 if (isofs_inode_cachep == NULL)
100 return -ENOMEM; 100 return -ENOMEM;
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index d86c5e3176a1..bb080c272149 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -387,7 +387,7 @@ static int __init init_jffs2_fs(void)
387 jffs2_inode_cachep = kmem_cache_create("jffs2_i", 387 jffs2_inode_cachep = kmem_cache_create("jffs2_i",
388 sizeof(struct jffs2_inode_info), 388 sizeof(struct jffs2_inode_info),
389 0, (SLAB_RECLAIM_ACCOUNT| 389 0, (SLAB_RECLAIM_ACCOUNT|
390 SLAB_MEM_SPREAD), 390 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
391 jffs2_i_init_once); 391 jffs2_i_init_once);
392 if (!jffs2_inode_cachep) { 392 if (!jffs2_inode_cachep) {
393 pr_err("error: Failed to initialise inode cache\n"); 393 pr_err("error: Failed to initialise inode cache\n");
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 8f9176caf098..900925b5eb8c 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -898,7 +898,7 @@ static int __init init_jfs_fs(void)
898 898
899 jfs_inode_cachep = 899 jfs_inode_cachep =
900 kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, 900 kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
901 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, 901 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
902 init_once); 902 init_once);
903 if (jfs_inode_cachep == NULL) 903 if (jfs_inode_cachep == NULL)
904 return -ENOMEM; 904 return -ENOMEM;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 742bf4a230e8..821973853340 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -541,14 +541,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
541 if (!kn) 541 if (!kn)
542 goto err_out1; 542 goto err_out1;
543 543
544 /* 544 ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
545 * If the ino of the sysfs entry created for a kmem cache gets
546 * allocated from an ida layer, which is accounted to the memcg that
547 * owns the cache, the memcg will get pinned forever. So do not account
548 * ino ida allocations.
549 */
550 ret = ida_simple_get(&root->ino_ida, 1, 0,
551 GFP_KERNEL | __GFP_NOACCOUNT);
552 if (ret < 0) 545 if (ret < 0)
553 goto err_out2; 546 goto err_out2;
554 kn->ino = ret; 547 kn->ino = ret;
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
index 09ed066c0221..2b4503163930 100644
--- a/fs/logfs/Kconfig
+++ b/fs/logfs/Kconfig
@@ -1,6 +1,6 @@
1config LOGFS 1config LOGFS
2 tristate "LogFS file system" 2 tristate "LogFS file system"
3 depends on (MTD || BLOCK) 3 depends on MTD || (!MTD && BLOCK)
4 select ZLIB_INFLATE 4 select ZLIB_INFLATE
5 select ZLIB_DEFLATE 5 select ZLIB_DEFLATE
6 select CRC32 6 select CRC32
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 0fce46d62b9c..db9cfc598883 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -409,7 +409,8 @@ const struct super_operations logfs_super_operations = {
409int logfs_init_inode_cache(void) 409int logfs_init_inode_cache(void)
410{ 410{
411 logfs_inode_cache = kmem_cache_create("logfs_inode_cache", 411 logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
412 sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, 412 sizeof(struct logfs_inode), 0,
413 SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
413 logfs_init_once); 414 logfs_init_once);
414 if (!logfs_inode_cache) 415 if (!logfs_inode_cache)
415 return -ENOMEM; 416 return -ENOMEM;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index cb1789ca1ee6..f975d667c539 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -91,7 +91,7 @@ static int __init init_inodecache(void)
91 minix_inode_cachep = kmem_cache_create("minix_inode_cache", 91 minix_inode_cachep = kmem_cache_create("minix_inode_cache",
92 sizeof(struct minix_inode_info), 92 sizeof(struct minix_inode_info),
93 0, (SLAB_RECLAIM_ACCOUNT| 93 0, (SLAB_RECLAIM_ACCOUNT|
94 SLAB_MEM_SPREAD), 94 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
95 init_once); 95 init_once);
96 if (minix_inode_cachep == NULL) 96 if (minix_inode_cachep == NULL)
97 return -ENOMEM; 97 return -ENOMEM;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index ce1eb3f9dfe8..1af15fcbe57b 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -82,7 +82,7 @@ static int init_inodecache(void)
82 ncp_inode_cachep = kmem_cache_create("ncp_inode_cache", 82 ncp_inode_cachep = kmem_cache_create("ncp_inode_cache",
83 sizeof(struct ncp_inode_info), 83 sizeof(struct ncp_inode_info),
84 0, (SLAB_RECLAIM_ACCOUNT| 84 0, (SLAB_RECLAIM_ACCOUNT|
85 SLAB_MEM_SPREAD), 85 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
86 init_once); 86 init_once);
87 if (ncp_inode_cachep == NULL) 87 if (ncp_inode_cachep == NULL)
88 return -ENOMEM; 88 return -ENOMEM;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c11e855e0e18..8e24d886d2c5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1969,7 +1969,7 @@ static int __init nfs_init_inodecache(void)
1969 nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", 1969 nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
1970 sizeof(struct nfs_inode), 1970 sizeof(struct nfs_inode),
1971 0, (SLAB_RECLAIM_ACCOUNT| 1971 0, (SLAB_RECLAIM_ACCOUNT|
1972 SLAB_MEM_SPREAD), 1972 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
1973 init_once); 1973 init_once);
1974 if (nfs_inode_cachep == NULL) 1974 if (nfs_inode_cachep == NULL)
1975 return -ENOMEM; 1975 return -ENOMEM;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c7343844e6b6..7f5d3d9f1c37 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1416,7 +1416,8 @@ static int __init nilfs_init_cachep(void)
1416{ 1416{
1417 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", 1417 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
1418 sizeof(struct nilfs_inode_info), 0, 1418 sizeof(struct nilfs_inode_info), 0,
1419 SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); 1419 SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
1420 nilfs_inode_init_once);
1420 if (!nilfs_inode_cachep) 1421 if (!nilfs_inode_cachep)
1421 goto fail; 1422 goto fail;
1422 1423
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index e785fd954c30..741077deef3b 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -199,8 +199,7 @@ void fsnotify_unmount_inodes(struct super_block *sb)
199 break; 199 break;
200 } 200 }
201 spin_unlock(&next_i->i_lock); 201 spin_unlock(&next_i->i_lock);
202 next_i = list_entry(next_i->i_sb_list.next, 202 next_i = list_next_entry(next_i, i_sb_list);
203 struct inode, i_sb_list);
204 } 203 }
205 204
206 /* 205 /*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc0df4442f7b..cfcbf114676e 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -92,9 +92,6 @@
92#include "fsnotify.h" 92#include "fsnotify.h"
93 93
94struct srcu_struct fsnotify_mark_srcu; 94struct srcu_struct fsnotify_mark_srcu;
95static DEFINE_SPINLOCK(destroy_lock);
96static LIST_HEAD(destroy_list);
97static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
98 95
99void fsnotify_get_mark(struct fsnotify_mark *mark) 96void fsnotify_get_mark(struct fsnotify_mark *mark)
100{ 97{
@@ -168,10 +165,19 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
168 atomic_dec(&group->num_marks); 165 atomic_dec(&group->num_marks);
169} 166}
170 167
168static void
169fsnotify_mark_free_rcu(struct rcu_head *rcu)
170{
171 struct fsnotify_mark *mark;
172
173 mark = container_of(rcu, struct fsnotify_mark, g_rcu);
174 fsnotify_put_mark(mark);
175}
176
171/* 177/*
172 * Free fsnotify mark. The freeing is actually happening from a kthread which 178 * Free fsnotify mark. The freeing is actually happening from a call_srcu
173 * first waits for srcu period end. Caller must have a reference to the mark 179 * callback. Caller must have a reference to the mark or be protected by
174 * or be protected by fsnotify_mark_srcu. 180 * fsnotify_mark_srcu.
175 */ 181 */
176void fsnotify_free_mark(struct fsnotify_mark *mark) 182void fsnotify_free_mark(struct fsnotify_mark *mark)
177{ 183{
@@ -186,10 +192,7 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
186 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; 192 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
187 spin_unlock(&mark->lock); 193 spin_unlock(&mark->lock);
188 194
189 spin_lock(&destroy_lock); 195 call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
190 list_add(&mark->g_list, &destroy_list);
191 spin_unlock(&destroy_lock);
192 wake_up(&destroy_waitq);
193 196
194 /* 197 /*
195 * Some groups like to know that marks are being freed. This is a 198 * Some groups like to know that marks are being freed. This is a
@@ -385,11 +388,7 @@ err:
385 388
386 spin_unlock(&mark->lock); 389 spin_unlock(&mark->lock);
387 390
388 spin_lock(&destroy_lock); 391 call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
389 list_add(&mark->g_list, &destroy_list);
390 spin_unlock(&destroy_lock);
391 wake_up(&destroy_waitq);
392
393 return ret; 392 return ret;
394} 393}
395 394
@@ -492,40 +491,3 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
492 atomic_set(&mark->refcnt, 1); 491 atomic_set(&mark->refcnt, 1);
493 mark->free_mark = free_mark; 492 mark->free_mark = free_mark;
494} 493}
495
496static int fsnotify_mark_destroy(void *ignored)
497{
498 struct fsnotify_mark *mark, *next;
499 struct list_head private_destroy_list;
500
501 for (;;) {
502 spin_lock(&destroy_lock);
503 /* exchange the list head */
504 list_replace_init(&destroy_list, &private_destroy_list);
505 spin_unlock(&destroy_lock);
506
507 synchronize_srcu(&fsnotify_mark_srcu);
508
509 list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
510 list_del_init(&mark->g_list);
511 fsnotify_put_mark(mark);
512 }
513
514 wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
515 }
516
517 return 0;
518}
519
520static int __init fsnotify_mark_init(void)
521{
522 struct task_struct *thread;
523
524 thread = kthread_run(fsnotify_mark_destroy, NULL,
525 "fsnotify_mark");
526 if (IS_ERR(thread))
527 panic("unable to start fsnotify mark destruction thread.");
528
529 return 0;
530}
531device_initcall(fsnotify_mark_init);
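
The fs/notify/mark.c diff above replaces the dedicated "fsnotify_mark" kthread, which batched marks on destroy_list and blocked in synchronize_srcu(), with call_srcu(): the final fsnotify_put_mark() now runs from an SRCU callback once the grace period has elapsed, so no thread, list, or waitqueue is needed. The general deferred-free pattern looks like this sketch (hypothetical demo_* type):

#include <linux/slab.h>
#include <linux/srcu.h>

DEFINE_STATIC_SRCU(demo_srcu);

struct demo_obj {
	int payload;
	struct rcu_head rcu;
};

static void demo_free_rcu(struct rcu_head *rcu)
{
	struct demo_obj *obj = container_of(rcu, struct demo_obj, rcu);

	kfree(obj);	/* all SRCU readers that could still see obj are done */
}

static void demo_release(struct demo_obj *obj)
{
	/* Non-blocking: queue the free to run after the SRCU grace period,
	 * instead of handing the object to a kthread that calls
	 * synchronize_srcu() on its behalf. */
	call_srcu(&demo_srcu, &obj->rcu, demo_free_rcu);
}
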
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index d1a853585b53..2f77f8dfb861 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3139,8 +3139,8 @@ static int __init init_ntfs_fs(void)
3139 3139
3140 ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, 3140 ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
3141 sizeof(big_ntfs_inode), 0, 3141 sizeof(big_ntfs_inode), 0,
3142 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, 3142 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
3143 ntfs_big_inode_init_once); 3143 SLAB_ACCOUNT, ntfs_big_inode_init_once);
3144 if (!ntfs_big_inode_cache) { 3144 if (!ntfs_big_inode_cache) {
3145 pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); 3145 pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
3146 goto big_inode_err_out; 3146 goto big_inode_err_out;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 86181d6526dc..a3ded88718c9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -164,7 +164,7 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
164 struct ocfs2_extent_rec *rec); 164 struct ocfs2_extent_rec *rec);
165static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); 165static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
166static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); 166static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
167static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { 167static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
168 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, 168 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
169 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, 169 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
170 .eo_update_clusters = ocfs2_dinode_update_clusters, 170 .eo_update_clusters = ocfs2_dinode_update_clusters,
@@ -286,7 +286,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
286 le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); 286 le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
287} 287}
288 288
289static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { 289static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
290 .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, 290 .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
291 .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, 291 .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
292 .eo_update_clusters = ocfs2_xattr_value_update_clusters, 292 .eo_update_clusters = ocfs2_xattr_value_update_clusters,
@@ -332,7 +332,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
332 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); 332 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
333} 333}
334 334
335static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { 335static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
336 .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, 336 .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
337 .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, 337 .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
338 .eo_update_clusters = ocfs2_xattr_tree_update_clusters, 338 .eo_update_clusters = ocfs2_xattr_tree_update_clusters,
@@ -379,7 +379,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
379 et->et_root_el = &dx_root->dr_list; 379 et->et_root_el = &dx_root->dr_list;
380} 380}
381 381
382static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { 382static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
383 .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, 383 .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
384 .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, 384 .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
385 .eo_update_clusters = ocfs2_dx_root_update_clusters, 385 .eo_update_clusters = ocfs2_dx_root_update_clusters,
@@ -425,7 +425,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
425 return CONTIG_NONE; 425 return CONTIG_NONE;
426} 426}
427 427
428static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { 428static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
429 .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, 429 .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
430 .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, 430 .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
431 .eo_update_clusters = ocfs2_refcount_tree_update_clusters, 431 .eo_update_clusters = ocfs2_refcount_tree_update_clusters,
@@ -438,7 +438,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
438 struct buffer_head *bh, 438 struct buffer_head *bh,
439 ocfs2_journal_access_func access, 439 ocfs2_journal_access_func access,
440 void *obj, 440 void *obj,
441 struct ocfs2_extent_tree_operations *ops) 441 const struct ocfs2_extent_tree_operations *ops)
442{ 442{
443 et->et_ops = ops; 443 et->et_ops = ops;
444 et->et_root_bh = bh; 444 et->et_root_bh = bh;
@@ -6174,8 +6174,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
6174 } 6174 }
6175 6175
6176bail: 6176bail:
6177 if (tl_inode) 6177 iput(tl_inode);
6178 iput(tl_inode);
6179 brelse(tl_bh); 6178 brelse(tl_bh);
6180 6179
6181 if (status < 0) { 6180 if (status < 0) {
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fb09b97db162..f3dc1b0dfffc 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -54,7 +54,7 @@
54 */ 54 */
55struct ocfs2_extent_tree_operations; 55struct ocfs2_extent_tree_operations;
56struct ocfs2_extent_tree { 56struct ocfs2_extent_tree {
57 struct ocfs2_extent_tree_operations *et_ops; 57 const struct ocfs2_extent_tree_operations *et_ops;
58 struct buffer_head *et_root_bh; 58 struct buffer_head *et_root_bh;
59 struct ocfs2_extent_list *et_root_el; 59 struct ocfs2_extent_list *et_root_el;
60 struct ocfs2_caching_info *et_ci; 60 struct ocfs2_caching_info *et_ci;
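
The alloc.c/alloc.h hunks above constify the extent-tree operation tables: each ocfs2_extent_tree_operations instance is a fixed set of function pointers, so marking it const (and making et_ops a pointer-to-const) lets the compiler place the tables in read-only data and catches accidental writes. The general pattern, with hypothetical demo_* names:

struct demo_ops {
	int (*compute)(int arg);
};

static int demo_compute(int arg)
{
	return arg * 2;
}

/* const: never modified at runtime, so it can live in .rodata */
static const struct demo_ops demo_ops_table = {
	.compute = demo_compute,
};

struct demo_ctx {
	const struct demo_ops *ops;	/* pointer-to-const matches the table */
};

static int demo_run(struct demo_ctx *ctx, int arg)
{
	return ctx->ops->compute(arg);
}
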
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 709fbbd44c65..a3cc6d2fc896 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1780,8 +1780,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
1780 } 1780 }
1781 ++live_threshold; 1781 ++live_threshold;
1782 atomic_set(&reg->hr_steady_iterations, live_threshold); 1782 atomic_set(&reg->hr_steady_iterations, live_threshold);
1783 /* unsteady_iterations is double the steady_iterations */ 1783 /* unsteady_iterations is triple the steady_iterations */
1784 atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1)); 1784 atomic_set(&reg->hr_unsteady_iterations, (live_threshold * 3));
1785 1785
1786 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", 1786 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
1787 reg->hr_item.ci_name); 1787 reg->hr_item.ci_name);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e88ccf8c83ff..68c607e63ff6 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -376,17 +376,6 @@ struct dlm_lock
376 lksb_kernel_allocated:1; 376 lksb_kernel_allocated:1;
377}; 377};
378 378
379
380#define DLM_LKSB_UNUSED1 0x01
381#define DLM_LKSB_PUT_LVB 0x02
382#define DLM_LKSB_GET_LVB 0x04
383#define DLM_LKSB_UNUSED2 0x08
384#define DLM_LKSB_UNUSED3 0x10
385#define DLM_LKSB_UNUSED4 0x20
386#define DLM_LKSB_UNUSED5 0x40
387#define DLM_LKSB_UNUSED6 0x80
388
389
390enum dlm_lockres_list { 379enum dlm_lockres_list {
391 DLM_GRANTED_LIST = 0, 380 DLM_GRANTED_LIST = 0,
392 DLM_CONVERTING_LIST = 1, 381 DLM_CONVERTING_LIST = 1,
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 84f2f8079466..9477d6e1de37 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2388,8 +2388,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2388 2388
2389 spin_lock(&res->spinlock); 2389 spin_lock(&res->spinlock);
2390 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2390 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2391 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
2391 if (test_bit(node, res->refmap)) { 2392 if (test_bit(node, res->refmap)) {
2392 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
2393 dlm_lockres_clear_refmap_bit(dlm, res, node); 2393 dlm_lockres_clear_refmap_bit(dlm, res, node);
2394 cleared = 1; 2394 cleared = 1;
2395 } 2395 }
@@ -2519,6 +2519,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2519 spin_lock(&dlm->master_lock); 2519 spin_lock(&dlm->master_lock);
2520 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, 2520 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
2521 namelen, target, dlm->node_num); 2521 namelen, target, dlm->node_num);
2522 /* get an extra reference on the mle.
2523 * otherwise the assert_master from the new
2524 * master will destroy this.
2525 */
2526 dlm_get_mle_inuse(mle);
2522 spin_unlock(&dlm->master_lock); 2527 spin_unlock(&dlm->master_lock);
2523 spin_unlock(&dlm->spinlock); 2528 spin_unlock(&dlm->spinlock);
2524 2529
@@ -2544,7 +2549,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2544 } 2549 }
2545 2550
2546fail: 2551fail:
2547 if (oldmle) { 2552 if (ret != -EEXIST && oldmle) {
2548 /* master is known, detach if not already detached */ 2553 /* master is known, detach if not already detached */
2549 dlm_mle_detach_hb_events(dlm, oldmle); 2554 dlm_mle_detach_hb_events(dlm, oldmle);
2550 dlm_put_mle(oldmle); 2555 dlm_put_mle(oldmle);
@@ -2554,6 +2559,7 @@ fail:
2554 if (mle_added) { 2559 if (mle_added) {
2555 dlm_mle_detach_hb_events(dlm, mle); 2560 dlm_mle_detach_hb_events(dlm, mle);
2556 dlm_put_mle(mle); 2561 dlm_put_mle(mle);
2562 dlm_put_mle_inuse(mle);
2557 } else if (mle) { 2563 } else if (mle) {
2558 kmem_cache_free(dlm_mle_cache, mle); 2564 kmem_cache_free(dlm_mle_cache, mle);
2559 mle = NULL; 2565 mle = NULL;
@@ -2571,17 +2577,6 @@ fail:
2571 * ensure that all assert_master work is flushed. */ 2577 * ensure that all assert_master work is flushed. */
2572 flush_workqueue(dlm->dlm_worker); 2578 flush_workqueue(dlm->dlm_worker);
2573 2579
2574 /* get an extra reference on the mle.
2575 * otherwise the assert_master from the new
2576 * master will destroy this.
2577 * also, make sure that all callers of dlm_get_mle
2578 * take both dlm->spinlock and dlm->master_lock */
2579 spin_lock(&dlm->spinlock);
2580 spin_lock(&dlm->master_lock);
2581 dlm_get_mle_inuse(mle);
2582 spin_unlock(&dlm->master_lock);
2583 spin_unlock(&dlm->spinlock);
2584
2585 /* notify new node and send all lock state */ 2580 /* notify new node and send all lock state */
2586 /* call send_one_lockres with migration flag. 2581 /* call send_one_lockres with migration flag.
2587 * this serves as notice to the target node that a 2582 * this serves as notice to the target node that a
@@ -3050,7 +3045,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3050 int ret = 0; 3045 int ret = 0;
3051 3046
3052 if (!dlm_grab(dlm)) 3047 if (!dlm_grab(dlm))
3053 return -EINVAL; 3048 return 0;
3054 3049
3055 name = migrate->name; 3050 name = migrate->name;
3056 namelen = migrate->namelen; 3051 namelen = migrate->namelen;
@@ -3141,7 +3136,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3141 mlog(0, "tried to migrate %.*s, but some " 3136 mlog(0, "tried to migrate %.*s, but some "
3142 "process beat me to it\n", 3137 "process beat me to it\n",
3143 namelen, name); 3138 namelen, name);
3144 ret = -EEXIST; 3139 spin_unlock(&tmp->spinlock);
3140 return -EEXIST;
3145 } else { 3141 } else {
3146 /* bad. 2 NODES are trying to migrate! */ 3142 /* bad. 2 NODES are trying to migrate! */
3147 mlog(ML_ERROR, "migration error mle: " 3143 mlog(ML_ERROR, "migration error mle: "
@@ -3312,6 +3308,15 @@ top:
3312 mle->new_master != dead_node) 3308 mle->new_master != dead_node)
3313 continue; 3309 continue;
3314 3310
3311 if (mle->new_master == dead_node && mle->inuse) {
3312 mlog(ML_NOTICE, "%s: target %u died during "
3313 "migration from %u, the MLE is "
3314 "still keep used, ignore it!\n",
3315 dlm->name, dead_node,
3316 mle->master);
3317 continue;
3318 }
3319
3315 /* If we have reached this point, this mle needs to be 3320 /* If we have reached this point, this mle needs to be
3316 * removed from the list and freed. */ 3321 * removed from the list and freed. */
3317 dlm_clean_migration_mle(dlm, mle); 3322 dlm_clean_migration_mle(dlm, mle);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9e4f862d20fe..c5bdf02c213b 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1373,6 +1373,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
1373 char *buf = NULL; 1373 char *buf = NULL;
1374 struct dlm_work_item *item = NULL; 1374 struct dlm_work_item *item = NULL;
1375 struct dlm_lock_resource *res = NULL; 1375 struct dlm_lock_resource *res = NULL;
1376 unsigned int hash;
1376 1377
1377 if (!dlm_grab(dlm)) 1378 if (!dlm_grab(dlm))
1378 return -EINVAL; 1379 return -EINVAL;
@@ -1400,7 +1401,10 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
1400 /* lookup the lock to see if we have a secondary queue for this 1401 /* lookup the lock to see if we have a secondary queue for this
1401 * already... just add the locks in and this will have its owner 1402 * already... just add the locks in and this will have its owner
1402 * and RECOVERY flag changed when it completes. */ 1403 * and RECOVERY flag changed when it completes. */
1403 res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); 1404 hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
1405 spin_lock(&dlm->spinlock);
1406 res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len,
1407 hash);
1404 if (res) { 1408 if (res) {
1405 /* this will get a ref on res */ 1409 /* this will get a ref on res */
1406 /* mark it as recovering/migrating and hash it */ 1410 /* mark it as recovering/migrating and hash it */
@@ -1421,13 +1425,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
1421 mres->lockname_len, mres->lockname); 1425 mres->lockname_len, mres->lockname);
1422 ret = -EFAULT; 1426 ret = -EFAULT;
1423 spin_unlock(&res->spinlock); 1427 spin_unlock(&res->spinlock);
1428 spin_unlock(&dlm->spinlock);
1424 dlm_lockres_put(res); 1429 dlm_lockres_put(res);
1425 goto leave; 1430 goto leave;
1426 } 1431 }
1427 res->state |= DLM_LOCK_RES_MIGRATING; 1432 res->state |= DLM_LOCK_RES_MIGRATING;
1428 } 1433 }
1429 spin_unlock(&res->spinlock); 1434 spin_unlock(&res->spinlock);
1435 spin_unlock(&dlm->spinlock);
1430 } else { 1436 } else {
1437 spin_unlock(&dlm->spinlock);
1431 /* need to allocate, just like if it was 1438 /* need to allocate, just like if it was
1432 * mastered here normally */ 1439 * mastered here normally */
1433 res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); 1440 res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
@@ -2450,11 +2457,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2450 * perhaps later we can genericize this for other waiters. */ 2457 * perhaps later we can genericize this for other waiters. */
2451 wake_up(&dlm->migration_wq); 2458 wake_up(&dlm->migration_wq);
2452 2459
2453 if (test_bit(idx, dlm->recovery_map)) 2460 set_bit(idx, dlm->recovery_map);
2454 mlog(0, "domain %s, node %u already added "
2455 "to recovery map!\n", dlm->name, idx);
2456 else
2457 set_bit(idx, dlm->recovery_map);
2458} 2461}
2459 2462
2460void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) 2463void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 2e3c9dbab68c..1082b2c3014b 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -421,7 +421,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
421 } 421 }
422 422
423 if (!dlm_grab(dlm)) 423 if (!dlm_grab(dlm))
424 return DLM_REJECTED; 424 return DLM_FORWARD;
425 425
426 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), 426 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
427 "Domain %s not fully joined!\n", dlm->name); 427 "Domain %s not fully joined!\n", dlm->name);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b5cf27dcb18a..03768bb3aab1 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void)
638 dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", 638 dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
639 sizeof(struct dlmfs_inode_private), 639 sizeof(struct dlmfs_inode_private),
640 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 640 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
641 SLAB_MEM_SPREAD), 641 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
642 dlmfs_init_once); 642 dlmfs_init_once);
643 if (!dlmfs_inode_cache) { 643 if (!dlmfs_inode_cache) {
644 status = -ENOMEM; 644 status = -ENOMEM;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 20276e340339..f92612e4b9d6 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2432,12 +2432,6 @@ bail:
2432 * done this we have to return AOP_TRUNCATED_PAGE so the aop method 2432 * done this we have to return AOP_TRUNCATED_PAGE so the aop method
2433 * that called us can bubble that back up into the VFS who will then 2433 * that called us can bubble that back up into the VFS who will then
2434 * immediately retry the aop call. 2434 * immediately retry the aop call.
2435 *
2436 * We do a blocking lock and immediate unlock before returning, though, so that
2437 * the lock has a great chance of being cached on this node by the time the VFS
2438 * calls back to retry the aop. This has a potential to livelock as nodes
2439 * ping locks back and forth, but that's a risk we're willing to take to avoid
2440 * the lock inversion simply.
2441 */ 2435 */
2442int ocfs2_inode_lock_with_page(struct inode *inode, 2436int ocfs2_inode_lock_with_page(struct inode *inode,
2443 struct buffer_head **ret_bh, 2437 struct buffer_head **ret_bh,
@@ -2449,8 +2443,6 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
2449 ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); 2443 ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
2450 if (ret == -EAGAIN) { 2444 if (ret == -EAGAIN) {
2451 unlock_page(page); 2445 unlock_page(page);
2452 if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
2453 ocfs2_inode_unlock(inode, ex);
2454 ret = AOP_TRUNCATED_PAGE; 2446 ret = AOP_TRUNCATED_PAGE;
2455 } 2447 }
2456 2448
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0e5b4515f92e..d63127932509 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1302,6 +1302,14 @@ int ocfs2_getattr(struct vfsmount *mnt,
1302 } 1302 }
1303 1303
1304 generic_fillattr(inode, stat); 1304 generic_fillattr(inode, stat);
1305 /*
1306 * If there is inline data in the inode, the inode will normally not
1307 * have data blocks allocated (it may have an external xattr block).
1308 * Report at least one sector for such files, so tools like tar, rsync,
1309 * others don't incorrectly think the file is completely sparse.
1310 */
1311 if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
1312 stat->blocks += (stat->size + 511)>>9;
1305 1313
1306 /* We set the blksize from the cluster size for performance */ 1314 /* We set the blksize from the cluster size for performance */
1307 stat->blksize = osb->s_clustersize; 1315 stat->blksize = osb->s_clustersize;
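
The ocfs2_getattr() hunk above addresses inline-data inodes, which normally own no data blocks: tools that treat st_blocks == 0 as "entirely sparse" (tar, rsync) would otherwise skip the contents, so the size is reported rounded up to 512-byte sectors. The arithmetic in isolation, as a standalone illustration:

#include <stdio.h>

/* st_blocks is expressed in 512-byte sectors; round the byte count up. */
static unsigned long long bytes_to_sectors(unsigned long long bytes)
{
	return (bytes + 511) >> 9;
}

int main(void)
{
	/* 0 bytes -> 0 sectors, 100 bytes -> 1, 1024 bytes -> 2 */
	printf("%llu %llu %llu\n", bytes_to_sectors(0),
	       bytes_to_sectors(100), bytes_to_sectors(1024));
	return 0;
}
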
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 3cb097ccce60..16b0bb482ea7 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -606,9 +606,7 @@ bail:
606 if (gb_inode) 606 if (gb_inode)
607 mutex_unlock(&gb_inode->i_mutex); 607 mutex_unlock(&gb_inode->i_mutex);
608 608
609 if (gb_inode) 609 iput(gb_inode);
610 iput(gb_inode);
611
612 brelse(bh); 610 brelse(bh);
613 611
614 return status; 612 return status;
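
Several ocfs2 cleanups in this series (alloc.c and ioctl.c above, journal.c, localalloc.c, namei.c, slot_map.c and super.c below) drop the "if (inode)" guard before iput(): like kfree(), iput() simply returns when handed a NULL inode, so unconditional calls keep the error paths shorter. A sketch of the resulting teardown style (demo_cleanup is illustrative):

#include <linux/buffer_head.h>
#include <linux/fs.h>

static void demo_cleanup(struct inode *inode, struct buffer_head *bh)
{
	iput(inode);	/* no-op when inode == NULL */
	brelse(bh);	/* brelse() is NULL-safe as well */
}
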
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 13534f4fe5b5..3772a2dbb980 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1042,8 +1042,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
1042 1042
1043// up_write(&journal->j_trans_barrier); 1043// up_write(&journal->j_trans_barrier);
1044done: 1044done:
1045 if (inode) 1045 iput(inode);
1046 iput(inode);
1047} 1046}
1048 1047
1049static void ocfs2_clear_journal_error(struct super_block *sb, 1048static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1687,9 +1686,7 @@ done:
1687 if (got_lock) 1686 if (got_lock)
1688 ocfs2_inode_unlock(inode, 1); 1687 ocfs2_inode_unlock(inode, 1);
1689 1688
1690 if (inode) 1689 iput(inode);
1691 iput(inode);
1692
1693 brelse(bh); 1690 brelse(bh);
1694 1691
1695 return status; 1692 return status;
@@ -1796,8 +1793,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
1796 1793
1797 ocfs2_inode_unlock(inode, 1); 1794 ocfs2_inode_unlock(inode, 1);
1798bail: 1795bail:
1799 if (inode) 1796 iput(inode);
1800 iput(inode);
1801 1797
1802 return status; 1798 return status;
1803} 1799}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 0a4457fb0711..e9c99e35f5ea 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -358,8 +358,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
358bail: 358bail:
359 if (status < 0) 359 if (status < 0)
360 brelse(alloc_bh); 360 brelse(alloc_bh);
361 if (inode) 361 iput(inode);
362 iput(inode);
363 362
364 trace_ocfs2_load_local_alloc(osb->local_alloc_bits); 363 trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
365 364
@@ -473,8 +472,7 @@ out_mutex:
473 iput(main_bm_inode); 472 iput(main_bm_inode);
474 473
475out: 474out:
476 if (local_alloc_inode) 475 iput(local_alloc_inode);
477 iput(local_alloc_inode);
478 476
479 kfree(alloc_copy); 477 kfree(alloc_copy);
480} 478}
@@ -1327,9 +1325,7 @@ bail:
1327 1325
1328 brelse(main_bm_bh); 1326 brelse(main_bm_bh);
1329 1327
1330 if (main_bm_inode) 1328 iput(main_bm_inode);
1331 iput(main_bm_inode);
1332
1333 kfree(alloc_copy); 1329 kfree(alloc_copy);
1334 1330
1335 if (ac) 1331 if (ac)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index afb81eae2c18..ab42c38031b1 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1683,8 +1683,7 @@ bail:
1683 if (new_inode) 1683 if (new_inode)
1684 sync_mapping_buffers(old_inode->i_mapping); 1684 sync_mapping_buffers(old_inode->i_mapping);
1685 1685
1686 if (new_inode) 1686 iput(new_inode);
1687 iput(new_inode);
1688 1687
1689 ocfs2_free_dir_lookup_result(&target_lookup_res); 1688 ocfs2_free_dir_lookup_result(&target_lookup_res);
1690 ocfs2_free_dir_lookup_result(&old_entry_lookup); 1689 ocfs2_free_dir_lookup_result(&old_entry_lookup);
@@ -2373,6 +2372,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2373 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, 2372 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
2374 name, strlen(name)); 2373 name, strlen(name));
2375 2374
2375 status = ocfs2_journal_access_di(handle,
2376 INODE_CACHE(orphan_dir_inode),
2377 orphan_dir_bh,
2378 OCFS2_JOURNAL_ACCESS_WRITE);
2379 if (status < 0) {
2380 mlog_errno(status);
2381 goto leave;
2382 }
2383
2376 /* find it's spot in the orphan directory */ 2384 /* find it's spot in the orphan directory */
2377 status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode, 2385 status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode,
2378 &lookup); 2386 &lookup);
@@ -2388,15 +2396,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2388 goto leave; 2396 goto leave;
2389 } 2397 }
2390 2398
2391 status = ocfs2_journal_access_di(handle,
2392 INODE_CACHE(orphan_dir_inode),
2393 orphan_dir_bh,
2394 OCFS2_JOURNAL_ACCESS_WRITE);
2395 if (status < 0) {
2396 mlog_errno(status);
2397 goto leave;
2398 }
2399
2400 /* do the i_nlink dance! :) */ 2399 /* do the i_nlink dance! :) */
2401 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2400 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2402 if (S_ISDIR(inode->i_mode)) 2401 if (S_ISDIR(inode->i_mode))
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index e78a203d44c8..1e09592148ad 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -322,8 +322,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
322 if (si == NULL) 322 if (si == NULL)
323 return; 323 return;
324 324
325 if (si->si_inode) 325 iput(si->si_inode);
326 iput(si->si_inode);
327 if (si->si_bh) { 326 if (si->si_bh) {
328 for (i = 0; i < si->si_blocks; i++) { 327 for (i = 0; i < si->si_blocks; i++) {
329 if (si->si_bh[i]) { 328 if (si->si_bh[i]) {
@@ -503,8 +502,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
503 trace_ocfs2_find_slot(osb->slot_num); 502 trace_ocfs2_find_slot(osb->slot_num);
504 503
505 status = ocfs2_update_disk_slot(osb, si, osb->slot_num); 504 status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
506 if (status < 0) 505 if (status < 0) {
507 mlog_errno(status); 506 mlog_errno(status);
507 /*
508 * if write block failed, invalidate slot to avoid overwrite
509 * slot during dismount in case another node rightly has mounted
510 */
511 spin_lock(&osb->osb_lock);
512 ocfs2_invalidate_slot(si, osb->slot_num);
513 osb->slot_num = OCFS2_INVALID_SLOT;
514 spin_unlock(&osb->osb_lock);
515 }
508 516
509bail: 517bail:
510 return status; 518 return status;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2de4c8a9340c..faa1365097bc 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1280,6 +1280,8 @@ static int ocfs2_parse_options(struct super_block *sb,
1280 int status, user_stack = 0; 1280 int status, user_stack = 0;
1281 char *p; 1281 char *p;
1282 u32 tmp; 1282 u32 tmp;
1283 int token, option;
1284 substring_t args[MAX_OPT_ARGS];
1283 1285
1284 trace_ocfs2_parse_options(is_remount, options ? options : "(none)"); 1286 trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
1285 1287
@@ -1298,9 +1300,6 @@ static int ocfs2_parse_options(struct super_block *sb,
1298 } 1300 }
1299 1301
1300 while ((p = strsep(&options, ",")) != NULL) { 1302 while ((p = strsep(&options, ",")) != NULL) {
1301 int token, option;
1302 substring_t args[MAX_OPT_ARGS];
1303
1304 if (!*p) 1303 if (!*p)
1305 continue; 1304 continue;
1306 1305
@@ -1367,7 +1366,6 @@ static int ocfs2_parse_options(struct super_block *sb,
1367 mopt->atime_quantum = option; 1366 mopt->atime_quantum = option;
1368 break; 1367 break;
1369 case Opt_slot: 1368 case Opt_slot:
1370 option = 0;
1371 if (match_int(&args[0], &option)) { 1369 if (match_int(&args[0], &option)) {
1372 status = 0; 1370 status = 0;
1373 goto bail; 1371 goto bail;
@@ -1376,7 +1374,6 @@ static int ocfs2_parse_options(struct super_block *sb,
1376 mopt->slot = (s16)option; 1374 mopt->slot = (s16)option;
1377 break; 1375 break;
1378 case Opt_commit: 1376 case Opt_commit:
1379 option = 0;
1380 if (match_int(&args[0], &option)) { 1377 if (match_int(&args[0], &option)) {
1381 status = 0; 1378 status = 0;
1382 goto bail; 1379 goto bail;
@@ -1388,7 +1385,6 @@ static int ocfs2_parse_options(struct super_block *sb,
1388 mopt->commit_interval = HZ * option; 1385 mopt->commit_interval = HZ * option;
1389 break; 1386 break;
1390 case Opt_localalloc: 1387 case Opt_localalloc:
1391 option = 0;
1392 if (match_int(&args[0], &option)) { 1388 if (match_int(&args[0], &option)) {
1393 status = 0; 1389 status = 0;
1394 goto bail; 1390 goto bail;
@@ -1726,8 +1722,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
1726 ocfs2_inode_unlock(inode, 0); 1722 ocfs2_inode_unlock(inode, 0);
1727 status = 0; 1723 status = 0;
1728bail: 1724bail:
1729 if (inode) 1725 iput(inode);
1730 iput(inode);
1731 1726
1732 if (status) 1727 if (status)
1733 mlog_errno(status); 1728 mlog_errno(status);
@@ -1771,7 +1766,7 @@ static int ocfs2_initialize_mem_caches(void)
1771 sizeof(struct ocfs2_inode_info), 1766 sizeof(struct ocfs2_inode_info),
1772 0, 1767 0,
1773 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 1768 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
1774 SLAB_MEM_SPREAD), 1769 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
1775 ocfs2_inode_init_once); 1770 ocfs2_inode_init_once);
1776 ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", 1771 ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
1777 sizeof(struct ocfs2_dquot), 1772 sizeof(struct ocfs2_dquot),
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 15e4500cda3e..b61b883c8ff8 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -443,7 +443,7 @@ static int __init init_openprom_fs(void)
443 sizeof(struct op_inode_info), 443 sizeof(struct op_inode_info),
444 0, 444 0,
445 (SLAB_RECLAIM_ACCOUNT | 445 (SLAB_RECLAIM_ACCOUNT |
446 SLAB_MEM_SPREAD), 446 SLAB_MEM_SPREAD | SLAB_ACCOUNT),
447 op_inode_init_once); 447 op_inode_init_once);
448 if (!op_inode_cachep) 448 if (!op_inode_cachep)
449 return -ENOMEM; 449 return -ENOMEM;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d0e9b9b6223e..42305ddcbaa0 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -95,7 +95,8 @@ void __init proc_init_inodecache(void)
95 proc_inode_cachep = kmem_cache_create("proc_inode_cache", 95 proc_inode_cachep = kmem_cache_create("proc_inode_cache",
96 sizeof(struct proc_inode), 96 sizeof(struct proc_inode),
97 0, (SLAB_RECLAIM_ACCOUNT| 97 0, (SLAB_RECLAIM_ACCOUNT|
98 SLAB_MEM_SPREAD|SLAB_PANIC), 98 SLAB_MEM_SPREAD|SLAB_ACCOUNT|
99 SLAB_PANIC),
99 init_once); 100 init_once);
100} 101}
101 102
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 9155a5a0d3b9..df4661abadc4 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -57,11 +57,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
57 /* 57 /*
58 * Estimate the amount of memory available for userspace allocations, 58 * Estimate the amount of memory available for userspace allocations,
59 * without causing swapping. 59 * without causing swapping.
60 *
61 * Free memory cannot be taken below the low watermark, before the
62 * system starts swapping.
63 */ 60 */
64 available = i.freeram - wmark_low; 61 available = i.freeram - totalreserve_pages;
65 62
66 /* 63 /*
67 * Not all the page cache can be freed, otherwise the system will 64 * Not all the page cache can be freed, otherwise the system will
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187b3b5f242e..a353b4c6e86e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -14,6 +14,7 @@
14#include <linux/swapops.h> 14#include <linux/swapops.h>
15#include <linux/mmu_notifier.h> 15#include <linux/mmu_notifier.h>
16#include <linux/page_idle.h> 16#include <linux/page_idle.h>
17#include <linux/shmem_fs.h>
17 18
18#include <asm/elf.h> 19#include <asm/elf.h>
19#include <asm/uaccess.h> 20#include <asm/uaccess.h>
@@ -22,9 +23,13 @@
22 23
23void task_mem(struct seq_file *m, struct mm_struct *mm) 24void task_mem(struct seq_file *m, struct mm_struct *mm)
24{ 25{
25 unsigned long data, text, lib, swap, ptes, pmds; 26 unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
26 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 27 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
27 28
29 anon = get_mm_counter(mm, MM_ANONPAGES);
30 file = get_mm_counter(mm, MM_FILEPAGES);
31 shmem = get_mm_counter(mm, MM_SHMEMPAGES);
32
28 /* 33 /*
29 * Note: to minimize their overhead, mm maintains hiwater_vm and 34 * Note: to minimize their overhead, mm maintains hiwater_vm and
30 * hiwater_rss only when about to *lower* total_vm or rss. Any 35 * hiwater_rss only when about to *lower* total_vm or rss. Any
@@ -35,11 +40,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
35 hiwater_vm = total_vm = mm->total_vm; 40 hiwater_vm = total_vm = mm->total_vm;
36 if (hiwater_vm < mm->hiwater_vm) 41 if (hiwater_vm < mm->hiwater_vm)
37 hiwater_vm = mm->hiwater_vm; 42 hiwater_vm = mm->hiwater_vm;
38 hiwater_rss = total_rss = get_mm_rss(mm); 43 hiwater_rss = total_rss = anon + file + shmem;
39 if (hiwater_rss < mm->hiwater_rss) 44 if (hiwater_rss < mm->hiwater_rss)
40 hiwater_rss = mm->hiwater_rss; 45 hiwater_rss = mm->hiwater_rss;
41 46
42 data = mm->total_vm - mm->shared_vm - mm->stack_vm;
43 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 47 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
44 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 48 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
45 swap = get_mm_counter(mm, MM_SWAPENTS); 49 swap = get_mm_counter(mm, MM_SWAPENTS);
@@ -52,6 +56,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
52 "VmPin:\t%8lu kB\n" 56 "VmPin:\t%8lu kB\n"
53 "VmHWM:\t%8lu kB\n" 57 "VmHWM:\t%8lu kB\n"
54 "VmRSS:\t%8lu kB\n" 58 "VmRSS:\t%8lu kB\n"
59 "RssAnon:\t%8lu kB\n"
60 "RssFile:\t%8lu kB\n"
61 "RssShmem:\t%8lu kB\n"
55 "VmData:\t%8lu kB\n" 62 "VmData:\t%8lu kB\n"
56 "VmStk:\t%8lu kB\n" 63 "VmStk:\t%8lu kB\n"
57 "VmExe:\t%8lu kB\n" 64 "VmExe:\t%8lu kB\n"
@@ -65,7 +72,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
65 mm->pinned_vm << (PAGE_SHIFT-10), 72 mm->pinned_vm << (PAGE_SHIFT-10),
66 hiwater_rss << (PAGE_SHIFT-10), 73 hiwater_rss << (PAGE_SHIFT-10),
67 total_rss << (PAGE_SHIFT-10), 74 total_rss << (PAGE_SHIFT-10),
68 data << (PAGE_SHIFT-10), 75 anon << (PAGE_SHIFT-10),
76 file << (PAGE_SHIFT-10),
77 shmem << (PAGE_SHIFT-10),
78 mm->data_vm << (PAGE_SHIFT-10),
69 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 79 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
70 ptes >> 10, 80 ptes >> 10,
71 pmds >> 10, 81 pmds >> 10,
@@ -82,10 +92,11 @@ unsigned long task_statm(struct mm_struct *mm,
82 unsigned long *shared, unsigned long *text, 92 unsigned long *shared, unsigned long *text,
83 unsigned long *data, unsigned long *resident) 93 unsigned long *data, unsigned long *resident)
84{ 94{
85 *shared = get_mm_counter(mm, MM_FILEPAGES); 95 *shared = get_mm_counter(mm, MM_FILEPAGES) +
96 get_mm_counter(mm, MM_SHMEMPAGES);
86 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 97 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
87 >> PAGE_SHIFT; 98 >> PAGE_SHIFT;
88 *data = mm->total_vm - mm->shared_vm; 99 *data = mm->data_vm + mm->stack_vm;
89 *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); 100 *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
90 return mm->total_vm; 101 return mm->total_vm;
91} 102}
@@ -451,6 +462,7 @@ struct mem_size_stats {
451 unsigned long private_hugetlb; 462 unsigned long private_hugetlb;
452 u64 pss; 463 u64 pss;
453 u64 swap_pss; 464 u64 swap_pss;
465 bool check_shmem_swap;
454}; 466};
455 467
456static void smaps_account(struct mem_size_stats *mss, struct page *page, 468static void smaps_account(struct mem_size_stats *mss, struct page *page,
@@ -485,6 +497,19 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
485 } 497 }
486} 498}
487 499
500#ifdef CONFIG_SHMEM
501static int smaps_pte_hole(unsigned long addr, unsigned long end,
502 struct mm_walk *walk)
503{
504 struct mem_size_stats *mss = walk->private;
505
506 mss->swap += shmem_partial_swap_usage(
507 walk->vma->vm_file->f_mapping, addr, end);
508
509 return 0;
510}
511#endif
512
488static void smaps_pte_entry(pte_t *pte, unsigned long addr, 513static void smaps_pte_entry(pte_t *pte, unsigned long addr,
489 struct mm_walk *walk) 514 struct mm_walk *walk)
490{ 515{
@@ -512,6 +537,19 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
512 } 537 }
513 } else if (is_migration_entry(swpent)) 538 } else if (is_migration_entry(swpent))
514 page = migration_entry_to_page(swpent); 539 page = migration_entry_to_page(swpent);
540 } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
541 && pte_none(*pte))) {
542 page = find_get_entry(vma->vm_file->f_mapping,
543 linear_page_index(vma, addr));
544 if (!page)
545 return;
546
547 if (radix_tree_exceptional_entry(page))
548 mss->swap += PAGE_SIZE;
549 else
550 page_cache_release(page);
551
552 return;
515 } 553 }
516 554
517 if (!page) 555 if (!page)
@@ -671,6 +709,31 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
671 }; 709 };
672 710
673 memset(&mss, 0, sizeof mss); 711 memset(&mss, 0, sizeof mss);
712
713#ifdef CONFIG_SHMEM
714 if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
715 /*
716 * For shared or readonly shmem mappings we know that all
717 * swapped out pages belong to the shmem object, and we can
718 * obtain the swap value much more efficiently. For private
719 * writable mappings, we might have COW pages that are
720 * not affected by the parent swapped out pages of the shmem
721 * object, so we have to distinguish them during the page walk.
722 * Unless we know that the shmem object (or the part mapped by
723 * our VMA) has no swapped out pages at all.
724 */
725 unsigned long shmem_swapped = shmem_swap_usage(vma);
726
727 if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
728 !(vma->vm_flags & VM_WRITE)) {
729 mss.swap = shmem_swapped;
730 } else {
731 mss.check_shmem_swap = true;
732 smaps_walk.pte_hole = smaps_pte_hole;
733 }
734 }
735#endif
736
674 /* mmap_sem is held in m_start */ 737 /* mmap_sem is held in m_start */
675 walk_page_vma(vma, &smaps_walk); 738 walk_page_vma(vma, &smaps_walk);
676 739
@@ -817,9 +880,6 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
817 pmd = pmd_wrprotect(pmd); 880 pmd = pmd_wrprotect(pmd);
818 pmd = pmd_clear_soft_dirty(pmd); 881 pmd = pmd_clear_soft_dirty(pmd);
819 882
820 if (vma->vm_flags & VM_SOFTDIRTY)
821 vma->vm_flags &= ~VM_SOFTDIRTY;
822
823 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 883 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
824} 884}
825#else 885#else
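
The task_mmu.c changes above split RSS reporting into anon, file and shmem components (the new MM_SHMEMPAGES counter) and emit them as RssAnon/RssFile/RssShmem in /proc/<pid>/status, alongside the reworked VmData accounting. A small userspace check that simply echoes the new lines for the current process (illustrative, not part of the patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* Print only the per-type RSS fields added by this series */
		if (!strncmp(line, "RssAnon:", 8) ||
		    !strncmp(line, "RssFile:", 8) ||
		    !strncmp(line, "RssShmem:", 9))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}
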
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index f37b3deb01b4..3a67cfb142d8 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -365,7 +365,7 @@ static int init_inodecache(void)
365 qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache", 365 qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache",
366 sizeof(struct qnx4_inode_info), 366 sizeof(struct qnx4_inode_info),
367 0, (SLAB_RECLAIM_ACCOUNT| 367 0, (SLAB_RECLAIM_ACCOUNT|
368 SLAB_MEM_SPREAD), 368 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
369 init_once); 369 init_once);
370 if (qnx4_inode_cachep == NULL) 370 if (qnx4_inode_cachep == NULL)
371 return -ENOMEM; 371 return -ENOMEM;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 9728b5499e1d..47bb1de07155 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -625,7 +625,7 @@ static int init_inodecache(void)
625 qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", 625 qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
626 sizeof(struct qnx6_inode_info), 626 sizeof(struct qnx6_inode_info),
627 0, (SLAB_RECLAIM_ACCOUNT| 627 0, (SLAB_RECLAIM_ACCOUNT|
628 SLAB_MEM_SPREAD), 628 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
629 init_once); 629 init_once);
630 if (!qnx6_inode_cachep) 630 if (!qnx6_inode_cachep)
631 return -ENOMEM; 631 return -ENOMEM;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 4a62fe8cc3bf..05db7473bcb5 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -626,7 +626,8 @@ static int __init init_inodecache(void)
626 sizeof(struct 626 sizeof(struct
627 reiserfs_inode_info), 627 reiserfs_inode_info),
628 0, (SLAB_RECLAIM_ACCOUNT| 628 0, (SLAB_RECLAIM_ACCOUNT|
629 SLAB_MEM_SPREAD), 629 SLAB_MEM_SPREAD|
630 SLAB_ACCOUNT),
630 init_once); 631 init_once);
631 if (reiserfs_inode_cachep == NULL) 632 if (reiserfs_inode_cachep == NULL)
632 return -ENOMEM; 633 return -ENOMEM;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index bb894e78a821..6b00ca357c58 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -619,8 +619,8 @@ static int __init init_romfs_fs(void)
619 romfs_inode_cachep = 619 romfs_inode_cachep =
620 kmem_cache_create("romfs_i", 620 kmem_cache_create("romfs_i",
621 sizeof(struct romfs_inode_info), 0, 621 sizeof(struct romfs_inode_info), 0,
622 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 622 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
623 romfs_i_init_once); 623 SLAB_ACCOUNT, romfs_i_init_once);
624 624
625 if (!romfs_inode_cachep) { 625 if (!romfs_inode_cachep) {
626 pr_err("Failed to initialise inode cache\n"); 626 pr_err("Failed to initialise inode cache\n");
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index dded920cbc8f..5e79bfa4f260 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -419,7 +419,8 @@ static int __init init_inodecache(void)
419{ 419{
420 squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", 420 squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
421 sizeof(struct squashfs_inode_info), 0, 421 sizeof(struct squashfs_inode_info), 0,
422 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once); 422 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
423 init_once);
423 424
424 return squashfs_inode_cachep ? 0 : -ENOMEM; 425 return squashfs_inode_cachep ? 0 : -ENOMEM;
425} 426}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 07ac18c355e7..d62c423a5a2d 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -346,7 +346,7 @@ int __init sysv_init_icache(void)
346{ 346{
347 sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", 347 sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
348 sizeof(struct sysv_inode_info), 0, 348 sizeof(struct sysv_inode_info), 0,
349 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, 349 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
350 init_once); 350 init_once);
351 if (!sysv_inode_cachep) 351 if (!sysv_inode_cachep)
352 return -ENOMEM; 352 return -ENOMEM;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1fd90c079537..a233ba913be4 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2248,8 +2248,8 @@ static int __init ubifs_init(void)
2248 2248
2249 ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", 2249 ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
2250 sizeof(struct ubifs_inode), 0, 2250 sizeof(struct ubifs_inode), 0,
2251 SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, 2251 SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT |
2252 &inode_slab_ctor); 2252 SLAB_ACCOUNT, &inode_slab_ctor);
2253 if (!ubifs_inode_slab) 2253 if (!ubifs_inode_slab)
2254 return -ENOMEM; 2254 return -ENOMEM;
2255 2255
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 81155b9b445b..9c64a3ca9837 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -179,7 +179,8 @@ static int __init init_inodecache(void)
179 udf_inode_cachep = kmem_cache_create("udf_inode_cache", 179 udf_inode_cachep = kmem_cache_create("udf_inode_cache",
180 sizeof(struct udf_inode_info), 180 sizeof(struct udf_inode_info),
181 0, (SLAB_RECLAIM_ACCOUNT | 181 0, (SLAB_RECLAIM_ACCOUNT |
182 SLAB_MEM_SPREAD), 182 SLAB_MEM_SPREAD |
183 SLAB_ACCOUNT),
183 init_once); 184 init_once);
184 if (!udf_inode_cachep) 185 if (!udf_inode_cachep)
185 return -ENOMEM; 186 return -ENOMEM;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index f6390eec02ca..442fd52ebffe 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1427,7 +1427,7 @@ static int __init init_inodecache(void)
1427 ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", 1427 ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
1428 sizeof(struct ufs_inode_info), 1428 sizeof(struct ufs_inode_info),
1429 0, (SLAB_RECLAIM_ACCOUNT| 1429 0, (SLAB_RECLAIM_ACCOUNT|
1430 SLAB_MEM_SPREAD), 1430 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
1431 init_once); 1431 init_once);
1432 if (ufs_inode_cachep == NULL) 1432 if (ufs_inode_cachep == NULL)
1433 return -ENOMEM; 1433 return -ENOMEM;
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index cc6b768fc068..d1c66e465ca5 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
84#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN 84#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN
85#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT 85#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT
86#define KM_ZONE_SPREAD SLAB_MEM_SPREAD 86#define KM_ZONE_SPREAD SLAB_MEM_SPREAD
87#define KM_ZONE_ACCOUNT SLAB_ACCOUNT
87 88
88#define kmem_zone kmem_cache 89#define kmem_zone kmem_cache
89#define kmem_zone_t struct kmem_cache 90#define kmem_zone_t struct kmem_cache
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b35775752b74..59c9b7bd958d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1714,8 +1714,8 @@ xfs_init_zones(void)
1714 1714
1715 xfs_inode_zone = 1715 xfs_inode_zone =
1716 kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", 1716 kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
1717 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD, 1717 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD |
1718 xfs_fs_inode_init_once); 1718 KM_ZONE_ACCOUNT, xfs_fs_inode_init_once);
1719 if (!xfs_inode_zone) 1719 if (!xfs_inode_zone)
1720 goto out_destroy_efi_zone; 1720 goto out_destroy_efi_zone;
1721 1721
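The filesystem hunks above all apply the same one-line change: the inode cache gains SLAB_ACCOUNT (KM_ZONE_ACCOUNT in XFS terms) so that inode objects are charged to the memory cgroup of the allocating task. A minimal sketch of the pattern, assuming a hypothetical filesystem with foo_inode_cachep, struct foo_inode_info and an init_once constructor:

static struct kmem_cache *foo_inode_cachep;

static int __init foo_init_inodecache(void)
{
	/* SLAB_ACCOUNT: every object from this cache is charged to kmemcg */
	foo_inode_cachep = kmem_cache_create("foo_inode_cache",
				sizeof(struct foo_inode_info), 0,
				(SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
				 SLAB_ACCOUNT),
				init_once);
	if (!foo_inode_cachep)
		return -ENOMEM;
	return 0;
}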
diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index 4b4b056a6eb0..5148150cc80b 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -1,6 +1,8 @@
1#ifndef __ASM_MEMORY_MODEL_H 1#ifndef __ASM_MEMORY_MODEL_H
2#define __ASM_MEMORY_MODEL_H 2#define __ASM_MEMORY_MODEL_H
3 3
4#include <linux/pfn.h>
5
4#ifndef __ASSEMBLY__ 6#ifndef __ASSEMBLY__
5 7
6#if defined(CONFIG_FLATMEM) 8#if defined(CONFIG_FLATMEM)
@@ -72,7 +74,7 @@
72/* 74/*
73 * Convert a physical address to a Page Frame Number and back 75 * Convert a physical address to a Page Frame Number and back
74 */ 76 */
75#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT)) 77#define __phys_to_pfn(paddr) PHYS_PFN(paddr)
76#define __pfn_to_phys(pfn) PFN_PHYS(pfn) 78#define __pfn_to_phys(pfn) PFN_PHYS(pfn)
77 79
78#define page_to_pfn __page_to_pfn 80#define page_to_pfn __page_to_pfn
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index d67ae119cf4e..7781ce110503 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -27,10 +27,10 @@ struct vfsmount;
27 27
28/* The hash is always the low bits of hash_len */ 28/* The hash is always the low bits of hash_len */
29#ifdef __LITTLE_ENDIAN 29#ifdef __LITTLE_ENDIAN
30 #define HASH_LEN_DECLARE u32 hash; u32 len; 30 #define HASH_LEN_DECLARE u32 hash; u32 len
31 #define bytemask_from_count(cnt) (~(~0ul << (cnt)*8)) 31 #define bytemask_from_count(cnt) (~(~0ul << (cnt)*8))
32#else 32#else
33 #define HASH_LEN_DECLARE u32 len; u32 hash; 33 #define HASH_LEN_DECLARE u32 len; u32 hash
34 #define bytemask_from_count(cnt) (~(~0ul >> (cnt)*8)) 34 #define bytemask_from_count(cnt) (~(~0ul >> (cnt)*8))
35#endif 35#endif
36 36
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 533c4408529a..6b7e89f45aa4 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -220,7 +220,10 @@ struct fsnotify_mark {
220 /* List of marks by group->i_fsnotify_marks. Also reused for queueing 220 /* List of marks by group->i_fsnotify_marks. Also reused for queueing
221 * mark into destroy_list when it's waiting for the end of SRCU period 221 * mark into destroy_list when it's waiting for the end of SRCU period
222 * before it can be freed. [group->mark_mutex] */ 222 * before it can be freed. [group->mark_mutex] */
223 struct list_head g_list; 223 union {
224 struct list_head g_list;
225 struct rcu_head g_rcu;
226 };
224 /* Protects inode / mnt pointers, flags, masks */ 227 /* Protects inode / mnt pointers, flags, masks */
225 spinlock_t lock; 228 spinlock_t lock;
226 /* List of marks for inode / vfsmount [obj_lock] */ 229 /* List of marks for inode / vfsmount [obj_lock] */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 8942af0813e3..28ad5f6494b0 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -30,7 +30,7 @@ struct vm_area_struct;
30#define ___GFP_HARDWALL 0x20000u 30#define ___GFP_HARDWALL 0x20000u
31#define ___GFP_THISNODE 0x40000u 31#define ___GFP_THISNODE 0x40000u
32#define ___GFP_ATOMIC 0x80000u 32#define ___GFP_ATOMIC 0x80000u
33#define ___GFP_NOACCOUNT 0x100000u 33#define ___GFP_ACCOUNT 0x100000u
34#define ___GFP_NOTRACK 0x200000u 34#define ___GFP_NOTRACK 0x200000u
35#define ___GFP_DIRECT_RECLAIM 0x400000u 35#define ___GFP_DIRECT_RECLAIM 0x400000u
36#define ___GFP_OTHER_NODE 0x800000u 36#define ___GFP_OTHER_NODE 0x800000u
@@ -73,11 +73,15 @@ struct vm_area_struct;
73 * 73 *
74 * __GFP_THISNODE forces the allocation to be satisified from the requested 74 * __GFP_THISNODE forces the allocation to be satisified from the requested
75 * node with no fallbacks or placement policy enforcements. 75 * node with no fallbacks or placement policy enforcements.
76 *
77 * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg (only relevant
78 * to kmem allocations).
76 */ 79 */
77#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) 80#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
78#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) 81#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE)
79#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) 82#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL)
80#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE) 83#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)
84#define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT)
81 85
82/* 86/*
83 * Watermark modifiers -- controls access to emergency reserves 87 * Watermark modifiers -- controls access to emergency reserves
@@ -104,7 +108,6 @@ struct vm_area_struct;
104#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) 108#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH)
105#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) 109#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)
106#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) 110#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
107#define __GFP_NOACCOUNT ((__force gfp_t)___GFP_NOACCOUNT)
108 111
109/* 112/*
110 * Reclaim modifiers 113 * Reclaim modifiers
@@ -197,6 +200,9 @@ struct vm_area_struct;
197 * GFP_KERNEL is typical for kernel-internal allocations. The caller requires 200 * GFP_KERNEL is typical for kernel-internal allocations. The caller requires
198 * ZONE_NORMAL or a lower zone for direct access but can direct reclaim. 201 * ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
199 * 202 *
203 * GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is
204 * accounted to kmemcg.
205 *
200 * GFP_NOWAIT is for kernel allocations that should not stall for direct 206 * GFP_NOWAIT is for kernel allocations that should not stall for direct
201 * reclaim, start physical IO or use any filesystem callback. 207 * reclaim, start physical IO or use any filesystem callback.
202 * 208 *
@@ -236,6 +242,7 @@ struct vm_area_struct;
236 */ 242 */
237#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) 243#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
238#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) 244#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
245#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
239#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) 246#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)
240#define GFP_NOIO (__GFP_RECLAIM) 247#define GFP_NOIO (__GFP_RECLAIM)
241#define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) 248#define GFP_NOFS (__GFP_RECLAIM | __GFP_IO)
@@ -271,7 +278,7 @@ static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
271 278
272static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) 279static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
273{ 280{
274 return (bool __force)(gfp_flags & __GFP_DIRECT_RECLAIM); 281 return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
275} 282}
276 283
277#ifdef CONFIG_HIGHMEM 284#ifdef CONFIG_HIGHMEM
@@ -377,10 +384,11 @@ static inline enum zone_type gfp_zone(gfp_t flags)
377 384
378static inline int gfp_zonelist(gfp_t flags) 385static inline int gfp_zonelist(gfp_t flags)
379{ 386{
380 if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE)) 387#ifdef CONFIG_NUMA
381 return 1; 388 if (unlikely(flags & __GFP_THISNODE))
382 389 return ZONELIST_NOFALLBACK;
383 return 0; 390#endif
391 return ZONELIST_FALLBACK;
384} 392}
385 393
386/* 394/*
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b0eb06423d5e..e76574d8f9b5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -263,20 +263,18 @@ struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
263 struct user_struct **user, int creat_flags, 263 struct user_struct **user, int creat_flags,
264 int page_size_log); 264 int page_size_log);
265 265
266static inline int is_file_hugepages(struct file *file) 266static inline bool is_file_hugepages(struct file *file)
267{ 267{
268 if (file->f_op == &hugetlbfs_file_operations) 268 if (file->f_op == &hugetlbfs_file_operations)
269 return 1; 269 return true;
270 if (is_file_shm_hugepages(file))
271 return 1;
272 270
273 return 0; 271 return is_file_shm_hugepages(file);
274} 272}
275 273
276 274
277#else /* !CONFIG_HUGETLBFS */ 275#else /* !CONFIG_HUGETLBFS */
278 276
279#define is_file_hugepages(file) 0 277#define is_file_hugepages(file) false
280static inline struct file * 278static inline struct file *
281hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, 279hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
282 struct user_struct **user, int creat_flags, 280 struct user_struct **user, int creat_flags,
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index fec66f86eeff..173fb44e22f1 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -216,10 +216,10 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
216 * for_each_free_mem_range - iterate through free memblock areas 216 * for_each_free_mem_range - iterate through free memblock areas
217 * @i: u64 used as loop variable 217 * @i: u64 used as loop variable
218 * @nid: node selector, %NUMA_NO_NODE for all nodes 218 * @nid: node selector, %NUMA_NO_NODE for all nodes
219 * @flags: pick from blocks based on memory attributes
219 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 220 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
220 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 221 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
221 * @p_nid: ptr to int for nid of the range, can be %NULL 222 * @p_nid: ptr to int for nid of the range, can be %NULL
222 * @flags: pick from blocks based on memory attributes
223 * 223 *
224 * Walks over free (memory && !reserved) areas of memblock. Available as 224 * Walks over free (memory && !reserved) areas of memblock. Available as
225 * soon as memblock is initialized. 225 * soon as memblock is initialized.
@@ -232,10 +232,10 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
232 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas 232 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas
233 * @i: u64 used as loop variable 233 * @i: u64 used as loop variable
234 * @nid: node selector, %NUMA_NO_NODE for all nodes 234 * @nid: node selector, %NUMA_NO_NODE for all nodes
235 * @flags: pick from blocks based on memory attributes
235 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 236 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
236 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 237 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
237 * @p_nid: ptr to int for nid of the range, can be %NULL 238 * @p_nid: ptr to int for nid of the range, can be %NULL
238 * @flags: pick from blocks based on memory attributes
239 * 239 *
240 * Walks over free (memory && !reserved) areas of memblock in reverse 240 * Walks over free (memory && !reserved) areas of memblock in reverse
241 * order. Available as soon as memblock is initialized. 241 * order. Available as soon as memblock is initialized.
@@ -325,10 +325,10 @@ phys_addr_t memblock_mem_size(unsigned long limit_pfn);
325phys_addr_t memblock_start_of_DRAM(void); 325phys_addr_t memblock_start_of_DRAM(void);
326phys_addr_t memblock_end_of_DRAM(void); 326phys_addr_t memblock_end_of_DRAM(void);
327void memblock_enforce_memory_limit(phys_addr_t memory_limit); 327void memblock_enforce_memory_limit(phys_addr_t memory_limit);
328int memblock_is_memory(phys_addr_t addr); 328bool memblock_is_memory(phys_addr_t addr);
329int memblock_is_map_memory(phys_addr_t addr); 329int memblock_is_map_memory(phys_addr_t addr);
330int memblock_is_region_memory(phys_addr_t base, phys_addr_t size); 330int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
331int memblock_is_reserved(phys_addr_t addr); 331bool memblock_is_reserved(phys_addr_t addr);
332bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size); 332bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
333 333
334extern void __memblock_dump_all(void); 334extern void __memblock_dump_all(void);
@@ -399,6 +399,11 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
399 region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ 399 region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \
400 region++) 400 region++)
401 401
402#define for_each_memblock_type(memblock_type, rgn) \
403 idx = 0; \
404 rgn = &memblock_type->regions[idx]; \
405 for (idx = 0; idx < memblock_type->cnt; \
406 idx++,rgn = &memblock_type->regions[idx])
402 407
403#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK 408#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
404#define __init_memblock __meminit 409#define __init_memblock __meminit
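The memblock.h hunks reorder the kernel-doc so that @flags is documented in its actual position (third parameter), tighten a few return types to bool, and add for_each_memblock_type() as a helper for memblock-internal loops. For reference, a sketch of how the public iterator is typically used during early boot, assuming a caller that only wants to log the free ranges:

	phys_addr_t start, end;
	u64 i;

	/* walk free (memory && !reserved) ranges on any node */
	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
				&start, &end, NULL)
		pr_info("free range: %pa - %pa\n", &start, &end);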
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index cd0e2413c358..2292468f2a30 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -85,32 +85,10 @@ enum mem_cgroup_events_target {
85 MEM_CGROUP_NTARGETS, 85 MEM_CGROUP_NTARGETS,
86}; 86};
87 87
88/*
89 * Bits in struct cg_proto.flags
90 */
91enum cg_proto_flags {
92 /* Currently active and new sockets should be assigned to cgroups */
93 MEMCG_SOCK_ACTIVE,
94 /* It was ever activated; we must disarm static keys on destruction */
95 MEMCG_SOCK_ACTIVATED,
96};
97
98struct cg_proto { 88struct cg_proto {
99 struct page_counter memory_allocated; /* Current allocated memory. */ 89 struct page_counter memory_allocated; /* Current allocated memory. */
100 struct percpu_counter sockets_allocated; /* Current number of sockets. */
101 int memory_pressure; 90 int memory_pressure;
102 long sysctl_mem[3]; 91 bool active;
103 unsigned long flags;
104 /*
105 * memcg field is used to find which memcg we belong directly
106 * Each memcg struct can hold more than one cg_proto, so container_of
107 * won't really cut.
108 *
109 * The elegant solution would be having an inverse function to
110 * proto_cgroup in struct proto, but that means polluting the structure
111 * for everybody, instead of just for memcg users.
112 */
113 struct mem_cgroup *memcg;
114}; 92};
115 93
116#ifdef CONFIG_MEMCG 94#ifdef CONFIG_MEMCG
@@ -192,6 +170,9 @@ struct mem_cgroup {
192 unsigned long low; 170 unsigned long low;
193 unsigned long high; 171 unsigned long high;
194 172
173 /* Range enforcement for interrupt charges */
174 struct work_struct high_work;
175
195 unsigned long soft_limit; 176 unsigned long soft_limit;
196 177
197 /* vmpressure notifications */ 178 /* vmpressure notifications */
@@ -268,6 +249,10 @@ struct mem_cgroup {
268 struct wb_domain cgwb_domain; 249 struct wb_domain cgwb_domain;
269#endif 250#endif
270 251
252#ifdef CONFIG_INET
253 unsigned long socket_pressure;
254#endif
255
271 /* List of events which userspace want to receive */ 256 /* List of events which userspace want to receive */
272 struct list_head event_list; 257 struct list_head event_list;
273 spinlock_t event_list_lock; 258 spinlock_t event_list_lock;
@@ -275,7 +260,8 @@ struct mem_cgroup {
275 struct mem_cgroup_per_node *nodeinfo[0]; 260 struct mem_cgroup_per_node *nodeinfo[0];
276 /* WARNING: nodeinfo must be the last member here */ 261 /* WARNING: nodeinfo must be the last member here */
277}; 262};
278extern struct cgroup_subsys_state *mem_cgroup_root_css; 263
264extern struct mem_cgroup *root_mem_cgroup;
279 265
280/** 266/**
281 * mem_cgroup_events - count memory events against a cgroup 267 * mem_cgroup_events - count memory events against a cgroup
@@ -308,18 +294,34 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
308 294
309bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); 295bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
310struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); 296struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
311struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
312 297
313static inline 298static inline
314struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ 299struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
315 return css ? container_of(css, struct mem_cgroup, css) : NULL; 300 return css ? container_of(css, struct mem_cgroup, css) : NULL;
316} 301}
317 302
303#define mem_cgroup_from_counter(counter, member) \
304 container_of(counter, struct mem_cgroup, member)
305
318struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, 306struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
319 struct mem_cgroup *, 307 struct mem_cgroup *,
320 struct mem_cgroup_reclaim_cookie *); 308 struct mem_cgroup_reclaim_cookie *);
321void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); 309void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
322 310
311/**
312 * parent_mem_cgroup - find the accounting parent of a memcg
313 * @memcg: memcg whose parent to find
314 *
315 * Returns the parent memcg, or NULL if this is the root or the memory
316 * controller is in legacy no-hierarchy mode.
317 */
318static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
319{
320 if (!memcg->memory.parent)
321 return NULL;
322 return mem_cgroup_from_counter(memcg->memory.parent, memory);
323}
324
323static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, 325static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
324 struct mem_cgroup *root) 326 struct mem_cgroup *root)
325{ 327{
@@ -671,12 +673,6 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
671} 673}
672#endif /* CONFIG_MEMCG */ 674#endif /* CONFIG_MEMCG */
673 675
674enum {
675 UNDER_LIMIT,
676 SOFT_LIMIT,
677 OVER_LIMIT,
678};
679
680#ifdef CONFIG_CGROUP_WRITEBACK 676#ifdef CONFIG_CGROUP_WRITEBACK
681 677
682struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); 678struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
@@ -703,20 +699,35 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
703#endif /* CONFIG_CGROUP_WRITEBACK */ 699#endif /* CONFIG_CGROUP_WRITEBACK */
704 700
705struct sock; 701struct sock;
706#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
707void sock_update_memcg(struct sock *sk); 702void sock_update_memcg(struct sock *sk);
708void sock_release_memcg(struct sock *sk); 703void sock_release_memcg(struct sock *sk);
709#else 704bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
710static inline void sock_update_memcg(struct sock *sk) 705void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
706#if defined(CONFIG_MEMCG) && defined(CONFIG_INET)
707extern struct static_key_false memcg_sockets_enabled_key;
708#define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
709static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
711{ 710{
711#ifdef CONFIG_MEMCG_KMEM
712 if (memcg->tcp_mem.memory_pressure)
713 return true;
714#endif
715 do {
716 if (time_before(jiffies, memcg->socket_pressure))
717 return true;
718 } while ((memcg = parent_mem_cgroup(memcg)));
719 return false;
712} 720}
713static inline void sock_release_memcg(struct sock *sk) 721#else
722#define mem_cgroup_sockets_enabled 0
723static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
714{ 724{
725 return false;
715} 726}
716#endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */ 727#endif
717 728
718#ifdef CONFIG_MEMCG_KMEM 729#ifdef CONFIG_MEMCG_KMEM
719extern struct static_key memcg_kmem_enabled_key; 730extern struct static_key_false memcg_kmem_enabled_key;
720 731
721extern int memcg_nr_cache_ids; 732extern int memcg_nr_cache_ids;
722void memcg_get_cache_ids(void); 733void memcg_get_cache_ids(void);
@@ -732,7 +743,7 @@ void memcg_put_cache_ids(void);
732 743
733static inline bool memcg_kmem_enabled(void) 744static inline bool memcg_kmem_enabled(void)
734{ 745{
735 return static_key_false(&memcg_kmem_enabled_key); 746 return static_branch_unlikely(&memcg_kmem_enabled_key);
736} 747}
737 748
738static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) 749static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
@@ -766,15 +777,13 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
766 return memcg ? memcg->kmemcg_id : -1; 777 return memcg ? memcg->kmemcg_id : -1;
767} 778}
768 779
769struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); 780struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
770void __memcg_kmem_put_cache(struct kmem_cache *cachep); 781void __memcg_kmem_put_cache(struct kmem_cache *cachep);
771 782
772static inline bool __memcg_kmem_bypass(gfp_t gfp) 783static inline bool __memcg_kmem_bypass(void)
773{ 784{
774 if (!memcg_kmem_enabled()) 785 if (!memcg_kmem_enabled())
775 return true; 786 return true;
776 if (gfp & __GFP_NOACCOUNT)
777 return true;
778 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) 787 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
779 return true; 788 return true;
780 return false; 789 return false;
@@ -791,7 +800,9 @@ static inline bool __memcg_kmem_bypass(gfp_t gfp)
791static __always_inline int memcg_kmem_charge(struct page *page, 800static __always_inline int memcg_kmem_charge(struct page *page,
792 gfp_t gfp, int order) 801 gfp_t gfp, int order)
793{ 802{
794 if (__memcg_kmem_bypass(gfp)) 803 if (__memcg_kmem_bypass())
804 return 0;
805 if (!(gfp & __GFP_ACCOUNT))
795 return 0; 806 return 0;
796 return __memcg_kmem_charge(page, gfp, order); 807 return __memcg_kmem_charge(page, gfp, order);
797} 808}
@@ -810,16 +821,15 @@ static __always_inline void memcg_kmem_uncharge(struct page *page, int order)
810/** 821/**
811 * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation 822 * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
812 * @cachep: the original global kmem cache 823 * @cachep: the original global kmem cache
813 * @gfp: allocation flags.
814 * 824 *
815 * All memory allocated from a per-memcg cache is charged to the owner memcg. 825 * All memory allocated from a per-memcg cache is charged to the owner memcg.
816 */ 826 */
817static __always_inline struct kmem_cache * 827static __always_inline struct kmem_cache *
818memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) 828memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
819{ 829{
820 if (__memcg_kmem_bypass(gfp)) 830 if (__memcg_kmem_bypass())
821 return cachep; 831 return cachep;
822 return __memcg_kmem_get_cache(cachep); 832 return __memcg_kmem_get_cache(cachep, gfp);
823} 833}
824 834
825static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) 835static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
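__memcg_kmem_bypass() no longer inspects the gfp mask; instead memcg_kmem_charge() returns early unless __GFP_ACCOUNT is set, and memcg_kmem_get_cache() now receives the gfp flags so the per-memcg cache can be picked with the right allocation context. Roughly, the slab allocation hook this serves looks like the following sketch (simplified; the real hook lives in mm/slab.h and foo_pre_alloc_hook is a placeholder name):

static inline struct kmem_cache *
foo_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
{
	/* redirect to the per-memcg clone only for accounted allocations */
	if (memcg_kmem_enabled() &&
	    ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
		return memcg_kmem_get_cache(s, flags);

	return s;
}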
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 3d385c81c153..2696c1f05ed1 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -122,7 +122,7 @@ struct sp_node {
122 122
123struct shared_policy { 123struct shared_policy {
124 struct rb_root root; 124 struct rb_root root;
125 spinlock_t lock; 125 rwlock_t lock;
126}; 126};
127 127
128int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); 128int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00bad7793788..839d9e9a1c38 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -51,6 +51,17 @@ extern int sysctl_legacy_va_layout;
51#define sysctl_legacy_va_layout 0 51#define sysctl_legacy_va_layout 0
52#endif 52#endif
53 53
54#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
55extern const int mmap_rnd_bits_min;
56extern const int mmap_rnd_bits_max;
57extern int mmap_rnd_bits __read_mostly;
58#endif
59#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
60extern const int mmap_rnd_compat_bits_min;
61extern const int mmap_rnd_compat_bits_max;
62extern int mmap_rnd_compat_bits __read_mostly;
63#endif
64
54#include <asm/page.h> 65#include <asm/page.h>
55#include <asm/pgtable.h> 66#include <asm/pgtable.h>
56#include <asm/processor.h> 67#include <asm/processor.h>
@@ -225,10 +236,14 @@ extern pgprot_t protection_map[16];
225 * ->fault function. The vma's ->fault is responsible for returning a bitmask 236 * ->fault function. The vma's ->fault is responsible for returning a bitmask
226 * of VM_FAULT_xxx flags that give details about how the fault was handled. 237 * of VM_FAULT_xxx flags that give details about how the fault was handled.
227 * 238 *
239 * MM layer fills up gfp_mask for page allocations but fault handler might
240 * alter it if its implementation requires a different allocation context.
241 *
228 * pgoff should be used in favour of virtual_address, if possible. 242 * pgoff should be used in favour of virtual_address, if possible.
229 */ 243 */
230struct vm_fault { 244struct vm_fault {
231 unsigned int flags; /* FAULT_FLAG_xxx flags */ 245 unsigned int flags; /* FAULT_FLAG_xxx flags */
246 gfp_t gfp_mask; /* gfp mask to be used for allocations */
232 pgoff_t pgoff; /* Logical page offset based on vma */ 247 pgoff_t pgoff; /* Logical page offset based on vma */
233 void __user *virtual_address; /* Faulting virtual address */ 248 void __user *virtual_address; /* Faulting virtual address */
234 249
@@ -1361,10 +1376,26 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member)
1361 atomic_long_dec(&mm->rss_stat.count[member]); 1376 atomic_long_dec(&mm->rss_stat.count[member]);
1362} 1377}
1363 1378
1379/* Optimized variant when page is already known not to be PageAnon */
1380static inline int mm_counter_file(struct page *page)
1381{
1382 if (PageSwapBacked(page))
1383 return MM_SHMEMPAGES;
1384 return MM_FILEPAGES;
1385}
1386
1387static inline int mm_counter(struct page *page)
1388{
1389 if (PageAnon(page))
1390 return MM_ANONPAGES;
1391 return mm_counter_file(page);
1392}
1393
1364static inline unsigned long get_mm_rss(struct mm_struct *mm) 1394static inline unsigned long get_mm_rss(struct mm_struct *mm)
1365{ 1395{
1366 return get_mm_counter(mm, MM_FILEPAGES) + 1396 return get_mm_counter(mm, MM_FILEPAGES) +
1367 get_mm_counter(mm, MM_ANONPAGES); 1397 get_mm_counter(mm, MM_ANONPAGES) +
1398 get_mm_counter(mm, MM_SHMEMPAGES);
1368} 1399}
1369 1400
1370static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) 1401static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
@@ -1898,7 +1929,9 @@ extern void mm_drop_all_locks(struct mm_struct *mm);
1898extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); 1929extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
1899extern struct file *get_mm_exe_file(struct mm_struct *mm); 1930extern struct file *get_mm_exe_file(struct mm_struct *mm);
1900 1931
1901extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); 1932extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
1933extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);
1934
1902extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, 1935extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
1903 unsigned long addr, unsigned long len, 1936 unsigned long addr, unsigned long len,
1904 unsigned long flags, 1937 unsigned long flags,
@@ -2116,15 +2149,6 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
2116extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, 2149extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
2117 unsigned long size, pte_fn_t fn, void *data); 2150 unsigned long size, pte_fn_t fn, void *data);
2118 2151
2119#ifdef CONFIG_PROC_FS
2120void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
2121#else
2122static inline void vm_stat_account(struct mm_struct *mm,
2123 unsigned long flags, struct file *file, long pages)
2124{
2125 mm->total_vm += pages;
2126}
2127#endif /* CONFIG_PROC_FS */
2128 2152
2129#ifdef CONFIG_DEBUG_PAGEALLOC 2153#ifdef CONFIG_DEBUG_PAGEALLOC
2130extern bool _debug_pagealloc_enabled; 2154extern bool _debug_pagealloc_enabled;
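mm_counter() and mm_counter_file() centralise the choice of RSS bucket -- anonymous, file, or the new MM_SHMEMPAGES -- based on the page itself, and get_mm_rss() now sums all three. A hedged sketch of the map/unmap accounting pattern the helpers are meant for (foo_account_* are placeholders):

static void foo_account_map(struct mm_struct *mm, struct page *page)
{
	/* PageAnon -> MM_ANONPAGES, shmem -> MM_SHMEMPAGES, else MM_FILEPAGES */
	inc_mm_counter(mm, mm_counter(page));
}

static void foo_account_unmap(struct mm_struct *mm, struct page *page)
{
	dec_mm_counter(mm, mm_counter(page));
}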
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index cf55945c83fb..712e8c37a200 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -100,4 +100,6 @@ static __always_inline enum lru_list page_lru(struct page *page)
100 return lru; 100 return lru;
101} 101}
102 102
103#define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
104
103#endif 105#endif
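lru_to_page() now lives in mm_inline.h; reclaim-style code uses it to pull pages off the tail of an LRU list, oldest first. A tiny sketch, with foo_process_page() standing in for the per-page work:

	while (!list_empty(page_list)) {
		struct page *page = lru_to_page(page_list);

		list_del(&page->lru);
		foo_process_page(page);
	}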
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f8d1492a114f..6bc9a0ce2253 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -369,9 +369,10 @@ struct core_state {
369}; 369};
370 370
371enum { 371enum {
372 MM_FILEPAGES, 372 MM_FILEPAGES, /* Resident file mapping pages */
373 MM_ANONPAGES, 373 MM_ANONPAGES, /* Resident anonymous pages */
374 MM_SWAPENTS, 374 MM_SWAPENTS, /* Anonymous swap entries */
375 MM_SHMEMPAGES, /* Resident shared memory pages */
375 NR_MM_COUNTERS 376 NR_MM_COUNTERS
376}; 377};
377 378
@@ -426,7 +427,7 @@ struct mm_struct {
426 unsigned long total_vm; /* Total pages mapped */ 427 unsigned long total_vm; /* Total pages mapped */
427 unsigned long locked_vm; /* Pages that have PG_mlocked set */ 428 unsigned long locked_vm; /* Pages that have PG_mlocked set */
428 unsigned long pinned_vm; /* Refcount permanently increased */ 429 unsigned long pinned_vm; /* Refcount permanently increased */
429 unsigned long shared_vm; /* Shared pages (files) */ 430 unsigned long data_vm; /* VM_WRITE & ~VM_SHARED/GROWSDOWN */
430 unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ 431 unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */
431 unsigned long stack_vm; /* VM_GROWSUP/DOWN */ 432 unsigned long stack_vm; /* VM_GROWSUP/DOWN */
432 unsigned long def_flags; 433 unsigned long def_flags;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e23a9e704536..33bb1b19273e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -195,11 +195,6 @@ static inline int is_active_lru(enum lru_list lru)
195 return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); 195 return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
196} 196}
197 197
198static inline int is_unevictable_lru(enum lru_list lru)
199{
200 return (lru == LRU_UNEVICTABLE);
201}
202
203struct zone_reclaim_stat { 198struct zone_reclaim_stat {
204 /* 199 /*
205 * The pageout code in vmscan.c keeps track of how many of the 200 * The pageout code in vmscan.c keeps track of how many of the
@@ -361,10 +356,10 @@ struct zone {
361 struct per_cpu_pageset __percpu *pageset; 356 struct per_cpu_pageset __percpu *pageset;
362 357
363 /* 358 /*
364 * This is a per-zone reserve of pages that should not be 359 * This is a per-zone reserve of pages that are not available
365 * considered dirtyable memory. 360 * to userspace allocations.
366 */ 361 */
367 unsigned long dirty_balance_reserve; 362 unsigned long totalreserve_pages;
368 363
369#ifndef CONFIG_SPARSEMEM 364#ifndef CONFIG_SPARSEMEM
370 /* 365 /*
@@ -576,19 +571,17 @@ static inline bool zone_is_empty(struct zone *zone)
576/* Maximum number of zones on a zonelist */ 571/* Maximum number of zones on a zonelist */
577#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) 572#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
578 573
574enum {
575 ZONELIST_FALLBACK, /* zonelist with fallback */
579#ifdef CONFIG_NUMA 576#ifdef CONFIG_NUMA
580 577 /*
581/* 578 * The NUMA zonelists are doubled because we need zonelists that
582 * The NUMA zonelists are doubled because we need zonelists that restrict the 579 * restrict the allocations to a single node for __GFP_THISNODE.
583 * allocations to a single node for __GFP_THISNODE. 580 */
584 * 581 ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */
585 * [0] : Zonelist with fallback
586 * [1] : No fallback (__GFP_THISNODE)
587 */
588#define MAX_ZONELISTS 2
589#else
590#define MAX_ZONELISTS 1
591#endif 582#endif
583 MAX_ZONELISTS
584};
592 585
593/* 586/*
594 * This struct contains information about a zone in a zonelist. It is stored 587 * This struct contains information about a zone in a zonelist. It is stored
@@ -1207,13 +1200,13 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
1207 * the zone and PFN linkages are still valid. This is expensive, but walkers 1200 * the zone and PFN linkages are still valid. This is expensive, but walkers
1208 * of the full memmap are extremely rare. 1201 * of the full memmap are extremely rare.
1209 */ 1202 */
1210int memmap_valid_within(unsigned long pfn, 1203bool memmap_valid_within(unsigned long pfn,
1211 struct page *page, struct zone *zone); 1204 struct page *page, struct zone *zone);
1212#else 1205#else
1213static inline int memmap_valid_within(unsigned long pfn, 1206static inline bool memmap_valid_within(unsigned long pfn,
1214 struct page *page, struct zone *zone) 1207 struct page *page, struct zone *zone)
1215{ 1208{
1216 return 1; 1209 return true;
1217} 1210}
1218#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 1211#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
1219 1212
diff --git a/include/linux/pfn.h b/include/linux/pfn.h
index 7646637221f3..97f3e88aead4 100644
--- a/include/linux/pfn.h
+++ b/include/linux/pfn.h
@@ -9,5 +9,6 @@
9#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) 9#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
10#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) 10#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
11#define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT) 11#define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT)
12#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT))
12 13
13#endif 14#endif
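PHYS_PFN() is the counterpart of PFN_PHYS() and now backs __phys_to_pfn() in the generic memory model header. A quick illustrative helper (hypothetical):

static inline bool foo_pfn_roundtrip(phys_addr_t phys)
{
	unsigned long pfn = PHYS_PFN(phys);	/* phys >> PAGE_SHIFT */

	/* holds whenever phys is page aligned */
	return PFN_PHYS(pfn) == phys;
}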
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 50777b5b1e4c..a43f41cb3c43 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -60,6 +60,10 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
60extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); 60extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
61extern int shmem_unuse(swp_entry_t entry, struct page *page); 61extern int shmem_unuse(swp_entry_t entry, struct page *page);
62 62
63extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
64extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
65 pgoff_t start, pgoff_t end);
66
63static inline struct page *shmem_read_mapping_page( 67static inline struct page *shmem_read_mapping_page(
64 struct address_space *mapping, pgoff_t index) 68 struct address_space *mapping, pgoff_t index)
65{ 69{
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 2037a861e367..3ffee7422012 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -86,6 +86,11 @@
86#else 86#else
87# define SLAB_FAILSLAB 0x00000000UL 87# define SLAB_FAILSLAB 0x00000000UL
88#endif 88#endif
89#ifdef CONFIG_MEMCG_KMEM
90# define SLAB_ACCOUNT 0x04000000UL /* Account to memcg */
91#else
92# define SLAB_ACCOUNT 0x00000000UL
93#endif
89 94
90/* The following flags affect the page allocator grouping pages by mobility */ 95/* The following flags affect the page allocator grouping pages by mobility */
91#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ 96#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7ba7dccaf0e7..066bd21765ad 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -287,7 +287,6 @@ static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
287/* linux/mm/page_alloc.c */ 287/* linux/mm/page_alloc.c */
288extern unsigned long totalram_pages; 288extern unsigned long totalram_pages;
289extern unsigned long totalreserve_pages; 289extern unsigned long totalreserve_pages;
290extern unsigned long dirty_balance_reserve;
291extern unsigned long nr_free_buffer_pages(void); 290extern unsigned long nr_free_buffer_pages(void);
292extern unsigned long nr_free_pagecache_pages(void); 291extern unsigned long nr_free_pagecache_pages(void);
293 292
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index ff307b548ed3..b4c2a485b28a 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -56,9 +56,10 @@ extern long do_no_restart_syscall(struct restart_block *parm);
56#ifdef __KERNEL__ 56#ifdef __KERNEL__
57 57
58#ifdef CONFIG_DEBUG_STACK_USAGE 58#ifdef CONFIG_DEBUG_STACK_USAGE
59# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) 59# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \
60 __GFP_ZERO)
60#else 61#else
61# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) 62# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK)
62#endif 63#endif
63 64
64/* 65/*
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3bff87a25a42..d1f1d338af20 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -14,7 +14,6 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */
14#define VM_ALLOC 0x00000002 /* vmalloc() */ 14#define VM_ALLOC 0x00000002 /* vmalloc() */
15#define VM_MAP 0x00000004 /* vmap()ed pages */ 15#define VM_MAP 0x00000004 /* vmap()ed pages */
16#define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ 16#define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */
17#define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */
18#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ 17#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
19#define VM_NO_GUARD 0x00000040 /* don't add guard page */ 18#define VM_NO_GUARD 0x00000040 /* don't add guard page */
20#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ 19#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 3e4535876d37..3347cc3ec0ab 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -12,6 +12,9 @@
12struct vmpressure { 12struct vmpressure {
13 unsigned long scanned; 13 unsigned long scanned;
14 unsigned long reclaimed; 14 unsigned long reclaimed;
15
16 unsigned long tree_scanned;
17 unsigned long tree_reclaimed;
15 /* The lock is used to keep the scanned/reclaimed above in sync. */ 18 /* The lock is used to keep the scanned/reclaimed above in sync. */
16 struct spinlock sr_lock; 19 struct spinlock sr_lock;
17 20
@@ -26,7 +29,7 @@ struct vmpressure {
26struct mem_cgroup; 29struct mem_cgroup;
27 30
28#ifdef CONFIG_MEMCG 31#ifdef CONFIG_MEMCG
29extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, 32extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
30 unsigned long scanned, unsigned long reclaimed); 33 unsigned long scanned, unsigned long reclaimed);
31extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); 34extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
32 35
@@ -40,7 +43,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg,
40extern void vmpressure_unregister_event(struct mem_cgroup *memcg, 43extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
41 struct eventfd_ctx *eventfd); 44 struct eventfd_ctx *eventfd);
42#else 45#else
43static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, 46static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
44 unsigned long scanned, unsigned long reclaimed) {} 47 unsigned long scanned, unsigned long reclaimed) {}
45static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, 48static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
46 int prio) {} 49 int prio) {}
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 3e5d9075960f..73fae8c4a5fb 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -189,6 +189,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item);
189extern void dec_zone_state(struct zone *, enum zone_stat_item); 189extern void dec_zone_state(struct zone *, enum zone_stat_item);
190extern void __dec_zone_state(struct zone *, enum zone_stat_item); 190extern void __dec_zone_state(struct zone *, enum zone_stat_item);
191 191
192void quiet_vmstat(void);
192void cpu_vm_stats_fold(int cpu); 193void cpu_vm_stats_fold(int cpu);
193void refresh_zone_stat_thresholds(void); 194void refresh_zone_stat_thresholds(void);
194 195
@@ -249,6 +250,7 @@ static inline void __dec_zone_page_state(struct page *page,
249 250
250static inline void refresh_zone_stat_thresholds(void) { } 251static inline void refresh_zone_stat_thresholds(void) { }
251static inline void cpu_vm_stats_fold(int cpu) { } 252static inline void cpu_vm_stats_fold(int cpu) { }
253static inline void quiet_vmstat(void) { }
252 254
253static inline void drain_zonestat(struct zone *zone, 255static inline void drain_zonestat(struct zone *zone,
254 struct per_cpu_pageset *pset) { } 256 struct per_cpu_pageset *pset) { }
diff --git a/include/net/sock.h b/include/net/sock.h
index e830c1006935..b9e7b3d863a0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -71,22 +71,6 @@
71#include <net/tcp_states.h> 71#include <net/tcp_states.h>
72#include <linux/net_tstamp.h> 72#include <linux/net_tstamp.h>
73 73
74struct cgroup;
75struct cgroup_subsys;
76#ifdef CONFIG_NET
77int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
78void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg);
79#else
80static inline
81int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
82{
83 return 0;
84}
85static inline
86void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
87{
88}
89#endif
90/* 74/*
91 * This structure really needs to be cleaned up. 75 * This structure really needs to be cleaned up.
92 * Most of it is for TCP, and not used by any of 76 * Most of it is for TCP, and not used by any of
@@ -245,7 +229,6 @@ struct sock_common {
245 /* public: */ 229 /* public: */
246}; 230};
247 231
248struct cg_proto;
249/** 232/**
250 * struct sock - network layer representation of sockets 233 * struct sock - network layer representation of sockets
251 * @__sk_common: shared layout with inet_timewait_sock 234 * @__sk_common: shared layout with inet_timewait_sock
@@ -310,7 +293,7 @@ struct cg_proto;
310 * @sk_security: used by security modules 293 * @sk_security: used by security modules
311 * @sk_mark: generic packet mark 294 * @sk_mark: generic packet mark
312 * @sk_cgrp_data: cgroup data for this cgroup 295 * @sk_cgrp_data: cgroup data for this cgroup
313 * @sk_cgrp: this socket's cgroup-specific proto data 296 * @sk_memcg: this socket's memory cgroup association
314 * @sk_write_pending: a write to stream socket waits to start 297 * @sk_write_pending: a write to stream socket waits to start
315 * @sk_state_change: callback to indicate change in the state of the sock 298 * @sk_state_change: callback to indicate change in the state of the sock
316 * @sk_data_ready: callback to indicate there is data to be processed 299 * @sk_data_ready: callback to indicate there is data to be processed
@@ -446,7 +429,7 @@ struct sock {
446 void *sk_security; 429 void *sk_security;
447#endif 430#endif
448 struct sock_cgroup_data sk_cgrp_data; 431 struct sock_cgroup_data sk_cgrp_data;
449 struct cg_proto *sk_cgrp; 432 struct mem_cgroup *sk_memcg;
450 void (*sk_state_change)(struct sock *sk); 433 void (*sk_state_change)(struct sock *sk);
451 void (*sk_data_ready)(struct sock *sk); 434 void (*sk_data_ready)(struct sock *sk);
452 void (*sk_write_space)(struct sock *sk); 435 void (*sk_write_space)(struct sock *sk);
@@ -1096,23 +1079,6 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
1096#define sk_refcnt_debug_release(sk) do { } while (0) 1079#define sk_refcnt_debug_release(sk) do { } while (0)
1097#endif /* SOCK_REFCNT_DEBUG */ 1080#endif /* SOCK_REFCNT_DEBUG */
1098 1081
1099#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_NET)
1100extern struct static_key memcg_socket_limit_enabled;
1101static inline struct cg_proto *parent_cg_proto(struct proto *proto,
1102 struct cg_proto *cg_proto)
1103{
1104 return proto->proto_cgroup(parent_mem_cgroup(cg_proto->memcg));
1105}
1106#define mem_cgroup_sockets_enabled static_key_false(&memcg_socket_limit_enabled)
1107#else
1108#define mem_cgroup_sockets_enabled 0
1109static inline struct cg_proto *parent_cg_proto(struct proto *proto,
1110 struct cg_proto *cg_proto)
1111{
1112 return NULL;
1113}
1114#endif
1115
1116static inline bool sk_stream_memory_free(const struct sock *sk) 1082static inline bool sk_stream_memory_free(const struct sock *sk)
1117{ 1083{
1118 if (sk->sk_wmem_queued >= sk->sk_sndbuf) 1084 if (sk->sk_wmem_queued >= sk->sk_sndbuf)
@@ -1139,8 +1105,9 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
1139 if (!sk->sk_prot->memory_pressure) 1105 if (!sk->sk_prot->memory_pressure)
1140 return false; 1106 return false;
1141 1107
1142 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) 1108 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
1143 return !!sk->sk_cgrp->memory_pressure; 1109 mem_cgroup_under_socket_pressure(sk->sk_memcg))
1110 return true;
1144 1111
1145 return !!*sk->sk_prot->memory_pressure; 1112 return !!*sk->sk_prot->memory_pressure;
1146} 1113}
@@ -1154,15 +1121,6 @@ static inline void sk_leave_memory_pressure(struct sock *sk)
1154 1121
1155 if (*memory_pressure) 1122 if (*memory_pressure)
1156 *memory_pressure = 0; 1123 *memory_pressure = 0;
1157
1158 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
1159 struct cg_proto *cg_proto = sk->sk_cgrp;
1160 struct proto *prot = sk->sk_prot;
1161
1162 for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
1163 cg_proto->memory_pressure = 0;
1164 }
1165
1166} 1124}
1167 1125
1168static inline void sk_enter_memory_pressure(struct sock *sk) 1126static inline void sk_enter_memory_pressure(struct sock *sk)
@@ -1170,116 +1128,46 @@ static inline void sk_enter_memory_pressure(struct sock *sk)
1170 if (!sk->sk_prot->enter_memory_pressure) 1128 if (!sk->sk_prot->enter_memory_pressure)
1171 return; 1129 return;
1172 1130
1173 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
1174 struct cg_proto *cg_proto = sk->sk_cgrp;
1175 struct proto *prot = sk->sk_prot;
1176
1177 for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
1178 cg_proto->memory_pressure = 1;
1179 }
1180
1181 sk->sk_prot->enter_memory_pressure(sk); 1131 sk->sk_prot->enter_memory_pressure(sk);
1182} 1132}
1183 1133
1184static inline long sk_prot_mem_limits(const struct sock *sk, int index) 1134static inline long sk_prot_mem_limits(const struct sock *sk, int index)
1185{ 1135{
1186 long *prot = sk->sk_prot->sysctl_mem; 1136 return sk->sk_prot->sysctl_mem[index];
1187 if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1188 prot = sk->sk_cgrp->sysctl_mem;
1189 return prot[index];
1190}
1191
1192static inline void memcg_memory_allocated_add(struct cg_proto *prot,
1193 unsigned long amt,
1194 int *parent_status)
1195{
1196 page_counter_charge(&prot->memory_allocated, amt);
1197
1198 if (page_counter_read(&prot->memory_allocated) >
1199 prot->memory_allocated.limit)
1200 *parent_status = OVER_LIMIT;
1201}
1202
1203static inline void memcg_memory_allocated_sub(struct cg_proto *prot,
1204 unsigned long amt)
1205{
1206 page_counter_uncharge(&prot->memory_allocated, amt);
1207} 1137}
1208 1138
1209static inline long 1139static inline long
1210sk_memory_allocated(const struct sock *sk) 1140sk_memory_allocated(const struct sock *sk)
1211{ 1141{
1212 struct proto *prot = sk->sk_prot; 1142 return atomic_long_read(sk->sk_prot->memory_allocated);
1213
1214 if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1215 return page_counter_read(&sk->sk_cgrp->memory_allocated);
1216
1217 return atomic_long_read(prot->memory_allocated);
1218} 1143}
1219 1144
1220static inline long 1145static inline long
1221sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status) 1146sk_memory_allocated_add(struct sock *sk, int amt)
1222{ 1147{
1223 struct proto *prot = sk->sk_prot; 1148 return atomic_long_add_return(amt, sk->sk_prot->memory_allocated);
1224
1225 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
1226 memcg_memory_allocated_add(sk->sk_cgrp, amt, parent_status);
1227 /* update the root cgroup regardless */
1228 atomic_long_add_return(amt, prot->memory_allocated);
1229 return page_counter_read(&sk->sk_cgrp->memory_allocated);
1230 }
1231
1232 return atomic_long_add_return(amt, prot->memory_allocated);
1233} 1149}
1234 1150
1235static inline void 1151static inline void
1236sk_memory_allocated_sub(struct sock *sk, int amt) 1152sk_memory_allocated_sub(struct sock *sk, int amt)
1237{ 1153{
1238 struct proto *prot = sk->sk_prot; 1154 atomic_long_sub(amt, sk->sk_prot->memory_allocated);
1239
1240 if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1241 memcg_memory_allocated_sub(sk->sk_cgrp, amt);
1242
1243 atomic_long_sub(amt, prot->memory_allocated);
1244} 1155}
1245 1156
1246static inline void sk_sockets_allocated_dec(struct sock *sk) 1157static inline void sk_sockets_allocated_dec(struct sock *sk)
1247{ 1158{
1248 struct proto *prot = sk->sk_prot; 1159 percpu_counter_dec(sk->sk_prot->sockets_allocated);
1249
1250 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
1251 struct cg_proto *cg_proto = sk->sk_cgrp;
1252
1253 for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
1254 percpu_counter_dec(&cg_proto->sockets_allocated);
1255 }
1256
1257 percpu_counter_dec(prot->sockets_allocated);
1258} 1160}
1259 1161
1260static inline void sk_sockets_allocated_inc(struct sock *sk) 1162static inline void sk_sockets_allocated_inc(struct sock *sk)
1261{ 1163{
1262 struct proto *prot = sk->sk_prot; 1164 percpu_counter_inc(sk->sk_prot->sockets_allocated);
1263
1264 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
1265 struct cg_proto *cg_proto = sk->sk_cgrp;
1266
1267 for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
1268 percpu_counter_inc(&cg_proto->sockets_allocated);
1269 }
1270
1271 percpu_counter_inc(prot->sockets_allocated);
1272} 1165}
1273 1166
1274static inline int 1167static inline int
1275sk_sockets_allocated_read_positive(struct sock *sk) 1168sk_sockets_allocated_read_positive(struct sock *sk)
1276{ 1169{
1277 struct proto *prot = sk->sk_prot; 1170 return percpu_counter_read_positive(sk->sk_prot->sockets_allocated);
1278
1279 if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1280 return percpu_counter_read_positive(&sk->sk_cgrp->sockets_allocated);
1281
1282 return percpu_counter_read_positive(prot->sockets_allocated);
1283} 1171}
1284 1172
1285static inline int 1173static inline int
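On the socket side the cg_proto indirection is gone: a socket points straight at its memory cgroup through sk->sk_memcg, the protocol's global counters are always updated, and cgroup pressure is consulted via mem_cgroup_under_socket_pressure(). A hedged sketch of how the new charge helper combines with the simplified accounting (condensed from what the sk_mem_* paths do; foo_charge_sk is a placeholder):

static bool foo_charge_sk(struct sock *sk, int nr_pages)
{
	/* global per-protocol accounting always happens */
	sk_memory_allocated_add(sk, nr_pages);

	/* cgroup accounting only for sockets that belong to a memcg */
	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages)) {
		sk_memory_allocated_sub(sk, nr_pages);
		return false;
	}

	return true;
}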
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a80255f4ca33..8ea19977ea53 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -289,8 +289,9 @@ extern int tcp_memory_pressure;
289/* optimized version of sk_under_memory_pressure() for TCP sockets */ 289/* optimized version of sk_under_memory_pressure() for TCP sockets */
290static inline bool tcp_under_memory_pressure(const struct sock *sk) 290static inline bool tcp_under_memory_pressure(const struct sock *sk)
291{ 291{
292 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) 292 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
293 return !!sk->sk_cgrp->memory_pressure; 293 mem_cgroup_under_socket_pressure(sk->sk_memcg))
294 return true;
294 295
295 return tcp_memory_pressure; 296 return tcp_memory_pressure;
296} 297}
diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h
index 05b94d9453de..3a17b16ae8aa 100644
--- a/include/net/tcp_memcontrol.h
+++ b/include/net/tcp_memcontrol.h
@@ -1,7 +1,6 @@
1#ifndef _TCP_MEMCG_H 1#ifndef _TCP_MEMCG_H
2#define _TCP_MEMCG_H 2#define _TCP_MEMCG_H
3 3
4struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg);
5int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss); 4int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
6void tcp_destroy_cgroup(struct mem_cgroup *memcg); 5void tcp_destroy_cgroup(struct mem_cgroup *memcg);
7#endif /* _TCP_MEMCG_H */ 6#endif /* _TCP_MEMCG_H */
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
new file mode 100644
index 000000000000..97d635cabac8
--- /dev/null
+++ b/include/trace/events/huge_memory.h
@@ -0,0 +1,136 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM huge_memory
3
4#if !defined(__HUGE_MEMORY_H) || defined(TRACE_HEADER_MULTI_READ)
5#define __HUGE_MEMORY_H
6
7#include <linux/tracepoint.h>
8
9#include <trace/events/gfpflags.h>
10
11#define SCAN_STATUS \
12 EM( SCAN_FAIL, "failed") \
13 EM( SCAN_SUCCEED, "succeeded") \
14 EM( SCAN_PMD_NULL, "pmd_null") \
15 EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
16 EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \
17 EM( SCAN_PAGE_RO, "no_writable_page") \
18 EM( SCAN_NO_REFERENCED_PAGE, "no_referenced_page") \
19 EM( SCAN_PAGE_NULL, "page_null") \
20 EM( SCAN_SCAN_ABORT, "scan_aborted") \
21 EM( SCAN_PAGE_COUNT, "not_suitable_page_count") \
22 EM( SCAN_PAGE_LRU, "page_not_in_lru") \
23 EM( SCAN_PAGE_LOCK, "page_locked") \
24 EM( SCAN_PAGE_ANON, "page_not_anon") \
25 EM( SCAN_ANY_PROCESS, "no_process_for_page") \
26 EM( SCAN_VMA_NULL, "vma_null") \
27 EM( SCAN_VMA_CHECK, "vma_check_failed") \
28 EM( SCAN_ADDRESS_RANGE, "not_suitable_address_range") \
29 EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \
30 EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\
31 EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
32 EMe( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed")
33
34#undef EM
35#undef EMe
36#define EM(a, b) TRACE_DEFINE_ENUM(a);
37#define EMe(a, b) TRACE_DEFINE_ENUM(a);
38
39SCAN_STATUS
40
41#undef EM
42#undef EMe
43#define EM(a, b) {a, b},
44#define EMe(a, b) {a, b}
45
46TRACE_EVENT(mm_khugepaged_scan_pmd,
47
48 TP_PROTO(struct mm_struct *mm, unsigned long pfn, bool writable,
49 bool referenced, int none_or_zero, int status),
50
51 TP_ARGS(mm, pfn, writable, referenced, none_or_zero, status),
52
53 TP_STRUCT__entry(
54 __field(struct mm_struct *, mm)
55 __field(unsigned long, pfn)
56 __field(bool, writable)
57 __field(bool, referenced)
58 __field(int, none_or_zero)
59 __field(int, status)
60 ),
61
62 TP_fast_assign(
63 __entry->mm = mm;
64 __entry->pfn = pfn;
65 __entry->writable = writable;
66 __entry->referenced = referenced;
67 __entry->none_or_zero = none_or_zero;
68 __entry->status = status;
69 ),
70
71 TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s",
72 __entry->mm,
73 __entry->pfn,
74 __entry->writable,
75 __entry->referenced,
76 __entry->none_or_zero,
77 __print_symbolic(__entry->status, SCAN_STATUS))
78);
79
80TRACE_EVENT(mm_collapse_huge_page,
81
82 TP_PROTO(struct mm_struct *mm, int isolated, int status),
83
84 TP_ARGS(mm, isolated, status),
85
86 TP_STRUCT__entry(
87 __field(struct mm_struct *, mm)
88 __field(int, isolated)
89 __field(int, status)
90 ),
91
92 TP_fast_assign(
93 __entry->mm = mm;
94 __entry->isolated = isolated;
95 __entry->status = status;
96 ),
97
98 TP_printk("mm=%p, isolated=%d, status=%s",
99 __entry->mm,
100 __entry->isolated,
101 __print_symbolic(__entry->status, SCAN_STATUS))
102);
103
104TRACE_EVENT(mm_collapse_huge_page_isolate,
105
106 TP_PROTO(unsigned long pfn, int none_or_zero,
107 bool referenced, bool writable, int status),
108
109 TP_ARGS(pfn, none_or_zero, referenced, writable, status),
110
111 TP_STRUCT__entry(
112 __field(unsigned long, pfn)
113 __field(int, none_or_zero)
114 __field(bool, referenced)
115 __field(bool, writable)
116 __field(int, status)
117 ),
118
119 TP_fast_assign(
120 __entry->pfn = pfn;
121 __entry->none_or_zero = none_or_zero;
122 __entry->referenced = referenced;
123 __entry->writable = writable;
124 __entry->status = status;
125 ),
126
127 TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, writable=%d, status=%s",
128 __entry->pfn,
129 __entry->none_or_zero,
130 __entry->referenced,
131 __entry->writable,
132 __print_symbolic(__entry->status, SCAN_STATUS))
133);
134
135#endif /* __HUGE_MEMORY_H */
136#include <trace/define_trace.h>
diff --git a/include/trace/events/page_isolation.h b/include/trace/events/page_isolation.h
new file mode 100644
index 000000000000..6fb644029c80
--- /dev/null
+++ b/include/trace/events/page_isolation.h
@@ -0,0 +1,38 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM page_isolation
3
4#if !defined(_TRACE_PAGE_ISOLATION_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_PAGE_ISOLATION_H
6
7#include <linux/tracepoint.h>
8
9TRACE_EVENT(test_pages_isolated,
10
11 TP_PROTO(
12 unsigned long start_pfn,
13 unsigned long end_pfn,
14 unsigned long fin_pfn),
15
16 TP_ARGS(start_pfn, end_pfn, fin_pfn),
17
18 TP_STRUCT__entry(
19 __field(unsigned long, start_pfn)
20 __field(unsigned long, end_pfn)
21 __field(unsigned long, fin_pfn)
22 ),
23
24 TP_fast_assign(
25 __entry->start_pfn = start_pfn;
26 __entry->end_pfn = end_pfn;
27 __entry->fin_pfn = fin_pfn;
28 ),
29
30 TP_printk("start_pfn=0x%lx end_pfn=0x%lx fin_pfn=0x%lx ret=%s",
31 __entry->start_pfn, __entry->end_pfn, __entry->fin_pfn,
32 __entry->end_pfn == __entry->fin_pfn ? "success" : "fail")
33);
34
35#endif /* _TRACE_PAGE_ISOLATION_H */
36
37/* This part must be outside protection */
38#include <trace/define_trace.h>
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index f66476b96264..31763dd8db1c 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -330,10 +330,9 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
 
 TRACE_EVENT(mm_vmscan_writepage,
 
-	TP_PROTO(struct page *page,
-		int reclaim_flags),
+	TP_PROTO(struct page *page),
 
-	TP_ARGS(page, reclaim_flags),
+	TP_ARGS(page),
 
 	TP_STRUCT__entry(
 		__field(unsigned long, pfn)
@@ -342,7 +341,7 @@ TRACE_EVENT(mm_vmscan_writepage,
 
 	TP_fast_assign(
 		__entry->pfn = page_to_pfn(page);
-		__entry->reclaim_flags = reclaim_flags;
+		__entry->reclaim_flags = trace_reclaim_flags(page);
 	),
 
 	TP_printk("page=%p pfn=%lu flags=%s",
@@ -353,11 +352,11 @@ TRACE_EVENT(mm_vmscan_writepage,
 
 TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
 
-	TP_PROTO(int nid, int zid,
+	TP_PROTO(struct zone *zone,
 		unsigned long nr_scanned, unsigned long nr_reclaimed,
-		int priority, int reclaim_flags),
+		int priority, int file),
 
-	TP_ARGS(nid, zid, nr_scanned, nr_reclaimed, priority, reclaim_flags),
+	TP_ARGS(zone, nr_scanned, nr_reclaimed, priority, file),
 
 	TP_STRUCT__entry(
 		__field(int, nid)
@@ -369,12 +368,12 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
 	),
 
 	TP_fast_assign(
-		__entry->nid = nid;
-		__entry->zid = zid;
+		__entry->nid = zone_to_nid(zone);
+		__entry->zid = zone_idx(zone);
 		__entry->nr_scanned = nr_scanned;
 		__entry->nr_reclaimed = nr_reclaimed;
 		__entry->priority = priority;
-		__entry->reclaim_flags = reclaim_flags;
+		__entry->reclaim_flags = trace_shrink_flags(file);
 	),
 
 	TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s",
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 161a1807e6ef..f4617cf07069 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1438,7 +1438,7 @@ static int __init init_mqueue_fs(void)
 
 	mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
 				sizeof(struct mqueue_inode_info), 0,
-				SLAB_HWCACHE_ALIGN, init_once);
+				SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once);
 	if (mqueue_inode_cachep == NULL)
 		return -ENOMEM;
 
diff --git a/kernel/cred.c b/kernel/cred.c
index 71179a09c1d6..0c0cd8a62285 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -569,8 +569,8 @@ EXPORT_SYMBOL(revert_creds);
 void __init cred_init(void)
 {
 	/* allocate a slab in which we can store credentials */
-	cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
-			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
 }
 
 /**
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ef90b04d783f..435c14a45118 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable);
 
 void delayacct_init(void)
 {
-	delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
+	delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
 	delayacct_tsk_init(&init_task);
 }
 
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7dad84913abf..bb0669169716 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -180,7 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	lru_cache_add_active_or_unevictable(kpage, vma);
 
 	if (!PageAnon(page)) {
-		dec_mm_counter(mm, MM_FILEPAGES);
+		dec_mm_counter(mm, mm_counter_file(page));
 		inc_mm_counter(mm, MM_ANONPAGES);
 	}
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 6774e6b2e96d..2e391c754ae7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -300,9 +300,9 @@ void __init fork_init(void)
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
 	/* create a slab on which task_structs can be allocated */
-	task_struct_cachep =
-		kmem_cache_create("task_struct", arch_task_struct_size,
-			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
+	task_struct_cachep = kmem_cache_create("task_struct",
+			arch_task_struct_size, ARCH_MIN_TASKALIGN,
+			SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
 #endif
 
 	/* do the arch specific task caches init */
@@ -414,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
 
 	mm->total_vm = oldmm->total_vm;
-	mm->shared_vm = oldmm->shared_vm;
+	mm->data_vm = oldmm->data_vm;
 	mm->exec_vm = oldmm->exec_vm;
 	mm->stack_vm = oldmm->stack_vm;
 
@@ -433,8 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		struct file *file;
 
 		if (mpnt->vm_flags & VM_DONTCOPY) {
-			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
-							-vma_pages(mpnt));
+			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
 			continue;
 		}
 		charge = 0;
@@ -1848,16 +1847,19 @@ void __init proc_caches_init(void)
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
-			SLAB_NOTRACK, sighand_ctor);
+			SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			NULL);
 	files_cachep = kmem_cache_create("files_cache",
 			sizeof(struct files_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			NULL);
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			NULL);
 	/*
 	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
 	 * whole struct cpumask for the OFFSTACK case. We could change
@@ -1867,8 +1869,9 @@ void __init proc_caches_init(void)
 	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
-	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			NULL);
+	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
 	mmap_init();
 	nsproxy_cache_init();
 }
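Note: the SLAB_ACCOUNT conversions above (and in mqueue, cred, delayacct, pid) all follow one pattern, so a minimal out-of-tree sketch may help; the object type, cache name, and init function here are hypothetical, not taken from this series.

#include <linux/slab.h>

/* Illustrative sketch only: a hypothetical object whose allocations
 * should be charged to the allocating task's memory cgroup. */
struct foo {
	int bar;
};

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
	/* SLAB_ACCOUNT enables kmem accounting for every object
	 * handed out by this cache. */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				       SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
	if (!foo_cachep)
		return -ENOMEM;
	return 0;
}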
diff --git a/kernel/pid.c b/kernel/pid.c
index 78b3d9f80d44..f4ad91b746f1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -604,5 +604,5 @@ void __init pidmap_init(void)
 	atomic_dec(&init_pid_ns.pidmap[0].nr_free);
 
 	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
-			SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
 }
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 4a2ef5a02fd3..2489140a7c51 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -219,6 +219,7 @@ static void cpu_idle_loop(void)
 	 */
 
 	__current_set_polling();
+	quiet_vmstat();
 	tick_nohz_idle_enter();
 
 	while (!need_resched()) {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5faf89ac9ec0..c810f8afdb7f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1568,6 +1568,28 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
 	},
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+	{
+		.procname	= "mmap_rnd_bits",
+		.data		= &mmap_rnd_bits,
+		.maxlen		= sizeof(mmap_rnd_bits),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= (void *)&mmap_rnd_bits_min,
+		.extra2		= (void *)&mmap_rnd_bits_max,
+	},
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+	{
+		.procname	= "mmap_rnd_compat_bits",
+		.data		= &mmap_rnd_compat_bits,
+		.maxlen		= sizeof(mmap_rnd_compat_bits),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= (void *)&mmap_rnd_compat_bits_min,
+		.extra2		= (void *)&mmap_rnd_compat_bits_max,
+	},
+#endif
 	{ }
 };
 
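With CONFIG_HAVE_ARCH_MMAP_RND_BITS set, the new entry above appears as /proc/sys/vm/mmap_rnd_bits. A small userspace sketch for reading it, illustrative only and not part of the patch:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/mmap_rnd_bits", "r");
	int bits;

	if (!f)
		return 1;	/* kernel built without the knob */
	if (fscanf(f, "%d", &bits) == 1)
		printf("mmap_rnd_bits = %d\n", bits);
	fclose(f);
	return 0;
}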
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index d34bd24c2c84..4a1515f4b452 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -1181,7 +1181,7 @@ static inline bool overlap(void *addr, unsigned long len, void *start, void *end
 
 static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len)
 {
-	if (overlap(addr, len, _text, _etext) ||
+	if (overlap(addr, len, _stext, _etext) ||
 	    overlap(addr, len, __start_rodata, __end_rodata))
 		err_printk(dev, NULL, "DMA-API: device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len);
 }
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7340353f8aea..cc5d29d2da9b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -672,7 +672,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 
 	ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
 	if (!ret) {
-		bdi->wb.memcg_css = mem_cgroup_root_css;
+		bdi->wb.memcg_css = &root_mem_cgroup->css;
 		bdi->wb.blkcg_css = blkcg_root_css;
 	}
 	return ret;
diff --git a/mm/compaction.c b/mm/compaction.c
index de3e1e71cd9f..585de54dbe8c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1658,14 +1658,15 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 				!compaction_deferred(zone, cc->order))
 			compact_zone(zone, cc);
 
-		if (cc->order > 0) {
-			if (zone_watermark_ok(zone, cc->order,
-						low_wmark_pages(zone), 0, 0))
-				compaction_defer_reset(zone, cc->order, false);
-		}
-
 		VM_BUG_ON(!list_empty(&cc->freepages));
 		VM_BUG_ON(!list_empty(&cc->migratepages));
+
+		if (is_via_compact_memory(cc->order))
+			continue;
+
+		if (zone_watermark_ok(zone, cc->order,
+					low_wmark_pages(zone), 0, 0))
+			compaction_defer_reset(zone, cc->order, false);
 	}
 }
 
@@ -1708,7 +1709,10 @@ static void compact_nodes(void)
 /* The written value is actually unused, all memory is compacted */
 int sysctl_compact_memory;
 
-/* This is the entry point for compacting all nodes via /proc/sys/vm */
+/*
+ * This is the entry point for compacting all nodes via
+ * /proc/sys/vm/compact_memory
+ */
 int sysctl_compaction_handler(struct ctl_table *table, int write,
 			void __user *buffer, size_t *length, loff_t *ppos)
 {
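The comment fix above spells out the full path of the trigger file. For reference, a minimal userspace sketch (illustrative only, needs root and CONFIG_COMPACTION) that exercises this handler:

#include <stdio.h>

int main(void)
{
	/* Writing any value here compacts memory on all nodes. */
	FILE *f = fopen("/proc/sys/vm/compact_memory", "w");

	if (!f)
		return 1;
	fputs("1\n", f);
	fclose(f);
	return 0;
}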
diff --git a/mm/debug.c b/mm/debug.c
index 668aa35191ca..5d2072ed8d5e 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -175,7 +175,7 @@ void dump_mm(const struct mm_struct *mm)
175 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" 175 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
176 "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" 176 "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
177 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" 177 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
178 "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" 178 "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
179 "start_code %lx end_code %lx start_data %lx end_data %lx\n" 179 "start_code %lx end_code %lx start_data %lx end_data %lx\n"
180 "start_brk %lx brk %lx start_stack %lx\n" 180 "start_brk %lx brk %lx start_stack %lx\n"
181 "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" 181 "arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
@@ -209,7 +209,7 @@ void dump_mm(const struct mm_struct *mm)
 		mm_nr_pmds((struct mm_struct *)mm),
 		mm->map_count,
 		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
-		mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
+		mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
 		mm->start_code, mm->end_code, mm->start_data, mm->end_data,
 		mm->start_brk, mm->brk, mm->start_stack,
 		mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
diff --git a/mm/filemap.c b/mm/filemap.c
index 1bb007624b53..ff42d31c891a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1812,19 +1812,18 @@ EXPORT_SYMBOL(generic_file_read_iter);
  * This adds the requested page to the page cache if it isn't already there,
  * and schedules an I/O to read in its contents from disk.
  */
-static int page_cache_read(struct file *file, pgoff_t offset)
+static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
 {
 	struct address_space *mapping = file->f_mapping;
 	struct page *page;
 	int ret;
 
 	do {
-		page = page_cache_alloc_cold(mapping);
+		page = __page_cache_alloc(gfp_mask|__GFP_COLD);
 		if (!page)
 			return -ENOMEM;
 
-		ret = add_to_page_cache_lru(page, mapping, offset,
-				mapping_gfp_constraint(mapping, GFP_KERNEL));
+		ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);
 		if (ret == 0)
 			ret = mapping->a_ops->readpage(file, page);
 		else if (ret == -EEXIST)
@@ -2005,7 +2004,7 @@ no_cached_page:
 	 * We're only likely to ever get here if MADV_RANDOM is in
 	 * effect.
 	 */
-	error = page_cache_read(file, offset);
+	error = page_cache_read(file, offset, vmf->gfp_mask);
 
 	/*
 	 * The page we want has now been added to the page cache.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 62fe06bb7d04..f952f055fdcf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -31,6 +31,33 @@
31#include <asm/pgalloc.h> 31#include <asm/pgalloc.h>
32#include "internal.h" 32#include "internal.h"
33 33
34enum scan_result {
35 SCAN_FAIL,
36 SCAN_SUCCEED,
37 SCAN_PMD_NULL,
38 SCAN_EXCEED_NONE_PTE,
39 SCAN_PTE_NON_PRESENT,
40 SCAN_PAGE_RO,
41 SCAN_NO_REFERENCED_PAGE,
42 SCAN_PAGE_NULL,
43 SCAN_SCAN_ABORT,
44 SCAN_PAGE_COUNT,
45 SCAN_PAGE_LRU,
46 SCAN_PAGE_LOCK,
47 SCAN_PAGE_ANON,
48 SCAN_ANY_PROCESS,
49 SCAN_VMA_NULL,
50 SCAN_VMA_CHECK,
51 SCAN_ADDRESS_RANGE,
52 SCAN_SWAP_CACHE_PAGE,
53 SCAN_DEL_PAGE_LRU,
54 SCAN_ALLOC_HUGE_PAGE_FAIL,
55 SCAN_CGROUP_CHARGE_FAIL
56};
57
58#define CREATE_TRACE_POINTS
59#include <trace/events/huge_memory.h>
60
34/* 61/*
35 * By default transparent hugepage support is disabled in order that avoid 62 * By default transparent hugepage support is disabled in order that avoid
36 * to risk increase the memory footprint of applications without a guaranteed 63 * to risk increase the memory footprint of applications without a guaranteed
@@ -2198,26 +2225,33 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2198 unsigned long address, 2225 unsigned long address,
2199 pte_t *pte) 2226 pte_t *pte)
2200{ 2227{
2201 struct page *page; 2228 struct page *page = NULL;
2202 pte_t *_pte; 2229 pte_t *_pte;
2203 int none_or_zero = 0; 2230 int none_or_zero = 0, result = 0;
2204 bool referenced = false, writable = false; 2231 bool referenced = false, writable = false;
2232
2205 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2233 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
2206 _pte++, address += PAGE_SIZE) { 2234 _pte++, address += PAGE_SIZE) {
2207 pte_t pteval = *_pte; 2235 pte_t pteval = *_pte;
2208 if (pte_none(pteval) || (pte_present(pteval) && 2236 if (pte_none(pteval) || (pte_present(pteval) &&
2209 is_zero_pfn(pte_pfn(pteval)))) { 2237 is_zero_pfn(pte_pfn(pteval)))) {
2210 if (!userfaultfd_armed(vma) && 2238 if (!userfaultfd_armed(vma) &&
2211 ++none_or_zero <= khugepaged_max_ptes_none) 2239 ++none_or_zero <= khugepaged_max_ptes_none) {
2212 continue; 2240 continue;
2213 else 2241 } else {
2242 result = SCAN_EXCEED_NONE_PTE;
2214 goto out; 2243 goto out;
2244 }
2215 } 2245 }
2216 if (!pte_present(pteval)) 2246 if (!pte_present(pteval)) {
2247 result = SCAN_PTE_NON_PRESENT;
2217 goto out; 2248 goto out;
2249 }
2218 page = vm_normal_page(vma, address, pteval); 2250 page = vm_normal_page(vma, address, pteval);
2219 if (unlikely(!page)) 2251 if (unlikely(!page)) {
2252 result = SCAN_PAGE_NULL;
2220 goto out; 2253 goto out;
2254 }
2221 2255
2222 VM_BUG_ON_PAGE(PageCompound(page), page); 2256 VM_BUG_ON_PAGE(PageCompound(page), page);
2223 VM_BUG_ON_PAGE(!PageAnon(page), page); 2257 VM_BUG_ON_PAGE(!PageAnon(page), page);
@@ -2229,8 +2263,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		 * is needed to serialize against split_huge_page
 		 * when invoked from the VM.
 		 */
-		if (!trylock_page(page))
+		if (!trylock_page(page)) {
+			result = SCAN_PAGE_LOCK;
 			goto out;
+		}
 
 		/*
 		 * cannot use mapcount: can't collapse if there's a gup pin.
@@ -2239,6 +2275,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		 */
 		if (page_count(page) != 1 + !!PageSwapCache(page)) {
 			unlock_page(page);
+			result = SCAN_PAGE_COUNT;
 			goto out;
 		}
 		if (pte_write(pteval)) {
@@ -2246,6 +2283,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		} else {
 			if (PageSwapCache(page) && !reuse_swap_page(page)) {
 				unlock_page(page);
+				result = SCAN_SWAP_CACHE_PAGE;
 				goto out;
 			}
 			/*
@@ -2260,6 +2298,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		 */
 		if (isolate_lru_page(page)) {
 			unlock_page(page);
+			result = SCAN_DEL_PAGE_LRU;
 			goto out;
 		}
 		/* 0 stands for page_is_file_cache(page) == false */
@@ -2273,10 +2312,21 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2273 mmu_notifier_test_young(vma->vm_mm, address)) 2312 mmu_notifier_test_young(vma->vm_mm, address))
2274 referenced = true; 2313 referenced = true;
2275 } 2314 }
2276 if (likely(referenced && writable)) 2315 if (likely(writable)) {
2277 return 1; 2316 if (likely(referenced)) {
2317 result = SCAN_SUCCEED;
2318 trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
2319 referenced, writable, result);
2320 return 1;
2321 }
2322 } else {
2323 result = SCAN_PAGE_RO;
2324 }
2325
2278out: 2326out:
2279 release_pte_pages(pte, _pte); 2327 release_pte_pages(pte, _pte);
2328 trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
2329 referenced, writable, result);
2280 return 0; 2330 return 0;
2281} 2331}
2282 2332
@@ -2513,7 +2563,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2513 pgtable_t pgtable; 2563 pgtable_t pgtable;
2514 struct page *new_page; 2564 struct page *new_page;
2515 spinlock_t *pmd_ptl, *pte_ptl; 2565 spinlock_t *pmd_ptl, *pte_ptl;
2516 int isolated; 2566 int isolated, result = 0;
2517 unsigned long hstart, hend; 2567 unsigned long hstart, hend;
2518 struct mem_cgroup *memcg; 2568 struct mem_cgroup *memcg;
2519 unsigned long mmun_start; /* For mmu_notifiers */ 2569 unsigned long mmun_start; /* For mmu_notifiers */
@@ -2528,12 +2578,15 @@ static void collapse_huge_page(struct mm_struct *mm,
2528 2578
2529 /* release the mmap_sem read lock. */ 2579 /* release the mmap_sem read lock. */
2530 new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); 2580 new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
2531 if (!new_page) 2581 if (!new_page) {
2532 return; 2582 result = SCAN_ALLOC_HUGE_PAGE_FAIL;
2583 goto out_nolock;
2584 }
2533 2585
2534 if (unlikely(mem_cgroup_try_charge(new_page, mm, 2586 if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) {
2535 gfp, &memcg))) 2587 result = SCAN_CGROUP_CHARGE_FAIL;
2536 return; 2588 goto out_nolock;
2589 }
2537 2590
2538 /* 2591 /*
2539 * Prevent all access to pagetables with the exception of 2592 * Prevent all access to pagetables with the exception of
@@ -2541,21 +2594,31 @@ static void collapse_huge_page(struct mm_struct *mm,
2541 * handled by the anon_vma lock + PG_lock. 2594 * handled by the anon_vma lock + PG_lock.
2542 */ 2595 */
2543 down_write(&mm->mmap_sem); 2596 down_write(&mm->mmap_sem);
2544 if (unlikely(khugepaged_test_exit(mm))) 2597 if (unlikely(khugepaged_test_exit(mm))) {
2598 result = SCAN_ANY_PROCESS;
2545 goto out; 2599 goto out;
2600 }
2546 2601
2547 vma = find_vma(mm, address); 2602 vma = find_vma(mm, address);
2548 if (!vma) 2603 if (!vma) {
2604 result = SCAN_VMA_NULL;
2549 goto out; 2605 goto out;
2606 }
2550 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2607 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2551 hend = vma->vm_end & HPAGE_PMD_MASK; 2608 hend = vma->vm_end & HPAGE_PMD_MASK;
2552 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 2609 if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
2610 result = SCAN_ADDRESS_RANGE;
2553 goto out; 2611 goto out;
2554 if (!hugepage_vma_check(vma)) 2612 }
2613 if (!hugepage_vma_check(vma)) {
2614 result = SCAN_VMA_CHECK;
2555 goto out; 2615 goto out;
2616 }
2556 pmd = mm_find_pmd(mm, address); 2617 pmd = mm_find_pmd(mm, address);
2557 if (!pmd) 2618 if (!pmd) {
2619 result = SCAN_PMD_NULL;
2558 goto out; 2620 goto out;
2621 }
2559 2622
2560 anon_vma_lock_write(vma->anon_vma); 2623 anon_vma_lock_write(vma->anon_vma);
2561 2624
@@ -2592,6 +2655,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2592 pmd_populate(mm, pmd, pmd_pgtable(_pmd)); 2655 pmd_populate(mm, pmd, pmd_pgtable(_pmd));
2593 spin_unlock(pmd_ptl); 2656 spin_unlock(pmd_ptl);
2594 anon_vma_unlock_write(vma->anon_vma); 2657 anon_vma_unlock_write(vma->anon_vma);
2658 result = SCAN_FAIL;
2595 goto out; 2659 goto out;
2596 } 2660 }
2597 2661
@@ -2629,10 +2693,15 @@ static void collapse_huge_page(struct mm_struct *mm,
2629 *hpage = NULL; 2693 *hpage = NULL;
2630 2694
2631 khugepaged_pages_collapsed++; 2695 khugepaged_pages_collapsed++;
2696 result = SCAN_SUCCEED;
2632out_up_write: 2697out_up_write:
2633 up_write(&mm->mmap_sem); 2698 up_write(&mm->mmap_sem);
2699 trace_mm_collapse_huge_page(mm, isolated, result);
2634 return; 2700 return;
2635 2701
2702out_nolock:
2703 trace_mm_collapse_huge_page(mm, isolated, result);
2704 return;
2636out: 2705out:
2637 mem_cgroup_cancel_charge(new_page, memcg); 2706 mem_cgroup_cancel_charge(new_page, memcg);
2638 goto out_up_write; 2707 goto out_up_write;
@@ -2645,8 +2714,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 {
 	pmd_t *pmd;
 	pte_t *pte, *_pte;
-	int ret = 0, none_or_zero = 0;
-	struct page *page;
+	int ret = 0, none_or_zero = 0, result = 0;
+	struct page *page = NULL;
 	unsigned long _address;
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE;
@@ -2655,8 +2724,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
 	pmd = mm_find_pmd(mm, address);
-	if (!pmd)
+	if (!pmd) {
+		result = SCAN_PMD_NULL;
 		goto out;
+	}
 
 	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2665,19 +2736,25 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2665 pte_t pteval = *_pte; 2736 pte_t pteval = *_pte;
2666 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 2737 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2667 if (!userfaultfd_armed(vma) && 2738 if (!userfaultfd_armed(vma) &&
2668 ++none_or_zero <= khugepaged_max_ptes_none) 2739 ++none_or_zero <= khugepaged_max_ptes_none) {
2669 continue; 2740 continue;
2670 else 2741 } else {
2742 result = SCAN_EXCEED_NONE_PTE;
2671 goto out_unmap; 2743 goto out_unmap;
2744 }
2672 } 2745 }
2673 if (!pte_present(pteval)) 2746 if (!pte_present(pteval)) {
2747 result = SCAN_PTE_NON_PRESENT;
2674 goto out_unmap; 2748 goto out_unmap;
2749 }
2675 if (pte_write(pteval)) 2750 if (pte_write(pteval))
2676 writable = true; 2751 writable = true;
2677 2752
2678 page = vm_normal_page(vma, _address, pteval); 2753 page = vm_normal_page(vma, _address, pteval);
2679 if (unlikely(!page)) 2754 if (unlikely(!page)) {
2755 result = SCAN_PAGE_NULL;
2680 goto out_unmap; 2756 goto out_unmap;
2757 }
2681 /* 2758 /*
2682 * Record which node the original page is from and save this 2759 * Record which node the original page is from and save this
2683 * information to khugepaged_node_load[]. 2760 * information to khugepaged_node_load[].
@@ -2685,26 +2762,49 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2685 * hit record. 2762 * hit record.
2686 */ 2763 */
2687 node = page_to_nid(page); 2764 node = page_to_nid(page);
2688 if (khugepaged_scan_abort(node)) 2765 if (khugepaged_scan_abort(node)) {
2766 result = SCAN_SCAN_ABORT;
2689 goto out_unmap; 2767 goto out_unmap;
2768 }
2690 khugepaged_node_load[node]++; 2769 khugepaged_node_load[node]++;
2691 VM_BUG_ON_PAGE(PageCompound(page), page); 2770 VM_BUG_ON_PAGE(PageCompound(page), page);
2692 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2771 if (!PageLRU(page)) {
2772 result = SCAN_SCAN_ABORT;
2773 goto out_unmap;
2774 }
2775 if (PageLocked(page)) {
2776 result = SCAN_PAGE_LOCK;
2777 goto out_unmap;
2778 }
2779 if (!PageAnon(page)) {
2780 result = SCAN_PAGE_ANON;
2693 goto out_unmap; 2781 goto out_unmap;
2782 }
2783
2694 /* 2784 /*
2695 * cannot use mapcount: can't collapse if there's a gup pin. 2785 * cannot use mapcount: can't collapse if there's a gup pin.
2696 * The page must only be referenced by the scanned process 2786 * The page must only be referenced by the scanned process
2697 * and page swap cache. 2787 * and page swap cache.
2698 */ 2788 */
2699 if (page_count(page) != 1 + !!PageSwapCache(page)) 2789 if (page_count(page) != 1 + !!PageSwapCache(page)) {
2790 result = SCAN_PAGE_COUNT;
2700 goto out_unmap; 2791 goto out_unmap;
2792 }
2701 if (pte_young(pteval) || 2793 if (pte_young(pteval) ||
2702 page_is_young(page) || PageReferenced(page) || 2794 page_is_young(page) || PageReferenced(page) ||
2703 mmu_notifier_test_young(vma->vm_mm, address)) 2795 mmu_notifier_test_young(vma->vm_mm, address))
2704 referenced = true; 2796 referenced = true;
2705 } 2797 }
2706 if (referenced && writable) 2798 if (writable) {
2707 ret = 1; 2799 if (referenced) {
2800 result = SCAN_SUCCEED;
2801 ret = 1;
2802 } else {
2803 result = SCAN_NO_REFERENCED_PAGE;
2804 }
2805 } else {
2806 result = SCAN_PAGE_RO;
2807 }
2708out_unmap: 2808out_unmap:
2709 pte_unmap_unlock(pte, ptl); 2809 pte_unmap_unlock(pte, ptl);
2710 if (ret) { 2810 if (ret) {
@@ -2713,6 +2813,8 @@ out_unmap:
 		collapse_huge_page(mm, address, hpage, vma, node);
 	}
 out:
+	trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
+				     none_or_zero, result);
 	return ret;
 }
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ef6963b577fd..be934df69b85 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4,7 +4,6 @@
  */
 #include <linux/list.h>
 #include <linux/init.h>
-#include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
@@ -2549,25 +2548,6 @@ static void hugetlb_unregister_node(struct node *node)
2549 nhs->hugepages_kobj = NULL; 2548 nhs->hugepages_kobj = NULL;
2550} 2549}
2551 2550
2552/*
2553 * hugetlb module exit: unregister hstate attributes from node devices
2554 * that have them.
2555 */
2556static void hugetlb_unregister_all_nodes(void)
2557{
2558 int nid;
2559
2560 /*
2561 * disable node device registrations.
2562 */
2563 register_hugetlbfs_with_node(NULL, NULL);
2564
2565 /*
2566 * remove hstate attributes from any nodes that have them.
2567 */
2568 for (nid = 0; nid < nr_node_ids; nid++)
2569 hugetlb_unregister_node(node_devices[nid]);
2570}
2571 2551
2572/* 2552/*
2573 * Register hstate attributes for a single node device. 2553 * Register hstate attributes for a single node device.
@@ -2632,27 +2612,10 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
2632 return NULL; 2612 return NULL;
2633} 2613}
2634 2614
2635static void hugetlb_unregister_all_nodes(void) { }
2636
2637static void hugetlb_register_all_nodes(void) { } 2615static void hugetlb_register_all_nodes(void) { }
2638 2616
2639#endif 2617#endif
2640 2618
2641static void __exit hugetlb_exit(void)
2642{
2643 struct hstate *h;
2644
2645 hugetlb_unregister_all_nodes();
2646
2647 for_each_hstate(h) {
2648 kobject_put(hstate_kobjs[hstate_index(h)]);
2649 }
2650
2651 kobject_put(hugepages_kobj);
2652 kfree(hugetlb_fault_mutex_table);
2653}
2654module_exit(hugetlb_exit);
2655
2656static int __init hugetlb_init(void) 2619static int __init hugetlb_init(void)
2657{ 2620{
2658 int i; 2621 int i;
@@ -2690,7 +2653,7 @@ static int __init hugetlb_init(void)
 		mutex_init(&hugetlb_fault_mutex_table[i]);
 	return 0;
 }
-module_init(hugetlb_init);
+subsys_initcall(hugetlb_init);
 
 /* Should be called on processing a hugepagesz=... option */
 void __init hugetlb_add_hstate(unsigned int order)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 19423a45d7d7..25c0ad36fe38 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -122,8 +122,7 @@
 #define BYTES_PER_POINTER	sizeof(void *)
 
 /* GFP bitmask for kmemleak internal allocations */
-#define gfp_kmemleak_mask(gfp)	(((gfp) & (GFP_KERNEL | GFP_ATOMIC | \
-					   __GFP_NOACCOUNT)) | \
+#define gfp_kmemleak_mask(gfp)	(((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
 				 __GFP_NORETRY | __GFP_NOMEMALLOC | \
 				 __GFP_NOWARN)
 
diff --git a/mm/ksm.c b/mm/ksm.c
index b5cd647daa52..2d162c5625f6 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -740,8 +740,7 @@ static int remove_stable_node(struct stable_node *stable_node)
 
 static int remove_all_stable_nodes(void)
 {
-	struct stable_node *stable_node;
-	struct list_head *this, *next;
+	struct stable_node *stable_node, *next;
 	int nid;
 	int err = 0;
 
@@ -756,8 +755,7 @@ static int remove_all_stable_nodes(void)
 			cond_resched();
 		}
 	}
-	list_for_each_safe(this, next, &migrate_nodes) {
-		stable_node = list_entry(this, struct stable_node, list);
+	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
 		if (remove_stable_node(stable_node))
 			err = -EBUSY;
 		cond_resched();
@@ -1583,13 +1581,11 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 	 * so prune them once before each full scan.
 	 */
 	if (!ksm_merge_across_nodes) {
-		struct stable_node *stable_node;
-		struct list_head *this, *next;
+		struct stable_node *stable_node, *next;
 		struct page *page;
 
-		list_for_each_safe(this, next, &migrate_nodes) {
-			stable_node = list_entry(this,
-					struct stable_node, list);
+		list_for_each_entry_safe(stable_node, next,
+					 &migrate_nodes, list) {
 			page = get_ksm_page(stable_node, false);
 			if (page)
 				put_page(page);
@@ -2012,8 +2008,7 @@ static void wait_while_offlining(void)
 static void ksm_check_stable_tree(unsigned long start_pfn,
 				  unsigned long end_pfn)
 {
-	struct stable_node *stable_node;
-	struct list_head *this, *next;
+	struct stable_node *stable_node, *next;
 	struct rb_node *node;
 	int nid;
 
@@ -2034,8 +2029,7 @@ static void ksm_check_stable_tree(unsigned long start_pfn,
 			cond_resched();
 		}
 	}
-	list_for_each_safe(this, next, &migrate_nodes) {
-		stable_node = list_entry(this, struct stable_node, list);
+	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
 		if (stable_node->kpfn >= start_pfn &&
 		    stable_node->kpfn < end_pfn)
 			remove_node_from_stable_tree(stable_node);
diff --git a/mm/memblock.c b/mm/memblock.c
index 07ff069fef25..d2ed81e59a94 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -96,13 +96,10 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
96{ 96{
97 unsigned long i; 97 unsigned long i;
98 98
99 for (i = 0; i < type->cnt; i++) { 99 for (i = 0; i < type->cnt; i++)
100 phys_addr_t rgnbase = type->regions[i].base; 100 if (memblock_addrs_overlap(base, size, type->regions[i].base,
101 phys_addr_t rgnsize = type->regions[i].size; 101 type->regions[i].size))
102 if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
103 break; 102 break;
104 }
105
106 return i < type->cnt; 103 return i < type->cnt;
107} 104}
108 105
@@ -528,7 +525,8 @@ int __init_memblock memblock_add_range(struct memblock_type *type,
528 bool insert = false; 525 bool insert = false;
529 phys_addr_t obase = base; 526 phys_addr_t obase = base;
530 phys_addr_t end = base + memblock_cap_size(base, &size); 527 phys_addr_t end = base + memblock_cap_size(base, &size);
531 int i, nr_new; 528 int idx, nr_new;
529 struct memblock_region *rgn;
532 530
533 if (!size) 531 if (!size)
534 return 0; 532 return 0;
@@ -552,8 +550,7 @@ repeat:
552 base = obase; 550 base = obase;
553 nr_new = 0; 551 nr_new = 0;
554 552
555 for (i = 0; i < type->cnt; i++) { 553 for_each_memblock_type(type, rgn) {
556 struct memblock_region *rgn = &type->regions[i];
557 phys_addr_t rbase = rgn->base; 554 phys_addr_t rbase = rgn->base;
558 phys_addr_t rend = rbase + rgn->size; 555 phys_addr_t rend = rbase + rgn->size;
559 556
@@ -572,7 +569,7 @@ repeat:
572 WARN_ON(flags != rgn->flags); 569 WARN_ON(flags != rgn->flags);
573 nr_new++; 570 nr_new++;
574 if (insert) 571 if (insert)
575 memblock_insert_region(type, i++, base, 572 memblock_insert_region(type, idx++, base,
576 rbase - base, nid, 573 rbase - base, nid,
577 flags); 574 flags);
578 } 575 }
@@ -584,7 +581,7 @@ repeat:
584 if (base < end) { 581 if (base < end) {
585 nr_new++; 582 nr_new++;
586 if (insert) 583 if (insert)
587 memblock_insert_region(type, i, base, end - base, 584 memblock_insert_region(type, idx, base, end - base,
588 nid, flags); 585 nid, flags);
589 } 586 }
590 587
@@ -651,7 +648,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
651 int *start_rgn, int *end_rgn) 648 int *start_rgn, int *end_rgn)
652{ 649{
653 phys_addr_t end = base + memblock_cap_size(base, &size); 650 phys_addr_t end = base + memblock_cap_size(base, &size);
654 int i; 651 int idx;
652 struct memblock_region *rgn;
655 653
656 *start_rgn = *end_rgn = 0; 654 *start_rgn = *end_rgn = 0;
657 655
@@ -663,8 +661,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
663 if (memblock_double_array(type, base, size) < 0) 661 if (memblock_double_array(type, base, size) < 0)
664 return -ENOMEM; 662 return -ENOMEM;
665 663
666 for (i = 0; i < type->cnt; i++) { 664 for_each_memblock_type(type, rgn) {
667 struct memblock_region *rgn = &type->regions[i];
668 phys_addr_t rbase = rgn->base; 665 phys_addr_t rbase = rgn->base;
669 phys_addr_t rend = rbase + rgn->size; 666 phys_addr_t rend = rbase + rgn->size;
670 667
@@ -681,7 +678,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
681 rgn->base = base; 678 rgn->base = base;
682 rgn->size -= base - rbase; 679 rgn->size -= base - rbase;
683 type->total_size -= base - rbase; 680 type->total_size -= base - rbase;
684 memblock_insert_region(type, i, rbase, base - rbase, 681 memblock_insert_region(type, idx, rbase, base - rbase,
685 memblock_get_region_node(rgn), 682 memblock_get_region_node(rgn),
686 rgn->flags); 683 rgn->flags);
687 } else if (rend > end) { 684 } else if (rend > end) {
@@ -692,14 +689,14 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
692 rgn->base = end; 689 rgn->base = end;
693 rgn->size -= end - rbase; 690 rgn->size -= end - rbase;
694 type->total_size -= end - rbase; 691 type->total_size -= end - rbase;
695 memblock_insert_region(type, i--, rbase, end - rbase, 692 memblock_insert_region(type, idx--, rbase, end - rbase,
696 memblock_get_region_node(rgn), 693 memblock_get_region_node(rgn),
697 rgn->flags); 694 rgn->flags);
698 } else { 695 } else {
699 /* @rgn is fully contained, record it */ 696 /* @rgn is fully contained, record it */
700 if (!*end_rgn) 697 if (!*end_rgn)
701 *start_rgn = i; 698 *start_rgn = idx;
702 *end_rgn = i + 1; 699 *end_rgn = idx + 1;
703 } 700 }
704 } 701 }
705 702
@@ -1528,12 +1525,12 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr
 	return -1;
 }
 
-int __init memblock_is_reserved(phys_addr_t addr)
+bool __init memblock_is_reserved(phys_addr_t addr)
 {
 	return memblock_search(&memblock.reserved, addr) != -1;
 }
 
-int __init_memblock memblock_is_memory(phys_addr_t addr)
+bool __init_memblock memblock_is_memory(phys_addr_t addr)
 {
 	return memblock_search(&memblock.memory, addr) != -1;
 }
@@ -1641,12 +1638,12 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
1641{ 1638{
1642 unsigned long long base, size; 1639 unsigned long long base, size;
1643 unsigned long flags; 1640 unsigned long flags;
1644 int i; 1641 int idx;
1642 struct memblock_region *rgn;
1645 1643
1646 pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); 1644 pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
1647 1645
1648 for (i = 0; i < type->cnt; i++) { 1646 for_each_memblock_type(type, rgn) {
1649 struct memblock_region *rgn = &type->regions[i];
1650 char nid_buf[32] = ""; 1647 char nid_buf[32] = "";
1651 1648
1652 base = rgn->base; 1649 base = rgn->base;
@@ -1658,7 +1655,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
1658 memblock_get_region_node(rgn)); 1655 memblock_get_region_node(rgn));
1659#endif 1656#endif
1660 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", 1657 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
1661 name, i, base, base + size - 1, size, nid_buf, flags); 1658 name, idx, base, base + size - 1, size, nid_buf, flags);
1662 } 1659 }
1663} 1660}
1664 1661
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 14cb1db4c52b..54eae4f19d80 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -76,9 +76,12 @@
 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
 EXPORT_SYMBOL(memory_cgrp_subsys);
 
+struct mem_cgroup *root_mem_cgroup __read_mostly;
+
 #define MEM_CGROUP_RECLAIM_RETRIES	5
-static struct mem_cgroup *root_mem_cgroup __read_mostly;
-struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
+
+/* Socket memory accounting disabled? */
+static bool cgroup_memory_nosocket;
 
 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
@@ -87,6 +90,12 @@ int do_swap_account __read_mostly;
 #define do_swap_account		0
 #endif
 
+/* Whether legacy memory+swap accounting is active */
+static bool do_memsw_account(void)
+{
+	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
+}
+
 static const char * const mem_cgroup_stat_names[] = {
 	"cache",
 	"rss",
@@ -288,64 +297,6 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
288 return mem_cgroup_from_css(css); 297 return mem_cgroup_from_css(css);
289} 298}
290 299
291/* Writing them here to avoid exposing memcg's inner layout */
292#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
293
294void sock_update_memcg(struct sock *sk)
295{
296 if (mem_cgroup_sockets_enabled) {
297 struct mem_cgroup *memcg;
298 struct cg_proto *cg_proto;
299
300 BUG_ON(!sk->sk_prot->proto_cgroup);
301
302 /* Socket cloning can throw us here with sk_cgrp already
303 * filled. It won't however, necessarily happen from
304 * process context. So the test for root memcg given
305 * the current task's memcg won't help us in this case.
306 *
307 * Respecting the original socket's memcg is a better
308 * decision in this case.
309 */
310 if (sk->sk_cgrp) {
311 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
312 css_get(&sk->sk_cgrp->memcg->css);
313 return;
314 }
315
316 rcu_read_lock();
317 memcg = mem_cgroup_from_task(current);
318 cg_proto = sk->sk_prot->proto_cgroup(memcg);
319 if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
320 css_tryget_online(&memcg->css)) {
321 sk->sk_cgrp = cg_proto;
322 }
323 rcu_read_unlock();
324 }
325}
326EXPORT_SYMBOL(sock_update_memcg);
327
328void sock_release_memcg(struct sock *sk)
329{
330 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
331 struct mem_cgroup *memcg;
332 WARN_ON(!sk->sk_cgrp->memcg);
333 memcg = sk->sk_cgrp->memcg;
334 css_put(&sk->sk_cgrp->memcg->css);
335 }
336}
337
338struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
339{
340 if (!memcg || mem_cgroup_is_root(memcg))
341 return NULL;
342
343 return &memcg->tcp_mem;
344}
345EXPORT_SYMBOL(tcp_proto_cgroup);
346
347#endif
348
349#ifdef CONFIG_MEMCG_KMEM 300#ifdef CONFIG_MEMCG_KMEM
350/* 301/*
351 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 302 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
@@ -395,7 +346,7 @@ void memcg_put_cache_ids(void)
395 * conditional to this static branch, we'll have to allow modules that does 346 * conditional to this static branch, we'll have to allow modules that does
396 * kmem_cache_alloc and the such to see this symbol as well 347 * kmem_cache_alloc and the such to see this symbol as well
397 */ 348 */
398struct static_key memcg_kmem_enabled_key; 349DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
399EXPORT_SYMBOL(memcg_kmem_enabled_key); 350EXPORT_SYMBOL(memcg_kmem_enabled_key);
400 351
401#endif /* CONFIG_MEMCG_KMEM */ 352#endif /* CONFIG_MEMCG_KMEM */
@@ -1162,9 +1113,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1162 return ret; 1113 return ret;
1163} 1114}
1164 1115
1165#define mem_cgroup_from_counter(counter, member) \
1166 container_of(counter, struct mem_cgroup, member)
1167
1168/** 1116/**
1169 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1117 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1170 * @memcg: the memory cgroup 1118 * @memcg: the memory cgroup
@@ -1183,7 +1131,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1183 if (count < limit) 1131 if (count < limit)
1184 margin = limit - count; 1132 margin = limit - count;
1185 1133
1186 if (do_swap_account) { 1134 if (do_memsw_account()) {
1187 count = page_counter_read(&memcg->memsw); 1135 count = page_counter_read(&memcg->memsw);
1188 limit = READ_ONCE(memcg->memsw.limit); 1136 limit = READ_ONCE(memcg->memsw.limit);
1189 if (count <= limit) 1137 if (count <= limit)
@@ -1286,7 +1234,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1286 pr_cont(":"); 1234 pr_cont(":");
1287 1235
1288 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1236 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1289 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 1237 if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
1290 continue; 1238 continue;
1291 pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], 1239 pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
1292 K(mem_cgroup_read_stat(iter, i))); 1240 K(mem_cgroup_read_stat(iter, i)));
@@ -1909,7 +1857,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
1909 1857
1910 if (stock->nr_pages) { 1858 if (stock->nr_pages) {
1911 page_counter_uncharge(&old->memory, stock->nr_pages); 1859 page_counter_uncharge(&old->memory, stock->nr_pages);
1912 if (do_swap_account) 1860 if (do_memsw_account())
1913 page_counter_uncharge(&old->memsw, stock->nr_pages); 1861 page_counter_uncharge(&old->memsw, stock->nr_pages);
1914 css_put_many(&old->css, stock->nr_pages); 1862 css_put_many(&old->css, stock->nr_pages);
1915 stock->nr_pages = 0; 1863 stock->nr_pages = 0;
@@ -1997,6 +1945,26 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
1997 return NOTIFY_OK; 1945 return NOTIFY_OK;
1998} 1946}
1999 1947
1948static void reclaim_high(struct mem_cgroup *memcg,
1949 unsigned int nr_pages,
1950 gfp_t gfp_mask)
1951{
1952 do {
1953 if (page_counter_read(&memcg->memory) <= memcg->high)
1954 continue;
1955 mem_cgroup_events(memcg, MEMCG_HIGH, 1);
1956 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
1957 } while ((memcg = parent_mem_cgroup(memcg)));
1958}
1959
1960static void high_work_func(struct work_struct *work)
1961{
1962 struct mem_cgroup *memcg;
1963
1964 memcg = container_of(work, struct mem_cgroup, high_work);
1965 reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
1966}
1967
2000/* 1968/*
2001 * Scheduled by try_charge() to be executed from the userland return path 1969 * Scheduled by try_charge() to be executed from the userland return path
2002 * and reclaims memory over the high limit. 1970 * and reclaims memory over the high limit.
@@ -2004,20 +1972,13 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2004void mem_cgroup_handle_over_high(void) 1972void mem_cgroup_handle_over_high(void)
2005{ 1973{
2006 unsigned int nr_pages = current->memcg_nr_pages_over_high; 1974 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2007 struct mem_cgroup *memcg, *pos; 1975 struct mem_cgroup *memcg;
2008 1976
2009 if (likely(!nr_pages)) 1977 if (likely(!nr_pages))
2010 return; 1978 return;
2011 1979
2012 pos = memcg = get_mem_cgroup_from_mm(current->mm); 1980 memcg = get_mem_cgroup_from_mm(current->mm);
2013 1981 reclaim_high(memcg, nr_pages, GFP_KERNEL);
2014 do {
2015 if (page_counter_read(&pos->memory) <= pos->high)
2016 continue;
2017 mem_cgroup_events(pos, MEMCG_HIGH, 1);
2018 try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
2019 } while ((pos = parent_mem_cgroup(pos)));
2020
2021 css_put(&memcg->css); 1982 css_put(&memcg->css);
2022 current->memcg_nr_pages_over_high = 0; 1983 current->memcg_nr_pages_over_high = 0;
2023} 1984}
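
The over-high handling is split here: reclaim_high() owns the hierarchical walk, and both high_work_func() and mem_cgroup_handle_over_high() call into it. The loop visits the charged memcg and every ancestor, skipping any level already at or below its high mark. A minimal userspace sketch of that parent walk; struct cg, its fields, and the toy reclaim() are invented stand-ins for the memcg counters and for try_to_free_mem_cgroup_pages():

    #include <stdio.h>

    struct cg {
        const char *name;
        unsigned long usage;            /* pages currently charged */
        unsigned long high;             /* soft "high" limit */
        struct cg *parent;              /* NULL at the root */
    };

    /* pretend reclaim: trim usage back toward the high mark */
    static void reclaim(struct cg *c, unsigned long nr_pages)
    {
        unsigned long excess = c->usage - c->high;
        unsigned long freed = excess < nr_pages ? excess : nr_pages;

        c->usage -= freed;
        printf("reclaimed %lu pages from %s (usage now %lu)\n",
               freed, c->name, c->usage);
    }

    /* same shape as the new reclaim_high(): walk up, skip levels under high */
    static void reclaim_high(struct cg *c, unsigned long nr_pages)
    {
        do {
            if (c->usage <= c->high)
                continue;
            reclaim(c, nr_pages);
        } while ((c = c->parent));
    }

    int main(void)
    {
        struct cg root = { "root", 900, 10000, NULL };
        struct cg mid  = { "mid",  800,   500, &root };
        struct cg leaf = { "leaf", 300,   100, &mid };

        reclaim_high(&leaf, 64);        /* reclaims from leaf and mid, skips root */
        return 0;
    }
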
@@ -2039,11 +2000,11 @@ retry:
2039 if (consume_stock(memcg, nr_pages)) 2000 if (consume_stock(memcg, nr_pages))
2040 return 0; 2001 return 0;
2041 2002
2042 if (!do_swap_account || 2003 if (!do_memsw_account() ||
2043 page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2004 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2044 if (page_counter_try_charge(&memcg->memory, batch, &counter)) 2005 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2045 goto done_restock; 2006 goto done_restock;
2046 if (do_swap_account) 2007 if (do_memsw_account())
2047 page_counter_uncharge(&memcg->memsw, batch); 2008 page_counter_uncharge(&memcg->memsw, batch);
2048 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2009 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2049 } else { 2010 } else {
@@ -2130,7 +2091,7 @@ force:
2130 * temporarily by force charging it. 2091 * temporarily by force charging it.
2131 */ 2092 */
2132 page_counter_charge(&memcg->memory, nr_pages); 2093 page_counter_charge(&memcg->memory, nr_pages);
2133 if (do_swap_account) 2094 if (do_memsw_account())
2134 page_counter_charge(&memcg->memsw, nr_pages); 2095 page_counter_charge(&memcg->memsw, nr_pages);
2135 css_get_many(&memcg->css, nr_pages); 2096 css_get_many(&memcg->css, nr_pages);
2136 2097
@@ -2152,6 +2113,11 @@ done_restock:
2152 */ 2113 */
2153 do { 2114 do {
2154 if (page_counter_read(&memcg->memory) > memcg->high) { 2115 if (page_counter_read(&memcg->memory) > memcg->high) {
2116 /* Don't bother a random interrupted task */
2117 if (in_interrupt()) {
2118 schedule_work(&memcg->high_work);
2119 break;
2120 }
2155 current->memcg_nr_pages_over_high += batch; 2121 current->memcg_nr_pages_over_high += batch;
2156 set_notify_resume(current); 2122 set_notify_resume(current);
2157 break; 2123 break;
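
The done_restock hunk above adds a context check: when the charge overshoots the high mark from interrupt context, the interrupted task is not the culprit ("Don't bother a random interrupted task"), so the code only queues memcg->high_work and lets the worker run reclaim_high() later, while process-context callers keep the existing return-to-userspace hook. A rough stand-alone illustration of that defer-or-handle-directly split; the one-slot queue and the schedule_work()/run_pending_work() names here are toys, not the kernel workqueue API:

    #include <stdbool.h>
    #include <stdio.h>

    /* stand-in for struct work_struct: a function pointer plus its argument */
    struct work {
        void (*func)(void *);
        void *arg;
    };

    static struct work *pending;            /* one-slot "workqueue" */

    static void schedule_work(struct work *w)
    {
        pending = w;                        /* just remember it for later */
    }

    static void run_pending_work(void)      /* the "worker" draining the queue */
    {
        if (pending) {
            struct work *w = pending;

            pending = NULL;
            w->func(w->arg);
        }
    }

    static void reclaim(void *arg)
    {
        printf("reclaiming on behalf of %s\n", (const char *)arg);
    }

    /*
     * Mirrors the done_restock hunk: an "interrupt" caller only queues the
     * work, a process-context caller handles the excess directly.
     */
    static void over_high(bool in_interrupt, struct work *w)
    {
        if (in_interrupt) {
            schedule_work(w);
            return;
        }
        reclaim(w->arg);
    }

    int main(void)
    {
        struct work w = { reclaim, "memcg A" };

        over_high(true, &w);    /* from "interrupt": deferred */
        run_pending_work();     /* later, from process context */
        over_high(false, &w);   /* direct path */
        return 0;
    }
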
@@ -2167,7 +2133,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2167 return; 2133 return;
2168 2134
2169 page_counter_uncharge(&memcg->memory, nr_pages); 2135 page_counter_uncharge(&memcg->memory, nr_pages);
2170 if (do_swap_account) 2136 if (do_memsw_account())
2171 page_counter_uncharge(&memcg->memsw, nr_pages); 2137 page_counter_uncharge(&memcg->memsw, nr_pages);
2172 2138
2173 css_put_many(&memcg->css, nr_pages); 2139 css_put_many(&memcg->css, nr_pages);
@@ -2356,7 +2322,7 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2356 * Can't be called in interrupt context or from kernel threads. 2322 * Can't be called in interrupt context or from kernel threads.
2357 * This function needs to be called with rcu_read_lock() held. 2323 * This function needs to be called with rcu_read_lock() held.
2358 */ 2324 */
2359struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) 2325struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
2360{ 2326{
2361 struct mem_cgroup *memcg; 2327 struct mem_cgroup *memcg;
2362 struct kmem_cache *memcg_cachep; 2328 struct kmem_cache *memcg_cachep;
@@ -2364,6 +2330,12 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2364 2330
2365 VM_BUG_ON(!is_root_cache(cachep)); 2331 VM_BUG_ON(!is_root_cache(cachep));
2366 2332
2333 if (cachep->flags & SLAB_ACCOUNT)
2334 gfp |= __GFP_ACCOUNT;
2335
2336 if (!(gfp & __GFP_ACCOUNT))
2337 return cachep;
2338
2367 if (current->memcg_kmem_skip_account) 2339 if (current->memcg_kmem_skip_account)
2368 return cachep; 2340 return cachep;
2369 2341
@@ -2447,7 +2419,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
2447 2419
2448 page_counter_uncharge(&memcg->kmem, nr_pages); 2420 page_counter_uncharge(&memcg->kmem, nr_pages);
2449 page_counter_uncharge(&memcg->memory, nr_pages); 2421 page_counter_uncharge(&memcg->memory, nr_pages);
2450 if (do_swap_account) 2422 if (do_memsw_account())
2451 page_counter_uncharge(&memcg->memsw, nr_pages); 2423 page_counter_uncharge(&memcg->memsw, nr_pages);
2452 2424
2453 page->mem_cgroup = NULL; 2425 page->mem_cgroup = NULL;
@@ -2935,7 +2907,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
2935 err = page_counter_limit(&memcg->kmem, nr_pages); 2907 err = page_counter_limit(&memcg->kmem, nr_pages);
2936 VM_BUG_ON(err); 2908 VM_BUG_ON(err);
2937 2909
2938 static_key_slow_inc(&memcg_kmem_enabled_key); 2910 static_branch_inc(&memcg_kmem_enabled_key);
2939 /* 2911 /*
2940 * A memory cgroup is considered kmem-active as soon as it gets 2912 * A memory cgroup is considered kmem-active as soon as it gets
2941 * kmemcg_id. Setting the id after enabling static branching will 2913 * kmemcg_id. Setting the id after enabling static branching will
@@ -3162,7 +3134,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
3162 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 3134 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3163 3135
3164 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3136 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
3165 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 3137 if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
3166 continue; 3138 continue;
3167 seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i], 3139 seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
3168 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 3140 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
@@ -3184,14 +3156,14 @@ static int memcg_stat_show(struct seq_file *m, void *v)
3184 } 3156 }
3185 seq_printf(m, "hierarchical_memory_limit %llu\n", 3157 seq_printf(m, "hierarchical_memory_limit %llu\n",
3186 (u64)memory * PAGE_SIZE); 3158 (u64)memory * PAGE_SIZE);
3187 if (do_swap_account) 3159 if (do_memsw_account())
3188 seq_printf(m, "hierarchical_memsw_limit %llu\n", 3160 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3189 (u64)memsw * PAGE_SIZE); 3161 (u64)memsw * PAGE_SIZE);
3190 3162
3191 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3163 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
3192 unsigned long long val = 0; 3164 unsigned long long val = 0;
3193 3165
3194 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 3166 if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
3195 continue; 3167 continue;
3196 for_each_mem_cgroup_tree(mi, memcg) 3168 for_each_mem_cgroup_tree(mi, memcg)
3197 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 3169 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
@@ -3322,7 +3294,7 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3322{ 3294{
3323 while (memcg) { 3295 while (memcg) {
3324 __mem_cgroup_threshold(memcg, false); 3296 __mem_cgroup_threshold(memcg, false);
3325 if (do_swap_account) 3297 if (do_memsw_account())
3326 __mem_cgroup_threshold(memcg, true); 3298 __mem_cgroup_threshold(memcg, true);
3327 3299
3328 memcg = parent_mem_cgroup(memcg); 3300 memcg = parent_mem_cgroup(memcg);
@@ -3621,7 +3593,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
3621 if (ret) 3593 if (ret)
3622 return ret; 3594 return ret;
3623 3595
3624 return mem_cgroup_sockets_init(memcg, ss); 3596 return tcp_init_cgroup(memcg, ss);
3625} 3597}
3626 3598
3627static void memcg_deactivate_kmem(struct mem_cgroup *memcg) 3599static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
@@ -3674,10 +3646,10 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
3674{ 3646{
3675 if (memcg->kmem_acct_activated) { 3647 if (memcg->kmem_acct_activated) {
3676 memcg_destroy_kmem_caches(memcg); 3648 memcg_destroy_kmem_caches(memcg);
3677 static_key_slow_dec(&memcg_kmem_enabled_key); 3649 static_branch_dec(&memcg_kmem_enabled_key);
3678 WARN_ON(page_counter_read(&memcg->kmem)); 3650 WARN_ON(page_counter_read(&memcg->kmem));
3679 } 3651 }
3680 mem_cgroup_sockets_destroy(memcg); 3652 tcp_destroy_cgroup(memcg);
3681} 3653}
3682#else 3654#else
3683static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 3655static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -4196,6 +4168,8 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4196{ 4168{
4197 int node; 4169 int node;
4198 4170
4171 cancel_work_sync(&memcg->high_work);
4172
4199 mem_cgroup_remove_from_trees(memcg); 4173 mem_cgroup_remove_from_trees(memcg);
4200 4174
4201 for_each_node(node) 4175 for_each_node(node)
@@ -4206,17 +4180,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4206 kfree(memcg); 4180 kfree(memcg);
4207} 4181}
4208 4182
4209/*
4210 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
4211 */
4212struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4213{
4214 if (!memcg->memory.parent)
4215 return NULL;
4216 return mem_cgroup_from_counter(memcg->memory.parent, memory);
4217}
4218EXPORT_SYMBOL(parent_mem_cgroup);
4219
4220static struct cgroup_subsys_state * __ref 4183static struct cgroup_subsys_state * __ref
4221mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 4184mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4222{ 4185{
@@ -4235,7 +4198,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4235 /* root ? */ 4198 /* root ? */
4236 if (parent_css == NULL) { 4199 if (parent_css == NULL) {
4237 root_mem_cgroup = memcg; 4200 root_mem_cgroup = memcg;
4238 mem_cgroup_root_css = &memcg->css;
4239 page_counter_init(&memcg->memory, NULL); 4201 page_counter_init(&memcg->memory, NULL);
4240 memcg->high = PAGE_COUNTER_MAX; 4202 memcg->high = PAGE_COUNTER_MAX;
4241 memcg->soft_limit = PAGE_COUNTER_MAX; 4203 memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4243,6 +4205,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4243 page_counter_init(&memcg->kmem, NULL); 4205 page_counter_init(&memcg->kmem, NULL);
4244 } 4206 }
4245 4207
4208 INIT_WORK(&memcg->high_work, high_work_func);
4246 memcg->last_scanned_node = MAX_NUMNODES; 4209 memcg->last_scanned_node = MAX_NUMNODES;
4247 INIT_LIST_HEAD(&memcg->oom_notify); 4210 INIT_LIST_HEAD(&memcg->oom_notify);
4248 memcg->move_charge_at_immigrate = 0; 4211 memcg->move_charge_at_immigrate = 0;
@@ -4257,6 +4220,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4257#ifdef CONFIG_CGROUP_WRITEBACK 4220#ifdef CONFIG_CGROUP_WRITEBACK
4258 INIT_LIST_HEAD(&memcg->cgwb_list); 4221 INIT_LIST_HEAD(&memcg->cgwb_list);
4259#endif 4222#endif
4223#ifdef CONFIG_INET
4224 memcg->socket_pressure = jiffies;
4225#endif
4260 return &memcg->css; 4226 return &memcg->css;
4261 4227
4262free_out: 4228free_out:
@@ -4314,6 +4280,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
4314 if (ret) 4280 if (ret)
4315 return ret; 4281 return ret;
4316 4282
4283#ifdef CONFIG_INET
4284 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4285 static_branch_inc(&memcg_sockets_enabled_key);
4286#endif
4287
4317 /* 4288 /*
4318 * Make sure the memcg is initialized: mem_cgroup_iter() 4289 * Make sure the memcg is initialized: mem_cgroup_iter()
4319 * orders reading memcg->initialized against its callers 4290 * orders reading memcg->initialized against its callers
@@ -4360,6 +4331,10 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
4360 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4331 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4361 4332
4362 memcg_destroy_kmem(memcg); 4333 memcg_destroy_kmem(memcg);
4334#ifdef CONFIG_INET
4335 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4336 static_branch_dec(&memcg_sockets_enabled_key);
4337#endif
4363 __mem_cgroup_free(memcg); 4338 __mem_cgroup_free(memcg);
4364} 4339}
4365 4340
@@ -4476,7 +4451,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4476 * we call find_get_page() with swapper_space directly. 4451 * we call find_get_page() with swapper_space directly.
4477 */ 4452 */
4478 page = find_get_page(swap_address_space(ent), ent.val); 4453 page = find_get_page(swap_address_space(ent), ent.val);
4479 if (do_swap_account) 4454 if (do_memsw_account())
4480 entry->val = ent.val; 4455 entry->val = ent.val;
4481 4456
4482 return page; 4457 return page;
@@ -4511,7 +4486,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4511 page = find_get_entry(mapping, pgoff); 4486 page = find_get_entry(mapping, pgoff);
4512 if (radix_tree_exceptional_entry(page)) { 4487 if (radix_tree_exceptional_entry(page)) {
4513 swp_entry_t swp = radix_to_swp_entry(page); 4488 swp_entry_t swp = radix_to_swp_entry(page);
4514 if (do_swap_account) 4489 if (do_memsw_account())
4515 *entry = swp; 4490 *entry = swp;
4516 page = find_get_page(swap_address_space(swp), swp.val); 4491 page = find_get_page(swap_address_space(swp), swp.val);
4517 } 4492 }
@@ -5304,7 +5279,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5304 if (page->mem_cgroup) 5279 if (page->mem_cgroup)
5305 goto out; 5280 goto out;
5306 5281
5307 if (do_swap_account) { 5282 if (do_memsw_account()) {
5308 swp_entry_t ent = { .val = page_private(page), }; 5283 swp_entry_t ent = { .val = page_private(page), };
5309 unsigned short id = lookup_swap_cgroup_id(ent); 5284 unsigned short id = lookup_swap_cgroup_id(ent);
5310 5285
@@ -5378,7 +5353,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5378 memcg_check_events(memcg, page); 5353 memcg_check_events(memcg, page);
5379 local_irq_enable(); 5354 local_irq_enable();
5380 5355
5381 if (do_swap_account && PageSwapCache(page)) { 5356 if (do_memsw_account() && PageSwapCache(page)) {
5382 swp_entry_t entry = { .val = page_private(page) }; 5357 swp_entry_t entry = { .val = page_private(page) };
5383 /* 5358 /*
5384 * The swap entry might not get freed for a long time, 5359 * The swap entry might not get freed for a long time,
@@ -5427,7 +5402,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
5427 5402
5428 if (!mem_cgroup_is_root(memcg)) { 5403 if (!mem_cgroup_is_root(memcg)) {
5429 page_counter_uncharge(&memcg->memory, nr_pages); 5404 page_counter_uncharge(&memcg->memory, nr_pages);
5430 if (do_swap_account) 5405 if (do_memsw_account())
5431 page_counter_uncharge(&memcg->memsw, nr_pages); 5406 page_counter_uncharge(&memcg->memsw, nr_pages);
5432 memcg_oom_recover(memcg); 5407 memcg_oom_recover(memcg);
5433 } 5408 }
@@ -5580,6 +5555,121 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
5580 commit_charge(newpage, memcg, true); 5555 commit_charge(newpage, memcg, true);
5581} 5556}
5582 5557
5558#ifdef CONFIG_INET
5559
5560DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
5561EXPORT_SYMBOL(memcg_sockets_enabled_key);
5562
5563void sock_update_memcg(struct sock *sk)
5564{
5565 struct mem_cgroup *memcg;
5566
5567 /* Socket cloning can throw us here with sk_cgrp already
 5568 * filled. It won't, however, necessarily happen from
5569 * process context. So the test for root memcg given
5570 * the current task's memcg won't help us in this case.
5571 *
5572 * Respecting the original socket's memcg is a better
5573 * decision in this case.
5574 */
5575 if (sk->sk_memcg) {
5576 BUG_ON(mem_cgroup_is_root(sk->sk_memcg));
5577 css_get(&sk->sk_memcg->css);
5578 return;
5579 }
5580
5581 rcu_read_lock();
5582 memcg = mem_cgroup_from_task(current);
5583 if (memcg == root_mem_cgroup)
5584 goto out;
5585#ifdef CONFIG_MEMCG_KMEM
5586 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active)
5587 goto out;
5588#endif
5589 if (css_tryget_online(&memcg->css))
5590 sk->sk_memcg = memcg;
5591out:
5592 rcu_read_unlock();
5593}
5594EXPORT_SYMBOL(sock_update_memcg);
5595
5596void sock_release_memcg(struct sock *sk)
5597{
5598 WARN_ON(!sk->sk_memcg);
5599 css_put(&sk->sk_memcg->css);
5600}
5601
5602/**
5603 * mem_cgroup_charge_skmem - charge socket memory
5604 * @memcg: memcg to charge
5605 * @nr_pages: number of pages to charge
5606 *
5607 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
5608 * @memcg's configured limit, %false if the charge had to be forced.
5609 */
5610bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
5611{
5612 gfp_t gfp_mask = GFP_KERNEL;
5613
5614#ifdef CONFIG_MEMCG_KMEM
5615 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
5616 struct page_counter *counter;
5617
5618 if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated,
5619 nr_pages, &counter)) {
5620 memcg->tcp_mem.memory_pressure = 0;
5621 return true;
5622 }
5623 page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages);
5624 memcg->tcp_mem.memory_pressure = 1;
5625 return false;
5626 }
5627#endif
5628 /* Don't block in the packet receive path */
5629 if (in_softirq())
5630 gfp_mask = GFP_NOWAIT;
5631
5632 if (try_charge(memcg, gfp_mask, nr_pages) == 0)
5633 return true;
5634
5635 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
5636 return false;
5637}
5638
5639/**
5640 * mem_cgroup_uncharge_skmem - uncharge socket memory
 5641 * @memcg: memcg to uncharge
 5642 * @nr_pages: number of pages to uncharge
5643 */
5644void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
5645{
5646#ifdef CONFIG_MEMCG_KMEM
5647 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
5648 page_counter_uncharge(&memcg->tcp_mem.memory_allocated,
5649 nr_pages);
5650 return;
5651 }
5652#endif
5653 page_counter_uncharge(&memcg->memory, nr_pages);
5654 css_put_many(&memcg->css, nr_pages);
5655}
5656
5657#endif /* CONFIG_INET */
5658
5659static int __init cgroup_memory(char *s)
5660{
5661 char *token;
5662
5663 while ((token = strsep(&s, ",")) != NULL) {
5664 if (!*token)
5665 continue;
5666 if (!strcmp(token, "nosocket"))
5667 cgroup_memory_nosocket = true;
5668 }
5669 return 0;
5670}
5671__setup("cgroup.memory=", cgroup_memory);
5672
5583/* 5673/*
5584 * subsys_initcall() for memory controller. 5674 * subsys_initcall() for memory controller.
5585 * 5675 *
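
Per its kernel-doc, mem_cgroup_charge_skmem() is try-then-force: attempt a bounded charge first, and if that fails, charge anyway so the socket keeps working but return false so the caller can throttle (the legacy tcp_mem branch sets memory_pressure, the unified branch retries with __GFP_NOFAIL). A self-contained sketch of that pattern against a plain counter; struct counter, its limit field, and charge_skmem() here are illustrative names, not the kernel structures:

    #include <stdbool.h>
    #include <stdio.h>

    struct counter {
        unsigned long usage;
        unsigned long limit;
    };

    /* bounded charge: succeed only if the new usage stays under the limit */
    static bool try_charge(struct counter *c, unsigned long nr)
    {
        if (c->usage + nr > c->limit)
            return false;
        c->usage += nr;
        return true;
    }

    /* unconditional charge, mirroring the forced / __GFP_NOFAIL path */
    static void force_charge(struct counter *c, unsigned long nr)
    {
        c->usage += nr;
    }

    /*
     * Returns true if the charge fit within the limit, false if it had to
     * be forced -- the caller is expected to back off ("memory pressure")
     * when it sees false.
     */
    static bool charge_skmem(struct counter *c, unsigned long nr)
    {
        if (try_charge(c, nr))
            return true;
        force_charge(c, nr);
        return false;
    }

    int main(void)
    {
        struct counter tcp = { .usage = 90, .limit = 100 };

        printf("charge 8 -> %s (usage %lu)\n",
               charge_skmem(&tcp, 8) ? "ok" : "forced", tcp.usage);
        printf("charge 8 -> %s (usage %lu)\n",
               charge_skmem(&tcp, 8) ? "ok" : "forced", tcp.usage);
        return 0;
    }
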
@@ -5635,7 +5725,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5635 VM_BUG_ON_PAGE(PageLRU(page), page); 5725 VM_BUG_ON_PAGE(PageLRU(page), page);
5636 VM_BUG_ON_PAGE(page_count(page), page); 5726 VM_BUG_ON_PAGE(page_count(page), page);
5637 5727
5638 if (!do_swap_account) 5728 if (!do_memsw_account())
5639 return; 5729 return;
5640 5730
5641 memcg = page->mem_cgroup; 5731 memcg = page->mem_cgroup;
@@ -5675,7 +5765,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
5675 struct mem_cgroup *memcg; 5765 struct mem_cgroup *memcg;
5676 unsigned short id; 5766 unsigned short id;
5677 5767
5678 if (!do_swap_account) 5768 if (!do_memsw_account())
5679 return; 5769 return;
5680 5770
5681 id = swap_cgroup_record(entry, 0); 5771 id = swap_cgroup_record(entry, 0);
diff --git a/mm/memory.c b/mm/memory.c
index c387430f06c3..d4e4d37c1989 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -832,10 +832,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
832 } else if (is_migration_entry(entry)) { 832 } else if (is_migration_entry(entry)) {
833 page = migration_entry_to_page(entry); 833 page = migration_entry_to_page(entry);
834 834
835 if (PageAnon(page)) 835 rss[mm_counter(page)]++;
836 rss[MM_ANONPAGES]++;
837 else
838 rss[MM_FILEPAGES]++;
839 836
840 if (is_write_migration_entry(entry) && 837 if (is_write_migration_entry(entry) &&
841 is_cow_mapping(vm_flags)) { 838 is_cow_mapping(vm_flags)) {
@@ -874,10 +871,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
874 if (page) { 871 if (page) {
875 get_page(page); 872 get_page(page);
876 page_dup_rmap(page); 873 page_dup_rmap(page);
877 if (PageAnon(page)) 874 rss[mm_counter(page)]++;
878 rss[MM_ANONPAGES]++;
879 else
880 rss[MM_FILEPAGES]++;
881 } 875 }
882 876
883out_set_pte: 877out_set_pte:
@@ -1113,9 +1107,8 @@ again:
1113 tlb_remove_tlb_entry(tlb, pte, addr); 1107 tlb_remove_tlb_entry(tlb, pte, addr);
1114 if (unlikely(!page)) 1108 if (unlikely(!page))
1115 continue; 1109 continue;
1116 if (PageAnon(page)) 1110
1117 rss[MM_ANONPAGES]--; 1111 if (!PageAnon(page)) {
1118 else {
1119 if (pte_dirty(ptent)) { 1112 if (pte_dirty(ptent)) {
1120 force_flush = 1; 1113 force_flush = 1;
1121 set_page_dirty(page); 1114 set_page_dirty(page);
@@ -1123,8 +1116,8 @@ again:
1123 if (pte_young(ptent) && 1116 if (pte_young(ptent) &&
1124 likely(!(vma->vm_flags & VM_SEQ_READ))) 1117 likely(!(vma->vm_flags & VM_SEQ_READ)))
1125 mark_page_accessed(page); 1118 mark_page_accessed(page);
1126 rss[MM_FILEPAGES]--;
1127 } 1119 }
1120 rss[mm_counter(page)]--;
1128 page_remove_rmap(page); 1121 page_remove_rmap(page);
1129 if (unlikely(page_mapcount(page) < 0)) 1122 if (unlikely(page_mapcount(page) < 0))
1130 print_bad_pte(vma, addr, ptent, page); 1123 print_bad_pte(vma, addr, ptent, page);
@@ -1146,11 +1139,7 @@ again:
1146 struct page *page; 1139 struct page *page;
1147 1140
1148 page = migration_entry_to_page(entry); 1141 page = migration_entry_to_page(entry);
1149 1142 rss[mm_counter(page)]--;
1150 if (PageAnon(page))
1151 rss[MM_ANONPAGES]--;
1152 else
1153 rss[MM_FILEPAGES]--;
1154 } 1143 }
1155 if (unlikely(!free_swap_and_cache(entry))) 1144 if (unlikely(!free_swap_and_cache(entry)))
1156 print_bad_pte(vma, addr, ptent, NULL); 1145 print_bad_pte(vma, addr, ptent, NULL);
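
The copy_one_pte() and zap hunks above collapse the repeated PageAnon() branches into a single rss[mm_counter(page)] index. The definition of mm_counter() is not part of this diff, but the oom_kill.c hunk further down prints a new MM_SHMEMPAGES counter, so presumably shmem pages now get their own bucket. A small sketch of that kind of classification helper under that assumption; the enum values and the anon/swap_backed fields are illustrative only:

    #include <stdbool.h>
    #include <stdio.h>

    enum { MM_FILEPAGES, MM_ANONPAGES, MM_SWAPENTS, MM_SHMEMPAGES, NR_MM_COUNTERS };

    struct page {
        bool anon;
        bool swap_backed;       /* shmem/tmpfs: swap backed but not anon */
    };

    /* one classification point instead of open-coded if/else at every caller */
    static int mm_counter(const struct page *page)
    {
        if (page->anon)
            return MM_ANONPAGES;
        if (page->swap_backed)
            return MM_SHMEMPAGES;
        return MM_FILEPAGES;
    }

    int main(void)
    {
        long rss[NR_MM_COUNTERS] = { 0 };
        struct page file = { false, false }, anon = { true, false }, shmem = { false, true };

        rss[mm_counter(&file)]++;
        rss[mm_counter(&anon)]++;
        rss[mm_counter(&shmem)]++;

        printf("file=%ld anon=%ld shmem=%ld\n",
               rss[MM_FILEPAGES], rss[MM_ANONPAGES], rss[MM_SHMEMPAGES]);
        return 0;
    }
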
@@ -1460,7 +1449,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1460 1449
1461 /* Ok, finally just insert the thing.. */ 1450 /* Ok, finally just insert the thing.. */
1462 get_page(page); 1451 get_page(page);
1463 inc_mm_counter_fast(mm, MM_FILEPAGES); 1452 inc_mm_counter_fast(mm, mm_counter_file(page));
1464 page_add_file_rmap(page); 1453 page_add_file_rmap(page);
1465 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1454 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1466 1455
@@ -1949,6 +1938,20 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
1949 copy_user_highpage(dst, src, va, vma); 1938 copy_user_highpage(dst, src, va, vma);
1950} 1939}
1951 1940
1941static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
1942{
1943 struct file *vm_file = vma->vm_file;
1944
1945 if (vm_file)
1946 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
1947
1948 /*
1949 * Special mappings (e.g. VDSO) do not have any file so fake
1950 * a default GFP_KERNEL for them.
1951 */
1952 return GFP_KERNEL;
1953}
1954
1952/* 1955/*
1953 * Notify the address space that the page is about to become writable so that 1956 * Notify the address space that the page is about to become writable so that
1954 * it can prohibit this or wait for the page to get into an appropriate state. 1957 * it can prohibit this or wait for the page to get into an appropriate state.
@@ -1964,6 +1967,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
1964 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 1967 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
1965 vmf.pgoff = page->index; 1968 vmf.pgoff = page->index;
1966 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; 1969 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
1970 vmf.gfp_mask = __get_fault_gfp_mask(vma);
1967 vmf.page = page; 1971 vmf.page = page;
1968 vmf.cow_page = NULL; 1972 vmf.cow_page = NULL;
1969 1973
@@ -2097,7 +2101,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2097 if (likely(pte_same(*page_table, orig_pte))) { 2101 if (likely(pte_same(*page_table, orig_pte))) {
2098 if (old_page) { 2102 if (old_page) {
2099 if (!PageAnon(old_page)) { 2103 if (!PageAnon(old_page)) {
2100 dec_mm_counter_fast(mm, MM_FILEPAGES); 2104 dec_mm_counter_fast(mm,
2105 mm_counter_file(old_page));
2101 inc_mm_counter_fast(mm, MM_ANONPAGES); 2106 inc_mm_counter_fast(mm, MM_ANONPAGES);
2102 } 2107 }
2103 } else { 2108 } else {
@@ -2767,6 +2772,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
2767 vmf.pgoff = pgoff; 2772 vmf.pgoff = pgoff;
2768 vmf.flags = flags; 2773 vmf.flags = flags;
2769 vmf.page = NULL; 2774 vmf.page = NULL;
2775 vmf.gfp_mask = __get_fault_gfp_mask(vma);
2770 vmf.cow_page = cow_page; 2776 vmf.cow_page = cow_page;
2771 2777
2772 ret = vma->vm_ops->fault(vma, &vmf); 2778 ret = vma->vm_ops->fault(vma, &vmf);
@@ -2820,7 +2826,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
2820 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2826 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2821 page_add_new_anon_rmap(page, vma, address); 2827 page_add_new_anon_rmap(page, vma, address);
2822 } else { 2828 } else {
2823 inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); 2829 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
2824 page_add_file_rmap(page); 2830 page_add_file_rmap(page);
2825 } 2831 }
2826 set_pte_at(vma->vm_mm, address, pte, entry); 2832 set_pte_at(vma->vm_mm, address, pte, entry);
@@ -2933,6 +2939,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
2933 vmf.pgoff = pgoff; 2939 vmf.pgoff = pgoff;
2934 vmf.max_pgoff = max_pgoff; 2940 vmf.max_pgoff = max_pgoff;
2935 vmf.flags = flags; 2941 vmf.flags = flags;
2942 vmf.gfp_mask = __get_fault_gfp_mask(vma);
2936 vma->vm_ops->map_pages(vma, &vmf); 2943 vma->vm_ops->map_pages(vma, &vmf);
2937} 2944}
2938 2945
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a042a9d537bb..92f95952692b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -131,7 +131,8 @@ static struct resource *register_memory_resource(u64 start, u64 size)
131{ 131{
132 struct resource *res; 132 struct resource *res;
133 res = kzalloc(sizeof(struct resource), GFP_KERNEL); 133 res = kzalloc(sizeof(struct resource), GFP_KERNEL);
134 BUG_ON(!res); 134 if (!res)
135 return ERR_PTR(-ENOMEM);
135 136
136 res->name = "System RAM"; 137 res->name = "System RAM";
137 res->start = start; 138 res->start = start;
@@ -140,7 +141,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
140 if (request_resource(&iomem_resource, res) < 0) { 141 if (request_resource(&iomem_resource, res) < 0) {
141 pr_debug("System RAM resource %pR cannot be added\n", res); 142 pr_debug("System RAM resource %pR cannot be added\n", res);
142 kfree(res); 143 kfree(res);
143 res = NULL; 144 return ERR_PTR(-EEXIST);
144 } 145 }
145 return res; 146 return res;
146} 147}
@@ -1312,8 +1313,8 @@ int __ref add_memory(int nid, u64 start, u64 size)
1312 int ret; 1313 int ret;
1313 1314
1314 res = register_memory_resource(start, size); 1315 res = register_memory_resource(start, size);
1315 if (!res) 1316 if (IS_ERR(res))
1316 return -EEXIST; 1317 return PTR_ERR(res);
1317 1318
1318 ret = add_memory_resource(nid, res); 1319 ret = add_memory_resource(nid, res);
1319 if (ret < 0) 1320 if (ret < 0)
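
register_memory_resource() now reports failures through the ERR_PTR()/IS_ERR()/PTR_ERR() convention instead of BUG_ON() or a bare NULL, which lets add_memory() forward the precise errno (-ENOMEM vs -EEXIST). The kernel helpers encode a small negative errno in an otherwise invalid pointer value; a userspace re-implementation of the idea (simplified, not the kernel's actual macros) might look like this:

    #include <stdio.h>
    #include <stdlib.h>
    #include <errno.h>
    #include <stdbool.h>

    #define MAX_ERRNO 4095

    /* encode a small negative errno in the top of the address space */
    static void *ERR_PTR(long error)        { return (void *)error; }
    static long  PTR_ERR(const void *ptr)   { return (long)ptr; }
    static bool  IS_ERR(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    struct resource { const char *name; };

    /* the two bools simulate the kzalloc() and request_resource() failures */
    static struct resource *register_resource(bool oom, bool conflict)
    {
        struct resource *res;

        if (oom)
            return ERR_PTR(-ENOMEM);
        res = malloc(sizeof(*res));
        if (!res)
            return ERR_PTR(-ENOMEM);
        if (conflict) {
            free(res);
            return ERR_PTR(-EEXIST);
        }
        res->name = "System RAM";
        return res;
    }

    int main(void)
    {
        struct resource *res = register_resource(false, true);

        if (IS_ERR(res)) {              /* caller sees the precise errno */
            printf("failed: %ld\n", PTR_ERR(res));
            return 1;
        }
        printf("ok: %s\n", res->name);
        free(res);
        return 0;
    }
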
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 87a177917cb2..d8caff071a30 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2142,12 +2142,14 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2142 * 2142 *
2143 * Remember policies even when nobody has shared memory mapped. 2143 * Remember policies even when nobody has shared memory mapped.
2144 * The policies are kept in Red-Black tree linked from the inode. 2144 * The policies are kept in Red-Black tree linked from the inode.
2145 * They are protected by the sp->lock spinlock, which should be held 2145 * They are protected by the sp->lock rwlock, which should be held
2146 * for any accesses to the tree. 2146 * for any accesses to the tree.
2147 */ 2147 */
2148 2148
2149/* lookup first element intersecting start-end */ 2149/*
2150/* Caller holds sp->lock */ 2150 * lookup first element intersecting start-end. Caller holds sp->lock for
2151 * reading or for writing
2152 */
2151static struct sp_node * 2153static struct sp_node *
2152sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 2154sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2153{ 2155{
@@ -2178,8 +2180,10 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2178 return rb_entry(n, struct sp_node, nd); 2180 return rb_entry(n, struct sp_node, nd);
2179} 2181}
2180 2182
2181/* Insert a new shared policy into the list. */ 2183/*
2182/* Caller holds sp->lock */ 2184 * Insert a new shared policy into the list. Caller holds sp->lock for
2185 * writing.
2186 */
2183static void sp_insert(struct shared_policy *sp, struct sp_node *new) 2187static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2184{ 2188{
2185 struct rb_node **p = &sp->root.rb_node; 2189 struct rb_node **p = &sp->root.rb_node;
@@ -2211,13 +2215,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2211 2215
2212 if (!sp->root.rb_node) 2216 if (!sp->root.rb_node)
2213 return NULL; 2217 return NULL;
2214 spin_lock(&sp->lock); 2218 read_lock(&sp->lock);
2215 sn = sp_lookup(sp, idx, idx+1); 2219 sn = sp_lookup(sp, idx, idx+1);
2216 if (sn) { 2220 if (sn) {
2217 mpol_get(sn->policy); 2221 mpol_get(sn->policy);
2218 pol = sn->policy; 2222 pol = sn->policy;
2219 } 2223 }
2220 spin_unlock(&sp->lock); 2224 read_unlock(&sp->lock);
2221 return pol; 2225 return pol;
2222} 2226}
2223 2227
@@ -2360,7 +2364,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2360 int ret = 0; 2364 int ret = 0;
2361 2365
2362restart: 2366restart:
2363 spin_lock(&sp->lock); 2367 write_lock(&sp->lock);
2364 n = sp_lookup(sp, start, end); 2368 n = sp_lookup(sp, start, end);
2365 /* Take care of old policies in the same range. */ 2369 /* Take care of old policies in the same range. */
2366 while (n && n->start < end) { 2370 while (n && n->start < end) {
@@ -2393,7 +2397,7 @@ restart:
2393 } 2397 }
2394 if (new) 2398 if (new)
2395 sp_insert(sp, new); 2399 sp_insert(sp, new);
2396 spin_unlock(&sp->lock); 2400 write_unlock(&sp->lock);
2397 ret = 0; 2401 ret = 0;
2398 2402
2399err_out: 2403err_out:
@@ -2405,7 +2409,7 @@ err_out:
2405 return ret; 2409 return ret;
2406 2410
2407alloc_new: 2411alloc_new:
2408 spin_unlock(&sp->lock); 2412 write_unlock(&sp->lock);
2409 ret = -ENOMEM; 2413 ret = -ENOMEM;
2410 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2414 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2411 if (!n_new) 2415 if (!n_new)
@@ -2431,7 +2435,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2431 int ret; 2435 int ret;
2432 2436
2433 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 2437 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2434 spin_lock_init(&sp->lock); 2438 rwlock_init(&sp->lock);
2435 2439
2436 if (mpol) { 2440 if (mpol) {
2437 struct vm_area_struct pvma; 2441 struct vm_area_struct pvma;
@@ -2497,14 +2501,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
2497 2501
2498 if (!p->root.rb_node) 2502 if (!p->root.rb_node)
2499 return; 2503 return;
2500 spin_lock(&p->lock); 2504 write_lock(&p->lock);
2501 next = rb_first(&p->root); 2505 next = rb_first(&p->root);
2502 while (next) { 2506 while (next) {
2503 n = rb_entry(next, struct sp_node, nd); 2507 n = rb_entry(next, struct sp_node, nd);
2504 next = rb_next(&n->nd); 2508 next = rb_next(&n->nd);
2505 sp_delete(p, n); 2509 sp_delete(p, n);
2506 } 2510 }
2507 spin_unlock(&p->lock); 2511 write_unlock(&p->lock);
2508} 2512}
2509 2513
2510#ifdef CONFIG_NUMA_BALANCING 2514#ifdef CONFIG_NUMA_BALANCING
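
The shared-policy tree converts sp->lock from a spinlock to an rwlock: the hot lookup path in mpol_shared_policy_lookup() now takes it only for reading, while shared_policy_replace() and mpol_free_shared_policy() take it for writing. A minimal pthread sketch of that reader/writer split; shared_policy, lookup() and replace() here are placeholders rather than the kernel structures, and the int payload stands in for the red-black tree:

    #include <pthread.h>
    #include <stdio.h>

    struct shared_policy {
        pthread_rwlock_t lock;
        int policy;                 /* stand-in for the rb-tree payload */
    };

    /* readers may run concurrently, matching read_lock(&sp->lock) */
    static int lookup(struct shared_policy *sp)
    {
        int val;

        pthread_rwlock_rdlock(&sp->lock);
        val = sp->policy;
        pthread_rwlock_unlock(&sp->lock);
        return val;
    }

    /* writers are exclusive, matching write_lock(&sp->lock) */
    static void replace(struct shared_policy *sp, int val)
    {
        pthread_rwlock_wrlock(&sp->lock);
        sp->policy = val;
        pthread_rwlock_unlock(&sp->lock);
    }

    int main(void)
    {
        struct shared_policy sp = { .policy = 0 };

        pthread_rwlock_init(&sp.lock, NULL);    /* mirrors rwlock_init() */
        replace(&sp, 42);
        printf("policy = %d\n", lookup(&sp));
        pthread_rwlock_destroy(&sp.lock);
        return 0;
    }
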
diff --git a/mm/mlock.c b/mm/mlock.c
index 339d9e0949b6..9cb87cbc4071 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -425,7 +425,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
425 vma->vm_flags &= VM_LOCKED_CLEAR_MASK; 425 vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
426 426
427 while (start < end) { 427 while (start < end) {
428 struct page *page = NULL; 428 struct page *page;
429 unsigned int page_mask; 429 unsigned int page_mask;
430 unsigned long page_increm; 430 unsigned long page_increm;
431 struct pagevec pvec; 431 struct pagevec pvec;
diff --git a/mm/mmap.c b/mm/mmap.c
index 2ce04a649f6b..b3f00b616b81 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -58,6 +58,18 @@
58#define arch_rebalance_pgtables(addr, len) (addr) 58#define arch_rebalance_pgtables(addr, len) (addr)
59#endif 59#endif
60 60
61#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
62const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
63const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
64int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
65#endif
66#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
67const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
68const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
69int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
70#endif
71
72
61static void unmap_region(struct mm_struct *mm, 73static void unmap_region(struct mm_struct *mm,
62 struct vm_area_struct *vma, struct vm_area_struct *prev, 74 struct vm_area_struct *vma, struct vm_area_struct *prev,
63 unsigned long start, unsigned long end); 75 unsigned long start, unsigned long end);
@@ -1208,24 +1220,6 @@ none:
1208 return NULL; 1220 return NULL;
1209} 1221}
1210 1222
1211#ifdef CONFIG_PROC_FS
1212void vm_stat_account(struct mm_struct *mm, unsigned long flags,
1213 struct file *file, long pages)
1214{
1215 const unsigned long stack_flags
1216 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
1217
1218 mm->total_vm += pages;
1219
1220 if (file) {
1221 mm->shared_vm += pages;
1222 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
1223 mm->exec_vm += pages;
1224 } else if (flags & stack_flags)
1225 mm->stack_vm += pages;
1226}
1227#endif /* CONFIG_PROC_FS */
1228
1229/* 1223/*
1230 * If a hint addr is less than mmap_min_addr change hint to be as 1224 * If a hint addr is less than mmap_min_addr change hint to be as
1231 * low as possible but still greater than mmap_min_addr 1225 * low as possible but still greater than mmap_min_addr
@@ -1544,19 +1538,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1544 unsigned long charged = 0; 1538 unsigned long charged = 0;
1545 1539
1546 /* Check against address space limit. */ 1540 /* Check against address space limit. */
1547 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { 1541 if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
1548 unsigned long nr_pages; 1542 unsigned long nr_pages;
1549 1543
1550 /* 1544 /*
1551 * MAP_FIXED may remove pages of mappings that intersects with 1545 * MAP_FIXED may remove pages of mappings that intersects with
1552 * requested mapping. Account for the pages it would unmap. 1546 * requested mapping. Account for the pages it would unmap.
1553 */ 1547 */
1554 if (!(vm_flags & MAP_FIXED))
1555 return -ENOMEM;
1556
1557 nr_pages = count_vma_pages_range(mm, addr, addr + len); 1548 nr_pages = count_vma_pages_range(mm, addr, addr + len);
1558 1549
1559 if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) 1550 if (!may_expand_vm(mm, vm_flags,
1551 (len >> PAGE_SHIFT) - nr_pages))
1560 return -ENOMEM; 1552 return -ENOMEM;
1561 } 1553 }
1562 1554
@@ -1655,7 +1647,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1655out: 1647out:
1656 perf_event_mmap(vma); 1648 perf_event_mmap(vma);
1657 1649
1658 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1650 vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
1659 if (vm_flags & VM_LOCKED) { 1651 if (vm_flags & VM_LOCKED) {
1660 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || 1652 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
1661 vma == get_gate_vma(current->mm))) 1653 vma == get_gate_vma(current->mm)))
@@ -2102,7 +2094,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
2102 unsigned long new_start, actual_size; 2094 unsigned long new_start, actual_size;
2103 2095
2104 /* address space limit tests */ 2096 /* address space limit tests */
2105 if (!may_expand_vm(mm, grow)) 2097 if (!may_expand_vm(mm, vma->vm_flags, grow))
2106 return -ENOMEM; 2098 return -ENOMEM;
2107 2099
2108 /* Stack limit test */ 2100 /* Stack limit test */
@@ -2199,8 +2191,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2199 spin_lock(&mm->page_table_lock); 2191 spin_lock(&mm->page_table_lock);
2200 if (vma->vm_flags & VM_LOCKED) 2192 if (vma->vm_flags & VM_LOCKED)
2201 mm->locked_vm += grow; 2193 mm->locked_vm += grow;
2202 vm_stat_account(mm, vma->vm_flags, 2194 vm_stat_account(mm, vma->vm_flags, grow);
2203 vma->vm_file, grow);
2204 anon_vma_interval_tree_pre_update_vma(vma); 2195 anon_vma_interval_tree_pre_update_vma(vma);
2205 vma->vm_end = address; 2196 vma->vm_end = address;
2206 anon_vma_interval_tree_post_update_vma(vma); 2197 anon_vma_interval_tree_post_update_vma(vma);
@@ -2275,8 +2266,7 @@ int expand_downwards(struct vm_area_struct *vma,
2275 spin_lock(&mm->page_table_lock); 2266 spin_lock(&mm->page_table_lock);
2276 if (vma->vm_flags & VM_LOCKED) 2267 if (vma->vm_flags & VM_LOCKED)
2277 mm->locked_vm += grow; 2268 mm->locked_vm += grow;
2278 vm_stat_account(mm, vma->vm_flags, 2269 vm_stat_account(mm, vma->vm_flags, grow);
2279 vma->vm_file, grow);
2280 anon_vma_interval_tree_pre_update_vma(vma); 2270 anon_vma_interval_tree_pre_update_vma(vma);
2281 vma->vm_start = address; 2271 vma->vm_start = address;
2282 vma->vm_pgoff -= grow; 2272 vma->vm_pgoff -= grow;
@@ -2390,7 +2380,7 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
2390 2380
2391 if (vma->vm_flags & VM_ACCOUNT) 2381 if (vma->vm_flags & VM_ACCOUNT)
2392 nr_accounted += nrpages; 2382 nr_accounted += nrpages;
2393 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 2383 vm_stat_account(mm, vma->vm_flags, -nrpages);
2394 vma = remove_vma(vma); 2384 vma = remove_vma(vma);
2395 } while (vma); 2385 } while (vma);
2396 vm_unacct_memory(nr_accounted); 2386 vm_unacct_memory(nr_accounted);
@@ -2760,7 +2750,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2760 } 2750 }
2761 2751
2762 /* Check against address space limits *after* clearing old maps... */ 2752 /* Check against address space limits *after* clearing old maps... */
2763 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 2753 if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
2764 return -ENOMEM; 2754 return -ENOMEM;
2765 2755
2766 if (mm->map_count > sysctl_max_map_count) 2756 if (mm->map_count > sysctl_max_map_count)
@@ -2795,6 +2785,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2795out: 2785out:
2796 perf_event_mmap(vma); 2786 perf_event_mmap(vma);
2797 mm->total_vm += len >> PAGE_SHIFT; 2787 mm->total_vm += len >> PAGE_SHIFT;
2788 mm->data_vm += len >> PAGE_SHIFT;
2798 if (flags & VM_LOCKED) 2789 if (flags & VM_LOCKED)
2799 mm->locked_vm += (len >> PAGE_SHIFT); 2790 mm->locked_vm += (len >> PAGE_SHIFT);
2800 vma->vm_flags |= VM_SOFTDIRTY; 2791 vma->vm_flags |= VM_SOFTDIRTY;
@@ -2986,16 +2977,28 @@ out:
2986 * Return true if the calling process may expand its vm space by the passed 2977 * Return true if the calling process may expand its vm space by the passed
2987 * number of pages 2978 * number of pages
2988 */ 2979 */
2989int may_expand_vm(struct mm_struct *mm, unsigned long npages) 2980bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
2990{ 2981{
2991 unsigned long cur = mm->total_vm; /* pages */ 2982 if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
2992 unsigned long lim; 2983 return false;
2993 2984
2994 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; 2985 if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS &
2986 (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE)
2987 return mm->data_vm + npages <= rlimit(RLIMIT_DATA);
2995 2988
2996 if (cur + npages > lim) 2989 return true;
2997 return 0; 2990}
2998 return 1; 2991
2992void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
2993{
2994 mm->total_vm += npages;
2995
2996 if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
2997 mm->exec_vm += npages;
2998 else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)))
2999 mm->stack_vm += npages;
3000 else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
3001 mm->data_vm += npages;
2999} 3002}
3000 3003
3001static int special_mapping_fault(struct vm_area_struct *vma, 3004static int special_mapping_fault(struct vm_area_struct *vma,
@@ -3077,7 +3080,7 @@ static struct vm_area_struct *__install_special_mapping(
3077 if (ret) 3080 if (ret)
3078 goto out; 3081 goto out;
3079 3082
3080 mm->total_vm += len >> PAGE_SHIFT; 3083 vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
3081 3084
3082 perf_event_mmap(vma); 3085 perf_event_mmap(vma);
3083 3086
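
vm_stat_account() drops its file argument and classifies a mapping purely by vm_flags: read-only executable mappings count toward exec_vm, growable (stack-like) mappings toward stack_vm, and private writable mappings toward data_vm, which is exactly what the new RLIMIT_DATA check in may_expand_vm() limits. A compact sketch of that classification using made-up flag values rather than the real VM_* constants, with VM_GROWSDOWN standing in for the full VM_STACK_FLAGS test:

    #include <stdio.h>

    /* illustrative flag bits, not the kernel's actual VM_* values */
    #define VM_WRITE     0x1u
    #define VM_EXEC      0x2u
    #define VM_SHARED    0x4u
    #define VM_GROWSDOWN 0x8u

    struct mm {
        unsigned long total_vm, exec_vm, stack_vm, data_vm;
    };

    static void vm_stat_account(struct mm *mm, unsigned flags, long npages)
    {
        mm->total_vm += npages;

        if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
            mm->exec_vm += npages;              /* read-only executable */
        else if (flags & VM_GROWSDOWN)
            mm->stack_vm += npages;             /* stack-like mapping */
        else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
            mm->data_vm += npages;              /* private writable data */
    }

    int main(void)
    {
        struct mm mm = { 0 };

        vm_stat_account(&mm, VM_EXEC, 100);                 /* text */
        vm_stat_account(&mm, VM_WRITE, 50);                 /* heap/data */
        vm_stat_account(&mm, VM_WRITE | VM_GROWSDOWN, 8);   /* stack */
        vm_stat_account(&mm, VM_WRITE | VM_SHARED, 20);     /* shared mapping */

        printf("total=%lu exec=%lu stack=%lu data=%lu\n",
               mm.total_vm, mm.exec_vm, mm.stack_vm, mm.data_vm);
        return 0;
    }
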
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 7d87ebb0d632..52687fb4de6f 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -72,16 +72,16 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
72} 72}
73 73
74#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL 74#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
75int memmap_valid_within(unsigned long pfn, 75bool memmap_valid_within(unsigned long pfn,
76 struct page *page, struct zone *zone) 76 struct page *page, struct zone *zone)
77{ 77{
78 if (page_to_pfn(page) != pfn) 78 if (page_to_pfn(page) != pfn)
79 return 0; 79 return false;
80 80
81 if (page_zone(page) != zone) 81 if (page_zone(page) != zone)
82 return 0; 82 return false;
83 83
84 return 1; 84 return true;
85} 85}
86#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 86#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
87 87
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ef5be8eaab00..c764402c464f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -278,6 +278,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
278 * even if read-only so there is no need to account for them here 278 * even if read-only so there is no need to account for them here
279 */ 279 */
280 if (newflags & VM_WRITE) { 280 if (newflags & VM_WRITE) {
281 /* Check space limits when area turns into data. */
282 if (!may_expand_vm(mm, newflags, nrpages) &&
283 may_expand_vm(mm, oldflags, nrpages))
284 return -ENOMEM;
281 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| 285 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
282 VM_SHARED|VM_NORESERVE))) { 286 VM_SHARED|VM_NORESERVE))) {
283 charged = nrpages; 287 charged = nrpages;
@@ -334,8 +338,8 @@ success:
334 populate_vma_page_range(vma, start, end, NULL); 338 populate_vma_page_range(vma, start, end, NULL);
335 } 339 }
336 340
337 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 341 vm_stat_account(mm, oldflags, -nrpages);
338 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 342 vm_stat_account(mm, newflags, nrpages);
339 perf_event_mmap(vma); 343 perf_event_mmap(vma);
340 return 0; 344 return 0;
341 345
diff --git a/mm/mremap.c b/mm/mremap.c
index de824e72c3e8..e55b157865d5 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -317,7 +317,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
317 * If this were a serious issue, we'd add a flag to do_munmap(). 317 * If this were a serious issue, we'd add a flag to do_munmap().
318 */ 318 */
319 hiwater_vm = mm->hiwater_vm; 319 hiwater_vm = mm->hiwater_vm;
320 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); 320 vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
321 321
322 /* Tell pfnmap has moved from this vma */ 322 /* Tell pfnmap has moved from this vma */
323 if (unlikely(vma->vm_flags & VM_PFNMAP)) 323 if (unlikely(vma->vm_flags & VM_PFNMAP))
@@ -383,7 +383,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
383 return ERR_PTR(-EAGAIN); 383 return ERR_PTR(-EAGAIN);
384 } 384 }
385 385
386 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) 386 if (!may_expand_vm(mm, vma->vm_flags,
387 (new_len - old_len) >> PAGE_SHIFT))
387 return ERR_PTR(-ENOMEM); 388 return ERR_PTR(-ENOMEM);
388 389
389 if (vma->vm_flags & VM_ACCOUNT) { 390 if (vma->vm_flags & VM_ACCOUNT) {
@@ -545,7 +546,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
545 goto out; 546 goto out;
546 } 547 }
547 548
548 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 549 vm_stat_account(mm, vma->vm_flags, pages);
549 if (vma->vm_flags & VM_LOCKED) { 550 if (vma->vm_flags & VM_LOCKED) {
550 mm->locked_vm += pages; 551 mm->locked_vm += pages;
551 locked = true; 552 locked = true;
diff --git a/mm/nommu.c b/mm/nommu.c
index 92be862c859b..fbf6f0f1d6c9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -560,7 +560,7 @@ void __init mmap_init(void)
560 560
561 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); 561 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
562 VM_BUG_ON(ret); 562 VM_BUG_ON(ret);
563 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); 563 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
564} 564}
565 565
566/* 566/*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c12680993ff3..dc490c06941b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -585,10 +585,11 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
585 */ 585 */
586 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 586 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
587 mark_oom_victim(victim); 587 mark_oom_victim(victim);
588 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 588 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
589 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), 589 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
590 K(get_mm_counter(victim->mm, MM_ANONPAGES)), 590 K(get_mm_counter(victim->mm, MM_ANONPAGES)),
591 K(get_mm_counter(victim->mm, MM_FILEPAGES))); 591 K(get_mm_counter(victim->mm, MM_FILEPAGES)),
592 K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
592 task_unlock(victim); 593 task_unlock(victim);
593 594
594 /* 595 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d15d88c8efa1..6fe7d15bd1f7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -278,7 +278,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
278 unsigned long nr_pages; 278 unsigned long nr_pages;
279 279
280 nr_pages = zone_page_state(zone, NR_FREE_PAGES); 280 nr_pages = zone_page_state(zone, NR_FREE_PAGES);
281 nr_pages -= min(nr_pages, zone->dirty_balance_reserve); 281 /*
282 * Pages reserved for the kernel should not be considered
283 * dirtyable, to prevent a situation where reclaim has to
284 * clean pages in order to balance the zones.
285 */
286 nr_pages -= min(nr_pages, zone->totalreserve_pages);
282 287
283 nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); 288 nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
284 nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); 289 nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
@@ -332,7 +337,12 @@ static unsigned long global_dirtyable_memory(void)
332 unsigned long x; 337 unsigned long x;
333 338
334 x = global_page_state(NR_FREE_PAGES); 339 x = global_page_state(NR_FREE_PAGES);
335 x -= min(x, dirty_balance_reserve); 340 /*
341 * Pages reserved for the kernel should not be considered
342 * dirtyable, to prevent a situation where reclaim has to
343 * clean pages in order to balance the zones.
344 */
345 x -= min(x, totalreserve_pages);
336 346
337 x += global_page_state(NR_INACTIVE_FILE); 347 x += global_page_state(NR_INACTIVE_FILE);
338 x += global_page_state(NR_ACTIVE_FILE); 348 x += global_page_state(NR_ACTIVE_FILE);
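
Both dirtyable-memory calculations now subtract totalreserve_pages, the pages the allocator keeps in reserve, in place of the removed dirty_balance_reserve; the rest of the formula is unchanged: free pages minus the reserve (clamped so it cannot underflow), plus the inactive and active file LRU pages. A worked example of that arithmetic with invented numbers:

    #include <stdio.h>

    static unsigned long dirtyable(unsigned long nr_free,
                                   unsigned long reserve,
                                   unsigned long inactive_file,
                                   unsigned long active_file)
    {
        /* nr_free -= min(nr_free, reserve): never underflow the free count */
        nr_free -= (reserve < nr_free) ? reserve : nr_free;
        return nr_free + inactive_file + active_file;
    }

    int main(void)
    {
        /* e.g. 10000 free, 1500 reserved, 4000 + 2000 file LRU pages */
        printf("%lu dirtyable pages\n", dirtyable(10000, 1500, 4000, 2000));
        return 0;
    }
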
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9d666df5ef95..ce63d603820f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -114,13 +114,6 @@ static DEFINE_SPINLOCK(managed_page_count_lock);
114unsigned long totalram_pages __read_mostly; 114unsigned long totalram_pages __read_mostly;
115unsigned long totalreserve_pages __read_mostly; 115unsigned long totalreserve_pages __read_mostly;
116unsigned long totalcma_pages __read_mostly; 116unsigned long totalcma_pages __read_mostly;
117/*
118 * When calculating the number of globally allowed dirty pages, there
119 * is a certain number of per-zone reserves that should not be
120 * considered dirtyable memory. This is the sum of those reserves
121 * over all existing zones that contribute dirtyable memory.
122 */
123unsigned long dirty_balance_reserve __read_mostly;
124 117
125int percpu_pagelist_fraction; 118int percpu_pagelist_fraction;
126gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 119gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
@@ -812,7 +805,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
812 do { 805 do {
813 int mt; /* migratetype of the to-be-freed page */ 806 int mt; /* migratetype of the to-be-freed page */
814 807
815 page = list_entry(list->prev, struct page, lru); 808 page = list_last_entry(list, struct page, lru);
816 /* must delete as __free_one_page list manipulates */ 809 /* must delete as __free_one_page list manipulates */
817 list_del(&page->lru); 810 list_del(&page->lru);
818 811
@@ -1417,11 +1410,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1417 /* Find a page of the appropriate size in the preferred list */ 1410 /* Find a page of the appropriate size in the preferred list */
1418 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 1411 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
1419 area = &(zone->free_area[current_order]); 1412 area = &(zone->free_area[current_order]);
1420 if (list_empty(&area->free_list[migratetype])) 1413 page = list_first_entry_or_null(&area->free_list[migratetype],
1421 continue;
1422
1423 page = list_entry(area->free_list[migratetype].next,
1424 struct page, lru); 1414 struct page, lru);
1415 if (!page)
1416 continue;
1425 list_del(&page->lru); 1417 list_del(&page->lru);
1426 rmv_page_order(page); 1418 rmv_page_order(page);
1427 area->nr_free--; 1419 area->nr_free--;
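
Several page_alloc.c hunks replace the open-coded list_empty() check plus list_entry(...->next/->prev) with list_first_entry(), list_last_entry(), list_first_entry_or_null() and list_for_each_entry(), which fold the emptiness test into the container_of() conversion. A freestanding sketch of the underlying pattern with a simplified doubly linked list rather than <linux/list.h>; only the empty-or-first case is shown:

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    /* first entry, or NULL when the head points back to itself (empty list) */
    #define list_first_entry_or_null(head, type, member) \
        ((head)->next != (head) ? container_of((head)->next, type, member) : NULL)

    struct page { unsigned long pfn; struct list_head lru; };

    static void list_add_tail(struct list_head *new, struct list_head *head)
    {
        new->prev = head->prev;
        new->next = head;
        head->prev->next = new;
        head->prev = new;
    }

    int main(void)
    {
        struct list_head free_list = { &free_list, &free_list };
        struct page p = { .pfn = 42 };
        struct page *first;

        first = list_first_entry_or_null(&free_list, struct page, lru);
        printf("empty list -> %s\n", first ? "page" : "NULL");

        list_add_tail(&p.lru, &free_list);
        first = list_first_entry_or_null(&free_list, struct page, lru);
        printf("first pfn  -> %lu\n", first ? first->pfn : 0);
        return 0;
    }
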
@@ -1700,12 +1692,12 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
1700 for (order = 0; order < MAX_ORDER; order++) { 1692 for (order = 0; order < MAX_ORDER; order++) {
1701 struct free_area *area = &(zone->free_area[order]); 1693 struct free_area *area = &(zone->free_area[order]);
1702 1694
1703 if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) 1695 page = list_first_entry_or_null(
1696 &area->free_list[MIGRATE_HIGHATOMIC],
1697 struct page, lru);
1698 if (!page)
1704 continue; 1699 continue;
1705 1700
1706 page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next,
1707 struct page, lru);
1708
1709 /* 1701 /*
1710 * It should never happen but changes to locking could 1702 * It should never happen but changes to locking could
1711 * inadvertently allow a per-cpu drain to add pages 1703 * inadvertently allow a per-cpu drain to add pages
@@ -1753,7 +1745,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1753 if (fallback_mt == -1) 1745 if (fallback_mt == -1)
1754 continue; 1746 continue;
1755 1747
1756 page = list_entry(area->free_list[fallback_mt].next, 1748 page = list_first_entry(&area->free_list[fallback_mt],
1757 struct page, lru); 1749 struct page, lru);
1758 if (can_steal) 1750 if (can_steal)
1759 steal_suitable_fallback(zone, page, start_migratetype); 1751 steal_suitable_fallback(zone, page, start_migratetype);
@@ -1788,7 +1780,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1788 * Call me with the zone->lock already held. 1780 * Call me with the zone->lock already held.
1789 */ 1781 */
1790static struct page *__rmqueue(struct zone *zone, unsigned int order, 1782static struct page *__rmqueue(struct zone *zone, unsigned int order,
1791 int migratetype, gfp_t gfp_flags) 1783 int migratetype)
1792{ 1784{
1793 struct page *page; 1785 struct page *page;
1794 1786
@@ -1818,7 +1810,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1818 1810
1819 spin_lock(&zone->lock); 1811 spin_lock(&zone->lock);
1820 for (i = 0; i < count; ++i) { 1812 for (i = 0; i < count; ++i) {
1821 struct page *page = __rmqueue(zone, order, migratetype, 0); 1813 struct page *page = __rmqueue(zone, order, migratetype);
1822 if (unlikely(page == NULL)) 1814 if (unlikely(page == NULL))
1823 break; 1815 break;
1824 1816
@@ -1988,7 +1980,7 @@ void mark_free_pages(struct zone *zone)
1988 unsigned long pfn, max_zone_pfn; 1980 unsigned long pfn, max_zone_pfn;
1989 unsigned long flags; 1981 unsigned long flags;
1990 unsigned int order, t; 1982 unsigned int order, t;
1991 struct list_head *curr; 1983 struct page *page;
1992 1984
1993 if (zone_is_empty(zone)) 1985 if (zone_is_empty(zone))
1994 return; 1986 return;
@@ -1998,17 +1990,17 @@ void mark_free_pages(struct zone *zone)
1998 max_zone_pfn = zone_end_pfn(zone); 1990 max_zone_pfn = zone_end_pfn(zone);
1999 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1991 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
2000 if (pfn_valid(pfn)) { 1992 if (pfn_valid(pfn)) {
2001 struct page *page = pfn_to_page(pfn); 1993 page = pfn_to_page(pfn);
2002
2003 if (!swsusp_page_is_forbidden(page)) 1994 if (!swsusp_page_is_forbidden(page))
2004 swsusp_unset_page_free(page); 1995 swsusp_unset_page_free(page);
2005 } 1996 }
2006 1997
2007 for_each_migratetype_order(order, t) { 1998 for_each_migratetype_order(order, t) {
2008 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1999 list_for_each_entry(page,
2000 &zone->free_area[order].free_list[t], lru) {
2009 unsigned long i; 2001 unsigned long i;
2010 2002
2011 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 2003 pfn = page_to_pfn(page);
2012 for (i = 0; i < (1UL << order); i++) 2004 for (i = 0; i < (1UL << order); i++)
2013 swsusp_set_page_free(pfn_to_page(pfn + i)); 2005 swsusp_set_page_free(pfn_to_page(pfn + i));
2014 } 2006 }
@@ -2212,9 +2204,9 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
2212 } 2204 }
2213 2205
2214 if (cold) 2206 if (cold)
2215 page = list_entry(list->prev, struct page, lru); 2207 page = list_last_entry(list, struct page, lru);
2216 else 2208 else
2217 page = list_entry(list->next, struct page, lru); 2209 page = list_first_entry(list, struct page, lru);
2218 2210
2219 list_del(&page->lru); 2211 list_del(&page->lru);
2220 pcp->count--; 2212 pcp->count--;
@@ -2241,7 +2233,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
2241 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2233 trace_mm_page_alloc_zone_locked(page, order, migratetype);
2242 } 2234 }
2243 if (!page) 2235 if (!page)
2244 page = __rmqueue(zone, order, migratetype, gfp_flags); 2236 page = __rmqueue(zone, order, migratetype);
2245 spin_unlock(&zone->lock); 2237 spin_unlock(&zone->lock);
2246 if (!page) 2238 if (!page)
2247 goto failed; 2239 goto failed;
@@ -2740,8 +2732,21 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2740 goto out; 2732 goto out;
2741 } 2733 }
2742 /* Exhausted what can be done so it's blamo time */ 2734 /* Exhausted what can be done so it's blamo time */
2743 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) 2735 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
2744 *did_some_progress = 1; 2736 *did_some_progress = 1;
2737
2738 if (gfp_mask & __GFP_NOFAIL) {
2739 page = get_page_from_freelist(gfp_mask, order,
2740 ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
2741 /*
 2742 * fall back to ignoring the cpuset restriction if our nodes
2743 * are depleted
2744 */
2745 if (!page)
2746 page = get_page_from_freelist(gfp_mask, order,
2747 ALLOC_NO_WATERMARKS, ac);
2748 }
2749 }
2745out: 2750out:
2746 mutex_unlock(&oom_lock); 2751 mutex_unlock(&oom_lock);
2747 return page; 2752 return page;
@@ -2876,28 +2881,6 @@ retry:
2876 return page; 2881 return page;
2877} 2882}
2878 2883
2879/*
2880 * This is called in the allocator slow-path if the allocation request is of
2881 * sufficient urgency to ignore watermarks and take other desperate measures
2882 */
2883static inline struct page *
2884__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2885 const struct alloc_context *ac)
2886{
2887 struct page *page;
2888
2889 do {
2890 page = get_page_from_freelist(gfp_mask, order,
2891 ALLOC_NO_WATERMARKS, ac);
2892
2893 if (!page && gfp_mask & __GFP_NOFAIL)
2894 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
2895 HZ/50);
2896 } while (!page && (gfp_mask & __GFP_NOFAIL));
2897
2898 return page;
2899}
2900
2901static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) 2884static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
2902{ 2885{
2903 struct zoneref *z; 2886 struct zoneref *z;
@@ -3042,28 +3025,36 @@ retry:
3042 * allocations are system rather than user orientated 3025 * allocations are system rather than user orientated
3043 */ 3026 */
3044 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); 3027 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
3045 3028 page = get_page_from_freelist(gfp_mask, order,
3046 page = __alloc_pages_high_priority(gfp_mask, order, ac); 3029 ALLOC_NO_WATERMARKS, ac);
3047 3030 if (page)
3048 if (page) {
3049 goto got_pg; 3031 goto got_pg;
3050 }
3051 } 3032 }
3052 3033
3053 /* Caller is not willing to reclaim, we can't balance anything */ 3034 /* Caller is not willing to reclaim, we can't balance anything */
3054 if (!can_direct_reclaim) { 3035 if (!can_direct_reclaim) {
3055 /* 3036 /*
3056 * All existing users of the deprecated __GFP_NOFAIL are 3037 * All existing users of the __GFP_NOFAIL are blockable, so warn
3057 * blockable, so warn of any new users that actually allow this 3038 * of any new users that actually allow this type of allocation
3058 * type of allocation to fail. 3039 * to fail.
3059 */ 3040 */
3060 WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); 3041 WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
3061 goto nopage; 3042 goto nopage;
3062 } 3043 }
3063 3044
3064 /* Avoid recursion of direct reclaim */ 3045 /* Avoid recursion of direct reclaim */
3065 if (current->flags & PF_MEMALLOC) 3046 if (current->flags & PF_MEMALLOC) {
3047 /*
3048 * __GFP_NOFAIL request from this context is rather bizarre
 3049 * because we cannot reclaim anything and can only loop waiting
 3050 * for somebody to do the work for us.
3051 */
3052 if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3053 cond_resched();
3054 goto retry;
3055 }
3066 goto nopage; 3056 goto nopage;
3057 }
3067 3058
3068 /* Avoid allocations with no watermarks from looping endlessly */ 3059 /* Avoid allocations with no watermarks from looping endlessly */
3069 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 3060 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
@@ -3402,7 +3393,8 @@ EXPORT_SYMBOL(__free_page_frag);
3402 3393
3403/* 3394/*
3404 * alloc_kmem_pages charges newly allocated pages to the kmem resource counter 3395 * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
3405 * of the current memory cgroup. 3396 * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is
3397 * equivalent to alloc_pages.
3406 * 3398 *
3407 * It should be used when the caller would like to use kmalloc, but since the 3399 * It should be used when the caller would like to use kmalloc, but since the
3408 * allocation is large, it has to fall back to the page allocator. 3400 * allocation is large, it has to fall back to the page allocator.
@@ -4147,8 +4139,7 @@ static void set_zonelist_order(void)
4147 4139
4148static void build_zonelists(pg_data_t *pgdat) 4140static void build_zonelists(pg_data_t *pgdat)
4149{ 4141{
4150 int j, node, load; 4142 int i, node, load;
4151 enum zone_type i;
4152 nodemask_t used_mask; 4143 nodemask_t used_mask;
4153 int local_node, prev_node; 4144 int local_node, prev_node;
4154 struct zonelist *zonelist; 4145 struct zonelist *zonelist;
@@ -4168,7 +4159,7 @@ static void build_zonelists(pg_data_t *pgdat)
4168 nodes_clear(used_mask); 4159 nodes_clear(used_mask);
4169 4160
4170 memset(node_order, 0, sizeof(node_order)); 4161 memset(node_order, 0, sizeof(node_order));
4171 j = 0; 4162 i = 0;
4172 4163
4173 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 4164 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
4174 /* 4165 /*
@@ -4185,12 +4176,12 @@ static void build_zonelists(pg_data_t *pgdat)
4185 if (order == ZONELIST_ORDER_NODE) 4176 if (order == ZONELIST_ORDER_NODE)
4186 build_zonelists_in_node_order(pgdat, node); 4177 build_zonelists_in_node_order(pgdat, node);
4187 else 4178 else
4188 node_order[j++] = node; /* remember order */ 4179 node_order[i++] = node; /* remember order */
4189 } 4180 }
4190 4181
4191 if (order == ZONELIST_ORDER_ZONE) { 4182 if (order == ZONELIST_ORDER_ZONE) {
4192 /* calculate node order -- i.e., DMA last! */ 4183 /* calculate node order -- i.e., DMA last! */
4193 build_zonelists_in_zone_order(pgdat, j); 4184 build_zonelists_in_zone_order(pgdat, i);
4194 } 4185 }
4195 4186
4196 build_thisnode_zonelists(pgdat); 4187 build_thisnode_zonelists(pgdat);
@@ -5956,20 +5947,12 @@ static void calculate_totalreserve_pages(void)
5956 5947
5957 if (max > zone->managed_pages) 5948 if (max > zone->managed_pages)
5958 max = zone->managed_pages; 5949 max = zone->managed_pages;
5950
5951 zone->totalreserve_pages = max;
5952
5959 reserve_pages += max; 5953 reserve_pages += max;
5960 /*
5961 * Lowmem reserves are not available to
5962 * GFP_HIGHUSER page cache allocations and
5963 * kswapd tries to balance zones to their high
5964 * watermark. As a result, neither should be
5965 * regarded as dirtyable memory, to prevent a
5966 * situation where reclaim has to clean pages
5967 * in order to balance the zones.
5968 */
5969 zone->dirty_balance_reserve = max;
5970 } 5954 }
5971 } 5955 }
5972 dirty_balance_reserve = reserve_pages;
5973 totalreserve_pages = reserve_pages; 5956 totalreserve_pages = reserve_pages;
5974} 5957}
5975 5958
@@ -6724,8 +6707,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
6724 if (ret) 6707 if (ret)
6725 return ret; 6708 return ret;
6726 6709
6710 /*
 6711 * In case of -EBUSY, we'd like to know which page causes the problem.
6712 * So, just fall through. We will check it in test_pages_isolated().
6713 */
6727 ret = __alloc_contig_migrate_range(&cc, start, end); 6714 ret = __alloc_contig_migrate_range(&cc, start, end);
6728 if (ret) 6715 if (ret && ret != -EBUSY)
6729 goto done; 6716 goto done;
6730 6717
6731 /* 6718 /*
@@ -6752,12 +6739,25 @@ int alloc_contig_range(unsigned long start, unsigned long end,
6752 outer_start = start; 6739 outer_start = start;
6753 while (!PageBuddy(pfn_to_page(outer_start))) { 6740 while (!PageBuddy(pfn_to_page(outer_start))) {
6754 if (++order >= MAX_ORDER) { 6741 if (++order >= MAX_ORDER) {
6755 ret = -EBUSY; 6742 outer_start = start;
6756 goto done; 6743 break;
6757 } 6744 }
6758 outer_start &= ~0UL << order; 6745 outer_start &= ~0UL << order;
6759 } 6746 }
6760 6747
6748 if (outer_start != start) {
6749 order = page_order(pfn_to_page(outer_start));
6750
6751 /*
 6752 * The outer_start page could be a small order buddy page that
 6753 * doesn't include the start page. Adjust outer_start
 6754 * in this case so the failed page is reported properly
 6755 * by the tracepoint in test_pages_isolated()
6756 */
6757 if (outer_start + (1UL << order) <= start)
6758 outer_start = start;
6759 }
6760
6761 /* Make sure the range is really isolated. */ 6761 /* Make sure the range is really isolated. */
6762 if (test_pages_isolated(outer_start, end, false)) { 6762 if (test_pages_isolated(outer_start, end, false)) {
6763 pr_info("%s: [%lx, %lx) PFNs busy\n", 6763 pr_info("%s: [%lx, %lx) PFNs busy\n",
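
The alloc_contig_range() hunk above stops bailing out with -EBUSY when no buddy page is found, falls back to start instead, and discards an order-aligned outer_start whose buddy block ends before start. Below is a minimal userspace sketch of that pfn arithmetic; the pfn values are made up and align_down() is a hypothetical stand-in for the outer_start &= ~0UL << order step.

#include <stdio.h>

/* mask a pfn down to the start of its order-aligned block */
static unsigned long align_down(unsigned long pfn, unsigned int order)
{
	return pfn & (~0UL << order);
}

int main(void)
{
	unsigned long start = 1000;
	unsigned long outer = align_down(start, 4);	/* 992: start of a 16-page block */
	unsigned int block_order = 2;			/* but the buddy found there is only order-2 */

	/* an order-2 block at 992 ends at 996, before 'start': report 'start' itself */
	if (outer + (1UL << block_order) <= start)
		outer = start;
	printf("outer_start = %lu\n", outer);		/* prints 1000 */
	return 0;
}
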
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 4568fd58f70a..5e139fec6c6c 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -9,6 +9,9 @@
9#include <linux/hugetlb.h> 9#include <linux/hugetlb.h>
10#include "internal.h" 10#include "internal.h"
11 11
12#define CREATE_TRACE_POINTS
13#include <trace/events/page_isolation.h>
14
12static int set_migratetype_isolate(struct page *page, 15static int set_migratetype_isolate(struct page *page,
13 bool skip_hwpoisoned_pages) 16 bool skip_hwpoisoned_pages)
14{ 17{
@@ -162,8 +165,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
162 unsigned long undo_pfn; 165 unsigned long undo_pfn;
163 struct page *page; 166 struct page *page;
164 167
165 BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); 168 BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
166 BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); 169 BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
167 170
168 for (pfn = start_pfn; 171 for (pfn = start_pfn;
169 pfn < end_pfn; 172 pfn < end_pfn;
@@ -212,7 +215,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
212 * 215 *
213 * Returns 1 if all pages in the range are isolated. 216 * Returns 1 if all pages in the range are isolated.
214 */ 217 */
215static int 218static unsigned long
216__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, 219__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
217 bool skip_hwpoisoned_pages) 220 bool skip_hwpoisoned_pages)
218{ 221{
@@ -237,9 +240,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
237 else 240 else
238 break; 241 break;
239 } 242 }
240 if (pfn < end_pfn) 243
241 return 0; 244 return pfn;
242 return 1;
243} 245}
244 246
245int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, 247int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
@@ -248,7 +250,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
248 unsigned long pfn, flags; 250 unsigned long pfn, flags;
249 struct page *page; 251 struct page *page;
250 struct zone *zone; 252 struct zone *zone;
251 int ret;
252 253
253 /* 254 /*
254 * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages 255 * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
@@ -266,10 +267,13 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
266 /* Check all pages are free or marked as ISOLATED */ 267 /* Check all pages are free or marked as ISOLATED */
267 zone = page_zone(page); 268 zone = page_zone(page);
268 spin_lock_irqsave(&zone->lock, flags); 269 spin_lock_irqsave(&zone->lock, flags);
269 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn, 270 pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
270 skip_hwpoisoned_pages); 271 skip_hwpoisoned_pages);
271 spin_unlock_irqrestore(&zone->lock, flags); 272 spin_unlock_irqrestore(&zone->lock, flags);
272 return ret ? 0 : -EBUSY; 273
274 trace_test_pages_isolated(start_pfn, end_pfn, pfn);
275
276 return pfn < end_pfn ? -EBUSY : 0;
273} 277}
274 278
275struct page *alloc_migrate_target(struct page *page, unsigned long private, 279struct page *alloc_migrate_target(struct page *page, unsigned long private,
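
The page_isolation.c change switches __test_page_isolated_in_pageblock() from a 0/1 result to returning the first pfn that is not isolated, so the new tracepoint can report the offending page and the caller maps pfn < end_pfn to -EBUSY. A small userspace sketch of that convention; scan_range() and page_ok() are stand-ins, not kernel functions.

#include <stdio.h>
#include <errno.h>

static int page_ok(unsigned long pfn)
{
	return pfn != 1003;	/* pretend pfn 1003 is busy */
}

/* returns the first bad pfn, or end_pfn if the whole range passes */
static unsigned long scan_range(unsigned long pfn, unsigned long end_pfn)
{
	while (pfn < end_pfn && page_ok(pfn))
		pfn++;
	return pfn;
}

int main(void)
{
	unsigned long start = 1000, end = 1008;
	unsigned long pfn = scan_range(start, end);

	/* caller turns the pfn back into an error code, as test_pages_isolated() does */
	printf("stopped at pfn %lu -> %d\n", pfn, pfn < end ? -EBUSY : 0);
	return 0;
}
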
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 7d3db0247983..4c681baff363 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -176,13 +176,10 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
176 176
177 /* FIFO */ 177 /* FIFO */
178 pgtable = pmd_huge_pte(mm, pmdp); 178 pgtable = pmd_huge_pte(mm, pmdp);
179 if (list_empty(&pgtable->lru)) 179 pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru,
180 pmd_huge_pte(mm, pmdp) = NULL; 180 struct page, lru);
181 else { 181 if (pmd_huge_pte(mm, pmdp))
182 pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next,
183 struct page, lru);
184 list_del(&pgtable->lru); 182 list_del(&pgtable->lru);
185 }
186 return pgtable; 183 return pgtable;
187} 184}
188#endif 185#endif
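
The pgtable_trans_huge_withdraw() hunk collapses an if/else on list_empty() into a single list_first_entry_or_null() call. The sketch below re-creates just enough of the kernel's intrusive list macros in userspace to show how that expression yields NULL for an empty list and the first entry otherwise; it is a stripped-down illustration, not the <linux/list.h> implementation.

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)
#define list_first_entry_or_null(head, type, member) \
	((head)->next != (head) ? list_entry((head)->next, type, member) : NULL)

struct page { int id; struct list_head lru; };

static void list_add_tail(struct list_head *entry, struct list_head *head)
{
	entry->prev = head->prev;
	entry->next = head;
	head->prev->next = entry;
	head->prev = entry;
}

int main(void)
{
	struct list_head lru = LIST_HEAD_INIT(lru);
	struct page p = { .id = 42 };
	struct page *first;

	first = list_first_entry_or_null(&lru, struct page, lru);
	printf("empty list: %p\n", (void *)first);	/* NULL */

	list_add_tail(&p.lru, &lru);
	first = list_first_entry_or_null(&lru, struct page, lru);
	printf("first id: %d\n", first->id);		/* 42 */
	return 0;
}
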
diff --git a/mm/readahead.c b/mm/readahead.c
index ba22d7fe0afb..20e58e820e44 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,6 +17,7 @@
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include <linux/syscalls.h> 18#include <linux/syscalls.h>
19#include <linux/file.h> 19#include <linux/file.h>
20#include <linux/mm_inline.h>
20 21
21#include "internal.h" 22#include "internal.h"
22 23
@@ -32,8 +33,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
32} 33}
33EXPORT_SYMBOL_GPL(file_ra_state_init); 34EXPORT_SYMBOL_GPL(file_ra_state_init);
34 35
35#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
36
37/* 36/*
38 * see if a page needs releasing upon read_cache_pages() failure 37 * see if a page needs releasing upon read_cache_pages() failure
39 * - the caller of read_cache_pages() may have set PG_private or PG_fscache 38 * - the caller of read_cache_pages() may have set PG_private or PG_fscache
@@ -64,7 +63,7 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping,
64 struct page *victim; 63 struct page *victim;
65 64
66 while (!list_empty(pages)) { 65 while (!list_empty(pages)) {
67 victim = list_to_page(pages); 66 victim = lru_to_page(pages);
68 list_del(&victim->lru); 67 list_del(&victim->lru);
69 read_cache_pages_invalidate_page(mapping, victim); 68 read_cache_pages_invalidate_page(mapping, victim);
70 } 69 }
@@ -87,7 +86,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
87 int ret = 0; 86 int ret = 0;
88 87
89 while (!list_empty(pages)) { 88 while (!list_empty(pages)) {
90 page = list_to_page(pages); 89 page = lru_to_page(pages);
91 list_del(&page->lru); 90 list_del(&page->lru);
92 if (add_to_page_cache_lru(page, mapping, page->index, 91 if (add_to_page_cache_lru(page, mapping, page->index,
93 mapping_gfp_constraint(mapping, GFP_KERNEL))) { 92 mapping_gfp_constraint(mapping, GFP_KERNEL))) {
@@ -125,7 +124,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
125 } 124 }
126 125
127 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 126 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
128 struct page *page = list_to_page(pages); 127 struct page *page = lru_to_page(pages);
129 list_del(&page->lru); 128 list_del(&page->lru);
130 if (!add_to_page_cache_lru(page, mapping, page->index, 129 if (!add_to_page_cache_lru(page, mapping, page->index,
131 mapping_gfp_constraint(mapping, GFP_KERNEL))) { 130 mapping_gfp_constraint(mapping, GFP_KERNEL))) {
diff --git a/mm/rmap.c b/mm/rmap.c
index b577fbb98d4b..622756c16ac8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -428,8 +428,10 @@ static void anon_vma_ctor(void *data)
428void __init anon_vma_init(void) 428void __init anon_vma_init(void)
429{ 429{
430 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 430 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
431 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); 431 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
432 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); 432 anon_vma_ctor);
433 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
434 SLAB_PANIC|SLAB_ACCOUNT);
433} 435}
434 436
435/* 437/*
@@ -1362,10 +1364,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1362 if (PageHuge(page)) { 1364 if (PageHuge(page)) {
1363 hugetlb_count_sub(1 << compound_order(page), mm); 1365 hugetlb_count_sub(1 << compound_order(page), mm);
1364 } else { 1366 } else {
1365 if (PageAnon(page)) 1367 dec_mm_counter(mm, mm_counter(page));
1366 dec_mm_counter(mm, MM_ANONPAGES);
1367 else
1368 dec_mm_counter(mm, MM_FILEPAGES);
1369 } 1368 }
1370 set_pte_at(mm, address, pte, 1369 set_pte_at(mm, address, pte,
1371 swp_entry_to_pte(make_hwpoison_entry(page))); 1370 swp_entry_to_pte(make_hwpoison_entry(page)));
@@ -1375,10 +1374,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1375 * interest anymore. Simply discard the pte, vmscan 1374 * interest anymore. Simply discard the pte, vmscan
1376 * will take care of the rest. 1375 * will take care of the rest.
1377 */ 1376 */
1378 if (PageAnon(page)) 1377 dec_mm_counter(mm, mm_counter(page));
1379 dec_mm_counter(mm, MM_ANONPAGES);
1380 else
1381 dec_mm_counter(mm, MM_FILEPAGES);
1382 } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { 1378 } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) {
1383 swp_entry_t entry; 1379 swp_entry_t entry;
1384 pte_t swp_pte; 1380 pte_t swp_pte;
@@ -1418,7 +1414,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1418 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1414 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1419 set_pte_at(mm, address, pte, swp_pte); 1415 set_pte_at(mm, address, pte, swp_pte);
1420 } else 1416 } else
1421 dec_mm_counter(mm, MM_FILEPAGES); 1417 dec_mm_counter(mm, mm_counter_file(page));
1422 1418
1423 page_remove_rmap(page); 1419 page_remove_rmap(page);
1424 page_cache_release(page); 1420 page_cache_release(page);
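
The rmap.c hunks replace open-coded PageAnon() checks with dec_mm_counter(mm, mm_counter(page)), letting one helper pick the right RSS counter at every call site. A simplified userspace sketch of that selection; the enum, the anon flag and the counter array are illustrative stand-ins (the real mm_counter() also distinguishes shmem pages).

#include <stdio.h>
#include <stdbool.h>

enum { MM_FILEPAGES, MM_ANONPAGES, NR_MM_COUNTERS };

struct page { bool anon; };

static int mm_counter(const struct page *page)
{
	return page->anon ? MM_ANONPAGES : MM_FILEPAGES;
}

int main(void)
{
	long rss[NR_MM_COUNTERS] = { 10, 5 };
	struct page anon_page = { .anon = true };

	/* was: if (PageAnon(page)) dec anon counter; else dec file counter */
	rss[mm_counter(&anon_page)]--;
	printf("file=%ld anon=%ld\n", rss[MM_FILEPAGES], rss[MM_ANONPAGES]);
	return 0;
}
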
diff --git a/mm/shmem.c b/mm/shmem.c
index 642471b0ddea..970ff5b80853 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -360,6 +360,87 @@ static int shmem_free_swap(struct address_space *mapping,
360} 360}
361 361
362/* 362/*
363 * Determine (in bytes) how many of the shmem object's pages mapped by the
364 * given offsets are swapped out.
365 *
366 * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
367 * as long as the inode doesn't go away and racy results are not a problem.
368 */
369unsigned long shmem_partial_swap_usage(struct address_space *mapping,
370 pgoff_t start, pgoff_t end)
371{
372 struct radix_tree_iter iter;
373 void **slot;
374 struct page *page;
375 unsigned long swapped = 0;
376
377 rcu_read_lock();
378
379restart:
380 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
381 if (iter.index >= end)
382 break;
383
384 page = radix_tree_deref_slot(slot);
385
386 /*
 387 * This should only happen at index 0, so we
388 * don't need to reset the counter, nor do we risk infinite
389 * restarts.
390 */
391 if (radix_tree_deref_retry(page))
392 goto restart;
393
394 if (radix_tree_exceptional_entry(page))
395 swapped++;
396
397 if (need_resched()) {
398 cond_resched_rcu();
399 start = iter.index + 1;
400 goto restart;
401 }
402 }
403
404 rcu_read_unlock();
405
406 return swapped << PAGE_SHIFT;
407}
408
409/*
410 * Determine (in bytes) how many of the shmem object's pages mapped by the
 411 * given vma are swapped out.
412 *
413 * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
414 * as long as the inode doesn't go away and racy results are not a problem.
415 */
416unsigned long shmem_swap_usage(struct vm_area_struct *vma)
417{
418 struct inode *inode = file_inode(vma->vm_file);
419 struct shmem_inode_info *info = SHMEM_I(inode);
420 struct address_space *mapping = inode->i_mapping;
421 unsigned long swapped;
422
423 /* Be careful as we don't hold info->lock */
424 swapped = READ_ONCE(info->swapped);
425
426 /*
427 * The easier cases are when the shmem object has nothing in swap, or
428 * the vma maps it whole. Then we can simply use the stats that we
429 * already track.
430 */
431 if (!swapped)
432 return 0;
433
434 if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
435 return swapped << PAGE_SHIFT;
436
437 /* Here comes the more involved part */
438 return shmem_partial_swap_usage(mapping,
439 linear_page_index(vma, vma->vm_start),
440 linear_page_index(vma, vma->vm_end));
441}
442
443/*
363 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. 444 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
364 */ 445 */
365void shmem_unlock_mapping(struct address_space *mapping) 446void shmem_unlock_mapping(struct address_space *mapping)
@@ -3064,7 +3145,7 @@ static int shmem_init_inodecache(void)
3064{ 3145{
3065 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 3146 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
3066 sizeof(struct shmem_inode_info), 3147 sizeof(struct shmem_inode_info),
3067 0, SLAB_PANIC, shmem_init_inode); 3148 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
3068 return 0; 3149 return 0;
3069} 3150}
3070 3151
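
shmem_partial_swap_usage() above walks the mapping's radix tree and counts exceptional (swap) entries between two offsets, returning the result in bytes. A userspace sketch of that counting, with the radix tree and RCU replaced by a plain flag array and PAGE_SHIFT assumed to be 12 for the example.

#include <stdio.h>

#define PAGE_SHIFT 12

/* 1 = this slot holds a swap entry (an "exceptional" radix tree entry) */
static const int swapped_slot[16] = { 0, 1, 1, 0, 0, 0, 1, 0 };

static unsigned long partial_swap_usage(unsigned long start, unsigned long end)
{
	unsigned long idx, swapped = 0;

	for (idx = start; idx < end; idx++)
		if (swapped_slot[idx])
			swapped++;
	return swapped << PAGE_SHIFT;	/* pages -> bytes */
}

int main(void)
{
	printf("%lu bytes swapped in [1, 8)\n", partial_swap_usage(1, 8));
	return 0;
}
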
diff --git a/mm/slab.c b/mm/slab.c
index 4765c97ce690..6ecc697a8bc4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2756,6 +2756,21 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2756#define cache_free_debugcheck(x,objp,z) (objp) 2756#define cache_free_debugcheck(x,objp,z) (objp)
2757#endif 2757#endif
2758 2758
2759static struct page *get_first_slab(struct kmem_cache_node *n)
2760{
2761 struct page *page;
2762
2763 page = list_first_entry_or_null(&n->slabs_partial,
2764 struct page, lru);
2765 if (!page) {
2766 n->free_touched = 1;
2767 page = list_first_entry_or_null(&n->slabs_free,
2768 struct page, lru);
2769 }
2770
2771 return page;
2772}
2773
2759static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, 2774static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
2760 bool force_refill) 2775 bool force_refill)
2761{ 2776{
@@ -2791,18 +2806,12 @@ retry:
2791 } 2806 }
2792 2807
2793 while (batchcount > 0) { 2808 while (batchcount > 0) {
2794 struct list_head *entry;
2795 struct page *page; 2809 struct page *page;
2796 /* Get slab alloc is to come from. */ 2810 /* Get slab alloc is to come from. */
2797 entry = n->slabs_partial.next; 2811 page = get_first_slab(n);
2798 if (entry == &n->slabs_partial) { 2812 if (!page)
2799 n->free_touched = 1; 2813 goto must_grow;
2800 entry = n->slabs_free.next;
2801 if (entry == &n->slabs_free)
2802 goto must_grow;
2803 }
2804 2814
2805 page = list_entry(entry, struct page, lru);
2806 check_spinlock_acquired(cachep); 2815 check_spinlock_acquired(cachep);
2807 2816
2808 /* 2817 /*
@@ -3085,7 +3094,6 @@ retry:
3085static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3094static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3086 int nodeid) 3095 int nodeid)
3087{ 3096{
3088 struct list_head *entry;
3089 struct page *page; 3097 struct page *page;
3090 struct kmem_cache_node *n; 3098 struct kmem_cache_node *n;
3091 void *obj; 3099 void *obj;
@@ -3098,15 +3106,10 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3098retry: 3106retry:
3099 check_irq_off(); 3107 check_irq_off();
3100 spin_lock(&n->list_lock); 3108 spin_lock(&n->list_lock);
3101 entry = n->slabs_partial.next; 3109 page = get_first_slab(n);
3102 if (entry == &n->slabs_partial) { 3110 if (!page)
3103 n->free_touched = 1; 3111 goto must_grow;
3104 entry = n->slabs_free.next;
3105 if (entry == &n->slabs_free)
3106 goto must_grow;
3107 }
3108 3112
3109 page = list_entry(entry, struct page, lru);
3110 check_spinlock_acquired_node(cachep, nodeid); 3113 check_spinlock_acquired_node(cachep, nodeid);
3111 3114
3112 STATS_INC_NODEALLOCS(cachep); 3115 STATS_INC_NODEALLOCS(cachep);
@@ -3338,17 +3341,12 @@ free_done:
3338#if STATS 3341#if STATS
3339 { 3342 {
3340 int i = 0; 3343 int i = 0;
3341 struct list_head *p; 3344 struct page *page;
3342
3343 p = n->slabs_free.next;
3344 while (p != &(n->slabs_free)) {
3345 struct page *page;
3346 3345
3347 page = list_entry(p, struct page, lru); 3346 list_for_each_entry(page, &n->slabs_free, lru) {
3348 BUG_ON(page->active); 3347 BUG_ON(page->active);
3349 3348
3350 i++; 3349 i++;
3351 p = p->next;
3352 } 3350 }
3353 STATS_SET_FREEABLE(cachep, i); 3351 STATS_SET_FREEABLE(cachep, i);
3354 } 3352 }
diff --git a/mm/slab.h b/mm/slab.h
index 7b6087197997..c63b8699cfa3 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -128,10 +128,11 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
128 128
129#if defined(CONFIG_SLAB) 129#if defined(CONFIG_SLAB)
130#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ 130#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
131 SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) 131 SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
132 SLAB_NOTRACK | SLAB_ACCOUNT)
132#elif defined(CONFIG_SLUB) 133#elif defined(CONFIG_SLUB)
133#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ 134#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
134 SLAB_TEMPORARY | SLAB_NOTRACK) 135 SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT)
135#else 136#else
136#define SLAB_CACHE_FLAGS (0) 137#define SLAB_CACHE_FLAGS (0)
137#endif 138#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3c6a86b4ec25..e016178063e1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -37,7 +37,8 @@ struct kmem_cache *kmem_cache;
37 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ 37 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
38 SLAB_FAILSLAB) 38 SLAB_FAILSLAB)
39 39
40#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK) 40#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
41 SLAB_NOTRACK | SLAB_ACCOUNT)
41 42
42/* 43/*
43 * Merge control. If this is set then no merging of slab caches will occur. 44 * Merge control. If this is set then no merging of slab caches will occur.
diff --git a/mm/slub.c b/mm/slub.c
index 46997517406e..2d0e610d195a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5362,6 +5362,8 @@ static char *create_unique_id(struct kmem_cache *s)
5362 *p++ = 'F'; 5362 *p++ = 'F';
5363 if (!(s->flags & SLAB_NOTRACK)) 5363 if (!(s->flags & SLAB_NOTRACK))
5364 *p++ = 't'; 5364 *p++ = 't';
5365 if (s->flags & SLAB_ACCOUNT)
5366 *p++ = 'A';
5365 if (p != name + 1) 5367 if (p != name + 1)
5366 *p++ = '-'; 5368 *p++ = '-';
5367 p += sprintf(p, "%07d", s->size); 5369 p += sprintf(p, "%07d", s->size);
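
The slub.c hunk adds an 'A' to the cache's unique id when SLAB_ACCOUNT is set. The sketch below shows the flag-suffix string building pattern that code uses; the flag values and the build_id() name are invented for the example.

#include <stdio.h>

#define SLAB_CACHE_DMA	0x1
#define SLAB_ACCOUNT	0x2

static void build_id(char *p, unsigned long flags, int size)
{
	char *name = p;

	*p++ = ':';
	if (flags & SLAB_CACHE_DMA)
		*p++ = 'd';
	if (flags & SLAB_ACCOUNT)
		*p++ = 'A';		/* new suffix for accounted caches */
	if (p != name + 1)
		*p++ = '-';
	p += sprintf(p, "%07d", size);
	*p = '\0';
}

int main(void)
{
	char id[32];

	build_id(id, SLAB_ACCOUNT, 192);
	printf("%s\n", id);		/* :A-0000192 */
	return 0;
}
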
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 58877312cf6b..e6b8591a3ed2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -165,8 +165,6 @@ static void discard_swap_cluster(struct swap_info_struct *si,
165 int found_extent = 0; 165 int found_extent = 0;
166 166
167 while (nr_pages) { 167 while (nr_pages) {
168 struct list_head *lh;
169
170 if (se->start_page <= start_page && 168 if (se->start_page <= start_page &&
171 start_page < se->start_page + se->nr_pages) { 169 start_page < se->start_page + se->nr_pages) {
172 pgoff_t offset = start_page - se->start_page; 170 pgoff_t offset = start_page - se->start_page;
@@ -188,8 +186,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
188 break; 186 break;
189 } 187 }
190 188
191 lh = se->list.next; 189 se = list_next_entry(se, list);
192 se = list_entry(lh, struct swap_extent, list);
193 } 190 }
194} 191}
195 192
@@ -903,7 +900,7 @@ int swp_swapcount(swp_entry_t entry)
903 VM_BUG_ON(page_private(page) != SWP_CONTINUED); 900 VM_BUG_ON(page_private(page) != SWP_CONTINUED);
904 901
905 do { 902 do {
906 page = list_entry(page->lru.next, struct page, lru); 903 page = list_next_entry(page, lru);
907 map = kmap_atomic(page); 904 map = kmap_atomic(page);
908 tmp_count = map[offset]; 905 tmp_count = map[offset];
909 kunmap_atomic(map); 906 kunmap_atomic(map);
@@ -1633,14 +1630,11 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1633 se = start_se; 1630 se = start_se;
1634 1631
1635 for ( ; ; ) { 1632 for ( ; ; ) {
1636 struct list_head *lh;
1637
1638 if (se->start_page <= offset && 1633 if (se->start_page <= offset &&
1639 offset < (se->start_page + se->nr_pages)) { 1634 offset < (se->start_page + se->nr_pages)) {
1640 return se->start_block + (offset - se->start_page); 1635 return se->start_block + (offset - se->start_page);
1641 } 1636 }
1642 lh = se->list.next; 1637 se = list_next_entry(se, list);
1643 se = list_entry(lh, struct swap_extent, list);
1644 sis->curr_swap_extent = se; 1638 sis->curr_swap_extent = se;
1645 BUG_ON(se == start_se); /* It *must* be present */ 1639 BUG_ON(se == start_se); /* It *must* be present */
1646 } 1640 }
@@ -1664,7 +1658,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1664 while (!list_empty(&sis->first_swap_extent.list)) { 1658 while (!list_empty(&sis->first_swap_extent.list)) {
1665 struct swap_extent *se; 1659 struct swap_extent *se;
1666 1660
1667 se = list_entry(sis->first_swap_extent.list.next, 1661 se = list_first_entry(&sis->first_swap_extent.list,
1668 struct swap_extent, list); 1662 struct swap_extent, list);
1669 list_del(&se->list); 1663 list_del(&se->list);
1670 kfree(se); 1664 kfree(se);
@@ -2959,11 +2953,10 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
2959 struct page *head; 2953 struct page *head;
2960 head = vmalloc_to_page(si->swap_map + offset); 2954 head = vmalloc_to_page(si->swap_map + offset);
2961 if (page_private(head)) { 2955 if (page_private(head)) {
2962 struct list_head *this, *next; 2956 struct page *page, *next;
2963 list_for_each_safe(this, next, &head->lru) { 2957
2964 struct page *page; 2958 list_for_each_entry_safe(page, next, &head->lru, lru) {
2965 page = list_entry(this, struct page, lru); 2959 list_del(&page->lru);
2966 list_del(this);
2967 __free_page(page); 2960 __free_page(page);
2968 } 2961 }
2969 } 2962 }

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8e3c9c5a3042..58ceeb107960 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -441,8 +441,7 @@ nocache:
441 if (list_is_last(&first->list, &vmap_area_list)) 441 if (list_is_last(&first->list, &vmap_area_list))
442 goto found; 442 goto found;
443 443
444 first = list_entry(first->list.next, 444 first = list_next_entry(first, list);
445 struct vmap_area, list);
446 } 445 }
447 446
448found: 447found:
@@ -1477,13 +1476,10 @@ static void __vunmap(const void *addr, int deallocate_pages)
1477 struct page *page = area->pages[i]; 1476 struct page *page = area->pages[i];
1478 1477
1479 BUG_ON(!page); 1478 BUG_ON(!page);
1480 __free_page(page); 1479 __free_kmem_pages(page, 0);
1481 } 1480 }
1482 1481
1483 if (area->flags & VM_VPAGES) 1482 kvfree(area->pages);
1484 vfree(area->pages);
1485 else
1486 kfree(area->pages);
1487 } 1483 }
1488 1484
1489 kfree(area); 1485 kfree(area);
@@ -1593,7 +1589,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1593 if (array_size > PAGE_SIZE) { 1589 if (array_size > PAGE_SIZE) {
1594 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, 1590 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
1595 PAGE_KERNEL, node, area->caller); 1591 PAGE_KERNEL, node, area->caller);
1596 area->flags |= VM_VPAGES;
1597 } else { 1592 } else {
1598 pages = kmalloc_node(array_size, nested_gfp, node); 1593 pages = kmalloc_node(array_size, nested_gfp, node);
1599 } 1594 }
@@ -1608,9 +1603,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1608 struct page *page; 1603 struct page *page;
1609 1604
1610 if (node == NUMA_NO_NODE) 1605 if (node == NUMA_NO_NODE)
1611 page = alloc_page(alloc_mask); 1606 page = alloc_kmem_pages(alloc_mask, order);
1612 else 1607 else
1613 page = alloc_pages_node(node, alloc_mask, order); 1608 page = alloc_kmem_pages_node(node, alloc_mask, order);
1614 1609
1615 if (unlikely(!page)) { 1610 if (unlikely(!page)) {
1616 /* Successfully allocated i pages, free them in __vunmap() */ 1611 /* Successfully allocated i pages, free them in __vunmap() */
@@ -2559,10 +2554,10 @@ static void *s_start(struct seq_file *m, loff_t *pos)
2559 struct vmap_area *va; 2554 struct vmap_area *va;
2560 2555
2561 spin_lock(&vmap_area_lock); 2556 spin_lock(&vmap_area_lock);
2562 va = list_entry((&vmap_area_list)->next, typeof(*va), list); 2557 va = list_first_entry(&vmap_area_list, typeof(*va), list);
2563 while (n > 0 && &va->list != &vmap_area_list) { 2558 while (n > 0 && &va->list != &vmap_area_list) {
2564 n--; 2559 n--;
2565 va = list_entry(va->list.next, typeof(*va), list); 2560 va = list_next_entry(va, list);
2566 } 2561 }
2567 if (!n && &va->list != &vmap_area_list) 2562 if (!n && &va->list != &vmap_area_list)
2568 return va; 2563 return va;
@@ -2576,7 +2571,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2576 struct vmap_area *va = p, *next; 2571 struct vmap_area *va = p, *next;
2577 2572
2578 ++*pos; 2573 ++*pos;
2579 next = list_entry(va->list.next, typeof(*va), list); 2574 next = list_next_entry(va, list);
2580 if (&next->list != &vmap_area_list) 2575 if (&next->list != &vmap_area_list)
2581 return next; 2576 return next;
2582 2577
@@ -2651,7 +2646,7 @@ static int s_show(struct seq_file *m, void *p)
2651 if (v->flags & VM_USERMAP) 2646 if (v->flags & VM_USERMAP)
2652 seq_puts(m, " user"); 2647 seq_puts(m, " user");
2653 2648
2654 if (v->flags & VM_VPAGES) 2649 if (is_vmalloc_addr(v->pages))
2655 seq_puts(m, " vpages"); 2650 seq_puts(m, " vpages");
2656 2651
2657 show_numa_info(m, v); 2652 show_numa_info(m, v);
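
The __vunmap() hunk drops the VM_VPAGES flag and frees the pages[] array with kvfree(), which works whether the array came from kmalloc() or vmalloc(). A rough userspace sketch of that single-free-helper idea; the allocation tagging is faked with a header word here, whereas the kernel's is_vmalloc_addr() inspects the address range.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* crude stand-in: tag each allocation with how it was (pretend-)made */
struct alloc { int vmalloced; char data[]; };

static void *kvmalloc_sketch(size_t size)
{
	struct alloc *a = malloc(sizeof(*a) + size);

	if (!a)
		return NULL;
	a->vmalloced = size > 4096;	/* pretend large requests needed vmalloc() */
	return a->data;
}

static void kvfree_sketch(void *p)
{
	struct alloc *a = (struct alloc *)((char *)p - offsetof(struct alloc, data));

	/* one helper frees both kinds, as kvfree() does for kfree()/vfree() */
	printf("freeing %s allocation\n", a->vmalloced ? "vmalloc-style" : "kmalloc-style");
	free(a);
}

int main(void)
{
	void *small = kvmalloc_sketch(128);
	void *big = kvmalloc_sketch(64 * 1024);

	kvfree_sketch(small);
	kvfree_sketch(big);
	return 0;
}
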
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index c5afd573d7da..9a6c0704211c 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -137,14 +137,11 @@ struct vmpressure_event {
137}; 137};
138 138
139static bool vmpressure_event(struct vmpressure *vmpr, 139static bool vmpressure_event(struct vmpressure *vmpr,
140 unsigned long scanned, unsigned long reclaimed) 140 enum vmpressure_levels level)
141{ 141{
142 struct vmpressure_event *ev; 142 struct vmpressure_event *ev;
143 enum vmpressure_levels level;
144 bool signalled = false; 143 bool signalled = false;
145 144
146 level = vmpressure_calc_level(scanned, reclaimed);
147
148 mutex_lock(&vmpr->events_lock); 145 mutex_lock(&vmpr->events_lock);
149 146
150 list_for_each_entry(ev, &vmpr->events, node) { 147 list_for_each_entry(ev, &vmpr->events, node) {
@@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work)
164 struct vmpressure *vmpr = work_to_vmpressure(work); 161 struct vmpressure *vmpr = work_to_vmpressure(work);
165 unsigned long scanned; 162 unsigned long scanned;
166 unsigned long reclaimed; 163 unsigned long reclaimed;
164 enum vmpressure_levels level;
167 165
168 spin_lock(&vmpr->sr_lock); 166 spin_lock(&vmpr->sr_lock);
169 /* 167 /*
@@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work)
174 * here. No need for any locks here since we don't care if 172 * here. No need for any locks here since we don't care if
175 * vmpr->reclaimed is in sync. 173 * vmpr->reclaimed is in sync.
176 */ 174 */
177 scanned = vmpr->scanned; 175 scanned = vmpr->tree_scanned;
178 if (!scanned) { 176 if (!scanned) {
179 spin_unlock(&vmpr->sr_lock); 177 spin_unlock(&vmpr->sr_lock);
180 return; 178 return;
181 } 179 }
182 180
183 reclaimed = vmpr->reclaimed; 181 reclaimed = vmpr->tree_reclaimed;
184 vmpr->scanned = 0; 182 vmpr->tree_scanned = 0;
185 vmpr->reclaimed = 0; 183 vmpr->tree_reclaimed = 0;
186 spin_unlock(&vmpr->sr_lock); 184 spin_unlock(&vmpr->sr_lock);
187 185
186 level = vmpressure_calc_level(scanned, reclaimed);
187
188 do { 188 do {
189 if (vmpressure_event(vmpr, scanned, reclaimed)) 189 if (vmpressure_event(vmpr, level))
190 break; 190 break;
191 /* 191 /*
192 * If not handled, propagate the event upward into the 192 * If not handled, propagate the event upward into the
@@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work)
199 * vmpressure() - Account memory pressure through scanned/reclaimed ratio 199 * vmpressure() - Account memory pressure through scanned/reclaimed ratio
200 * @gfp: reclaimer's gfp mask 200 * @gfp: reclaimer's gfp mask
201 * @memcg: cgroup memory controller handle 201 * @memcg: cgroup memory controller handle
202 * @tree: legacy subtree mode
202 * @scanned: number of pages scanned 203 * @scanned: number of pages scanned
203 * @reclaimed: number of pages reclaimed 204 * @reclaimed: number of pages reclaimed
204 * 205 *
@@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work)
206 * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw 207 * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
207 * pressure index is then further refined and averaged over time. 208 * pressure index is then further refined and averaged over time.
208 * 209 *
210 * If @tree is set, vmpressure is in traditional userspace reporting
211 * mode: @memcg is considered the pressure root and userspace is
212 * notified of the entire subtree's reclaim efficiency.
213 *
214 * If @tree is not set, reclaim efficiency is recorded for @memcg, and
215 * only in-kernel users are notified.
216 *
209 * This function does not return any value. 217 * This function does not return any value.
210 */ 218 */
211void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, 219void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
212 unsigned long scanned, unsigned long reclaimed) 220 unsigned long scanned, unsigned long reclaimed)
213{ 221{
214 struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 222 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
@@ -238,15 +246,47 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
238 if (!scanned) 246 if (!scanned)
239 return; 247 return;
240 248
241 spin_lock(&vmpr->sr_lock); 249 if (tree) {
242 vmpr->scanned += scanned; 250 spin_lock(&vmpr->sr_lock);
243 vmpr->reclaimed += reclaimed; 251 vmpr->tree_scanned += scanned;
244 scanned = vmpr->scanned; 252 vmpr->tree_reclaimed += reclaimed;
245 spin_unlock(&vmpr->sr_lock); 253 scanned = vmpr->scanned;
254 spin_unlock(&vmpr->sr_lock);
246 255
247 if (scanned < vmpressure_win) 256 if (scanned < vmpressure_win)
248 return; 257 return;
249 schedule_work(&vmpr->work); 258 schedule_work(&vmpr->work);
259 } else {
260 enum vmpressure_levels level;
261
262 /* For now, no users for root-level efficiency */
263 if (!memcg || memcg == root_mem_cgroup)
264 return;
265
266 spin_lock(&vmpr->sr_lock);
267 scanned = vmpr->scanned += scanned;
268 reclaimed = vmpr->reclaimed += reclaimed;
269 if (scanned < vmpressure_win) {
270 spin_unlock(&vmpr->sr_lock);
271 return;
272 }
273 vmpr->scanned = vmpr->reclaimed = 0;
274 spin_unlock(&vmpr->sr_lock);
275
276 level = vmpressure_calc_level(scanned, reclaimed);
277
278 if (level > VMPRESSURE_LOW) {
279 /*
280 * Let the socket buffer allocator know that
281 * we are having trouble reclaiming LRU pages.
282 *
283 * For hysteresis keep the pressure state
284 * asserted for a second in which subsequent
285 * pressure events can occur.
286 */
287 memcg->socket_pressure = jiffies + HZ;
288 }
289 }
250} 290}
251 291
252/** 292/**
@@ -276,7 +316,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
276 * to the vmpressure() basically means that we signal 'critical' 316 * to the vmpressure() basically means that we signal 'critical'
277 * level. 317 * level.
278 */ 318 */
279 vmpressure(gfp, memcg, vmpressure_win, 0); 319 vmpressure(gfp, memcg, true, vmpressure_win, 0);
280} 320}
281 321
282/** 322/**
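
In the vmpressure.c changes the level is now computed once by the caller, from the scanned/reclaimed ratio, and handed to vmpressure_event(); non-tree pressure above VMPRESSURE_LOW then asserts socket pressure for a second. A sketch of the level calculation; the 60/95 thresholds are assumptions for the example, not values read from the patch.

#include <stdio.h>

enum level { VMPRESSURE_LOW, VMPRESSURE_MEDIUM, VMPRESSURE_CRITICAL };

/* pressure grows as the reclaimed/scanned ratio shrinks */
static enum level calc_level(unsigned long scanned, unsigned long reclaimed)
{
	unsigned long pressure = 100 - (100 * reclaimed) / scanned;

	if (pressure >= 95)
		return VMPRESSURE_CRITICAL;
	if (pressure >= 60)
		return VMPRESSURE_MEDIUM;
	return VMPRESSURE_LOW;
}

int main(void)
{
	/* 512 pages scanned, only 16 reclaimed: pressure ~97, i.e. critical */
	printf("level = %d\n", calc_level(512, 16));
	return 0;
}
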
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2aec4241b42a..108bd119f2f6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -106,8 +106,6 @@ struct scan_control {
106 unsigned long nr_reclaimed; 106 unsigned long nr_reclaimed;
107}; 107};
108 108
109#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
110
111#ifdef ARCH_HAS_PREFETCH 109#ifdef ARCH_HAS_PREFETCH
112#define prefetch_prev_lru_page(_page, _base, _field) \ 110#define prefetch_prev_lru_page(_page, _base, _field) \
113 do { \ 111 do { \
@@ -197,11 +195,13 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
197 unsigned long nr; 195 unsigned long nr;
198 196
199 nr = zone_page_state(zone, NR_ACTIVE_FILE) + 197 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
200 zone_page_state(zone, NR_INACTIVE_FILE); 198 zone_page_state(zone, NR_INACTIVE_FILE) +
199 zone_page_state(zone, NR_ISOLATED_FILE);
201 200
202 if (get_nr_swap_pages() > 0) 201 if (get_nr_swap_pages() > 0)
203 nr += zone_page_state(zone, NR_ACTIVE_ANON) + 202 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
204 zone_page_state(zone, NR_INACTIVE_ANON); 203 zone_page_state(zone, NR_INACTIVE_ANON) +
204 zone_page_state(zone, NR_ISOLATED_ANON);
205 205
206 return nr; 206 return nr;
207} 207}
@@ -594,7 +594,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
594 /* synchronous write or broken a_ops? */ 594 /* synchronous write or broken a_ops? */
595 ClearPageReclaim(page); 595 ClearPageReclaim(page);
596 } 596 }
597 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); 597 trace_mm_vmscan_writepage(page);
598 inc_zone_page_state(page, NR_VMSCAN_WRITE); 598 inc_zone_page_state(page, NR_VMSCAN_WRITE);
599 return PAGE_SUCCESS; 599 return PAGE_SUCCESS;
600 } 600 }
@@ -1426,6 +1426,7 @@ int isolate_lru_page(struct page *page)
1426 int ret = -EBUSY; 1426 int ret = -EBUSY;
1427 1427
1428 VM_BUG_ON_PAGE(!page_count(page), page); 1428 VM_BUG_ON_PAGE(!page_count(page), page);
1429 VM_BUG_ON_PAGE(PageTail(page), page);
1429 1430
1430 if (PageLRU(page)) { 1431 if (PageLRU(page)) {
1431 struct zone *zone = page_zone(page); 1432 struct zone *zone = page_zone(page);
@@ -1691,11 +1692,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1691 current_may_throttle()) 1692 current_may_throttle())
1692 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1693 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1693 1694
1694 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1695 trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed,
1695 zone_idx(zone), 1696 sc->priority, file);
1696 nr_scanned, nr_reclaimed,
1697 sc->priority,
1698 trace_shrink_flags(file));
1699 return nr_reclaimed; 1697 return nr_reclaimed;
1700} 1698}
1701 1699
@@ -2046,10 +2044,16 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
2046 } 2044 }
2047 2045
2048 /* 2046 /*
2049 * There is enough inactive page cache, do not reclaim 2047 * If there is enough inactive page cache, i.e. if the size of the
2050 * anything from the anonymous working set right now. 2048 * inactive list is greater than that of the active list *and* the
2049 * inactive list actually has some pages to scan on this priority, we
2050 * do not reclaim anything from the anonymous working set right now.
2051 * Without the second condition we could end up never scanning an
2052 * lruvec even if it has plenty of old anonymous pages unless the
2053 * system is under heavy pressure.
2051 */ 2054 */
2052 if (!inactive_file_is_low(lruvec)) { 2055 if (!inactive_file_is_low(lruvec) &&
2056 get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
2053 scan_balance = SCAN_FILE; 2057 scan_balance = SCAN_FILE;
2054 goto out; 2058 goto out;
2055 } 2059 }
@@ -2393,6 +2397,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2393 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2397 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2394 do { 2398 do {
2395 unsigned long lru_pages; 2399 unsigned long lru_pages;
2400 unsigned long reclaimed;
2396 unsigned long scanned; 2401 unsigned long scanned;
2397 struct lruvec *lruvec; 2402 struct lruvec *lruvec;
2398 int swappiness; 2403 int swappiness;
@@ -2405,6 +2410,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2405 2410
2406 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2411 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2407 swappiness = mem_cgroup_swappiness(memcg); 2412 swappiness = mem_cgroup_swappiness(memcg);
2413 reclaimed = sc->nr_reclaimed;
2408 scanned = sc->nr_scanned; 2414 scanned = sc->nr_scanned;
2409 2415
2410 shrink_lruvec(lruvec, swappiness, sc, &lru_pages); 2416 shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
@@ -2415,6 +2421,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2415 memcg, sc->nr_scanned - scanned, 2421 memcg, sc->nr_scanned - scanned,
2416 lru_pages); 2422 lru_pages);
2417 2423
2424 /* Record the group's reclaim efficiency */
2425 vmpressure(sc->gfp_mask, memcg, false,
2426 sc->nr_scanned - scanned,
2427 sc->nr_reclaimed - reclaimed);
2428
2418 /* 2429 /*
2419 * Direct reclaim and kswapd have to scan all memory 2430 * Direct reclaim and kswapd have to scan all memory
2420 * cgroups to fulfill the overall scan target for the 2431 * cgroups to fulfill the overall scan target for the
@@ -2446,7 +2457,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2446 reclaim_state->reclaimed_slab = 0; 2457 reclaim_state->reclaimed_slab = 0;
2447 } 2458 }
2448 2459
2449 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2460 /* Record the subtree's reclaim efficiency */
2461 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
2450 sc->nr_scanned - nr_scanned, 2462 sc->nr_scanned - nr_scanned,
2451 sc->nr_reclaimed - nr_reclaimed); 2463 sc->nr_reclaimed - nr_reclaimed);
2452 2464
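
The get_scan_count() hunk adds a second condition: file-only scanning is chosen only when the inactive file list still yields pages at the current priority (size >> priority is non-zero), so an lruvec full of old anonymous pages is not skipped forever. A tiny sketch of that check with invented numbers and inactive_file_is_low() reduced to a simple comparison.

#include <stdio.h>
#include <stdbool.h>

static bool scan_file_only(unsigned long inactive_file,
			   unsigned long active_file, int priority)
{
	bool inactive_is_low = inactive_file <= active_file;

	/* both conditions must hold before anon reclaim is skipped */
	return !inactive_is_low && (inactive_file >> priority);
}

int main(void)
{
	/* plenty of inactive file pages at this priority: file only */
	printf("%d\n", scan_file_only(4096, 1024, 12));	/* 1 */
	/* inactive list too small at this priority: also scan anon */
	printf("%d\n", scan_file_only(2048, 1024, 12));	/* 0, since 2048 >> 12 == 0 */
	return 0;
}
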
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c54fd2924f25..83a003bc3cae 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -460,7 +460,7 @@ static int fold_diff(int *diff)
460 * 460 *
461 * The function returns the number of global counters updated. 461 * The function returns the number of global counters updated.
462 */ 462 */
463static int refresh_cpu_vm_stats(void) 463static int refresh_cpu_vm_stats(bool do_pagesets)
464{ 464{
465 struct zone *zone; 465 struct zone *zone;
466 int i; 466 int i;
@@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void)
484#endif 484#endif
485 } 485 }
486 } 486 }
487 cond_resched();
488#ifdef CONFIG_NUMA 487#ifdef CONFIG_NUMA
489 /* 488 if (do_pagesets) {
490 * Deal with draining the remote pageset of this 489 cond_resched();
491 * processor 490 /*
492 * 491 * Deal with draining the remote pageset of this
493 * Check if there are pages remaining in this pageset 492 * processor
494 * if not then there is nothing to expire. 493 *
495 */ 494 * Check if there are pages remaining in this pageset
496 if (!__this_cpu_read(p->expire) || 495 * if not then there is nothing to expire.
496 */
497 if (!__this_cpu_read(p->expire) ||
497 !__this_cpu_read(p->pcp.count)) 498 !__this_cpu_read(p->pcp.count))
498 continue; 499 continue;
499 500
500 /* 501 /*
501 * We never drain zones local to this processor. 502 * We never drain zones local to this processor.
502 */ 503 */
503 if (zone_to_nid(zone) == numa_node_id()) { 504 if (zone_to_nid(zone) == numa_node_id()) {
504 __this_cpu_write(p->expire, 0); 505 __this_cpu_write(p->expire, 0);
505 continue; 506 continue;
506 } 507 }
507 508
508 if (__this_cpu_dec_return(p->expire)) 509 if (__this_cpu_dec_return(p->expire))
509 continue; 510 continue;
510 511
511 if (__this_cpu_read(p->pcp.count)) { 512 if (__this_cpu_read(p->pcp.count)) {
512 drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); 513 drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
513 changes++; 514 changes++;
515 }
514 } 516 }
515#endif 517#endif
516 } 518 }
@@ -1386,7 +1388,7 @@ static cpumask_var_t cpu_stat_off;
1386 1388
1387static void vmstat_update(struct work_struct *w) 1389static void vmstat_update(struct work_struct *w)
1388{ 1390{
1389 if (refresh_cpu_vm_stats()) { 1391 if (refresh_cpu_vm_stats(true)) {
1390 /* 1392 /*
1391 * Counters were updated so we expect more updates 1393 * Counters were updated so we expect more updates
1392 * to occur in the future. Keep on running the 1394 * to occur in the future. Keep on running the
@@ -1418,6 +1420,23 @@ static void vmstat_update(struct work_struct *w)
1418} 1420}
1419 1421
1420/* 1422/*
1423 * Switch off vmstat processing and then fold all the remaining differentials
1424 * until the diffs stay at zero. The function is used by NOHZ and can only be
1425 * invoked when tick processing is not active.
1426 */
1427void quiet_vmstat(void)
1428{
1429 if (system_state != SYSTEM_RUNNING)
1430 return;
1431
1432 do {
1433 if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
1434 cancel_delayed_work(this_cpu_ptr(&vmstat_work));
1435
1436 } while (refresh_cpu_vm_stats(false));
1437}
1438
1439/*
1421 * Check if the diffs for a certain cpu indicate that 1440 * Check if the diffs for a certain cpu indicate that
1422 * an update is needed. 1441 * an update is needed.
1423 */ 1442 */
@@ -1449,7 +1468,7 @@ static bool need_update(int cpu)
1449 */ 1468 */
1450static void vmstat_shepherd(struct work_struct *w); 1469static void vmstat_shepherd(struct work_struct *w);
1451 1470
1452static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); 1471static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
1453 1472
1454static void vmstat_shepherd(struct work_struct *w) 1473static void vmstat_shepherd(struct work_struct *w)
1455{ 1474{
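
quiet_vmstat() above keeps calling refresh_cpu_vm_stats(false) until a pass folds nothing, so a CPU entering NOHZ idle leaves no stale per-cpu differentials behind. A userspace sketch of that fold-until-quiet loop; the per-cpu state is modeled as a single array and the delayed-work cancellation is omitted.

#include <stdio.h>

#define NR_ITEMS 3

static long global_stat[NR_ITEMS];
static long cpu_diff[NR_ITEMS] = { 5, 0, -2 };

/* returns how many counters changed, like refresh_cpu_vm_stats() */
static int refresh_stats(void)
{
	int i, changes = 0;

	for (i = 0; i < NR_ITEMS; i++) {
		if (cpu_diff[i]) {
			global_stat[i] += cpu_diff[i];
			cpu_diff[i] = 0;
			changes++;
		}
	}
	return changes;
}

int main(void)
{
	while (refresh_stats())
		;	/* quiet_vmstat(): loop until the diffs stay at zero */
	printf("global: %ld %ld %ld\n", global_stat[0], global_stat[1], global_stat[2]);
	return 0;
}
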
diff --git a/mm/zbud.c b/mm/zbud.c
index d8a181fd779b..b42322e50f63 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -463,9 +463,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
463 spin_unlock(&pool->lock); 463 spin_unlock(&pool->lock);
464} 464}
465 465
466#define list_tail_entry(ptr, type, member) \
467 list_entry((ptr)->prev, type, member)
468
469/** 466/**
470 * zbud_reclaim_page() - evicts allocations from a pool page and frees it 467 * zbud_reclaim_page() - evicts allocations from a pool page and frees it
471 * @pool: pool from which a page will attempt to be evicted 468 * @pool: pool from which a page will attempt to be evicted
@@ -514,7 +511,7 @@ int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
514 return -EINVAL; 511 return -EINVAL;
515 } 512 }
516 for (i = 0; i < retries; i++) { 513 for (i = 0; i < retries; i++) {
517 zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru); 514 zhdr = list_last_entry(&pool->lru, struct zbud_header, lru);
518 list_del(&zhdr->lru); 515 list_del(&zhdr->lru);
519 list_del(&zhdr->buddy); 516 list_del(&zhdr->buddy);
520 /* Protect zbud page against free */ 517 /* Protect zbud page against free */
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 9f15bdd9163c..e7414cec220b 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -213,10 +213,10 @@ struct size_class {
213 int size; 213 int size;
214 unsigned int index; 214 unsigned int index;
215 215
216 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
217 int pages_per_zspage;
218 struct zs_size_stat stats; 216 struct zs_size_stat stats;
219 217
218 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
219 int pages_per_zspage;
220 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ 220 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
221 bool huge; 221 bool huge;
222}; 222};
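
The zsmalloc hunk reorders struct size_class so pages_per_zspage no longer sits between differently sized members, closing a 4-byte hole. The sketch below shows the same effect on a simplified struct; the field names and the printed sizes are illustrative and depend on the ABI (the numbers below assume a typical LP64 layout).

#include <stdio.h>

struct class_padded {
	int size;
	int index;
	int pages_per_zspage;	/* followed by 4 bytes of padding ... */
	void *stats;		/* ... so this pointer lands on an 8-byte boundary */
	char huge;
};

struct class_packed {
	int size;
	int index;
	void *stats;
	int pages_per_zspage;	/* now shares the tail slot with 'huge' */
	char huge;
};

int main(void)
{
	printf("padded: %zu bytes, packed: %zu bytes\n",
	       sizeof(struct class_padded), sizeof(struct class_packed));
	return 0;
}
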
diff --git a/net/core/sock.c b/net/core/sock.c
index 51270238e269..6c1c8bc93412 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -195,44 +195,6 @@ bool sk_net_capable(const struct sock *sk, int cap)
195} 195}
196EXPORT_SYMBOL(sk_net_capable); 196EXPORT_SYMBOL(sk_net_capable);
197 197
198
199#ifdef CONFIG_MEMCG_KMEM
200int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
201{
202 struct proto *proto;
203 int ret = 0;
204
205 mutex_lock(&proto_list_mutex);
206 list_for_each_entry(proto, &proto_list, node) {
207 if (proto->init_cgroup) {
208 ret = proto->init_cgroup(memcg, ss);
209 if (ret)
210 goto out;
211 }
212 }
213
214 mutex_unlock(&proto_list_mutex);
215 return ret;
216out:
217 list_for_each_entry_continue_reverse(proto, &proto_list, node)
218 if (proto->destroy_cgroup)
219 proto->destroy_cgroup(memcg);
220 mutex_unlock(&proto_list_mutex);
221 return ret;
222}
223
224void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
225{
226 struct proto *proto;
227
228 mutex_lock(&proto_list_mutex);
229 list_for_each_entry_reverse(proto, &proto_list, node)
230 if (proto->destroy_cgroup)
231 proto->destroy_cgroup(memcg);
232 mutex_unlock(&proto_list_mutex);
233}
234#endif
235
236/* 198/*
237 * Each address family might have different locking rules, so we have 199 * Each address family might have different locking rules, so we have
238 * one slock key per address family: 200 * one slock key per address family:
@@ -240,11 +202,6 @@ void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
240static struct lock_class_key af_family_keys[AF_MAX]; 202static struct lock_class_key af_family_keys[AF_MAX];
241static struct lock_class_key af_family_slock_keys[AF_MAX]; 203static struct lock_class_key af_family_slock_keys[AF_MAX];
242 204
243#if defined(CONFIG_MEMCG_KMEM)
244struct static_key memcg_socket_limit_enabled;
245EXPORT_SYMBOL(memcg_socket_limit_enabled);
246#endif
247
248/* 205/*
249 * Make lock validator output more readable. (we pre-construct these 206 * Make lock validator output more readable. (we pre-construct these
250 * strings build-time, so that runtime initialization of socket 207 * strings build-time, so that runtime initialization of socket
@@ -1507,12 +1464,6 @@ void sk_free(struct sock *sk)
1507} 1464}
1508EXPORT_SYMBOL(sk_free); 1465EXPORT_SYMBOL(sk_free);
1509 1466
1510static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1511{
1512 if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1513 sock_update_memcg(newsk);
1514}
1515
1516/** 1467/**
1517 * sk_clone_lock - clone a socket, and lock its clone 1468 * sk_clone_lock - clone a socket, and lock its clone
1518 * @sk: the socket to clone 1469 * @sk: the socket to clone
@@ -1607,7 +1558,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1607 sk_set_socket(newsk, NULL); 1558 sk_set_socket(newsk, NULL);
1608 newsk->sk_wq = NULL; 1559 newsk->sk_wq = NULL;
1609 1560
1610 sk_update_clone(sk, newsk); 1561 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1562 sock_update_memcg(newsk);
1611 1563
1612 if (newsk->sk_prot->sockets_allocated) 1564 if (newsk->sk_prot->sockets_allocated)
1613 sk_sockets_allocated_inc(newsk); 1565 sk_sockets_allocated_inc(newsk);
@@ -2089,27 +2041,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
2089 struct proto *prot = sk->sk_prot; 2041 struct proto *prot = sk->sk_prot;
2090 int amt = sk_mem_pages(size); 2042 int amt = sk_mem_pages(size);
2091 long allocated; 2043 long allocated;
2092 int parent_status = UNDER_LIMIT;
2093 2044
2094 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; 2045 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2095 2046
2096 allocated = sk_memory_allocated_add(sk, amt, &parent_status); 2047 allocated = sk_memory_allocated_add(sk, amt);
2048
2049 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2050 !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2051 goto suppress_allocation;
2097 2052
2098 /* Under limit. */ 2053 /* Under limit. */
2099 if (parent_status == UNDER_LIMIT && 2054 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2100 allocated <= sk_prot_mem_limits(sk, 0)) {
2101 sk_leave_memory_pressure(sk); 2055 sk_leave_memory_pressure(sk);
2102 return 1; 2056 return 1;
2103 } 2057 }
2104 2058
2105 /* Under pressure. (we or our parents) */ 2059 /* Under pressure. */
2106 if ((parent_status > SOFT_LIMIT) || 2060 if (allocated > sk_prot_mem_limits(sk, 1))
2107 allocated > sk_prot_mem_limits(sk, 1))
2108 sk_enter_memory_pressure(sk); 2061 sk_enter_memory_pressure(sk);
2109 2062
2110 /* Over hard limit (we or our parents) */ 2063 /* Over hard limit. */
2111 if ((parent_status == OVER_LIMIT) || 2064 if (allocated > sk_prot_mem_limits(sk, 2))
2112 (allocated > sk_prot_mem_limits(sk, 2)))
2113 goto suppress_allocation; 2065 goto suppress_allocation;
2114 2066
2115 /* guarantee minimum buffer size under pressure */ 2067 /* guarantee minimum buffer size under pressure */
@@ -2158,6 +2110,9 @@ suppress_allocation:
2158 2110
2159 sk_memory_allocated_sub(sk, amt); 2111 sk_memory_allocated_sub(sk, amt);
2160 2112
2113 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2114 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2115
2161 return 0; 2116 return 0;
2162} 2117}
2163EXPORT_SYMBOL(__sk_mem_schedule); 2118EXPORT_SYMBOL(__sk_mem_schedule);
@@ -2173,6 +2128,9 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
2173 sk_memory_allocated_sub(sk, amount); 2128 sk_memory_allocated_sub(sk, amount);
2174 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2129 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2175 2130
2131 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2132 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2133
2176 if (sk_under_memory_pressure(sk) && 2134 if (sk_under_memory_pressure(sk) &&
2177 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2135 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2178 sk_leave_memory_pressure(sk); 2136 sk_leave_memory_pressure(sk);
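
__sk_mem_schedule() above loses the parent_status plumbing: the memcg charge is attempted up front via mem_cgroup_charge_skmem(), and the three protocol limits are then compared directly against the global allocation. A schematic userspace sketch of the limit checks only; the memcg step is left out, and the limits, sizes and mem_schedule() name are invented.

#include <stdio.h>

static long memory_allocated;
static const long limits[3] = { 100, 150, 200 };	/* min, pressure, max */

/* returns 1 if the allocation may proceed, 0 if it must be suppressed */
static int mem_schedule(long amt)
{
	long allocated;

	memory_allocated += amt;
	allocated = memory_allocated;

	if (allocated <= limits[0])
		return 1;			/* under limit */
	if (allocated > limits[1])
		printf("entering memory pressure\n");
	if (allocated > limits[2]) {
		memory_allocated -= amt;	/* over hard limit: undo the charge */
		return 0;
	}
	return 1;
}

int main(void)
{
	printf("%d\n", mem_schedule(80));	/* under limit */
	printf("%d\n", mem_schedule(90));	/* under pressure, still allowed */
	printf("%d\n", mem_schedule(50));	/* over hard limit, suppressed */
	return 0;
}
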
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7bb1b091efd1..fd17eec93525 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -422,7 +422,8 @@ void tcp_init_sock(struct sock *sk)
422 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 422 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
423 423
424 local_bh_disable(); 424 local_bh_disable();
425 sock_update_memcg(sk); 425 if (mem_cgroup_sockets_enabled)
426 sock_update_memcg(sk);
426 sk_sockets_allocated_inc(sk); 427 sk_sockets_allocated_inc(sk);
427 local_bh_enable(); 428 local_bh_enable();
428} 429}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 65947c1f4733..c7d1fb50f381 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1818,7 +1818,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
1818 tcp_saved_syn_free(tp); 1818 tcp_saved_syn_free(tp);
1819 1819
1820 sk_sockets_allocated_dec(sk); 1820 sk_sockets_allocated_dec(sk);
1821 sock_release_memcg(sk); 1821
1822 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1823 sock_release_memcg(sk);
1822} 1824}
1823EXPORT_SYMBOL(tcp_v4_destroy_sock); 1825EXPORT_SYMBOL(tcp_v4_destroy_sock);
1824 1826
@@ -2342,11 +2344,6 @@ struct proto tcp_prot = {
2342 .compat_setsockopt = compat_tcp_setsockopt, 2344 .compat_setsockopt = compat_tcp_setsockopt,
2343 .compat_getsockopt = compat_tcp_getsockopt, 2345 .compat_getsockopt = compat_tcp_getsockopt,
2344#endif 2346#endif
2345#ifdef CONFIG_MEMCG_KMEM
2346 .init_cgroup = tcp_init_cgroup,
2347 .destroy_cgroup = tcp_destroy_cgroup,
2348 .proto_cgroup = tcp_proto_cgroup,
2349#endif
2350 .diag_destroy = tcp_abort, 2347 .diag_destroy = tcp_abort,
2351}; 2348};
2352EXPORT_SYMBOL(tcp_prot); 2349EXPORT_SYMBOL(tcp_prot);
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 2379c1b4efb2..18bc7f745e9c 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -8,75 +8,49 @@
 
 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
+	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+	struct page_counter *counter_parent = NULL;
 	/*
 	 * The root cgroup does not use page_counters, but rather,
 	 * rely on the data already collected by the network
 	 * subsystem
 	 */
-	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-	struct page_counter *counter_parent = NULL;
-	struct cg_proto *cg_proto, *parent_cg;
-
-	cg_proto = tcp_prot.proto_cgroup(memcg);
-	if (!cg_proto)
+	if (memcg == root_mem_cgroup)
 		return 0;
 
-	cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0];
-	cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1];
-	cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2];
-	cg_proto->memory_pressure = 0;
-	cg_proto->memcg = memcg;
+	memcg->tcp_mem.memory_pressure = 0;
 
-	parent_cg = tcp_prot.proto_cgroup(parent);
-	if (parent_cg)
-		counter_parent = &parent_cg->memory_allocated;
+	if (parent)
+		counter_parent = &parent->tcp_mem.memory_allocated;
 
-	page_counter_init(&cg_proto->memory_allocated, counter_parent);
-	percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL);
+	page_counter_init(&memcg->tcp_mem.memory_allocated, counter_parent);
 
 	return 0;
 }
-EXPORT_SYMBOL(tcp_init_cgroup);
 
 void tcp_destroy_cgroup(struct mem_cgroup *memcg)
 {
-	struct cg_proto *cg_proto;
-
-	cg_proto = tcp_prot.proto_cgroup(memcg);
-	if (!cg_proto)
+	if (memcg == root_mem_cgroup)
 		return;
 
-	percpu_counter_destroy(&cg_proto->sockets_allocated);
-
-	if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
-		static_key_slow_dec(&memcg_socket_limit_enabled);
-
+	if (memcg->tcp_mem.active)
+		static_branch_dec(&memcg_sockets_enabled_key);
 }
-EXPORT_SYMBOL(tcp_destroy_cgroup);
 
 static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
 {
-	struct cg_proto *cg_proto;
-	int i;
 	int ret;
 
-	cg_proto = tcp_prot.proto_cgroup(memcg);
-	if (!cg_proto)
+	if (memcg == root_mem_cgroup)
 		return -EINVAL;
 
-	ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages);
+	ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, nr_pages);
 	if (ret)
 		return ret;
 
-	for (i = 0; i < 3; i++)
-		cg_proto->sysctl_mem[i] = min_t(long, nr_pages,
-						sysctl_tcp_mem[i]);
-
-	if (nr_pages == PAGE_COUNTER_MAX)
-		clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
-	else {
+	if (!memcg->tcp_mem.active) {
 		/*
-		 * The active bit needs to be written after the static_key
+		 * The active flag needs to be written after the static_key
 		 * update. This is what guarantees that the socket activation
 		 * function is the last one to run. See sock_update_memcg() for
 		 * details, and note that we don't mark any socket as belonging
@@ -90,14 +64,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
 		 * We never race with the readers in sock_update_memcg(),
 		 * because when this value change, the code to process it is not
 		 * patched in yet.
-		 *
-		 * The activated bit is used to guarantee that no two writers
-		 * will do the update in the same memcg. Without that, we can't
-		 * properly shutdown the static key.
 		 */
-		if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
-			static_key_slow_inc(&memcg_socket_limit_enabled);
-		set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+		static_branch_inc(&memcg_sockets_enabled_key);
+		memcg->tcp_mem.active = true;
 	}
 
 	return 0;
@@ -141,32 +110,32 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
 static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-	struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg);
 	u64 val;
 
 	switch (cft->private) {
 	case RES_LIMIT:
-		if (!cg_proto)
-			return PAGE_COUNTER_MAX;
-		val = cg_proto->memory_allocated.limit;
+		if (memcg == root_mem_cgroup)
+			val = PAGE_COUNTER_MAX;
+		else
+			val = memcg->tcp_mem.memory_allocated.limit;
 		val *= PAGE_SIZE;
 		break;
 	case RES_USAGE:
-		if (!cg_proto)
+		if (memcg == root_mem_cgroup)
 			val = atomic_long_read(&tcp_memory_allocated);
 		else
-			val = page_counter_read(&cg_proto->memory_allocated);
+			val = page_counter_read(&memcg->tcp_mem.memory_allocated);
 		val *= PAGE_SIZE;
 		break;
 	case RES_FAILCNT:
-		if (!cg_proto)
+		if (memcg == root_mem_cgroup)
 			return 0;
-		val = cg_proto->memory_allocated.failcnt;
+		val = memcg->tcp_mem.memory_allocated.failcnt;
 		break;
 	case RES_MAX_USAGE:
-		if (!cg_proto)
+		if (memcg == root_mem_cgroup)
 			return 0;
-		val = cg_proto->memory_allocated.watermark;
+		val = memcg->tcp_mem.memory_allocated.watermark;
 		val *= PAGE_SIZE;
 		break;
 	default:
@@ -179,19 +148,17 @@ static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of,
 				char *buf, size_t nbytes, loff_t off)
 {
 	struct mem_cgroup *memcg;
-	struct cg_proto *cg_proto;
 
 	memcg = mem_cgroup_from_css(of_css(of));
-	cg_proto = tcp_prot.proto_cgroup(memcg);
-	if (!cg_proto)
+	if (memcg == root_mem_cgroup)
 		return nbytes;
 
 	switch (of_cft(of)->private) {
 	case RES_MAX_USAGE:
-		page_counter_reset_watermark(&cg_proto->memory_allocated);
+		page_counter_reset_watermark(&memcg->tcp_mem.memory_allocated);
 		break;
 	case RES_FAILCNT:
-		cg_proto->memory_allocated.failcnt = 0;
+		memcg->tcp_mem.memory_allocated.failcnt = 0;
 		break;
 	}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 412a920fe0ec..fda379cd600d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2813,13 +2813,16 @@ begin_fwd:
  */
 void sk_forced_mem_schedule(struct sock *sk, int size)
 {
-	int amt, status;
+	int amt;
 
 	if (size <= sk->sk_forward_alloc)
 		return;
 	amt = sk_mem_pages(size);
 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-	sk_memory_allocated_add(sk, amt, &status);
+	sk_memory_allocated_add(sk, amt);
+
+	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+		mem_cgroup_charge_skmem(sk->sk_memcg, amt);
 }
 
 /* Send a FIN. The caller locks the socket for us.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index db9f1c318afc..4ad8edb46f7c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1889,9 +1889,6 @@ struct proto tcpv6_prot = {
 	.compat_setsockopt	= compat_tcp_setsockopt,
 	.compat_getsockopt	= compat_tcp_getsockopt,
 #endif
-#ifdef CONFIG_MEMCG_KMEM
-	.proto_cgroup		= tcp_proto_cgroup,
-#endif
 	.clear_sk		= tcp_v6_clear_sk,
 	.diag_destroy		= tcp_abort,
 };
diff --git a/net/socket.c b/net/socket.c
index 91c2de6f5020..c044d1e8508c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -294,7 +294,7 @@ static int init_inodecache(void)
 					      0,
 					      (SLAB_HWCACHE_ALIGN |
 					       SLAB_RECLAIM_ACCOUNT |
-					       SLAB_MEM_SPREAD),
+					       SLAB_MEM_SPREAD | SLAB_ACCOUNT),
 					      init_once);
 	if (sock_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index d81186d34558..14f45bf0410c 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1500,7 +1500,7 @@ int register_rpc_pipefs(void)
 	rpc_inode_cachep = kmem_cache_create("rpc_inode_cache",
 				sizeof(struct rpc_inode),
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 				init_once);
 	if (!rpc_inode_cachep)
 		return -ENOMEM;
diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter
index 23e78dcd12bf..38b64f487315 100755
--- a/scripts/bloat-o-meter
+++ b/scripts/bloat-o-meter
@@ -58,8 +58,8 @@ for name in common:
 delta.sort()
 delta.reverse()
 
-print "add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \
-      (add, remove, grow, shrink, up, -down, up-down)
-print "%-40s %7s %7s %+7s" % ("function", "old", "new", "delta")
+print("add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \
+      (add, remove, grow, shrink, up, -down, up-down))
+print("%-40s %7s %7s %+7s" % ("function", "old", "new", "delta"))
 for d, n in delta:
-    if d: print "%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d)
+    if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d))
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 8adca4406198..161dd0d67da8 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -125,7 +125,7 @@ do { \
 	sprintf(str + strlen(str), "*"); \
 } while(0)
 
-/* Always end in a wildcard, for future extension */
+/* End in a wildcard, for future extension */
 static inline void add_wildcard(char *str)
 {
 	int len = strlen(str);
@@ -704,7 +704,6 @@ static int do_of_entry (const char *filename, void *symval, char *alias)
 		if (isspace (*tmp))
 			*tmp = '_';
 
-	add_wildcard(alias);
 	return 1;
 }
 ADD_TO_DEVTABLE("of", of_device_id, do_of_entry);