-rw-r--r--  Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads | 5
-rw-r--r--  Documentation/cgroups/hugetlb.txt | 45
-rw-r--r--  Documentation/cgroups/memory.txt | 12
-rw-r--r--  Documentation/feature-removal-schedule.txt | 8
-rw-r--r--  Documentation/filesystems/Locking | 13
-rw-r--r--  Documentation/filesystems/vfs.txt | 12
-rw-r--r--  Documentation/sysctl/vm.txt | 30
-rw-r--r--  arch/ia64/kernel/perfmon.c | 1
-rw-r--r--  arch/mips/sgi-ip27/ip27-memory.c | 1
-rw-r--r--  arch/powerpc/configs/chroma_defconfig | 4
-rw-r--r--  arch/s390/defconfig | 2
-rw-r--r--  arch/sh/configs/apsh4ad0a_defconfig | 2
-rw-r--r--  arch/sh/configs/sdk7786_defconfig | 4
-rw-r--r--  arch/sh/configs/se7206_defconfig | 2
-rw-r--r--  arch/sh/configs/shx3_defconfig | 2
-rw-r--r--  arch/sh/configs/urquell_defconfig | 4
-rw-r--r--  arch/tile/configs/tilegx_defconfig | 4
-rw-r--r--  arch/tile/configs/tilepro_defconfig | 4
-rw-r--r--  arch/um/defconfig | 8
-rw-r--r--  arch/xtensa/Kconfig | 1
-rw-r--r--  drivers/base/Kconfig | 1
-rw-r--r--  drivers/block/nbd.c | 6
-rw-r--r--  drivers/net/ethernet/chelsio/cxgb4/sge.c | 2
-rw-r--r--  drivers/net/ethernet/chelsio/cxgb4vf/sge.c | 2
-rw-r--r--  drivers/net/ethernet/intel/igb/igb_main.c | 2
-rw-r--r--  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4
-rw-r--r--  drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 1
-rw-r--r--  drivers/net/usb/cdc-phonet.c | 2
-rw-r--r--  drivers/rtc/rtc-88pm80x.c | 4
-rw-r--r--  drivers/usb/gadget/f_phonet.c | 2
-rw-r--r--  fs/fs-writeback.c | 5
-rw-r--r--  fs/hugetlbfs/inode.c | 4
-rw-r--r--  fs/nfs/Kconfig | 8
-rw-r--r--  fs/nfs/direct.c | 82
-rw-r--r--  fs/nfs/file.c | 28
-rw-r--r--  fs/nfs/inode.c | 4
-rw-r--r--  fs/nfs/internal.h | 7
-rw-r--r--  fs/nfs/pagelist.c | 4
-rw-r--r--  fs/nfs/read.c | 6
-rw-r--r--  fs/nfs/write.c | 89
-rw-r--r--  fs/super.c | 2
-rw-r--r--  include/linux/backing-dev.h | 3
-rw-r--r--  include/linux/blk_types.h | 2
-rw-r--r--  include/linux/cgroup_subsys.h | 8
-rw-r--r--  include/linux/compaction.h | 4
-rw-r--r--  include/linux/fs.h | 8
-rw-r--r--  include/linux/gfp.h | 13
-rw-r--r--  include/linux/highmem.h | 7
-rw-r--r--  include/linux/hugetlb.h | 50
-rw-r--r--  include/linux/hugetlb_cgroup.h | 126
-rw-r--r--  include/linux/memcontrol.h | 34
-rw-r--r--  include/linux/migrate.h | 4
-rw-r--r--  include/linux/mm.h | 31
-rw-r--r--  include/linux/mm_types.h | 9
-rw-r--r--  include/linux/mmzone.h | 26
-rw-r--r--  include/linux/nfs_fs.h | 4
-rw-r--r--  include/linux/oom.h | 21
-rw-r--r--  include/linux/page-flags.h | 29
-rw-r--r--  include/linux/page-isolation.h | 13
-rw-r--r--  include/linux/page_cgroup.h | 10
-rw-r--r--  include/linux/pagemap.h | 5
-rw-r--r--  include/linux/sched.h | 9
-rw-r--r--  include/linux/shrinker.h | 1
-rw-r--r--  include/linux/skbuff.h | 80
-rw-r--r--  include/linux/sunrpc/xprt.h | 3
-rw-r--r--  include/linux/swap.h | 14
-rw-r--r--  include/linux/vm_event_item.h | 1
-rw-r--r--  include/linux/vmstat.h | 5
-rw-r--r--  include/linux/writeback.h | 5
-rw-r--r--  include/net/sock.h | 40
-rw-r--r--  include/trace/events/gfpflags.h | 1
-rw-r--r--  init/Kconfig | 29
-rw-r--r--  init/main.c | 2
-rw-r--r--  kernel/cpu.c | 2
-rw-r--r--  kernel/fork.c | 6
-rw-r--r--  kernel/softirq.c | 9
-rw-r--r--  kernel/sysctl.c | 8
-rw-r--r--  kernel/sysctl_binary.c | 2
-rw-r--r--  mm/Kconfig | 5
-rw-r--r--  mm/Makefile | 8
-rw-r--r--  mm/backing-dev.c | 20
-rw-r--r--  mm/compaction.c | 63
-rw-r--r--  mm/fadvise.c | 18
-rw-r--r--  mm/highmem.c | 12
-rw-r--r--  mm/hugetlb.c | 195
-rw-r--r--  mm/hugetlb_cgroup.c | 418
-rw-r--r--  mm/hwpoison-inject.c | 2
-rw-r--r--  mm/internal.h | 8
-rw-r--r--  mm/memblock.c | 35
-rw-r--r--  mm/memcontrol.c | 390
-rw-r--r--  mm/memory-failure.c | 17
-rw-r--r--  mm/memory.c | 9
-rw-r--r--  mm/memory_hotplug.c | 20
-rw-r--r--  mm/migrate.c | 81
-rw-r--r--  mm/mmap.c | 5
-rw-r--r--  mm/mmu_notifier.c | 45
-rw-r--r--  mm/mmzone.c | 2
-rw-r--r--  mm/mremap.c | 2
-rw-r--r--  mm/oom_kill.c | 223
-rw-r--r--  mm/page_alloc.c | 318
-rw-r--r--  mm/page_cgroup.c | 2
-rw-r--r--  mm/page_io.c | 145
-rw-r--r--  mm/page_isolation.c | 93
-rw-r--r--  mm/shmem.c | 6
-rw-r--r--  mm/slab.c | 216
-rw-r--r--  mm/slub.c | 30
-rw-r--r--  mm/sparse.c | 29
-rw-r--r--  mm/swap.c | 52
-rw-r--r--  mm/swap_state.c | 7
-rw-r--r--  mm/swapfile.c | 145
-rw-r--r--  mm/vmalloc.c | 16
-rw-r--r--  mm/vmscan.c | 175
-rw-r--r--  mm/vmstat.c | 1
-rw-r--r--  net/caif/caif_socket.c | 2
-rw-r--r--  net/core/dev.c | 53
-rw-r--r--  net/core/filter.c | 8
-rw-r--r--  net/core/skbuff.c | 124
-rw-r--r--  net/core/sock.c | 59
-rw-r--r--  net/ipv4/Makefile | 2
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 4
-rw-r--r--  net/ipv4/tcp_input.c | 21
-rw-r--r--  net/ipv4/tcp_ipv4.c | 2
-rw-r--r--  net/ipv4/tcp_output.c | 12
-rw-r--r--  net/ipv6/tcp_ipv6.c | 10
-rw-r--r--  net/sctp/ulpevent.c | 3
-rw-r--r--  net/sunrpc/Kconfig | 5
-rw-r--r--  net/sunrpc/clnt.c | 9
-rw-r--r--  net/sunrpc/sched.c | 7
-rw-r--r--  net/sunrpc/xprtsock.c | 43
-rw-r--r--  security/selinux/avc.c | 2
-rw-r--r--  tools/testing/fault-injection/failcmd.sh | 2
131 files changed, 3174 insertions, 1060 deletions
diff --git a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads
new file mode 100644
index 000000000000..b0b0eeb20fe3
--- /dev/null
+++ b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads
@@ -0,0 +1,5 @@
1What: /proc/sys/vm/nr_pdflush_threads
2Date: June 2012
3Contact: Wanpeng Li <liwp@linux.vnet.ibm.com>
4Description: Since pdflush is replaced by per-BDI flusher, the interface of old pdflush
5 exported in /proc/sys/vm/ should be removed.
diff --git a/Documentation/cgroups/hugetlb.txt b/Documentation/cgroups/hugetlb.txt
new file mode 100644
index 000000000000..a9faaca1f029
--- /dev/null
+++ b/Documentation/cgroups/hugetlb.txt
@@ -0,0 +1,45 @@
1HugeTLB Controller
2-------------------
3
4The HugeTLB controller allows to limit the HugeTLB usage per control group and
5enforces the controller limit during page fault. Since HugeTLB doesn't
6support page reclaim, enforcing the limit at page fault time implies that,
7the application will get SIGBUS signal if it tries to access HugeTLB pages
8beyond its limit. This requires the application to know beforehand how much
9HugeTLB pages it would require for its use.
10
11HugeTLB controller can be created by first mounting the cgroup filesystem.
12
13# mount -t cgroup -o hugetlb none /sys/fs/cgroup
14
15With the above step, the initial or the parent HugeTLB group becomes
16visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
17the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
18
19New groups can be created under the parent group /sys/fs/cgroup.
20
21# cd /sys/fs/cgroup
22# mkdir g1
23# echo $$ > g1/tasks
24
25The above steps create a new group g1 and move the current shell
26process (bash) into it.
27
28Brief summary of control files
29
30 hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage
31 hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
32 hugetlb.<hugepagesize>.usage_in_bytes # show current res_counter usage for "hugepagesize" hugetlb
33 hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit
34
35For a system supporting two hugepage size (16M and 16G) the control
36files include:
37
38hugetlb.16GB.limit_in_bytes
39hugetlb.16GB.max_usage_in_bytes
40hugetlb.16GB.usage_in_bytes
41hugetlb.16GB.failcnt
42hugetlb.16MB.limit_in_bytes
43hugetlb.16MB.max_usage_in_bytes
44hugetlb.16MB.usage_in_bytes
45hugetlb.16MB.failcnt
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index dd88540bb995..4372e6b8a353 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -73,6 +73,8 @@ Brief summary of control files.
73 73
74 memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory 74 memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory
75 memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation 75 memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation
76 memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits
77 memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded
76 78
771. History 791. History
78 80
@@ -187,12 +189,12 @@ the cgroup that brought it in -- this will happen on memory pressure).
187But see section 8.2: when moving a task to another cgroup, its pages may 189But see section 8.2: when moving a task to another cgroup, its pages may
188be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. 190be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
189 191
190Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used. 192Exception: If CONFIG_CGROUP_CGROUP_MEMCG_SWAP is not used.
191When you do swapoff and make swapped-out pages of shmem(tmpfs) to 193When you do swapoff and make swapped-out pages of shmem(tmpfs) to
192be backed into memory in force, charges for pages are accounted against the 194be backed into memory in force, charges for pages are accounted against the
193caller of swapoff rather than the users of shmem. 195caller of swapoff rather than the users of shmem.
194 196
1952.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) 1972.4 Swap Extension (CONFIG_MEMCG_SWAP)
196 198
197Swap Extension allows you to record charge for swap. A swapped-in page is 199Swap Extension allows you to record charge for swap. A swapped-in page is
198charged back to original page allocator if possible. 200charged back to original page allocator if possible.
@@ -259,7 +261,7 @@ When oom event notifier is registered, event will be delivered.
259 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by 261 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
260 zone->lru_lock, it has no lock of its own. 262 zone->lru_lock, it has no lock of its own.
261 263
2622.7 Kernel Memory Extension (CONFIG_CGROUP_MEM_RES_CTLR_KMEM) 2642.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
263 265
264With the Kernel memory extension, the Memory Controller is able to limit 266With the Kernel memory extension, the Memory Controller is able to limit
265the amount of kernel memory used by the system. Kernel memory is fundamentally 267the amount of kernel memory used by the system. Kernel memory is fundamentally
@@ -286,8 +288,8 @@ per cgroup, instead of globally.
286 288
287a. Enable CONFIG_CGROUPS 289a. Enable CONFIG_CGROUPS
288b. Enable CONFIG_RESOURCE_COUNTERS 290b. Enable CONFIG_RESOURCE_COUNTERS
289c. Enable CONFIG_CGROUP_MEM_RES_CTLR 291c. Enable CONFIG_MEMCG
290d. Enable CONFIG_CGROUP_MEM_RES_CTLR_SWAP (to use swap extension) 292d. Enable CONFIG_MEMCG_SWAP (to use swap extension)
291 293
2921. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) 2941. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
293# mount -t tmpfs none /sys/fs/cgroup 295# mount -t tmpfs none /sys/fs/cgroup
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 24fec7603e5e..72ed15075f79 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -13,6 +13,14 @@ Who: Jim Cromie <jim.cromie@gmail.com>, Jason Baron <jbaron@redhat.com>
13 13
14--------------------------- 14---------------------------
15 15
16What: /proc/sys/vm/nr_pdflush_threads
17When: 2012
18Why: Since pdflush is deprecated, the interface exported in /proc/sys/vm/
19 should be removed.
20Who: Wanpeng Li <liwp@linux.vnet.ibm.com>
21
22---------------------------
23
16What: CONFIG_APM_CPU_IDLE, and its ability to call APM BIOS in idle 24What: CONFIG_APM_CPU_IDLE, and its ability to call APM BIOS in idle
17When: 2012 25When: 2012
18Why: This optional sub-feature of APM is of dubious reliability, 26Why: This optional sub-feature of APM is of dubious reliability,
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index e0cce2a5f820..2db1900d7538 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -206,6 +206,8 @@ prototypes:
206 int (*launder_page)(struct page *); 206 int (*launder_page)(struct page *);
207 int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long); 207 int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long);
208 int (*error_remove_page)(struct address_space *, struct page *); 208 int (*error_remove_page)(struct address_space *, struct page *);
209 int (*swap_activate)(struct file *);
210 int (*swap_deactivate)(struct file *);
209 211
210locking rules: 212locking rules:
211 All except set_page_dirty and freepage may block 213 All except set_page_dirty and freepage may block
@@ -229,6 +231,8 @@ migratepage: yes (both)
229launder_page: yes 231launder_page: yes
230is_partially_uptodate: yes 232is_partially_uptodate: yes
231error_remove_page: yes 233error_remove_page: yes
234swap_activate: no
235swap_deactivate: no
232 236
233 ->write_begin(), ->write_end(), ->sync_page() and ->readpage() 237 ->write_begin(), ->write_end(), ->sync_page() and ->readpage()
234may be called from the request handler (/dev/loop). 238may be called from the request handler (/dev/loop).
@@ -330,6 +334,15 @@ cleaned, or an error value if not. Note that in order to prevent the page
330getting mapped back in and redirtied, it needs to be kept locked 334getting mapped back in and redirtied, it needs to be kept locked
331across the entire operation. 335across the entire operation.
332 336
337 ->swap_activate will be called with a non-zero argument on
338files backing (non block device backed) swapfiles. A return value
339of zero indicates success, in which case this file can be used for
340backing swapspace. The swapspace operations will be proxied to the
341address space operations.
342
343 ->swap_deactivate() will be called in the sys_swapoff()
344path after ->swap_activate() returned success.
345
333----------------------- file_lock_operations ------------------------------ 346----------------------- file_lock_operations ------------------------------
334prototypes: 347prototypes:
335 void (*fl_copy_lock)(struct file_lock *, struct file_lock *); 348 void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index aa754e01464e..065aa2dc0835 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -592,6 +592,8 @@ struct address_space_operations {
592 int (*migratepage) (struct page *, struct page *); 592 int (*migratepage) (struct page *, struct page *);
593 int (*launder_page) (struct page *); 593 int (*launder_page) (struct page *);
594 int (*error_remove_page) (struct mapping *mapping, struct page *page); 594 int (*error_remove_page) (struct mapping *mapping, struct page *page);
595 int (*swap_activate)(struct file *);
596 int (*swap_deactivate)(struct file *);
595}; 597};
596 598
597 writepage: called by the VM to write a dirty page to backing store. 599 writepage: called by the VM to write a dirty page to backing store.
@@ -760,6 +762,16 @@ struct address_space_operations {
760 Setting this implies you deal with pages going away under you, 762 Setting this implies you deal with pages going away under you,
761 unless you have them locked or reference counts increased. 763 unless you have them locked or reference counts increased.
762 764
765 swap_activate: Called when swapon is used on a file to allocate
766 space if necessary and pin the block lookup information in
767 memory. A return value of zero indicates success,
768 in which case this file can be used to back swapspace. The
769 swapspace operations will be proxied to this address space's
770 ->swap_{out,in} methods.
771
772 swap_deactivate: Called during swapoff on files where swap_activate
773 was successful.
774
763 775
764The File Object 776The File Object
765=============== 777===============
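The Locking and vfs.txt hunks above only give simplified prototypes and prose. As a minimal sketch, here is how a filesystem might wire up the new entry points; "myfs" is hypothetical, and the argument list follows the nfs_swap_activate()/nfs_swap_deactivate() implementation that appears later in this diff (fs/nfs/file.c), which is richer than the one-argument prototypes quoted in the documentation:

#include <linux/fs.h>
#include <linux/swap.h>

static int myfs_swap_activate(struct swap_info_struct *sis,
			      struct file *file, sector_t *span)
{
	/*
	 * Pin whatever block-lookup or transport state is needed so that
	 * swap I/O never has to allocate it under memory pressure, then
	 * report how many pages this file can back.  Returning 0 tells
	 * swapon() the file may be used as swapspace.
	 */
	*span = sis->pages;
	return 0;
}

static void myfs_swap_deactivate(struct file *file)
{
	/* Release whatever swap_activate pinned; called from sys_swapoff(). */
}

static const struct address_space_operations myfs_aops = {
	/* ...readpage, writepage, direct_IO and friends go here... */
	.swap_activate		= myfs_swap_activate,
	.swap_deactivate	= myfs_swap_deactivate,
};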
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 96f0ee825bed..dcc2a94ae34e 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -42,7 +42,6 @@ Currently, these files are in /proc/sys/vm:
42- mmap_min_addr 42- mmap_min_addr
43- nr_hugepages 43- nr_hugepages
44- nr_overcommit_hugepages 44- nr_overcommit_hugepages
45- nr_pdflush_threads
46- nr_trim_pages (only if CONFIG_MMU=n) 45- nr_trim_pages (only if CONFIG_MMU=n)
47- numa_zonelist_order 46- numa_zonelist_order
48- oom_dump_tasks 47- oom_dump_tasks
@@ -426,16 +425,6 @@ See Documentation/vm/hugetlbpage.txt
426 425
427============================================================== 426==============================================================
428 427
429nr_pdflush_threads
430
431The current number of pdflush threads. This value is read-only.
432The value changes according to the number of dirty pages in the system.
433
434When necessary, additional pdflush threads are created, one per second, up to
435nr_pdflush_threads_max.
436
437==============================================================
438
439nr_trim_pages 428nr_trim_pages
440 429
441This is available only on NOMMU kernels. 430This is available only on NOMMU kernels.
@@ -502,9 +491,10 @@ oom_dump_tasks
502 491
503Enables a system-wide task dump (excluding kernel threads) to be 492Enables a system-wide task dump (excluding kernel threads) to be
504produced when the kernel performs an OOM-killing and includes such 493produced when the kernel performs an OOM-killing and includes such
505information as pid, uid, tgid, vm size, rss, cpu, oom_adj score, and 494information as pid, uid, tgid, vm size, rss, nr_ptes, swapents,
506name. This is helpful to determine why the OOM killer was invoked 495oom_score_adj score, and name. This is helpful to determine why the
507and to identify the rogue task that caused it. 496OOM killer was invoked, to identify the rogue task that caused it,
497and to determine why the OOM killer chose the task it did to kill.
508 498
509If this is set to zero, this information is suppressed. On very 499If this is set to zero, this information is suppressed. On very
510large systems with thousands of tasks it may not be feasible to dump 500large systems with thousands of tasks it may not be feasible to dump
@@ -574,16 +564,24 @@ of physical RAM. See above.
574 564
575page-cluster 565page-cluster
576 566
577page-cluster controls the number of pages which are written to swap in 567page-cluster controls the number of pages up to which consecutive pages
578a single attempt. The swap I/O size. 568are read in from swap in a single attempt. This is the swap counterpart
569to page cache readahead.
570The mentioned consecutivity is not in terms of virtual/physical addresses,
571but consecutive on swap space - that means they were swapped out together.
579 572
580It is a logarithmic value - setting it to zero means "1 page", setting 573It is a logarithmic value - setting it to zero means "1 page", setting
581it to 1 means "2 pages", setting it to 2 means "4 pages", etc. 574it to 1 means "2 pages", setting it to 2 means "4 pages", etc.
575Zero disables swap readahead completely.
582 576
583The default value is three (eight pages at a time). There may be some 577The default value is three (eight pages at a time). There may be some
584small benefits in tuning this to a different value if your workload is 578small benefits in tuning this to a different value if your workload is
585swap-intensive. 579swap-intensive.
586 580
581Lower values mean lower latencies for initial faults, but at the same time
582extra faults and I/O delays for following faults if they would have been part of
583that consecutive pages readahead would have brought in.
584
587============================================================= 585=============================================================
588 586
589panic_on_oom 587panic_on_oom
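Since page-cluster is a log2 value, the translation to a page count is a single shift; a throwaway helper (name hypothetical) just to make the arithmetic above concrete:

/* page_cluster == 0 -> 1 page (readahead effectively off),
 * 1 -> 2 pages, 2 -> 4 pages, 3 (the default) -> 8 pages. */
static unsigned long swap_readahead_pages(unsigned int page_cluster)
{
	return 1UL << page_cluster;
}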
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index d7f558c1e711..3fa4bc536953 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2353,7 +2353,6 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
2353 */ 2353 */
2354 insert_vm_struct(mm, vma); 2354 insert_vm_struct(mm, vma);
2355 2355
2356 mm->total_vm += size >> PAGE_SHIFT;
2357 vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, 2356 vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
2358 vma_pages(vma)); 2357 vma_pages(vma));
2359 up_write(&task->mm->mmap_sem); 2358 up_write(&task->mm->mmap_sem);
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index b105eca3c020..cd8fcab6b054 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -401,6 +401,7 @@ static void __init node_mem_init(cnodeid_t node)
401 * Allocate the node data structures on the node first. 401 * Allocate the node data structures on the node first.
402 */ 402 */
403 __node_data[node] = __va(slot_freepfn << PAGE_SHIFT); 403 __node_data[node] = __va(slot_freepfn << PAGE_SHIFT);
404 memset(__node_data[node], 0, PAGE_SIZE);
404 405
405 NODE_DATA(node)->bdata = &bootmem_node_data[node]; 406 NODE_DATA(node)->bdata = &bootmem_node_data[node];
406 NODE_DATA(node)->node_start_pfn = start_pfn; 407 NODE_DATA(node)->node_start_pfn = start_pfn;
diff --git a/arch/powerpc/configs/chroma_defconfig b/arch/powerpc/configs/chroma_defconfig
index b1f9597fe312..29bb11ec6c64 100644
--- a/arch/powerpc/configs/chroma_defconfig
+++ b/arch/powerpc/configs/chroma_defconfig
@@ -21,8 +21,8 @@ CONFIG_CGROUP_DEVICE=y
21CONFIG_CPUSETS=y 21CONFIG_CPUSETS=y
22CONFIG_CGROUP_CPUACCT=y 22CONFIG_CGROUP_CPUACCT=y
23CONFIG_RESOURCE_COUNTERS=y 23CONFIG_RESOURCE_COUNTERS=y
24CONFIG_CGROUP_MEM_RES_CTLR=y 24CONFIG_CGROUP_MEMCG=y
25CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y 25CONFIG_CGROUP_MEMCG_SWAP=y
26CONFIG_NAMESPACES=y 26CONFIG_NAMESPACES=y
27CONFIG_RELAY=y 27CONFIG_RELAY=y
28CONFIG_BLK_DEV_INITRD=y 28CONFIG_BLK_DEV_INITRD=y
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index 967923dea98d..f39cd710980b 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -16,7 +16,7 @@ CONFIG_CGROUPS=y
16CONFIG_CPUSETS=y 16CONFIG_CPUSETS=y
17CONFIG_CGROUP_CPUACCT=y 17CONFIG_CGROUP_CPUACCT=y
18CONFIG_RESOURCE_COUNTERS=y 18CONFIG_RESOURCE_COUNTERS=y
19CONFIG_CGROUP_MEM_RES_CTLR=y 19CONFIG_CGROUP_MEMCG=y
20CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y 20CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y
21CONFIG_CGROUP_SCHED=y 21CONFIG_CGROUP_SCHED=y
22CONFIG_RT_GROUP_SCHED=y 22CONFIG_RT_GROUP_SCHED=y
diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig
index e7583484cc07..95ae23fcfdd6 100644
--- a/arch/sh/configs/apsh4ad0a_defconfig
+++ b/arch/sh/configs/apsh4ad0a_defconfig
@@ -11,7 +11,7 @@ CONFIG_CGROUP_FREEZER=y
11CONFIG_CGROUP_DEVICE=y 11CONFIG_CGROUP_DEVICE=y
12CONFIG_CGROUP_CPUACCT=y 12CONFIG_CGROUP_CPUACCT=y
13CONFIG_RESOURCE_COUNTERS=y 13CONFIG_RESOURCE_COUNTERS=y
14CONFIG_CGROUP_MEM_RES_CTLR=y 14CONFIG_CGROUP_MEMCG=y
15CONFIG_BLK_CGROUP=y 15CONFIG_BLK_CGROUP=y
16CONFIG_NAMESPACES=y 16CONFIG_NAMESPACES=y
17CONFIG_BLK_DEV_INITRD=y 17CONFIG_BLK_DEV_INITRD=y
diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig
index 8a7dd7b59c5c..76a76a295d74 100644
--- a/arch/sh/configs/sdk7786_defconfig
+++ b/arch/sh/configs/sdk7786_defconfig
@@ -18,8 +18,8 @@ CONFIG_CPUSETS=y
18# CONFIG_PROC_PID_CPUSET is not set 18# CONFIG_PROC_PID_CPUSET is not set
19CONFIG_CGROUP_CPUACCT=y 19CONFIG_CGROUP_CPUACCT=y
20CONFIG_RESOURCE_COUNTERS=y 20CONFIG_RESOURCE_COUNTERS=y
21CONFIG_CGROUP_MEM_RES_CTLR=y 21CONFIG_CGROUP_MEMCG=y
22CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y 22CONFIG_CGROUP_MEMCG_SWAP=y
23CONFIG_CGROUP_SCHED=y 23CONFIG_CGROUP_SCHED=y
24CONFIG_RT_GROUP_SCHED=y 24CONFIG_RT_GROUP_SCHED=y
25CONFIG_BLK_CGROUP=y 25CONFIG_BLK_CGROUP=y
diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig
index 72c3fad7383f..6bc30ab9fd18 100644
--- a/arch/sh/configs/se7206_defconfig
+++ b/arch/sh/configs/se7206_defconfig
@@ -11,7 +11,7 @@ CONFIG_CGROUP_DEBUG=y
11CONFIG_CGROUP_DEVICE=y 11CONFIG_CGROUP_DEVICE=y
12CONFIG_CGROUP_CPUACCT=y 12CONFIG_CGROUP_CPUACCT=y
13CONFIG_RESOURCE_COUNTERS=y 13CONFIG_RESOURCE_COUNTERS=y
14CONFIG_CGROUP_MEM_RES_CTLR=y 14CONFIG_CGROUP_MEMCG=y
15CONFIG_RELAY=y 15CONFIG_RELAY=y
16CONFIG_NAMESPACES=y 16CONFIG_NAMESPACES=y
17CONFIG_UTS_NS=y 17CONFIG_UTS_NS=y
diff --git a/arch/sh/configs/shx3_defconfig b/arch/sh/configs/shx3_defconfig
index 6bb413036892..cd6c519f8fad 100644
--- a/arch/sh/configs/shx3_defconfig
+++ b/arch/sh/configs/shx3_defconfig
@@ -13,7 +13,7 @@ CONFIG_CGROUP_FREEZER=y
13CONFIG_CGROUP_DEVICE=y 13CONFIG_CGROUP_DEVICE=y
14CONFIG_CGROUP_CPUACCT=y 14CONFIG_CGROUP_CPUACCT=y
15CONFIG_RESOURCE_COUNTERS=y 15CONFIG_RESOURCE_COUNTERS=y
16CONFIG_CGROUP_MEM_RES_CTLR=y 16CONFIG_CGROUP_MEMCG=y
17CONFIG_RELAY=y 17CONFIG_RELAY=y
18CONFIG_NAMESPACES=y 18CONFIG_NAMESPACES=y
19CONFIG_UTS_NS=y 19CONFIG_UTS_NS=y
diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig
index 8bfa4d056d7a..d7f89be9f474 100644
--- a/arch/sh/configs/urquell_defconfig
+++ b/arch/sh/configs/urquell_defconfig
@@ -15,8 +15,8 @@ CONFIG_CPUSETS=y
15# CONFIG_PROC_PID_CPUSET is not set 15# CONFIG_PROC_PID_CPUSET is not set
16CONFIG_CGROUP_CPUACCT=y 16CONFIG_CGROUP_CPUACCT=y
17CONFIG_RESOURCE_COUNTERS=y 17CONFIG_RESOURCE_COUNTERS=y
18CONFIG_CGROUP_MEM_RES_CTLR=y 18CONFIG_CGROUP_MEMCG=y
19CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y 19CONFIG_CGROUP_MEMCG_SWAP=y
20CONFIG_CGROUP_SCHED=y 20CONFIG_CGROUP_SCHED=y
21CONFIG_RT_GROUP_SCHED=y 21CONFIG_RT_GROUP_SCHED=y
22CONFIG_BLK_DEV_INITRD=y 22CONFIG_BLK_DEV_INITRD=y
diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig
index b8d99aca5431..0270620a1692 100644
--- a/arch/tile/configs/tilegx_defconfig
+++ b/arch/tile/configs/tilegx_defconfig
@@ -18,8 +18,8 @@ CONFIG_CGROUP_DEVICE=y
18CONFIG_CPUSETS=y 18CONFIG_CPUSETS=y
19CONFIG_CGROUP_CPUACCT=y 19CONFIG_CGROUP_CPUACCT=y
20CONFIG_RESOURCE_COUNTERS=y 20CONFIG_RESOURCE_COUNTERS=y
21CONFIG_CGROUP_MEM_RES_CTLR=y 21CONFIG_CGROUP_MEMCG=y
22CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y 22CONFIG_CGROUP_MEMCG_SWAP=y
23CONFIG_CGROUP_SCHED=y 23CONFIG_CGROUP_SCHED=y
24CONFIG_RT_GROUP_SCHED=y 24CONFIG_RT_GROUP_SCHED=y
25CONFIG_BLK_CGROUP=y 25CONFIG_BLK_CGROUP=y
diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig
index 2b1fd31894f1..c11de27a9bcb 100644
--- a/arch/tile/configs/tilepro_defconfig
+++ b/arch/tile/configs/tilepro_defconfig
@@ -17,8 +17,8 @@ CONFIG_CGROUP_DEVICE=y
17CONFIG_CPUSETS=y 17CONFIG_CPUSETS=y
18CONFIG_CGROUP_CPUACCT=y 18CONFIG_CGROUP_CPUACCT=y
19CONFIG_RESOURCE_COUNTERS=y 19CONFIG_RESOURCE_COUNTERS=y
20CONFIG_CGROUP_MEM_RES_CTLR=y 20CONFIG_CGROUP_MEMCG=y
21CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y 21CONFIG_CGROUP_MEMCG_SWAP=y
22CONFIG_CGROUP_SCHED=y 22CONFIG_CGROUP_SCHED=y
23CONFIG_RT_GROUP_SCHED=y 23CONFIG_RT_GROUP_SCHED=y
24CONFIG_BLK_CGROUP=y 24CONFIG_BLK_CGROUP=y
diff --git a/arch/um/defconfig b/arch/um/defconfig
index 7823ab12e6a4..fec0d5d27460 100644
--- a/arch/um/defconfig
+++ b/arch/um/defconfig
@@ -155,10 +155,10 @@ CONFIG_CPUSETS=y
155CONFIG_PROC_PID_CPUSET=y 155CONFIG_PROC_PID_CPUSET=y
156CONFIG_CGROUP_CPUACCT=y 156CONFIG_CGROUP_CPUACCT=y
157CONFIG_RESOURCE_COUNTERS=y 157CONFIG_RESOURCE_COUNTERS=y
158CONFIG_CGROUP_MEM_RES_CTLR=y 158CONFIG_CGROUP_MEMCG=y
159CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y 159CONFIG_CGROUP_MEMCG_SWAP=y
160# CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED is not set 160# CONFIG_CGROUP_MEMCG_SWAP_ENABLED is not set
161# CONFIG_CGROUP_MEM_RES_CTLR_KMEM is not set 161# CONFIG_CGROUP_MEMCG_KMEM is not set
162CONFIG_CGROUP_SCHED=y 162CONFIG_CGROUP_SCHED=y
163CONFIG_FAIR_GROUP_SCHED=y 163CONFIG_FAIR_GROUP_SCHED=y
164# CONFIG_CFS_BANDWIDTH is not set 164# CONFIG_CFS_BANDWIDTH is not set
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 8a3f8351f438..8ed64cfae4ff 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -7,6 +7,7 @@ config ZONE_DMA
7config XTENSA 7config XTENSA
8 def_bool y 8 def_bool y
9 select HAVE_IDE 9 select HAVE_IDE
10 select GENERIC_ATOMIC64
10 select HAVE_GENERIC_HARDIRQS 11 select HAVE_GENERIC_HARDIRQS
11 select GENERIC_IRQ_SHOW 12 select GENERIC_IRQ_SHOW
12 select GENERIC_CPU_DEVICES 13 select GENERIC_CPU_DEVICES
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 9b21469482ae..08b4c5209384 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -196,6 +196,7 @@ config CMA
196 bool "Contiguous Memory Allocator (EXPERIMENTAL)" 196 bool "Contiguous Memory Allocator (EXPERIMENTAL)"
197 depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL 197 depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL
198 select MIGRATION 198 select MIGRATION
199 select MEMORY_ISOLATION
199 help 200 help
200 This enables the Contiguous Memory Allocator which allows drivers 201 This enables the Contiguous Memory Allocator which allows drivers
201 to allocate big physically-contiguous blocks of memory for use with 202 to allocate big physically-contiguous blocks of memory for use with
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 061427a75d37..76bc96fd01c8 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -154,6 +154,7 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
154 struct msghdr msg; 154 struct msghdr msg;
155 struct kvec iov; 155 struct kvec iov;
156 sigset_t blocked, oldset; 156 sigset_t blocked, oldset;
157 unsigned long pflags = current->flags;
157 158
158 if (unlikely(!sock)) { 159 if (unlikely(!sock)) {
159 dev_err(disk_to_dev(nbd->disk), 160 dev_err(disk_to_dev(nbd->disk),
@@ -167,8 +168,9 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
167 siginitsetinv(&blocked, sigmask(SIGKILL)); 168 siginitsetinv(&blocked, sigmask(SIGKILL));
168 sigprocmask(SIG_SETMASK, &blocked, &oldset); 169 sigprocmask(SIG_SETMASK, &blocked, &oldset);
169 170
171 current->flags |= PF_MEMALLOC;
170 do { 172 do {
171 sock->sk->sk_allocation = GFP_NOIO; 173 sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
172 iov.iov_base = buf; 174 iov.iov_base = buf;
173 iov.iov_len = size; 175 iov.iov_len = size;
174 msg.msg_name = NULL; 176 msg.msg_name = NULL;
@@ -214,6 +216,7 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
214 } while (size > 0); 216 } while (size > 0);
215 217
216 sigprocmask(SIG_SETMASK, &oldset, NULL); 218 sigprocmask(SIG_SETMASK, &oldset, NULL);
219 tsk_restore_flags(current, pflags, PF_MEMALLOC);
217 220
218 return result; 221 return result;
219} 222}
@@ -405,6 +408,7 @@ static int nbd_do_it(struct nbd_device *nbd)
405 408
406 BUG_ON(nbd->magic != NBD_MAGIC); 409 BUG_ON(nbd->magic != NBD_MAGIC);
407 410
411 sk_set_memalloc(nbd->sock->sk);
408 nbd->pid = task_pid_nr(current); 412 nbd->pid = task_pid_nr(current);
409 ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); 413 ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
410 if (ret) { 414 if (ret) {
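Stripped of the nbd specifics, the pattern the hunks above introduce looks roughly like the sketch below (the function and its arguments are hypothetical): the block driver's I/O thread marks its socket with sk_set_memalloc(), marks itself PF_MEMALLOC around transmission, uses GFP_NOIO | __GFP_MEMALLOC for socket allocations, and restores the task flags with the tsk_restore_flags() helper introduced elsewhere in this series (see the include/linux/sched.h entry in the diffstat), so that pushing dirty or swap data to the network can dip into the memory reserves instead of deadlocking under reclaim.

#include <linux/gfp.h>
#include <linux/sched.h>
#include <net/sock.h>

static int my_memalloc_xmit(struct socket *sock, void *buf, int size)
{
	unsigned long pflags = current->flags;
	int result = 0;

	current->flags |= PF_MEMALLOC;
	sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;

	/* ... kernel_sendmsg()/kernel_recvmsg() loop over buf/size,
	 *     as in sock_xmit() above ... */

	/* Clear PF_MEMALLOC again unless the caller already had it set. */
	tsk_restore_flags(current, pflags, PF_MEMALLOC);
	return result;
}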
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 8596acaa402b..d49933ed551f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -528,7 +528,7 @@ static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n,
528#endif 528#endif
529 529
530 while (n--) { 530 while (n--) {
531 pg = alloc_page(gfp); 531 pg = __skb_alloc_page(gfp, NULL);
532 if (unlikely(!pg)) { 532 if (unlikely(!pg)) {
533 q->alloc_failed++; 533 q->alloc_failed++;
534 break; 534 break;
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
index f2d1ecdcaf98..8877fbfefb63 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
@@ -653,7 +653,7 @@ static unsigned int refill_fl(struct adapter *adapter, struct sge_fl *fl,
653 653
654alloc_small_pages: 654alloc_small_pages:
655 while (n--) { 655 while (n--) {
656 page = alloc_page(gfp | __GFP_NOWARN | __GFP_COLD); 656 page = __skb_alloc_page(gfp | __GFP_NOWARN, NULL);
657 if (unlikely(!page)) { 657 if (unlikely(!page)) {
658 fl->alloc_failed++; 658 fl->alloc_failed++;
659 break; 659 break;
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 1050411e7ca3..b7c2d5050572 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6235,7 +6235,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
6235 return true; 6235 return true;
6236 6236
6237 if (!page) { 6237 if (!page) {
6238 page = alloc_page(GFP_ATOMIC | __GFP_COLD); 6238 page = __skb_alloc_page(GFP_ATOMIC, bi->skb);
6239 bi->page = page; 6239 bi->page = page;
6240 if (unlikely(!page)) { 6240 if (unlikely(!page)) {
6241 rx_ring->rx_stats.alloc_failed++; 6241 rx_ring->rx_stats.alloc_failed++;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index c709eae58c63..4326f74f7137 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1141,8 +1141,8 @@ static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring,
1141 1141
1142 /* alloc new page for storage */ 1142 /* alloc new page for storage */
1143 if (likely(!page)) { 1143 if (likely(!page)) {
1144 page = alloc_pages(GFP_ATOMIC | __GFP_COLD | __GFP_COMP, 1144 page = __skb_alloc_pages(GFP_ATOMIC | __GFP_COLD | __GFP_COMP,
1145 ixgbe_rx_pg_order(rx_ring)); 1145 bi->skb, ixgbe_rx_pg_order(rx_ring));
1146 if (unlikely(!page)) { 1146 if (unlikely(!page)) {
1147 rx_ring->rx_stats.alloc_rx_page_failed++; 1147 rx_ring->rx_stats.alloc_rx_page_failed++;
1148 return false; 1148 return false;
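The driver conversions above and below (cxgb4, cxgb4vf, igb, ixgbe, cdc-phonet, f_phonet) all switch RX buffer allocation from alloc_page()/alloc_pages() to the __skb_alloc_page()/__skb_alloc_pages() helpers added in the include/linux/skbuff.h part of this series. Judging from the call sites, the helpers supply __GFP_COLD themselves, opt into __GFP_MEMALLOC unless the caller passes __GFP_NOMEMALLOC, and, when handed an skb, propagate the page's pfmemalloc state into it so the stack can later drop reserve-backed packets destined for sockets that are not entitled to the reserves. A hypothetical RX refill helper using the new call might look like:

#include <linux/skbuff.h>

struct my_rx_buffer {		/* hypothetical per-descriptor state */
	struct sk_buff *skb;
	struct page *page;
};

static bool my_alloc_mapped_page(struct my_rx_buffer *bi)
{
	struct page *page = bi->page;

	if (likely(page))
		return true;

	/*
	 * Passing the skb lets the helper set skb->pfmemalloc when the
	 * page had to come from the pfmemalloc reserves.
	 */
	page = __skb_alloc_page(GFP_ATOMIC, bi->skb);
	if (unlikely(!page))
		return false;

	bi->page = page;
	return true;
}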
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 3f9841d619ad..60ef64587412 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -352,7 +352,6 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_adapter *adapter,
352 adapter->alloc_rx_buff_failed++; 352 adapter->alloc_rx_buff_failed++;
353 goto no_buffers; 353 goto no_buffers;
354 } 354 }
355
356 bi->skb = skb; 355 bi->skb = skb;
357 } 356 }
358 if (!bi->dma) { 357 if (!bi->dma) {
diff --git a/drivers/net/usb/cdc-phonet.c b/drivers/net/usb/cdc-phonet.c
index 187c144c5e5b..64610048ce87 100644
--- a/drivers/net/usb/cdc-phonet.c
+++ b/drivers/net/usb/cdc-phonet.c
@@ -130,7 +130,7 @@ static int rx_submit(struct usbpn_dev *pnd, struct urb *req, gfp_t gfp_flags)
130 struct page *page; 130 struct page *page;
131 int err; 131 int err;
132 132
133 page = alloc_page(gfp_flags); 133 page = __skb_alloc_page(gfp_flags | __GFP_NOMEMALLOC, NULL);
134 if (!page) 134 if (!page)
135 return -ENOMEM; 135 return -ENOMEM;
136 136
diff --git a/drivers/rtc/rtc-88pm80x.c b/drivers/rtc/rtc-88pm80x.c
index a2f956d90de0..6367984e0565 100644
--- a/drivers/rtc/rtc-88pm80x.c
+++ b/drivers/rtc/rtc-88pm80x.c
@@ -314,8 +314,8 @@ static int __devinit pm80x_rtc_probe(struct platform_device *pdev)
314 314
315 info->rtc_dev = rtc_device_register("88pm80x-rtc", &pdev->dev, 315 info->rtc_dev = rtc_device_register("88pm80x-rtc", &pdev->dev,
316 &pm80x_rtc_ops, THIS_MODULE); 316 &pm80x_rtc_ops, THIS_MODULE);
317 ret = PTR_ERR(info->rtc_dev);
318 if (IS_ERR(info->rtc_dev)) { 317 if (IS_ERR(info->rtc_dev)) {
318 ret = PTR_ERR(info->rtc_dev);
319 dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret); 319 dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret);
320 goto out_rtc; 320 goto out_rtc;
321 } 321 }
@@ -339,7 +339,6 @@ static int __devinit pm80x_rtc_probe(struct platform_device *pdev)
339out_rtc: 339out_rtc:
340 pm80x_free_irq(chip, info->irq, info); 340 pm80x_free_irq(chip, info->irq, info);
341out: 341out:
342 devm_kfree(&pdev->dev, info);
343 return ret; 342 return ret;
344} 343}
345 344
@@ -349,7 +348,6 @@ static int __devexit pm80x_rtc_remove(struct platform_device *pdev)
349 platform_set_drvdata(pdev, NULL); 348 platform_set_drvdata(pdev, NULL);
350 rtc_device_unregister(info->rtc_dev); 349 rtc_device_unregister(info->rtc_dev);
351 pm80x_free_irq(info->chip, info->irq, info); 350 pm80x_free_irq(info->chip, info->irq, info);
352 devm_kfree(&pdev->dev, info);
353 return 0; 351 return 0;
354} 352}
355 353
diff --git a/drivers/usb/gadget/f_phonet.c b/drivers/usb/gadget/f_phonet.c
index 965a6293206a..8ee9268fe253 100644
--- a/drivers/usb/gadget/f_phonet.c
+++ b/drivers/usb/gadget/f_phonet.c
@@ -301,7 +301,7 @@ pn_rx_submit(struct f_phonet *fp, struct usb_request *req, gfp_t gfp_flags)
301 struct page *page; 301 struct page *page;
302 int err; 302 int err;
303 303
304 page = alloc_page(gfp_flags); 304 page = __skb_alloc_page(gfp_flags | __GFP_NOMEMALLOC, NULL);
305 if (!page) 305 if (!page)
306 return -ENOMEM; 306 return -ENOMEM;
307 307
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 50d0b78130a1..be3efc4f64f4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -52,11 +52,6 @@ struct wb_writeback_work {
52 struct completion *done; /* set if the caller waits */ 52 struct completion *done; /* set if the caller waits */
53}; 53};
54 54
55/*
56 * We don't actually have pdflush, but this one is exported though /proc...
57 */
58int nr_pdflush_threads;
59
60/** 55/**
61 * writeback_in_progress - determine whether there is writeback in progress 56 * writeback_in_progress - determine whether there is writeback in progress
62 * @bdi: the device's backing_dev_info structure. 57 * @bdi: the device's backing_dev_info structure.
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e13e9bdb0bf5..8349a899912e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -416,8 +416,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff)
416 else 416 else
417 v_offset = 0; 417 v_offset = 0;
418 418
419 __unmap_hugepage_range(vma, 419 unmap_hugepage_range(vma, vma->vm_start + v_offset,
420 vma->vm_start + v_offset, vma->vm_end, NULL); 420 vma->vm_end, NULL);
421 } 421 }
422} 422}
423 423
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 195c1ea6151a..db7ad719628a 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -86,6 +86,14 @@ config NFS_V4
86 86
87 If unsure, say Y. 87 If unsure, say Y.
88 88
89config NFS_SWAP
90 bool "Provide swap over NFS support"
91 default n
92 depends on NFS_FS
93 select SUNRPC_SWAP
94 help
95 This option enables swapon to work on files located on NFS mounts.
96
89config NFS_V4_1 97config NFS_V4_1
90 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" 98 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
91 depends on NFS_V4 && EXPERIMENTAL 99 depends on NFS_V4 && EXPERIMENTAL
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b7b4f80968b5..1ba385b7c90d 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -115,17 +115,28 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
115 * @nr_segs: size of iovec array 115 * @nr_segs: size of iovec array
116 * 116 *
117 * The presence of this routine in the address space ops vector means 117 * The presence of this routine in the address space ops vector means
118 * the NFS client supports direct I/O. However, we shunt off direct 118 * the NFS client supports direct I/O. However, for most direct IO, we
119 * read and write requests before the VFS gets them, so this method 119 * shunt off direct read and write requests before the VFS gets them,
120 * should never be called. 120 * so this method is only ever called for swap.
121 */ 121 */
122ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) 122ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
123{ 123{
124#ifndef CONFIG_NFS_SWAP
124 dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", 125 dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
125 iocb->ki_filp->f_path.dentry->d_name.name, 126 iocb->ki_filp->f_path.dentry->d_name.name,
126 (long long) pos, nr_segs); 127 (long long) pos, nr_segs);
127 128
128 return -EINVAL; 129 return -EINVAL;
130#else
131 VM_BUG_ON(iocb->ki_left != PAGE_SIZE);
132 VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
133
134 if (rw == READ || rw == KERNEL_READ)
135 return nfs_file_direct_read(iocb, iov, nr_segs, pos,
136 rw == READ ? true : false);
137 return nfs_file_direct_write(iocb, iov, nr_segs, pos,
138 rw == WRITE ? true : false);
139#endif /* CONFIG_NFS_SWAP */
129} 140}
130 141
131static void nfs_direct_release_pages(struct page **pages, unsigned int npages) 142static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
@@ -303,7 +314,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
303 */ 314 */
304static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc, 315static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
305 const struct iovec *iov, 316 const struct iovec *iov,
306 loff_t pos) 317 loff_t pos, bool uio)
307{ 318{
308 struct nfs_direct_req *dreq = desc->pg_dreq; 319 struct nfs_direct_req *dreq = desc->pg_dreq;
309 struct nfs_open_context *ctx = dreq->ctx; 320 struct nfs_open_context *ctx = dreq->ctx;
@@ -331,12 +342,20 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
331 GFP_KERNEL); 342 GFP_KERNEL);
332 if (!pagevec) 343 if (!pagevec)
333 break; 344 break;
334 down_read(&current->mm->mmap_sem); 345 if (uio) {
335 result = get_user_pages(current, current->mm, user_addr, 346 down_read(&current->mm->mmap_sem);
347 result = get_user_pages(current, current->mm, user_addr,
336 npages, 1, 0, pagevec, NULL); 348 npages, 1, 0, pagevec, NULL);
337 up_read(&current->mm->mmap_sem); 349 up_read(&current->mm->mmap_sem);
338 if (result < 0) 350 if (result < 0)
339 break; 351 break;
352 } else {
353 WARN_ON(npages != 1);
354 result = get_kernel_page(user_addr, 1, pagevec);
355 if (WARN_ON(result != 1))
356 break;
357 }
358
340 if ((unsigned)result < npages) { 359 if ((unsigned)result < npages) {
341 bytes = result * PAGE_SIZE; 360 bytes = result * PAGE_SIZE;
342 if (bytes <= pgbase) { 361 if (bytes <= pgbase) {
@@ -386,7 +405,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
386static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, 405static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
387 const struct iovec *iov, 406 const struct iovec *iov,
388 unsigned long nr_segs, 407 unsigned long nr_segs,
389 loff_t pos) 408 loff_t pos, bool uio)
390{ 409{
391 struct nfs_pageio_descriptor desc; 410 struct nfs_pageio_descriptor desc;
392 ssize_t result = -EINVAL; 411 ssize_t result = -EINVAL;
@@ -400,7 +419,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
400 419
401 for (seg = 0; seg < nr_segs; seg++) { 420 for (seg = 0; seg < nr_segs; seg++) {
402 const struct iovec *vec = &iov[seg]; 421 const struct iovec *vec = &iov[seg];
403 result = nfs_direct_read_schedule_segment(&desc, vec, pos); 422 result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio);
404 if (result < 0) 423 if (result < 0)
405 break; 424 break;
406 requested_bytes += result; 425 requested_bytes += result;
@@ -426,7 +445,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
426} 445}
427 446
428static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, 447static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
429 unsigned long nr_segs, loff_t pos) 448 unsigned long nr_segs, loff_t pos, bool uio)
430{ 449{
431 ssize_t result = -ENOMEM; 450 ssize_t result = -ENOMEM;
432 struct inode *inode = iocb->ki_filp->f_mapping->host; 451 struct inode *inode = iocb->ki_filp->f_mapping->host;
@@ -444,7 +463,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
444 if (!is_sync_kiocb(iocb)) 463 if (!is_sync_kiocb(iocb))
445 dreq->iocb = iocb; 464 dreq->iocb = iocb;
446 465
447 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); 466 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
448 if (!result) 467 if (!result)
449 result = nfs_direct_wait(dreq); 468 result = nfs_direct_wait(dreq);
450 NFS_I(inode)->read_io += result; 469 NFS_I(inode)->read_io += result;
@@ -610,7 +629,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
610 */ 629 */
611static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc, 630static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
612 const struct iovec *iov, 631 const struct iovec *iov,
613 loff_t pos) 632 loff_t pos, bool uio)
614{ 633{
615 struct nfs_direct_req *dreq = desc->pg_dreq; 634 struct nfs_direct_req *dreq = desc->pg_dreq;
616 struct nfs_open_context *ctx = dreq->ctx; 635 struct nfs_open_context *ctx = dreq->ctx;
@@ -638,12 +657,19 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
638 if (!pagevec) 657 if (!pagevec)
639 break; 658 break;
640 659
641 down_read(&current->mm->mmap_sem); 660 if (uio) {
642 result = get_user_pages(current, current->mm, user_addr, 661 down_read(&current->mm->mmap_sem);
643 npages, 0, 0, pagevec, NULL); 662 result = get_user_pages(current, current->mm, user_addr,
644 up_read(&current->mm->mmap_sem); 663 npages, 0, 0, pagevec, NULL);
645 if (result < 0) 664 up_read(&current->mm->mmap_sem);
646 break; 665 if (result < 0)
666 break;
667 } else {
668 WARN_ON(npages != 1);
669 result = get_kernel_page(user_addr, 0, pagevec);
670 if (WARN_ON(result != 1))
671 break;
672 }
647 673
648 if ((unsigned)result < npages) { 674 if ((unsigned)result < npages) {
649 bytes = result * PAGE_SIZE; 675 bytes = result * PAGE_SIZE;
@@ -774,7 +800,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
774static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, 800static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
775 const struct iovec *iov, 801 const struct iovec *iov,
776 unsigned long nr_segs, 802 unsigned long nr_segs,
777 loff_t pos) 803 loff_t pos, bool uio)
778{ 804{
779 struct nfs_pageio_descriptor desc; 805 struct nfs_pageio_descriptor desc;
780 struct inode *inode = dreq->inode; 806 struct inode *inode = dreq->inode;
@@ -790,7 +816,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
790 816
791 for (seg = 0; seg < nr_segs; seg++) { 817 for (seg = 0; seg < nr_segs; seg++) {
792 const struct iovec *vec = &iov[seg]; 818 const struct iovec *vec = &iov[seg];
793 result = nfs_direct_write_schedule_segment(&desc, vec, pos); 819 result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
794 if (result < 0) 820 if (result < 0)
795 break; 821 break;
796 requested_bytes += result; 822 requested_bytes += result;
@@ -818,7 +844,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
818 844
819static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, 845static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
820 unsigned long nr_segs, loff_t pos, 846 unsigned long nr_segs, loff_t pos,
821 size_t count) 847 size_t count, bool uio)
822{ 848{
823 ssize_t result = -ENOMEM; 849 ssize_t result = -ENOMEM;
824 struct inode *inode = iocb->ki_filp->f_mapping->host; 850 struct inode *inode = iocb->ki_filp->f_mapping->host;
@@ -836,7 +862,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
836 if (!is_sync_kiocb(iocb)) 862 if (!is_sync_kiocb(iocb))
837 dreq->iocb = iocb; 863 dreq->iocb = iocb;
838 864
839 result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos); 865 result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
840 if (!result) 866 if (!result)
841 result = nfs_direct_wait(dreq); 867 result = nfs_direct_wait(dreq);
842out_release: 868out_release:
@@ -867,7 +893,7 @@ out:
867 * cache. 893 * cache.
868 */ 894 */
869ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, 895ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
870 unsigned long nr_segs, loff_t pos) 896 unsigned long nr_segs, loff_t pos, bool uio)
871{ 897{
872 ssize_t retval = -EINVAL; 898 ssize_t retval = -EINVAL;
873 struct file *file = iocb->ki_filp; 899 struct file *file = iocb->ki_filp;
@@ -892,7 +918,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
892 918
893 task_io_account_read(count); 919 task_io_account_read(count);
894 920
895 retval = nfs_direct_read(iocb, iov, nr_segs, pos); 921 retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio);
896 if (retval > 0) 922 if (retval > 0)
897 iocb->ki_pos = pos + retval; 923 iocb->ki_pos = pos + retval;
898 924
@@ -923,7 +949,7 @@ out:
923 * is no atomic O_APPEND write facility in the NFS protocol. 949 * is no atomic O_APPEND write facility in the NFS protocol.
924 */ 950 */
925ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, 951ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
926 unsigned long nr_segs, loff_t pos) 952 unsigned long nr_segs, loff_t pos, bool uio)
927{ 953{
928 ssize_t retval = -EINVAL; 954 ssize_t retval = -EINVAL;
929 struct file *file = iocb->ki_filp; 955 struct file *file = iocb->ki_filp;
@@ -955,7 +981,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
955 981
956 task_io_account_write(count); 982 task_io_account_write(count);
957 983
958 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); 984 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio);
959 if (retval > 0) { 985 if (retval > 0) {
960 struct inode *inode = mapping->host; 986 struct inode *inode = mapping->host;
961 987
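For context on how the new uio flag and get_kernel_page() path get exercised: once a file-backed swap area is activated, the swap-out code in mm/page_io.c (changed elsewhere in this diff) issues the page through ->direct_IO() with the new KERNEL_READ/KERNEL_WRITE rw values and an iovec pointing at the kmapped page, which is exactly the single PAGE_SIZE segment the VM_BUG_ON()s above insist on. A rough, non-verbatim sketch of that caller side, with a hypothetical helper name:

#include <linux/aio.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>

/* loosely modelled on the swap_writepage() changes in mm/page_io.c */
static int my_swap_page_out(struct file *swap_file, struct page *page)
{
	struct address_space *mapping = swap_file->f_mapping;
	struct iovec iov = {
		.iov_base = kmap(page),
		.iov_len  = PAGE_SIZE,
	};
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, swap_file);
	kiocb.ki_pos	= page_file_offset(page);
	kiocb.ki_left	= PAGE_SIZE;
	kiocb.ki_nbytes	= PAGE_SIZE;

	/* KERNEL_WRITE makes nfs_direct_IO() take the !uio branch above. */
	ret = mapping->a_ops->direct_IO(KERNEL_WRITE, &kiocb, &iov,
					kiocb.ki_pos, 1);
	kunmap(page);
	return ret == PAGE_SIZE ? 0 : -EIO;
}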
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index b039a17ee941..75d6d0a3d32e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -180,7 +180,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
180 ssize_t result; 180 ssize_t result;
181 181
182 if (iocb->ki_filp->f_flags & O_DIRECT) 182 if (iocb->ki_filp->f_flags & O_DIRECT)
183 return nfs_file_direct_read(iocb, iov, nr_segs, pos); 183 return nfs_file_direct_read(iocb, iov, nr_segs, pos, true);
184 184
185 dprintk("NFS: read(%s/%s, %lu@%lu)\n", 185 dprintk("NFS: read(%s/%s, %lu@%lu)\n",
186 dentry->d_parent->d_name.name, dentry->d_name.name, 186 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -439,7 +439,7 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
439 if (offset != 0) 439 if (offset != 0)
440 return; 440 return;
441 /* Cancel any unstarted writes on this page */ 441 /* Cancel any unstarted writes on this page */
442 nfs_wb_page_cancel(page->mapping->host, page); 442 nfs_wb_page_cancel(page_file_mapping(page)->host, page);
443 443
444 nfs_fscache_invalidate_page(page, page->mapping->host); 444 nfs_fscache_invalidate_page(page, page->mapping->host);
445} 445}
@@ -484,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
484 */ 484 */
485static int nfs_launder_page(struct page *page) 485static int nfs_launder_page(struct page *page)
486{ 486{
487 struct inode *inode = page->mapping->host; 487 struct inode *inode = page_file_mapping(page)->host;
488 struct nfs_inode *nfsi = NFS_I(inode); 488 struct nfs_inode *nfsi = NFS_I(inode);
489 489
490 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", 490 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
@@ -494,6 +494,20 @@ static int nfs_launder_page(struct page *page)
494 return nfs_wb_page(inode, page); 494 return nfs_wb_page(inode, page);
495} 495}
496 496
497#ifdef CONFIG_NFS_SWAP
498static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
499 sector_t *span)
500{
501 *span = sis->pages;
502 return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
503}
504
505static void nfs_swap_deactivate(struct file *file)
506{
507 xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
508}
509#endif
510
497const struct address_space_operations nfs_file_aops = { 511const struct address_space_operations nfs_file_aops = {
498 .readpage = nfs_readpage, 512 .readpage = nfs_readpage,
499 .readpages = nfs_readpages, 513 .readpages = nfs_readpages,
@@ -508,6 +522,10 @@ const struct address_space_operations nfs_file_aops = {
508 .migratepage = nfs_migrate_page, 522 .migratepage = nfs_migrate_page,
509 .launder_page = nfs_launder_page, 523 .launder_page = nfs_launder_page,
510 .error_remove_page = generic_error_remove_page, 524 .error_remove_page = generic_error_remove_page,
525#ifdef CONFIG_NFS_SWAP
526 .swap_activate = nfs_swap_activate,
527 .swap_deactivate = nfs_swap_deactivate,
528#endif
511}; 529};
512 530
513/* 531/*
@@ -533,7 +551,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
533 nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); 551 nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
534 552
535 lock_page(page); 553 lock_page(page);
536 mapping = page->mapping; 554 mapping = page_file_mapping(page);
537 if (mapping != dentry->d_inode->i_mapping) 555 if (mapping != dentry->d_inode->i_mapping)
538 goto out_unlock; 556 goto out_unlock;
539 557
@@ -582,7 +600,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
582 size_t count = iov_length(iov, nr_segs); 600 size_t count = iov_length(iov, nr_segs);
583 601
584 if (iocb->ki_filp->f_flags & O_DIRECT) 602 if (iocb->ki_filp->f_flags & O_DIRECT)
585 return nfs_file_direct_write(iocb, iov, nr_segs, pos); 603 return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
586 604
587 dprintk("NFS: write(%s/%s, %lu@%Ld)\n", 605 dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
588 dentry->d_parent->d_name.name, dentry->d_name.name, 606 dentry->d_parent->d_name.name, dentry->d_name.name,
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2ed6138f32ad..c6e895f0fbf3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -897,6 +897,10 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
897 struct nfs_inode *nfsi = NFS_I(inode); 897 struct nfs_inode *nfsi = NFS_I(inode);
898 int ret = 0; 898 int ret = 0;
899 899
900 /* swapfiles are not supposed to be shared. */
901 if (IS_SWAPFILE(inode))
902 goto out;
903
900 if (nfs_mapping_need_revalidate_inode(inode)) { 904 if (nfs_mapping_need_revalidate_inode(inode)) {
901 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); 905 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
902 if (ret < 0) 906 if (ret < 0)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 8865538b26b6..31fdb03225cd 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -554,13 +554,14 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
554static inline 554static inline
555unsigned int nfs_page_length(struct page *page) 555unsigned int nfs_page_length(struct page *page)
556{ 556{
557 loff_t i_size = i_size_read(page->mapping->host); 557 loff_t i_size = i_size_read(page_file_mapping(page)->host);
558 558
559 if (i_size > 0) { 559 if (i_size > 0) {
560 pgoff_t page_index = page_file_index(page);
560 pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; 561 pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
561 if (page->index < end_index) 562 if (page_index < end_index)
562 return PAGE_CACHE_SIZE; 563 return PAGE_CACHE_SIZE;
563 if (page->index == end_index) 564 if (page_index == end_index)
564 return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; 565 return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
565 } 566 }
566 return 0; 567 return 0;
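page_file_mapping() and page_file_index(), used in this hunk and throughout the NFS changes below, come from the include/linux/mm.h and include/linux/pagemap.h entries in the diffstat. Their expected shape, as a sketch rather than the verbatim definitions: for an ordinary page-cache page they degenerate to page->mapping and page->index, while for a swapcache page headed to a file-backed swap area they resolve to the swap file's address_space and the page's offset within that file.

static inline struct address_space *page_file_mapping(struct page *page)
{
	if (unlikely(PageSwapCache(page)))
		/* swapcache page: resolve to the backing swap file */
		return page_swap_info(page)->swap_file->f_mapping;
	return page->mapping;
}

static inline pgoff_t page_file_index(struct page *page)
{
	if (unlikely(PageSwapCache(page))) {
		/* the page's offset inside the swap file */
		swp_entry_t swap = { .val = page_private(page) };
		return swp_offset(swap);
	}
	return page->index;
}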
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 1e7d8879dae6..1a6732ed04a4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -71,7 +71,7 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
71static inline struct nfs_page * 71static inline struct nfs_page *
72nfs_page_alloc(void) 72nfs_page_alloc(void)
73{ 73{
74 struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL); 74 struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO);
75 if (p) 75 if (p)
76 INIT_LIST_HEAD(&p->wb_list); 76 INIT_LIST_HEAD(&p->wb_list);
77 return p; 77 return p;
@@ -118,7 +118,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
118 * long write-back delay. This will be adjusted in 118 * long write-back delay. This will be adjusted in
119 * update_nfs_request below if the region is not locked. */ 119 * update_nfs_request below if the region is not locked. */
120 req->wb_page = page; 120 req->wb_page = page;
121 req->wb_index = page->index; 121 req->wb_index = page_file_index(page);
122 page_cache_get(page); 122 page_cache_get(page);
123 req->wb_offset = offset; 123 req->wb_offset = offset;
124 req->wb_pgbase = offset; 124 req->wb_pgbase = offset;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6935e401ad76..b6bdb18e892c 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -527,11 +527,11 @@ static const struct rpc_call_ops nfs_read_common_ops = {
527int nfs_readpage(struct file *file, struct page *page) 527int nfs_readpage(struct file *file, struct page *page)
528{ 528{
529 struct nfs_open_context *ctx; 529 struct nfs_open_context *ctx;
530 struct inode *inode = page->mapping->host; 530 struct inode *inode = page_file_mapping(page)->host;
531 int error; 531 int error;
532 532
533 dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", 533 dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
534 page, PAGE_CACHE_SIZE, page->index); 534 page, PAGE_CACHE_SIZE, page_file_index(page));
535 nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); 535 nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
536 nfs_add_stats(inode, NFSIOS_READPAGES, 1); 536 nfs_add_stats(inode, NFSIOS_READPAGES, 1);
537 537
@@ -585,7 +585,7 @@ static int
585readpage_async_filler(void *data, struct page *page) 585readpage_async_filler(void *data, struct page *page)
586{ 586{
587 struct nfs_readdesc *desc = (struct nfs_readdesc *)data; 587 struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
588 struct inode *inode = page->mapping->host; 588 struct inode *inode = page_file_mapping(page)->host;
589 struct nfs_page *new; 589 struct nfs_page *new;
590 unsigned int len; 590 unsigned int len;
591 int error; 591 int error;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e4a2ad2059bd..5829d0ce7cfb 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -52,7 +52,7 @@ static mempool_t *nfs_commit_mempool;
52 52
53struct nfs_commit_data *nfs_commitdata_alloc(void) 53struct nfs_commit_data *nfs_commitdata_alloc(void)
54{ 54{
55 struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); 55 struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
56 56
57 if (p) { 57 if (p) {
58 memset(p, 0, sizeof(*p)); 58 memset(p, 0, sizeof(*p));
@@ -70,7 +70,7 @@ EXPORT_SYMBOL_GPL(nfs_commit_free);
70 70
71struct nfs_write_header *nfs_writehdr_alloc(void) 71struct nfs_write_header *nfs_writehdr_alloc(void)
72{ 72{
73 struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); 73 struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
74 74
75 if (p) { 75 if (p) {
76 struct nfs_pgio_header *hdr = &p->header; 76 struct nfs_pgio_header *hdr = &p->header;
@@ -142,25 +142,38 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
142 set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 142 set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
143} 143}
144 144
145static struct nfs_page *nfs_page_find_request_locked(struct page *page) 145static struct nfs_page *
146nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page)
146{ 147{
147 struct nfs_page *req = NULL; 148 struct nfs_page *req = NULL;
148 149
149 if (PagePrivate(page)) { 150 if (PagePrivate(page))
150 req = (struct nfs_page *)page_private(page); 151 req = (struct nfs_page *)page_private(page);
151 if (req != NULL) 152 else if (unlikely(PageSwapCache(page))) {
152 kref_get(&req->wb_kref); 153 struct nfs_page *freq, *t;
154
155 /* Linearly search the commit list for the correct req */
156 list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {
157 if (freq->wb_page == page) {
158 req = freq;
159 break;
160 }
161 }
153 } 162 }
163
164 if (req)
165 kref_get(&req->wb_kref);
166
154 return req; 167 return req;
155} 168}
156 169
157static struct nfs_page *nfs_page_find_request(struct page *page) 170static struct nfs_page *nfs_page_find_request(struct page *page)
158{ 171{
159 struct inode *inode = page->mapping->host; 172 struct inode *inode = page_file_mapping(page)->host;
160 struct nfs_page *req = NULL; 173 struct nfs_page *req = NULL;
161 174
162 spin_lock(&inode->i_lock); 175 spin_lock(&inode->i_lock);
163 req = nfs_page_find_request_locked(page); 176 req = nfs_page_find_request_locked(NFS_I(inode), page);
164 spin_unlock(&inode->i_lock); 177 spin_unlock(&inode->i_lock);
165 return req; 178 return req;
166} 179}
@@ -168,16 +181,16 @@ static struct nfs_page *nfs_page_find_request(struct page *page)
168/* Adjust the file length if we're writing beyond the end */ 181/* Adjust the file length if we're writing beyond the end */
169static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) 182static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
170{ 183{
171 struct inode *inode = page->mapping->host; 184 struct inode *inode = page_file_mapping(page)->host;
172 loff_t end, i_size; 185 loff_t end, i_size;
173 pgoff_t end_index; 186 pgoff_t end_index;
174 187
175 spin_lock(&inode->i_lock); 188 spin_lock(&inode->i_lock);
176 i_size = i_size_read(inode); 189 i_size = i_size_read(inode);
177 end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; 190 end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
178 if (i_size > 0 && page->index < end_index) 191 if (i_size > 0 && page_file_index(page) < end_index)
179 goto out; 192 goto out;
180 end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); 193 end = page_file_offset(page) + ((loff_t)offset+count);
181 if (i_size >= end) 194 if (i_size >= end)
182 goto out; 195 goto out;
183 i_size_write(inode, end); 196 i_size_write(inode, end);
@@ -190,7 +203,7 @@ out:
190static void nfs_set_pageerror(struct page *page) 203static void nfs_set_pageerror(struct page *page)
191{ 204{
192 SetPageError(page); 205 SetPageError(page);
193 nfs_zap_mapping(page->mapping->host, page->mapping); 206 nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
194} 207}
195 208
196/* We can set the PG_uptodate flag if we see that a write request 209/* We can set the PG_uptodate flag if we see that a write request
@@ -231,7 +244,7 @@ static int nfs_set_page_writeback(struct page *page)
231 int ret = test_set_page_writeback(page); 244 int ret = test_set_page_writeback(page);
232 245
233 if (!ret) { 246 if (!ret) {
234 struct inode *inode = page->mapping->host; 247 struct inode *inode = page_file_mapping(page)->host;
235 struct nfs_server *nfss = NFS_SERVER(inode); 248 struct nfs_server *nfss = NFS_SERVER(inode);
236 249
237 if (atomic_long_inc_return(&nfss->writeback) > 250 if (atomic_long_inc_return(&nfss->writeback) >
@@ -245,7 +258,7 @@ static int nfs_set_page_writeback(struct page *page)
245 258
246static void nfs_end_page_writeback(struct page *page) 259static void nfs_end_page_writeback(struct page *page)
247{ 260{
248 struct inode *inode = page->mapping->host; 261 struct inode *inode = page_file_mapping(page)->host;
249 struct nfs_server *nfss = NFS_SERVER(inode); 262 struct nfs_server *nfss = NFS_SERVER(inode);
250 263
251 end_page_writeback(page); 264 end_page_writeback(page);
@@ -255,13 +268,13 @@ static void nfs_end_page_writeback(struct page *page)
255 268
256static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) 269static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock)
257{ 270{
258 struct inode *inode = page->mapping->host; 271 struct inode *inode = page_file_mapping(page)->host;
259 struct nfs_page *req; 272 struct nfs_page *req;
260 int ret; 273 int ret;
261 274
262 spin_lock(&inode->i_lock); 275 spin_lock(&inode->i_lock);
263 for (;;) { 276 for (;;) {
264 req = nfs_page_find_request_locked(page); 277 req = nfs_page_find_request_locked(NFS_I(inode), page);
265 if (req == NULL) 278 if (req == NULL)
266 break; 279 break;
267 if (nfs_lock_request(req)) 280 if (nfs_lock_request(req))
@@ -316,13 +329,13 @@ out:
316 329
317static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) 330static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
318{ 331{
319 struct inode *inode = page->mapping->host; 332 struct inode *inode = page_file_mapping(page)->host;
320 int ret; 333 int ret;
321 334
322 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 335 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
323 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 336 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
324 337
325 nfs_pageio_cond_complete(pgio, page->index); 338 nfs_pageio_cond_complete(pgio, page_file_index(page));
326 ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); 339 ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
327 if (ret == -EAGAIN) { 340 if (ret == -EAGAIN) {
328 redirty_page_for_writepage(wbc, page); 341 redirty_page_for_writepage(wbc, page);
@@ -339,7 +352,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
339 struct nfs_pageio_descriptor pgio; 352 struct nfs_pageio_descriptor pgio;
340 int err; 353 int err;
341 354
342 NFS_PROTO(page->mapping->host)->write_pageio_init(&pgio, 355 NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio,
343 page->mapping->host, 356 page->mapping->host,
344 wb_priority(wbc), 357 wb_priority(wbc),
345 &nfs_async_write_completion_ops); 358 &nfs_async_write_completion_ops);
@@ -416,9 +429,15 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
416 spin_lock(&inode->i_lock); 429 spin_lock(&inode->i_lock);
417 if (!nfsi->npages && NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) 430 if (!nfsi->npages && NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
418 inode->i_version++; 431 inode->i_version++;
419 set_bit(PG_MAPPED, &req->wb_flags); 432 /*
420 SetPagePrivate(req->wb_page); 433 * Swap-space should not get truncated. Hence no need to plug the race
421 set_page_private(req->wb_page, (unsigned long)req); 434 * with invalidate/truncate.
435 */
436 if (likely(!PageSwapCache(req->wb_page))) {
437 set_bit(PG_MAPPED, &req->wb_flags);
438 SetPagePrivate(req->wb_page);
439 set_page_private(req->wb_page, (unsigned long)req);
440 }
422 nfsi->npages++; 441 nfsi->npages++;
423 kref_get(&req->wb_kref); 442 kref_get(&req->wb_kref);
424 spin_unlock(&inode->i_lock); 443 spin_unlock(&inode->i_lock);
@@ -435,9 +454,11 @@ static void nfs_inode_remove_request(struct nfs_page *req)
435 BUG_ON (!NFS_WBACK_BUSY(req)); 454 BUG_ON (!NFS_WBACK_BUSY(req));
436 455
437 spin_lock(&inode->i_lock); 456 spin_lock(&inode->i_lock);
438 set_page_private(req->wb_page, 0); 457 if (likely(!PageSwapCache(req->wb_page))) {
439 ClearPagePrivate(req->wb_page); 458 set_page_private(req->wb_page, 0);
440 clear_bit(PG_MAPPED, &req->wb_flags); 459 ClearPagePrivate(req->wb_page);
460 clear_bit(PG_MAPPED, &req->wb_flags);
461 }
441 nfsi->npages--; 462 nfsi->npages--;
442 spin_unlock(&inode->i_lock); 463 spin_unlock(&inode->i_lock);
443 nfs_release_request(req); 464 nfs_release_request(req);
@@ -474,7 +495,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
474 spin_unlock(cinfo->lock); 495 spin_unlock(cinfo->lock);
475 if (!cinfo->dreq) { 496 if (!cinfo->dreq) {
476 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 497 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
477 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, 498 inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
478 BDI_RECLAIMABLE); 499 BDI_RECLAIMABLE);
479 __mark_inode_dirty(req->wb_context->dentry->d_inode, 500 __mark_inode_dirty(req->wb_context->dentry->d_inode,
480 I_DIRTY_DATASYNC); 501 I_DIRTY_DATASYNC);
@@ -541,7 +562,7 @@ static void
541nfs_clear_page_commit(struct page *page) 562nfs_clear_page_commit(struct page *page)
542{ 563{
543 dec_zone_page_state(page, NR_UNSTABLE_NFS); 564 dec_zone_page_state(page, NR_UNSTABLE_NFS);
544 dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); 565 dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
545} 566}
546 567
547static void 568static void
@@ -733,7 +754,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
733 spin_lock(&inode->i_lock); 754 spin_lock(&inode->i_lock);
734 755
735 for (;;) { 756 for (;;) {
736 req = nfs_page_find_request_locked(page); 757 req = nfs_page_find_request_locked(NFS_I(inode), page);
737 if (req == NULL) 758 if (req == NULL)
738 goto out_unlock; 759 goto out_unlock;
739 760
@@ -792,7 +813,7 @@ out_err:
792static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, 813static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
793 struct page *page, unsigned int offset, unsigned int bytes) 814 struct page *page, unsigned int offset, unsigned int bytes)
794{ 815{
795 struct inode *inode = page->mapping->host; 816 struct inode *inode = page_file_mapping(page)->host;
796 struct nfs_page *req; 817 struct nfs_page *req;
797 818
798 req = nfs_try_to_update_request(inode, page, offset, bytes); 819 req = nfs_try_to_update_request(inode, page, offset, bytes);
@@ -845,7 +866,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
845 nfs_release_request(req); 866 nfs_release_request(req);
846 if (!do_flush) 867 if (!do_flush)
847 return 0; 868 return 0;
848 status = nfs_wb_page(page->mapping->host, page); 869 status = nfs_wb_page(page_file_mapping(page)->host, page);
849 } while (status == 0); 870 } while (status == 0);
850 return status; 871 return status;
851} 872}
@@ -875,7 +896,7 @@ int nfs_updatepage(struct file *file, struct page *page,
875 unsigned int offset, unsigned int count) 896 unsigned int offset, unsigned int count)
876{ 897{
877 struct nfs_open_context *ctx = nfs_file_open_context(file); 898 struct nfs_open_context *ctx = nfs_file_open_context(file);
878 struct inode *inode = page->mapping->host; 899 struct inode *inode = page_file_mapping(page)->host;
879 int status = 0; 900 int status = 0;
880 901
881 nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); 902 nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
@@ -883,7 +904,7 @@ int nfs_updatepage(struct file *file, struct page *page,
883 dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", 904 dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n",
884 file->f_path.dentry->d_parent->d_name.name, 905 file->f_path.dentry->d_parent->d_name.name,
885 file->f_path.dentry->d_name.name, count, 906 file->f_path.dentry->d_name.name, count,
886 (long long)(page_offset(page) + offset)); 907 (long long)(page_file_offset(page) + offset));
887 908
888 /* If we're not using byte range locks, and we know the page 909 /* If we're not using byte range locks, and we know the page
889 * is up to date, it may be more efficient to extend the write 910 * is up to date, it may be more efficient to extend the write
@@ -1474,7 +1495,7 @@ void nfs_retry_commit(struct list_head *page_list,
1474 nfs_mark_request_commit(req, lseg, cinfo); 1495 nfs_mark_request_commit(req, lseg, cinfo);
1475 if (!cinfo->dreq) { 1496 if (!cinfo->dreq) {
1476 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 1497 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1477 dec_bdi_stat(req->wb_page->mapping->backing_dev_info, 1498 dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
1478 BDI_RECLAIMABLE); 1499 BDI_RECLAIMABLE);
1479 } 1500 }
1480 nfs_unlock_and_release_request(req); 1501 nfs_unlock_and_release_request(req);
@@ -1731,7 +1752,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1731 */ 1752 */
1732int nfs_wb_page(struct inode *inode, struct page *page) 1753int nfs_wb_page(struct inode *inode, struct page *page)
1733{ 1754{
1734 loff_t range_start = page_offset(page); 1755 loff_t range_start = page_file_offset(page);
1735 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); 1756 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
1736 struct writeback_control wbc = { 1757 struct writeback_control wbc = {
1737 .sync_mode = WB_SYNC_ALL, 1758 .sync_mode = WB_SYNC_ALL,
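
The fs/nfs/write.c changes above consistently replace page->mapping, page->index and page_offset() with page_file_mapping(), page_file_index() and page_file_offset() so that swapcache pages resolve to the backing swap file's inode instead of the swapper space. A minimal sketch of that pattern, using only the helpers declared in the include/linux/mm.h hunk later in this diff; the function name example_page_to_inode is hypothetical, not an NFS symbol:

#include <linux/fs.h>
#include <linux/mm.h>

/* Hypothetical helper: resolve the inode backing a page, swapcache-aware.
 * page_file_mapping() falls back to the swap file's address_space when
 * PageSwapCache(page) is true; a bare page->mapping would not. */
static inline struct inode *example_page_to_inode(struct page *page)
{
        return page_file_mapping(page)->host;
}
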
diff --git a/fs/super.c b/fs/super.c
index 4c5d82f56ec4..4bf714459a4b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -62,7 +62,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
62 return -1; 62 return -1;
63 63
64 if (!grab_super_passive(sb)) 64 if (!grab_super_passive(sb))
65 return !sc->nr_to_scan ? 0 : -1; 65 return -1;
66 66
67 if (sb->s_op && sb->s_op->nr_cached_objects) 67 if (sb->s_op && sb->s_op->nr_cached_objects)
68 fs_objects = sb->s_op->nr_cached_objects(sb); 68 fs_objects = sb->s_op->nr_cached_objects(sb);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 489de625cd25..c97c6b9cd38e 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -17,6 +17,7 @@
17#include <linux/timer.h> 17#include <linux/timer.h>
18#include <linux/writeback.h> 18#include <linux/writeback.h>
19#include <linux/atomic.h> 19#include <linux/atomic.h>
20#include <linux/sysctl.h>
20 21
21struct page; 22struct page;
22struct device; 23struct device;
@@ -304,6 +305,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
304void set_bdi_congested(struct backing_dev_info *bdi, int sync); 305void set_bdi_congested(struct backing_dev_info *bdi, int sync);
305long congestion_wait(int sync, long timeout); 306long congestion_wait(int sync, long timeout);
306long wait_iff_congested(struct zone *zone, int sync, long timeout); 307long wait_iff_congested(struct zone *zone, int sync, long timeout);
308int pdflush_proc_obsolete(struct ctl_table *table, int write,
309 void __user *buffer, size_t *lenp, loff_t *ppos);
307 310
308static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) 311static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
309{ 312{
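
pdflush_proc_obsolete() is declared above as a drop-in proc handler for the retired pdflush sysctls. A hedged sketch of how such a handler might be wired into a ctl_table entry; the table name, mode and placement here are illustrative assumptions, not copied from kernel/sysctl.c:

#include <linux/sysctl.h>
#include <linux/backing-dev.h>

/* Illustrative sysctl entry: the obsolete handler is meant to warn once
 * and report a fixed value rather than expose real tunables. */
static struct ctl_table example_vm_table[] = {
        {
                .procname       = "nr_pdflush_threads",
                .mode           = 0444,
                .proc_handler   = pdflush_proc_obsolete,
        },
        { }
};
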
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 0edb65dd8edd..7b7ac9ccec7a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -160,6 +160,7 @@ enum rq_flag_bits {
160 __REQ_FLUSH_SEQ, /* request for flush sequence */ 160 __REQ_FLUSH_SEQ, /* request for flush sequence */
161 __REQ_IO_STAT, /* account I/O stat */ 161 __REQ_IO_STAT, /* account I/O stat */
162 __REQ_MIXED_MERGE, /* merge of different types, fail separately */ 162 __REQ_MIXED_MERGE, /* merge of different types, fail separately */
163 __REQ_KERNEL, /* direct IO to kernel pages */
163 __REQ_NR_BITS, /* stops here */ 164 __REQ_NR_BITS, /* stops here */
164}; 165};
165 166
@@ -201,5 +202,6 @@ enum rq_flag_bits {
201#define REQ_IO_STAT (1 << __REQ_IO_STAT) 202#define REQ_IO_STAT (1 << __REQ_IO_STAT)
202#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) 203#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE)
203#define REQ_SECURE (1 << __REQ_SECURE) 204#define REQ_SECURE (1 << __REQ_SECURE)
205#define REQ_KERNEL (1 << __REQ_KERNEL)
204 206
205#endif /* __LINUX_BLK_TYPES_H */ 207#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0bd390ce98b2..dfae957398c3 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -31,7 +31,7 @@ SUBSYS(cpuacct)
31 31
32/* */ 32/* */
33 33
34#ifdef CONFIG_CGROUP_MEM_RES_CTLR 34#ifdef CONFIG_MEMCG
35SUBSYS(mem_cgroup) 35SUBSYS(mem_cgroup)
36#endif 36#endif
37 37
@@ -72,3 +72,9 @@ SUBSYS(net_prio)
72#endif 72#endif
73 73
74/* */ 74/* */
75
76#ifdef CONFIG_CGROUP_HUGETLB
77SUBSYS(hugetlb)
78#endif
79
80/* */
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 51a90b7f2d60..133ddcf83397 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -58,7 +58,7 @@ static inline bool compaction_deferred(struct zone *zone, int order)
58 if (++zone->compact_considered > defer_limit) 58 if (++zone->compact_considered > defer_limit)
59 zone->compact_considered = defer_limit; 59 zone->compact_considered = defer_limit;
60 60
61 return zone->compact_considered < (1UL << zone->compact_defer_shift); 61 return zone->compact_considered < defer_limit;
62} 62}
63 63
64#else 64#else
@@ -85,7 +85,7 @@ static inline void defer_compaction(struct zone *zone, int order)
85 85
86static inline bool compaction_deferred(struct zone *zone, int order) 86static inline bool compaction_deferred(struct zone *zone, int order)
87{ 87{
88 return 1; 88 return true;
89} 89}
90 90
91#endif /* CONFIG_COMPACTION */ 91#endif /* CONFIG_COMPACTION */
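
The compaction_deferred() fix above makes the return test compare against the same defer_limit the counter was just clamped to, instead of recomputing the shift. A standalone model of that bookkeeping, assuming only the two zone fields visible in the hunk (compact_considered, compact_defer_shift); the real helper also takes an order argument, which is omitted here:

#include <linux/types.h>

/* Simplified model of the deferral logic: after a compaction failure a
 * zone is skipped for 1 << compact_defer_shift further attempts. */
struct example_zone {
        unsigned int compact_considered;
        unsigned int compact_defer_shift;
};

static bool example_compaction_deferred(struct example_zone *zone)
{
        unsigned long defer_limit = 1UL << zone->compact_defer_shift;

        if (++zone->compact_considered > defer_limit)
                zone->compact_considered = defer_limit;

        /* Fixed check: compare against the clamped limit itself. */
        return zone->compact_considered < defer_limit;
}
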
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b178f9e91e23..d7eed5b98ae2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -165,6 +165,8 @@ struct inodes_stat_t {
165#define READ 0 165#define READ 0
166#define WRITE RW_MASK 166#define WRITE RW_MASK
167#define READA RWA_MASK 167#define READA RWA_MASK
168#define KERNEL_READ (READ|REQ_KERNEL)
169#define KERNEL_WRITE (WRITE|REQ_KERNEL)
168 170
169#define READ_SYNC (READ | REQ_SYNC) 171#define READ_SYNC (READ | REQ_SYNC)
170#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE) 172#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE)
@@ -427,6 +429,7 @@ struct kstatfs;
427struct vm_area_struct; 429struct vm_area_struct;
428struct vfsmount; 430struct vfsmount;
429struct cred; 431struct cred;
432struct swap_info_struct;
430 433
431extern void __init inode_init(void); 434extern void __init inode_init(void);
432extern void __init inode_init_early(void); 435extern void __init inode_init_early(void);
@@ -636,6 +639,11 @@ struct address_space_operations {
636 int (*is_partially_uptodate) (struct page *, read_descriptor_t *, 639 int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
637 unsigned long); 640 unsigned long);
638 int (*error_remove_page)(struct address_space *, struct page *); 641 int (*error_remove_page)(struct address_space *, struct page *);
642
643 /* swapfile support */
644 int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
645 sector_t *span);
646 void (*swap_deactivate)(struct file *file);
639}; 647};
640 648
641extern const struct address_space_operations empty_aops; 649extern const struct address_space_operations empty_aops;
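
The new swap_activate/swap_deactivate address_space operations above let a filesystem prepare a file for use as swap space. A minimal sketch of a stub pair under the declared signatures; everything prefixed example_ is hypothetical, and a real implementation would typically validate or pin the file's block mapping and may fill in *span:

#include <linux/fs.h>

/* Hypothetical no-op activation: return 0 to accept the file as swap. */
static int example_swap_activate(struct swap_info_struct *sis,
                                 struct file *file, sector_t *span)
{
        return 0;
}

static void example_swap_deactivate(struct file *file)
{
}

static const struct address_space_operations example_aops = {
        .swap_activate   = example_swap_activate,
        .swap_deactivate = example_swap_deactivate,
};
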
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 1e49be49d324..4883f393f50a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -23,6 +23,7 @@ struct vm_area_struct;
23#define ___GFP_REPEAT 0x400u 23#define ___GFP_REPEAT 0x400u
24#define ___GFP_NOFAIL 0x800u 24#define ___GFP_NOFAIL 0x800u
25#define ___GFP_NORETRY 0x1000u 25#define ___GFP_NORETRY 0x1000u
26#define ___GFP_MEMALLOC 0x2000u
26#define ___GFP_COMP 0x4000u 27#define ___GFP_COMP 0x4000u
27#define ___GFP_ZERO 0x8000u 28#define ___GFP_ZERO 0x8000u
28#define ___GFP_NOMEMALLOC 0x10000u 29#define ___GFP_NOMEMALLOC 0x10000u
@@ -76,9 +77,14 @@ struct vm_area_struct;
76#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) /* See above */ 77#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) /* See above */
77#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) /* See above */ 78#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) /* See above */
78#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* See above */ 79#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* See above */
80#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves */
79#define __GFP_COMP ((__force gfp_t)___GFP_COMP) /* Add compound page metadata */ 81#define __GFP_COMP ((__force gfp_t)___GFP_COMP) /* Add compound page metadata */
80#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Return zeroed page on success */ 82#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Return zeroed page on success */
81#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */ 83#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves.
84 * This takes precedence over the
85 * __GFP_MEMALLOC flag if both are
86 * set
87 */
82#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */ 88#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */
83#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */ 89#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */
84#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ 90#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
@@ -129,7 +135,7 @@ struct vm_area_struct;
129/* Control page allocator reclaim behavior */ 135/* Control page allocator reclaim behavior */
130#define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\ 136#define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
131 __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ 137 __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
132 __GFP_NORETRY|__GFP_NOMEMALLOC) 138 __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)
133 139
134/* Control slab gfp mask during early boot */ 140/* Control slab gfp mask during early boot */
135#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)) 141#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS))
@@ -379,6 +385,9 @@ void drain_local_pages(void *dummy);
379 */ 385 */
380extern gfp_t gfp_allowed_mask; 386extern gfp_t gfp_allowed_mask;
381 387
388/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
389bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
390
382extern void pm_restrict_gfp_mask(void); 391extern void pm_restrict_gfp_mask(void);
383extern void pm_restore_gfp_mask(void); 392extern void pm_restore_gfp_mask(void);
384 393
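
__GFP_MEMALLOC above grants an allocation access to the emergency reserves, with __GFP_NOMEMALLOC taking precedence when both are set. A hedged sketch of an allocation site on a memory-reclaim path using only the flag and the gfp_pfmemalloc_allowed() declaration from this hunk; example_alloc_for_swap_io is hypothetical:

#include <linux/gfp.h>

/* Hypothetical allocation on a swap-out I/O path: __GFP_MEMALLOC lets the
 * request dip into reserves so the I/O needed to free memory is not
 * itself starved of memory. */
static struct page *example_alloc_for_swap_io(void)
{
        gfp_t gfp = GFP_ATOMIC | __GFP_MEMALLOC;

        /* Per the declaration above: true when this mask would be granted
         * ALLOC_NO_WATERMARKS by the page allocator. */
        if (gfp_pfmemalloc_allowed(gfp))
                return alloc_page(gfp);

        return alloc_page(GFP_ATOMIC);
}
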
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 774fa47b3b5b..ef788b5b4a35 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -39,10 +39,17 @@ extern unsigned long totalhigh_pages;
39 39
40void kmap_flush_unused(void); 40void kmap_flush_unused(void);
41 41
42struct page *kmap_to_page(void *addr);
43
42#else /* CONFIG_HIGHMEM */ 44#else /* CONFIG_HIGHMEM */
43 45
44static inline unsigned int nr_free_highpages(void) { return 0; } 46static inline unsigned int nr_free_highpages(void) { return 0; }
45 47
48static inline struct page *kmap_to_page(void *addr)
49{
50 return virt_to_page(addr);
51}
52
46#define totalhigh_pages 0UL 53#define totalhigh_pages 0UL
47 54
48#ifndef ARCH_HAS_KMAP 55#ifndef ARCH_HAS_KMAP
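
kmap_to_page() above maps a kmap'd virtual address back to its struct page, with a plain virt_to_page() fallback when CONFIG_HIGHMEM is off. A short round-trip sketch; the function name is hypothetical:

#include <linux/highmem.h>
#include <linux/bug.h>

/* Hypothetical round-trip: kmap a page, recover the page from the
 * mapped address, then drop the mapping again. */
static void example_kmap_roundtrip(struct page *page)
{
        void *addr = kmap(page);

        WARN_ON(kmap_to_page(addr) != page);
        kunmap(page);
}
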
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d5d6bbe2259e..225164842ab6 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -4,9 +4,11 @@
4#include <linux/mm_types.h> 4#include <linux/mm_types.h>
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/hugetlb_inline.h> 6#include <linux/hugetlb_inline.h>
7#include <linux/cgroup.h>
7 8
8struct ctl_table; 9struct ctl_table;
9struct user_struct; 10struct user_struct;
11struct mmu_gather;
10 12
11#ifdef CONFIG_HUGETLB_PAGE 13#ifdef CONFIG_HUGETLB_PAGE
12 14
@@ -20,6 +22,11 @@ struct hugepage_subpool {
20 long max_hpages, used_hpages; 22 long max_hpages, used_hpages;
21}; 23};
22 24
25extern spinlock_t hugetlb_lock;
26extern int hugetlb_max_hstate __read_mostly;
27#define for_each_hstate(h) \
28 for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++)
29
23struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); 30struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
24void hugepage_put_subpool(struct hugepage_subpool *spool); 31void hugepage_put_subpool(struct hugepage_subpool *spool);
25 32
@@ -40,9 +47,14 @@ int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
40 struct page **, struct vm_area_struct **, 47 struct page **, struct vm_area_struct **,
41 unsigned long *, int *, int, unsigned int flags); 48 unsigned long *, int *, int, unsigned int flags);
42void unmap_hugepage_range(struct vm_area_struct *, 49void unmap_hugepage_range(struct vm_area_struct *,
43 unsigned long, unsigned long, struct page *); 50 unsigned long, unsigned long, struct page *);
44void __unmap_hugepage_range(struct vm_area_struct *, 51void __unmap_hugepage_range_final(struct mmu_gather *tlb,
45 unsigned long, unsigned long, struct page *); 52 struct vm_area_struct *vma,
53 unsigned long start, unsigned long end,
54 struct page *ref_page);
55void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
56 unsigned long start, unsigned long end,
57 struct page *ref_page);
46int hugetlb_prefault(struct address_space *, struct vm_area_struct *); 58int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
47void hugetlb_report_meminfo(struct seq_file *); 59void hugetlb_report_meminfo(struct seq_file *);
48int hugetlb_report_node_meminfo(int, char *); 60int hugetlb_report_node_meminfo(int, char *);
@@ -98,7 +110,6 @@ static inline unsigned long hugetlb_total_pages(void)
98#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) 110#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
99#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) 111#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
100#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) 112#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
101#define unmap_hugepage_range(vma, start, end, page) BUG()
102static inline void hugetlb_report_meminfo(struct seq_file *m) 113static inline void hugetlb_report_meminfo(struct seq_file *m)
103{ 114{
104} 115}
@@ -112,13 +123,31 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
112#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) 123#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
113#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) 124#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
114#define huge_pte_offset(mm, address) 0 125#define huge_pte_offset(mm, address) 0
115#define dequeue_hwpoisoned_huge_page(page) 0 126static inline int dequeue_hwpoisoned_huge_page(struct page *page)
127{
128 return 0;
129}
130
116static inline void copy_huge_page(struct page *dst, struct page *src) 131static inline void copy_huge_page(struct page *dst, struct page *src)
117{ 132{
118} 133}
119 134
120#define hugetlb_change_protection(vma, address, end, newprot) 135#define hugetlb_change_protection(vma, address, end, newprot)
121 136
137static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
138 struct vm_area_struct *vma, unsigned long start,
139 unsigned long end, struct page *ref_page)
140{
141 BUG();
142}
143
144static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
145 struct vm_area_struct *vma, unsigned long start,
146 unsigned long end, struct page *ref_page)
147{
148 BUG();
149}
150
122#endif /* !CONFIG_HUGETLB_PAGE */ 151#endif /* !CONFIG_HUGETLB_PAGE */
123 152
124#define HUGETLB_ANON_FILE "anon_hugepage" 153#define HUGETLB_ANON_FILE "anon_hugepage"
@@ -199,10 +228,15 @@ struct hstate {
199 unsigned long resv_huge_pages; 228 unsigned long resv_huge_pages;
200 unsigned long surplus_huge_pages; 229 unsigned long surplus_huge_pages;
201 unsigned long nr_overcommit_huge_pages; 230 unsigned long nr_overcommit_huge_pages;
231 struct list_head hugepage_activelist;
202 struct list_head hugepage_freelists[MAX_NUMNODES]; 232 struct list_head hugepage_freelists[MAX_NUMNODES];
203 unsigned int nr_huge_pages_node[MAX_NUMNODES]; 233 unsigned int nr_huge_pages_node[MAX_NUMNODES];
204 unsigned int free_huge_pages_node[MAX_NUMNODES]; 234 unsigned int free_huge_pages_node[MAX_NUMNODES];
205 unsigned int surplus_huge_pages_node[MAX_NUMNODES]; 235 unsigned int surplus_huge_pages_node[MAX_NUMNODES];
236#ifdef CONFIG_CGROUP_HUGETLB
237 /* cgroup control files */
238 struct cftype cgroup_files[5];
239#endif
206 char name[HSTATE_NAME_LEN]; 240 char name[HSTATE_NAME_LEN];
207}; 241};
208 242
@@ -302,6 +336,11 @@ static inline unsigned hstate_index_to_shift(unsigned index)
302 return hstates[index].order + PAGE_SHIFT; 336 return hstates[index].order + PAGE_SHIFT;
303} 337}
304 338
339static inline int hstate_index(struct hstate *h)
340{
341 return h - hstates;
342}
343
305#else 344#else
306struct hstate {}; 345struct hstate {};
307#define alloc_huge_page_node(h, nid) NULL 346#define alloc_huge_page_node(h, nid) NULL
@@ -320,6 +359,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
320 return 1; 359 return 1;
321} 360}
322#define hstate_index_to_shift(index) 0 361#define hstate_index_to_shift(index) 0
362#define hstate_index(h) 0
323#endif 363#endif
324 364
325#endif /* _LINUX_HUGETLB_H */ 365#endif /* _LINUX_HUGETLB_H */
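
The hugetlb.h hunk above exports hugetlb_max_hstate, for_each_hstate() and hstate_index() so the new hugetlb cgroup code can walk and index the huge page pools. A hedged walking sketch using only those helpers plus the existing h->name field of struct hstate; the function name is hypothetical:

#include <linux/hugetlb.h>
#include <linux/printk.h>

/* Hypothetical: print each configured huge page pool with its index,
 * the same index the hugetlb cgroup code uses to pick a res_counter. */
static void example_walk_hstates(void)
{
        struct hstate *h;

        for_each_hstate(h)
                pr_info("hstate %d: %s\n", hstate_index(h), h->name);
}
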
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
new file mode 100644
index 000000000000..d73878c694b3
--- /dev/null
+++ b/include/linux/hugetlb_cgroup.h
@@ -0,0 +1,126 @@
1/*
2 * Copyright IBM Corporation, 2012
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15#ifndef _LINUX_HUGETLB_CGROUP_H
16#define _LINUX_HUGETLB_CGROUP_H
17
18#include <linux/res_counter.h>
19
20struct hugetlb_cgroup;
21/*
22 * Minimum page order trackable by hugetlb cgroup.
23 * At least 3 pages are necessary for all the tracking information.
24 */
25#define HUGETLB_CGROUP_MIN_ORDER 2
26
27#ifdef CONFIG_CGROUP_HUGETLB
28
29static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
30{
31 VM_BUG_ON(!PageHuge(page));
32
33 if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
34 return NULL;
35 return (struct hugetlb_cgroup *)page[2].lru.next;
36}
37
38static inline
39int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
40{
41 VM_BUG_ON(!PageHuge(page));
42
43 if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
44 return -1;
45 page[2].lru.next = (void *)h_cg;
46 return 0;
47}
48
49static inline bool hugetlb_cgroup_disabled(void)
50{
51 if (hugetlb_subsys.disabled)
52 return true;
53 return false;
54}
55
56extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
57 struct hugetlb_cgroup **ptr);
58extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
59 struct hugetlb_cgroup *h_cg,
60 struct page *page);
61extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
62 struct page *page);
63extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
64 struct hugetlb_cgroup *h_cg);
65extern int hugetlb_cgroup_file_init(int idx) __init;
66extern void hugetlb_cgroup_migrate(struct page *oldhpage,
67 struct page *newhpage);
68
69#else
70static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
71{
72 return NULL;
73}
74
75static inline
76int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
77{
78 return 0;
79}
80
81static inline bool hugetlb_cgroup_disabled(void)
82{
83 return true;
84}
85
86static inline int
87hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
88 struct hugetlb_cgroup **ptr)
89{
90 return 0;
91}
92
93static inline void
94hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
95 struct hugetlb_cgroup *h_cg,
96 struct page *page)
97{
98 return;
99}
100
101static inline void
102hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page)
103{
104 return;
105}
106
107static inline void
108hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
109 struct hugetlb_cgroup *h_cg)
110{
111 return;
112}
113
114static inline int __init hugetlb_cgroup_file_init(int idx)
115{
116 return 0;
117}
118
119static inline void hugetlb_cgroup_migrate(struct page *oldhpage,
120 struct page *newhpage)
121{
122 return;
123}
124
125#endif /* CONFIG_MEM_RES_CTLR_HUGETLB */
126#endif
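
The new header above declares a charge/commit/uncharge API keyed by hstate index. A hedged sketch of the expected call sequence around a huge page allocation; the wrapper function is hypothetical and error handling is reduced to the essentials:

#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

/* Hypothetical allocation wrapper showing the intended lifecycle:
 * reserve against the cgroup, allocate, then bind the charge to the
 * page -- or give the reservation back if allocation fails. */
static struct page *example_charged_huge_page(struct hstate *h, int nid)
{
        int idx = hstate_index(h);
        unsigned long nr_pages = pages_per_huge_page(h);
        struct hugetlb_cgroup *h_cg;
        struct page *page;

        if (hugetlb_cgroup_charge_cgroup(idx, nr_pages, &h_cg))
                return NULL;

        page = alloc_huge_page_node(h, nid);
        if (!page) {
                hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg);
                return NULL;
        }

        hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page);
        return page;
}
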
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 83e7ba90d6e5..8d9489fdab2e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -38,7 +38,7 @@ struct mem_cgroup_reclaim_cookie {
38 unsigned int generation; 38 unsigned int generation;
39}; 39};
40 40
41#ifdef CONFIG_CGROUP_MEM_RES_CTLR 41#ifdef CONFIG_MEMCG
42/* 42/*
43 * All "charge" functions with gfp_mask should use GFP_KERNEL or 43 * All "charge" functions with gfp_mask should use GFP_KERNEL or
44 * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't 44 * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't
@@ -72,8 +72,6 @@ extern void mem_cgroup_uncharge_end(void);
72extern void mem_cgroup_uncharge_page(struct page *page); 72extern void mem_cgroup_uncharge_page(struct page *page);
73extern void mem_cgroup_uncharge_cache_page(struct page *page); 73extern void mem_cgroup_uncharge_cache_page(struct page *page);
74 74
75extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
76 int order);
77bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 75bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
78 struct mem_cgroup *memcg); 76 struct mem_cgroup *memcg);
79int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); 77int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
@@ -100,9 +98,9 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
100 98
101extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); 99extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
102 100
103extern int 101extern void
104mem_cgroup_prepare_migration(struct page *page, 102mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
105 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask); 103 struct mem_cgroup **memcgp);
106extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, 104extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
107 struct page *oldpage, struct page *newpage, bool migration_ok); 105 struct page *oldpage, struct page *newpage, bool migration_ok);
108 106
@@ -124,7 +122,7 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
124extern void mem_cgroup_replace_page_cache(struct page *oldpage, 122extern void mem_cgroup_replace_page_cache(struct page *oldpage,
125 struct page *newpage); 123 struct page *newpage);
126 124
127#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 125#ifdef CONFIG_MEMCG_SWAP
128extern int do_swap_account; 126extern int do_swap_account;
129#endif 127#endif
130 128
@@ -182,7 +180,6 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
182unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 180unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
183 gfp_t gfp_mask, 181 gfp_t gfp_mask,
184 unsigned long *total_scanned); 182 unsigned long *total_scanned);
185u64 mem_cgroup_get_limit(struct mem_cgroup *memcg);
186 183
187void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); 184void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
188#ifdef CONFIG_TRANSPARENT_HUGEPAGE 185#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -193,7 +190,7 @@ void mem_cgroup_split_huge_fixup(struct page *head);
193bool mem_cgroup_bad_page_check(struct page *page); 190bool mem_cgroup_bad_page_check(struct page *page);
194void mem_cgroup_print_bad_page(struct page *page); 191void mem_cgroup_print_bad_page(struct page *page);
195#endif 192#endif
196#else /* CONFIG_CGROUP_MEM_RES_CTLR */ 193#else /* CONFIG_MEMCG */
197struct mem_cgroup; 194struct mem_cgroup;
198 195
199static inline int mem_cgroup_newpage_charge(struct page *page, 196static inline int mem_cgroup_newpage_charge(struct page *page,
@@ -279,11 +276,10 @@ static inline struct cgroup_subsys_state
279 return NULL; 276 return NULL;
280} 277}
281 278
282static inline int 279static inline void
283mem_cgroup_prepare_migration(struct page *page, struct page *newpage, 280mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
284 struct mem_cgroup **memcgp, gfp_t gfp_mask) 281 struct mem_cgroup **memcgp)
285{ 282{
286 return 0;
287} 283}
288 284
289static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, 285static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
@@ -366,12 +362,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
366 return 0; 362 return 0;
367} 363}
368 364
369static inline
370u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
371{
372 return 0;
373}
374
375static inline void mem_cgroup_split_huge_fixup(struct page *head) 365static inline void mem_cgroup_split_huge_fixup(struct page *head)
376{ 366{
377} 367}
@@ -384,9 +374,9 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
384 struct page *newpage) 374 struct page *newpage)
385{ 375{
386} 376}
387#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ 377#endif /* CONFIG_MEMCG */
388 378
389#if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) 379#if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
390static inline bool 380static inline bool
391mem_cgroup_bad_page_check(struct page *page) 381mem_cgroup_bad_page_check(struct page *page)
392{ 382{
@@ -406,7 +396,7 @@ enum {
406}; 396};
407 397
408struct sock; 398struct sock;
409#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 399#ifdef CONFIG_MEMCG_KMEM
410void sock_update_memcg(struct sock *sk); 400void sock_update_memcg(struct sock *sk);
411void sock_release_memcg(struct sock *sk); 401void sock_release_memcg(struct sock *sk);
412#else 402#else
@@ -416,6 +406,6 @@ static inline void sock_update_memcg(struct sock *sk)
416static inline void sock_release_memcg(struct sock *sk) 406static inline void sock_release_memcg(struct sock *sk)
417{ 407{
418} 408}
419#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ 409#endif /* CONFIG_MEMCG_KMEM */
420#endif /* _LINUX_MEMCONTROL_H */ 410#endif /* _LINUX_MEMCONTROL_H */
421 411
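
mem_cgroup_prepare_migration() above loses its gfp_mask argument and int return, becoming a void helper paired with mem_cgroup_end_migration(). A hedged caller-side sketch under the new signatures; the wrapper and its migration_ok handling are illustrative, not the mm/migrate.c code:

#include <linux/memcontrol.h>
#include <linux/mm.h>

/* Hypothetical migration wrapper: bracket the page copy with the memcg
 * prepare/end calls so the charge follows whichever page survives. */
static void example_migrate_charge(struct page *oldpage,
                                   struct page *newpage, bool migration_ok)
{
        struct mem_cgroup *memcg = NULL;

        mem_cgroup_prepare_migration(oldpage, newpage, &memcg);

        /* ... copy contents and switch mappings over here ... */

        mem_cgroup_end_migration(memcg, oldpage, newpage, migration_ok);
}
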
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 855c337b20c3..ce7e6671968b 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -15,7 +15,7 @@ extern int migrate_page(struct address_space *,
15extern int migrate_pages(struct list_head *l, new_page_t x, 15extern int migrate_pages(struct list_head *l, new_page_t x,
16 unsigned long private, bool offlining, 16 unsigned long private, bool offlining,
17 enum migrate_mode mode); 17 enum migrate_mode mode);
18extern int migrate_huge_pages(struct list_head *l, new_page_t x, 18extern int migrate_huge_page(struct page *, new_page_t x,
19 unsigned long private, bool offlining, 19 unsigned long private, bool offlining,
20 enum migrate_mode mode); 20 enum migrate_mode mode);
21 21
@@ -36,7 +36,7 @@ static inline void putback_lru_pages(struct list_head *l) {}
36static inline int migrate_pages(struct list_head *l, new_page_t x, 36static inline int migrate_pages(struct list_head *l, new_page_t x,
37 unsigned long private, bool offlining, 37 unsigned long private, bool offlining,
38 enum migrate_mode mode) { return -ENOSYS; } 38 enum migrate_mode mode) { return -ENOSYS; }
39static inline int migrate_huge_pages(struct list_head *l, new_page_t x, 39static inline int migrate_huge_page(struct page *page, new_page_t x,
40 unsigned long private, bool offlining, 40 unsigned long private, bool offlining,
41 enum migrate_mode mode) { return -ENOSYS; } 41 enum migrate_mode mode) { return -ENOSYS; }
42 42
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f9f279cf5b1b..bd079a1b0fdc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -805,6 +805,17 @@ static inline void *page_rmapping(struct page *page)
805 return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); 805 return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS);
806} 806}
807 807
808extern struct address_space *__page_file_mapping(struct page *);
809
810static inline
811struct address_space *page_file_mapping(struct page *page)
812{
813 if (unlikely(PageSwapCache(page)))
814 return __page_file_mapping(page);
815
816 return page->mapping;
817}
818
808static inline int PageAnon(struct page *page) 819static inline int PageAnon(struct page *page)
809{ 820{
810 return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; 821 return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
@@ -821,6 +832,20 @@ static inline pgoff_t page_index(struct page *page)
821 return page->index; 832 return page->index;
822} 833}
823 834
835extern pgoff_t __page_file_index(struct page *page);
836
837/*
838 * Return the file index of the page. Regular pagecache pages use ->index
839 * whereas swapcache pages use swp_offset(->private)
840 */
841static inline pgoff_t page_file_index(struct page *page)
842{
843 if (unlikely(PageSwapCache(page)))
844 return __page_file_index(page);
845
846 return page->index;
847}
848
824/* 849/*
825 * Return true if this page is mapped into pagetables. 850 * Return true if this page is mapped into pagetables.
826 */ 851 */
@@ -994,6 +1019,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
994 struct page **pages, struct vm_area_struct **vmas); 1019 struct page **pages, struct vm_area_struct **vmas);
995int get_user_pages_fast(unsigned long start, int nr_pages, int write, 1020int get_user_pages_fast(unsigned long start, int nr_pages, int write,
996 struct page **pages); 1021 struct page **pages);
1022struct kvec;
1023int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
1024 struct page **pages);
1025int get_kernel_page(unsigned long start, int write, struct page **pages);
997struct page *get_dump_page(unsigned long addr); 1026struct page *get_dump_page(unsigned long addr);
998 1027
999extern int try_to_release_page(struct page * page, gfp_t gfp_mask); 1028extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
@@ -1331,6 +1360,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...);
1331extern void setup_per_cpu_pageset(void); 1360extern void setup_per_cpu_pageset(void);
1332 1361
1333extern void zone_pcp_update(struct zone *zone); 1362extern void zone_pcp_update(struct zone *zone);
1363extern void zone_pcp_reset(struct zone *zone);
1334 1364
1335/* nommu.c */ 1365/* nommu.c */
1336extern atomic_long_t mmap_pages_allocated; 1366extern atomic_long_t mmap_pages_allocated;
@@ -1528,6 +1558,7 @@ void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
1528static inline void vm_stat_account(struct mm_struct *mm, 1558static inline void vm_stat_account(struct mm_struct *mm,
1529 unsigned long flags, struct file *file, long pages) 1559 unsigned long flags, struct file *file, long pages)
1530{ 1560{
1561 mm->total_vm += pages;
1531} 1562}
1532#endif /* CONFIG_PROC_FS */ 1563#endif /* CONFIG_PROC_FS */
1533 1564
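
Besides page_file_mapping()/page_file_index(), the mm.h hunk adds get_kernel_pages()/get_kernel_page() for pinning kernel-addressed buffers, as declared above. A minimal usage sketch; the helper name and the single-page assumption are illustrative, and the buffer is assumed to be a directly mapped (lowmem) address:

#include <linux/mm.h>
#include <linux/uio.h>

/* Hypothetical: pin the page backing a kernel buffer so it can be handed
 * to code that operates on struct page, e.g. swap-over-NFS I/O paths. */
static int example_pin_kernel_buffer(void *buf, struct page **pagep)
{
        struct kvec vec = {
                .iov_base = buf,
                .iov_len  = PAGE_SIZE,
        };

        /* Returns the number of pages pinned (here, at most one). */
        return get_kernel_pages(&vec, 1, 0, pagep);
}
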
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 074eb98fe15d..bf7867200b95 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -54,6 +54,15 @@ struct page {
54 union { 54 union {
55 pgoff_t index; /* Our offset within mapping. */ 55 pgoff_t index; /* Our offset within mapping. */
56 void *freelist; /* slub/slob first free object */ 56 void *freelist; /* slub/slob first free object */
57 bool pfmemalloc; /* If set by the page allocator,
58 * ALLOC_NO_WATERMARKS was set
59 * and the low watermark was not
60 * met implying that the system
61 * is under some pressure. The
62 * caller should try ensure
63 * this page is only used to
64 * free other pages.
65 */
57 }; 66 };
58 67
59 union { 68 union {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 458988bd55a1..2daa54f55db7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -201,7 +201,7 @@ struct zone_reclaim_stat {
201struct lruvec { 201struct lruvec {
202 struct list_head lists[NR_LRU_LISTS]; 202 struct list_head lists[NR_LRU_LISTS];
203 struct zone_reclaim_stat reclaim_stat; 203 struct zone_reclaim_stat reclaim_stat;
204#ifdef CONFIG_CGROUP_MEM_RES_CTLR 204#ifdef CONFIG_MEMCG
205 struct zone *zone; 205 struct zone *zone;
206#endif 206#endif
207}; 207};
@@ -209,7 +209,6 @@ struct lruvec {
209/* Mask used at gathering information at once (see memcontrol.c) */ 209/* Mask used at gathering information at once (see memcontrol.c) */
210#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 210#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
211#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 211#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
212#define LRU_ALL_EVICTABLE (LRU_ALL_FILE | LRU_ALL_ANON)
213#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 212#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
214 213
215/* Isolate clean file */ 214/* Isolate clean file */
@@ -369,6 +368,10 @@ struct zone {
369 */ 368 */
370 spinlock_t lock; 369 spinlock_t lock;
371 int all_unreclaimable; /* All pages pinned */ 370 int all_unreclaimable; /* All pages pinned */
371#if defined CONFIG_COMPACTION || defined CONFIG_CMA
372 /* pfn where the last incremental compaction isolated free pages */
373 unsigned long compact_cached_free_pfn;
374#endif
372#ifdef CONFIG_MEMORY_HOTPLUG 375#ifdef CONFIG_MEMORY_HOTPLUG
373 /* see spanned/present_pages for more description */ 376 /* see spanned/present_pages for more description */
374 seqlock_t span_seqlock; 377 seqlock_t span_seqlock;
@@ -475,6 +478,14 @@ struct zone {
475 * rarely used fields: 478 * rarely used fields:
476 */ 479 */
477 const char *name; 480 const char *name;
481#ifdef CONFIG_MEMORY_ISOLATION
482 /*
483 * the number of MIGRATE_ISOLATE *pageblock*.
484 * We need this for free page counting. Look at zone_watermark_ok_safe.
485 * It's protected by zone->lock
486 */
487 int nr_pageblock_isolate;
488#endif
478} ____cacheline_internodealigned_in_smp; 489} ____cacheline_internodealigned_in_smp;
479 490
480typedef enum { 491typedef enum {
@@ -671,7 +682,7 @@ typedef struct pglist_data {
671 int nr_zones; 682 int nr_zones;
672#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ 683#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
673 struct page *node_mem_map; 684 struct page *node_mem_map;
674#ifdef CONFIG_CGROUP_MEM_RES_CTLR 685#ifdef CONFIG_MEMCG
675 struct page_cgroup *node_page_cgroup; 686 struct page_cgroup *node_page_cgroup;
676#endif 687#endif
677#endif 688#endif
@@ -694,6 +705,7 @@ typedef struct pglist_data {
694 range, including holes */ 705 range, including holes */
695 int node_id; 706 int node_id;
696 wait_queue_head_t kswapd_wait; 707 wait_queue_head_t kswapd_wait;
708 wait_queue_head_t pfmemalloc_wait;
697 struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ 709 struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */
698 int kswapd_max_order; 710 int kswapd_max_order;
699 enum zone_type classzone_idx; 711 enum zone_type classzone_idx;
@@ -718,7 +730,7 @@ typedef struct pglist_data {
718#include <linux/memory_hotplug.h> 730#include <linux/memory_hotplug.h>
719 731
720extern struct mutex zonelists_mutex; 732extern struct mutex zonelists_mutex;
721void build_all_zonelists(void *data); 733void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
722void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); 734void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
723bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 735bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
724 int classzone_idx, int alloc_flags); 736 int classzone_idx, int alloc_flags);
@@ -736,7 +748,7 @@ extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
736 748
737static inline struct zone *lruvec_zone(struct lruvec *lruvec) 749static inline struct zone *lruvec_zone(struct lruvec *lruvec)
738{ 750{
739#ifdef CONFIG_CGROUP_MEM_RES_CTLR 751#ifdef CONFIG_MEMCG
740 return lruvec->zone; 752 return lruvec->zone;
741#else 753#else
742 return container_of(lruvec, struct zone, lruvec); 754 return container_of(lruvec, struct zone, lruvec);
@@ -773,7 +785,7 @@ extern int movable_zone;
773 785
774static inline int zone_movable_is_highmem(void) 786static inline int zone_movable_is_highmem(void)
775{ 787{
776#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE) 788#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
777 return movable_zone == ZONE_HIGHMEM; 789 return movable_zone == ZONE_HIGHMEM;
778#else 790#else
779 return 0; 791 return 0;
@@ -1052,7 +1064,7 @@ struct mem_section {
1052 1064
1053 /* See declaration of similar field in struct zone */ 1065 /* See declaration of similar field in struct zone */
1054 unsigned long *pageblock_flags; 1066 unsigned long *pageblock_flags;
1055#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1067#ifdef CONFIG_MEMCG
1056 /* 1068 /*
1057 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use 1069 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
1058 * section. (see memcontrol.h/page_cgroup.h about this.) 1070 * section. (see memcontrol.h/page_cgroup.h about this.)
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 2889877318bc..1f8fc7f9bcd8 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -473,10 +473,10 @@ extern ssize_t nfs_direct_IO(int, struct kiocb *, const struct iovec *, loff_t,
473 unsigned long); 473 unsigned long);
474extern ssize_t nfs_file_direct_read(struct kiocb *iocb, 474extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
475 const struct iovec *iov, unsigned long nr_segs, 475 const struct iovec *iov, unsigned long nr_segs,
476 loff_t pos); 476 loff_t pos, bool uio);
477extern ssize_t nfs_file_direct_write(struct kiocb *iocb, 477extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
478 const struct iovec *iov, unsigned long nr_segs, 478 const struct iovec *iov, unsigned long nr_segs,
479 loff_t pos); 479 loff_t pos, bool uio);
480 480
481/* 481/*
482 * linux/fs/nfs/dir.c 482 * linux/fs/nfs/dir.c
diff --git a/include/linux/oom.h b/include/linux/oom.h
index e4c29bc72e70..49a3031fda50 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -40,15 +40,36 @@ enum oom_constraint {
40 CONSTRAINT_MEMCG, 40 CONSTRAINT_MEMCG,
41}; 41};
42 42
43enum oom_scan_t {
44 OOM_SCAN_OK, /* scan thread and find its badness */
45 OOM_SCAN_CONTINUE, /* do not consider thread for oom kill */
46 OOM_SCAN_ABORT, /* abort the iteration and return */
47 OOM_SCAN_SELECT, /* always select this thread first */
48};
49
43extern void compare_swap_oom_score_adj(int old_val, int new_val); 50extern void compare_swap_oom_score_adj(int old_val, int new_val);
44extern int test_set_oom_score_adj(int new_val); 51extern int test_set_oom_score_adj(int new_val);
45 52
46extern unsigned long oom_badness(struct task_struct *p, 53extern unsigned long oom_badness(struct task_struct *p,
47 struct mem_cgroup *memcg, const nodemask_t *nodemask, 54 struct mem_cgroup *memcg, const nodemask_t *nodemask,
48 unsigned long totalpages); 55 unsigned long totalpages);
56extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
57 unsigned int points, unsigned long totalpages,
58 struct mem_cgroup *memcg, nodemask_t *nodemask,
59 const char *message);
60
49extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); 61extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
50extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); 62extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
51 63
64extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
65 int order, const nodemask_t *nodemask);
66
67extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
68 unsigned long totalpages, const nodemask_t *nodemask,
69 bool force_kill);
70extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
71 int order);
72
52extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 73extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
53 int order, nodemask_t *mask, bool force_kill); 74 int order, nodemask_t *mask, bool force_kill);
54extern int register_oom_notifier(struct notifier_block *nb); 75extern int register_oom_notifier(struct notifier_block *nb);
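
The oom_scan_t values above classify each task during the OOM victim scan. A hedged skeleton of how a selection loop might interpret them, using only the enum, oom_scan_process_thread() and oom_badness() from this header; the loop structure, locking and points bookkeeping are simplified illustrations, not mm/oom_kill.c:

#include <linux/oom.h>
#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/kernel.h>

/* Hypothetical victim-selection skeleton driven by oom_scan_t. */
static struct task_struct *example_select_victim(unsigned long totalpages,
                                                 const nodemask_t *nodemask)
{
        struct task_struct *p, *chosen = NULL;
        unsigned long chosen_points = 0;

        rcu_read_lock();
        for_each_process(p) {
                unsigned long points;

                switch (oom_scan_process_thread(p, totalpages, nodemask,
                                                false)) {
                case OOM_SCAN_SELECT:   /* always prefer this task */
                        chosen = p;
                        chosen_points = ULONG_MAX;
                        continue;
                case OOM_SCAN_CONTINUE: /* do not consider this task */
                        continue;
                case OOM_SCAN_ABORT:    /* a kill is already in flight */
                        rcu_read_unlock();
                        return NULL;
                case OOM_SCAN_OK:
                        break;
                }
                points = oom_badness(p, NULL, nodemask, totalpages);
                if (points > chosen_points) {
                        chosen = p;
                        chosen_points = points;
                }
        }
        rcu_read_unlock();
        return chosen;
}
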
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index c88d2a9451af..b5d13841604e 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -7,6 +7,7 @@
7 7
8#include <linux/types.h> 8#include <linux/types.h>
9#include <linux/bug.h> 9#include <linux/bug.h>
10#include <linux/mmdebug.h>
10#ifndef __GENERATING_BOUNDS_H 11#ifndef __GENERATING_BOUNDS_H
11#include <linux/mm_types.h> 12#include <linux/mm_types.h>
12#include <generated/bounds.h> 13#include <generated/bounds.h>
@@ -453,6 +454,34 @@ static inline int PageTransTail(struct page *page)
453} 454}
454#endif 455#endif
455 456
457/*
458 * If network-based swap is enabled, sl*b must keep track of whether pages
459 * were allocated from pfmemalloc reserves.
460 */
461static inline int PageSlabPfmemalloc(struct page *page)
462{
463 VM_BUG_ON(!PageSlab(page));
464 return PageActive(page);
465}
466
467static inline void SetPageSlabPfmemalloc(struct page *page)
468{
469 VM_BUG_ON(!PageSlab(page));
470 SetPageActive(page);
471}
472
473static inline void __ClearPageSlabPfmemalloc(struct page *page)
474{
475 VM_BUG_ON(!PageSlab(page));
476 __ClearPageActive(page);
477}
478
479static inline void ClearPageSlabPfmemalloc(struct page *page)
480{
481 VM_BUG_ON(!PageSlab(page));
482 ClearPageActive(page);
483}
484
456#ifdef CONFIG_MMU 485#ifdef CONFIG_MMU
457#define __PG_MLOCKED (1 << PG_mlocked) 486#define __PG_MLOCKED (1 << PG_mlocked)
458#else 487#else
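
These helpers reuse PG_active on slab pages to remember that the backing page came
from pfmemalloc reserves. A rough sketch of the kind of hook an allocator could use
when it acquires a page (note_slab_pfmemalloc() is an illustrative name; the real
sl*b hooks live elsewhere in this series):

	/* Tag a freshly allocated slab page if it came from the reserves. */
	static void note_slab_pfmemalloc(struct page *page)
	{
		/* PageSlab() must already be set; the helpers VM_BUG_ON otherwise */
		if (page->pfmemalloc)
			SetPageSlabPfmemalloc(page);
	}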
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 3bdcab30ca41..105077aa7685 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -1,6 +1,11 @@
1#ifndef __LINUX_PAGEISOLATION_H 1#ifndef __LINUX_PAGEISOLATION_H
2#define __LINUX_PAGEISOLATION_H 2#define __LINUX_PAGEISOLATION_H
3 3
4
5bool has_unmovable_pages(struct zone *zone, struct page *page, int count);
6void set_pageblock_migratetype(struct page *page, int migratetype);
7int move_freepages_block(struct zone *zone, struct page *page,
8 int migratetype);
4/* 9/*
5 * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. 10 * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
6 * If specified range includes migrate types other than MOVABLE or CMA, 11 * If specified range includes migrate types other than MOVABLE or CMA,
@@ -10,7 +15,7 @@
10 * free all pages in the range. test_page_isolated() can be used for 15 * free all pages in the range. test_page_isolated() can be used for
11 * test it. 16 * test it.
12 */ 17 */
13extern int 18int
14start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, 19start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
15 unsigned migratetype); 20 unsigned migratetype);
16 21
@@ -18,7 +23,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
18 * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. 23 * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
19 * target range is [start_pfn, end_pfn) 24 * target range is [start_pfn, end_pfn)
20 */ 25 */
21extern int 26int
22undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, 27undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
23 unsigned migratetype); 28 unsigned migratetype);
24 29
@@ -30,8 +35,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn);
30/* 35/*
31 * Internal functions. Changes pageblock's migrate type. 36 * Internal functions. Changes pageblock's migrate type.
32 */ 37 */
33extern int set_migratetype_isolate(struct page *page); 38int set_migratetype_isolate(struct page *page);
34extern void unset_migratetype_isolate(struct page *page, unsigned migratetype); 39void unset_migratetype_isolate(struct page *page, unsigned migratetype);
35 40
36 41
37#endif 42#endif
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index a88cdba27809..777a524716db 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -12,7 +12,7 @@ enum {
12#ifndef __GENERATING_BOUNDS_H 12#ifndef __GENERATING_BOUNDS_H
13#include <generated/bounds.h> 13#include <generated/bounds.h>
14 14
15#ifdef CONFIG_CGROUP_MEM_RES_CTLR 15#ifdef CONFIG_MEMCG
16#include <linux/bit_spinlock.h> 16#include <linux/bit_spinlock.h>
17 17
18/* 18/*
@@ -82,7 +82,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
82 bit_spin_unlock(PCG_LOCK, &pc->flags); 82 bit_spin_unlock(PCG_LOCK, &pc->flags);
83} 83}
84 84
85#else /* CONFIG_CGROUP_MEM_RES_CTLR */ 85#else /* CONFIG_MEMCG */
86struct page_cgroup; 86struct page_cgroup;
87 87
88static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) 88static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
@@ -102,11 +102,11 @@ static inline void __init page_cgroup_init_flatmem(void)
102{ 102{
103} 103}
104 104
105#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ 105#endif /* CONFIG_MEMCG */
106 106
107#include <linux/swap.h> 107#include <linux/swap.h>
108 108
109#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 109#ifdef CONFIG_MEMCG_SWAP
110extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, 110extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
111 unsigned short old, unsigned short new); 111 unsigned short old, unsigned short new);
112extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); 112extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
@@ -138,7 +138,7 @@ static inline void swap_cgroup_swapoff(int type)
138 return; 138 return;
139} 139}
140 140
141#endif /* CONFIG_CGROUP_MEM_RES_CTLR_SWAP */ 141#endif /* CONFIG_MEMCG_SWAP */
142 142
143#endif /* !__GENERATING_BOUNDS_H */ 143#endif /* !__GENERATING_BOUNDS_H */
144 144
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7cfad3bbb0cc..e42c762f0dc7 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -286,6 +286,11 @@ static inline loff_t page_offset(struct page *page)
286 return ((loff_t)page->index) << PAGE_CACHE_SHIFT; 286 return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
287} 287}
288 288
289static inline loff_t page_file_offset(struct page *page)
290{
291 return ((loff_t)page_file_index(page)) << PAGE_CACHE_SHIFT;
292}
293
289extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, 294extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
290 unsigned long address); 295 unsigned long address);
291 296
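
page_file_offset() mirrors page_offset() but goes through page_file_index(), so it
also yields the right byte offset for swapcache pages whose index is carried in the
swap entry. A small hedged sketch of a helper built on it (page_file_range() is an
illustrative name):

	/* Byte range a page covers in its backing file or swapfile. */
	static void page_file_range(struct page *page, loff_t *start, loff_t *end)
	{
		*start = page_file_offset(page);
		*end = *start + PAGE_CACHE_SIZE;
	}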
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 68dcffaa62a0..c147e7024f11 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1584,7 +1584,7 @@ struct task_struct {
1584 /* bitmask and counter of trace recursion */ 1584 /* bitmask and counter of trace recursion */
1585 unsigned long trace_recursion; 1585 unsigned long trace_recursion;
1586#endif /* CONFIG_TRACING */ 1586#endif /* CONFIG_TRACING */
1587#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ 1587#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
1588 struct memcg_batch_info { 1588 struct memcg_batch_info {
1589 int do_batch; /* incremented when batch uncharge started */ 1589 int do_batch; /* incremented when batch uncharge started */
1590 struct mem_cgroup *memcg; /* target memcg of uncharge */ 1590 struct mem_cgroup *memcg; /* target memcg of uncharge */
@@ -1894,6 +1894,13 @@ static inline void rcu_copy_process(struct task_struct *p)
1894 1894
1895#endif 1895#endif
1896 1896
1897static inline void tsk_restore_flags(struct task_struct *task,
1898 unsigned long orig_flags, unsigned long flags)
1899{
1900 task->flags &= ~flags;
1901 task->flags |= orig_flags & flags;
1902}
1903
1897#ifdef CONFIG_SMP 1904#ifdef CONFIG_SMP
1898extern void do_set_cpus_allowed(struct task_struct *p, 1905extern void do_set_cpus_allowed(struct task_struct *p,
1899 const struct cpumask *new_mask); 1906 const struct cpumask *new_mask);
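
tsk_restore_flags() restores only the bits named in the mask from a saved copy of
the task flags; __do_softirq() below uses exactly this pattern for PF_MEMALLOC.
A minimal sketch of the save/clear/restore idiom (run_without_memalloc() is an
illustrative name):

	static void run_without_memalloc(void (*fn)(void *), void *arg)
	{
		unsigned long old_flags = current->flags;

		current->flags &= ~PF_MEMALLOC;	/* don't leak reserve access */
		fn(arg);
		tsk_restore_flags(current, old_flags, PF_MEMALLOC);
	}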
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 07ceb97d53fa..ac6b8ee07825 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -20,7 +20,6 @@ struct shrink_control {
20 * 'nr_to_scan' entries and attempt to free them up. It should return 20 * 'nr_to_scan' entries and attempt to free them up. It should return
21 * the number of objects which remain in the cache. If it returns -1, it means 21 * the number of objects which remain in the cache. If it returns -1, it means
22 * it cannot do any scanning at this time (eg. there is a risk of deadlock). 22 * it cannot do any scanning at this time (eg. there is a risk of deadlock).
23 * The callback must not return -1 if nr_to_scan is zero.
24 * 23 *
25 * The 'gfpmask' refers to the allocation we are currently trying to 24 * The 'gfpmask' refers to the allocation we are currently trying to
26 * fulfil. 25 * fulfil.
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d205c4be7f5b..7632c87da2c9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -462,6 +462,7 @@ struct sk_buff {
462#ifdef CONFIG_IPV6_NDISC_NODETYPE 462#ifdef CONFIG_IPV6_NDISC_NODETYPE
463 __u8 ndisc_nodetype:2; 463 __u8 ndisc_nodetype:2;
464#endif 464#endif
465 __u8 pfmemalloc:1;
465 __u8 ooo_okay:1; 466 __u8 ooo_okay:1;
466 __u8 l4_rxhash:1; 467 __u8 l4_rxhash:1;
467 __u8 wifi_acked_valid:1; 468 __u8 wifi_acked_valid:1;
@@ -502,6 +503,15 @@ struct sk_buff {
502#include <linux/slab.h> 503#include <linux/slab.h>
503 504
504 505
506#define SKB_ALLOC_FCLONE 0x01
507#define SKB_ALLOC_RX 0x02
508
509/* Returns true if the skb was allocated from PFMEMALLOC reserves */
510static inline bool skb_pfmemalloc(const struct sk_buff *skb)
511{
512 return unlikely(skb->pfmemalloc);
513}
514
505/* 515/*
506 * skb might have a dst pointer attached, refcounted or not. 516 * skb might have a dst pointer attached, refcounted or not.
507 * _skb_refdst low order bit is set if refcount was _not_ taken 517 * _skb_refdst low order bit is set if refcount was _not_ taken
@@ -565,7 +575,7 @@ extern bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
565 bool *fragstolen, int *delta_truesize); 575 bool *fragstolen, int *delta_truesize);
566 576
567extern struct sk_buff *__alloc_skb(unsigned int size, 577extern struct sk_buff *__alloc_skb(unsigned int size,
568 gfp_t priority, int fclone, int node); 578 gfp_t priority, int flags, int node);
569extern struct sk_buff *build_skb(void *data, unsigned int frag_size); 579extern struct sk_buff *build_skb(void *data, unsigned int frag_size);
570static inline struct sk_buff *alloc_skb(unsigned int size, 580static inline struct sk_buff *alloc_skb(unsigned int size,
571 gfp_t priority) 581 gfp_t priority)
@@ -576,7 +586,7 @@ static inline struct sk_buff *alloc_skb(unsigned int size,
576static inline struct sk_buff *alloc_skb_fclone(unsigned int size, 586static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
577 gfp_t priority) 587 gfp_t priority)
578{ 588{
579 return __alloc_skb(size, priority, 1, NUMA_NO_NODE); 589 return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
580} 590}
581 591
582extern void skb_recycle(struct sk_buff *skb); 592extern void skb_recycle(struct sk_buff *skb);
@@ -1237,6 +1247,17 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
1237{ 1247{
1238 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 1248 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1239 1249
1250 /*
1251 * Propagate page->pfmemalloc to the skb if we can. The problem is
1252 * that not all callers have unique ownership of the page. If
1253 * pfmemalloc is set, we check the mapping as a mapping implies
1254 * page->index is set (index and pfmemalloc share space).
1255 * If it's a valid mapping, we cannot use page->pfmemalloc but we
1256 * do not lose pfmemalloc information as the pages would not be
1257 * allocated using __GFP_MEMALLOC.
1258 */
1259 if (page->pfmemalloc && !page->mapping)
1260 skb->pfmemalloc = true;
1240 frag->page.p = page; 1261 frag->page.p = page;
1241 frag->page_offset = off; 1262 frag->page_offset = off;
1242 skb_frag_size_set(frag, size); 1263 skb_frag_size_set(frag, size);
@@ -1753,6 +1774,61 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
1753 return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC); 1774 return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC);
1754} 1775}
1755 1776
 1777/**
 1778 * __skb_alloc_pages - allocate pages for ps-rx on a skb and preserve pfmemalloc data
1779 * @gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX
1780 * @skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used
1781 * @order: size of the allocation
1782 *
1783 * Allocate a new page.
1784 *
1785 * %NULL is returned if there is no free memory.
 1786 */
1787static inline struct page *__skb_alloc_pages(gfp_t gfp_mask,
1788 struct sk_buff *skb,
1789 unsigned int order)
1790{
1791 struct page *page;
1792
1793 gfp_mask |= __GFP_COLD;
1794
1795 if (!(gfp_mask & __GFP_NOMEMALLOC))
1796 gfp_mask |= __GFP_MEMALLOC;
1797
1798 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
1799 if (skb && page && page->pfmemalloc)
1800 skb->pfmemalloc = true;
1801
1802 return page;
1803}
1804
1805/**
1806 * __skb_alloc_page - allocate a page for ps-rx for a given skb and preserve pfmemalloc data
1807 * @gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX
1808 * @skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used
1809 *
1810 * Allocate a new page.
1811 *
1812 * %NULL is returned if there is no free memory.
1813 */
1814static inline struct page *__skb_alloc_page(gfp_t gfp_mask,
1815 struct sk_buff *skb)
1816{
1817 return __skb_alloc_pages(gfp_mask, skb, 0);
1818}
1819
1820/**
1821 * skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page
1822 * @page: The page that was allocated from skb_alloc_page
1823 * @skb: The skb that may need pfmemalloc set
1824 */
1825static inline void skb_propagate_pfmemalloc(struct page *page,
1826 struct sk_buff *skb)
1827{
1828 if (page && page->pfmemalloc)
1829 skb->pfmemalloc = true;
1830}
1831
1756/** 1832/**
1757 * skb_frag_page - retrieve the page refered to by a paged fragment 1833 * skb_frag_page - retrieve the page refered to by a paged fragment
1758 * @frag: the paged fragment 1834 * @frag: the paged fragment
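
Taken together, __skb_alloc_page() and skb_propagate_pfmemalloc() let RX paths
allocate buffer pages that may dip into reserves and keep the pfmemalloc marking
attached to the skb that eventually carries the data. A hedged sketch of a
driver-style refill path (rx_refill_page() and rx_attach_page() are illustrative
names; the driver changes in this series follow this general shape):

	static struct page *rx_refill_page(struct sk_buff *skb)
	{
		/* __GFP_MEMALLOC is added internally unless __GFP_NOMEMALLOC is set */
		return __skb_alloc_page(GFP_ATOMIC, skb);
	}

	static void rx_attach_page(struct sk_buff *skb, struct page *page,
				   unsigned int len)
	{
		skb_fill_page_desc(skb, 0, page, 0, len);
		/* if the skb was created after the page, copy the marking over */
		skb_propagate_pfmemalloc(page, skb);
	}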
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 77d278defa70..cff40aa7db62 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -174,6 +174,8 @@ struct rpc_xprt {
174 unsigned long state; /* transport state */ 174 unsigned long state; /* transport state */
175 unsigned char shutdown : 1, /* being shut down */ 175 unsigned char shutdown : 1, /* being shut down */
176 resvport : 1; /* use a reserved port */ 176 resvport : 1; /* use a reserved port */
177 unsigned int swapper; /* we're swapping over this
178 transport */
177 unsigned int bind_index; /* bind function index */ 179 unsigned int bind_index; /* bind function index */
178 180
179 /* 181 /*
@@ -316,6 +318,7 @@ void xprt_release_rqst_cong(struct rpc_task *task);
316void xprt_disconnect_done(struct rpc_xprt *xprt); 318void xprt_disconnect_done(struct rpc_xprt *xprt);
317void xprt_force_disconnect(struct rpc_xprt *xprt); 319void xprt_force_disconnect(struct rpc_xprt *xprt);
318void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); 320void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
321int xs_swapper(struct rpc_xprt *xprt, int enable);
319 322
320/* 323/*
321 * Reserved bit positions in xprt->state 324 * Reserved bit positions in xprt->state
diff --git a/include/linux/swap.h b/include/linux/swap.h
index c84ec68eaec9..388e70601413 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -151,6 +151,7 @@ enum {
151 SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ 151 SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
152 SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ 152 SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
153 SWP_BLKDEV = (1 << 6), /* its a block device */ 153 SWP_BLKDEV = (1 << 6), /* its a block device */
154 SWP_FILE = (1 << 7), /* set after swap_activate success */
154 /* add others here before... */ 155 /* add others here before... */
155 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ 156 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
156}; 157};
@@ -301,7 +302,7 @@ static inline void scan_unevictable_unregister_node(struct node *node)
301 302
302extern int kswapd_run(int nid); 303extern int kswapd_run(int nid);
303extern void kswapd_stop(int nid); 304extern void kswapd_stop(int nid);
304#ifdef CONFIG_CGROUP_MEM_RES_CTLR 305#ifdef CONFIG_MEMCG
305extern int mem_cgroup_swappiness(struct mem_cgroup *mem); 306extern int mem_cgroup_swappiness(struct mem_cgroup *mem);
306#else 307#else
307static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) 308static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
@@ -309,7 +310,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
309 return vm_swappiness; 310 return vm_swappiness;
310} 311}
311#endif 312#endif
312#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 313#ifdef CONFIG_MEMCG_SWAP
313extern void mem_cgroup_uncharge_swap(swp_entry_t ent); 314extern void mem_cgroup_uncharge_swap(swp_entry_t ent);
314#else 315#else
315static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) 316static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
@@ -320,8 +321,14 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
320/* linux/mm/page_io.c */ 321/* linux/mm/page_io.c */
321extern int swap_readpage(struct page *); 322extern int swap_readpage(struct page *);
322extern int swap_writepage(struct page *page, struct writeback_control *wbc); 323extern int swap_writepage(struct page *page, struct writeback_control *wbc);
324extern int swap_set_page_dirty(struct page *page);
323extern void end_swap_bio_read(struct bio *bio, int err); 325extern void end_swap_bio_read(struct bio *bio, int err);
324 326
327int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
328 unsigned long nr_pages, sector_t start_block);
329int generic_swapfile_activate(struct swap_info_struct *, struct file *,
330 sector_t *);
331
325/* linux/mm/swap_state.c */ 332/* linux/mm/swap_state.c */
326extern struct address_space swapper_space; 333extern struct address_space swapper_space;
327#define total_swapcache_pages swapper_space.nrpages 334#define total_swapcache_pages swapper_space.nrpages
@@ -356,11 +363,12 @@ extern unsigned int count_swap_pages(int, int);
356extern sector_t map_swap_page(struct page *, struct block_device **); 363extern sector_t map_swap_page(struct page *, struct block_device **);
357extern sector_t swapdev_block(int, pgoff_t); 364extern sector_t swapdev_block(int, pgoff_t);
358extern int page_swapcount(struct page *); 365extern int page_swapcount(struct page *);
366extern struct swap_info_struct *page_swap_info(struct page *);
359extern int reuse_swap_page(struct page *); 367extern int reuse_swap_page(struct page *);
360extern int try_to_free_swap(struct page *); 368extern int try_to_free_swap(struct page *);
361struct backing_dev_info; 369struct backing_dev_info;
362 370
363#ifdef CONFIG_CGROUP_MEM_RES_CTLR 371#ifdef CONFIG_MEMCG
364extern void 372extern void
365mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); 373mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
366#else 374#else
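
SWP_FILE and generic_swapfile_activate() let a filesystem describe a swapfile's
extents once at swapon time instead of resolving every page at I/O time. A hedged
sketch of a trivial ->swap_activate implementation that defers to the generic
helper (foofs_swap_activate() is an illustrative name and assumes the
swap_activate signature introduced by this series):

	static int foofs_swap_activate(struct swap_info_struct *sis,
				       struct file *file, sector_t *span)
	{
		/* builds the extent list; the core sets SWP_FILE on success */
		return generic_swapfile_activate(sis, file, span);
	}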
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 06f8e3858251..57f7b1091511 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -30,6 +30,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
30 FOR_ALL_ZONES(PGSTEAL_DIRECT), 30 FOR_ALL_ZONES(PGSTEAL_DIRECT),
31 FOR_ALL_ZONES(PGSCAN_KSWAPD), 31 FOR_ALL_ZONES(PGSCAN_KSWAPD),
32 FOR_ALL_ZONES(PGSCAN_DIRECT), 32 FOR_ALL_ZONES(PGSCAN_DIRECT),
33 PGSCAN_DIRECT_THROTTLE,
33#ifdef CONFIG_NUMA 34#ifdef CONFIG_NUMA
34 PGSCAN_ZONE_RECLAIM_FAILED, 35 PGSCAN_ZONE_RECLAIM_FAILED,
35#endif 36#endif
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 65efb92da996..ad2cfd53dadc 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -179,11 +179,6 @@ extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
179#define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) 179#define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d)
180#define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) 180#define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d))
181 181
182static inline void zap_zone_vm_stats(struct zone *zone)
183{
184 memset(zone->vm_stat, 0, sizeof(zone->vm_stat));
185}
186
187extern void inc_zone_state(struct zone *, enum zone_stat_item); 182extern void inc_zone_state(struct zone *, enum zone_stat_item);
188 183
189#ifdef CONFIG_SMP 184#ifdef CONFIG_SMP
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 6d0a0fcd80e7..c66fe3332d83 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -189,9 +189,4 @@ void tag_pages_for_writeback(struct address_space *mapping,
189 189
190void account_page_redirty(struct page *page); 190void account_page_redirty(struct page *page);
191 191
192/* pdflush.c */
193extern int nr_pdflush_threads; /* Global so it can be exported to sysctl
194 read-only. */
195
196
197#endif /* WRITEBACK_H */ 192#endif /* WRITEBACK_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index e067f8c18f88..b3730239bf18 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -621,6 +621,7 @@ enum sock_flags {
621 SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ 621 SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
622 SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ 622 SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
623 SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ 623 SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
624 SOCK_MEMALLOC, /* VM depends on this socket for swapping */
624 SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */ 625 SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */
625 SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */ 626 SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */
626 SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */ 627 SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */
@@ -658,6 +659,26 @@ static inline bool sock_flag(const struct sock *sk, enum sock_flags flag)
658 return test_bit(flag, &sk->sk_flags); 659 return test_bit(flag, &sk->sk_flags);
659} 660}
660 661
662#ifdef CONFIG_NET
663extern struct static_key memalloc_socks;
664static inline int sk_memalloc_socks(void)
665{
666 return static_key_false(&memalloc_socks);
667}
668#else
669
670static inline int sk_memalloc_socks(void)
671{
672 return 0;
673}
674
675#endif
676
677static inline gfp_t sk_gfp_atomic(struct sock *sk, gfp_t gfp_mask)
678{
679 return GFP_ATOMIC | (sk->sk_allocation & __GFP_MEMALLOC);
680}
681
661static inline void sk_acceptq_removed(struct sock *sk) 682static inline void sk_acceptq_removed(struct sock *sk)
662{ 683{
663 sk->sk_ack_backlog--; 684 sk->sk_ack_backlog--;
@@ -733,8 +754,13 @@ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s
733 return 0; 754 return 0;
734} 755}
735 756
757extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
758
736static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 759static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
737{ 760{
761 if (sk_memalloc_socks() && skb_pfmemalloc(skb))
762 return __sk_backlog_rcv(sk, skb);
763
738 return sk->sk_backlog_rcv(sk, skb); 764 return sk->sk_backlog_rcv(sk, skb);
739} 765}
740 766
@@ -798,6 +824,8 @@ extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
798extern void sk_stream_wait_close(struct sock *sk, long timeo_p); 824extern void sk_stream_wait_close(struct sock *sk, long timeo_p);
799extern int sk_stream_error(struct sock *sk, int flags, int err); 825extern int sk_stream_error(struct sock *sk, int flags, int err);
800extern void sk_stream_kill_queues(struct sock *sk); 826extern void sk_stream_kill_queues(struct sock *sk);
827extern void sk_set_memalloc(struct sock *sk);
828extern void sk_clear_memalloc(struct sock *sk);
801 829
802extern int sk_wait_data(struct sock *sk, long *timeo); 830extern int sk_wait_data(struct sock *sk, long *timeo);
803 831
@@ -913,7 +941,7 @@ struct proto {
913#ifdef SOCK_REFCNT_DEBUG 941#ifdef SOCK_REFCNT_DEBUG
914 atomic_t socks; 942 atomic_t socks;
915#endif 943#endif
916#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 944#ifdef CONFIG_MEMCG_KMEM
917 /* 945 /*
918 * cgroup specific init/deinit functions. Called once for all 946 * cgroup specific init/deinit functions. Called once for all
919 * protocols that implement it, from cgroups populate function. 947 * protocols that implement it, from cgroups populate function.
@@ -994,7 +1022,7 @@ inline void sk_refcnt_debug_release(const struct sock *sk)
994#define sk_refcnt_debug_release(sk) do { } while (0) 1022#define sk_refcnt_debug_release(sk) do { } while (0)
995#endif /* SOCK_REFCNT_DEBUG */ 1023#endif /* SOCK_REFCNT_DEBUG */
996 1024
997#if defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) && defined(CONFIG_NET) 1025#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_NET)
998extern struct static_key memcg_socket_limit_enabled; 1026extern struct static_key memcg_socket_limit_enabled;
999static inline struct cg_proto *parent_cg_proto(struct proto *proto, 1027static inline struct cg_proto *parent_cg_proto(struct proto *proto,
1000 struct cg_proto *cg_proto) 1028 struct cg_proto *cg_proto)
@@ -1301,12 +1329,14 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size)
1301 __sk_mem_schedule(sk, size, SK_MEM_SEND); 1329 __sk_mem_schedule(sk, size, SK_MEM_SEND);
1302} 1330}
1303 1331
1304static inline bool sk_rmem_schedule(struct sock *sk, int size) 1332static inline bool
1333sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size)
1305{ 1334{
1306 if (!sk_has_account(sk)) 1335 if (!sk_has_account(sk))
1307 return true; 1336 return true;
 1308 return size <= sk->sk_forward_alloc || 1337 return size <= sk->sk_forward_alloc ||
1309 __sk_mem_schedule(sk, size, SK_MEM_RECV); 1338 __sk_mem_schedule(sk, size, SK_MEM_RECV) ||
1339 skb_pfmemalloc(skb);
1310} 1340}
1311 1341
1312static inline void sk_mem_reclaim(struct sock *sk) 1342static inline void sk_mem_reclaim(struct sock *sk)
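
sk_set_memalloc() marks a socket the VM depends on for swapping; once any such
socket exists, the memalloc_socks static key makes sk_backlog_rcv() divert
pfmemalloc skbs through __sk_backlog_rcv(), and sk_gfp_atomic() carries
__GFP_MEMALLOC into atomic allocations for that socket. A hedged sketch of how a
swap-over-network transport might use the pair (swap_transport_enable() and
swap_tx_gfp() are illustrative names):

	static void swap_transport_enable(struct sock *sk)
	{
		sk_set_memalloc(sk);	/* sets SOCK_MEMALLOC on this socket */
	}

	static gfp_t swap_tx_gfp(struct sock *sk)
	{
		/* GFP_ATOMIC, plus __GFP_MEMALLOC if the socket opted in */
		return sk_gfp_atomic(sk, GFP_ATOMIC);
	}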
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index 9fe3a36646e9..d6fd8e5b14b7 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -30,6 +30,7 @@
30 {(unsigned long)__GFP_COMP, "GFP_COMP"}, \ 30 {(unsigned long)__GFP_COMP, "GFP_COMP"}, \
31 {(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \ 31 {(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \
32 {(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \ 32 {(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \
33 {(unsigned long)__GFP_MEMALLOC, "GFP_MEMALLOC"}, \
33 {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ 34 {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \
34 {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ 35 {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \
35 {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ 36 {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
diff --git a/init/Kconfig b/init/Kconfig
index b3f55f15e107..af6c7f8ba019 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -686,7 +686,7 @@ config RESOURCE_COUNTERS
686 This option enables controller independent resource accounting 686 This option enables controller independent resource accounting
687 infrastructure that works with cgroups. 687 infrastructure that works with cgroups.
688 688
689config CGROUP_MEM_RES_CTLR 689config MEMCG
690 bool "Memory Resource Controller for Control Groups" 690 bool "Memory Resource Controller for Control Groups"
691 depends on RESOURCE_COUNTERS 691 depends on RESOURCE_COUNTERS
692 select MM_OWNER 692 select MM_OWNER
@@ -709,9 +709,9 @@ config CGROUP_MEM_RES_CTLR
709 This config option also selects MM_OWNER config option, which 709 This config option also selects MM_OWNER config option, which
710 could in turn add some fork/exit overhead. 710 could in turn add some fork/exit overhead.
711 711
712config CGROUP_MEM_RES_CTLR_SWAP 712config MEMCG_SWAP
713 bool "Memory Resource Controller Swap Extension" 713 bool "Memory Resource Controller Swap Extension"
714 depends on CGROUP_MEM_RES_CTLR && SWAP 714 depends on MEMCG && SWAP
715 help 715 help
716 Add swap management feature to memory resource controller. When you 716 Add swap management feature to memory resource controller. When you
717 enable this, you can limit mem+swap usage per cgroup. In other words, 717 enable this, you can limit mem+swap usage per cgroup. In other words,
@@ -726,9 +726,9 @@ config CGROUP_MEM_RES_CTLR_SWAP
726 if boot option "swapaccount=0" is set, swap will not be accounted. 726 if boot option "swapaccount=0" is set, swap will not be accounted.
727 Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page 727 Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
728 size is 4096bytes, 512k per 1Gbytes of swap. 728 size is 4096bytes, 512k per 1Gbytes of swap.
729config CGROUP_MEM_RES_CTLR_SWAP_ENABLED 729config MEMCG_SWAP_ENABLED
730 bool "Memory Resource Controller Swap Extension enabled by default" 730 bool "Memory Resource Controller Swap Extension enabled by default"
731 depends on CGROUP_MEM_RES_CTLR_SWAP 731 depends on MEMCG_SWAP
732 default y 732 default y
733 help 733 help
734 Memory Resource Controller Swap Extension comes with its price in 734 Memory Resource Controller Swap Extension comes with its price in
@@ -739,9 +739,9 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
739 For those who want to have the feature enabled by default should 739 For those who want to have the feature enabled by default should
740 select this option (if, for some reason, they need to disable it 740 select this option (if, for some reason, they need to disable it
741 then swapaccount=0 does the trick). 741 then swapaccount=0 does the trick).
742config CGROUP_MEM_RES_CTLR_KMEM 742config MEMCG_KMEM
743 bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)" 743 bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)"
744 depends on CGROUP_MEM_RES_CTLR && EXPERIMENTAL 744 depends on MEMCG && EXPERIMENTAL
745 default n 745 default n
746 help 746 help
747 The Kernel Memory extension for Memory Resource Controller can limit 747 The Kernel Memory extension for Memory Resource Controller can limit
@@ -751,6 +751,21 @@ config CGROUP_MEM_RES_CTLR_KMEM
751 the kmem extension can use it to guarantee that no group of processes 751 the kmem extension can use it to guarantee that no group of processes
752 will ever exhaust kernel resources alone. 752 will ever exhaust kernel resources alone.
753 753
754config CGROUP_HUGETLB
755 bool "HugeTLB Resource Controller for Control Groups"
756 depends on RESOURCE_COUNTERS && HUGETLB_PAGE && EXPERIMENTAL
757 default n
758 help
759 Provides a cgroup Resource Controller for HugeTLB pages.
760 When you enable this, you can put a per cgroup limit on HugeTLB usage.
761 The limit is enforced during page fault. Since HugeTLB doesn't
762 support page reclaim, enforcing the limit at page fault time implies
763 that, the application will get SIGBUS signal if it tries to access
764 HugeTLB pages beyond its limit. This requires the application to know
765 beforehand how much HugeTLB pages it would require for its use. The
766 control group is tracked in the third page lru pointer. This means
767 that we cannot use the controller with huge page less than 3 pages.
768
754config CGROUP_PERF 769config CGROUP_PERF
755 bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" 770 bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
756 depends on PERF_EVENTS && CGROUPS 771 depends on PERF_EVENTS && CGROUPS
diff --git a/init/main.c b/init/main.c
index 95316a1b4a76..e60679de61c3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -506,7 +506,7 @@ asmlinkage void __init start_kernel(void)
506 setup_per_cpu_areas(); 506 setup_per_cpu_areas();
507 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ 507 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
508 508
509 build_all_zonelists(NULL); 509 build_all_zonelists(NULL, NULL);
510 page_alloc_init(); 510 page_alloc_init();
511 511
512 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); 512 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a4eb5227a19e..14d32588cccd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu)
416 416
417 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 417 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
418 mutex_lock(&zonelists_mutex); 418 mutex_lock(&zonelists_mutex);
419 build_all_zonelists(NULL); 419 build_all_zonelists(NULL, NULL);
420 mutex_unlock(&zonelists_mutex); 420 mutex_unlock(&zonelists_mutex);
421 } 421 }
422#endif 422#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 8efac1fe56bc..3bd2280d79f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -381,10 +381,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
381 struct file *file; 381 struct file *file;
382 382
383 if (mpnt->vm_flags & VM_DONTCOPY) { 383 if (mpnt->vm_flags & VM_DONTCOPY) {
384 long pages = vma_pages(mpnt);
385 mm->total_vm -= pages;
386 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 384 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
387 -pages); 385 -vma_pages(mpnt));
388 continue; 386 continue;
389 } 387 }
390 charge = 0; 388 charge = 0;
@@ -1308,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1308#ifdef CONFIG_DEBUG_MUTEXES 1306#ifdef CONFIG_DEBUG_MUTEXES
1309 p->blocked_on = NULL; /* not blocked yet */ 1307 p->blocked_on = NULL; /* not blocked yet */
1310#endif 1308#endif
1311#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1309#ifdef CONFIG_MEMCG
1312 p->memcg_batch.do_batch = 0; 1310 p->memcg_batch.do_batch = 0;
1313 p->memcg_batch.memcg = NULL; 1311 p->memcg_batch.memcg = NULL;
1314#endif 1312#endif
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 671f9594e368..b73e681df09e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void)
210 __u32 pending; 210 __u32 pending;
211 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
212 int cpu; 212 int cpu;
213 unsigned long old_flags = current->flags;
214
215 /*
 216 * Mask out PF_MEMALLOC as the current task context is borrowed for the
 217 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
 218 * again if the socket is related to swap.
219 */
220 current->flags &= ~PF_MEMALLOC;
213 221
214 pending = local_softirq_pending(); 222 pending = local_softirq_pending();
215 account_system_vtime(current); 223 account_system_vtime(current);
@@ -265,6 +273,7 @@ restart:
265 273
266 account_system_vtime(current); 274 account_system_vtime(current);
267 __local_bh_enable(SOFTIRQ_OFFSET); 275 __local_bh_enable(SOFTIRQ_OFFSET);
276 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
268} 277}
269 278
270#ifndef __ARCH_HAS_DO_SOFTIRQ 279#ifndef __ARCH_HAS_DO_SOFTIRQ
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 97186b99b0e4..6502d35a25ba 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1101,11 +1101,9 @@ static struct ctl_table vm_table[] = {
1101 .extra1 = &zero, 1101 .extra1 = &zero,
1102 }, 1102 },
1103 { 1103 {
1104 .procname = "nr_pdflush_threads", 1104 .procname = "nr_pdflush_threads",
1105 .data = &nr_pdflush_threads, 1105 .mode = 0444 /* read-only */,
1106 .maxlen = sizeof nr_pdflush_threads, 1106 .proc_handler = pdflush_proc_obsolete,
1107 .mode = 0444 /* read-only*/,
1108 .proc_handler = proc_dointvec,
1109 }, 1107 },
1110 { 1108 {
1111 .procname = "swappiness", 1109 .procname = "swappiness",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index a650694883a1..65bdcf198d4e 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = {
147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, 147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ 148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ 149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
150 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, 150 /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */
151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, 151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
152 /* VM_PAGEBUF unused */ 152 /* VM_PAGEBUF unused */
153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ 153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
diff --git a/mm/Kconfig b/mm/Kconfig
index 82fed4eb2b6f..d5c8019c6627 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK
140config NO_BOOTMEM 140config NO_BOOTMEM
141 boolean 141 boolean
142 142
143config MEMORY_ISOLATION
144 boolean
145
143# eventually, we can have this option just 'select SPARSEMEM' 146# eventually, we can have this option just 'select SPARSEMEM'
144config MEMORY_HOTPLUG 147config MEMORY_HOTPLUG
145 bool "Allow for memory hot-add" 148 bool "Allow for memory hot-add"
149 select MEMORY_ISOLATION
146 depends on SPARSEMEM || X86_64_ACPI_NUMA 150 depends on SPARSEMEM || X86_64_ACPI_NUMA
147 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG 151 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
148 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) 152 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -272,6 +276,7 @@ config MEMORY_FAILURE
272 depends on MMU 276 depends on MMU
273 depends on ARCH_SUPPORTS_MEMORY_FAILURE 277 depends on ARCH_SUPPORTS_MEMORY_FAILURE
274 bool "Enable recovery from hardware memory errors" 278 bool "Enable recovery from hardware memory errors"
279 select MEMORY_ISOLATION
275 help 280 help
276 Enables code to recover from some memory failures on systems 281 Enables code to recover from some memory failures on systems
277 with MCA recovery. This allows a system to continue running 282 with MCA recovery. This allows a system to continue running
diff --git a/mm/Makefile b/mm/Makefile
index 8e81fe263c94..92753e2d82da 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,8 +15,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
15 maccess.o page_alloc.o page-writeback.o \ 15 maccess.o page_alloc.o page-writeback.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
18 page_isolation.o mm_init.o mmu_context.o percpu.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o slab_common.o $(mmu-y) 19 compaction.o $(mmu-y)
20 20
21obj-y += init-mm.o 21obj-y += init-mm.o
22 22
@@ -49,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
49obj-$(CONFIG_MIGRATION) += migrate.o 49obj-$(CONFIG_MIGRATION) += migrate.o
50obj-$(CONFIG_QUICKLIST) += quicklist.o 50obj-$(CONFIG_QUICKLIST) += quicklist.o
51obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 51obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
52obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 52obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
53obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
53obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 54obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
54obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 55obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
55obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 56obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
56obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 57obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
57obj-$(CONFIG_CLEANCACHE) += cleancache.o 58obj-$(CONFIG_CLEANCACHE) += cleancache.o
59obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3387aea11209..6b4718e2ee34 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -886,3 +886,23 @@ out:
886 return ret; 886 return ret;
887} 887}
888EXPORT_SYMBOL(wait_iff_congested); 888EXPORT_SYMBOL(wait_iff_congested);
889
890int pdflush_proc_obsolete(struct ctl_table *table, int write,
891 void __user *buffer, size_t *lenp, loff_t *ppos)
892{
893 char kbuf[] = "0\n";
894
895 if (*ppos) {
896 *lenp = 0;
897 return 0;
898 }
899
900 if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
901 return -EFAULT;
902 printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
903 table->procname);
904
905 *lenp = 2;
906 *ppos += *lenp;
907 return 2;
908}
diff --git a/mm/compaction.c b/mm/compaction.c
index 2f42d9528539..e78cb9688421 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -422,6 +422,17 @@ static void isolate_freepages(struct zone *zone,
422 pfn -= pageblock_nr_pages) { 422 pfn -= pageblock_nr_pages) {
423 unsigned long isolated; 423 unsigned long isolated;
424 424
425 /*
426 * Skip ahead if another thread is compacting in the area
427 * simultaneously. If we wrapped around, we can only skip
428 * ahead if zone->compact_cached_free_pfn also wrapped to
429 * above our starting point.
430 */
431 if (cc->order > 0 && (!cc->wrapped ||
432 zone->compact_cached_free_pfn >
433 cc->start_free_pfn))
434 pfn = min(pfn, zone->compact_cached_free_pfn);
435
425 if (!pfn_valid(pfn)) 436 if (!pfn_valid(pfn))
426 continue; 437 continue;
427 438
@@ -461,8 +472,11 @@ static void isolate_freepages(struct zone *zone,
461 * looking for free pages, the search will restart here as 472 * looking for free pages, the search will restart here as
462 * page migration may have returned some pages to the allocator 473 * page migration may have returned some pages to the allocator
463 */ 474 */
464 if (isolated) 475 if (isolated) {
465 high_pfn = max(high_pfn, pfn); 476 high_pfn = max(high_pfn, pfn);
477 if (cc->order > 0)
478 zone->compact_cached_free_pfn = high_pfn;
479 }
466 } 480 }
467 481
468 /* split_free_page does not map the pages */ 482 /* split_free_page does not map the pages */
@@ -556,6 +570,20 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
556 return ISOLATE_SUCCESS; 570 return ISOLATE_SUCCESS;
557} 571}
558 572
573/*
574 * Returns the start pfn of the last page block in a zone. This is the starting
575 * point for full compaction of a zone. Compaction searches for free pages from
576 * the end of each zone, while isolate_freepages_block scans forward inside each
577 * page block.
578 */
579static unsigned long start_free_pfn(struct zone *zone)
580{
581 unsigned long free_pfn;
582 free_pfn = zone->zone_start_pfn + zone->spanned_pages;
583 free_pfn &= ~(pageblock_nr_pages-1);
584 return free_pfn;
585}
586
559static int compact_finished(struct zone *zone, 587static int compact_finished(struct zone *zone,
560 struct compact_control *cc) 588 struct compact_control *cc)
561{ 589{
@@ -565,8 +593,26 @@ static int compact_finished(struct zone *zone,
565 if (fatal_signal_pending(current)) 593 if (fatal_signal_pending(current))
566 return COMPACT_PARTIAL; 594 return COMPACT_PARTIAL;
567 595
568 /* Compaction run completes if the migrate and free scanner meet */ 596 /*
569 if (cc->free_pfn <= cc->migrate_pfn) 597 * A full (order == -1) compaction run starts at the beginning and
598 * end of a zone; it completes when the migrate and free scanner meet.
599 * A partial (order > 0) compaction can start with the free scanner
600 * at a random point in the zone, and may have to restart.
601 */
602 if (cc->free_pfn <= cc->migrate_pfn) {
603 if (cc->order > 0 && !cc->wrapped) {
604 /* We started partway through; restart at the end. */
605 unsigned long free_pfn = start_free_pfn(zone);
606 zone->compact_cached_free_pfn = free_pfn;
607 cc->free_pfn = free_pfn;
608 cc->wrapped = 1;
609 return COMPACT_CONTINUE;
610 }
611 return COMPACT_COMPLETE;
612 }
613
614 /* We wrapped around and ended up where we started. */
615 if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
570 return COMPACT_COMPLETE; 616 return COMPACT_COMPLETE;
571 617
572 /* 618 /*
@@ -664,8 +710,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
664 710
665 /* Setup to move all movable pages to the end of the zone */ 711 /* Setup to move all movable pages to the end of the zone */
666 cc->migrate_pfn = zone->zone_start_pfn; 712 cc->migrate_pfn = zone->zone_start_pfn;
667 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 713
668 cc->free_pfn &= ~(pageblock_nr_pages-1); 714 if (cc->order > 0) {
715 /* Incremental compaction. Start where the last one stopped. */
716 cc->free_pfn = zone->compact_cached_free_pfn;
717 cc->start_free_pfn = cc->free_pfn;
718 } else {
719 /* Order == -1 starts at the end of the zone. */
720 cc->free_pfn = start_free_pfn(zone);
721 }
669 722
670 migrate_prep_local(); 723 migrate_prep_local();
671 724
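
start_free_pfn() simply rounds the end of the zone down to a pageblock boundary,
which is where a full compaction pass begins its free-page scan. A worked example
of the arithmetic (values chosen for illustration, assuming pageblock_nr_pages == 512):

	/*
	 * zone_start_pfn = 0x10000, spanned_pages = 0x2468:
	 *   end = 0x10000 + 0x2468  = 0x12468
	 *   pfn = 0x12468 & ~0x1ff  = 0x12400   (start of the last whole pageblock)
	 */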
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 469491e0af79..9b75a045dbf4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
93 spin_unlock(&file->f_lock); 93 spin_unlock(&file->f_lock);
94 break; 94 break;
95 case POSIX_FADV_WILLNEED: 95 case POSIX_FADV_WILLNEED:
96 if (!mapping->a_ops->readpage) {
97 ret = -EINVAL;
98 break;
99 }
100
101 /* First and last PARTIAL page! */ 96 /* First and last PARTIAL page! */
102 start_index = offset >> PAGE_CACHE_SHIFT; 97 start_index = offset >> PAGE_CACHE_SHIFT;
103 end_index = endbyte >> PAGE_CACHE_SHIFT; 98 end_index = endbyte >> PAGE_CACHE_SHIFT;
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
106 nrpages = end_index - start_index + 1; 101 nrpages = end_index - start_index + 1;
107 if (!nrpages) 102 if (!nrpages)
108 nrpages = ~0UL; 103 nrpages = ~0UL;
109 104
110 ret = force_page_cache_readahead(mapping, file, 105 /*
111 start_index, 106 * Ignore return value because fadvise() shall return
 112 nrpages); 107 * success even if the filesystem can't retrieve a hint.
113 if (ret > 0) 108 */
114 ret = 0; 109 force_page_cache_readahead(mapping, file, start_index,
110 nrpages);
115 break; 111 break;
116 case POSIX_FADV_NOREUSE: 112 case POSIX_FADV_NOREUSE:
117 break; 113 break;
diff --git a/mm/highmem.c b/mm/highmem.c
index 57d82c6250c3..d517cd16a6eb 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
94 do { spin_unlock(&kmap_lock); (void)(flags); } while (0) 94 do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
95#endif 95#endif
96 96
97struct page *kmap_to_page(void *vaddr)
98{
99 unsigned long addr = (unsigned long)vaddr;
100
101 if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
103 return pte_page(pkmap_page_table[i]);
104 }
105
106 return virt_to_page(addr);
107}
108
97static void flush_all_zero_pkmaps(void) 109static void flush_all_zero_pkmaps(void)
98{ 110{
99 int i; 111 int i;
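
kmap_to_page() resolves a kmap()ed highmem address back to its struct page and
falls back to virt_to_page() for lowmem addresses, so both cases round trip
cleanly. A minimal hedged sketch (kmap_roundtrip() is an illustrative name):

	static void kmap_roundtrip(struct page *page)
	{
		void *vaddr = kmap(page);

		BUG_ON(kmap_to_page(vaddr) != page);	/* holds for high and low mem */
		kunmap(page);
	}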
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a3..bc727122dd44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
24 24
25#include <asm/page.h> 25#include <asm/page.h>
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <linux/io.h> 27#include <asm/tlb.h>
28 28
29#include <linux/io.h>
29#include <linux/hugetlb.h> 30#include <linux/hugetlb.h>
31#include <linux/hugetlb_cgroup.h>
30#include <linux/node.h> 32#include <linux/node.h>
33#include <linux/hugetlb_cgroup.h>
31#include "internal.h" 34#include "internal.h"
32 35
33const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 37static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 38unsigned long hugepages_treat_as_movable;
36 39
37static int max_hstate; 40int hugetlb_max_hstate __read_mostly;
38unsigned int default_hstate_idx; 41unsigned int default_hstate_idx;
39struct hstate hstates[HUGE_MAX_HSTATE]; 42struct hstate hstates[HUGE_MAX_HSTATE];
40 43
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
45static unsigned long __initdata default_hstate_max_huge_pages; 48static unsigned long __initdata default_hstate_max_huge_pages;
46static unsigned long __initdata default_hstate_size; 49static unsigned long __initdata default_hstate_size;
47 50
48#define for_each_hstate(h) \
49 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
50
51/* 51/*
52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
53 */ 53 */
54static DEFINE_SPINLOCK(hugetlb_lock); 54DEFINE_SPINLOCK(hugetlb_lock);
55 55
56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) 56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
57{ 57{
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
509static void enqueue_huge_page(struct hstate *h, struct page *page) 509static void enqueue_huge_page(struct hstate *h, struct page *page)
510{ 510{
511 int nid = page_to_nid(page); 511 int nid = page_to_nid(page);
512 list_add(&page->lru, &h->hugepage_freelists[nid]); 512 list_move(&page->lru, &h->hugepage_freelists[nid]);
513 h->free_huge_pages++; 513 h->free_huge_pages++;
514 h->free_huge_pages_node[nid]++; 514 h->free_huge_pages_node[nid]++;
515} 515}
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
521 if (list_empty(&h->hugepage_freelists[nid])) 521 if (list_empty(&h->hugepage_freelists[nid]))
522 return NULL; 522 return NULL;
523 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); 523 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
524 list_del(&page->lru); 524 list_move(&page->lru, &h->hugepage_activelist);
525 set_page_refcounted(page); 525 set_page_refcounted(page);
526 h->free_huge_pages--; 526 h->free_huge_pages--;
527 h->free_huge_pages_node[nid]--; 527 h->free_huge_pages_node[nid]--;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
593 1 << PG_active | 1 << PG_reserved | 593 1 << PG_active | 1 << PG_reserved |
594 1 << PG_private | 1 << PG_writeback); 594 1 << PG_private | 1 << PG_writeback);
595 } 595 }
596 VM_BUG_ON(hugetlb_cgroup_from_page(page));
596 set_compound_page_dtor(page, NULL); 597 set_compound_page_dtor(page, NULL);
597 set_page_refcounted(page); 598 set_page_refcounted(page);
598 arch_release_hugepage(page); 599 arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
625 page->mapping = NULL; 626 page->mapping = NULL;
626 BUG_ON(page_count(page)); 627 BUG_ON(page_count(page));
627 BUG_ON(page_mapcount(page)); 628 BUG_ON(page_mapcount(page));
628 INIT_LIST_HEAD(&page->lru);
629 629
630 spin_lock(&hugetlb_lock); 630 spin_lock(&hugetlb_lock);
631 hugetlb_cgroup_uncharge_page(hstate_index(h),
632 pages_per_huge_page(h), page);
631 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { 633 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
634 /* remove the page from active list */
635 list_del(&page->lru);
632 update_and_free_page(h, page); 636 update_and_free_page(h, page);
633 h->surplus_huge_pages--; 637 h->surplus_huge_pages--;
634 h->surplus_huge_pages_node[nid]--; 638 h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
641 645
642static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 646static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
643{ 647{
648 INIT_LIST_HEAD(&page->lru);
644 set_compound_page_dtor(page, free_huge_page); 649 set_compound_page_dtor(page, free_huge_page);
645 spin_lock(&hugetlb_lock); 650 spin_lock(&hugetlb_lock);
651 set_hugetlb_cgroup(page, NULL);
646 h->nr_huge_pages++; 652 h->nr_huge_pages++;
647 h->nr_huge_pages_node[nid]++; 653 h->nr_huge_pages_node[nid]++;
648 spin_unlock(&hugetlb_lock); 654 spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
889 895
890 spin_lock(&hugetlb_lock); 896 spin_lock(&hugetlb_lock);
891 if (page) { 897 if (page) {
898 INIT_LIST_HEAD(&page->lru);
892 r_nid = page_to_nid(page); 899 r_nid = page_to_nid(page);
893 set_compound_page_dtor(page, free_huge_page); 900 set_compound_page_dtor(page, free_huge_page);
901 set_hugetlb_cgroup(page, NULL);
894 /* 902 /*
895 * We incremented the global counters already 903 * We incremented the global counters already
896 */ 904 */
@@ -993,7 +1001,6 @@ retry:
993 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1001 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
994 if ((--needed) < 0) 1002 if ((--needed) < 0)
995 break; 1003 break;
996 list_del(&page->lru);
997 /* 1004 /*
998 * This page is now managed by the hugetlb allocator and has 1005 * This page is now managed by the hugetlb allocator and has
999 * no users -- drop the buddy allocator's reference. 1006 * no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
1008 /* Free unnecessary surplus pages to the buddy allocator */ 1015 /* Free unnecessary surplus pages to the buddy allocator */
1009 if (!list_empty(&surplus_list)) { 1016 if (!list_empty(&surplus_list)) {
1010 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1017 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
1011 list_del(&page->lru);
1012 put_page(page); 1018 put_page(page);
1013 } 1019 }
1014 } 1020 }
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1112 struct hstate *h = hstate_vma(vma); 1118 struct hstate *h = hstate_vma(vma);
1113 struct page *page; 1119 struct page *page;
1114 long chg; 1120 long chg;
1121 int ret, idx;
1122 struct hugetlb_cgroup *h_cg;
1115 1123
1124 idx = hstate_index(h);
1116 /* 1125 /*
1117 * Processes that did not create the mapping will have no 1126 * Processes that did not create the mapping will have no
1118 * reserves and will not have accounted against subpool 1127 * reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1123 */ 1132 */
1124 chg = vma_needs_reservation(h, vma, addr); 1133 chg = vma_needs_reservation(h, vma, addr);
1125 if (chg < 0) 1134 if (chg < 0)
1126 return ERR_PTR(-VM_FAULT_OOM); 1135 return ERR_PTR(-ENOMEM);
1127 if (chg) 1136 if (chg)
1128 if (hugepage_subpool_get_pages(spool, chg)) 1137 if (hugepage_subpool_get_pages(spool, chg))
1129 return ERR_PTR(-VM_FAULT_SIGBUS); 1138 return ERR_PTR(-ENOSPC);
1130 1139
1140 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1141 if (ret) {
1142 hugepage_subpool_put_pages(spool, chg);
1143 return ERR_PTR(-ENOSPC);
1144 }
1131 spin_lock(&hugetlb_lock); 1145 spin_lock(&hugetlb_lock);
1132 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); 1146 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
1133 spin_unlock(&hugetlb_lock); 1147 if (page) {
1134 1148 /* update page cgroup details */
1135 if (!page) { 1149 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1150 h_cg, page);
1151 spin_unlock(&hugetlb_lock);
1152 } else {
1153 spin_unlock(&hugetlb_lock);
1136 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1154 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1137 if (!page) { 1155 if (!page) {
1156 hugetlb_cgroup_uncharge_cgroup(idx,
1157 pages_per_huge_page(h),
1158 h_cg);
1138 hugepage_subpool_put_pages(spool, chg); 1159 hugepage_subpool_put_pages(spool, chg);
1139 return ERR_PTR(-VM_FAULT_SIGBUS); 1160 return ERR_PTR(-ENOSPC);
1140 } 1161 }
1162 spin_lock(&hugetlb_lock);
1163 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1164 h_cg, page);
1165 list_move(&page->lru, &h->hugepage_activelist);
1166 spin_unlock(&hugetlb_lock);
1141 } 1167 }
1142 1168
1143 set_page_private(page, (unsigned long)spool); 1169 set_page_private(page, (unsigned long)spool);
1144 1170
1145 vma_commit_reservation(h, vma, addr); 1171 vma_commit_reservation(h, vma, addr);
1146
1147 return page; 1172 return page;
1148} 1173}
1149 1174
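
With this hunk alloc_huge_page() returns plain errnos (-ENOMEM, -ENOSPC) wrapped in
ERR_PTR() rather than negated VM_FAULT codes, so fault-path callers translate at
the call site. A hedged sketch of that translation (huge_fault_error() is an
illustrative name; the hugetlb fault code in this series does the equivalent inline):

	static int huge_fault_error(struct page *page)
	{
		if (!IS_ERR(page))
			return 0;
		return PTR_ERR(page) == -ENOMEM ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
	}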
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1646 struct attribute_group *hstate_attr_group) 1671 struct attribute_group *hstate_attr_group)
1647{ 1672{
1648 int retval; 1673 int retval;
1649 int hi = h - hstates; 1674 int hi = hstate_index(h);
1650 1675
1651 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 1676 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1652 if (!hstate_kobjs[hi]) 1677 if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
1741 if (!nhs->hugepages_kobj) 1766 if (!nhs->hugepages_kobj)
1742 return; /* no hstate attributes */ 1767 return; /* no hstate attributes */
1743 1768
1744 for_each_hstate(h) 1769 for_each_hstate(h) {
1745 if (nhs->hstate_kobjs[h - hstates]) { 1770 int idx = hstate_index(h);
1746 kobject_put(nhs->hstate_kobjs[h - hstates]); 1771 if (nhs->hstate_kobjs[idx]) {
1747 nhs->hstate_kobjs[h - hstates] = NULL; 1772 kobject_put(nhs->hstate_kobjs[idx]);
1773 nhs->hstate_kobjs[idx] = NULL;
1748 } 1774 }
1775 }
1749 1776
1750 kobject_put(nhs->hugepages_kobj); 1777 kobject_put(nhs->hugepages_kobj);
1751 nhs->hugepages_kobj = NULL; 1778 nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
1848 hugetlb_unregister_all_nodes(); 1875 hugetlb_unregister_all_nodes();
1849 1876
1850 for_each_hstate(h) { 1877 for_each_hstate(h) {
1851 kobject_put(hstate_kobjs[h - hstates]); 1878 kobject_put(hstate_kobjs[hstate_index(h)]);
1852 } 1879 }
1853 1880
1854 kobject_put(hugepages_kobj); 1881 kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
1869 if (!size_to_hstate(default_hstate_size)) 1896 if (!size_to_hstate(default_hstate_size))
1870 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 1897 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1871 } 1898 }
1872 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; 1899 default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
1873 if (default_hstate_max_huge_pages) 1900 if (default_hstate_max_huge_pages)
1874 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1901 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1875 1902
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
1897 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); 1924 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1898 return; 1925 return;
1899 } 1926 }
1900 BUG_ON(max_hstate >= HUGE_MAX_HSTATE); 1927 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
1901 BUG_ON(order == 0); 1928 BUG_ON(order == 0);
1902 h = &hstates[max_hstate++]; 1929 h = &hstates[hugetlb_max_hstate++];
1903 h->order = order; 1930 h->order = order;
1904 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 1931 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1905 h->nr_huge_pages = 0; 1932 h->nr_huge_pages = 0;
1906 h->free_huge_pages = 0; 1933 h->free_huge_pages = 0;
1907 for (i = 0; i < MAX_NUMNODES; ++i) 1934 for (i = 0; i < MAX_NUMNODES; ++i)
1908 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1935 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1936 INIT_LIST_HEAD(&h->hugepage_activelist);
1909 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); 1937 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
1910 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); 1938 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
1911 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1939 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1912 huge_page_size(h)/1024); 1940 huge_page_size(h)/1024);
1941 /*
1942 * Add cgroup control files only if the huge page consists
1943 * of more than two normal pages. This is because we use
 1944	 * page[2].lru.next for storing cgroup details.
1945 */
1946 if (order >= HUGETLB_CGROUP_MIN_ORDER)
1947 hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
1913 1948
1914 parsed_hstate = h; 1949 parsed_hstate = h;
1915} 1950}
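
Several hunks above replace the open-coded pointer arithmetic (h - hstates) with hstate_index(), and hugetlb_add_hstate() now registers cgroup control files only for hstates whose order is at least HUGETLB_CGROUP_MIN_ORDER, because the cgroup pointer is stashed in page[2] of the compound page. A small user-space sketch of the index idiom; the array, the MIN_ORDER value and the example orders are assumptions for illustration:

#include <stdio.h>

#define HUGE_MAX_HSTATE 4
#define MIN_ORDER       2               /* stand-in for HUGETLB_CGROUP_MIN_ORDER */

struct hstate { unsigned int order; };

static struct hstate hstates[HUGE_MAX_HSTATE];
static int max_hstate;

/* Same idea as hstate_index(): the index is the pointer difference. */
static int hstate_index(struct hstate *h) { return (int)(h - hstates); }

int main(void)
{
        unsigned int orders[] = { 1, 9, 18 };   /* e.g. 8K, 2M, 1G hugepages on a 4K-page box */

        for (unsigned int i = 0; i < 3; i++) {
                struct hstate *h = &hstates[max_hstate++];
                h->order = orders[i];
                printf("hstate %d: order %u, cgroup files: %s\n",
                       hstate_index(h), h->order,
                       h->order >= MIN_ORDER ? "yes" : "no (page[2] must exist)");
        }
        return 0;
}
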
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
1920 static unsigned long *last_mhp; 1955 static unsigned long *last_mhp;
1921 1956
1922 /* 1957 /*
1923 * !max_hstate means we haven't parsed a hugepagesz= parameter yet, 1958 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
1924 * so this hugepages= parameter goes to the "default hstate". 1959 * so this hugepages= parameter goes to the "default hstate".
1925 */ 1960 */
1926 if (!max_hstate) 1961 if (!hugetlb_max_hstate)
1927 mhp = &default_hstate_max_huge_pages; 1962 mhp = &default_hstate_max_huge_pages;
1928 else 1963 else
1929 mhp = &parsed_hstate->max_huge_pages; 1964 mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
1942 * But we need to allocate >= MAX_ORDER hstates here early to still 1977 * But we need to allocate >= MAX_ORDER hstates here early to still
1943 * use the bootmem allocator. 1978 * use the bootmem allocator.
1944 */ 1979 */
1945 if (max_hstate && parsed_hstate->order >= MAX_ORDER) 1980 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
1946 hugetlb_hstate_alloc_pages(parsed_hstate); 1981 hugetlb_hstate_alloc_pages(parsed_hstate);
1947 1982
1948 last_mhp = mhp; 1983 last_mhp = mhp;
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2308 return 0; 2343 return 0;
2309} 2344}
2310 2345
2311void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2346void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2312 unsigned long end, struct page *ref_page) 2347 unsigned long start, unsigned long end,
2348 struct page *ref_page)
2313{ 2349{
2350 int force_flush = 0;
2314 struct mm_struct *mm = vma->vm_mm; 2351 struct mm_struct *mm = vma->vm_mm;
2315 unsigned long address; 2352 unsigned long address;
2316 pte_t *ptep; 2353 pte_t *ptep;
2317 pte_t pte; 2354 pte_t pte;
2318 struct page *page; 2355 struct page *page;
2319 struct page *tmp;
2320 struct hstate *h = hstate_vma(vma); 2356 struct hstate *h = hstate_vma(vma);
2321 unsigned long sz = huge_page_size(h); 2357 unsigned long sz = huge_page_size(h);
2322 2358
2323 /*
2324 * A page gathering list, protected by per file i_mmap_mutex. The
2325 * lock is used to avoid list corruption from multiple unmapping
2326 * of the same page since we are using page->lru.
2327 */
2328 LIST_HEAD(page_list);
2329
2330 WARN_ON(!is_vm_hugetlb_page(vma)); 2359 WARN_ON(!is_vm_hugetlb_page(vma));
2331 BUG_ON(start & ~huge_page_mask(h)); 2360 BUG_ON(start & ~huge_page_mask(h));
2332 BUG_ON(end & ~huge_page_mask(h)); 2361 BUG_ON(end & ~huge_page_mask(h));
2333 2362
2363 tlb_start_vma(tlb, vma);
2334 mmu_notifier_invalidate_range_start(mm, start, end); 2364 mmu_notifier_invalidate_range_start(mm, start, end);
2365again:
2335 spin_lock(&mm->page_table_lock); 2366 spin_lock(&mm->page_table_lock);
2336 for (address = start; address < end; address += sz) { 2367 for (address = start; address < end; address += sz) {
2337 ptep = huge_pte_offset(mm, address); 2368 ptep = huge_pte_offset(mm, address);
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2370 } 2401 }
2371 2402
2372 pte = huge_ptep_get_and_clear(mm, address, ptep); 2403 pte = huge_ptep_get_and_clear(mm, address, ptep);
2404 tlb_remove_tlb_entry(tlb, ptep, address);
2373 if (pte_dirty(pte)) 2405 if (pte_dirty(pte))
2374 set_page_dirty(page); 2406 set_page_dirty(page);
2375 list_add(&page->lru, &page_list);
2376 2407
2408 page_remove_rmap(page);
2409 force_flush = !__tlb_remove_page(tlb, page);
2410 if (force_flush)
2411 break;
2377 /* Bail out after unmapping reference page if supplied */ 2412 /* Bail out after unmapping reference page if supplied */
2378 if (ref_page) 2413 if (ref_page)
2379 break; 2414 break;
2380 } 2415 }
2381 flush_tlb_range(vma, start, end);
2382 spin_unlock(&mm->page_table_lock); 2416 spin_unlock(&mm->page_table_lock);
2383 mmu_notifier_invalidate_range_end(mm, start, end); 2417 /*
2384		list_for_each_entry_safe(page, tmp, &page_list, lru) {	2418	 * mmu_gather ran out of room to batch pages, so we break out of
2385			page_remove_rmap(page);	2419	 * the PTE lock to avoid doing the potentially expensive TLB invalidate
2386 list_del(&page->lru); 2420 * and page-free while holding it.
2387 put_page(page); 2421 */
2422 if (force_flush) {
2423 force_flush = 0;
2424 tlb_flush_mmu(tlb);
2425 if (address < end && !ref_page)
2426 goto again;
2388 } 2427 }
2428 mmu_notifier_invalidate_range_end(mm, start, end);
2429 tlb_end_vma(tlb, vma);
2430}
2431
2432void __unmap_hugepage_range_final(struct mmu_gather *tlb,
2433 struct vm_area_struct *vma, unsigned long start,
2434 unsigned long end, struct page *ref_page)
2435{
2436 __unmap_hugepage_range(tlb, vma, start, end, ref_page);
2437
2438 /*
2439 * Clear this flag so that x86's huge_pmd_share page_table_shareable
2440 * test will fail on a vma being torn down, and not grab a page table
2441 * on its way out. We're lucky that the flag has such an appropriate
2442 * name, and can in fact be safely cleared here. We could clear it
2443 * before the __unmap_hugepage_range above, but all that's necessary
2444 * is to clear it before releasing the i_mmap_mutex. This works
2445 * because in the context this is called, the VMA is about to be
2446 * destroyed and the i_mmap_mutex is held.
2447 */
2448 vma->vm_flags &= ~VM_MAYSHARE;
2389} 2449}
2390 2450
2391void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2451void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2392 unsigned long end, struct page *ref_page) 2452 unsigned long end, struct page *ref_page)
2393{ 2453{
2394 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 2454 struct mm_struct *mm;
2395 __unmap_hugepage_range(vma, start, end, ref_page); 2455 struct mmu_gather tlb;
2396 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 2456
2457 mm = vma->vm_mm;
2458
2459 tlb_gather_mmu(&tlb, mm, 0);
2460 __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
2461 tlb_finish_mmu(&tlb, start, end);
2397} 2462}
2398 2463
2399/* 2464/*
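
The rewritten __unmap_hugepage_range() above feeds pages into an mmu_gather batch; when __tlb_remove_page() reports the batch is full (force_flush), the walk drops out of the page table lock, flushes, and resumes the scan. The loop below is a stand-alone analog of that batch-and-retry shape; the fixed-size array and the resume-after-this-page simplification are assumptions, not the kernel's tlb layout:

#include <stdio.h>

#define BATCH 4

static int batch[BATCH], batched;

/* Returns 0 when the batch is full, like __tlb_remove_page(). */
static int batch_add(int page) { batch[batched++] = page; return batched < BATCH; }

static void flush(void)
{
        printf("flush %d pages\n", batched);
        batched = 0;
}

int main(void)
{
        int start = 0, end = 10, addr;
again:
        for (addr = start; addr < end; addr++) {
                int force_flush = !batch_add(addr);
                if (force_flush) {
                        /* drop the "lock", flush, and resume past this page */
                        flush();
                        start = addr + 1;
                        if (start < end)
                                goto again;
                        break;
                }
        }
        if (batched)
                flush();                /* final flush of the partial batch */
        return 0;
}
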
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2438 * from the time of fork. This would look like data corruption 2503 * from the time of fork. This would look like data corruption
2439 */ 2504 */
2440 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2505 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
2441 __unmap_hugepage_range(iter_vma, 2506 unmap_hugepage_range(iter_vma, address,
2442 address, address + huge_page_size(h), 2507 address + huge_page_size(h), page);
2443 page);
2444 } 2508 }
2445 mutex_unlock(&mapping->i_mmap_mutex); 2509 mutex_unlock(&mapping->i_mmap_mutex);
2446 2510
@@ -2496,6 +2560,7 @@ retry_avoidcopy:
2496 new_page = alloc_huge_page(vma, address, outside_reserve); 2560 new_page = alloc_huge_page(vma, address, outside_reserve);
2497 2561
2498 if (IS_ERR(new_page)) { 2562 if (IS_ERR(new_page)) {
2563 long err = PTR_ERR(new_page);
2499 page_cache_release(old_page); 2564 page_cache_release(old_page);
2500 2565
2501 /* 2566 /*
@@ -2524,7 +2589,10 @@ retry_avoidcopy:
2524 2589
2525 /* Caller expects lock to be held */ 2590 /* Caller expects lock to be held */
2526 spin_lock(&mm->page_table_lock); 2591 spin_lock(&mm->page_table_lock);
2527 return -PTR_ERR(new_page); 2592 if (err == -ENOMEM)
2593 return VM_FAULT_OOM;
2594 else
2595 return VM_FAULT_SIGBUS;
2528 } 2596 }
2529 2597
2530 /* 2598 /*
@@ -2642,7 +2710,11 @@ retry:
2642 goto out; 2710 goto out;
2643 page = alloc_huge_page(vma, address, 0); 2711 page = alloc_huge_page(vma, address, 0);
2644 if (IS_ERR(page)) { 2712 if (IS_ERR(page)) {
2645 ret = -PTR_ERR(page); 2713 ret = PTR_ERR(page);
2714 if (ret == -ENOMEM)
2715 ret = VM_FAULT_OOM;
2716 else
2717 ret = VM_FAULT_SIGBUS;
2646 goto out; 2718 goto out;
2647 } 2719 }
2648 clear_huge_page(page, address, pages_per_huge_page(h)); 2720 clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2679,7 +2751,7 @@ retry:
2679 */ 2751 */
2680 if (unlikely(PageHWPoison(page))) { 2752 if (unlikely(PageHWPoison(page))) {
2681 ret = VM_FAULT_HWPOISON | 2753 ret = VM_FAULT_HWPOISON |
2682 VM_FAULT_SET_HINDEX(h - hstates); 2754 VM_FAULT_SET_HINDEX(hstate_index(h));
2683 goto backout_unlocked; 2755 goto backout_unlocked;
2684 } 2756 }
2685 } 2757 }
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2752 return 0; 2824 return 0;
2753 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2825 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2754 return VM_FAULT_HWPOISON_LARGE | 2826 return VM_FAULT_HWPOISON_LARGE |
2755 VM_FAULT_SET_HINDEX(h - hstates); 2827 VM_FAULT_SET_HINDEX(hstate_index(h));
2756 } 2828 }
2757 2829
2758 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2830 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2959 } 3031 }
2960 } 3032 }
2961 spin_unlock(&mm->page_table_lock); 3033 spin_unlock(&mm->page_table_lock);
2962 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3034 /*
2963 3035 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
3036 * may have cleared our pud entry and done put_page on the page table:
3037 * once we release i_mmap_mutex, another task can do the final put_page
3038 * and that page table be reused and filled with junk.
3039 */
2964 flush_tlb_range(vma, start, end); 3040 flush_tlb_range(vma, start, end);
3041 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2965} 3042}
2966 3043
2967int hugetlb_reserve_pages(struct inode *inode, 3044int hugetlb_reserve_pages(struct inode *inode,
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 000000000000..a3f358fb8a0c
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,418 @@
1/*
2 *
3 * Copyright IBM Corporation, 2012
4 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of version 2.1 of the GNU Lesser General Public License
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 *
14 */
15
16#include <linux/cgroup.h>
17#include <linux/slab.h>
18#include <linux/hugetlb.h>
19#include <linux/hugetlb_cgroup.h>
20
21struct hugetlb_cgroup {
22 struct cgroup_subsys_state css;
23 /*
24 * the counter to account for hugepages from hugetlb.
25 */
26 struct res_counter hugepage[HUGE_MAX_HSTATE];
27};
28
29#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
30#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
31#define MEMFILE_ATTR(val) ((val) & 0xffff)
32
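
The MEMFILE_* macros above pack the hstate index and a resource-counter member id into the 32-bit cftype->private field: the index in the high 16 bits, the attribute in the low 16. A quick stand-alone check of that encoding (the concrete idx/attr values are illustrative only):

#include <stdio.h>
#include <assert.h>

#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val)        (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)       ((val) & 0xffff)

int main(void)
{
        int idx = 3, attr = 2;          /* e.g. hstate 3, a RES_LIMIT-like attribute id */
        int priv = MEMFILE_PRIVATE(idx, attr);

        assert(MEMFILE_IDX(priv) == idx);
        assert(MEMFILE_ATTR(priv) == attr);
        printf("private=0x%08x idx=%d attr=%d\n",
               priv, MEMFILE_IDX(priv), MEMFILE_ATTR(priv));
        return 0;
}
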
33struct cgroup_subsys hugetlb_subsys __read_mostly;
34static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
35
36static inline
37struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
38{
39 return container_of(s, struct hugetlb_cgroup, css);
40}
41
42static inline
43struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
44{
45 return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
46 hugetlb_subsys_id));
47}
48
49static inline
50struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
51{
52 return hugetlb_cgroup_from_css(task_subsys_state(task,
53 hugetlb_subsys_id));
54}
55
56static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
57{
58 return (h_cg == root_h_cgroup);
59}
60
61static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg)
62{
63 if (!cg->parent)
64 return NULL;
65 return hugetlb_cgroup_from_cgroup(cg->parent);
66}
67
68static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
69{
70 int idx;
71 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
72
73 for (idx = 0; idx < hugetlb_max_hstate; idx++) {
74 if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
75 return true;
76 }
77 return false;
78}
79
80static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
81{
82 int idx;
83 struct cgroup *parent_cgroup;
84 struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
85
86 h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
87 if (!h_cgroup)
88 return ERR_PTR(-ENOMEM);
89
90 parent_cgroup = cgroup->parent;
91 if (parent_cgroup) {
92 parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
93 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
94 res_counter_init(&h_cgroup->hugepage[idx],
95 &parent_h_cgroup->hugepage[idx]);
96 } else {
97 root_h_cgroup = h_cgroup;
98 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
99 res_counter_init(&h_cgroup->hugepage[idx], NULL);
100 }
101 return &h_cgroup->css;
102}
103
104static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
105{
106 struct hugetlb_cgroup *h_cgroup;
107
108 h_cgroup = hugetlb_cgroup_from_cgroup(cgroup);
109 kfree(h_cgroup);
110}
111
112
113/*
114 * Should be called with hugetlb_lock held.
115 * Since we are holding hugetlb_lock, pages cannot get moved from
 116	 * the active list or uncharged from the cgroup, so there is no need to
 117	 * take a page reference or test whether the page is active here. This function
118 * cannot fail.
119 */
120static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup,
121 struct page *page)
122{
123 int csize;
124 struct res_counter *counter;
125 struct res_counter *fail_res;
126 struct hugetlb_cgroup *page_hcg;
127 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
128 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
129
130 page_hcg = hugetlb_cgroup_from_page(page);
131 /*
 132	 * We can have pages on the active list without any cgroup,
 133	 * i.e., hugepages with fewer than 3 subpages. We can safely
134 * ignore those pages.
135 */
136 if (!page_hcg || page_hcg != h_cg)
137 goto out;
138
139 csize = PAGE_SIZE << compound_order(page);
140 if (!parent) {
141 parent = root_h_cgroup;
142 /* root has no limit */
143 res_counter_charge_nofail(&parent->hugepage[idx],
144 csize, &fail_res);
145 }
146 counter = &h_cg->hugepage[idx];
147 res_counter_uncharge_until(counter, counter->parent, csize);
148
149 set_hugetlb_cgroup(page, parent);
150out:
151 return;
152}
153
154/*
155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
156 * the parent cgroup.
157 */
158static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
159{
160 struct hstate *h;
161 struct page *page;
162 int ret = 0, idx = 0;
163
164 do {
165 if (cgroup_task_count(cgroup) ||
166 !list_empty(&cgroup->children)) {
167 ret = -EBUSY;
168 goto out;
169 }
170 for_each_hstate(h) {
171 spin_lock(&hugetlb_lock);
172 list_for_each_entry(page, &h->hugepage_activelist, lru)
173 hugetlb_cgroup_move_parent(idx, cgroup, page);
174
175 spin_unlock(&hugetlb_lock);
176 idx++;
177 }
178 cond_resched();
179 } while (hugetlb_cgroup_have_usage(cgroup));
180out:
181 return ret;
182}
183
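
hugetlb_cgroup_pre_destroy() above empties a cgroup by walking each hstate's active list and handing every page's charge to the parent, looping until no usage remains. A compact user-space model of that move-usage-to-parent-until-empty loop; the counter struct is an assumption standing in for res_counter:

#include <stdio.h>

struct counter { long usage; struct counter *parent; };

/* Move one page worth of usage from child to parent, like
 * hugetlb_cgroup_move_parent(): the parent absorbs what the child drops. */
static void move_parent(struct counter *child, long sz)
{
        child->parent->usage += sz;
        child->usage -= sz;
}

int main(void)
{
        struct counter root  = { 0, NULL };
        struct counter child = { 3, &root };    /* three charged hugepages */

        while (child.usage > 0)                 /* pre_destroy() loops until empty */
                move_parent(&child, 1);

        printf("child=%ld root=%ld\n", child.usage, root.usage);
        return 0;
}
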
184int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
185 struct hugetlb_cgroup **ptr)
186{
187 int ret = 0;
188 struct res_counter *fail_res;
189 struct hugetlb_cgroup *h_cg = NULL;
190 unsigned long csize = nr_pages * PAGE_SIZE;
191
192 if (hugetlb_cgroup_disabled())
193 goto done;
194 /*
 195	 * We don't charge any cgroup if the compound page has fewer
 196	 * than 3 pages.
197 */
198 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
199 goto done;
200again:
201 rcu_read_lock();
202 h_cg = hugetlb_cgroup_from_task(current);
203 if (!css_tryget(&h_cg->css)) {
204 rcu_read_unlock();
205 goto again;
206 }
207 rcu_read_unlock();
208
209 ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
210 css_put(&h_cg->css);
211done:
212 *ptr = h_cg;
213 return ret;
214}
215
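
hugetlb_cgroup_charge_cgroup() above resolves the current task's cgroup under RCU and then charges nr_pages * PAGE_SIZE against the per-hstate res_counter, failing when the limit would be exceeded. A toy counter showing that charge/limit/failcnt behaviour; the struct fields mimic res_counter but are deliberately simplified:

#include <stdio.h>

struct toy_counter { long long usage, limit, failcnt; };

static int toy_charge(struct toy_counter *c, long long sz)
{
        if (c->usage + sz > c->limit) {
                c->failcnt++;
                return -1;              /* caller must not commit the page */
        }
        c->usage += sz;
        return 0;
}

int main(void)
{
        /* limit of one 2MB hugepage */
        struct toy_counter c = { 0, 2 << 20, 0 };

        printf("first charge:  %d\n", toy_charge(&c, 2 << 20));
        printf("second charge: %d (failcnt=%lld)\n", toy_charge(&c, 2 << 20), c.failcnt);
        return 0;
}
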
216/* Should be called with hugetlb_lock held */
217void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
218 struct hugetlb_cgroup *h_cg,
219 struct page *page)
220{
221 if (hugetlb_cgroup_disabled() || !h_cg)
222 return;
223
224 set_hugetlb_cgroup(page, h_cg);
225 return;
226}
227
228/*
229 * Should be called with hugetlb_lock held
230 */
231void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
232 struct page *page)
233{
234 struct hugetlb_cgroup *h_cg;
235 unsigned long csize = nr_pages * PAGE_SIZE;
236
237 if (hugetlb_cgroup_disabled())
238 return;
239 VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
240 h_cg = hugetlb_cgroup_from_page(page);
241 if (unlikely(!h_cg))
242 return;
243 set_hugetlb_cgroup(page, NULL);
244 res_counter_uncharge(&h_cg->hugepage[idx], csize);
245 return;
246}
247
248void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
249 struct hugetlb_cgroup *h_cg)
250{
251 unsigned long csize = nr_pages * PAGE_SIZE;
252
253 if (hugetlb_cgroup_disabled() || !h_cg)
254 return;
255
256 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
257 return;
258
259 res_counter_uncharge(&h_cg->hugepage[idx], csize);
260 return;
261}
262
263static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
264 struct file *file, char __user *buf,
265 size_t nbytes, loff_t *ppos)
266{
267 u64 val;
268 char str[64];
269 int idx, name, len;
270 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
271
272 idx = MEMFILE_IDX(cft->private);
273 name = MEMFILE_ATTR(cft->private);
274
275 val = res_counter_read_u64(&h_cg->hugepage[idx], name);
276 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
277 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
278}
279
280static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
281 const char *buffer)
282{
283 int idx, name, ret;
284 unsigned long long val;
285 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
286
287 idx = MEMFILE_IDX(cft->private);
288 name = MEMFILE_ATTR(cft->private);
289
290 switch (name) {
291 case RES_LIMIT:
292 if (hugetlb_cgroup_is_root(h_cg)) {
293 /* Can't set limit on root */
294 ret = -EINVAL;
295 break;
296 }
 297		/* This function does all the necessary parsing; reuse it */
298 ret = res_counter_memparse_write_strategy(buffer, &val);
299 if (ret)
300 break;
301 ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
302 break;
303 default:
304 ret = -EINVAL;
305 break;
306 }
307 return ret;
308}
309
310static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event)
311{
312 int idx, name, ret = 0;
313 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
314
315 idx = MEMFILE_IDX(event);
316 name = MEMFILE_ATTR(event);
317
318 switch (name) {
319 case RES_MAX_USAGE:
320 res_counter_reset_max(&h_cg->hugepage[idx]);
321 break;
322 case RES_FAILCNT:
323 res_counter_reset_failcnt(&h_cg->hugepage[idx]);
324 break;
325 default:
326 ret = -EINVAL;
327 break;
328 }
329 return ret;
330}
331
332static char *mem_fmt(char *buf, int size, unsigned long hsize)
333{
334 if (hsize >= (1UL << 30))
335 snprintf(buf, size, "%luGB", hsize >> 30);
336 else if (hsize >= (1UL << 20))
337 snprintf(buf, size, "%luMB", hsize >> 20);
338 else
339 snprintf(buf, size, "%luKB", hsize >> 10);
340 return buf;
341}
342
343int __init hugetlb_cgroup_file_init(int idx)
344{
345 char buf[32];
346 struct cftype *cft;
347 struct hstate *h = &hstates[idx];
348
349 /* format the size */
350 mem_fmt(buf, 32, huge_page_size(h));
351
352 /* Add the limit file */
353 cft = &h->cgroup_files[0];
354 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
355 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
356 cft->read = hugetlb_cgroup_read;
357 cft->write_string = hugetlb_cgroup_write;
358
359 /* Add the usage file */
360 cft = &h->cgroup_files[1];
361 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
362 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
363 cft->read = hugetlb_cgroup_read;
364
365 /* Add the MAX usage file */
366 cft = &h->cgroup_files[2];
367 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
368 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
369 cft->trigger = hugetlb_cgroup_reset;
370 cft->read = hugetlb_cgroup_read;
371
 372	/* Add the failcnt file */
373 cft = &h->cgroup_files[3];
374 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
375 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
376 cft->trigger = hugetlb_cgroup_reset;
377 cft->read = hugetlb_cgroup_read;
378
379 /* NULL terminate the last cft */
380 cft = &h->cgroup_files[4];
381 memset(cft, 0, sizeof(*cft));
382
383 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
384
385 return 0;
386}
387
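
hugetlb_cgroup_file_init() above derives the control-file names from the hstate's page size, so a 2MB hstate ends up exposing files such as hugetlb.2MB.limit_in_bytes, hugetlb.2MB.usage_in_bytes, hugetlb.2MB.max_usage_in_bytes and hugetlb.2MB.failcnt. A small reproduction of the mem_fmt()-based naming; that the "hugetlb." prefix is added by the cgroup core from the subsystem name is taken as given here:

#include <stdio.h>

static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
        if (hsize >= (1UL << 30))
                snprintf(buf, size, "%luGB", hsize >> 30);
        else if (hsize >= (1UL << 20))
                snprintf(buf, size, "%luMB", hsize >> 20);
        else
                snprintf(buf, size, "%luKB", hsize >> 10);
        return buf;
}

int main(void)
{
        const char *suffix[] = { "limit_in_bytes", "usage_in_bytes",
                                 "max_usage_in_bytes", "failcnt" };
        char buf[32], name[64];

        mem_fmt(buf, sizeof(buf), 2UL << 20);   /* a 2MB hstate */
        for (int i = 0; i < 4; i++) {
                snprintf(name, sizeof(name), "hugetlb.%s.%s", buf, suffix[i]);
                printf("%s\n", name);
        }
        return 0;
}
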
388/*
389 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
390 * when we migrate hugepages
391 */
392void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
393{
394 struct hugetlb_cgroup *h_cg;
395 struct hstate *h = page_hstate(oldhpage);
396
397 if (hugetlb_cgroup_disabled())
398 return;
399
400 VM_BUG_ON(!PageHuge(oldhpage));
401 spin_lock(&hugetlb_lock);
402 h_cg = hugetlb_cgroup_from_page(oldhpage);
403 set_hugetlb_cgroup(oldhpage, NULL);
404
405 /* move the h_cg details to new cgroup */
406 set_hugetlb_cgroup(newhpage, h_cg);
407 list_move(&newhpage->lru, &h->hugepage_activelist);
408 spin_unlock(&hugetlb_lock);
409 return;
410}
411
412struct cgroup_subsys hugetlb_subsys = {
413 .name = "hugetlb",
414 .create = hugetlb_cgroup_create,
415 .pre_destroy = hugetlb_cgroup_pre_destroy,
416 .destroy = hugetlb_cgroup_destroy,
417 .subsys_id = hugetlb_subsys_id,
418};
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index cc448bb983ba..3a61efc518d5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -123,7 +123,7 @@ static int pfn_inject_init(void)
123 if (!dentry) 123 if (!dentry)
124 goto fail; 124 goto fail;
125 125
126#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 126#ifdef CONFIG_MEMCG_SWAP
127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, 127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
128 hwpoison_dir, &hwpoison_filter_memcg); 128 hwpoison_dir, &hwpoison_filter_memcg);
129 if (!dentry) 129 if (!dentry)
diff --git a/mm/internal.h b/mm/internal.h
index 2ba87fbfb75b..3314f79d775a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,8 +118,14 @@ struct compact_control {
118 unsigned long nr_freepages; /* Number of isolated free pages */ 118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */ 119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */ 120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long start_free_pfn; /* where we started the search */
121 unsigned long migrate_pfn; /* isolate_migratepages search base */ 122 unsigned long migrate_pfn; /* isolate_migratepages search base */
122 bool sync; /* Synchronous migration */ 123 bool sync; /* Synchronous migration */
124 bool wrapped; /* Order > 0 compactions are
125 incremental, once free_pfn
126 and migrate_pfn meet, we restart
127 from the top of the zone;
128 remember we wrapped around. */
123 129
124 int order; /* order a direct compactor needs */ 130 int order; /* order a direct compactor needs */
125 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 131 int migratetype; /* MOVABLE, RECLAIMABLE etc */
@@ -347,3 +353,5 @@ extern u32 hwpoison_filter_enable;
347extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, 353extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
348 unsigned long, unsigned long, 354 unsigned long, unsigned long,
349 unsigned long, unsigned long); 355 unsigned long, unsigned long);
356
357extern void set_pageblock_order(void);
diff --git a/mm/memblock.c b/mm/memblock.c
index 5cc6731b00cc..4d9393c7edc9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -222,13 +222,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
222 /* Try to find some space for it. 222 /* Try to find some space for it.
223 * 223 *
224 * WARNING: We assume that either slab_is_available() and we use it or 224 * WARNING: We assume that either slab_is_available() and we use it or
225 * we use MEMBLOCK for allocations. That means that this is unsafe to use 225 * we use MEMBLOCK for allocations. That means that this is unsafe to
226 * when bootmem is currently active (unless bootmem itself is implemented 226 * use when bootmem is currently active (unless bootmem itself is
227 * on top of MEMBLOCK which isn't the case yet) 227 * implemented on top of MEMBLOCK which isn't the case yet)
228 * 228 *
229 * This should however not be an issue for now, as we currently only 229 * This should however not be an issue for now, as we currently only
230 * call into MEMBLOCK while it's still active, or much later when slab is 230 * call into MEMBLOCK while it's still active, or much later when slab
231 * active for memory hotplug operations 231 * is active for memory hotplug operations
232 */ 232 */
233 if (use_slab) { 233 if (use_slab) {
234 new_array = kmalloc(new_size, GFP_KERNEL); 234 new_array = kmalloc(new_size, GFP_KERNEL);
@@ -243,8 +243,8 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
243 new_alloc_size, PAGE_SIZE); 243 new_alloc_size, PAGE_SIZE);
244 if (!addr && new_area_size) 244 if (!addr && new_area_size)
245 addr = memblock_find_in_range(0, 245 addr = memblock_find_in_range(0,
246 min(new_area_start, memblock.current_limit), 246 min(new_area_start, memblock.current_limit),
247 new_alloc_size, PAGE_SIZE); 247 new_alloc_size, PAGE_SIZE);
248 248
249 new_array = addr ? __va(addr) : 0; 249 new_array = addr ? __va(addr) : 0;
250 } 250 }
@@ -254,12 +254,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
254 return -1; 254 return -1;
255 } 255 }
256 256
257 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", 257 memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
258 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); 258 memblock_type_name(type), type->max * 2, (u64)addr,
259 (u64)addr + new_size - 1);
259 260
260 /* Found space, we now need to move the array over before 261 /*
261 * we add the reserved region since it may be our reserved 262 * Found space, we now need to move the array over before we add the
262 * array itself that is full. 263 * reserved region since it may be our reserved array itself that is
264 * full.
263 */ 265 */
264 memcpy(new_array, type->regions, old_size); 266 memcpy(new_array, type->regions, old_size);
265 memset(new_array + type->max, 0, old_size); 267 memset(new_array + type->max, 0, old_size);
@@ -267,17 +269,16 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
267 type->regions = new_array; 269 type->regions = new_array;
268 type->max <<= 1; 270 type->max <<= 1;
269 271
270 /* Free old array. We needn't free it if the array is the 272 /* Free old array. We needn't free it if the array is the static one */
271 * static one
272 */
273 if (*in_slab) 273 if (*in_slab)
274 kfree(old_array); 274 kfree(old_array);
275 else if (old_array != memblock_memory_init_regions && 275 else if (old_array != memblock_memory_init_regions &&
276 old_array != memblock_reserved_init_regions) 276 old_array != memblock_reserved_init_regions)
277 memblock_free(__pa(old_array), old_alloc_size); 277 memblock_free(__pa(old_array), old_alloc_size);
278 278
279 /* Reserve the new array if that comes from the memblock. 279 /*
280 * Otherwise, we needn't do it 280 * Reserve the new array if that comes from the memblock. Otherwise, we
281 * needn't do it
281 */ 282 */
282 if (!use_slab) 283 if (!use_slab)
283 BUG_ON(memblock_reserve(addr, new_alloc_size)); 284 BUG_ON(memblock_reserve(addr, new_alloc_size));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f72b5e52451a..795e525afaba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,12 +61,12 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly;
61#define MEM_CGROUP_RECLAIM_RETRIES 5 61#define MEM_CGROUP_RECLAIM_RETRIES 5
62static struct mem_cgroup *root_mem_cgroup __read_mostly; 62static struct mem_cgroup *root_mem_cgroup __read_mostly;
63 63
64#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 64#ifdef CONFIG_MEMCG_SWAP
65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
66int do_swap_account __read_mostly; 66int do_swap_account __read_mostly;
67 67
 68/* for remembering the boot option */	 68/* for remembering the boot option */
69#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED 69#ifdef CONFIG_MEMCG_SWAP_ENABLED
70static int really_do_swap_account __initdata = 1; 70static int really_do_swap_account __initdata = 1;
71#else 71#else
72static int really_do_swap_account __initdata = 0; 72static int really_do_swap_account __initdata = 0;
@@ -87,7 +87,7 @@ enum mem_cgroup_stat_index {
87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 90 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
91 MEM_CGROUP_STAT_NSTATS, 91 MEM_CGROUP_STAT_NSTATS,
92}; 92};
93 93
@@ -378,9 +378,7 @@ static bool move_file(void)
378 378
379enum charge_type { 379enum charge_type {
380 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 380 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
381 MEM_CGROUP_CHARGE_TYPE_MAPPED, 381 MEM_CGROUP_CHARGE_TYPE_ANON,
382 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
383 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
384 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 382 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
385 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 383 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
386 NR_CHARGE_TYPE, 384 NR_CHARGE_TYPE,
@@ -407,8 +405,14 @@ enum charge_type {
407static void mem_cgroup_get(struct mem_cgroup *memcg); 405static void mem_cgroup_get(struct mem_cgroup *memcg);
408static void mem_cgroup_put(struct mem_cgroup *memcg); 406static void mem_cgroup_put(struct mem_cgroup *memcg);
409 407
408static inline
409struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
410{
411 return container_of(s, struct mem_cgroup, css);
412}
413
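
The new mem_cgroup_from_css() helper above is just a container_of() wrapper: it recovers the enclosing struct mem_cgroup from its embedded css member, and the later hunks replace the open-coded container_of() calls with it. A self-contained illustration of that pattern; the struct names below are stand-ins, not the kernel definitions, and the macro is a simplified form of container_of():

#include <stdio.h>
#include <stddef.h>

/* Simplified container_of(): subtract the member's offset from the member pointer. */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct css   { int refcnt; };                    /* embedded member */
struct memcg { long usage; struct css css; };    /* enclosing object */

static struct memcg *memcg_from_css(struct css *s)
{
        return container_of(s, struct memcg, css);
}

int main(void)
{
        struct memcg m = { .usage = 42 };
        struct css *s = &m.css;                  /* only the member pointer is passed around */

        printf("usage via css: %ld\n", memcg_from_css(s)->usage);
        return 0;
}
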
410/* Writing them here to avoid exposing memcg's inner layout */ 414/* Writing them here to avoid exposing memcg's inner layout */
411#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 415#ifdef CONFIG_MEMCG_KMEM
412#include <net/sock.h> 416#include <net/sock.h>
413#include <net/ip.h> 417#include <net/ip.h>
414 418
@@ -467,9 +471,9 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
467} 471}
468EXPORT_SYMBOL(tcp_proto_cgroup); 472EXPORT_SYMBOL(tcp_proto_cgroup);
469#endif /* CONFIG_INET */ 473#endif /* CONFIG_INET */
470#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ 474#endif /* CONFIG_MEMCG_KMEM */
471 475
472#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) 476#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
473static void disarm_sock_keys(struct mem_cgroup *memcg) 477static void disarm_sock_keys(struct mem_cgroup *memcg)
474{ 478{
475 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) 479 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -703,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
703 bool charge) 707 bool charge)
704{ 708{
705 int val = (charge) ? 1 : -1; 709 int val = (charge) ? 1 : -1;
706 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 710 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
707} 711}
708 712
709static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 713static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
@@ -864,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
864 868
865struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 869struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
866{ 870{
867 return container_of(cgroup_subsys_state(cont, 871 return mem_cgroup_from_css(
868 mem_cgroup_subsys_id), struct mem_cgroup, 872 cgroup_subsys_state(cont, mem_cgroup_subsys_id));
869 css);
870} 873}
871 874
872struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 875struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -879,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
879 if (unlikely(!p)) 882 if (unlikely(!p))
880 return NULL; 883 return NULL;
881 884
882 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 885 return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
883 struct mem_cgroup, css);
884} 886}
885 887
886struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 888struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -966,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
966 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); 968 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
967 if (css) { 969 if (css) {
968 if (css == &root->css || css_tryget(css)) 970 if (css == &root->css || css_tryget(css))
969 memcg = container_of(css, 971 memcg = mem_cgroup_from_css(css);
970 struct mem_cgroup, css);
971 } else 972 } else
972 id = 0; 973 id = 0;
973 rcu_read_unlock(); 974 rcu_read_unlock();
@@ -1454,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1454/* 1455/*
1455 * Return the memory (and swap, if configured) limit for a memcg. 1456 * Return the memory (and swap, if configured) limit for a memcg.
1456 */ 1457 */
1457u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1458static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1458{ 1459{
1459 u64 limit; 1460 u64 limit;
1460 u64 memsw; 1461 u64 memsw;
@@ -1470,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1470 return min(limit, memsw); 1471 return min(limit, memsw);
1471} 1472}
1472 1473
1474void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1475 int order)
1476{
1477 struct mem_cgroup *iter;
1478 unsigned long chosen_points = 0;
1479 unsigned long totalpages;
1480 unsigned int points = 0;
1481 struct task_struct *chosen = NULL;
1482
1483 /*
1484 * If current has a pending SIGKILL, then automatically select it. The
1485 * goal is to allow it to allocate so that it may quickly exit and free
1486 * its memory.
1487 */
1488 if (fatal_signal_pending(current)) {
1489 set_thread_flag(TIF_MEMDIE);
1490 return;
1491 }
1492
1493 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1494 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1495 for_each_mem_cgroup_tree(iter, memcg) {
1496 struct cgroup *cgroup = iter->css.cgroup;
1497 struct cgroup_iter it;
1498 struct task_struct *task;
1499
1500 cgroup_iter_start(cgroup, &it);
1501 while ((task = cgroup_iter_next(cgroup, &it))) {
1502 switch (oom_scan_process_thread(task, totalpages, NULL,
1503 false)) {
1504 case OOM_SCAN_SELECT:
1505 if (chosen)
1506 put_task_struct(chosen);
1507 chosen = task;
1508 chosen_points = ULONG_MAX;
1509 get_task_struct(chosen);
1510 /* fall through */
1511 case OOM_SCAN_CONTINUE:
1512 continue;
1513 case OOM_SCAN_ABORT:
1514 cgroup_iter_end(cgroup, &it);
1515 mem_cgroup_iter_break(memcg, iter);
1516 if (chosen)
1517 put_task_struct(chosen);
1518 return;
1519 case OOM_SCAN_OK:
1520 break;
1521 };
1522 points = oom_badness(task, memcg, NULL, totalpages);
1523 if (points > chosen_points) {
1524 if (chosen)
1525 put_task_struct(chosen);
1526 chosen = task;
1527 chosen_points = points;
1528 get_task_struct(chosen);
1529 }
1530 }
1531 cgroup_iter_end(cgroup, &it);
1532 }
1533
1534 if (!chosen)
1535 return;
1536 points = chosen_points * 1000 / totalpages;
1537 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1538 NULL, "Memory cgroup out of memory");
1539}
1540
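
mem_cgroup_out_of_memory() above walks every task in the memcg hierarchy, scores each with oom_badness(), and remembers the highest-scoring candidate, taking and dropping task references as the choice changes. The selection reduces to a keep-the-maximum scan, sketched here with plain structs instead of task_structs; the task names and point values are invented for the example:

#include <stdio.h>

struct task { const char *name; unsigned long points; };

int main(void)
{
        struct task tasks[] = {
                { "worker", 120 }, { "db", 900 }, { "shell", 10 },
        };
        struct task *chosen = NULL;
        unsigned long chosen_points = 0;

        /* Same shape as the cgroup_iter loop: score each task, keep the max. */
        for (int i = 0; i < 3; i++) {
                unsigned long points = tasks[i].points;   /* stand-in for oom_badness() */
                if (points > chosen_points) {
                        chosen = &tasks[i];
                        chosen_points = points;
                }
        }

        if (chosen)
                printf("would OOM-kill %s (points=%lu)\n", chosen->name, chosen_points);
        return 0;
}
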
1473static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1541static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1474 gfp_t gfp_mask, 1542 gfp_t gfp_mask,
1475 unsigned long flags) 1543 unsigned long flags)
@@ -1899,7 +1967,7 @@ again:
1899 return; 1967 return;
1900 /* 1968 /*
1901 * If this memory cgroup is not under account moving, we don't 1969 * If this memory cgroup is not under account moving, we don't
1902 * need to take move_lock_page_cgroup(). Because we already hold 1970 * need to take move_lock_mem_cgroup(). Because we already hold
1903 * rcu_read_lock(), any calls to move_account will be delayed until 1971 * rcu_read_lock(), any calls to move_account will be delayed until
1904 * rcu_read_unlock() if mem_cgroup_stolen() == true. 1972 * rcu_read_unlock() if mem_cgroup_stolen() == true.
1905 */ 1973 */
@@ -1921,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1921 /* 1989 /*
1922 * It's guaranteed that pc->mem_cgroup never changes while 1990 * It's guaranteed that pc->mem_cgroup never changes while
1923	 * lock is held because a routine that modifies pc->mem_cgroup	1991	 * lock is held because a routine that modifies pc->mem_cgroup
1924 * should take move_lock_page_cgroup(). 1992 * should take move_lock_mem_cgroup().
1925 */ 1993 */
1926 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 1994 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1927} 1995}
@@ -2268,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2268 * We always charge the cgroup the mm_struct belongs to. 2336 * We always charge the cgroup the mm_struct belongs to.
2269 * The mm_struct's mem_cgroup changes on task migration if the 2337 * The mm_struct's mem_cgroup changes on task migration if the
2270 * thread group leader migrates. It's possible that mm is not 2338 * thread group leader migrates. It's possible that mm is not
2271 * set, if so charge the init_mm (happens for pagecache usage). 2339 * set, if so charge the root memcg (happens for pagecache usage).
2272 */ 2340 */
2273 if (!*ptr && !mm) 2341 if (!*ptr && !mm)
2274 *ptr = root_mem_cgroup; 2342 *ptr = root_mem_cgroup;
@@ -2429,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2429 css = css_lookup(&mem_cgroup_subsys, id); 2497 css = css_lookup(&mem_cgroup_subsys, id);
2430 if (!css) 2498 if (!css)
2431 return NULL; 2499 return NULL;
2432 return container_of(css, struct mem_cgroup, css); 2500 return mem_cgroup_from_css(css);
2433} 2501}
2434 2502
2435struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2503struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2473,11 +2541,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2473 bool anon; 2541 bool anon;
2474 2542
2475 lock_page_cgroup(pc); 2543 lock_page_cgroup(pc);
2476 if (unlikely(PageCgroupUsed(pc))) { 2544 VM_BUG_ON(PageCgroupUsed(pc));
2477 unlock_page_cgroup(pc);
2478 __mem_cgroup_cancel_charge(memcg, nr_pages);
2479 return;
2480 }
2481 /* 2545 /*
2482	 * we don't need page_cgroup_lock about tail pages, because they are not	2546	 * we don't need page_cgroup_lock about tail pages, because they are not
2483 * accessed by any other context at this point. 2547 * accessed by any other context at this point.
@@ -2519,7 +2583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2519 spin_unlock_irq(&zone->lru_lock); 2583 spin_unlock_irq(&zone->lru_lock);
2520 } 2584 }
2521 2585
2522 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2586 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2523 anon = true; 2587 anon = true;
2524 else 2588 else
2525 anon = false; 2589 anon = false;
@@ -2644,8 +2708,7 @@ out:
2644 2708
2645static int mem_cgroup_move_parent(struct page *page, 2709static int mem_cgroup_move_parent(struct page *page,
2646 struct page_cgroup *pc, 2710 struct page_cgroup *pc,
2647 struct mem_cgroup *child, 2711 struct mem_cgroup *child)
2648 gfp_t gfp_mask)
2649{ 2712{
2650 struct mem_cgroup *parent; 2713 struct mem_cgroup *parent;
2651 unsigned int nr_pages; 2714 unsigned int nr_pages;
@@ -2728,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page,
2728 VM_BUG_ON(page->mapping && !PageAnon(page)); 2791 VM_BUG_ON(page->mapping && !PageAnon(page));
2729 VM_BUG_ON(!mm); 2792 VM_BUG_ON(!mm);
2730 return mem_cgroup_charge_common(page, mm, gfp_mask, 2793 return mem_cgroup_charge_common(page, mm, gfp_mask,
2731 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2794 MEM_CGROUP_CHARGE_TYPE_ANON);
2732}
2733
2734static void
2735__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2736 enum charge_type ctype);
2737
2738int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2739 gfp_t gfp_mask)
2740{
2741 struct mem_cgroup *memcg = NULL;
2742 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2743 int ret;
2744
2745 if (mem_cgroup_disabled())
2746 return 0;
2747 if (PageCompound(page))
2748 return 0;
2749
2750 if (unlikely(!mm))
2751 mm = &init_mm;
2752 if (!page_is_file_cache(page))
2753 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2754
2755 if (!PageSwapCache(page))
2756 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2757 else { /* page is swapcache/shmem */
2758 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2759 if (!ret)
2760 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2761 }
2762 return ret;
2763} 2795}
2764 2796
2765/* 2797/*
@@ -2768,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2768 * struct page_cgroup is acquired. This refcnt will be consumed by 2800 * struct page_cgroup is acquired. This refcnt will be consumed by
2769 * "commit()" or removed by "cancel()" 2801 * "commit()" or removed by "cancel()"
2770 */ 2802 */
2771int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2803static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2772 struct page *page, 2804 struct page *page,
2773 gfp_t mask, struct mem_cgroup **memcgp) 2805 gfp_t mask,
2806 struct mem_cgroup **memcgp)
2774{ 2807{
2775 struct mem_cgroup *memcg; 2808 struct mem_cgroup *memcg;
2809 struct page_cgroup *pc;
2776 int ret; 2810 int ret;
2777 2811
2778 *memcgp = NULL; 2812 pc = lookup_page_cgroup(page);
2779
2780 if (mem_cgroup_disabled())
2781 return 0;
2782
2783 if (!do_swap_account)
2784 goto charge_cur_mm;
2785 /* 2813 /*
2786 * A racing thread's fault, or swapoff, may have already updated 2814 * Every swap fault against a single page tries to charge the
2787	 * the pte, and even removed page from swap cache: in those cases	2815	 * page, so bail as early as possible. shmem_unuse() encounters
2788 * do_swap_page()'s pte_same() test will fail; but there's also a 2816 * already charged pages, too. The USED bit is protected by
2789 * KSM case which does need to charge the page. 2817 * the page lock, which serializes swap cache removal, which
2818 * in turn serializes uncharging.
2790 */ 2819 */
2791 if (!PageSwapCache(page)) 2820 if (PageCgroupUsed(pc))
2821 return 0;
2822 if (!do_swap_account)
2792 goto charge_cur_mm; 2823 goto charge_cur_mm;
2793 memcg = try_get_mem_cgroup_from_page(page); 2824 memcg = try_get_mem_cgroup_from_page(page);
2794 if (!memcg) 2825 if (!memcg)
@@ -2800,14 +2831,44 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2800 ret = 0; 2831 ret = 0;
2801 return ret; 2832 return ret;
2802charge_cur_mm: 2833charge_cur_mm:
2803 if (unlikely(!mm))
2804 mm = &init_mm;
2805 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 2834 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2806 if (ret == -EINTR) 2835 if (ret == -EINTR)
2807 ret = 0; 2836 ret = 0;
2808 return ret; 2837 return ret;
2809} 2838}
2810 2839
2840int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
2841 gfp_t gfp_mask, struct mem_cgroup **memcgp)
2842{
2843 *memcgp = NULL;
2844 if (mem_cgroup_disabled())
2845 return 0;
2846 /*
2847 * A racing thread's fault, or swapoff, may have already
2848 * updated the pte, and even removed page from swap cache: in
2849 * those cases unuse_pte()'s pte_same() test will fail; but
2850 * there's also a KSM case which does need to charge the page.
2851 */
2852 if (!PageSwapCache(page)) {
2853 int ret;
2854
2855 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
2856 if (ret == -EINTR)
2857 ret = 0;
2858 return ret;
2859 }
2860 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
2861}
2862
2863void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2864{
2865 if (mem_cgroup_disabled())
2866 return;
2867 if (!memcg)
2868 return;
2869 __mem_cgroup_cancel_charge(memcg, 1);
2870}
2871
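
mem_cgroup_try_charge_swapin(), mem_cgroup_commit_charge_swapin() and the relocated mem_cgroup_cancel_charge_swapin() above form a two-phase protocol: a charge is first reserved against a memcg, then either committed to the page once the fault succeeds or cancelled if it does not. A minimal two-phase sketch; the function names and single global counter are illustrative, not the memcg API:

#include <stdio.h>

static long usage;

static int  try_charge(void)                 { usage += 1; return 0; }  /* reserve */
static void cancel_charge(void)              { usage -= 1; }            /* undo the reservation */
static void commit_charge(long *page_owner)  { *page_owner = 1; }       /* bind it to the page */

int main(void)
{
        long page_charged = 0;

        /* Successful fault: try, then commit. */
        if (!try_charge())
                commit_charge(&page_charged);

        /* Failed fault: try, then cancel instead of commit. */
        if (!try_charge())
                cancel_charge();

        printf("usage=%ld page_charged=%ld\n", usage, page_charged);
        return 0;
}
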
2811static void 2872static void
2812__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2873__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2813 enum charge_type ctype) 2874 enum charge_type ctype)
@@ -2842,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
2842 struct mem_cgroup *memcg) 2903 struct mem_cgroup *memcg)
2843{ 2904{
2844 __mem_cgroup_commit_charge_swapin(page, memcg, 2905 __mem_cgroup_commit_charge_swapin(page, memcg,
2845 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2906 MEM_CGROUP_CHARGE_TYPE_ANON);
2846} 2907}
2847 2908
2848void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 2909int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2910 gfp_t gfp_mask)
2849{ 2911{
2912 struct mem_cgroup *memcg = NULL;
2913 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2914 int ret;
2915
2850 if (mem_cgroup_disabled()) 2916 if (mem_cgroup_disabled())
2851 return; 2917 return 0;
2852 if (!memcg) 2918 if (PageCompound(page))
2853 return; 2919 return 0;
2854 __mem_cgroup_cancel_charge(memcg, 1); 2920
2921 if (!PageSwapCache(page))
2922 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2923 else { /* page is swapcache/shmem */
2924 ret = __mem_cgroup_try_charge_swapin(mm, page,
2925 gfp_mask, &memcg);
2926 if (!ret)
2927 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2928 }
2929 return ret;
2855} 2930}
2856 2931
2857static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 2932static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -2911,7 +2986,8 @@ direct_uncharge:
2911 * uncharge if !page_mapped(page) 2986 * uncharge if !page_mapped(page)
2912 */ 2987 */
2913static struct mem_cgroup * 2988static struct mem_cgroup *
2914__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2989__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
2990 bool end_migration)
2915{ 2991{
2916 struct mem_cgroup *memcg = NULL; 2992 struct mem_cgroup *memcg = NULL;
2917 unsigned int nr_pages = 1; 2993 unsigned int nr_pages = 1;
@@ -2921,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2921 if (mem_cgroup_disabled()) 2997 if (mem_cgroup_disabled())
2922 return NULL; 2998 return NULL;
2923 2999
2924 if (PageSwapCache(page)) 3000 VM_BUG_ON(PageSwapCache(page));
2925 return NULL;
2926 3001
2927 if (PageTransHuge(page)) { 3002 if (PageTransHuge(page)) {
2928 nr_pages <<= compound_order(page); 3003 nr_pages <<= compound_order(page);
@@ -2945,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2945 anon = PageAnon(page); 3020 anon = PageAnon(page);
2946 3021
2947 switch (ctype) { 3022 switch (ctype) {
2948 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 3023 case MEM_CGROUP_CHARGE_TYPE_ANON:
2949 /* 3024 /*
2950 * Generally PageAnon tells if it's the anon statistics to be 3025 * Generally PageAnon tells if it's the anon statistics to be
2951 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 3026 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
@@ -2955,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2955 /* fallthrough */ 3030 /* fallthrough */
2956 case MEM_CGROUP_CHARGE_TYPE_DROP: 3031 case MEM_CGROUP_CHARGE_TYPE_DROP:
2957 /* See mem_cgroup_prepare_migration() */ 3032 /* See mem_cgroup_prepare_migration() */
2958 if (page_mapped(page) || PageCgroupMigration(pc)) 3033 if (page_mapped(page))
3034 goto unlock_out;
3035 /*
3036 * Pages under migration may not be uncharged. But
3037 * end_migration() /must/ be the one uncharging the
3038 * unused post-migration page and so it has to call
3039 * here with the migration bit still set. See the
3040 * res_counter handling below.
3041 */
3042 if (!end_migration && PageCgroupMigration(pc))
2959 goto unlock_out; 3043 goto unlock_out;
2960 break; 3044 break;
2961 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 3045 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -2989,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2989 mem_cgroup_swap_statistics(memcg, true); 3073 mem_cgroup_swap_statistics(memcg, true);
2990 mem_cgroup_get(memcg); 3074 mem_cgroup_get(memcg);
2991 } 3075 }
2992 if (!mem_cgroup_is_root(memcg)) 3076 /*
3077 * Migration does not charge the res_counter for the
3078 * replacement page, so leave it alone when phasing out the
3079 * page that is unused after the migration.
3080 */
3081 if (!end_migration && !mem_cgroup_is_root(memcg))
2993 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 3082 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
2994 3083
2995 return memcg; 3084 return memcg;
@@ -3005,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page)
3005 if (page_mapped(page)) 3094 if (page_mapped(page))
3006 return; 3095 return;
3007 VM_BUG_ON(page->mapping && !PageAnon(page)); 3096 VM_BUG_ON(page->mapping && !PageAnon(page));
3008 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3097 if (PageSwapCache(page))
3098 return;
3099 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
3009} 3100}
3010 3101
3011void mem_cgroup_uncharge_cache_page(struct page *page) 3102void mem_cgroup_uncharge_cache_page(struct page *page)
3012{ 3103{
3013 VM_BUG_ON(page_mapped(page)); 3104 VM_BUG_ON(page_mapped(page));
3014 VM_BUG_ON(page->mapping); 3105 VM_BUG_ON(page->mapping);
3015 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3106 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
3016} 3107}
3017 3108
3018/* 3109/*
@@ -3076,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3076 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3167 if (!swapout) /* this was a swap cache but the swap is unused ! */
3077 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3168 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3078 3169
3079 memcg = __mem_cgroup_uncharge_common(page, ctype); 3170 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
3080 3171
3081 /* 3172 /*
3082 * record memcg information, if swapout && memcg != NULL, 3173 * record memcg information, if swapout && memcg != NULL,
@@ -3087,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3087} 3178}
3088#endif 3179#endif
3089 3180
3090#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3181#ifdef CONFIG_MEMCG_SWAP
3091/* 3182/*
3092 * called from swap_entry_free(). remove record in swap_cgroup and 3183 * called from swap_entry_free(). remove record in swap_cgroup and
3093 * uncharge "memsw" account. 3184 * uncharge "memsw" account.
@@ -3166,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3166 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3257 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3167 * page belongs to. 3258 * page belongs to.
3168 */ 3259 */
3169int mem_cgroup_prepare_migration(struct page *page, 3260void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3170 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) 3261 struct mem_cgroup **memcgp)
3171{ 3262{
3172 struct mem_cgroup *memcg = NULL; 3263 struct mem_cgroup *memcg = NULL;
3173 struct page_cgroup *pc; 3264 struct page_cgroup *pc;
3174 enum charge_type ctype; 3265 enum charge_type ctype;
3175 int ret = 0;
3176 3266
3177 *memcgp = NULL; 3267 *memcgp = NULL;
3178 3268
3179 VM_BUG_ON(PageTransHuge(page)); 3269 VM_BUG_ON(PageTransHuge(page));
3180 if (mem_cgroup_disabled()) 3270 if (mem_cgroup_disabled())
3181 return 0; 3271 return;
3182 3272
3183 pc = lookup_page_cgroup(page); 3273 pc = lookup_page_cgroup(page);
3184 lock_page_cgroup(pc); 3274 lock_page_cgroup(pc);
@@ -3223,24 +3313,9 @@ int mem_cgroup_prepare_migration(struct page *page,
3223 * we return here. 3313 * we return here.
3224 */ 3314 */
3225 if (!memcg) 3315 if (!memcg)
3226 return 0; 3316 return;
3227 3317
3228 *memcgp = memcg; 3318 *memcgp = memcg;
3229 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
3230 css_put(&memcg->css);/* drop extra refcnt */
3231 if (ret) {
3232 if (PageAnon(page)) {
3233 lock_page_cgroup(pc);
3234 ClearPageCgroupMigration(pc);
3235 unlock_page_cgroup(pc);
3236 /*
3237 * The old page may be fully unmapped while we kept it.
3238 */
3239 mem_cgroup_uncharge_page(page);
3240 }
3241 /* we'll need to revisit this error code (we have -EINTR) */
3242 return -ENOMEM;
3243 }
3244 /* 3319 /*
3245 * We charge new page before it's used/mapped. So, even if unlock_page() 3320 * We charge new page before it's used/mapped. So, even if unlock_page()
3246 * is called before end_migration, we can catch all events on this new 3321 * is called before end_migration, we can catch all events on this new
@@ -3248,13 +3323,15 @@ int mem_cgroup_prepare_migration(struct page *page,
3248 * mapcount will be finally 0 and we call uncharge in end_migration(). 3323 * mapcount will be finally 0 and we call uncharge in end_migration().
3249 */ 3324 */
3250 if (PageAnon(page)) 3325 if (PageAnon(page))
3251 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3326 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
3252 else if (page_is_file_cache(page))
3253 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3254 else 3327 else
3255 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3328 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3329 /*
3330 * The page is committed to the memcg, but it's not actually
3331 * charged to the res_counter since we plan on replacing the
3332 * old one and only one page is going to be left afterwards.
3333 */
3256 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); 3334 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
3257 return ret;
3258} 3335}
3259 3336
3260/* remove redundant charge if migration failed*/ 3337/* remove redundant charge if migration failed*/
@@ -3276,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3276 used = newpage; 3353 used = newpage;
3277 unused = oldpage; 3354 unused = oldpage;
3278 } 3355 }
3356 anon = PageAnon(used);
3357 __mem_cgroup_uncharge_common(unused,
3358 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
3359 : MEM_CGROUP_CHARGE_TYPE_CACHE,
3360 true);
3361 css_put(&memcg->css);
3279 /* 3362 /*
3280 * We disallowed uncharge of pages under migration because mapcount 3363 * We disallowed uncharge of pages under migration because mapcount
3281 * of the page goes down to zero, temporarily. 3364 * of the page goes down to zero, temporarily.
@@ -3285,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3285 lock_page_cgroup(pc); 3368 lock_page_cgroup(pc);
3286 ClearPageCgroupMigration(pc); 3369 ClearPageCgroupMigration(pc);
3287 unlock_page_cgroup(pc); 3370 unlock_page_cgroup(pc);
3288 anon = PageAnon(used);
3289 __mem_cgroup_uncharge_common(unused,
3290 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
3291 : MEM_CGROUP_CHARGE_TYPE_CACHE);
3292 3371
3293 /* 3372 /*
3294 * If a page is a file cache, radix-tree replacement is very atomic 3373 * If a page is a file cache, radix-tree replacement is very atomic
@@ -3340,10 +3419,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3340 */ 3419 */
3341 if (!memcg) 3420 if (!memcg)
3342 return; 3421 return;
3343
3344 if (PageSwapBacked(oldpage))
3345 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3346
3347 /* 3422 /*
3348 * Even if newpage->mapping was NULL before starting replacement, 3423 * Even if newpage->mapping was NULL before starting replacement,
3349 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3424 * the newpage may be on LRU(or pagevec for LRU) already. We lock
@@ -3418,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3418 /* 3493 /*
3419 * Rather than hide all in some function, I do this in 3494 * Rather than hide all in some function, I do this in
3420 * open coded manner. You see what this really does. 3495 * open coded manner. You see what this really does.
3421 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3496 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3422 */ 3497 */
3423 mutex_lock(&set_limit_mutex); 3498 mutex_lock(&set_limit_mutex);
3424 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3499 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3479,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3479 /* 3554 /*
3480 * Rather than hide all in some function, I do this in 3555 * Rather than hide all in some function, I do this in
3481 * open coded manner. You see what this really does. 3556 * open coded manner. You see what this really does.
3482 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3557 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3483 */ 3558 */
3484 mutex_lock(&set_limit_mutex); 3559 mutex_lock(&set_limit_mutex);
3485 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3560 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3611,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3611} 3686}
3612 3687
3613/* 3688/*
3614 * This routine traverse page_cgroup in given list and drop them all. 3689 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3615 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3690 * reclaim the pages themselves - it just removes the page_cgroups.
3691 * Returns true if some page_cgroups were not freed, indicating that the caller
3692 * must retry this operation.
3616 */ 3693 */
3617static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3694static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3618 int node, int zid, enum lru_list lru) 3695 int node, int zid, enum lru_list lru)
3619{ 3696{
3620 struct mem_cgroup_per_zone *mz; 3697 struct mem_cgroup_per_zone *mz;
@@ -3622,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3622 struct list_head *list; 3699 struct list_head *list;
3623 struct page *busy; 3700 struct page *busy;
3624 struct zone *zone; 3701 struct zone *zone;
3625 int ret = 0;
3626 3702
3627 zone = &NODE_DATA(node)->node_zones[zid]; 3703 zone = &NODE_DATA(node)->node_zones[zid];
3628 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3704 mz = mem_cgroup_zoneinfo(memcg, node, zid);
@@ -3636,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3636 struct page_cgroup *pc; 3712 struct page_cgroup *pc;
3637 struct page *page; 3713 struct page *page;
3638 3714
3639 ret = 0;
3640 spin_lock_irqsave(&zone->lru_lock, flags); 3715 spin_lock_irqsave(&zone->lru_lock, flags);
3641 if (list_empty(list)) { 3716 if (list_empty(list)) {
3642 spin_unlock_irqrestore(&zone->lru_lock, flags); 3717 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -3653,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3653 3728
3654 pc = lookup_page_cgroup(page); 3729 pc = lookup_page_cgroup(page);
3655 3730
3656 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3731 if (mem_cgroup_move_parent(page, pc, memcg)) {
3657 if (ret == -ENOMEM || ret == -EINTR)
3658 break;
3659
3660 if (ret == -EBUSY || ret == -EINVAL) {
3661 /* found lock contention or "pc" is obsolete. */ 3732 /* found lock contention or "pc" is obsolete. */
3662 busy = page; 3733 busy = page;
3663 cond_resched(); 3734 cond_resched();
3664 } else 3735 } else
3665 busy = NULL; 3736 busy = NULL;
3666 } 3737 }
3667 3738 return !list_empty(list);
3668 if (!ret && !list_empty(list))
3669 return -EBUSY;
3670 return ret;
3671} 3739}
3672 3740
3673/* 3741/*
@@ -3692,9 +3760,6 @@ move_account:
3692 ret = -EBUSY; 3760 ret = -EBUSY;
3693 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3761 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3694 goto out; 3762 goto out;
3695 ret = -EINTR;
3696 if (signal_pending(current))
3697 goto out;
3698 /* This is for making all *used* pages to be on LRU. */ 3763 /* This is for making all *used* pages to be on LRU. */
3699 lru_add_drain_all(); 3764 lru_add_drain_all();
3700 drain_all_stock_sync(memcg); 3765 drain_all_stock_sync(memcg);
@@ -3715,9 +3780,6 @@ move_account:
3715 } 3780 }
3716 mem_cgroup_end_move(memcg); 3781 mem_cgroup_end_move(memcg);
3717 memcg_oom_recover(memcg); 3782 memcg_oom_recover(memcg);
3718 /* it seems parent cgroup doesn't have enough mem */
3719 if (ret == -ENOMEM)
3720 goto try_to_free;
3721 cond_resched(); 3783 cond_resched();
3722 /* "ret" should also be checked to ensure all lists are empty. */ 3784 /* "ret" should also be checked to ensure all lists are empty. */
3723 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); 3785 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
@@ -3779,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3779 parent_memcg = mem_cgroup_from_cont(parent); 3841 parent_memcg = mem_cgroup_from_cont(parent);
3780 3842
3781 cgroup_lock(); 3843 cgroup_lock();
3844
3845 if (memcg->use_hierarchy == val)
3846 goto out;
3847
3782 /* 3848 /*
3783 * If parent's use_hierarchy is set, we can't make any modifications 3849 * If parent's use_hierarchy is set, we can't make any modifications
3784 * in the child subtrees. If it is unset, then the change can 3850 * in the child subtrees. If it is unset, then the change can
@@ -3795,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3795 retval = -EBUSY; 3861 retval = -EBUSY;
3796 } else 3862 } else
3797 retval = -EINVAL; 3863 retval = -EINVAL;
3864
3865out:
3798 cgroup_unlock(); 3866 cgroup_unlock();
3799 3867
3800 return retval; 3868 return retval;
@@ -3831,7 +3899,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3831 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 3899 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3832 3900
3833 if (swap) 3901 if (swap)
3834 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 3902 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
3835 3903
3836 return val << PAGE_SHIFT; 3904 return val << PAGE_SHIFT;
3837} 3905}
@@ -4015,7 +4083,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4015#endif 4083#endif
4016 4084
4017#ifdef CONFIG_NUMA 4085#ifdef CONFIG_NUMA
4018static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft, 4086static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4019 struct seq_file *m) 4087 struct seq_file *m)
4020{ 4088{
4021 int nid; 4089 int nid;
@@ -4074,7 +4142,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
4074 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 4142 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4075} 4143}
4076 4144
4077static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4145static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
4078 struct seq_file *m) 4146 struct seq_file *m)
4079{ 4147{
4080 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4148 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
@@ -4082,7 +4150,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4082 unsigned int i; 4150 unsigned int i;
4083 4151
4084 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4152 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4085 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) 4153 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4086 continue; 4154 continue;
4087 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 4155 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4088 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 4156 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
@@ -4109,7 +4177,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4109 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4177 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4110 long long val = 0; 4178 long long val = 0;
4111 4179
4112 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) 4180 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4113 continue; 4181 continue;
4114 for_each_mem_cgroup_tree(mi, memcg) 4182 for_each_mem_cgroup_tree(mi, memcg)
4115 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 4183 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
@@ -4533,7 +4601,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4533 return 0; 4601 return 0;
4534} 4602}
4535 4603
4536#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 4604#ifdef CONFIG_MEMCG_KMEM
4537static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4605static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4538{ 4606{
4539 return mem_cgroup_sockets_init(memcg, ss); 4607 return mem_cgroup_sockets_init(memcg, ss);
@@ -4588,7 +4656,7 @@ static struct cftype mem_cgroup_files[] = {
4588 }, 4656 },
4589 { 4657 {
4590 .name = "stat", 4658 .name = "stat",
4591 .read_seq_string = mem_control_stat_show, 4659 .read_seq_string = memcg_stat_show,
4592 }, 4660 },
4593 { 4661 {
4594 .name = "force_empty", 4662 .name = "force_empty",
@@ -4620,10 +4688,10 @@ static struct cftype mem_cgroup_files[] = {
4620#ifdef CONFIG_NUMA 4688#ifdef CONFIG_NUMA
4621 { 4689 {
4622 .name = "numa_stat", 4690 .name = "numa_stat",
4623 .read_seq_string = mem_control_numa_stat_show, 4691 .read_seq_string = memcg_numa_stat_show,
4624 }, 4692 },
4625#endif 4693#endif
4626#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4694#ifdef CONFIG_MEMCG_SWAP
4627 { 4695 {
4628 .name = "memsw.usage_in_bytes", 4696 .name = "memsw.usage_in_bytes",
4629 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4697 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -4810,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4810} 4878}
4811EXPORT_SYMBOL(parent_mem_cgroup); 4879EXPORT_SYMBOL(parent_mem_cgroup);
4812 4880
4813#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4881#ifdef CONFIG_MEMCG_SWAP
4814static void __init enable_swap_cgroup(void) 4882static void __init enable_swap_cgroup(void)
4815{ 4883{
4816 if (!mem_cgroup_disabled() && really_do_swap_account) 4884 if (!mem_cgroup_disabled() && really_do_swap_account)
@@ -5541,7 +5609,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
5541 .__DEPRECATED_clear_css_refs = true, 5609 .__DEPRECATED_clear_css_refs = true,
5542}; 5610};
5543 5611
5544#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5612#ifdef CONFIG_MEMCG_SWAP
5545static int __init enable_swap_account(char *s) 5613static int __init enable_swap_account(char *s)
5546{ 5614{
5547 /* consider enabled if no parameter or 1 is given */ 5615 /* consider enabled if no parameter or 1 is given */
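
The mem_cgroup_force_empty_list() rework above drops the -EBUSY/-ENOMEM/-EINTR plumbing: the helper now only reports whether anything is left on the list, and the force_empty path simply keeps retrying. The following is a minimal userspace sketch of that contract; struct entry, try_move_entry() and drain_list() are invented stand-ins for page_cgroup, mem_cgroup_move_parent() and the list walk, not kernel symbols.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for one LRU entry; the kernel walks struct page_cgroup here. */
struct entry {
        bool moved;
        bool busy;      /* models lock contention seen by the move helper */
};

#define NENTRIES 8
static struct entry list[NENTRIES];

/* Stand-in for mem_cgroup_move_parent(): nonzero means "try again later". */
static int try_move_entry(struct entry *e)
{
        if (e->busy) {
                e->busy = false;        /* contention clears up by the next pass */
                return -1;
        }
        e->moved = true;
        return 0;
}

/*
 * Mirrors the new mem_cgroup_force_empty_list() contract: try to drop
 * every entry and return true if some are left, so the caller retries.
 */
static bool drain_list(void)
{
        bool left = false;

        for (int i = 0; i < NENTRIES; i++) {
                if (list[i].moved)
                        continue;
                if (try_move_entry(&list[i]))
                        left = true;
        }
        return left;
}

int main(void)
{
        list[3].busy = list[5].busy = true;

        int passes = 0;
        while (drain_list())    /* caller loops instead of checking -EBUSY/-ENOMEM */
                passes++;

        printf("list drained after %d retry passes\n", passes);
        return 0;
}

Returning a plain bool keeps the retry policy entirely in the caller, which is what allows the error-code bookkeeping to disappear from mem_cgroup_force_empty() above.
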
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6de0d613bbe6..a6e2141a6610 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p)
128 * can only guarantee that the page either belongs to the memcg tasks, or is 128 * can only guarantee that the page either belongs to the memcg tasks, or is
129 * a freed page. 129 * a freed page.
130 */ 130 */
131#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 131#ifdef CONFIG_MEMCG_SWAP
132u64 hwpoison_filter_memcg; 132u64 hwpoison_filter_memcg;
133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); 133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
134static int hwpoison_filter_task(struct page *p) 134static int hwpoison_filter_task(struct page *p)
@@ -1416,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
1416 int ret; 1416 int ret;
1417 unsigned long pfn = page_to_pfn(page); 1417 unsigned long pfn = page_to_pfn(page);
1418 struct page *hpage = compound_head(page); 1418 struct page *hpage = compound_head(page);
1419 LIST_HEAD(pagelist);
1420 1419
1421 ret = get_any_page(page, pfn, flags); 1420 ret = get_any_page(page, pfn, flags);
1422 if (ret < 0) 1421 if (ret < 0)
@@ -1431,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
1431 } 1430 }
1432 1431
1433 /* Keep page count to indicate a given hugepage is isolated. */ 1432 /* Keep page count to indicate a given hugepage is isolated. */
1434 1433 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
1435 list_add(&hpage->lru, &pagelist);
1436 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false,
1437 MIGRATE_SYNC); 1434 MIGRATE_SYNC);
1435 put_page(hpage);
1438 if (ret) { 1436 if (ret) {
1439 struct page *page1, *page2;
1440 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1441 put_page(page1);
1442
1443 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1437 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1444 pfn, ret, page->flags); 1438 pfn, ret, page->flags);
1445 if (ret > 0)
1446 ret = -EIO;
1447 return ret; 1439 return ret;
1448 } 1440 }
1449done: 1441done:
1450 if (!PageHWPoison(hpage)) 1442 if (!PageHWPoison(hpage))
1451 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); 1443 atomic_long_add(1 << compound_trans_order(hpage),
1444 &mce_bad_pages);
1452 set_page_hwpoison_huge_page(hpage); 1445 set_page_hwpoison_huge_page(hpage);
1453 dequeue_hwpoisoned_huge_page(hpage); 1446 dequeue_hwpoisoned_huge_page(hpage);
1454 /* keep elevated page count for bad page */ 1447 /* keep elevated page count for bad page */
diff --git a/mm/memory.c b/mm/memory.c
index 91f69459d3e8..482f089765ff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1343,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1343 * Since no pte has actually been setup, it is 1343 * Since no pte has actually been setup, it is
1344 * safe to do nothing in this case. 1344 * safe to do nothing in this case.
1345 */ 1345 */
1346 if (vma->vm_file) 1346 if (vma->vm_file) {
1347 unmap_hugepage_range(vma, start, end, NULL); 1347 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1348 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1349 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1350 }
1348 } else 1351 } else
1349 unmap_page_range(tlb, vma, start, end, details); 1352 unmap_page_range(tlb, vma, start, end, details);
1350 } 1353 }
@@ -3938,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
3938 free_page((unsigned long)buf); 3941 free_page((unsigned long)buf);
3939 } 3942 }
3940 } 3943 }
3941 up_read(&current->mm->mmap_sem); 3944 up_read(&mm->mmap_sem);
3942} 3945}
3943 3946
3944#ifdef CONFIG_PROVE_LOCKING 3947#ifdef CONFIG_PROVE_LOCKING
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 427bb291dd0f..3ad25f9d1fc1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -512,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
512 512
513 zone->present_pages += onlined_pages; 513 zone->present_pages += onlined_pages;
514 zone->zone_pgdat->node_present_pages += onlined_pages; 514 zone->zone_pgdat->node_present_pages += onlined_pages;
515 if (need_zonelists_rebuild) 515 if (onlined_pages) {
516 build_all_zonelists(zone); 516 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
517 else 517 if (need_zonelists_rebuild)
518 zone_pcp_update(zone); 518 build_all_zonelists(NULL, zone);
519 else
520 zone_pcp_update(zone);
521 }
519 522
520 mutex_unlock(&zonelists_mutex); 523 mutex_unlock(&zonelists_mutex);
521 524
522 init_per_zone_wmark_min(); 525 init_per_zone_wmark_min();
523 526
524 if (onlined_pages) { 527 if (onlined_pages)
525 kswapd_run(zone_to_nid(zone)); 528 kswapd_run(zone_to_nid(zone));
526 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
527 }
528 529
529 vm_total_pages = nr_free_pagecache_pages(); 530 vm_total_pages = nr_free_pagecache_pages();
530 531
@@ -562,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
562 * to access not-initialized zonelist, build here. 563 * to access not-initialized zonelist, build here.
563 */ 564 */
564 mutex_lock(&zonelists_mutex); 565 mutex_lock(&zonelists_mutex);
565 build_all_zonelists(NULL); 566 build_all_zonelists(pgdat, NULL);
566 mutex_unlock(&zonelists_mutex); 567 mutex_unlock(&zonelists_mutex);
567 568
568 return pgdat; 569 return pgdat;
@@ -965,6 +966,9 @@ repeat:
965 966
966 init_per_zone_wmark_min(); 967 init_per_zone_wmark_min();
967 968
969 if (!populated_zone(zone))
970 zone_pcp_reset(zone);
971
968 if (!node_present_pages(node)) { 972 if (!node_present_pages(node)) {
969 node_clear_state(node, N_HIGH_MEMORY); 973 node_clear_state(node, N_HIGH_MEMORY);
970 kswapd_stop(node); 974 kswapd_stop(node);
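
The online_pages() change above gates the node state update, the zonelist rebuild and the kswapd start on pages actually having been onlined. A small sketch of that ordering, assuming invented stand-ins (mock_zone, finish_onlining()) rather than the real zone and pgdat structures:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the zone bookkeeping touched by online_pages(). */
struct mock_zone {
        unsigned long present_pages;
        bool node_has_memory;
        bool need_zonelists_rebuild;
};

static void rebuild_zonelists(struct mock_zone *z) { (void)z; puts("rebuild zonelists"); }
static void update_pcp(struct mock_zone *z)        { (void)z; puts("update per-cpu pages"); }
static void run_kswapd(void)                       { puts("start kswapd"); }

/*
 * Mirrors the reordered tail of online_pages(): node state, zonelist
 * rebuild and kswapd start only happen when something was onlined.
 */
static void finish_onlining(struct mock_zone *z, unsigned long onlined_pages)
{
        z->present_pages += onlined_pages;

        if (onlined_pages) {
                z->node_has_memory = true;      /* node_set_state(..., N_HIGH_MEMORY) */
                if (z->need_zonelists_rebuild)
                        rebuild_zonelists(z);
                else
                        update_pcp(z);
        }

        /* kswapd is started separately, after the watermark update above */
        if (onlined_pages)
                run_kswapd();
}

int main(void)
{
        struct mock_zone z = { .need_zonelists_rebuild = true };

        finish_onlining(&z, 0);         /* nothing onlined: no side effects */
        finish_onlining(&z, 512);       /* onlined: rebuild + kswapd */
        printf("present_pages = %lu\n", z.present_pages);
        return 0;
}
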
diff --git a/mm/migrate.c b/mm/migrate.c
index be26d5cbe56b..77ed2d773705 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -33,6 +33,7 @@
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h>
36#include <linux/gfp.h> 37#include <linux/gfp.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -682,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
682{ 683{
683 int rc = -EAGAIN; 684 int rc = -EAGAIN;
684 int remap_swapcache = 1; 685 int remap_swapcache = 1;
685 int charge = 0;
686 struct mem_cgroup *mem; 686 struct mem_cgroup *mem;
687 struct anon_vma *anon_vma = NULL; 687 struct anon_vma *anon_vma = NULL;
688 688
@@ -724,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
724 } 724 }
725 725
726 /* charge against new page */ 726 /* charge against new page */
727 charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); 727 mem_cgroup_prepare_migration(page, newpage, &mem);
728 if (charge == -ENOMEM) {
729 rc = -ENOMEM;
730 goto unlock;
731 }
732 BUG_ON(charge);
733 728
734 if (PageWriteback(page)) { 729 if (PageWriteback(page)) {
735 /* 730 /*
@@ -819,8 +814,7 @@ skip_unmap:
819 put_anon_vma(anon_vma); 814 put_anon_vma(anon_vma);
820 815
821uncharge: 816uncharge:
822 if (!charge) 817 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
823 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
824unlock: 818unlock:
825 unlock_page(page); 819 unlock_page(page);
826out: 820out:
@@ -931,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
931 925
932 if (anon_vma) 926 if (anon_vma)
933 put_anon_vma(anon_vma); 927 put_anon_vma(anon_vma);
934 unlock_page(hpage);
935 928
936out: 929 if (!rc)
937 if (rc != -EAGAIN) { 930 hugetlb_cgroup_migrate(hpage, new_hpage);
938 list_del(&hpage->lru);
939 put_page(hpage);
940 }
941 931
932 unlock_page(hpage);
933out:
942 put_page(new_hpage); 934 put_page(new_hpage);
943
944 if (result) { 935 if (result) {
945 if (rc) 936 if (rc)
946 *result = rc; 937 *result = rc;
@@ -1016,48 +1007,32 @@ out:
1016 return nr_failed + retry; 1007 return nr_failed + retry;
1017} 1008}
1018 1009
1019int migrate_huge_pages(struct list_head *from, 1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1020 new_page_t get_new_page, unsigned long private, bool offlining, 1011 unsigned long private, bool offlining,
1021 enum migrate_mode mode) 1012 enum migrate_mode mode)
1022{ 1013{
1023 int retry = 1; 1014 int pass, rc;
1024 int nr_failed = 0; 1015
1025 int pass = 0; 1016 for (pass = 0; pass < 10; pass++) {
1026 struct page *page; 1017 rc = unmap_and_move_huge_page(get_new_page,
1027 struct page *page2; 1018 private, hpage, pass > 2, offlining,
1028 int rc; 1019 mode);
1029 1020 switch (rc) {
1030 for (pass = 0; pass < 10 && retry; pass++) { 1021 case -ENOMEM:
1031 retry = 0; 1022 goto out;
1032 1023 case -EAGAIN:
1033 list_for_each_entry_safe(page, page2, from, lru) { 1024 /* try again */
1034 cond_resched(); 1025 cond_resched();
1035 1026 break;
1036 rc = unmap_and_move_huge_page(get_new_page, 1027 case 0:
1037 private, page, pass > 2, offlining, 1028 goto out;
1038 mode); 1029 default:
1039 1030 rc = -EIO;
1040 switch(rc) { 1031 goto out;
1041 case -ENOMEM:
1042 goto out;
1043 case -EAGAIN:
1044 retry++;
1045 break;
1046 case 0:
1047 break;
1048 default:
1049 /* Permanent failure */
1050 nr_failed++;
1051 break;
1052 }
1053 } 1032 }
1054 } 1033 }
1055 rc = 0;
1056out: 1034out:
1057 if (rc) 1035 return rc;
1058 return rc;
1059
1060 return nr_failed + retry;
1061} 1036}
1062 1037
1063#ifdef CONFIG_NUMA 1038#ifdef CONFIG_NUMA
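
migrate_huge_pages() is reduced above to migrate_huge_page(), which handles a single hugepage and no longer tracks a list or a failure count. Below is a userspace sketch of the resulting retry loop; try_migrate_once() and migrate_one_page() are illustrative stand-ins, not kernel functions.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Stand-in for unmap_and_move_huge_page(): fails with -EAGAIN a couple of
 * times before succeeding, to exercise the retry loop.
 */
static int try_migrate_once(bool force)
{
        static int attempts;

        if (++attempts < 3 && !force)
                return -EAGAIN;
        return 0;
}

/*
 * Mirrors the control flow of the new migrate_huge_page(): one page, up to
 * ten passes, forcing behaviour from the third pass on. -EAGAIN retries,
 * -ENOMEM and success stop, anything else becomes -EIO.
 */
static int migrate_one_page(void)
{
        int pass, rc = -EAGAIN;

        for (pass = 0; pass < 10; pass++) {
                rc = try_migrate_once(pass > 2);
                switch (rc) {
                case -ENOMEM:
                        return rc;
                case -EAGAIN:
                        continue;       /* try again on the next pass */
                case 0:
                        return 0;
                default:
                        return -EIO;    /* permanent failure */
                }
        }
        return rc;
}

int main(void)
{
        printf("migrate_one_page() -> %d\n", migrate_one_page());
        return 0;
}

The force flag computed from pass > 2 mirrors the third argument handed to unmap_and_move_huge_page() in the hunk above.
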
diff --git a/mm/mmap.c b/mm/mmap.c
index 4fe2697339ed..e3e86914f11a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -943,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
943 const unsigned long stack_flags 943 const unsigned long stack_flags
944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
945 945
946 mm->total_vm += pages;
947
946 if (file) { 948 if (file) {
947 mm->shared_vm += pages; 949 mm->shared_vm += pages;
948 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 950 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
@@ -1347,7 +1349,6 @@ munmap_back:
1347out: 1349out:
1348 perf_event_mmap(vma); 1350 perf_event_mmap(vma);
1349 1351
1350 mm->total_vm += len >> PAGE_SHIFT;
1351 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1352 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1352 if (vm_flags & VM_LOCKED) { 1353 if (vm_flags & VM_LOCKED) {
1353 if (!mlock_vma_pages_range(vma, addr, addr + len)) 1354 if (!mlock_vma_pages_range(vma, addr, addr + len))
@@ -1707,7 +1708,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1707 return -ENOMEM; 1708 return -ENOMEM;
1708 1709
1709 /* Ok, everything looks good - let it rip */ 1710 /* Ok, everything looks good - let it rip */
1710 mm->total_vm += grow;
1711 if (vma->vm_flags & VM_LOCKED) 1711 if (vma->vm_flags & VM_LOCKED)
1712 mm->locked_vm += grow; 1712 mm->locked_vm += grow;
1713 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 1713 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
@@ -1889,7 +1889,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1889 1889
1890 if (vma->vm_flags & VM_ACCOUNT) 1890 if (vma->vm_flags & VM_ACCOUNT)
1891 nr_accounted += nrpages; 1891 nr_accounted += nrpages;
1892 mm->total_vm -= nrpages;
1893 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1892 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1894 vma = remove_vma(vma); 1893 vma = remove_vma(vma);
1895 } while (vma); 1894 } while (vma);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9a611d3a1848..862b60822d9f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
33void __mmu_notifier_release(struct mm_struct *mm) 33void __mmu_notifier_release(struct mm_struct *mm)
34{ 34{
35 struct mmu_notifier *mn; 35 struct mmu_notifier *mn;
36 struct hlist_node *n;
37
38 /*
39 * RCU here will block mmu_notifier_unregister until
40 * ->release returns.
41 */
42 rcu_read_lock();
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /*
45 * if ->release runs before mmu_notifier_unregister it
46 * must be handled as it's the only way for the driver
47 * to flush all existing sptes and stop the driver
48 * from establishing any more sptes before all the
49 * pages in the mm are freed.
50 */
51 if (mn->ops->release)
52 mn->ops->release(mn, mm);
53 rcu_read_unlock();
36 54
37 spin_lock(&mm->mmu_notifier_mm->lock); 55 spin_lock(&mm->mmu_notifier_mm->lock);
38 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
46 * mmu_notifier_unregister to return. 64 * mmu_notifier_unregister to return.
47 */ 65 */
48 hlist_del_init_rcu(&mn->hlist); 66 hlist_del_init_rcu(&mn->hlist);
49 /*
50 * RCU here will block mmu_notifier_unregister until
51 * ->release returns.
52 */
53 rcu_read_lock();
54 spin_unlock(&mm->mmu_notifier_mm->lock);
55 /*
56 * if ->release runs before mmu_notifier_unregister it
57 * must be handled as it's the only way for the driver
58 * to flush all existing sptes and stop the driver
59 * from establishing any more sptes before all the
60 * pages in the mm are freed.
61 */
62 if (mn->ops->release)
63 mn->ops->release(mn, mm);
64 rcu_read_unlock();
65 spin_lock(&mm->mmu_notifier_mm->lock);
66 } 67 }
67 spin_unlock(&mm->mmu_notifier_mm->lock); 68 spin_unlock(&mm->mmu_notifier_mm->lock);
68 69
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
284{ 285{
285 BUG_ON(atomic_read(&mm->mm_count) <= 0); 286 BUG_ON(atomic_read(&mm->mm_count) <= 0);
286 287
287 spin_lock(&mm->mmu_notifier_mm->lock);
288 if (!hlist_unhashed(&mn->hlist)) { 288 if (!hlist_unhashed(&mn->hlist)) {
289 hlist_del_rcu(&mn->hlist);
290
291 /* 289 /*
292 * RCU here will force exit_mmap to wait ->release to finish 290 * RCU here will force exit_mmap to wait ->release to finish
293 * before freeing the pages. 291 * before freeing the pages.
294 */ 292 */
295 rcu_read_lock(); 293 rcu_read_lock();
296 spin_unlock(&mm->mmu_notifier_mm->lock); 294
297 /* 295 /*
298 * exit_mmap will block in mmu_notifier_release to 296 * exit_mmap will block in mmu_notifier_release to
299 * guarantee ->release is called before freeing the 297 * guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
302 if (mn->ops->release) 300 if (mn->ops->release)
303 mn->ops->release(mn, mm); 301 mn->ops->release(mn, mm);
304 rcu_read_unlock(); 302 rcu_read_unlock();
305 } else 303
304 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist);
306 spin_unlock(&mm->mmu_notifier_mm->lock); 306 spin_unlock(&mm->mmu_notifier_mm->lock);
307 }
307 308
308 /* 309 /*
309 * Wait any running method to finish, of course including 310 * Wait any running method to finish, of course including
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 6830eab5bf09..3cef80f6ac79 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -96,7 +96,7 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone)
96 for_each_lru(lru) 96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98 98
99#ifdef CONFIG_CGROUP_MEM_RES_CTLR 99#ifdef CONFIG_MEMCG
100 lruvec->zone = zone; 100 lruvec->zone = zone;
101#endif 101#endif
102} 102}
diff --git a/mm/mremap.c b/mm/mremap.c
index 21fed202ddad..cc06d0e48d05 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
260 * If this were a serious issue, we'd add a flag to do_munmap(). 260 * If this were a serious issue, we'd add a flag to do_munmap().
261 */ 261 */
262 hiwater_vm = mm->hiwater_vm; 262 hiwater_vm = mm->hiwater_vm;
263 mm->total_vm += new_len >> PAGE_SHIFT;
264 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); 263 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
265 264
266 if (do_munmap(mm, old_addr, old_len) < 0) { 265 if (do_munmap(mm, old_addr, old_len) < 0) {
@@ -497,7 +496,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
497 goto out; 496 goto out;
498 } 497 }
499 498
500 mm->total_vm += pages;
501 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 499 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
502 if (vma->vm_flags & VM_LOCKED) { 500 if (vma->vm_flags & VM_LOCKED) {
503 mm->locked_vm += pages; 501 mm->locked_vm += pages;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ac300c99baf6..198600861638 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -288,76 +288,93 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
288} 288}
289#endif 289#endif
290 290
291enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
292 unsigned long totalpages, const nodemask_t *nodemask,
293 bool force_kill)
294{
295 if (task->exit_state)
296 return OOM_SCAN_CONTINUE;
297 if (oom_unkillable_task(task, NULL, nodemask))
298 return OOM_SCAN_CONTINUE;
299
300 /*
301 * This task already has access to memory reserves and is being killed.
302 * Don't allow any other task to have access to the reserves.
303 */
304 if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
305 if (unlikely(frozen(task)))
306 __thaw_task(task);
307 if (!force_kill)
308 return OOM_SCAN_ABORT;
309 }
310 if (!task->mm)
311 return OOM_SCAN_CONTINUE;
312
313 if (task->flags & PF_EXITING) {
314 /*
315 * If task is current and is in the process of releasing memory,
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */
322 if (task == current)
323 return OOM_SCAN_SELECT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 }
334 return OOM_SCAN_OK;
335}
336
291/* 337/*
292 * Simple selection loop. We chose the process with the highest 338 * Simple selection loop. We chose the process with the highest
293 * number of 'points'. We expect the caller will lock the tasklist. 339 * number of 'points'.
294 * 340 *
295 * (not docbooked, we don't want this one cluttering up the manual) 341 * (not docbooked, we don't want this one cluttering up the manual)
296 */ 342 */
297static struct task_struct *select_bad_process(unsigned int *ppoints, 343static struct task_struct *select_bad_process(unsigned int *ppoints,
298 unsigned long totalpages, struct mem_cgroup *memcg, 344 unsigned long totalpages, const nodemask_t *nodemask,
299 const nodemask_t *nodemask, bool force_kill) 345 bool force_kill)
300{ 346{
301 struct task_struct *g, *p; 347 struct task_struct *g, *p;
302 struct task_struct *chosen = NULL; 348 struct task_struct *chosen = NULL;
303 unsigned long chosen_points = 0; 349 unsigned long chosen_points = 0;
304 350
351 rcu_read_lock();
305 do_each_thread(g, p) { 352 do_each_thread(g, p) {
306 unsigned int points; 353 unsigned int points;
307 354
308 if (p->exit_state) 355 switch (oom_scan_process_thread(p, totalpages, nodemask,
309 continue; 356 force_kill)) {
310 if (oom_unkillable_task(p, memcg, nodemask)) 357 case OOM_SCAN_SELECT:
311 continue; 358 chosen = p;
312 359 chosen_points = ULONG_MAX;
313 /* 360 /* fall through */
314 * This task already has access to memory reserves and is 361 case OOM_SCAN_CONTINUE:
315 * being killed. Don't allow any other task access to the
316 * memory reserve.
317 *
318 * Note: this may have a chance of deadlock if it gets
319 * blocked waiting for another task which itself is waiting
320 * for memory. Is there a better alternative?
321 */
322 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
323 if (unlikely(frozen(p)))
324 __thaw_task(p);
325 if (!force_kill)
326 return ERR_PTR(-1UL);
327 }
328 if (!p->mm)
329 continue; 362 continue;
330 363 case OOM_SCAN_ABORT:
331 if (p->flags & PF_EXITING) { 364 rcu_read_unlock();
332 /* 365 return ERR_PTR(-1UL);
333 * If p is the current task and is in the process of 366 case OOM_SCAN_OK:
334 * releasing memory, we allow the "kill" to set 367 break;
335 * TIF_MEMDIE, which will allow it to gain access to 368 };
336 * memory reserves. Otherwise, it may stall forever. 369 points = oom_badness(p, NULL, nodemask, totalpages);
337 *
338 * The loop isn't broken here, however, in case other
339 * threads are found to have already been oom killed.
340 */
341 if (p == current) {
342 chosen = p;
343 chosen_points = ULONG_MAX;
344 } else if (!force_kill) {
345 /*
346 * If this task is not being ptraced on exit,
347 * then wait for it to finish before killing
348 * some other task unnecessarily.
349 */
350 if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
351 return ERR_PTR(-1UL);
352 }
353 }
354
355 points = oom_badness(p, memcg, nodemask, totalpages);
356 if (points > chosen_points) { 370 if (points > chosen_points) {
357 chosen = p; 371 chosen = p;
358 chosen_points = points; 372 chosen_points = points;
359 } 373 }
360 } while_each_thread(g, p); 374 } while_each_thread(g, p);
375 if (chosen)
376 get_task_struct(chosen);
377 rcu_read_unlock();
361 378
362 *ppoints = chosen_points * 1000 / totalpages; 379 *ppoints = chosen_points * 1000 / totalpages;
363 return chosen; 380 return chosen;
@@ -371,17 +388,16 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
371 * Dumps the current memory state of all eligible tasks. Tasks not in the same 388 * Dumps the current memory state of all eligible tasks. Tasks not in the same
372 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes 389 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
373 * are not shown. 390 * are not shown.
374 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 391 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
375 * value, oom_score_adj value, and name. 392 * swapents, oom_score_adj value, and name.
376 *
377 * Call with tasklist_lock read-locked.
378 */ 393 */
379static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) 394static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
380{ 395{
381 struct task_struct *p; 396 struct task_struct *p;
382 struct task_struct *task; 397 struct task_struct *task;
383 398
384 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); 399 pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
400 rcu_read_lock();
385 for_each_process(p) { 401 for_each_process(p) {
386 if (oom_unkillable_task(p, memcg, nodemask)) 402 if (oom_unkillable_task(p, memcg, nodemask))
387 continue; 403 continue;
@@ -396,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
396 continue; 412 continue;
397 } 413 }
398 414
399 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", 415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
400 task->pid, from_kuid(&init_user_ns, task_uid(task)), 416 task->pid, from_kuid(&init_user_ns, task_uid(task)),
401 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
402 task_cpu(task), task->signal->oom_adj, 418 task->mm->nr_ptes,
419 get_mm_counter(task->mm, MM_SWAPENTS),
403 task->signal->oom_score_adj, task->comm); 420 task->signal->oom_score_adj, task->comm);
404 task_unlock(task); 421 task_unlock(task);
405 } 422 }
423 rcu_read_unlock();
406} 424}
407 425
408static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 426static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
@@ -423,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
423} 441}
424 442
425#define K(x) ((x) << (PAGE_SHIFT-10)) 443#define K(x) ((x) << (PAGE_SHIFT-10))
426static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 444/*
427 unsigned int points, unsigned long totalpages, 445 * Must be called while holding a reference to p, which will be released upon
428 struct mem_cgroup *memcg, nodemask_t *nodemask, 446 * returning.
429 const char *message) 447 */
448void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
449 unsigned int points, unsigned long totalpages,
450 struct mem_cgroup *memcg, nodemask_t *nodemask,
451 const char *message)
430{ 452{
431 struct task_struct *victim = p; 453 struct task_struct *victim = p;
432 struct task_struct *child; 454 struct task_struct *child;
@@ -442,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
442 */ 464 */
443 if (p->flags & PF_EXITING) { 465 if (p->flags & PF_EXITING) {
444 set_tsk_thread_flag(p, TIF_MEMDIE); 466 set_tsk_thread_flag(p, TIF_MEMDIE);
467 put_task_struct(p);
445 return; 468 return;
446 } 469 }
447 470
@@ -459,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
459 * parent. This attempts to lose the minimal amount of work done while 482 * parent. This attempts to lose the minimal amount of work done while
460 * still freeing memory. 483 * still freeing memory.
461 */ 484 */
485 read_lock(&tasklist_lock);
462 do { 486 do {
463 list_for_each_entry(child, &t->children, sibling) { 487 list_for_each_entry(child, &t->children, sibling) {
464 unsigned int child_points; 488 unsigned int child_points;
@@ -471,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
471 child_points = oom_badness(child, memcg, nodemask, 495 child_points = oom_badness(child, memcg, nodemask,
472 totalpages); 496 totalpages);
473 if (child_points > victim_points) { 497 if (child_points > victim_points) {
498 put_task_struct(victim);
474 victim = child; 499 victim = child;
475 victim_points = child_points; 500 victim_points = child_points;
501 get_task_struct(victim);
476 } 502 }
477 } 503 }
478 } while_each_thread(p, t); 504 } while_each_thread(p, t);
505 read_unlock(&tasklist_lock);
479 506
480 victim = find_lock_task_mm(victim); 507 rcu_read_lock();
481 if (!victim) 508 p = find_lock_task_mm(victim);
509 if (!p) {
510 rcu_read_unlock();
511 put_task_struct(victim);
482 return; 512 return;
513 } else if (victim != p) {
514 get_task_struct(p);
515 put_task_struct(victim);
516 victim = p;
517 }
483 518
484 /* mm cannot safely be dereferenced after task_unlock(victim) */ 519 /* mm cannot safely be dereferenced after task_unlock(victim) */
485 mm = victim->mm; 520 mm = victim->mm;
@@ -510,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
510 task_unlock(p); 545 task_unlock(p);
511 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); 546 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
512 } 547 }
548 rcu_read_unlock();
513 549
514 set_tsk_thread_flag(victim, TIF_MEMDIE); 550 set_tsk_thread_flag(victim, TIF_MEMDIE);
515 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 551 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
552 put_task_struct(victim);
516} 553}
517#undef K 554#undef K
518 555
519/* 556/*
520 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 557 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
521 */ 558 */
522static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 559void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
523 int order, const nodemask_t *nodemask) 560 int order, const nodemask_t *nodemask)
524{ 561{
525 if (likely(!sysctl_panic_on_oom)) 562 if (likely(!sysctl_panic_on_oom))
526 return; 563 return;
@@ -533,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
533 if (constraint != CONSTRAINT_NONE) 570 if (constraint != CONSTRAINT_NONE)
534 return; 571 return;
535 } 572 }
536 read_lock(&tasklist_lock);
537 dump_header(NULL, gfp_mask, order, NULL, nodemask); 573 dump_header(NULL, gfp_mask, order, NULL, nodemask);
538 read_unlock(&tasklist_lock);
539 panic("Out of memory: %s panic_on_oom is enabled\n", 574 panic("Out of memory: %s panic_on_oom is enabled\n",
540 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 575 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
541} 576}
542 577
543#ifdef CONFIG_CGROUP_MEM_RES_CTLR
544void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
545 int order)
546{
547 unsigned long limit;
548 unsigned int points = 0;
549 struct task_struct *p;
550
551 /*
552 * If current has a pending SIGKILL, then automatically select it. The
553 * goal is to allow it to allocate so that it may quickly exit and free
554 * its memory.
555 */
556 if (fatal_signal_pending(current)) {
557 set_thread_flag(TIF_MEMDIE);
558 return;
559 }
560
561 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
562 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
563 read_lock(&tasklist_lock);
564 p = select_bad_process(&points, limit, memcg, NULL, false);
565 if (p && PTR_ERR(p) != -1UL)
566 oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
567 "Memory cgroup out of memory");
568 read_unlock(&tasklist_lock);
569}
570#endif
571
572static BLOCKING_NOTIFIER_HEAD(oom_notify_list); 578static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
573 579
574int register_oom_notifier(struct notifier_block *nb) 580int register_oom_notifier(struct notifier_block *nb)
@@ -690,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
690 struct task_struct *p; 696 struct task_struct *p;
691 unsigned long totalpages; 697 unsigned long totalpages;
692 unsigned long freed = 0; 698 unsigned long freed = 0;
693 unsigned int points; 699 unsigned int uninitialized_var(points);
694 enum oom_constraint constraint = CONSTRAINT_NONE; 700 enum oom_constraint constraint = CONSTRAINT_NONE;
695 int killed = 0; 701 int killed = 0;
696 702
@@ -718,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
718 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; 724 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
719 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); 725 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
720 726
721 read_lock(&tasklist_lock); 727 if (sysctl_oom_kill_allocating_task && current->mm &&
722 if (sysctl_oom_kill_allocating_task &&
723 !oom_unkillable_task(current, NULL, nodemask) && 728 !oom_unkillable_task(current, NULL, nodemask) &&
724 current->mm) { 729 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
730 get_task_struct(current);
725 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, 731 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
726 nodemask, 732 nodemask,
727 "Out of memory (oom_kill_allocating_task)"); 733 "Out of memory (oom_kill_allocating_task)");
728 goto out; 734 goto out;
729 } 735 }
730 736
731 p = select_bad_process(&points, totalpages, NULL, mpol_mask, 737 p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
732 force_kill);
733 /* Found nothing?!?! Either we hang forever, or we panic. */ 738 /* Found nothing?!?! Either we hang forever, or we panic. */
734 if (!p) { 739 if (!p) {
735 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 740 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
736 read_unlock(&tasklist_lock);
737 panic("Out of memory and no killable processes...\n"); 741 panic("Out of memory and no killable processes...\n");
738 } 742 }
739 if (PTR_ERR(p) != -1UL) { 743 if (PTR_ERR(p) != -1UL) {
@@ -742,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
742 killed = 1; 746 killed = 1;
743 } 747 }
744out: 748out:
745 read_unlock(&tasklist_lock);
746
747 /* 749 /*
748 * Give "p" a good chance of killing itself before we 750 * Give the killed threads a good chance of exiting before trying to
749 * retry to allocate memory unless "p" is current 751 * allocate memory again.
750 */ 752 */
751 if (killed && !test_thread_flag(TIF_MEMDIE)) 753 if (killed)
752 schedule_timeout_uninterruptible(1); 754 schedule_timeout_killable(1);
753} 755}
754 756
755/* 757/*
@@ -764,6 +766,5 @@ void pagefault_out_of_memory(void)
764 out_of_memory(NULL, 0, 0, NULL, false); 766 out_of_memory(NULL, 0, 0, NULL, false);
765 clear_system_oom(); 767 clear_system_oom();
766 } 768 }
767 if (!test_thread_flag(TIF_MEMDIE)) 769 schedule_timeout_killable(1);
768 schedule_timeout_uninterruptible(1);
769} 770}
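
The select_bad_process() rewrite above moves the per-task eligibility checks into oom_scan_process_thread(), which returns an enum, and leaves only the scoring loop (now under rcu_read_lock() instead of tasklist_lock). A compact userspace sketch of that shape follows; mock_task, scan_task() and select_victim() are invented names for illustration only.

#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Mirrors the new enum oom_scan_t introduced above. */
enum scan_result { SCAN_SELECT, SCAN_CONTINUE, SCAN_ABORT, SCAN_OK };

/* Illustrative stand-in for a task; only the fields the loop needs. */
struct mock_task {
        const char *name;
        unsigned long badness;  /* stand-in for oom_badness() */
        bool exiting;           /* stand-in for "current is already exiting" */
        bool unkillable;
};

/* Stand-in for oom_scan_process_thread(): classify, don't score. */
static enum scan_result scan_task(const struct mock_task *t)
{
        if (t->unkillable)
                return SCAN_CONTINUE;
        if (t->exiting)
                return SCAN_SELECT;     /* pick it without scoring */
        return SCAN_OK;
}

/* Mirrors the slimmed-down select_bad_process() selection loop. */
static const struct mock_task *select_victim(const struct mock_task *tasks,
                                             size_t n)
{
        const struct mock_task *chosen = NULL;
        unsigned long chosen_points = 0;

        for (size_t i = 0; i < n; i++) {
                switch (scan_task(&tasks[i])) {
                case SCAN_SELECT:
                        chosen = &tasks[i];
                        chosen_points = ULONG_MAX;
                        /* fall through */
                case SCAN_CONTINUE:
                        continue;
                case SCAN_ABORT:
                        return NULL;    /* someone is already being killed */
                case SCAN_OK:
                        break;
                }
                if (tasks[i].badness > chosen_points) {
                        chosen = &tasks[i];
                        chosen_points = tasks[i].badness;
                }
        }
        return chosen;
}

int main(void)
{
        struct mock_task tasks[] = {
                { "init",  10, false, true  },
                { "small", 40, false, false },
                { "big",   90, false, false },
        };
        const struct mock_task *victim =
                select_victim(tasks, sizeof(tasks) / sizeof(tasks[0]));

        printf("victim: %s\n", victim ? victim->name : "(none)");
        return 0;
}

SCAN_SELECT short-circuits scoring by pinning chosen_points at ULONG_MAX, which is exactly how the exiting-current case is handled in the hunk above.
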
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4a4f9219683f..889532b8e6c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
51#include <linux/page_cgroup.h> 51#include <linux/page_cgroup.h>
52#include <linux/debugobjects.h> 52#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
54#include <linux/memory.h>
55#include <linux/compaction.h> 54#include <linux/compaction.h>
56#include <trace/events/kmem.h> 55#include <trace/events/kmem.h>
57#include <linux/ftrace_event.h> 56#include <linux/ftrace_event.h>
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
219 218
220int page_group_by_mobility_disabled __read_mostly; 219int page_group_by_mobility_disabled __read_mostly;
221 220
222static void set_pageblock_migratetype(struct page *page, int migratetype) 221/*
222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate.
225 */
226void set_pageblock_migratetype(struct page *page, int migratetype)
223{ 227{
224 228
225 if (unlikely(page_group_by_mobility_disabled)) 229 if (unlikely(page_group_by_mobility_disabled))
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
954 return pages_moved; 958 return pages_moved;
955} 959}
956 960
957static int move_freepages_block(struct zone *zone, struct page *page, 961int move_freepages_block(struct zone *zone, struct page *page,
958 int migratetype) 962 int migratetype)
959{ 963{
960 unsigned long start_pfn, end_pfn; 964 unsigned long start_pfn, end_pfn;
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1158 to_drain = pcp->batch; 1162 to_drain = pcp->batch;
1159 else 1163 else
1160 to_drain = pcp->count; 1164 to_drain = pcp->count;
1161 free_pcppages_bulk(zone, to_drain, pcp); 1165 if (to_drain > 0) {
1162 pcp->count -= to_drain; 1166 free_pcppages_bulk(zone, to_drain, pcp);
1167 pcp->count -= to_drain;
1168 }
1163 local_irq_restore(flags); 1169 local_irq_restore(flags);
1164} 1170}
1165#endif 1171#endif
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
1529} 1535}
1530__setup("fail_page_alloc=", setup_fail_page_alloc); 1536__setup("fail_page_alloc=", setup_fail_page_alloc);
1531 1537
1532static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1538static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1533{ 1539{
1534 if (order < fail_page_alloc.min_order) 1540 if (order < fail_page_alloc.min_order)
1535 return 0; 1541 return false;
1536 if (gfp_mask & __GFP_NOFAIL) 1542 if (gfp_mask & __GFP_NOFAIL)
1537 return 0; 1543 return false;
1538 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1544 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1539 return 0; 1545 return false;
1540 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1546 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1541 return 0; 1547 return false;
1542 1548
1543 return should_fail(&fail_page_alloc.attr, 1 << order); 1549 return should_fail(&fail_page_alloc.attr, 1 << order);
1544} 1550}
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
1578 1584
1579#else /* CONFIG_FAIL_PAGE_ALLOC */ 1585#else /* CONFIG_FAIL_PAGE_ALLOC */
1580 1586
1581static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1587static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1582{ 1588{
1583 return 0; 1589 return false;
1584} 1590}
1585 1591
1586#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1592#endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1594{ 1600{
1595 /* free_pages may go negative - that's OK */ 1601 /* free_pages may go negative - that's OK */
1596 long min = mark; 1602 long min = mark;
1603 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1597 int o; 1604 int o;
1598 1605
1599 free_pages -= (1 << order) - 1; 1606 free_pages -= (1 << order) - 1;
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1602 if (alloc_flags & ALLOC_HARDER) 1609 if (alloc_flags & ALLOC_HARDER)
1603 min -= min / 4; 1610 min -= min / 4;
1604 1611
1605 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1612 if (free_pages <= min + lowmem_reserve)
1606 return false; 1613 return false;
1607 for (o = 0; o < order; o++) { 1614 for (o = 0; o < order; o++) {
1608 /* At the next order, this order's pages become unavailable */ 1615 /* At the next order, this order's pages become unavailable */
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1617 return true; 1624 return true;
1618} 1625}
1619 1626
1627#ifdef CONFIG_MEMORY_ISOLATION
1628static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1629{
1630 if (unlikely(zone->nr_pageblock_isolate))
1631 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1632 return 0;
1633}
1634#else
1635static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1636{
1637 return 0;
1638}
1639#endif
1640
1620bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1641bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1621 int classzone_idx, int alloc_flags) 1642 int classzone_idx, int alloc_flags)
1622{ 1643{
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1632 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1633 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1634 1655
1656 /*
1657 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
1658 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
1659 * sleep although it could do so. But this is more desirable for memory
1660 * hotplug than sleeping which can cause a livelock in the direct
1661 * reclaim path.
1662 */
1663 free_pages -= nr_zone_isolate_freepages(z);
1635 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1664 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1636 free_pages); 1665 free_pages);
1637} 1666}
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2087 2116
2088 page = get_page_from_freelist(gfp_mask, nodemask, 2117 page = get_page_from_freelist(gfp_mask, nodemask,
2089 order, zonelist, high_zoneidx, 2118 order, zonelist, high_zoneidx,
2090 alloc_flags, preferred_zone, 2119 alloc_flags & ~ALLOC_NO_WATERMARKS,
2091 migratetype); 2120 preferred_zone, migratetype);
2092 if (page) { 2121 if (page) {
2093 preferred_zone->compact_considered = 0; 2122 preferred_zone->compact_considered = 0;
2094 preferred_zone->compact_defer_shift = 0; 2123 preferred_zone->compact_defer_shift = 0;
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2180retry: 2209retry:
2181 page = get_page_from_freelist(gfp_mask, nodemask, order, 2210 page = get_page_from_freelist(gfp_mask, nodemask, order,
2182 zonelist, high_zoneidx, 2211 zonelist, high_zoneidx,
2183 alloc_flags, preferred_zone, 2212 alloc_flags & ~ALLOC_NO_WATERMARKS,
2184 migratetype); 2213 preferred_zone, migratetype);
2185 2214
2186 /* 2215 /*
2187 * If an allocation failed after direct reclaim, it could be because 2216 * If an allocation failed after direct reclaim, it could be because
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2265 alloc_flags |= ALLOC_HARDER; 2294 alloc_flags |= ALLOC_HARDER;
2266 2295
2267 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2296 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2268 if (!in_interrupt() && 2297 if (gfp_mask & __GFP_MEMALLOC)
2269 ((current->flags & PF_MEMALLOC) || 2298 alloc_flags |= ALLOC_NO_WATERMARKS;
2270 unlikely(test_thread_flag(TIF_MEMDIE)))) 2299 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2300 alloc_flags |= ALLOC_NO_WATERMARKS;
2301 else if (!in_interrupt() &&
2302 ((current->flags & PF_MEMALLOC) ||
2303 unlikely(test_thread_flag(TIF_MEMDIE))))
2271 alloc_flags |= ALLOC_NO_WATERMARKS; 2304 alloc_flags |= ALLOC_NO_WATERMARKS;
2272 } 2305 }
2273 2306
2274 return alloc_flags; 2307 return alloc_flags;
2275} 2308}
2276 2309
2310bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2311{
2312 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2313}
2314
2277static inline struct page * 2315static inline struct page *
2278__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2316__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2279 struct zonelist *zonelist, enum zone_type high_zoneidx, 2317 struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2340,11 +2378,27 @@ rebalance:
2340 2378
2341 /* Allocate without watermarks if the context allows */ 2379 /* Allocate without watermarks if the context allows */
2342 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2380 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2381 /*
2382 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2383 * the allocation is high priority and these type of
2384 * allocations are system rather than user orientated
2385 */
2386 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2387
2343 page = __alloc_pages_high_priority(gfp_mask, order, 2388 page = __alloc_pages_high_priority(gfp_mask, order,
2344 zonelist, high_zoneidx, nodemask, 2389 zonelist, high_zoneidx, nodemask,
2345 preferred_zone, migratetype); 2390 preferred_zone, migratetype);
2346 if (page) 2391 if (page) {
2392 /*
2393 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2394 * necessary to allocate the page. The expectation is
2395 * that the caller is taking steps that will free more
2396 * memory. The caller should avoid the page being used
2397 * for !PFMEMALLOC purposes.
2398 */
2399 page->pfmemalloc = true;
2347 goto got_pg; 2400 goto got_pg;
2401 }
2348 } 2402 }
2349 2403
2350 /* Atomic allocations - we can't balance anything */ 2404 /* Atomic allocations - we can't balance anything */
@@ -2463,8 +2517,8 @@ nopage:
2463got_pg: 2517got_pg:
2464 if (kmemcheck_enabled) 2518 if (kmemcheck_enabled)
2465 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2519 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2466 return page;
2467 2520
2521 return page;
2468} 2522}
2469 2523
2470/* 2524/*
@@ -2515,6 +2569,8 @@ retry_cpuset:
2515 page = __alloc_pages_slowpath(gfp_mask, order, 2569 page = __alloc_pages_slowpath(gfp_mask, order,
2516 zonelist, high_zoneidx, nodemask, 2570 zonelist, high_zoneidx, nodemask,
2517 preferred_zone, migratetype); 2571 preferred_zone, migratetype);
2572 else
2573 page->pfmemalloc = false;
2518 2574
2519 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2575 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2520 2576
@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
3030 user_zonelist_order = oldval; 3086 user_zonelist_order = oldval;
3031 } else if (oldval != user_zonelist_order) { 3087 } else if (oldval != user_zonelist_order) {
3032 mutex_lock(&zonelists_mutex); 3088 mutex_lock(&zonelists_mutex);
3033 build_all_zonelists(NULL); 3089 build_all_zonelists(NULL, NULL);
3034 mutex_unlock(&zonelists_mutex); 3090 mutex_unlock(&zonelists_mutex);
3035 } 3091 }
3036 } 3092 }
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone);
3409DEFINE_MUTEX(zonelists_mutex); 3465DEFINE_MUTEX(zonelists_mutex);
3410 3466
3411/* return values int ....just for stop_machine() */ 3467/* return values int ....just for stop_machine() */
3412static __init_refok int __build_all_zonelists(void *data) 3468static int __build_all_zonelists(void *data)
3413{ 3469{
3414 int nid; 3470 int nid;
3415 int cpu; 3471 int cpu;
3472 pg_data_t *self = data;
3416 3473
3417#ifdef CONFIG_NUMA 3474#ifdef CONFIG_NUMA
3418 memset(node_load, 0, sizeof(node_load)); 3475 memset(node_load, 0, sizeof(node_load));
3419#endif 3476#endif
3477
3478 if (self && !node_online(self->node_id)) {
3479 build_zonelists(self);
3480 build_zonelist_cache(self);
3481 }
3482
3420 for_each_online_node(nid) { 3483 for_each_online_node(nid) {
3421 pg_data_t *pgdat = NODE_DATA(nid); 3484 pg_data_t *pgdat = NODE_DATA(nid);
3422 3485
@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data)
3461 * Called with zonelists_mutex held always 3524 * Called with zonelists_mutex held always
3462 * unless system_state == SYSTEM_BOOTING. 3525 * unless system_state == SYSTEM_BOOTING.
3463 */ 3526 */
3464void __ref build_all_zonelists(void *data) 3527void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3465{ 3528{
3466 set_zonelist_order(); 3529 set_zonelist_order();
3467 3530
@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data)
3473 /* we have to stop all cpus to guarantee there is no user 3536 /* we have to stop all cpus to guarantee there is no user
3474 of zonelist */ 3537 of zonelist */
3475#ifdef CONFIG_MEMORY_HOTPLUG 3538#ifdef CONFIG_MEMORY_HOTPLUG
3476 if (data) 3539 if (zone)
3477 setup_zone_pageset((struct zone *)data); 3540 setup_zone_pageset(zone);
3478#endif 3541#endif
3479 stop_machine(__build_all_zonelists, NULL, NULL); 3542 stop_machine(__build_all_zonelists, pgdat, NULL);
3480 /* cpuset refresh routine should be here */ 3543 /* cpuset refresh routine should be here */
3481 } 3544 }
3482 vm_total_pages = nr_free_pagecache_pages(); 3545 vm_total_pages = nr_free_pagecache_pages();
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
3746 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3809 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3747#endif 3810#endif
3748 3811
3749static int zone_batchsize(struct zone *zone) 3812static int __meminit zone_batchsize(struct zone *zone)
3750{ 3813{
3751#ifdef CONFIG_MMU 3814#ifdef CONFIG_MMU
3752 int batch; 3815 int batch;
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3828 pcp->batch = PAGE_SHIFT * 8; 3891 pcp->batch = PAGE_SHIFT * 8;
3829} 3892}
3830 3893
3831static void setup_zone_pageset(struct zone *zone) 3894static void __meminit setup_zone_pageset(struct zone *zone)
3832{ 3895{
3833 int cpu; 3896 int cpu;
3834 3897
@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3901 return 0; 3964 return 0;
3902} 3965}
3903 3966
3904static int __zone_pcp_update(void *data)
3905{
3906 struct zone *zone = data;
3907 int cpu;
3908 unsigned long batch = zone_batchsize(zone), flags;
3909
3910 for_each_possible_cpu(cpu) {
3911 struct per_cpu_pageset *pset;
3912 struct per_cpu_pages *pcp;
3913
3914 pset = per_cpu_ptr(zone->pageset, cpu);
3915 pcp = &pset->pcp;
3916
3917 local_irq_save(flags);
3918 free_pcppages_bulk(zone, pcp->count, pcp);
3919 setup_pageset(pset, batch);
3920 local_irq_restore(flags);
3921 }
3922 return 0;
3923}
3924
3925void zone_pcp_update(struct zone *zone)
3926{
3927 stop_machine(__zone_pcp_update, zone, NULL);
3928}
3929
3930static __meminit void zone_pcp_init(struct zone *zone) 3967static __meminit void zone_pcp_init(struct zone *zone)
3931{ 3968{
3932 /* 3969 /*
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
3942 zone_batchsize(zone)); 3979 zone_batchsize(zone));
3943} 3980}
3944 3981
3945__meminit int init_currently_empty_zone(struct zone *zone, 3982int __meminit init_currently_empty_zone(struct zone *zone,
3946 unsigned long zone_start_pfn, 3983 unsigned long zone_start_pfn,
3947 unsigned long size, 3984 unsigned long size,
3948 enum memmap_context context) 3985 enum memmap_context context)
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
4301#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4338#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4302 4339
4303/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4340/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4304static inline void __init set_pageblock_order(void) 4341void __init set_pageblock_order(void)
4305{ 4342{
4306 unsigned int order; 4343 unsigned int order;
4307 4344
@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void)
4329 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4366 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4330 * the kernel config 4367 * the kernel config
4331 */ 4368 */
4332static inline void set_pageblock_order(void) 4369void __init set_pageblock_order(void)
4333{ 4370{
4334} 4371}
4335 4372
@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void)
4340 * - mark all pages reserved 4377 * - mark all pages reserved
4341 * - mark all memory queues empty 4378 * - mark all memory queues empty
4342 * - clear the memory bitmaps 4379 * - clear the memory bitmaps
4380 *
 4381	 * NOTE: pgdat should be zeroed by the caller.
4343 */ 4382 */
4344static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4383static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4345 unsigned long *zones_size, unsigned long *zholes_size) 4384 unsigned long *zones_size, unsigned long *zholes_size)
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4350 int ret; 4389 int ret;
4351 4390
4352 pgdat_resize_init(pgdat); 4391 pgdat_resize_init(pgdat);
4353 pgdat->nr_zones = 0;
4354 init_waitqueue_head(&pgdat->kswapd_wait); 4392 init_waitqueue_head(&pgdat->kswapd_wait);
4355 pgdat->kswapd_max_order = 0; 4393 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4356 pgdat_page_cgroup_init(pgdat); 4394 pgdat_page_cgroup_init(pgdat);
4357 4395
4358 for (j = 0; j < MAX_NR_ZONES; j++) { 4396 for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4394 4432
4395 zone->spanned_pages = size; 4433 zone->spanned_pages = size;
4396 zone->present_pages = realsize; 4434 zone->present_pages = realsize;
4435#if defined CONFIG_COMPACTION || defined CONFIG_CMA
4436 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4437 zone->spanned_pages;
4438 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4439#endif
4397#ifdef CONFIG_NUMA 4440#ifdef CONFIG_NUMA
4398 zone->node = nid; 4441 zone->node = nid;
4399 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4442 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4408 4451
4409 zone_pcp_init(zone); 4452 zone_pcp_init(zone);
4410 lruvec_init(&zone->lruvec, zone); 4453 lruvec_init(&zone->lruvec, zone);
4411 zap_zone_vm_stats(zone);
4412 zone->flags = 0;
4413 if (!size) 4454 if (!size)
4414 continue; 4455 continue;
4415 4456
@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4469{ 4510{
4470 pg_data_t *pgdat = NODE_DATA(nid); 4511 pg_data_t *pgdat = NODE_DATA(nid);
4471 4512
4513 /* pg_data_t should be reset to zero when it's allocated */
4514 WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx);
4515
4472 pgdat->node_id = nid; 4516 pgdat->node_id = nid;
4473 pgdat->node_start_pfn = node_start_pfn; 4517 pgdat->node_start_pfn = node_start_pfn;
4474 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4518 calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -4750,7 +4794,7 @@ out:
4750} 4794}
4751 4795
4752/* Any regular memory on that node ? */ 4796/* Any regular memory on that node ? */
4753static void check_for_regular_memory(pg_data_t *pgdat) 4797static void __init check_for_regular_memory(pg_data_t *pgdat)
4754{ 4798{
4755#ifdef CONFIG_HIGHMEM 4799#ifdef CONFIG_HIGHMEM
4756 enum zone_type zone_type; 4800 enum zone_type zone_type;
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5468} 5512}
5469 5513
5470/* 5514/*
5471 * This is designed as sub function...plz see page_isolation.c also. 5515 * This function checks whether pageblock includes unmovable pages or not.
5472 * set/clear page block's type to be ISOLATE.		 5516 * If @count is not zero, it is okay to include fewer than @count unmovable pages
5473 * page allocater never alloc memory from ISOLATE block. 5517 *
 5518 * PageLRU check without isolation or lru_lock could race so that
 5519 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
 5520 * expect this function to be exact.
5474 */ 5521 */
5475 5522bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5476static int
5477__count_immobile_pages(struct zone *zone, struct page *page, int count)
5478{ 5523{
5479 unsigned long pfn, iter, found; 5524 unsigned long pfn, iter, found;
5480 int mt; 5525 int mt;
5481 5526
5482 /* 5527 /*
5483 * For avoiding noise data, lru_add_drain_all() should be called 5528 * For avoiding noise data, lru_add_drain_all() should be called
5484 * If ZONE_MOVABLE, the zone never contains immobile pages 5529 * If ZONE_MOVABLE, the zone never contains unmovable pages
5485 */ 5530 */
5486 if (zone_idx(zone) == ZONE_MOVABLE) 5531 if (zone_idx(zone) == ZONE_MOVABLE)
5487 return true; 5532 return false;
5488 mt = get_pageblock_migratetype(page); 5533 mt = get_pageblock_migratetype(page);
5489 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5534 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5490 return true; 5535 return false;
5491 5536
5492 pfn = page_to_pfn(page); 5537 pfn = page_to_pfn(page);
5493 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5538 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5497 continue; 5542 continue;
5498 5543
5499 page = pfn_to_page(check); 5544 page = pfn_to_page(check);
5500 if (!page_count(page)) { 5545 /*
5546 * We can't use page_count without pin a page
5547 * because another CPU can free compound page.
5548 * This check already skips compound tails of THP
5549 * because their page->_count is zero at all time.
5550 */
5551 if (!atomic_read(&page->_count)) {
5501 if (PageBuddy(page)) 5552 if (PageBuddy(page))
5502 iter += (1 << page_order(page)) - 1; 5553 iter += (1 << page_order(page)) - 1;
5503 continue; 5554 continue;
5504 } 5555 }
5556
5505 if (!PageLRU(page)) 5557 if (!PageLRU(page))
5506 found++; 5558 found++;
5507 /* 5559 /*
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5518 * page at boot. 5570 * page at boot.
5519 */ 5571 */
5520 if (found > count) 5572 if (found > count)
5521 return false; 5573 return true;
5522 } 5574 }
5523 return true; 5575 return false;
5524} 5576}
5525 5577
5526bool is_pageblock_removable_nolock(struct page *page) 5578bool is_pageblock_removable_nolock(struct page *page)
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5544 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5596 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5545 return false; 5597 return false;
5546 5598
5547 return __count_immobile_pages(zone, page, 0); 5599 return !has_unmovable_pages(zone, page, 0);
5548}
5549
5550int set_migratetype_isolate(struct page *page)
5551{
5552 struct zone *zone;
5553 unsigned long flags, pfn;
5554 struct memory_isolate_notify arg;
5555 int notifier_ret;
5556 int ret = -EBUSY;
5557
5558 zone = page_zone(page);
5559
5560 spin_lock_irqsave(&zone->lock, flags);
5561
5562 pfn = page_to_pfn(page);
5563 arg.start_pfn = pfn;
5564 arg.nr_pages = pageblock_nr_pages;
5565 arg.pages_found = 0;
5566
5567 /*
5568 * It may be possible to isolate a pageblock even if the
5569 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5570 * notifier chain is used by balloon drivers to return the
5571 * number of pages in a range that are held by the balloon
5572 * driver to shrink memory. If all the pages are accounted for
5573 * by balloons, are free, or on the LRU, isolation can continue.
5574 * Later, for example, when memory hotplug notifier runs, these
5575 * pages reported as "can be isolated" should be isolated(freed)
5576 * by the balloon driver through the memory notifier chain.
5577 */
5578 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5579 notifier_ret = notifier_to_errno(notifier_ret);
5580 if (notifier_ret)
5581 goto out;
5582 /*
5583 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5584 * We just check MOVABLE pages.
5585 */
5586 if (__count_immobile_pages(zone, page, arg.pages_found))
5587 ret = 0;
5588
5589 /*
5590 * immobile means "not-on-lru" paes. If immobile is larger than
5591 * removable-by-driver pages reported by notifier, we'll fail.
5592 */
5593
5594out:
5595 if (!ret) {
5596 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5597 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5598 }
5599
5600 spin_unlock_irqrestore(&zone->lock, flags);
5601 if (!ret)
5602 drain_all_pages();
5603 return ret;
5604}
5605
5606void unset_migratetype_isolate(struct page *page, unsigned migratetype)
5607{
5608 struct zone *zone;
5609 unsigned long flags;
5610 zone = page_zone(page);
5611 spin_lock_irqsave(&zone->lock, flags);
5612 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
5613 goto out;
5614 set_pageblock_migratetype(page, migratetype);
5615 move_freepages_block(zone, page, migratetype);
5616out:
5617 spin_unlock_irqrestore(&zone->lock, flags);
5618} 5600}
5619 5601
5620#ifdef CONFIG_CMA 5602#ifdef CONFIG_CMA
@@ -5869,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
5869} 5851}
5870#endif 5852#endif
5871 5853
5854#ifdef CONFIG_MEMORY_HOTPLUG
5855static int __meminit __zone_pcp_update(void *data)
5856{
5857 struct zone *zone = data;
5858 int cpu;
5859 unsigned long batch = zone_batchsize(zone), flags;
5860
5861 for_each_possible_cpu(cpu) {
5862 struct per_cpu_pageset *pset;
5863 struct per_cpu_pages *pcp;
5864
5865 pset = per_cpu_ptr(zone->pageset, cpu);
5866 pcp = &pset->pcp;
5867
5868 local_irq_save(flags);
5869 if (pcp->count > 0)
5870 free_pcppages_bulk(zone, pcp->count, pcp);
5871 setup_pageset(pset, batch);
5872 local_irq_restore(flags);
5873 }
5874 return 0;
5875}
5876
5877void __meminit zone_pcp_update(struct zone *zone)
5878{
5879 stop_machine(__zone_pcp_update, zone, NULL);
5880}
5881#endif
5882
5872#ifdef CONFIG_MEMORY_HOTREMOVE 5883#ifdef CONFIG_MEMORY_HOTREMOVE
5884void zone_pcp_reset(struct zone *zone)
5885{
5886 unsigned long flags;
5887
5888 /* avoid races with drain_pages() */
5889 local_irq_save(flags);
5890 if (zone->pageset != &boot_pageset) {
5891 free_percpu(zone->pageset);
5892 zone->pageset = &boot_pageset;
5893 }
5894 local_irq_restore(flags);
5895}
5896
5873/* 5897/*
5874 * All pages in the range must be isolated before calling this. 5898 * All pages in the range must be isolated before calling this.
5875 */ 5899 */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index eb750f851395..5ddad0c6daa6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
317#endif 317#endif
318 318
319 319
320#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 320#ifdef CONFIG_MEMCG_SWAP
321 321
322static DEFINE_MUTEX(swap_cgroup_mutex); 322static DEFINE_MUTEX(swap_cgroup_mutex);
323struct swap_cgroup_ctrl { 323struct swap_cgroup_ctrl {
diff --git a/mm/page_io.c b/mm/page_io.c
index 34f02923744c..78eee32ee486 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -17,6 +17,7 @@
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/bio.h> 18#include <linux/bio.h>
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/buffer_head.h>
20#include <linux/writeback.h> 21#include <linux/writeback.h>
21#include <linux/frontswap.h> 22#include <linux/frontswap.h>
22#include <asm/pgtable.h> 23#include <asm/pgtable.h>
@@ -86,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err)
86 bio_put(bio); 87 bio_put(bio);
87} 88}
88 89
90int generic_swapfile_activate(struct swap_info_struct *sis,
91 struct file *swap_file,
92 sector_t *span)
93{
94 struct address_space *mapping = swap_file->f_mapping;
95 struct inode *inode = mapping->host;
96 unsigned blocks_per_page;
97 unsigned long page_no;
98 unsigned blkbits;
99 sector_t probe_block;
100 sector_t last_block;
101 sector_t lowest_block = -1;
102 sector_t highest_block = 0;
103 int nr_extents = 0;
104 int ret;
105
106 blkbits = inode->i_blkbits;
107 blocks_per_page = PAGE_SIZE >> blkbits;
108
109 /*
110 * Map all the blocks into the extent list. This code doesn't try
111 * to be very smart.
112 */
113 probe_block = 0;
114 page_no = 0;
115 last_block = i_size_read(inode) >> blkbits;
116 while ((probe_block + blocks_per_page) <= last_block &&
117 page_no < sis->max) {
118 unsigned block_in_page;
119 sector_t first_block;
120
121 first_block = bmap(inode, probe_block);
122 if (first_block == 0)
123 goto bad_bmap;
124
125 /*
126 * It must be PAGE_SIZE aligned on-disk
127 */
128 if (first_block & (blocks_per_page - 1)) {
129 probe_block++;
130 goto reprobe;
131 }
132
133 for (block_in_page = 1; block_in_page < blocks_per_page;
134 block_in_page++) {
135 sector_t block;
136
137 block = bmap(inode, probe_block + block_in_page);
138 if (block == 0)
139 goto bad_bmap;
140 if (block != first_block + block_in_page) {
141 /* Discontiguity */
142 probe_block++;
143 goto reprobe;
144 }
145 }
146
147 first_block >>= (PAGE_SHIFT - blkbits);
148 if (page_no) { /* exclude the header page */
149 if (first_block < lowest_block)
150 lowest_block = first_block;
151 if (first_block > highest_block)
152 highest_block = first_block;
153 }
154
155 /*
156 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
157 */
158 ret = add_swap_extent(sis, page_no, 1, first_block);
159 if (ret < 0)
160 goto out;
161 nr_extents += ret;
162 page_no++;
163 probe_block += blocks_per_page;
164reprobe:
165 continue;
166 }
167 ret = nr_extents;
168 *span = 1 + highest_block - lowest_block;
169 if (page_no == 0)
170 page_no = 1; /* force Empty message */
171 sis->max = page_no;
172 sis->pages = page_no - 1;
173 sis->highest_bit = page_no - 1;
174out:
175 return ret;
176bad_bmap:
177 printk(KERN_ERR "swapon: swapfile has holes\n");
178 ret = -EINVAL;
179 goto out;
180}
181
89/* 182/*
90 * We may have stale swap cache pages in memory: notice 183 * We may have stale swap cache pages in memory: notice
91 * them here and get rid of the unnecessary final write. 184 * them here and get rid of the unnecessary final write.
@@ -94,6 +187,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
94{ 187{
95 struct bio *bio; 188 struct bio *bio;
96 int ret = 0, rw = WRITE; 189 int ret = 0, rw = WRITE;
190 struct swap_info_struct *sis = page_swap_info(page);
97 191
98 if (try_to_free_swap(page)) { 192 if (try_to_free_swap(page)) {
99 unlock_page(page); 193 unlock_page(page);
@@ -105,6 +199,33 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
105 end_page_writeback(page); 199 end_page_writeback(page);
106 goto out; 200 goto out;
107 } 201 }
202
203 if (sis->flags & SWP_FILE) {
204 struct kiocb kiocb;
205 struct file *swap_file = sis->swap_file;
206 struct address_space *mapping = swap_file->f_mapping;
207 struct iovec iov = {
208 .iov_base = kmap(page),
209 .iov_len = PAGE_SIZE,
210 };
211
212 init_sync_kiocb(&kiocb, swap_file);
213 kiocb.ki_pos = page_file_offset(page);
214 kiocb.ki_left = PAGE_SIZE;
215 kiocb.ki_nbytes = PAGE_SIZE;
216
217 unlock_page(page);
218 ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
219 &kiocb, &iov,
220 kiocb.ki_pos, 1);
221 kunmap(page);
222 if (ret == PAGE_SIZE) {
223 count_vm_event(PSWPOUT);
224 ret = 0;
225 }
226 return ret;
227 }
228
108 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 229 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
109 if (bio == NULL) { 230 if (bio == NULL) {
110 set_page_dirty(page); 231 set_page_dirty(page);
@@ -126,6 +247,7 @@ int swap_readpage(struct page *page)
126{ 247{
127 struct bio *bio; 248 struct bio *bio;
128 int ret = 0; 249 int ret = 0;
250 struct swap_info_struct *sis = page_swap_info(page);
129 251
130 VM_BUG_ON(!PageLocked(page)); 252 VM_BUG_ON(!PageLocked(page));
131 VM_BUG_ON(PageUptodate(page)); 253 VM_BUG_ON(PageUptodate(page));
@@ -134,6 +256,17 @@ int swap_readpage(struct page *page)
134 unlock_page(page); 256 unlock_page(page);
135 goto out; 257 goto out;
136 } 258 }
259
260 if (sis->flags & SWP_FILE) {
261 struct file *swap_file = sis->swap_file;
262 struct address_space *mapping = swap_file->f_mapping;
263
264 ret = mapping->a_ops->readpage(swap_file, page);
265 if (!ret)
266 count_vm_event(PSWPIN);
267 return ret;
268 }
269
137 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 270 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
138 if (bio == NULL) { 271 if (bio == NULL) {
139 unlock_page(page); 272 unlock_page(page);
@@ -145,3 +278,15 @@ int swap_readpage(struct page *page)
145out: 278out:
146 return ret; 279 return ret;
147} 280}
281
282int swap_set_page_dirty(struct page *page)
283{
284 struct swap_info_struct *sis = page_swap_info(page);
285
286 if (sis->flags & SWP_FILE) {
287 struct address_space *mapping = sis->swap_file->f_mapping;
288 return mapping->a_ops->set_page_dirty(page);
289 } else {
290 return __set_page_dirty_no_writeback(page);
291 }
292}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c9f04774f2b8..247d1f175739 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -5,8 +5,101 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
7#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
8#include <linux/memory.h>
8#include "internal.h" 9#include "internal.h"
9 10
11/* called while holding zone->lock */
12static void set_pageblock_isolate(struct page *page)
13{
14 if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
15 return;
16
17 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
18 page_zone(page)->nr_pageblock_isolate++;
19}
20
21/* called while holding zone->lock */
22static void restore_pageblock_isolate(struct page *page, int migratetype)
23{
24 struct zone *zone = page_zone(page);
25 if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
26 return;
27
28 BUG_ON(zone->nr_pageblock_isolate <= 0);
29 set_pageblock_migratetype(page, migratetype);
30 zone->nr_pageblock_isolate--;
31}
32
33int set_migratetype_isolate(struct page *page)
34{
35 struct zone *zone;
36 unsigned long flags, pfn;
37 struct memory_isolate_notify arg;
38 int notifier_ret;
39 int ret = -EBUSY;
40
41 zone = page_zone(page);
42
43 spin_lock_irqsave(&zone->lock, flags);
44
45 pfn = page_to_pfn(page);
46 arg.start_pfn = pfn;
47 arg.nr_pages = pageblock_nr_pages;
48 arg.pages_found = 0;
49
50 /*
51 * It may be possible to isolate a pageblock even if the
52 * migratetype is not MIGRATE_MOVABLE. The memory isolation
53 * notifier chain is used by balloon drivers to return the
54 * number of pages in a range that are held by the balloon
55 * driver to shrink memory. If all the pages are accounted for
56 * by balloons, are free, or on the LRU, isolation can continue.
57 * Later, for example, when memory hotplug notifier runs, these
58 * pages reported as "can be isolated" should be isolated(freed)
59 * by the balloon driver through the memory notifier chain.
60 */
61 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
62 notifier_ret = notifier_to_errno(notifier_ret);
63 if (notifier_ret)
64 goto out;
65 /*
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages.
68 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found))
70 ret = 0;
71
72 /*
 73	 * immobile means "not-on-lru" pages. If immobile is larger than
74 * removable-by-driver pages reported by notifier, we'll fail.
75 */
76
77out:
78 if (!ret) {
79 set_pageblock_isolate(page);
80 move_freepages_block(zone, page, MIGRATE_ISOLATE);
81 }
82
83 spin_unlock_irqrestore(&zone->lock, flags);
84 if (!ret)
85 drain_all_pages();
86 return ret;
87}
88
89void unset_migratetype_isolate(struct page *page, unsigned migratetype)
90{
91 struct zone *zone;
92 unsigned long flags;
93 zone = page_zone(page);
94 spin_lock_irqsave(&zone->lock, flags);
95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
96 goto out;
97 move_freepages_block(zone, page, migratetype);
98 restore_pageblock_isolate(page, migratetype);
99out:
100 spin_unlock_irqrestore(&zone->lock, flags);
101}
102
10static inline struct page * 103static inline struct page *
11__first_valid_page(unsigned long pfn, unsigned long nr_pages) 104__first_valid_page(unsigned long pfn, unsigned long nr_pages)
12{ 105{
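set_migratetype_isolate() and unset_migratetype_isolate() now live in this file and additionally keep zone->nr_pageblock_isolate in sync. A rough sketch of the intended calling pattern, loosely modelled on a memory offline path (the wrapper function is hypothetical):

    #include <linux/mm.h>
    #include <linux/page-isolation.h>

    /*
     * Isolate one pageblock, do whatever emptying work is required, then
     * restore its original migratetype.  set_migratetype_isolate() returns
     * -EBUSY when has_unmovable_pages() finds pages that cannot be moved.
     */
    static int isolate_and_restore(struct page *page)
    {
            int ret = set_migratetype_isolate(page);

            if (ret)
                    return ret;
            /* ... migrate or offline the pages in this pageblock ... */
            unset_migratetype_isolate(page, MIGRATE_MOVABLE);
            return 0;
    }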
diff --git a/mm/shmem.c b/mm/shmem.c
index c15b998e5a86..d4e184e2a38e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -929,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
929 929
930 /* Create a pseudo vma that just contains the policy */ 930 /* Create a pseudo vma that just contains the policy */
931 pvma.vm_start = 0; 931 pvma.vm_start = 0;
932 pvma.vm_pgoff = index; 932 /* Bias interleave by inode number to distribute better across nodes */
933 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
933 pvma.vm_ops = NULL; 934 pvma.vm_ops = NULL;
934 pvma.vm_policy = spol; 935 pvma.vm_policy = spol;
935 return swapin_readahead(swap, gfp, &pvma, 0); 936 return swapin_readahead(swap, gfp, &pvma, 0);
@@ -942,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp,
942 943
943 /* Create a pseudo vma that just contains the policy */ 944 /* Create a pseudo vma that just contains the policy */
944 pvma.vm_start = 0; 945 pvma.vm_start = 0;
945 pvma.vm_pgoff = index; 946 /* Bias interleave by inode number to distribute better across nodes */
947 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
946 pvma.vm_ops = NULL; 948 pvma.vm_ops = NULL;
947 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 949 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
948 950
diff --git a/mm/slab.c b/mm/slab.c
index 1fcf3ac94b6c..f8b0d539b482 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -118,12 +118,16 @@
118#include <linux/memory.h> 118#include <linux/memory.h>
119#include <linux/prefetch.h> 119#include <linux/prefetch.h>
120 120
121#include <net/sock.h>
122
121#include <asm/cacheflush.h> 123#include <asm/cacheflush.h>
122#include <asm/tlbflush.h> 124#include <asm/tlbflush.h>
123#include <asm/page.h> 125#include <asm/page.h>
124 126
125#include <trace/events/kmem.h> 127#include <trace/events/kmem.h>
126 128
129#include "internal.h"
130
127/* 131/*
128 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 132 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
129 * 0 for faster, smaller code (especially in the critical paths). 133 * 0 for faster, smaller code (especially in the critical paths).
@@ -152,6 +156,12 @@
152#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 156#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
153#endif 157#endif
154 158
159/*
160 * true if a page was allocated from pfmemalloc reserves for network-based
161 * swap
162 */
163static bool pfmemalloc_active __read_mostly;
164
155/* Legal flag mask for kmem_cache_create(). */ 165/* Legal flag mask for kmem_cache_create(). */
156#if DEBUG 166#if DEBUG
157# define CREATE_MASK (SLAB_RED_ZONE | \ 167# define CREATE_MASK (SLAB_RED_ZONE | \
@@ -257,9 +267,30 @@ struct array_cache {
257 * Must have this definition in here for the proper 267 * Must have this definition in here for the proper
258 * alignment of array_cache. Also simplifies accessing 268 * alignment of array_cache. Also simplifies accessing
259 * the entries. 269 * the entries.
270 *
271 * Entries should not be directly dereferenced as
272 * entries belonging to slabs marked pfmemalloc will
 273	 * have their low bit set to SLAB_OBJ_PFMEMALLOC
260 */ 274 */
261}; 275};
262 276
277#define SLAB_OBJ_PFMEMALLOC 1
278static inline bool is_obj_pfmemalloc(void *objp)
279{
280 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
281}
282
283static inline void set_obj_pfmemalloc(void **objp)
284{
285 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
286 return;
287}
288
289static inline void clear_obj_pfmemalloc(void **objp)
290{
291 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
292}
293
263/* 294/*
264 * bootstrap: The caches do not work without cpuarrays anymore, but the 295 * bootstrap: The caches do not work without cpuarrays anymore, but the
265 * cpuarrays are allocated from the generic caches... 296 * cpuarrays are allocated from the generic caches...
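The helpers added above borrow bit 0 of an array_cache entry to mark objects taken from a pfmemalloc slab; slab objects are at least word aligned, so that bit is otherwise always clear. A small illustration of the tagging scheme as it could be exercised from within mm/slab.c (the demo function itself is hypothetical):

    /*
     * Tag, test and untag an object pointer with SLAB_OBJ_PFMEMALLOC, the
     * same dance ac_get_obj() and ac_put_obj() perform further below.
     */
    static void pfmemalloc_tag_demo(void *object)
    {
            void *objp = object;

            set_obj_pfmemalloc(&objp);      /* bit 0 now flags the object */
            WARN_ON(!is_obj_pfmemalloc(objp));
            clear_obj_pfmemalloc(&objp);    /* recover the real pointer */
            WARN_ON(objp != object);
    }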
@@ -900,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries,
900 return nc; 931 return nc;
901} 932}
902 933
934static inline bool is_slab_pfmemalloc(struct slab *slabp)
935{
936 struct page *page = virt_to_page(slabp->s_mem);
937
938 return PageSlabPfmemalloc(page);
939}
940
 941/* Clears pfmemalloc_active if no slabs have pfmemalloc set */
942static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
943 struct array_cache *ac)
944{
945 struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
946 struct slab *slabp;
947 unsigned long flags;
948
949 if (!pfmemalloc_active)
950 return;
951
952 spin_lock_irqsave(&l3->list_lock, flags);
953 list_for_each_entry(slabp, &l3->slabs_full, list)
954 if (is_slab_pfmemalloc(slabp))
955 goto out;
956
957 list_for_each_entry(slabp, &l3->slabs_partial, list)
958 if (is_slab_pfmemalloc(slabp))
959 goto out;
960
961 list_for_each_entry(slabp, &l3->slabs_free, list)
962 if (is_slab_pfmemalloc(slabp))
963 goto out;
964
965 pfmemalloc_active = false;
966out:
967 spin_unlock_irqrestore(&l3->list_lock, flags);
968}
969
970static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
971 gfp_t flags, bool force_refill)
972{
973 int i;
974 void *objp = ac->entry[--ac->avail];
975
976 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
977 if (unlikely(is_obj_pfmemalloc(objp))) {
978 struct kmem_list3 *l3;
979
980 if (gfp_pfmemalloc_allowed(flags)) {
981 clear_obj_pfmemalloc(&objp);
982 return objp;
983 }
984
985 /* The caller cannot use PFMEMALLOC objects, find another one */
986 for (i = 1; i < ac->avail; i++) {
987 /* If a !PFMEMALLOC object is found, swap them */
988 if (!is_obj_pfmemalloc(ac->entry[i])) {
989 objp = ac->entry[i];
990 ac->entry[i] = ac->entry[ac->avail];
991 ac->entry[ac->avail] = objp;
992 return objp;
993 }
994 }
995
996 /*
997 * If there are empty slabs on the slabs_free list and we are
998 * being forced to refill the cache, mark this one !pfmemalloc.
999 */
1000 l3 = cachep->nodelists[numa_mem_id()];
1001 if (!list_empty(&l3->slabs_free) && force_refill) {
1002 struct slab *slabp = virt_to_slab(objp);
1003 ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem));
1004 clear_obj_pfmemalloc(&objp);
1005 recheck_pfmemalloc_active(cachep, ac);
1006 return objp;
1007 }
1008
1009 /* No !PFMEMALLOC objects available */
1010 ac->avail++;
1011 objp = NULL;
1012 }
1013
1014 return objp;
1015}
1016
1017static inline void *ac_get_obj(struct kmem_cache *cachep,
1018 struct array_cache *ac, gfp_t flags, bool force_refill)
1019{
1020 void *objp;
1021
1022 if (unlikely(sk_memalloc_socks()))
1023 objp = __ac_get_obj(cachep, ac, flags, force_refill);
1024 else
1025 objp = ac->entry[--ac->avail];
1026
1027 return objp;
1028}
1029
1030static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1031 void *objp)
1032{
1033 if (unlikely(pfmemalloc_active)) {
1034 /* Some pfmemalloc slabs exist, check if this is one */
1035 struct page *page = virt_to_page(objp);
1036 if (PageSlabPfmemalloc(page))
1037 set_obj_pfmemalloc(&objp);
1038 }
1039
1040 return objp;
1041}
1042
1043static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1044 void *objp)
1045{
1046 if (unlikely(sk_memalloc_socks()))
1047 objp = __ac_put_obj(cachep, ac, objp);
1048
1049 ac->entry[ac->avail++] = objp;
1050}
1051
903/* 1052/*
904 * Transfer objects in one arraycache to another. 1053 * Transfer objects in one arraycache to another.
905 * Locking must be handled by the caller. 1054 * Locking must be handled by the caller.
@@ -1076,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1076 STATS_INC_ACOVERFLOW(cachep); 1225 STATS_INC_ACOVERFLOW(cachep);
1077 __drain_alien_cache(cachep, alien, nodeid); 1226 __drain_alien_cache(cachep, alien, nodeid);
1078 } 1227 }
1079 alien->entry[alien->avail++] = objp; 1228 ac_put_obj(cachep, alien, objp);
1080 spin_unlock(&alien->lock); 1229 spin_unlock(&alien->lock);
1081 } else { 1230 } else {
1082 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1231 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
@@ -1759,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1759 return NULL; 1908 return NULL;
1760 } 1909 }
1761 1910
1911 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1912 if (unlikely(page->pfmemalloc))
1913 pfmemalloc_active = true;
1914
1762 nr_pages = (1 << cachep->gfporder); 1915 nr_pages = (1 << cachep->gfporder);
1763 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1916 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1764 add_zone_page_state(page_zone(page), 1917 add_zone_page_state(page_zone(page),
@@ -1766,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1766 else 1919 else
1767 add_zone_page_state(page_zone(page), 1920 add_zone_page_state(page_zone(page),
1768 NR_SLAB_UNRECLAIMABLE, nr_pages); 1921 NR_SLAB_UNRECLAIMABLE, nr_pages);
1769 for (i = 0; i < nr_pages; i++) 1922 for (i = 0; i < nr_pages; i++) {
1770 __SetPageSlab(page + i); 1923 __SetPageSlab(page + i);
1771 1924
1925 if (page->pfmemalloc)
1926 SetPageSlabPfmemalloc(page + i);
1927 }
1928
1772 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1929 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1773 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1930 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1774 1931
@@ -1800,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1800 NR_SLAB_UNRECLAIMABLE, nr_freed); 1957 NR_SLAB_UNRECLAIMABLE, nr_freed);
1801 while (i--) { 1958 while (i--) {
1802 BUG_ON(!PageSlab(page)); 1959 BUG_ON(!PageSlab(page));
1960 __ClearPageSlabPfmemalloc(page);
1803 __ClearPageSlab(page); 1961 __ClearPageSlab(page);
1804 page++; 1962 page++;
1805 } 1963 }
@@ -3015,16 +3173,19 @@ bad:
3015#define check_slabp(x,y) do { } while(0) 3173#define check_slabp(x,y) do { } while(0)
3016#endif 3174#endif
3017 3175
3018static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 3176static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
3177 bool force_refill)
3019{ 3178{
3020 int batchcount; 3179 int batchcount;
3021 struct kmem_list3 *l3; 3180 struct kmem_list3 *l3;
3022 struct array_cache *ac; 3181 struct array_cache *ac;
3023 int node; 3182 int node;
3024 3183
3025retry:
3026 check_irq_off(); 3184 check_irq_off();
3027 node = numa_mem_id(); 3185 node = numa_mem_id();
3186 if (unlikely(force_refill))
3187 goto force_grow;
3188retry:
3028 ac = cpu_cache_get(cachep); 3189 ac = cpu_cache_get(cachep);
3029 batchcount = ac->batchcount; 3190 batchcount = ac->batchcount;
3030 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 3191 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3074,8 +3235,8 @@ retry:
3074 STATS_INC_ACTIVE(cachep); 3235 STATS_INC_ACTIVE(cachep);
3075 STATS_SET_HIGH(cachep); 3236 STATS_SET_HIGH(cachep);
3076 3237
3077 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 3238 ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
3078 node); 3239 node));
3079 } 3240 }
3080 check_slabp(cachep, slabp); 3241 check_slabp(cachep, slabp);
3081 3242
@@ -3094,18 +3255,22 @@ alloc_done:
3094 3255
3095 if (unlikely(!ac->avail)) { 3256 if (unlikely(!ac->avail)) {
3096 int x; 3257 int x;
3258force_grow:
3097 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3259 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3098 3260
3099 /* cache_grow can reenable interrupts, then ac could change. */ 3261 /* cache_grow can reenable interrupts, then ac could change. */
3100 ac = cpu_cache_get(cachep); 3262 ac = cpu_cache_get(cachep);
3101 if (!x && ac->avail == 0) /* no objects in sight? abort */ 3263
3264 /* no objects in sight? abort */
3265 if (!x && (ac->avail == 0 || force_refill))
3102 return NULL; 3266 return NULL;
3103 3267
3104 if (!ac->avail) /* objects refilled by interrupt? */ 3268 if (!ac->avail) /* objects refilled by interrupt? */
3105 goto retry; 3269 goto retry;
3106 } 3270 }
3107 ac->touched = 1; 3271 ac->touched = 1;
3108 return ac->entry[--ac->avail]; 3272
3273 return ac_get_obj(cachep, ac, flags, force_refill);
3109} 3274}
3110 3275
3111static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3276static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3187,23 +3352,35 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3187{ 3352{
3188 void *objp; 3353 void *objp;
3189 struct array_cache *ac; 3354 struct array_cache *ac;
3355 bool force_refill = false;
3190 3356
3191 check_irq_off(); 3357 check_irq_off();
3192 3358
3193 ac = cpu_cache_get(cachep); 3359 ac = cpu_cache_get(cachep);
3194 if (likely(ac->avail)) { 3360 if (likely(ac->avail)) {
3195 STATS_INC_ALLOCHIT(cachep);
3196 ac->touched = 1; 3361 ac->touched = 1;
3197 objp = ac->entry[--ac->avail]; 3362 objp = ac_get_obj(cachep, ac, flags, false);
3198 } else { 3363
3199 STATS_INC_ALLOCMISS(cachep);
3200 objp = cache_alloc_refill(cachep, flags);
3201 /* 3364 /*
3202 * the 'ac' may be updated by cache_alloc_refill(), 3365 * Allow for the possibility all avail objects are not allowed
3203 * and kmemleak_erase() requires its correct value. 3366 * by the current flags
3204 */ 3367 */
3205 ac = cpu_cache_get(cachep); 3368 if (objp) {
3369 STATS_INC_ALLOCHIT(cachep);
3370 goto out;
3371 }
3372 force_refill = true;
3206 } 3373 }
3374
3375 STATS_INC_ALLOCMISS(cachep);
3376 objp = cache_alloc_refill(cachep, flags, force_refill);
3377 /*
3378 * the 'ac' may be updated by cache_alloc_refill(),
3379 * and kmemleak_erase() requires its correct value.
3380 */
3381 ac = cpu_cache_get(cachep);
3382
3383out:
3207 /* 3384 /*
3208 * To avoid a false negative, if an object that is in one of the 3385 * To avoid a false negative, if an object that is in one of the
3209 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3386 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
@@ -3525,9 +3702,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3525 struct kmem_list3 *l3; 3702 struct kmem_list3 *l3;
3526 3703
3527 for (i = 0; i < nr_objects; i++) { 3704 for (i = 0; i < nr_objects; i++) {
3528 void *objp = objpp[i]; 3705 void *objp;
3529 struct slab *slabp; 3706 struct slab *slabp;
3530 3707
3708 clear_obj_pfmemalloc(&objpp[i]);
3709 objp = objpp[i];
3710
3531 slabp = virt_to_slab(objp); 3711 slabp = virt_to_slab(objp);
3532 l3 = cachep->nodelists[node]; 3712 l3 = cachep->nodelists[node];
3533 list_del(&slabp->list); 3713 list_del(&slabp->list);
@@ -3645,7 +3825,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3645 cache_flusharray(cachep, ac); 3825 cache_flusharray(cachep, ac);
3646 } 3826 }
3647 3827
3648 ac->entry[ac->avail++] = objp; 3828 ac_put_obj(cachep, ac, objp);
3649} 3829}
3650 3830
3651/** 3831/**
diff --git a/mm/slub.c b/mm/slub.c
index e517d435e5dc..8f78e2577031 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -34,6 +34,8 @@
34 34
35#include <trace/events/kmem.h> 35#include <trace/events/kmem.h>
36 36
37#include "internal.h"
38
37/* 39/*
38 * Lock order: 40 * Lock order:
39 * 1. slab_mutex (Global Mutex) 41 * 1. slab_mutex (Global Mutex)
@@ -1354,6 +1356,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1354 inc_slabs_node(s, page_to_nid(page), page->objects); 1356 inc_slabs_node(s, page_to_nid(page), page->objects);
1355 page->slab = s; 1357 page->slab = s;
1356 __SetPageSlab(page); 1358 __SetPageSlab(page);
1359 if (page->pfmemalloc)
1360 SetPageSlabPfmemalloc(page);
1357 1361
1358 start = page_address(page); 1362 start = page_address(page);
1359 1363
@@ -1397,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1397 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1401 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1398 -pages); 1402 -pages);
1399 1403
1404 __ClearPageSlabPfmemalloc(page);
1400 __ClearPageSlab(page); 1405 __ClearPageSlab(page);
1401 reset_page_mapcount(page); 1406 reset_page_mapcount(page);
1402 if (current->reclaim_state) 1407 if (current->reclaim_state)
@@ -2126,6 +2131,14 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2126 return freelist; 2131 return freelist;
2127} 2132}
2128 2133
2134static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2135{
2136 if (unlikely(PageSlabPfmemalloc(page)))
2137 return gfp_pfmemalloc_allowed(gfpflags);
2138
2139 return true;
2140}
2141
2129/* 2142/*
2130 * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist 2143 * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist
2131 * or deactivate the page. 2144 * or deactivate the page.
@@ -2206,6 +2219,18 @@ redo:
2206 goto new_slab; 2219 goto new_slab;
2207 } 2220 }
2208 2221
2222 /*
2223 * By rights, we should be searching for a slab page that was
2224 * PFMEMALLOC but right now, we are losing the pfmemalloc
2225 * information when the page leaves the per-cpu allocator
2226 */
2227 if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2228 deactivate_slab(s, page, c->freelist);
2229 c->page = NULL;
2230 c->freelist = NULL;
2231 goto new_slab;
2232 }
2233
2209 /* must check again c->freelist in case of cpu migration or IRQ */ 2234 /* must check again c->freelist in case of cpu migration or IRQ */
2210 freelist = c->freelist; 2235 freelist = c->freelist;
2211 if (freelist) 2236 if (freelist)
@@ -2256,11 +2281,11 @@ new_slab:
2256 } 2281 }
2257 2282
2258 page = c->page; 2283 page = c->page;
2259 if (likely(!kmem_cache_debug(s))) 2284 if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2260 goto load_freelist; 2285 goto load_freelist;
2261 2286
2262 /* Only entered in the debug case */ 2287 /* Only entered in the debug case */
2263 if (!alloc_debug_processing(s, page, freelist, addr)) 2288 if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
2264 goto new_slab; /* Slab failed checks. Next slab needed */ 2289 goto new_slab; /* Slab failed checks. Next slab needed */
2265 2290
2266 deactivate_slab(s, page, get_freepointer(s, freelist)); 2291 deactivate_slab(s, page, get_freepointer(s, freelist));
@@ -2313,7 +2338,6 @@ redo:
2313 object = c->freelist; 2338 object = c->freelist;
2314 page = c->page; 2339 page = c->page;
2315 if (unlikely(!object || !node_match(page, node))) 2340 if (unlikely(!object || !node_match(page, node)))
2316
2317 object = __slab_alloc(s, gfpflags, node, addr, c); 2341 object = __slab_alloc(s, gfpflags, node, addr, c);
2318 2342
2319 else { 2343 else {
diff --git a/mm/sparse.c b/mm/sparse.c
index c7bb952400c8..fac95f2888f2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
65 65
66 if (slab_is_available()) { 66 if (slab_is_available()) {
67 if (node_state(nid, N_HIGH_MEMORY)) 67 if (node_state(nid, N_HIGH_MEMORY))
68 section = kmalloc_node(array_size, GFP_KERNEL, nid); 68 section = kzalloc_node(array_size, GFP_KERNEL, nid);
69 else 69 else
70 section = kmalloc(array_size, GFP_KERNEL); 70 section = kzalloc(array_size, GFP_KERNEL);
71 } else 71 } else {
72 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 72 section = alloc_bootmem_node(NODE_DATA(nid), array_size);
73 73 }
74 if (section)
75 memset(section, 0, array_size);
76 74
77 return section; 75 return section;
78} 76}
79 77
80static int __meminit sparse_index_init(unsigned long section_nr, int nid) 78static int __meminit sparse_index_init(unsigned long section_nr, int nid)
81{ 79{
82 static DEFINE_SPINLOCK(index_init_lock);
83 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 80 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
84 struct mem_section *section; 81 struct mem_section *section;
85 int ret = 0; 82 int ret = 0;
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
90 section = sparse_index_alloc(nid); 87 section = sparse_index_alloc(nid);
91 if (!section) 88 if (!section)
92 return -ENOMEM; 89 return -ENOMEM;
93 /*
94 * This lock keeps two different sections from
95 * reallocating for the same index
96 */
97 spin_lock(&index_init_lock);
98
99 if (mem_section[root]) {
100 ret = -EEXIST;
101 goto out;
102 }
103 90
104 mem_section[root] = section; 91 mem_section[root] = section;
105out: 92
106 spin_unlock(&index_init_lock);
107 return ret; 93 return ret;
108} 94}
109#else /* !SPARSEMEM_EXTREME */ 95#else /* !SPARSEMEM_EXTREME */
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms)
132 break; 118 break;
133 } 119 }
134 120
121 VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
122
135 return (root_nr * SECTIONS_PER_ROOT) + (ms - root); 123 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
136} 124}
137 125
@@ -493,6 +481,9 @@ void __init sparse_init(void)
493 struct page **map_map; 481 struct page **map_map;
494#endif 482#endif
495 483
484 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
485 set_pageblock_order();
486
496 /* 487 /*
497 * map is using big page (aka 2M in x86 64 bit) 488 * map is using big page (aka 2M in x86 64 bit)
498 * usemap is less one page (aka 24 bytes) 489 * usemap is less one page (aka 24 bytes)
diff --git a/mm/swap.c b/mm/swap.c
index 4e7e2ec67078..77825883298f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -236,6 +236,58 @@ void put_pages_list(struct list_head *pages)
236} 236}
237EXPORT_SYMBOL(put_pages_list); 237EXPORT_SYMBOL(put_pages_list);
238 238
239/*
240 * get_kernel_pages() - pin kernel pages in memory
241 * @kiov: An array of struct kvec structures
242 * @nr_segs: number of segments to pin
243 * @write: pinning for read/write, currently ignored
244 * @pages: array that receives pointers to the pages pinned.
245 * Should be at least nr_segs long.
246 *
247 * Returns number of pages pinned. This may be fewer than the number
 248 * requested. If nr_segs is 0 or negative, returns 0. If no pages
249 * were pinned, returns -errno. Each page returned must be released
250 * with a put_page() call when it is finished with.
251 */
252int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
253 struct page **pages)
254{
255 int seg;
256
257 for (seg = 0; seg < nr_segs; seg++) {
258 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
259 return seg;
260
261 pages[seg] = kmap_to_page(kiov[seg].iov_base);
262 page_cache_get(pages[seg]);
263 }
264
265 return seg;
266}
267EXPORT_SYMBOL_GPL(get_kernel_pages);
268
269/*
270 * get_kernel_page() - pin a kernel page in memory
271 * @start: starting kernel address
272 * @write: pinning for read/write, currently ignored
273 * @pages: array that receives pointer to the page pinned.
 274 *	Must have room for at least one page pointer.
275 *
276 * Returns 1 if page is pinned. If the page was not pinned, returns
277 * -errno. The page returned must be released with a put_page() call
278 * when it is finished with.
279 */
280int get_kernel_page(unsigned long start, int write, struct page **pages)
281{
282 const struct kvec kiov = {
283 .iov_base = (void *)start,
284 .iov_len = PAGE_SIZE
285 };
286
287 return get_kernel_pages(&kiov, 1, write, pages);
288}
289EXPORT_SYMBOL_GPL(get_kernel_page);
290
239static void pagevec_lru_move_fn(struct pagevec *pvec, 291static void pagevec_lru_move_fn(struct pagevec *pvec,
240 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), 292 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
241 void *arg) 293 void *arg)
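get_kernel_pages() and get_kernel_page() give callers such as the swap-over-network paths a way to pin the pages backing a page-aligned kernel buffer. A minimal usage sketch for the single-page case, assuming the declarations this series adds to include/linux/mm.h (the wrapper name is illustrative):

    #include <linux/mm.h>
    #include <linux/pagemap.h>

    /* Pin the page behind one page-aligned kernel address, then release it. */
    static int pin_one_kernel_page(unsigned long addr)
    {
            struct page *page;
            int pinned;

            pinned = get_kernel_page(addr, 0, &page);
            if (pinned != 1)
                    return pinned < 0 ? pinned : -EFAULT;

            /* ... hand the page to a bio, an skb frag, and so on ... */

            put_page(page); /* drop the reference taken by get_kernel_page() */
            return 0;
    }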
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 4c5ff7f284d9..0cb36fb1f61c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/blkdev.h>
17#include <linux/pagevec.h> 18#include <linux/pagevec.h>
18#include <linux/migrate.h> 19#include <linux/migrate.h>
19#include <linux/page_cgroup.h> 20#include <linux/page_cgroup.h>
@@ -26,7 +27,7 @@
26 */ 27 */
27static const struct address_space_operations swap_aops = { 28static const struct address_space_operations swap_aops = {
28 .writepage = swap_writepage, 29 .writepage = swap_writepage,
29 .set_page_dirty = __set_page_dirty_no_writeback, 30 .set_page_dirty = swap_set_page_dirty,
30 .migratepage = migrate_page, 31 .migratepage = migrate_page,
31}; 32};
32 33
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
376 unsigned long offset = swp_offset(entry); 377 unsigned long offset = swp_offset(entry);
377 unsigned long start_offset, end_offset; 378 unsigned long start_offset, end_offset;
378 unsigned long mask = (1UL << page_cluster) - 1; 379 unsigned long mask = (1UL << page_cluster) - 1;
380 struct blk_plug plug;
379 381
380 /* Read a page_cluster sized and aligned cluster around offset. */ 382 /* Read a page_cluster sized and aligned cluster around offset. */
381 start_offset = offset & ~mask; 383 start_offset = offset & ~mask;
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
383 if (!start_offset) /* First page is swap header. */ 385 if (!start_offset) /* First page is swap header. */
384 start_offset++; 386 start_offset++;
385 387
388 blk_start_plug(&plug);
386 for (offset = start_offset; offset <= end_offset ; offset++) { 389 for (offset = start_offset; offset <= end_offset ; offset++) {
387 /* Ok, do the async read-ahead now */ 390 /* Ok, do the async read-ahead now */
388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 391 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
391 continue; 394 continue;
392 page_cache_release(page); 395 page_cache_release(page);
393 } 396 }
397 blk_finish_plug(&plug);
398
394 lru_add_drain(); /* Push any new pages onto the LRU now */ 399 lru_add_drain(); /* Push any new pages onto the LRU now */
395 return read_swap_cache_async(entry, gfp_mask, vma, addr); 400 return read_swap_cache_async(entry, gfp_mask, vma, addr);
396} 401}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 71373d03fcee..14e254c768fc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,6 +33,7 @@
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/frontswap.h> 34#include <linux/frontswap.h>
35#include <linux/swapfile.h> 35#include <linux/swapfile.h>
36#include <linux/export.h>
36 37
37#include <asm/pgtable.h> 38#include <asm/pgtable.h>
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -548,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
548 549
549 /* free if no reference */ 550 /* free if no reference */
550 if (!usage) { 551 if (!usage) {
551 struct gendisk *disk = p->bdev->bd_disk;
552 if (offset < p->lowest_bit) 552 if (offset < p->lowest_bit)
553 p->lowest_bit = offset; 553 p->lowest_bit = offset;
554 if (offset > p->highest_bit) 554 if (offset > p->highest_bit)
@@ -559,9 +559,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
559 nr_swap_pages++; 559 nr_swap_pages++;
560 p->inuse_pages--; 560 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset); 561 frontswap_invalidate_page(p->type, offset);
562 if ((p->flags & SWP_BLKDEV) && 562 if (p->flags & SWP_BLKDEV) {
563 disk->fops->swap_slot_free_notify) 563 struct gendisk *disk = p->bdev->bd_disk;
564 disk->fops->swap_slot_free_notify(p->bdev, offset); 564 if (disk->fops->swap_slot_free_notify)
565 disk->fops->swap_slot_free_notify(p->bdev,
566 offset);
567 }
565 } 568 }
566 569
567 return usage; 570 return usage;
@@ -832,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
832 835
833 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 836 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
834 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 837 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
835 if (ret > 0) 838 mem_cgroup_cancel_charge_swapin(memcg);
836 mem_cgroup_cancel_charge_swapin(memcg);
837 ret = 0; 839 ret = 0;
838 goto out; 840 goto out;
839 } 841 }
@@ -1328,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1328 list_del(&se->list); 1330 list_del(&se->list);
1329 kfree(se); 1331 kfree(se);
1330 } 1332 }
1333
1334 if (sis->flags & SWP_FILE) {
1335 struct file *swap_file = sis->swap_file;
1336 struct address_space *mapping = swap_file->f_mapping;
1337
1338 sis->flags &= ~SWP_FILE;
1339 mapping->a_ops->swap_deactivate(swap_file);
1340 }
1331} 1341}
1332 1342
1333/* 1343/*
@@ -1336,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1336 * 1346 *
1337 * This function rather assumes that it is called in ascending page order. 1347 * This function rather assumes that it is called in ascending page order.
1338 */ 1348 */
1339static int 1349int
1340add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 1350add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1341 unsigned long nr_pages, sector_t start_block) 1351 unsigned long nr_pages, sector_t start_block)
1342{ 1352{
@@ -1409,98 +1419,28 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1409 */ 1419 */
1410static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 1420static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1411{ 1421{
1412 struct inode *inode; 1422 struct file *swap_file = sis->swap_file;
1413 unsigned blocks_per_page; 1423 struct address_space *mapping = swap_file->f_mapping;
1414 unsigned long page_no; 1424 struct inode *inode = mapping->host;
1415 unsigned blkbits;
1416 sector_t probe_block;
1417 sector_t last_block;
1418 sector_t lowest_block = -1;
1419 sector_t highest_block = 0;
1420 int nr_extents = 0;
1421 int ret; 1425 int ret;
1422 1426
1423 inode = sis->swap_file->f_mapping->host;
1424 if (S_ISBLK(inode->i_mode)) { 1427 if (S_ISBLK(inode->i_mode)) {
1425 ret = add_swap_extent(sis, 0, sis->max, 0); 1428 ret = add_swap_extent(sis, 0, sis->max, 0);
1426 *span = sis->pages; 1429 *span = sis->pages;
1427 goto out; 1430 return ret;
1428 } 1431 }
1429 1432
1430 blkbits = inode->i_blkbits; 1433 if (mapping->a_ops->swap_activate) {
1431 blocks_per_page = PAGE_SIZE >> blkbits; 1434 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1432 1435 if (!ret) {
1433 /* 1436 sis->flags |= SWP_FILE;
1434 * Map all the blocks into the extent list. This code doesn't try 1437 ret = add_swap_extent(sis, 0, sis->max, 0);
1435 * to be very smart. 1438 *span = sis->pages;
1436 */
1437 probe_block = 0;
1438 page_no = 0;
1439 last_block = i_size_read(inode) >> blkbits;
1440 while ((probe_block + blocks_per_page) <= last_block &&
1441 page_no < sis->max) {
1442 unsigned block_in_page;
1443 sector_t first_block;
1444
1445 first_block = bmap(inode, probe_block);
1446 if (first_block == 0)
1447 goto bad_bmap;
1448
1449 /*
1450 * It must be PAGE_SIZE aligned on-disk
1451 */
1452 if (first_block & (blocks_per_page - 1)) {
1453 probe_block++;
1454 goto reprobe;
1455 }
1456
1457 for (block_in_page = 1; block_in_page < blocks_per_page;
1458 block_in_page++) {
1459 sector_t block;
1460
1461 block = bmap(inode, probe_block + block_in_page);
1462 if (block == 0)
1463 goto bad_bmap;
1464 if (block != first_block + block_in_page) {
1465 /* Discontiguity */
1466 probe_block++;
1467 goto reprobe;
1468 }
1469 }
1470
1471 first_block >>= (PAGE_SHIFT - blkbits);
1472 if (page_no) { /* exclude the header page */
1473 if (first_block < lowest_block)
1474 lowest_block = first_block;
1475 if (first_block > highest_block)
1476 highest_block = first_block;
1477 } 1439 }
1440 return ret;
1441 }
1478 1442
1479 /* 1443 return generic_swapfile_activate(sis, swap_file, span);
1480 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
1481 */
1482 ret = add_swap_extent(sis, page_no, 1, first_block);
1483 if (ret < 0)
1484 goto out;
1485 nr_extents += ret;
1486 page_no++;
1487 probe_block += blocks_per_page;
1488reprobe:
1489 continue;
1490 }
1491 ret = nr_extents;
1492 *span = 1 + highest_block - lowest_block;
1493 if (page_no == 0)
1494 page_no = 1; /* force Empty message */
1495 sis->max = page_no;
1496 sis->pages = page_no - 1;
1497 sis->highest_bit = page_no - 1;
1498out:
1499 return ret;
1500bad_bmap:
1501 printk(KERN_ERR "swapon: swapfile has holes\n");
1502 ret = -EINVAL;
1503 goto out;
1504} 1444}
1505 1445
1506static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -2285,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry)
2285 return __swap_duplicate(entry, SWAP_HAS_CACHE); 2225 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2286} 2226}
2287 2227
2228struct swap_info_struct *page_swap_info(struct page *page)
2229{
2230 swp_entry_t swap = { .val = page_private(page) };
2231 BUG_ON(!PageSwapCache(page));
2232 return swap_info[swp_type(swap)];
2233}
2234
2235/*
2236 * out-of-line __page_file_ methods to avoid include hell.
2237 */
2238struct address_space *__page_file_mapping(struct page *page)
2239{
2240 VM_BUG_ON(!PageSwapCache(page));
2241 return page_swap_info(page)->swap_file->f_mapping;
2242}
2243EXPORT_SYMBOL_GPL(__page_file_mapping);
2244
2245pgoff_t __page_file_index(struct page *page)
2246{
2247 swp_entry_t swap = { .val = page_private(page) };
2248 VM_BUG_ON(!PageSwapCache(page));
2249 return swp_offset(swap);
2250}
2251EXPORT_SYMBOL_GPL(__page_file_index);
2252
2288/* 2253/*
2289 * add_swap_count_continuation - called when a swap count is duplicated 2254 * add_swap_count_continuation - called when a swap count is duplicated
2290 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 2255 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
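
The swapfile.c changes above replace the bmap() walk with an SWP_FILE path driven by two new address_space operations, swap_activate and swap_deactivate, whose signatures are visible in the hunks. A minimal sketch of how a filesystem might wire them up (the filesystem and function names are illustrative, not part of this patch):

static int myfs_swap_activate(struct swap_info_struct *sis,
			      struct file *swap_file, sector_t *span)
{
	/* Pin or map the file's blocks so later swap I/O never needs bmap().
	 * Returning 0 makes setup_swap_extents() set SWP_FILE and add a
	 * single extent covering the whole file. */
	return 0;
}

static void myfs_swap_deactivate(struct file *swap_file)
{
	/* Release whatever swap_activate() pinned; called from
	 * destroy_swap_extents() at swapoff time. */
}

static const struct address_space_operations myfs_aops = {
	/* ...existing operations... */
	.swap_activate	 = myfs_swap_activate,
	.swap_deactivate = myfs_swap_deactivate,
};
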
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e03f4c7307a5..2bb90b1d241c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -413,11 +413,11 @@ nocache:
413 if (addr + size - 1 < addr) 413 if (addr + size - 1 < addr)
414 goto overflow; 414 goto overflow;
415 415
416 n = rb_next(&first->rb_node); 416 if (list_is_last(&first->list, &vmap_area_list))
417 if (n)
418 first = rb_entry(n, struct vmap_area, rb_node);
419 else
420 goto found; 417 goto found;
418
419 first = list_entry(first->list.next,
420 struct vmap_area, list);
421 } 421 }
422 422
423found: 423found:
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
904 904
905 BUG_ON(size & ~PAGE_MASK); 905 BUG_ON(size & ~PAGE_MASK);
906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
907 if (WARN_ON(size == 0)) {
908 /*
 909 * Allocating 0 bytes isn't what the caller wants since
 910 * get_order(0) returns a funny result. Just warn and terminate
911 * early.
912 */
913 return NULL;
914 }
907 order = get_order(size); 915 order = get_order(size);
908 916
909again: 917again:
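
For context on the new zero-size check: with the generic get_order() the size-- underflows for a zero-byte request and the computed order is enormous. A rough user-space illustration, approximating the kernel helper rather than copying it:

#include <stdio.h>

#define PAGE_SHIFT 12

/* Approximation of the generic get_order(): note the unsigned wrap for 0. */
static int get_order_approx(unsigned long size)
{
	int order = 0;

	size = (size - 1) >> PAGE_SHIFT;	/* 0 - 1 wraps to ULONG_MAX */
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	printf("get_order(0)    ~= %d\n", get_order_approx(0));	/* ~52 on 64-bit */
	printf("get_order(4096) ~= %d\n", get_order_approx(4096));	/* 0 */
	return 0;
}
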
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 347b3ff2a478..8d01243d9560 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */
133static LIST_HEAD(shrinker_list); 133static LIST_HEAD(shrinker_list);
134static DECLARE_RWSEM(shrinker_rwsem); 134static DECLARE_RWSEM(shrinker_rwsem);
135 135
136#ifdef CONFIG_CGROUP_MEM_RES_CTLR 136#ifdef CONFIG_MEMCG
137static bool global_reclaim(struct scan_control *sc) 137static bool global_reclaim(struct scan_control *sc)
138{ 138{
139 return !sc->target_mem_cgroup; 139 return !sc->target_mem_cgroup;
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
687 687
688 cond_resched(); 688 cond_resched();
689 689
690 mem_cgroup_uncharge_start();
690 while (!list_empty(page_list)) { 691 while (!list_empty(page_list)) {
691 enum page_references references; 692 enum page_references references;
692 struct address_space *mapping; 693 struct address_space *mapping;
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list,
720 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 721 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
721 722
722 if (PageWriteback(page)) { 723 if (PageWriteback(page)) {
723 nr_writeback++; 724 /*
724 unlock_page(page); 725 * memcg doesn't have any dirty pages throttling so we
725 goto keep; 726 * could easily OOM just because too many pages are in
727 * writeback and there is nothing else to reclaim.
728 *
729 * Check __GFP_IO, certainly because a loop driver
730 * thread might enter reclaim, and deadlock if it waits
731 * on a page for which it is needed to do the write
732 * (loop masks off __GFP_IO|__GFP_FS for this reason);
733 * but more thought would probably show more reasons.
734 *
735 * Don't require __GFP_FS, since we're not going into
736 * the FS, just waiting on its writeback completion.
 737 * Worryingly, ext4, gfs2 and xfs allocate pages with
738 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
739 * testing may_enter_fs here is liable to OOM on them.
740 */
741 if (global_reclaim(sc) ||
742 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
743 /*
744 * This is slightly racy - end_page_writeback()
745 * might have just cleared PageReclaim, then
 746 * setting PageReclaim here ends up interpreted
747 * as PageReadahead - but that does not matter
748 * enough to care. What we do want is for this
749 * page to have PageReclaim set next time memcg
750 * reclaim reaches the tests above, so it will
751 * then wait_on_page_writeback() to avoid OOM;
752 * and it's also appropriate in global reclaim.
753 */
754 SetPageReclaim(page);
755 nr_writeback++;
756 goto keep_locked;
757 }
758 wait_on_page_writeback(page);
726 } 759 }
727 760
728 references = page_check_references(page, sc); 761 references = page_check_references(page, sc);
@@ -921,6 +954,7 @@ keep:
921 954
922 list_splice(&ret_pages, page_list); 955 list_splice(&ret_pages, page_list);
923 count_vm_events(PGACTIVATE, pgactivate); 956 count_vm_events(PGACTIVATE, pgactivate);
957 mem_cgroup_uncharge_end();
924 *ret_nr_dirty += nr_dirty; 958 *ret_nr_dirty += nr_dirty;
925 *ret_nr_writeback += nr_writeback; 959 *ret_nr_writeback += nr_writeback;
926 return nr_reclaimed; 960 return nr_reclaimed;
@@ -2112,6 +2146,83 @@ out:
2112 return 0; 2146 return 0;
2113} 2147}
2114 2148
2149static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2150{
2151 struct zone *zone;
2152 unsigned long pfmemalloc_reserve = 0;
2153 unsigned long free_pages = 0;
2154 int i;
2155 bool wmark_ok;
2156
2157 for (i = 0; i <= ZONE_NORMAL; i++) {
2158 zone = &pgdat->node_zones[i];
2159 pfmemalloc_reserve += min_wmark_pages(zone);
2160 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2161 }
2162
2163 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2164
2165 /* kswapd must be awake if processes are being throttled */
2166 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2167 pgdat->classzone_idx = min(pgdat->classzone_idx,
2168 (enum zone_type)ZONE_NORMAL);
2169 wake_up_interruptible(&pgdat->kswapd_wait);
2170 }
2171
2172 return wmark_ok;
2173}
2174
2175/*
2176 * Throttle direct reclaimers if backing storage is backed by the network
2177 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2178 * depleted. kswapd will continue to make progress and wake the processes
2179 * when the low watermark is reached
2180 */
2181static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2182 nodemask_t *nodemask)
2183{
2184 struct zone *zone;
2185 int high_zoneidx = gfp_zone(gfp_mask);
2186 pg_data_t *pgdat;
2187
2188 /*
2189 * Kernel threads should not be throttled as they may be indirectly
2190 * responsible for cleaning pages necessary for reclaim to make forward
2191 * progress. kjournald for example may enter direct reclaim while
 2192 * committing a transaction where throttling it could force other
2193 * processes to block on log_wait_commit().
2194 */
2195 if (current->flags & PF_KTHREAD)
2196 return;
2197
2198 /* Check if the pfmemalloc reserves are ok */
2199 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2200 pgdat = zone->zone_pgdat;
2201 if (pfmemalloc_watermark_ok(pgdat))
2202 return;
2203
2204 /* Account for the throttling */
2205 count_vm_event(PGSCAN_DIRECT_THROTTLE);
2206
2207 /*
2208 * If the caller cannot enter the filesystem, it's possible that it
2209 * is due to the caller holding an FS lock or performing a journal
2210 * transaction in the case of a filesystem like ext[3|4]. In this case,
2211 * it is not safe to block on pfmemalloc_wait as kswapd could be
2212 * blocked waiting on the same lock. Instead, throttle for up to a
2213 * second before continuing.
2214 */
2215 if (!(gfp_mask & __GFP_FS)) {
2216 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2217 pfmemalloc_watermark_ok(pgdat), HZ);
2218 return;
2219 }
2220
2221 /* Throttle until kswapd wakes the process */
2222 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2223 pfmemalloc_watermark_ok(pgdat));
2224}
2225
2115unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2226unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2116 gfp_t gfp_mask, nodemask_t *nodemask) 2227 gfp_t gfp_mask, nodemask_t *nodemask)
2117{ 2228{
@@ -2131,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2131 .gfp_mask = sc.gfp_mask, 2242 .gfp_mask = sc.gfp_mask,
2132 }; 2243 };
2133 2244
2245 throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
2246
2247 /*
2248 * Do not enter reclaim if fatal signal is pending. 1 is returned so
2249 * that the page allocator does not consider triggering OOM
2250 */
2251 if (fatal_signal_pending(current))
2252 return 1;
2253
2134 trace_mm_vmscan_direct_reclaim_begin(order, 2254 trace_mm_vmscan_direct_reclaim_begin(order,
2135 sc.may_writepage, 2255 sc.may_writepage,
2136 gfp_mask); 2256 gfp_mask);
@@ -2142,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2142 return nr_reclaimed; 2262 return nr_reclaimed;
2143} 2263}
2144 2264
2145#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2265#ifdef CONFIG_MEMCG
2146 2266
2147unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, 2267unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2148 gfp_t gfp_mask, bool noswap, 2268 gfp_t gfp_mask, bool noswap,
@@ -2275,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2275 return balanced_pages >= (present_pages >> 2); 2395 return balanced_pages >= (present_pages >> 2);
2276} 2396}
2277 2397
2278/* is kswapd sleeping prematurely? */ 2398/*
2279static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, 2399 * Prepare kswapd for sleeping. This verifies that there are no processes
2400 * waiting in throttle_direct_reclaim() and that watermarks have been met.
2401 *
2402 * Returns true if kswapd is ready to sleep
2403 */
2404static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2280 int classzone_idx) 2405 int classzone_idx)
2281{ 2406{
2282 int i; 2407 int i;
@@ -2285,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2285 2410
2286 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2411 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2287 if (remaining) 2412 if (remaining)
2288 return true; 2413 return false;
2414
2415 /*
2416 * There is a potential race between when kswapd checks its watermarks
2417 * and a process gets throttled. There is also a potential race if
 2418 * processes get throttled, kswapd wakes, a large process exits thereby
 2419 * balancing the zones, which causes kswapd to miss a wakeup. If kswapd
2420 * is going to sleep, no process should be sleeping on pfmemalloc_wait
2421 * so wake them now if necessary. If necessary, processes will wake
2422 * kswapd and get throttled again
2423 */
2424 if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2425 wake_up(&pgdat->pfmemalloc_wait);
2426 return false;
2427 }
2289 2428
2290 /* Check the watermark levels */ 2429 /* Check the watermark levels */
2291 for (i = 0; i <= classzone_idx; i++) { 2430 for (i = 0; i <= classzone_idx; i++) {
@@ -2318,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2318 * must be balanced 2457 * must be balanced
2319 */ 2458 */
2320 if (order) 2459 if (order)
2321 return !pgdat_balanced(pgdat, balanced, classzone_idx); 2460 return pgdat_balanced(pgdat, balanced, classzone_idx);
2322 else 2461 else
2323 return !all_zones_ok; 2462 return all_zones_ok;
2324} 2463}
2325 2464
2326/* 2465/*
@@ -2546,6 +2685,16 @@ loop_again:
2546 } 2685 }
2547 2686
2548 } 2687 }
2688
2689 /*
2690 * If the low watermark is met there is no need for processes
2691 * to be throttled on pfmemalloc_wait as they should not be
2692 * able to safely make forward progress. Wake them
2693 */
2694 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
2695 pfmemalloc_watermark_ok(pgdat))
2696 wake_up(&pgdat->pfmemalloc_wait);
2697
2549 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2698 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2550 break; /* kswapd: all done */ 2699 break; /* kswapd: all done */
2551 /* 2700 /*
@@ -2647,7 +2796,7 @@ out:
2647 } 2796 }
2648 2797
2649 /* 2798 /*
2650 * Return the order we were reclaiming at so sleeping_prematurely() 2799 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2651 * makes a decision on the order we were last reclaiming at. However, 2800 * makes a decision on the order we were last reclaiming at. However,
2652 * if another caller entered the allocator slow path while kswapd 2801 * if another caller entered the allocator slow path while kswapd
2653 * was awake, order will remain at the higher level 2802 * was awake, order will remain at the higher level
@@ -2667,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2667 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2816 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2668 2817
2669 /* Try to sleep for a short interval */ 2818 /* Try to sleep for a short interval */
2670 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2819 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2671 remaining = schedule_timeout(HZ/10); 2820 remaining = schedule_timeout(HZ/10);
2672 finish_wait(&pgdat->kswapd_wait, &wait); 2821 finish_wait(&pgdat->kswapd_wait, &wait);
2673 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2822 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2677,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2677 * After a short sleep, check if it was a premature sleep. If not, then 2826 * After a short sleep, check if it was a premature sleep. If not, then
2678 * go fully to sleep until explicitly woken up. 2827 * go fully to sleep until explicitly woken up.
2679 */ 2828 */
2680 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2829 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2681 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 2830 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2682 2831
2683 /* 2832 /*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1bbbbd9776ad..df7a6748231d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -745,6 +745,7 @@ const char * const vmstat_text[] = {
745 TEXTS_FOR_ZONES("pgsteal_direct") 745 TEXTS_FOR_ZONES("pgsteal_direct")
746 TEXTS_FOR_ZONES("pgscan_kswapd") 746 TEXTS_FOR_ZONES("pgscan_kswapd")
747 TEXTS_FOR_ZONES("pgscan_direct") 747 TEXTS_FOR_ZONES("pgscan_direct")
748 "pgscan_direct_throttle",
748 749
749#ifdef CONFIG_NUMA 750#ifdef CONFIG_NUMA
750 "zone_reclaim_failed", 751 "zone_reclaim_failed",
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 78f1cdad5b33..095259f83902 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -141,7 +141,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
141 err = sk_filter(sk, skb); 141 err = sk_filter(sk, skb);
142 if (err) 142 if (err)
143 return err; 143 return err;
144 if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) { 144 if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) {
145 set_rx_flow_off(cf_sk); 145 set_rx_flow_off(cf_sk);
146 net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n"); 146 net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n");
147 caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); 147 caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
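
This and the later sk_rmem_schedule() call sites gain an skb argument so receive-memory accounting can special-case pfmemalloc skbs. The helper itself is changed in include/net/sock.h, outside this section; its assumed shape, inferred from the call sites, is roughly:

/* Assumed shape of the reworked helper: an skb carved out of the emergency
 * reserves is always admitted to the accounting here; sockets without
 * SOCK_MEMALLOC reject such skbs earlier, in sk_filter() (see the
 * net/core/filter.c hunk below).
 */
static inline bool sk_rmem_schedule_sketch(struct sock *sk, struct sk_buff *skb,
					   unsigned int size)
{
	return size <= sk->sk_forward_alloc ||
	       __sk_mem_schedule(sk, size, SK_MEM_RECV) ||
	       skb_pfmemalloc(skb);
}
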
diff --git a/net/core/dev.c b/net/core/dev.c
index c8569f826b71..0cb3fe8d8e72 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3156,6 +3156,23 @@ void netdev_rx_handler_unregister(struct net_device *dev)
3156} 3156}
3157EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 3157EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3158 3158
3159/*
3160 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3161 * the special handling of PFMEMALLOC skbs.
3162 */
3163static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3164{
3165 switch (skb->protocol) {
3166 case __constant_htons(ETH_P_ARP):
3167 case __constant_htons(ETH_P_IP):
3168 case __constant_htons(ETH_P_IPV6):
3169 case __constant_htons(ETH_P_8021Q):
3170 return true;
3171 default:
3172 return false;
3173 }
3174}
3175
3159static int __netif_receive_skb(struct sk_buff *skb) 3176static int __netif_receive_skb(struct sk_buff *skb)
3160{ 3177{
3161 struct packet_type *ptype, *pt_prev; 3178 struct packet_type *ptype, *pt_prev;
@@ -3165,14 +3182,27 @@ static int __netif_receive_skb(struct sk_buff *skb)
3165 bool deliver_exact = false; 3182 bool deliver_exact = false;
3166 int ret = NET_RX_DROP; 3183 int ret = NET_RX_DROP;
3167 __be16 type; 3184 __be16 type;
3185 unsigned long pflags = current->flags;
3168 3186
3169 net_timestamp_check(!netdev_tstamp_prequeue, skb); 3187 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3170 3188
3171 trace_netif_receive_skb(skb); 3189 trace_netif_receive_skb(skb);
3172 3190
3191 /*
3192 * PFMEMALLOC skbs are special, they should
3193 * - be delivered to SOCK_MEMALLOC sockets only
3194 * - stay away from userspace
3195 * - have bounded memory usage
3196 *
3197 * Use PF_MEMALLOC as this saves us from propagating the allocation
3198 * context down to all allocation sites.
3199 */
3200 if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3201 current->flags |= PF_MEMALLOC;
3202
3173 /* if we've gotten here through NAPI, check netpoll */ 3203 /* if we've gotten here through NAPI, check netpoll */
3174 if (netpoll_receive_skb(skb)) 3204 if (netpoll_receive_skb(skb))
3175 return NET_RX_DROP; 3205 goto out;
3176 3206
3177 orig_dev = skb->dev; 3207 orig_dev = skb->dev;
3178 3208
@@ -3192,7 +3222,7 @@ another_round:
3192 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { 3222 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3193 skb = vlan_untag(skb); 3223 skb = vlan_untag(skb);
3194 if (unlikely(!skb)) 3224 if (unlikely(!skb))
3195 goto out; 3225 goto unlock;
3196 } 3226 }
3197 3227
3198#ifdef CONFIG_NET_CLS_ACT 3228#ifdef CONFIG_NET_CLS_ACT
@@ -3202,6 +3232,9 @@ another_round:
3202 } 3232 }
3203#endif 3233#endif
3204 3234
3235 if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3236 goto skip_taps;
3237
3205 list_for_each_entry_rcu(ptype, &ptype_all, list) { 3238 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3206 if (!ptype->dev || ptype->dev == skb->dev) { 3239 if (!ptype->dev || ptype->dev == skb->dev) {
3207 if (pt_prev) 3240 if (pt_prev)
@@ -3210,13 +3243,18 @@ another_round:
3210 } 3243 }
3211 } 3244 }
3212 3245
3246skip_taps:
3213#ifdef CONFIG_NET_CLS_ACT 3247#ifdef CONFIG_NET_CLS_ACT
3214 skb = handle_ing(skb, &pt_prev, &ret, orig_dev); 3248 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3215 if (!skb) 3249 if (!skb)
3216 goto out; 3250 goto unlock;
3217ncls: 3251ncls:
3218#endif 3252#endif
3219 3253
3254 if (sk_memalloc_socks() && skb_pfmemalloc(skb)
3255 && !skb_pfmemalloc_protocol(skb))
3256 goto drop;
3257
3220 rx_handler = rcu_dereference(skb->dev->rx_handler); 3258 rx_handler = rcu_dereference(skb->dev->rx_handler);
3221 if (vlan_tx_tag_present(skb)) { 3259 if (vlan_tx_tag_present(skb)) {
3222 if (pt_prev) { 3260 if (pt_prev) {
@@ -3226,7 +3264,7 @@ ncls:
3226 if (vlan_do_receive(&skb, !rx_handler)) 3264 if (vlan_do_receive(&skb, !rx_handler))
3227 goto another_round; 3265 goto another_round;
3228 else if (unlikely(!skb)) 3266 else if (unlikely(!skb))
3229 goto out; 3267 goto unlock;
3230 } 3268 }
3231 3269
3232 if (rx_handler) { 3270 if (rx_handler) {
@@ -3236,7 +3274,7 @@ ncls:
3236 } 3274 }
3237 switch (rx_handler(&skb)) { 3275 switch (rx_handler(&skb)) {
3238 case RX_HANDLER_CONSUMED: 3276 case RX_HANDLER_CONSUMED:
3239 goto out; 3277 goto unlock;
3240 case RX_HANDLER_ANOTHER: 3278 case RX_HANDLER_ANOTHER:
3241 goto another_round; 3279 goto another_round;
3242 case RX_HANDLER_EXACT: 3280 case RX_HANDLER_EXACT:
@@ -3269,6 +3307,7 @@ ncls:
3269 else 3307 else
3270 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 3308 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3271 } else { 3309 } else {
3310drop:
3272 atomic_long_inc(&skb->dev->rx_dropped); 3311 atomic_long_inc(&skb->dev->rx_dropped);
3273 kfree_skb(skb); 3312 kfree_skb(skb);
3274 /* Jamal, now you will not able to escape explaining 3313 /* Jamal, now you will not able to escape explaining
@@ -3277,8 +3316,10 @@ ncls:
3277 ret = NET_RX_DROP; 3316 ret = NET_RX_DROP;
3278 } 3317 }
3279 3318
3280out: 3319unlock:
3281 rcu_read_unlock(); 3320 rcu_read_unlock();
3321out:
3322 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3282 return ret; 3323 return ret;
3283} 3324}
3284 3325
diff --git a/net/core/filter.c b/net/core/filter.c
index d4ce2dc712e3..907efd27ec77 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -83,6 +83,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
83 int err; 83 int err;
84 struct sk_filter *filter; 84 struct sk_filter *filter;
85 85
86 /*
87 * If the skb was allocated from pfmemalloc reserves, only
88 * allow SOCK_MEMALLOC sockets to use it as this socket is
89 * helping free memory
90 */
91 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
92 return -ENOMEM;
93
86 err = security_sock_rcv_skb(sk, skb); 94 err = security_sock_rcv_skb(sk, skb);
87 if (err) 95 if (err)
88 return err; 96 return err;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 368f65c15e4f..fe00d1208167 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -145,6 +145,43 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
145 BUG(); 145 BUG();
146} 146}
147 147
148
149/*
150 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
151 * the caller if emergency pfmemalloc reserves are being used. If it is and
152 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
153 * may be used. Otherwise, the packet data may be discarded until enough
154 * memory is free
155 */
156#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
157 __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
158void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip,
159 bool *pfmemalloc)
160{
161 void *obj;
162 bool ret_pfmemalloc = false;
163
164 /*
 165 * Try a regular allocation; when that fails and we're not entitled
166 * to the reserves, fail.
167 */
168 obj = kmalloc_node_track_caller(size,
169 flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
170 node);
171 if (obj || !(gfp_pfmemalloc_allowed(flags)))
172 goto out;
173
174 /* Try again but now we are using pfmemalloc reserves */
175 ret_pfmemalloc = true;
176 obj = kmalloc_node_track_caller(size, flags, node);
177
178out:
179 if (pfmemalloc)
180 *pfmemalloc = ret_pfmemalloc;
181
182 return obj;
183}
184
148/* Allocate a new skbuff. We do this ourselves so we can fill in a few 185/* Allocate a new skbuff. We do this ourselves so we can fill in a few
149 * 'private' fields and also do memory statistics to find all the 186 * 'private' fields and also do memory statistics to find all the
150 * [BEEP] leaks. 187 * [BEEP] leaks.
@@ -155,8 +192,10 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
155 * __alloc_skb - allocate a network buffer 192 * __alloc_skb - allocate a network buffer
156 * @size: size to allocate 193 * @size: size to allocate
157 * @gfp_mask: allocation mask 194 * @gfp_mask: allocation mask
158 * @fclone: allocate from fclone cache instead of head cache 195 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
159 * and allocate a cloned (child) skb 196 * instead of head cache and allocate a cloned (child) skb.
197 * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
198 * allocations in case the data is required for writeback
160 * @node: numa node to allocate memory on 199 * @node: numa node to allocate memory on
161 * 200 *
162 * Allocate a new &sk_buff. The returned buffer has no headroom and a 201 * Allocate a new &sk_buff. The returned buffer has no headroom and a
@@ -167,14 +206,19 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
167 * %GFP_ATOMIC. 206 * %GFP_ATOMIC.
168 */ 207 */
169struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, 208struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
170 int fclone, int node) 209 int flags, int node)
171{ 210{
172 struct kmem_cache *cache; 211 struct kmem_cache *cache;
173 struct skb_shared_info *shinfo; 212 struct skb_shared_info *shinfo;
174 struct sk_buff *skb; 213 struct sk_buff *skb;
175 u8 *data; 214 u8 *data;
215 bool pfmemalloc;
176 216
177 cache = fclone ? skbuff_fclone_cache : skbuff_head_cache; 217 cache = (flags & SKB_ALLOC_FCLONE)
218 ? skbuff_fclone_cache : skbuff_head_cache;
219
220 if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
221 gfp_mask |= __GFP_MEMALLOC;
178 222
179 /* Get the HEAD */ 223 /* Get the HEAD */
180 skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); 224 skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
@@ -189,7 +233,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
189 */ 233 */
190 size = SKB_DATA_ALIGN(size); 234 size = SKB_DATA_ALIGN(size);
191 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 235 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
192 data = kmalloc_node_track_caller(size, gfp_mask, node); 236 data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
193 if (!data) 237 if (!data)
194 goto nodata; 238 goto nodata;
195 /* kmalloc(size) might give us more room than requested. 239 /* kmalloc(size) might give us more room than requested.
@@ -207,6 +251,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
207 memset(skb, 0, offsetof(struct sk_buff, tail)); 251 memset(skb, 0, offsetof(struct sk_buff, tail));
208 /* Account for allocated memory : skb + skb->head */ 252 /* Account for allocated memory : skb + skb->head */
209 skb->truesize = SKB_TRUESIZE(size); 253 skb->truesize = SKB_TRUESIZE(size);
254 skb->pfmemalloc = pfmemalloc;
210 atomic_set(&skb->users, 1); 255 atomic_set(&skb->users, 1);
211 skb->head = data; 256 skb->head = data;
212 skb->data = data; 257 skb->data = data;
@@ -222,7 +267,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
222 atomic_set(&shinfo->dataref, 1); 267 atomic_set(&shinfo->dataref, 1);
223 kmemcheck_annotate_variable(shinfo->destructor_arg); 268 kmemcheck_annotate_variable(shinfo->destructor_arg);
224 269
225 if (fclone) { 270 if (flags & SKB_ALLOC_FCLONE) {
226 struct sk_buff *child = skb + 1; 271 struct sk_buff *child = skb + 1;
227 atomic_t *fclone_ref = (atomic_t *) (child + 1); 272 atomic_t *fclone_ref = (atomic_t *) (child + 1);
228 273
@@ -232,6 +277,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
232 atomic_set(fclone_ref, 1); 277 atomic_set(fclone_ref, 1);
233 278
234 child->fclone = SKB_FCLONE_UNAVAILABLE; 279 child->fclone = SKB_FCLONE_UNAVAILABLE;
280 child->pfmemalloc = pfmemalloc;
235 } 281 }
236out: 282out:
237 return skb; 283 return skb;
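
A hypothetical driver-side sketch of the new flags argument (the helper name is made up; __netdev_alloc_skb() below does the equivalent internally):

static struct sk_buff *mydrv_alloc_rx_skb(unsigned int len)
{
	struct sk_buff *skb;

	/* SKB_ALLOC_RX lets the data allocation fall back to __GFP_MEMALLOC
	 * once SOCK_MEMALLOC sockets exist. */
	skb = __alloc_skb(len, GFP_ATOMIC, SKB_ALLOC_RX, NUMA_NO_NODE);
	if (skb && skb_pfmemalloc(skb))
		pr_debug("rx skb backed by emergency reserves\n");
	return skb;
}
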
@@ -302,14 +348,7 @@ static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
302 348
303#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES) 349#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES)
304 350
305/** 351static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
306 * netdev_alloc_frag - allocate a page fragment
307 * @fragsz: fragment size
308 *
309 * Allocates a frag from a page for receive buffer.
310 * Uses GFP_ATOMIC allocations.
311 */
312void *netdev_alloc_frag(unsigned int fragsz)
313{ 352{
314 struct netdev_alloc_cache *nc; 353 struct netdev_alloc_cache *nc;
315 void *data = NULL; 354 void *data = NULL;
@@ -319,7 +358,7 @@ void *netdev_alloc_frag(unsigned int fragsz)
319 nc = &__get_cpu_var(netdev_alloc_cache); 358 nc = &__get_cpu_var(netdev_alloc_cache);
320 if (unlikely(!nc->page)) { 359 if (unlikely(!nc->page)) {
321refill: 360refill:
322 nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD); 361 nc->page = alloc_page(gfp_mask);
323 if (unlikely(!nc->page)) 362 if (unlikely(!nc->page))
324 goto end; 363 goto end;
325recycle: 364recycle:
@@ -343,6 +382,18 @@ end:
343 local_irq_restore(flags); 382 local_irq_restore(flags);
344 return data; 383 return data;
345} 384}
385
386/**
387 * netdev_alloc_frag - allocate a page fragment
388 * @fragsz: fragment size
389 *
390 * Allocates a frag from a page for receive buffer.
391 * Uses GFP_ATOMIC allocations.
392 */
393void *netdev_alloc_frag(unsigned int fragsz)
394{
395 return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
396}
346EXPORT_SYMBOL(netdev_alloc_frag); 397EXPORT_SYMBOL(netdev_alloc_frag);
347 398
348/** 399/**
@@ -366,7 +417,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
366 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 417 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
367 418
368 if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { 419 if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
369 void *data = netdev_alloc_frag(fragsz); 420 void *data;
421
422 if (sk_memalloc_socks())
423 gfp_mask |= __GFP_MEMALLOC;
424
425 data = __netdev_alloc_frag(fragsz, gfp_mask);
370 426
371 if (likely(data)) { 427 if (likely(data)) {
372 skb = build_skb(data, fragsz); 428 skb = build_skb(data, fragsz);
@@ -374,7 +430,8 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
374 put_page(virt_to_head_page(data)); 430 put_page(virt_to_head_page(data));
375 } 431 }
376 } else { 432 } else {
377 skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); 433 skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
434 SKB_ALLOC_RX, NUMA_NO_NODE);
378 } 435 }
379 if (likely(skb)) { 436 if (likely(skb)) {
380 skb_reserve(skb, NET_SKB_PAD); 437 skb_reserve(skb, NET_SKB_PAD);
@@ -656,6 +713,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
656#if IS_ENABLED(CONFIG_IP_VS) 713#if IS_ENABLED(CONFIG_IP_VS)
657 new->ipvs_property = old->ipvs_property; 714 new->ipvs_property = old->ipvs_property;
658#endif 715#endif
716 new->pfmemalloc = old->pfmemalloc;
659 new->protocol = old->protocol; 717 new->protocol = old->protocol;
660 new->mark = old->mark; 718 new->mark = old->mark;
661 new->skb_iif = old->skb_iif; 719 new->skb_iif = old->skb_iif;
@@ -814,6 +872,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
814 n->fclone = SKB_FCLONE_CLONE; 872 n->fclone = SKB_FCLONE_CLONE;
815 atomic_inc(fclone_ref); 873 atomic_inc(fclone_ref);
816 } else { 874 } else {
875 if (skb_pfmemalloc(skb))
876 gfp_mask |= __GFP_MEMALLOC;
877
817 n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); 878 n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
818 if (!n) 879 if (!n)
819 return NULL; 880 return NULL;
@@ -850,6 +911,13 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
850 skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; 911 skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
851} 912}
852 913
914static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
915{
916 if (skb_pfmemalloc(skb))
917 return SKB_ALLOC_RX;
918 return 0;
919}
920
853/** 921/**
854 * skb_copy - create private copy of an sk_buff 922 * skb_copy - create private copy of an sk_buff
855 * @skb: buffer to copy 923 * @skb: buffer to copy
@@ -871,7 +939,8 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
871{ 939{
872 int headerlen = skb_headroom(skb); 940 int headerlen = skb_headroom(skb);
873 unsigned int size = skb_end_offset(skb) + skb->data_len; 941 unsigned int size = skb_end_offset(skb) + skb->data_len;
874 struct sk_buff *n = alloc_skb(size, gfp_mask); 942 struct sk_buff *n = __alloc_skb(size, gfp_mask,
943 skb_alloc_rx_flag(skb), NUMA_NO_NODE);
875 944
876 if (!n) 945 if (!n)
877 return NULL; 946 return NULL;
@@ -906,7 +975,8 @@ EXPORT_SYMBOL(skb_copy);
906struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) 975struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
907{ 976{
908 unsigned int size = skb_headlen(skb) + headroom; 977 unsigned int size = skb_headlen(skb) + headroom;
909 struct sk_buff *n = alloc_skb(size, gfp_mask); 978 struct sk_buff *n = __alloc_skb(size, gfp_mask,
979 skb_alloc_rx_flag(skb), NUMA_NO_NODE);
910 980
911 if (!n) 981 if (!n)
912 goto out; 982 goto out;
@@ -979,8 +1049,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
979 1049
980 size = SKB_DATA_ALIGN(size); 1050 size = SKB_DATA_ALIGN(size);
981 1051
982 data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), 1052 if (skb_pfmemalloc(skb))
983 gfp_mask); 1053 gfp_mask |= __GFP_MEMALLOC;
1054 data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
1055 gfp_mask, NUMA_NO_NODE, NULL);
984 if (!data) 1056 if (!data)
985 goto nodata; 1057 goto nodata;
986 size = SKB_WITH_OVERHEAD(ksize(data)); 1058 size = SKB_WITH_OVERHEAD(ksize(data));
@@ -1092,8 +1164,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
1092 /* 1164 /*
1093 * Allocate the copy buffer 1165 * Allocate the copy buffer
1094 */ 1166 */
1095 struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, 1167 struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
1096 gfp_mask); 1168 gfp_mask, skb_alloc_rx_flag(skb),
1169 NUMA_NO_NODE);
1097 int oldheadroom = skb_headroom(skb); 1170 int oldheadroom = skb_headroom(skb);
1098 int head_copy_len, head_copy_off; 1171 int head_copy_len, head_copy_off;
1099 int off; 1172 int off;
@@ -2775,8 +2848,9 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
2775 skb_release_head_state(nskb); 2848 skb_release_head_state(nskb);
2776 __skb_push(nskb, doffset); 2849 __skb_push(nskb, doffset);
2777 } else { 2850 } else {
2778 nskb = alloc_skb(hsize + doffset + headroom, 2851 nskb = __alloc_skb(hsize + doffset + headroom,
2779 GFP_ATOMIC); 2852 GFP_ATOMIC, skb_alloc_rx_flag(skb),
2853 NUMA_NO_NODE);
2780 2854
2781 if (unlikely(!nskb)) 2855 if (unlikely(!nskb))
2782 goto err; 2856 goto err;
diff --git a/net/core/sock.c b/net/core/sock.c
index 2676a88f533e..6b654b3ddfda 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -142,7 +142,7 @@
142static DEFINE_MUTEX(proto_list_mutex); 142static DEFINE_MUTEX(proto_list_mutex);
143static LIST_HEAD(proto_list); 143static LIST_HEAD(proto_list);
144 144
145#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 145#ifdef CONFIG_MEMCG_KMEM
146int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 146int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
147{ 147{
148 struct proto *proto; 148 struct proto *proto;
@@ -271,6 +271,61 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
271int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 271int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
272EXPORT_SYMBOL(sysctl_optmem_max); 272EXPORT_SYMBOL(sysctl_optmem_max);
273 273
274struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
275EXPORT_SYMBOL_GPL(memalloc_socks);
276
277/**
278 * sk_set_memalloc - sets %SOCK_MEMALLOC
279 * @sk: socket to set it on
280 *
281 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
282 * It's the responsibility of the admin to adjust min_free_kbytes
283 * to meet the requirements
284 */
285void sk_set_memalloc(struct sock *sk)
286{
287 sock_set_flag(sk, SOCK_MEMALLOC);
288 sk->sk_allocation |= __GFP_MEMALLOC;
289 static_key_slow_inc(&memalloc_socks);
290}
291EXPORT_SYMBOL_GPL(sk_set_memalloc);
292
293void sk_clear_memalloc(struct sock *sk)
294{
295 sock_reset_flag(sk, SOCK_MEMALLOC);
296 sk->sk_allocation &= ~__GFP_MEMALLOC;
297 static_key_slow_dec(&memalloc_socks);
298
299 /*
300 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
301 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
302 * it has rmem allocations there is a risk that the user of the
303 * socket cannot make forward progress due to exceeding the rmem
304 * limits. By rights, sk_clear_memalloc() should only be called
305 * on sockets being torn down but warn and reset the accounting if
306 * that assumption breaks.
307 */
308 if (WARN_ON(sk->sk_forward_alloc))
309 sk_mem_reclaim(sk);
310}
311EXPORT_SYMBOL_GPL(sk_clear_memalloc);
312
313int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
314{
315 int ret;
316 unsigned long pflags = current->flags;
317
318 /* these should have been dropped before queueing */
319 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
320
321 current->flags |= PF_MEMALLOC;
322 ret = sk->sk_backlog_rcv(sk, skb);
323 tsk_restore_flags(current, pflags, PF_MEMALLOC);
324
325 return ret;
326}
327EXPORT_SYMBOL(__sk_backlog_rcv);
328
274#if defined(CONFIG_CGROUPS) 329#if defined(CONFIG_CGROUPS)
275#if !defined(CONFIG_NET_CLS_CGROUP) 330#if !defined(CONFIG_NET_CLS_CGROUP)
276int net_cls_subsys_id = -1; 331int net_cls_subsys_id = -1;
@@ -353,7 +408,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
353 if (err) 408 if (err)
354 return err; 409 return err;
355 410
356 if (!sk_rmem_schedule(sk, skb->truesize)) { 411 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
357 atomic_inc(&sk->sk_drops); 412 atomic_inc(&sk->sk_drops);
358 return -ENOBUFS; 413 return -ENOBUFS;
359 } 414 }
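
A usage sketch of the new socket tagging, modelled loosely on the nbd change listed in the diffstat (the driver name and helper are illustrative only):

static void mydrv_mark_swap_socket(struct socket *sock)
{
	struct sock *sk = sock->sk;

	/* Keep socket allocations out of the FS/IO reclaim paths and let
	 * them dip into the reserves while the socket carries swap traffic. */
	sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
	sk_set_memalloc(sk);	/* sets SOCK_MEMALLOC and bumps the static key */
}
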
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ae2ccf2890e4..15ca63ec604e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -49,7 +49,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
49obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o 49obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
50obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o 50obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
51obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o 51obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
52obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcontrol.o 52obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
53obj-$(CONFIG_NETLABEL) += cipso_ipv4.o 53obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
54 54
55obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 55obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4b6487a68279..1b5ce96707a3 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -184,7 +184,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
184 int ret; 184 int ret;
185 unsigned long vec[3]; 185 unsigned long vec[3];
186 struct net *net = current->nsproxy->net_ns; 186 struct net *net = current->nsproxy->net_ns;
187#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 187#ifdef CONFIG_MEMCG_KMEM
188 struct mem_cgroup *memcg; 188 struct mem_cgroup *memcg;
189#endif 189#endif
190 190
@@ -203,7 +203,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
203 if (ret) 203 if (ret)
204 return ret; 204 return ret;
205 205
206#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 206#ifdef CONFIG_MEMCG_KMEM
207 rcu_read_lock(); 207 rcu_read_lock();
208 memcg = mem_cgroup_from_task(current); 208 memcg = mem_cgroup_from_task(current);
209 209
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9be30b039ae3..2fd2bc9e3c64 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4351,19 +4351,20 @@ static void tcp_ofo_queue(struct sock *sk)
4351static bool tcp_prune_ofo_queue(struct sock *sk); 4351static bool tcp_prune_ofo_queue(struct sock *sk);
4352static int tcp_prune_queue(struct sock *sk); 4352static int tcp_prune_queue(struct sock *sk);
4353 4353
4354static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) 4354static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4355 unsigned int size)
4355{ 4356{
4356 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || 4357 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
4357 !sk_rmem_schedule(sk, size)) { 4358 !sk_rmem_schedule(sk, skb, size)) {
4358 4359
4359 if (tcp_prune_queue(sk) < 0) 4360 if (tcp_prune_queue(sk) < 0)
4360 return -1; 4361 return -1;
4361 4362
4362 if (!sk_rmem_schedule(sk, size)) { 4363 if (!sk_rmem_schedule(sk, skb, size)) {
4363 if (!tcp_prune_ofo_queue(sk)) 4364 if (!tcp_prune_ofo_queue(sk))
4364 return -1; 4365 return -1;
4365 4366
4366 if (!sk_rmem_schedule(sk, size)) 4367 if (!sk_rmem_schedule(sk, skb, size))
4367 return -1; 4368 return -1;
4368 } 4369 }
4369 } 4370 }
@@ -4418,7 +4419,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4418 4419
4419 TCP_ECN_check_ce(tp, skb); 4420 TCP_ECN_check_ce(tp, skb);
4420 4421
4421 if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) { 4422 if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
4422 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); 4423 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
4423 __kfree_skb(skb); 4424 __kfree_skb(skb);
4424 return; 4425 return;
@@ -4552,17 +4553,17 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
4552 4553
4553int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) 4554int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4554{ 4555{
4555 struct sk_buff *skb; 4556 struct sk_buff *skb = NULL;
4556 struct tcphdr *th; 4557 struct tcphdr *th;
4557 bool fragstolen; 4558 bool fragstolen;
4558 4559
4559 if (tcp_try_rmem_schedule(sk, size + sizeof(*th)))
4560 goto err;
4561
4562 skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); 4560 skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
4563 if (!skb) 4561 if (!skb)
4564 goto err; 4562 goto err;
4565 4563
4564 if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
4565 goto err_free;
4566
4566 th = (struct tcphdr *)skb_put(skb, sizeof(*th)); 4567 th = (struct tcphdr *)skb_put(skb, sizeof(*th));
4567 skb_reset_transport_header(skb); 4568 skb_reset_transport_header(skb);
4568 memset(th, 0, sizeof(*th)); 4569 memset(th, 0, sizeof(*th));
@@ -4633,7 +4634,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4633 if (eaten <= 0) { 4634 if (eaten <= 0) {
4634queue_and_out: 4635queue_and_out:
4635 if (eaten < 0 && 4636 if (eaten < 0 &&
4636 tcp_try_rmem_schedule(sk, skb->truesize)) 4637 tcp_try_rmem_schedule(sk, skb, skb->truesize))
4637 goto drop; 4638 goto drop;
4638 4639
4639 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); 4640 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7f91e5ac8277..42b2a6a73092 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2633,7 +2633,7 @@ struct proto tcp_prot = {
2633 .compat_setsockopt = compat_tcp_setsockopt, 2633 .compat_setsockopt = compat_tcp_setsockopt,
2634 .compat_getsockopt = compat_tcp_getsockopt, 2634 .compat_getsockopt = compat_tcp_getsockopt,
2635#endif 2635#endif
2636#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 2636#ifdef CONFIG_MEMCG_KMEM
2637 .init_cgroup = tcp_init_cgroup, 2637 .init_cgroup = tcp_init_cgroup,
2638 .destroy_cgroup = tcp_destroy_cgroup, 2638 .destroy_cgroup = tcp_destroy_cgroup,
2639 .proto_cgroup = tcp_proto_cgroup, 2639 .proto_cgroup = tcp_proto_cgroup,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 33cd065cfbd8..3f1bcff0b10b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2045,7 +2045,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
2045 if (unlikely(sk->sk_state == TCP_CLOSE)) 2045 if (unlikely(sk->sk_state == TCP_CLOSE))
2046 return; 2046 return;
2047 2047
2048 if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC)) 2048 if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
2049 sk_gfp_atomic(sk, GFP_ATOMIC)))
2049 tcp_check_probe_timer(sk); 2050 tcp_check_probe_timer(sk);
2050} 2051}
2051 2052
@@ -2666,7 +2667,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2666 2667
2667 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) 2668 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
2668 s_data_desired = cvp->s_data_desired; 2669 s_data_desired = cvp->s_data_desired;
2669 skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC); 2670 skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired,
2671 sk_gfp_atomic(sk, GFP_ATOMIC));
2670 if (unlikely(!skb)) { 2672 if (unlikely(!skb)) {
2671 dst_release(dst); 2673 dst_release(dst);
2672 return NULL; 2674 return NULL;
@@ -3064,7 +3066,7 @@ void tcp_send_ack(struct sock *sk)
3064 * tcp_transmit_skb() will set the ownership to this 3066 * tcp_transmit_skb() will set the ownership to this
3065 * sock. 3067 * sock.
3066 */ 3068 */
3067 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); 3069 buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3068 if (buff == NULL) { 3070 if (buff == NULL) {
3069 inet_csk_schedule_ack(sk); 3071 inet_csk_schedule_ack(sk);
3070 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; 3072 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
@@ -3079,7 +3081,7 @@ void tcp_send_ack(struct sock *sk)
3079 3081
3080 /* Send it off, this clears delayed acks for us. */ 3082 /* Send it off, this clears delayed acks for us. */
3081 TCP_SKB_CB(buff)->when = tcp_time_stamp; 3083 TCP_SKB_CB(buff)->when = tcp_time_stamp;
3082 tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); 3084 tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
3083} 3085}
3084 3086
3085/* This routine sends a packet with an out of date sequence 3087/* This routine sends a packet with an out of date sequence
@@ -3099,7 +3101,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
3099 struct sk_buff *skb; 3101 struct sk_buff *skb;
3100 3102
3101 /* We don't queue it, tcp_transmit_skb() sets ownership. */ 3103 /* We don't queue it, tcp_transmit_skb() sets ownership. */
3102 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); 3104 skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3103 if (skb == NULL) 3105 if (skb == NULL)
3104 return -1; 3106 return -1;
3105 3107
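
The tcp_output.c changes replace raw GFP_ATOMIC with sk_gfp_atomic(). Its definition is not part of this hunk; the assumed shape, added to include/net/sock.h elsewhere in the series, is roughly:

static inline gfp_t sk_gfp_atomic(struct sock *sk, gfp_t gfp_mask)
{
	/* Atomic allocation, plus access to the reserves only if the socket
	 * itself was granted __GFP_MEMALLOC via sk_set_memalloc(). */
	return GFP_ATOMIC | (sk->sk_allocation & __GFP_MEMALLOC);
}
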
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 221224e72507..c66b90f71c9b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1299,7 +1299,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1299 /* Clone pktoptions received with SYN */ 1299 /* Clone pktoptions received with SYN */
1300 newnp->pktoptions = NULL; 1300 newnp->pktoptions = NULL;
1301 if (treq->pktopts != NULL) { 1301 if (treq->pktopts != NULL) {
1302 newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC); 1302 newnp->pktoptions = skb_clone(treq->pktopts,
1303 sk_gfp_atomic(sk, GFP_ATOMIC));
1303 consume_skb(treq->pktopts); 1304 consume_skb(treq->pktopts);
1304 treq->pktopts = NULL; 1305 treq->pktopts = NULL;
1305 if (newnp->pktoptions) 1306 if (newnp->pktoptions)
@@ -1349,7 +1350,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1349 * across. Shucks. 1350 * across. Shucks.
1350 */ 1351 */
1351 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newnp->daddr, 1352 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newnp->daddr,
1352 AF_INET6, key->key, key->keylen, GFP_ATOMIC); 1353 AF_INET6, key->key, key->keylen,
1354 sk_gfp_atomic(sk, GFP_ATOMIC));
1353 } 1355 }
1354#endif 1356#endif
1355 1357
@@ -1442,7 +1444,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
1442 --ANK (980728) 1444 --ANK (980728)
1443 */ 1445 */
1444 if (np->rxopt.all) 1446 if (np->rxopt.all)
1445 opt_skb = skb_clone(skb, GFP_ATOMIC); 1447 opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC));
1446 1448
1447 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1449 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1448 sock_rps_save_rxhash(sk, skb); 1450 sock_rps_save_rxhash(sk, skb);
@@ -2015,7 +2017,7 @@ struct proto tcpv6_prot = {
2015 .compat_setsockopt = compat_tcp_setsockopt, 2017 .compat_setsockopt = compat_tcp_setsockopt,
2016 .compat_getsockopt = compat_tcp_getsockopt, 2018 .compat_getsockopt = compat_tcp_getsockopt,
2017#endif 2019#endif
2018#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 2020#ifdef CONFIG_MEMCG_KMEM
2019 .proto_cgroup = tcp_proto_cgroup, 2021 .proto_cgroup = tcp_proto_cgroup,
2020#endif 2022#endif
2021}; 2023};
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 33d894776192..10c018a5b9fe 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -702,7 +702,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
702 if (rx_count >= asoc->base.sk->sk_rcvbuf) { 702 if (rx_count >= asoc->base.sk->sk_rcvbuf) {
703 703
704 if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) || 704 if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
705 (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize))) 705 (!sk_rmem_schedule(asoc->base.sk, chunk->skb,
706 chunk->skb->truesize)))
706 goto fail; 707 goto fail;
707 } 708 }
708 709
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 9fe8857d8d59..03d03e37a7d5 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -21,6 +21,11 @@ config SUNRPC_XPRT_RDMA
21 21
22 If unsure, say N. 22 If unsure, say N.
23 23
24config SUNRPC_SWAP
25 bool
26 depends on SUNRPC
27 select NETVM
28
24config RPCSEC_GSS_KRB5 29config RPCSEC_GSS_KRB5
25 tristate "Secure RPC: Kerberos V mechanism" 30 tristate "Secure RPC: Kerberos V mechanism"
26 depends on SUNRPC && CRYPTO 31 depends on SUNRPC && CRYPTO
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b05df36692ff..fa48c60aef23 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -717,6 +717,15 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
717 atomic_inc(&clnt->cl_count); 717 atomic_inc(&clnt->cl_count);
718 if (clnt->cl_softrtry) 718 if (clnt->cl_softrtry)
719 task->tk_flags |= RPC_TASK_SOFT; 719 task->tk_flags |= RPC_TASK_SOFT;
720 if (sk_memalloc_socks()) {
721 struct rpc_xprt *xprt;
722
723 rcu_read_lock();
724 xprt = rcu_dereference(clnt->cl_xprt);
725 if (xprt->swapper)
726 task->tk_flags |= RPC_TASK_SWAPPER;
727 rcu_read_unlock();
728 }
720 /* Add to the client's list of all tasks */ 729 /* Add to the client's list of all tasks */
721 spin_lock(&clnt->cl_lock); 730 spin_lock(&clnt->cl_lock);
722 list_add_tail(&task->tk_task, &clnt->cl_tasks); 731 list_add_tail(&task->tk_task, &clnt->cl_tasks);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 1f19aa15f89b..128494ec9a64 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -815,7 +815,10 @@ static void rpc_async_schedule(struct work_struct *work)
815void *rpc_malloc(struct rpc_task *task, size_t size) 815void *rpc_malloc(struct rpc_task *task, size_t size)
816{ 816{
817 struct rpc_buffer *buf; 817 struct rpc_buffer *buf;
818 gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT; 818 gfp_t gfp = GFP_NOWAIT;
819
820 if (RPC_IS_SWAPPER(task))
821 gfp |= __GFP_MEMALLOC;
819 822
820 size += sizeof(struct rpc_buffer); 823 size += sizeof(struct rpc_buffer);
821 if (size <= RPC_BUFFER_MAXSIZE) 824 if (size <= RPC_BUFFER_MAXSIZE)
@@ -889,7 +892,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
889static struct rpc_task * 892static struct rpc_task *
890rpc_alloc_task(void) 893rpc_alloc_task(void)
891{ 894{
892 return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); 895 return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO);
893} 896}
894 897
895/* 898/*
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 926679459e71..400567243f84 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1930,6 +1930,45 @@ out:
1930 current->flags &= ~PF_FSTRANS; 1930 current->flags &= ~PF_FSTRANS;
1931} 1931}
1932 1932
1933#ifdef CONFIG_SUNRPC_SWAP
1934static void xs_set_memalloc(struct rpc_xprt *xprt)
1935{
1936 struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
1937 xprt);
1938
1939 if (xprt->swapper)
1940 sk_set_memalloc(transport->inet);
1941}
1942
1943/**
1944 * xs_swapper - Tag this transport as being used for swap.
1945 * @xprt: transport to tag
1946 * @enable: enable/disable
1947 *
1948 */
1949int xs_swapper(struct rpc_xprt *xprt, int enable)
1950{
1951 struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
1952 xprt);
1953 int err = 0;
1954
1955 if (enable) {
1956 xprt->swapper++;
1957 xs_set_memalloc(xprt);
1958 } else if (xprt->swapper) {
1959 xprt->swapper--;
1960 sk_clear_memalloc(transport->inet);
1961 }
1962
1963 return err;
1964}
1965EXPORT_SYMBOL_GPL(xs_swapper);
1966#else
1967static void xs_set_memalloc(struct rpc_xprt *xprt)
1968{
1969}
1970#endif
1971
1933static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) 1972static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
1934{ 1973{
1935 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 1974 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -1954,6 +1993,8 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
1954 transport->sock = sock; 1993 transport->sock = sock;
1955 transport->inet = sk; 1994 transport->inet = sk;
1956 1995
1996 xs_set_memalloc(xprt);
1997
1957 write_unlock_bh(&sk->sk_callback_lock); 1998 write_unlock_bh(&sk->sk_callback_lock);
1958 } 1999 }
1959 xs_udp_do_set_buffer_size(xprt); 2000 xs_udp_do_set_buffer_size(xprt);
@@ -2081,6 +2122,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2081 if (!xprt_bound(xprt)) 2122 if (!xprt_bound(xprt))
2082 goto out; 2123 goto out;
2083 2124
2125 xs_set_memalloc(xprt);
2126
2084 /* Tell the socket layer to start connecting... */ 2127 /* Tell the socket layer to start connecting... */
2085 xprt->stat.connect_count++; 2128 xprt->stat.connect_count++;
2086 xprt->stat.connect_start = jiffies; 2129 xprt->stat.connect_start = jiffies;
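
xs_swapper() is exported for callers outside sunrpc; the real caller lives in the NFS client and is only hinted at by the fs/nfs entries in the diffstat. A hedged sketch of such a caller, with an illustrative name:

static int my_swapfile_tag_transport(struct rpc_xprt *xprt, bool enable)
{
	/* enable=1 on swapon: bumps xprt->swapper and marks the transport
	 * socket SOCK_MEMALLOC; enable=0 on swapoff: decrements the count
	 * and clears the tag. */
	return xs_swapper(xprt, enable ? 1 : 0);
}
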
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 68d82daed257..4d3fab47e643 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -274,7 +274,7 @@ static struct avc_node *avc_alloc_node(void)
274{ 274{
275 struct avc_node *node; 275 struct avc_node *node;
276 276
277 node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC); 277 node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC|__GFP_NOMEMALLOC);
278 if (!node) 278 if (!node)
279 goto out; 279 goto out;
280 280
diff --git a/tools/testing/fault-injection/failcmd.sh b/tools/testing/fault-injection/failcmd.sh
index 1776e924b202..78a9ed7fecdb 100644
--- a/tools/testing/fault-injection/failcmd.sh
+++ b/tools/testing/fault-injection/failcmd.sh
@@ -206,7 +206,7 @@ while true; do
206 esac 206 esac
207done 207done
208 208
209[ -z "$@" ] && exit 0 209[ -z "$1" ] && exit 0
210 210
211echo $oom_kill_allocating_task > /proc/sys/vm/oom_kill_allocating_task 211echo $oom_kill_allocating_task > /proc/sys/vm/oom_kill_allocating_task
212echo $task_filter > $FAULTATTR/task-filter 212echo $task_filter > $FAULTATTR/task-filter