131 files changed, 3174 insertions, 1060 deletions
diff --git a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads
new file mode 100644
index 000000000000..b0b0eeb20fe3
--- /dev/null
+++ b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads
@@ -0,0 +1,5 @@
+What:		/proc/sys/vm/nr_pdflush_threads
+Date:		June 2012
+Contact:	Wanpeng Li <liwp@linux.vnet.ibm.com>
+Description:	Since pdflush has been replaced by per-BDI flusher threads, the old
+		pdflush interface exported in /proc/sys/vm/ should be removed.
diff --git a/Documentation/cgroups/hugetlb.txt b/Documentation/cgroups/hugetlb.txt
new file mode 100644
index 000000000000..a9faaca1f029
--- /dev/null
+++ b/Documentation/cgroups/hugetlb.txt
@@ -0,0 +1,45 @@
+HugeTLB Controller
+-------------------
+
+The HugeTLB controller allows limiting HugeTLB usage per control group and
+enforces the limit at page fault time. Since HugeTLB does not support
+page reclaim, enforcing the limit at page fault time means that an
+application will receive a SIGBUS signal if it tries to access HugeTLB
+pages beyond its limit. The application therefore needs to know in advance
+how many HugeTLB pages it will use.
+
+The HugeTLB controller hierarchy is created by first mounting the cgroup filesystem.
+
+# mount -t cgroup -o hugetlb none /sys/fs/cgroup
+
+With the above step, the initial or the parent HugeTLB group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+
+New groups can be created under the parent group /sys/fs/cgroup.
+
+# cd /sys/fs/cgroup
+# mkdir g1
+# echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it.
+
+Brief summary of control files
+
+ hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
+ hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
+ hugetlb.<hugepagesize>.usage_in_bytes     # show current res_counter usage for "hugepagesize" hugetlb
+ hugetlb.<hugepagesize>.failcnt            # show the number of allocation failures due to the HugeTLB limit
+
+For a system supporting two hugepage sizes (16M and 16G), the control
+files include:
+
+hugetlb.16GB.limit_in_bytes
+hugetlb.16GB.max_usage_in_bytes
+hugetlb.16GB.usage_in_bytes
+hugetlb.16GB.failcnt
+hugetlb.16MB.limit_in_bytes
+hugetlb.16MB.max_usage_in_bytes
+hugetlb.16MB.usage_in_bytes
+hugetlb.16MB.failcnt
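
A minimal userspace sketch of driving the control files documented above. It assumes a 2MB hugepage size and the g1 group created in the example; the hugetlb.<size>.* file names come from the summary, but the exact <size> string and mount layout depend on the system.

/* Sketch: set a HugeTLB limit for group g1 and read back its failcnt.
 * Assumes the hugetlb controller is mounted at /sys/fs/cgroup and that
 * the system's hugepage size is 2MB; adjust the "2MB" component to match
 * the files actually present in the cgroup directory. */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	char buf[64];
	FILE *f;

	/* Limit g1 to 64MB worth of 2MB hugepages (32 pages). */
	if (write_str("/sys/fs/cgroup/g1/hugetlb.2MB.limit_in_bytes",
		      "67108864"))
		perror("limit_in_bytes");

	/* failcnt counts allocations rejected because of the limit;
	 * each rejected fault delivers SIGBUS to the faulting task. */
	f = fopen("/sys/fs/cgroup/g1/hugetlb.2MB.failcnt", "r");
	if (f && fgets(buf, sizeof(buf), f))
		printf("hugetlb.2MB.failcnt: %s", buf);
	if (f)
		fclose(f);
	return 0;
}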
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index dd88540bb995..4372e6b8a353 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -73,6 +73,8 @@ Brief summary of control files.
 
 memory.kmem.tcp.limit_in_bytes  # set/show hard limit for tcp buf memory
 memory.kmem.tcp.usage_in_bytes  # show current tcp buf memory allocation
+memory.kmem.tcp.failcnt            # show the number of times tcp buf memory usage hit the limit
+memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded
 
 1. History
 
@@ -187,12 +189,12 @@ the cgroup that brought it in -- this will happen on memory pressure).
 But see section 8.2: when moving a task to another cgroup, its pages may
 be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
 
-Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used.
+Exception: If CONFIG_MEMCG_SWAP is not used.
 When you do swapoff and make swapped-out pages of shmem(tmpfs) to
 be backed into memory in force, charges for pages are accounted against the
 caller of swapoff rather than the users of shmem.
 
-2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP)
+2.4 Swap Extension (CONFIG_MEMCG_SWAP)
 
 Swap Extension allows you to record charge for swap. A swapped-in page is
 charged back to original page allocator if possible.
@@ -259,7 +261,7 @@ When oom event notifier is registered, event will be delivered.
 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
 zone->lru_lock, it has no lock of its own.
 
-2.7 Kernel Memory Extension (CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
+2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
 
 With the Kernel memory extension, the Memory Controller is able to limit
 the amount of kernel memory used by the system. Kernel memory is fundamentally
@@ -286,8 +288,8 @@ per cgroup, instead of globally.
 
 a. Enable CONFIG_CGROUPS
 b. Enable CONFIG_RESOURCE_COUNTERS
-c. Enable CONFIG_CGROUP_MEM_RES_CTLR
-d. Enable CONFIG_CGROUP_MEM_RES_CTLR_SWAP (to use swap extension)
+c. Enable CONFIG_MEMCG
+d. Enable CONFIG_MEMCG_SWAP (to use swap extension)
 
 1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
 # mount -t tmpfs none /sys/fs/cgroup
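
A small hedged sketch of reading the two tcp kmem statistics files added above. The mount point /sys/fs/cgroup/memory is an assumption for illustration, not part of the patch.

/* Sketch: print the tcp buffer memory failcnt and recorded maximum
 * for the root memory cgroup. */
#include <stdio.h>

static void show(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	if (f)
		fclose(f);
}

int main(void)
{
	show("/sys/fs/cgroup/memory/memory.kmem.tcp.failcnt");
	show("/sys/fs/cgroup/memory/memory.kmem.tcp.max_usage_in_bytes");
	return 0;
}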
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 24fec7603e5e..72ed15075f79 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -13,6 +13,14 @@ Who: Jim Cromie <jim.cromie@gmail.com>, Jason Baron <jbaron@redhat.com>
 
 ---------------------------
 
+What:	/proc/sys/vm/nr_pdflush_threads
+When:	2012
+Why:	Since pdflush is deprecated, the interface exported in /proc/sys/vm/
+	should be removed.
+Who:	Wanpeng Li <liwp@linux.vnet.ibm.com>
+
+---------------------------
+
 What:	CONFIG_APM_CPU_IDLE, and its ability to call APM BIOS in idle
 When:	2012
 Why:	This optional sub-feature of APM is of dubious reliability,
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index e0cce2a5f820..2db1900d7538 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -206,6 +206,8 @@ prototypes:
 	int (*launder_page)(struct page *);
 	int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long);
 	int (*error_remove_page)(struct address_space *, struct page *);
+	int (*swap_activate)(struct file *);
+	int (*swap_deactivate)(struct file *);
 
 locking rules:
 	All except set_page_dirty and freepage may block
@@ -229,6 +231,8 @@ migratepage: yes (both)
 launder_page:		yes
 is_partially_uptodate:	yes
 error_remove_page:	yes
+swap_activate:		no
+swap_deactivate:	no
 
 	->write_begin(), ->write_end(), ->sync_page() and ->readpage()
 may be called from the request handler (/dev/loop).
@@ -330,6 +334,15 @@ cleaned, or an error value if not. Note that in order to prevent the page
 getting mapped back in and redirtied, it needs to be kept locked
 across the entire operation.
 
+	->swap_activate will be called with a non-zero argument on
+files backing (non block device backed) swapfiles. A return value
+of zero indicates success, in which case this file can be used for
+backing swapspace. The swapspace operations will be proxied to the
+address space operations.
+
+	->swap_deactivate() will be called in the sys_swapoff()
+path after ->swap_activate() returned success.
+
 ----------------------- file_lock_operations ------------------------------
 prototypes:
 	void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
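
To make the new hooks concrete, here is a hedged sketch of how a filesystem might wire them up. The prototype listed above is abbreviated; the NFS implementation later in this diff (fs/nfs/file.c) takes (struct swap_info_struct *sis, struct file *file, sector_t *span), which is what the sketch follows. The filesystem name and the aops structure are hypothetical, not part of the patch.

/* Hypothetical filesystem: minimal ->swap_activate/->swap_deactivate pair,
 * modeled on the NFS hooks added later in this series. Everything named
 * examplefs_* is illustrative only. */
#include <linux/fs.h>
#include <linux/swap.h>

static int examplefs_swap_activate(struct swap_info_struct *sis,
				   struct file *file, sector_t *span)
{
	/* Tell the swap core how many pages this file can back; the swap
	 * I/O itself is proxied through file->f_mapping's address space
	 * operations, as described above. */
	*span = sis->pages;

	/* A real implementation would allocate space if necessary and pin
	 * its block lookup information here; return 0 on success. */
	return 0;
}

static void examplefs_swap_deactivate(struct file *file)
{
	/* Undo whatever swap_activate set up; called from sys_swapoff()
	 * only if swap_activate returned success. */
}

static const struct address_space_operations examplefs_aops = {
	/* ...the usual readpage/writepage/etc. entries would go here... */
	.swap_activate	 = examplefs_swap_activate,
	.swap_deactivate = examplefs_swap_deactivate,
};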
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index aa754e01464e..065aa2dc0835 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -592,6 +592,8 @@ struct address_space_operations {
 	int (*migratepage) (struct page *, struct page *);
 	int (*launder_page) (struct page *);
 	int (*error_remove_page) (struct mapping *mapping, struct page *page);
+	int (*swap_activate)(struct file *);
+	int (*swap_deactivate)(struct file *);
 };
 
   writepage: called by the VM to write a dirty page to backing store.
@@ -760,6 +762,16 @@ struct address_space_operations {
 	Setting this implies you deal with pages going away under you,
 	unless you have them locked or reference counts increased.
 
+  swap_activate: Called when swapon is used on a file to allocate
+	space if necessary and pin the block lookup information in
+	memory. A return value of zero indicates success,
+	in which case this file can be used to back swapspace. The
+	swapspace operations will be proxied to this address space's
+	operations.
+
+  swap_deactivate: Called during swapoff on files where swap_activate
+	was successful.
+
 
 The File Object
 ===============
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 96f0ee825bed..dcc2a94ae34e 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -42,7 +42,6 @@ Currently, these files are in /proc/sys/vm:
 - mmap_min_addr
 - nr_hugepages
 - nr_overcommit_hugepages
-- nr_pdflush_threads
 - nr_trim_pages         (only if CONFIG_MMU=n)
 - numa_zonelist_order
 - oom_dump_tasks
@@ -426,16 +425,6 @@ See Documentation/vm/hugetlbpage.txt
 
 ==============================================================
 
-nr_pdflush_threads
-
-The current number of pdflush threads. This value is read-only.
-The value changes according to the number of dirty pages in the system.
-
-When necessary, additional pdflush threads are created, one per second, up to
-nr_pdflush_threads_max.
-
-==============================================================
-
 nr_trim_pages
 
 This is available only on NOMMU kernels.
@@ -502,9 +491,10 @@ oom_dump_tasks
 
 Enables a system-wide task dump (excluding kernel threads) to be
 produced when the kernel performs an OOM-killing and includes such
-information as pid, uid, tgid, vm size, rss, cpu, oom_adj score, and
-name. This is helpful to determine why the OOM killer was invoked
-and to identify the rogue task that caused it.
+information as pid, uid, tgid, vm size, rss, nr_ptes, swapents,
+oom_score_adj score, and name. This is helpful to determine why the
+OOM killer was invoked, to identify the rogue task that caused it,
+and to determine why the OOM killer chose the task it did to kill.
 
 If this is set to zero, this information is suppressed. On very
 large systems with thousands of tasks it may not be feasible to dump
@@ -574,16 +564,24 @@ of physical RAM. See above.
 
 page-cluster
 
-page-cluster controls the number of pages which are written to swap in
-a single attempt. The swap I/O size.
+page-cluster controls the number of consecutive pages that are read in from
+swap in a single attempt. This is the swap counterpart to page cache
+readahead.
+The pages are not consecutive in terms of virtual or physical addresses,
+but consecutive in swap space - that means they were swapped out together.
 
 It is a logarithmic value - setting it to zero means "1 page", setting
 it to 1 means "2 pages", setting it to 2 means "4 pages", etc.
+Zero disables swap readahead completely.
 
 The default value is three (eight pages at a time). There may be some
 small benefits in tuning this to a different value if your workload is
 swap-intensive.
 
+Lower values mean lower latency for the initial fault, but also extra
+faults and I/O delays for subsequent faults that the larger readahead
+would have brought in along with it.
+
 =============================================================
 
 panic_on_oom
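
The page-cluster value above is logarithmic: a setting of n means up to 2^n consecutive pages of swap readahead per fault. A minimal sketch reading the current value and reporting the page count it implies; only the sysctl path comes from the document, the rest is illustrative.

/* Sketch: report how many pages the current /proc/sys/vm/page-cluster
 * setting corresponds to (pages = 1 << value). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/page-cluster", "r");
	int val;

	if (!f || fscanf(f, "%d", &val) != 1) {
		perror("page-cluster");
		return 1;
	}
	fclose(f);

	if (val == 0)
		printf("page-cluster = 0: swap readahead disabled (single page per fault)\n");
	else
		printf("page-cluster = %d: up to %d consecutive pages per attempt\n",
		       val, 1 << val);
	return 0;
}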
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index d7f558c1e711..3fa4bc536953 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c | |||
@@ -2353,7 +2353,6 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t | |||
2353 | */ | 2353 | */ |
2354 | insert_vm_struct(mm, vma); | 2354 | insert_vm_struct(mm, vma); |
2355 | 2355 | ||
2356 | mm->total_vm += size >> PAGE_SHIFT; | ||
2357 | vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, | 2356 | vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, |
2358 | vma_pages(vma)); | 2357 | vma_pages(vma)); |
2359 | up_write(&task->mm->mmap_sem); | 2358 | up_write(&task->mm->mmap_sem); |
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index b105eca3c020..cd8fcab6b054 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c | |||
@@ -401,6 +401,7 @@ static void __init node_mem_init(cnodeid_t node) | |||
401 | * Allocate the node data structures on the node first. | 401 | * Allocate the node data structures on the node first. |
402 | */ | 402 | */ |
403 | __node_data[node] = __va(slot_freepfn << PAGE_SHIFT); | 403 | __node_data[node] = __va(slot_freepfn << PAGE_SHIFT); |
404 | memset(__node_data[node], 0, PAGE_SIZE); | ||
404 | 405 | ||
405 | NODE_DATA(node)->bdata = &bootmem_node_data[node]; | 406 | NODE_DATA(node)->bdata = &bootmem_node_data[node]; |
406 | NODE_DATA(node)->node_start_pfn = start_pfn; | 407 | NODE_DATA(node)->node_start_pfn = start_pfn; |
diff --git a/arch/powerpc/configs/chroma_defconfig b/arch/powerpc/configs/chroma_defconfig index b1f9597fe312..29bb11ec6c64 100644 --- a/arch/powerpc/configs/chroma_defconfig +++ b/arch/powerpc/configs/chroma_defconfig | |||
@@ -21,8 +21,8 @@ CONFIG_CGROUP_DEVICE=y | |||
21 | CONFIG_CPUSETS=y | 21 | CONFIG_CPUSETS=y |
22 | CONFIG_CGROUP_CPUACCT=y | 22 | CONFIG_CGROUP_CPUACCT=y |
23 | CONFIG_RESOURCE_COUNTERS=y | 23 | CONFIG_RESOURCE_COUNTERS=y |
24 | CONFIG_CGROUP_MEM_RES_CTLR=y | 24 | CONFIG_CGROUP_MEMCG=y |
25 | CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y | 25 | CONFIG_CGROUP_MEMCG_SWAP=y |
26 | CONFIG_NAMESPACES=y | 26 | CONFIG_NAMESPACES=y |
27 | CONFIG_RELAY=y | 27 | CONFIG_RELAY=y |
28 | CONFIG_BLK_DEV_INITRD=y | 28 | CONFIG_BLK_DEV_INITRD=y |
diff --git a/arch/s390/defconfig b/arch/s390/defconfig index 967923dea98d..f39cd710980b 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig | |||
@@ -16,7 +16,7 @@ CONFIG_CGROUPS=y | |||
16 | CONFIG_CPUSETS=y | 16 | CONFIG_CPUSETS=y |
17 | CONFIG_CGROUP_CPUACCT=y | 17 | CONFIG_CGROUP_CPUACCT=y |
18 | CONFIG_RESOURCE_COUNTERS=y | 18 | CONFIG_RESOURCE_COUNTERS=y |
19 | CONFIG_CGROUP_MEM_RES_CTLR=y | 19 | CONFIG_CGROUP_MEMCG=y |
20 | CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y | 20 | CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y |
21 | CONFIG_CGROUP_SCHED=y | 21 | CONFIG_CGROUP_SCHED=y |
22 | CONFIG_RT_GROUP_SCHED=y | 22 | CONFIG_RT_GROUP_SCHED=y |
diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig index e7583484cc07..95ae23fcfdd6 100644 --- a/arch/sh/configs/apsh4ad0a_defconfig +++ b/arch/sh/configs/apsh4ad0a_defconfig | |||
@@ -11,7 +11,7 @@ CONFIG_CGROUP_FREEZER=y | |||
11 | CONFIG_CGROUP_DEVICE=y | 11 | CONFIG_CGROUP_DEVICE=y |
12 | CONFIG_CGROUP_CPUACCT=y | 12 | CONFIG_CGROUP_CPUACCT=y |
13 | CONFIG_RESOURCE_COUNTERS=y | 13 | CONFIG_RESOURCE_COUNTERS=y |
14 | CONFIG_CGROUP_MEM_RES_CTLR=y | 14 | CONFIG_CGROUP_MEMCG=y |
15 | CONFIG_BLK_CGROUP=y | 15 | CONFIG_BLK_CGROUP=y |
16 | CONFIG_NAMESPACES=y | 16 | CONFIG_NAMESPACES=y |
17 | CONFIG_BLK_DEV_INITRD=y | 17 | CONFIG_BLK_DEV_INITRD=y |
diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index 8a7dd7b59c5c..76a76a295d74 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig | |||
@@ -18,8 +18,8 @@ CONFIG_CPUSETS=y | |||
18 | # CONFIG_PROC_PID_CPUSET is not set | 18 | # CONFIG_PROC_PID_CPUSET is not set |
19 | CONFIG_CGROUP_CPUACCT=y | 19 | CONFIG_CGROUP_CPUACCT=y |
20 | CONFIG_RESOURCE_COUNTERS=y | 20 | CONFIG_RESOURCE_COUNTERS=y |
21 | CONFIG_CGROUP_MEM_RES_CTLR=y | 21 | CONFIG_CGROUP_MEMCG=y |
22 | CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y | 22 | CONFIG_CGROUP_MEMCG_SWAP=y |
23 | CONFIG_CGROUP_SCHED=y | 23 | CONFIG_CGROUP_SCHED=y |
24 | CONFIG_RT_GROUP_SCHED=y | 24 | CONFIG_RT_GROUP_SCHED=y |
25 | CONFIG_BLK_CGROUP=y | 25 | CONFIG_BLK_CGROUP=y |
diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig index 72c3fad7383f..6bc30ab9fd18 100644 --- a/arch/sh/configs/se7206_defconfig +++ b/arch/sh/configs/se7206_defconfig | |||
@@ -11,7 +11,7 @@ CONFIG_CGROUP_DEBUG=y | |||
11 | CONFIG_CGROUP_DEVICE=y | 11 | CONFIG_CGROUP_DEVICE=y |
12 | CONFIG_CGROUP_CPUACCT=y | 12 | CONFIG_CGROUP_CPUACCT=y |
13 | CONFIG_RESOURCE_COUNTERS=y | 13 | CONFIG_RESOURCE_COUNTERS=y |
14 | CONFIG_CGROUP_MEM_RES_CTLR=y | 14 | CONFIG_CGROUP_MEMCG=y |
15 | CONFIG_RELAY=y | 15 | CONFIG_RELAY=y |
16 | CONFIG_NAMESPACES=y | 16 | CONFIG_NAMESPACES=y |
17 | CONFIG_UTS_NS=y | 17 | CONFIG_UTS_NS=y |
diff --git a/arch/sh/configs/shx3_defconfig b/arch/sh/configs/shx3_defconfig index 6bb413036892..cd6c519f8fad 100644 --- a/arch/sh/configs/shx3_defconfig +++ b/arch/sh/configs/shx3_defconfig | |||
@@ -13,7 +13,7 @@ CONFIG_CGROUP_FREEZER=y | |||
13 | CONFIG_CGROUP_DEVICE=y | 13 | CONFIG_CGROUP_DEVICE=y |
14 | CONFIG_CGROUP_CPUACCT=y | 14 | CONFIG_CGROUP_CPUACCT=y |
15 | CONFIG_RESOURCE_COUNTERS=y | 15 | CONFIG_RESOURCE_COUNTERS=y |
16 | CONFIG_CGROUP_MEM_RES_CTLR=y | 16 | CONFIG_CGROUP_MEMCG=y |
17 | CONFIG_RELAY=y | 17 | CONFIG_RELAY=y |
18 | CONFIG_NAMESPACES=y | 18 | CONFIG_NAMESPACES=y |
19 | CONFIG_UTS_NS=y | 19 | CONFIG_UTS_NS=y |
diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig index 8bfa4d056d7a..d7f89be9f474 100644 --- a/arch/sh/configs/urquell_defconfig +++ b/arch/sh/configs/urquell_defconfig | |||
@@ -15,8 +15,8 @@ CONFIG_CPUSETS=y | |||
15 | # CONFIG_PROC_PID_CPUSET is not set | 15 | # CONFIG_PROC_PID_CPUSET is not set |
16 | CONFIG_CGROUP_CPUACCT=y | 16 | CONFIG_CGROUP_CPUACCT=y |
17 | CONFIG_RESOURCE_COUNTERS=y | 17 | CONFIG_RESOURCE_COUNTERS=y |
18 | CONFIG_CGROUP_MEM_RES_CTLR=y | 18 | CONFIG_CGROUP_MEMCG=y |
19 | CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y | 19 | CONFIG_CGROUP_MEMCG_SWAP=y |
20 | CONFIG_CGROUP_SCHED=y | 20 | CONFIG_CGROUP_SCHED=y |
21 | CONFIG_RT_GROUP_SCHED=y | 21 | CONFIG_RT_GROUP_SCHED=y |
22 | CONFIG_BLK_DEV_INITRD=y | 22 | CONFIG_BLK_DEV_INITRD=y |
diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig index b8d99aca5431..0270620a1692 100644 --- a/arch/tile/configs/tilegx_defconfig +++ b/arch/tile/configs/tilegx_defconfig | |||
@@ -18,8 +18,8 @@ CONFIG_CGROUP_DEVICE=y | |||
18 | CONFIG_CPUSETS=y | 18 | CONFIG_CPUSETS=y |
19 | CONFIG_CGROUP_CPUACCT=y | 19 | CONFIG_CGROUP_CPUACCT=y |
20 | CONFIG_RESOURCE_COUNTERS=y | 20 | CONFIG_RESOURCE_COUNTERS=y |
21 | CONFIG_CGROUP_MEM_RES_CTLR=y | 21 | CONFIG_CGROUP_MEMCG=y |
22 | CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y | 22 | CONFIG_CGROUP_MEMCG_SWAP=y |
23 | CONFIG_CGROUP_SCHED=y | 23 | CONFIG_CGROUP_SCHED=y |
24 | CONFIG_RT_GROUP_SCHED=y | 24 | CONFIG_RT_GROUP_SCHED=y |
25 | CONFIG_BLK_CGROUP=y | 25 | CONFIG_BLK_CGROUP=y |
diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig index 2b1fd31894f1..c11de27a9bcb 100644 --- a/arch/tile/configs/tilepro_defconfig +++ b/arch/tile/configs/tilepro_defconfig | |||
@@ -17,8 +17,8 @@ CONFIG_CGROUP_DEVICE=y | |||
17 | CONFIG_CPUSETS=y | 17 | CONFIG_CPUSETS=y |
18 | CONFIG_CGROUP_CPUACCT=y | 18 | CONFIG_CGROUP_CPUACCT=y |
19 | CONFIG_RESOURCE_COUNTERS=y | 19 | CONFIG_RESOURCE_COUNTERS=y |
20 | CONFIG_CGROUP_MEM_RES_CTLR=y | 20 | CONFIG_CGROUP_MEMCG=y |
21 | CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y | 21 | CONFIG_CGROUP_MEMCG_SWAP=y |
22 | CONFIG_CGROUP_SCHED=y | 22 | CONFIG_CGROUP_SCHED=y |
23 | CONFIG_RT_GROUP_SCHED=y | 23 | CONFIG_RT_GROUP_SCHED=y |
24 | CONFIG_BLK_CGROUP=y | 24 | CONFIG_BLK_CGROUP=y |
diff --git a/arch/um/defconfig b/arch/um/defconfig index 7823ab12e6a4..fec0d5d27460 100644 --- a/arch/um/defconfig +++ b/arch/um/defconfig | |||
@@ -155,10 +155,10 @@ CONFIG_CPUSETS=y | |||
155 | CONFIG_PROC_PID_CPUSET=y | 155 | CONFIG_PROC_PID_CPUSET=y |
156 | CONFIG_CGROUP_CPUACCT=y | 156 | CONFIG_CGROUP_CPUACCT=y |
157 | CONFIG_RESOURCE_COUNTERS=y | 157 | CONFIG_RESOURCE_COUNTERS=y |
158 | CONFIG_CGROUP_MEM_RES_CTLR=y | 158 | CONFIG_CGROUP_MEMCG=y |
159 | CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y | 159 | CONFIG_CGROUP_MEMCG_SWAP=y |
160 | # CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED is not set | 160 | # CONFIG_CGROUP_MEMCG_SWAP_ENABLED is not set |
161 | # CONFIG_CGROUP_MEM_RES_CTLR_KMEM is not set | 161 | # CONFIG_CGROUP_MEMCG_KMEM is not set |
162 | CONFIG_CGROUP_SCHED=y | 162 | CONFIG_CGROUP_SCHED=y |
163 | CONFIG_FAIR_GROUP_SCHED=y | 163 | CONFIG_FAIR_GROUP_SCHED=y |
164 | # CONFIG_CFS_BANDWIDTH is not set | 164 | # CONFIG_CFS_BANDWIDTH is not set |
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 8a3f8351f438..8ed64cfae4ff 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig | |||
@@ -7,6 +7,7 @@ config ZONE_DMA | |||
7 | config XTENSA | 7 | config XTENSA |
8 | def_bool y | 8 | def_bool y |
9 | select HAVE_IDE | 9 | select HAVE_IDE |
10 | select GENERIC_ATOMIC64 | ||
10 | select HAVE_GENERIC_HARDIRQS | 11 | select HAVE_GENERIC_HARDIRQS |
11 | select GENERIC_IRQ_SHOW | 12 | select GENERIC_IRQ_SHOW |
12 | select GENERIC_CPU_DEVICES | 13 | select GENERIC_CPU_DEVICES |
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 9b21469482ae..08b4c5209384 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig | |||
@@ -196,6 +196,7 @@ config CMA | |||
196 | bool "Contiguous Memory Allocator (EXPERIMENTAL)" | 196 | bool "Contiguous Memory Allocator (EXPERIMENTAL)" |
197 | depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL | 197 | depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL |
198 | select MIGRATION | 198 | select MIGRATION |
199 | select MEMORY_ISOLATION | ||
199 | help | 200 | help |
200 | This enables the Contiguous Memory Allocator which allows drivers | 201 | This enables the Contiguous Memory Allocator which allows drivers |
201 | to allocate big physically-contiguous blocks of memory for use with | 202 | to allocate big physically-contiguous blocks of memory for use with |
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 061427a75d37..76bc96fd01c8 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c | |||
@@ -154,6 +154,7 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, | |||
154 | struct msghdr msg; | 154 | struct msghdr msg; |
155 | struct kvec iov; | 155 | struct kvec iov; |
156 | sigset_t blocked, oldset; | 156 | sigset_t blocked, oldset; |
157 | unsigned long pflags = current->flags; | ||
157 | 158 | ||
158 | if (unlikely(!sock)) { | 159 | if (unlikely(!sock)) { |
159 | dev_err(disk_to_dev(nbd->disk), | 160 | dev_err(disk_to_dev(nbd->disk), |
@@ -167,8 +168,9 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, | |||
167 | siginitsetinv(&blocked, sigmask(SIGKILL)); | 168 | siginitsetinv(&blocked, sigmask(SIGKILL)); |
168 | sigprocmask(SIG_SETMASK, &blocked, &oldset); | 169 | sigprocmask(SIG_SETMASK, &blocked, &oldset); |
169 | 170 | ||
171 | current->flags |= PF_MEMALLOC; | ||
170 | do { | 172 | do { |
171 | sock->sk->sk_allocation = GFP_NOIO; | 173 | sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; |
172 | iov.iov_base = buf; | 174 | iov.iov_base = buf; |
173 | iov.iov_len = size; | 175 | iov.iov_len = size; |
174 | msg.msg_name = NULL; | 176 | msg.msg_name = NULL; |
@@ -214,6 +216,7 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, | |||
214 | } while (size > 0); | 216 | } while (size > 0); |
215 | 217 | ||
216 | sigprocmask(SIG_SETMASK, &oldset, NULL); | 218 | sigprocmask(SIG_SETMASK, &oldset, NULL); |
219 | tsk_restore_flags(current, pflags, PF_MEMALLOC); | ||
217 | 220 | ||
218 | return result; | 221 | return result; |
219 | } | 222 | } |
@@ -405,6 +408,7 @@ static int nbd_do_it(struct nbd_device *nbd) | |||
405 | 408 | ||
406 | BUG_ON(nbd->magic != NBD_MAGIC); | 409 | BUG_ON(nbd->magic != NBD_MAGIC); |
407 | 410 | ||
411 | sk_set_memalloc(nbd->sock->sk); | ||
408 | nbd->pid = task_pid_nr(current); | 412 | nbd->pid = task_pid_nr(current); |
409 | ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); | 413 | ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); |
410 | if (ret) { | 414 | if (ret) { |
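
The nbd change above combines three pieces: the I/O thread marks itself PF_MEMALLOC, the socket is flagged with sk_set_memalloc(), and socket allocations gain __GFP_MEMALLOC, so writeout to the network block device can dip into the memory reserves; tsk_restore_flags() then puts the task flag back to its previous state. A hedged sketch of that save/restore pattern in isolation is below; example_xmit() is a stand-in, not a real driver entry point.

/* Sketch of the PF_MEMALLOC save/restore pattern applied to sock_xmit()
 * in the nbd diff above. */
#include <linux/gfp.h>
#include <linux/net.h>
#include <linux/sched.h>
#include <net/sock.h>

static int example_xmit(struct socket *sock, int (*do_io)(struct socket *))
{
	unsigned long pflags = current->flags;
	int result;

	/* Let this task's allocations dip into the memalloc reserves while
	 * it performs writeout over the network... */
	current->flags |= PF_MEMALLOC;
	/* ...and let the socket's own allocations do the same. */
	sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;

	result = do_io(sock);

	/* Restore only the PF_MEMALLOC bit to whatever it was before. */
	tsk_restore_flags(current, pflags, PF_MEMALLOC);
	return result;
}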
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 8596acaa402b..d49933ed551f 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c | |||
@@ -528,7 +528,7 @@ static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n, | |||
528 | #endif | 528 | #endif |
529 | 529 | ||
530 | while (n--) { | 530 | while (n--) { |
531 | pg = alloc_page(gfp); | 531 | pg = __skb_alloc_page(gfp, NULL); |
532 | if (unlikely(!pg)) { | 532 | if (unlikely(!pg)) { |
533 | q->alloc_failed++; | 533 | q->alloc_failed++; |
534 | break; | 534 | break; |
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c index f2d1ecdcaf98..8877fbfefb63 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c | |||
@@ -653,7 +653,7 @@ static unsigned int refill_fl(struct adapter *adapter, struct sge_fl *fl, | |||
653 | 653 | ||
654 | alloc_small_pages: | 654 | alloc_small_pages: |
655 | while (n--) { | 655 | while (n--) { |
656 | page = alloc_page(gfp | __GFP_NOWARN | __GFP_COLD); | 656 | page = __skb_alloc_page(gfp | __GFP_NOWARN, NULL); |
657 | if (unlikely(!page)) { | 657 | if (unlikely(!page)) { |
658 | fl->alloc_failed++; | 658 | fl->alloc_failed++; |
659 | break; | 659 | break; |
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 1050411e7ca3..b7c2d5050572 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c | |||
@@ -6235,7 +6235,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, | |||
6235 | return true; | 6235 | return true; |
6236 | 6236 | ||
6237 | if (!page) { | 6237 | if (!page) { |
6238 | page = alloc_page(GFP_ATOMIC | __GFP_COLD); | 6238 | page = __skb_alloc_page(GFP_ATOMIC, bi->skb); |
6239 | bi->page = page; | 6239 | bi->page = page; |
6240 | if (unlikely(!page)) { | 6240 | if (unlikely(!page)) { |
6241 | rx_ring->rx_stats.alloc_failed++; | 6241 | rx_ring->rx_stats.alloc_failed++; |
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index c709eae58c63..4326f74f7137 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | |||
@@ -1141,8 +1141,8 @@ static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring, | |||
1141 | 1141 | ||
1142 | /* alloc new page for storage */ | 1142 | /* alloc new page for storage */ |
1143 | if (likely(!page)) { | 1143 | if (likely(!page)) { |
1144 | page = alloc_pages(GFP_ATOMIC | __GFP_COLD | __GFP_COMP, | 1144 | page = __skb_alloc_pages(GFP_ATOMIC | __GFP_COLD | __GFP_COMP, |
1145 | ixgbe_rx_pg_order(rx_ring)); | 1145 | bi->skb, ixgbe_rx_pg_order(rx_ring)); |
1146 | if (unlikely(!page)) { | 1146 | if (unlikely(!page)) { |
1147 | rx_ring->rx_stats.alloc_rx_page_failed++; | 1147 | rx_ring->rx_stats.alloc_rx_page_failed++; |
1148 | return false; | 1148 | return false; |
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 3f9841d619ad..60ef64587412 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | |||
@@ -352,7 +352,6 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_adapter *adapter, | |||
352 | adapter->alloc_rx_buff_failed++; | 352 | adapter->alloc_rx_buff_failed++; |
353 | goto no_buffers; | 353 | goto no_buffers; |
354 | } | 354 | } |
355 | |||
356 | bi->skb = skb; | 355 | bi->skb = skb; |
357 | } | 356 | } |
358 | if (!bi->dma) { | 357 | if (!bi->dma) { |
diff --git a/drivers/net/usb/cdc-phonet.c b/drivers/net/usb/cdc-phonet.c index 187c144c5e5b..64610048ce87 100644 --- a/drivers/net/usb/cdc-phonet.c +++ b/drivers/net/usb/cdc-phonet.c | |||
@@ -130,7 +130,7 @@ static int rx_submit(struct usbpn_dev *pnd, struct urb *req, gfp_t gfp_flags) | |||
130 | struct page *page; | 130 | struct page *page; |
131 | int err; | 131 | int err; |
132 | 132 | ||
133 | page = alloc_page(gfp_flags); | 133 | page = __skb_alloc_page(gfp_flags | __GFP_NOMEMALLOC, NULL); |
134 | if (!page) | 134 | if (!page) |
135 | return -ENOMEM; | 135 | return -ENOMEM; |
136 | 136 | ||
diff --git a/drivers/rtc/rtc-88pm80x.c b/drivers/rtc/rtc-88pm80x.c index a2f956d90de0..6367984e0565 100644 --- a/drivers/rtc/rtc-88pm80x.c +++ b/drivers/rtc/rtc-88pm80x.c | |||
@@ -314,8 +314,8 @@ static int __devinit pm80x_rtc_probe(struct platform_device *pdev) | |||
314 | 314 | ||
315 | info->rtc_dev = rtc_device_register("88pm80x-rtc", &pdev->dev, | 315 | info->rtc_dev = rtc_device_register("88pm80x-rtc", &pdev->dev, |
316 | &pm80x_rtc_ops, THIS_MODULE); | 316 | &pm80x_rtc_ops, THIS_MODULE); |
317 | ret = PTR_ERR(info->rtc_dev); | ||
318 | if (IS_ERR(info->rtc_dev)) { | 317 | if (IS_ERR(info->rtc_dev)) { |
318 | ret = PTR_ERR(info->rtc_dev); | ||
319 | dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret); | 319 | dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret); |
320 | goto out_rtc; | 320 | goto out_rtc; |
321 | } | 321 | } |
@@ -339,7 +339,6 @@ static int __devinit pm80x_rtc_probe(struct platform_device *pdev) | |||
339 | out_rtc: | 339 | out_rtc: |
340 | pm80x_free_irq(chip, info->irq, info); | 340 | pm80x_free_irq(chip, info->irq, info); |
341 | out: | 341 | out: |
342 | devm_kfree(&pdev->dev, info); | ||
343 | return ret; | 342 | return ret; |
344 | } | 343 | } |
345 | 344 | ||
@@ -349,7 +348,6 @@ static int __devexit pm80x_rtc_remove(struct platform_device *pdev) | |||
349 | platform_set_drvdata(pdev, NULL); | 348 | platform_set_drvdata(pdev, NULL); |
350 | rtc_device_unregister(info->rtc_dev); | 349 | rtc_device_unregister(info->rtc_dev); |
351 | pm80x_free_irq(info->chip, info->irq, info); | 350 | pm80x_free_irq(info->chip, info->irq, info); |
352 | devm_kfree(&pdev->dev, info); | ||
353 | return 0; | 351 | return 0; |
354 | } | 352 | } |
355 | 353 | ||
diff --git a/drivers/usb/gadget/f_phonet.c b/drivers/usb/gadget/f_phonet.c index 965a6293206a..8ee9268fe253 100644 --- a/drivers/usb/gadget/f_phonet.c +++ b/drivers/usb/gadget/f_phonet.c | |||
@@ -301,7 +301,7 @@ pn_rx_submit(struct f_phonet *fp, struct usb_request *req, gfp_t gfp_flags) | |||
301 | struct page *page; | 301 | struct page *page; |
302 | int err; | 302 | int err; |
303 | 303 | ||
304 | page = alloc_page(gfp_flags); | 304 | page = __skb_alloc_page(gfp_flags | __GFP_NOMEMALLOC, NULL); |
305 | if (!page) | 305 | if (!page) |
306 | return -ENOMEM; | 306 | return -ENOMEM; |
307 | 307 | ||
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 50d0b78130a1..be3efc4f64f4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -52,11 +52,6 @@ struct wb_writeback_work { | |||
52 | struct completion *done; /* set if the caller waits */ | 52 | struct completion *done; /* set if the caller waits */ |
53 | }; | 53 | }; |
54 | 54 | ||
55 | /* | ||
56 | * We don't actually have pdflush, but this one is exported though /proc... | ||
57 | */ | ||
58 | int nr_pdflush_threads; | ||
59 | |||
60 | /** | 55 | /** |
61 | * writeback_in_progress - determine whether there is writeback in progress | 56 | * writeback_in_progress - determine whether there is writeback in progress |
62 | * @bdi: the device's backing_dev_info structure. | 57 | * @bdi: the device's backing_dev_info structure. |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e13e9bdb0bf5..8349a899912e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -416,8 +416,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) | |||
416 | else | 416 | else |
417 | v_offset = 0; | 417 | v_offset = 0; |
418 | 418 | ||
419 | __unmap_hugepage_range(vma, | 419 | unmap_hugepage_range(vma, vma->vm_start + v_offset, |
420 | vma->vm_start + v_offset, vma->vm_end, NULL); | 420 | vma->vm_end, NULL); |
421 | } | 421 | } |
422 | } | 422 | } |
423 | 423 | ||
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 195c1ea6151a..db7ad719628a 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig | |||
@@ -86,6 +86,14 @@ config NFS_V4 | |||
86 | 86 | ||
87 | If unsure, say Y. | 87 | If unsure, say Y. |
88 | 88 | ||
89 | config NFS_SWAP | ||
90 | bool "Provide swap over NFS support" | ||
91 | default n | ||
92 | depends on NFS_FS | ||
93 | select SUNRPC_SWAP | ||
94 | help | ||
95 | This option enables swapon to work on files located on NFS mounts. | ||
96 | |||
89 | config NFS_V4_1 | 97 | config NFS_V4_1 |
90 | bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" | 98 | bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" |
91 | depends on NFS_V4 && EXPERIMENTAL | 99 | depends on NFS_V4 && EXPERIMENTAL |
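
The NFS_SWAP option above only states that swapon works on files located on NFS mounts; a hedged userspace sketch of exercising it follows. The mount point and file name are made up, the swap file must already have been initialized with mkswap, and the kernel needs CONFIG_NFS_SWAP=y. On the kernel side this path ends up in the nfs_swap_activate()/nfs_swap_deactivate() hooks added later in this diff.

/* Sketch: enable and then disable swap on a file living on an NFS mount. */
#include <stdio.h>
#include <sys/swap.h>

int main(void)
{
	const char *swapfile = "/mnt/nfs/swapfile";	/* illustrative path */

	/* swapon() triggers the filesystem's ->swap_activate hook. */
	if (swapon(swapfile, 0) != 0) {
		perror("swapon");
		return 1;
	}
	printf("swapping to %s\n", swapfile);

	/* swapoff() runs ->swap_deactivate on the same file. */
	if (swapoff(swapfile) != 0) {
		perror("swapoff");
		return 1;
	}
	return 0;
}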
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b7b4f80968b5..1ba385b7c90d 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c | |||
@@ -115,17 +115,28 @@ static inline int put_dreq(struct nfs_direct_req *dreq) | |||
115 | * @nr_segs: size of iovec array | 115 | * @nr_segs: size of iovec array |
116 | * | 116 | * |
117 | * The presence of this routine in the address space ops vector means | 117 | * The presence of this routine in the address space ops vector means |
118 | * the NFS client supports direct I/O. However, we shunt off direct | 118 | * the NFS client supports direct I/O. However, for most direct IO, we |
119 | * read and write requests before the VFS gets them, so this method | 119 | * shunt off direct read and write requests before the VFS gets them, |
120 | * should never be called. | 120 | * so this method is only ever called for swap. |
121 | */ | 121 | */ |
122 | ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) | 122 | ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) |
123 | { | 123 | { |
124 | #ifndef CONFIG_NFS_SWAP | ||
124 | dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", | 125 | dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", |
125 | iocb->ki_filp->f_path.dentry->d_name.name, | 126 | iocb->ki_filp->f_path.dentry->d_name.name, |
126 | (long long) pos, nr_segs); | 127 | (long long) pos, nr_segs); |
127 | 128 | ||
128 | return -EINVAL; | 129 | return -EINVAL; |
130 | #else | ||
131 | VM_BUG_ON(iocb->ki_left != PAGE_SIZE); | ||
132 | VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); | ||
133 | |||
134 | if (rw == READ || rw == KERNEL_READ) | ||
135 | return nfs_file_direct_read(iocb, iov, nr_segs, pos, | ||
136 | rw == READ ? true : false); | ||
137 | return nfs_file_direct_write(iocb, iov, nr_segs, pos, | ||
138 | rw == WRITE ? true : false); | ||
139 | #endif /* CONFIG_NFS_SWAP */ | ||
129 | } | 140 | } |
130 | 141 | ||
131 | static void nfs_direct_release_pages(struct page **pages, unsigned int npages) | 142 | static void nfs_direct_release_pages(struct page **pages, unsigned int npages) |
@@ -303,7 +314,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { | |||
303 | */ | 314 | */ |
304 | static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc, | 315 | static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc, |
305 | const struct iovec *iov, | 316 | const struct iovec *iov, |
306 | loff_t pos) | 317 | loff_t pos, bool uio) |
307 | { | 318 | { |
308 | struct nfs_direct_req *dreq = desc->pg_dreq; | 319 | struct nfs_direct_req *dreq = desc->pg_dreq; |
309 | struct nfs_open_context *ctx = dreq->ctx; | 320 | struct nfs_open_context *ctx = dreq->ctx; |
@@ -331,12 +342,20 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de | |||
331 | GFP_KERNEL); | 342 | GFP_KERNEL); |
332 | if (!pagevec) | 343 | if (!pagevec) |
333 | break; | 344 | break; |
334 | down_read(¤t->mm->mmap_sem); | 345 | if (uio) { |
335 | result = get_user_pages(current, current->mm, user_addr, | 346 | down_read(¤t->mm->mmap_sem); |
347 | result = get_user_pages(current, current->mm, user_addr, | ||
336 | npages, 1, 0, pagevec, NULL); | 348 | npages, 1, 0, pagevec, NULL); |
337 | up_read(¤t->mm->mmap_sem); | 349 | up_read(¤t->mm->mmap_sem); |
338 | if (result < 0) | 350 | if (result < 0) |
339 | break; | 351 | break; |
352 | } else { | ||
353 | WARN_ON(npages != 1); | ||
354 | result = get_kernel_page(user_addr, 1, pagevec); | ||
355 | if (WARN_ON(result != 1)) | ||
356 | break; | ||
357 | } | ||
358 | |||
340 | if ((unsigned)result < npages) { | 359 | if ((unsigned)result < npages) { |
341 | bytes = result * PAGE_SIZE; | 360 | bytes = result * PAGE_SIZE; |
342 | if (bytes <= pgbase) { | 361 | if (bytes <= pgbase) { |
@@ -386,7 +405,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de | |||
386 | static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, | 405 | static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, |
387 | const struct iovec *iov, | 406 | const struct iovec *iov, |
388 | unsigned long nr_segs, | 407 | unsigned long nr_segs, |
389 | loff_t pos) | 408 | loff_t pos, bool uio) |
390 | { | 409 | { |
391 | struct nfs_pageio_descriptor desc; | 410 | struct nfs_pageio_descriptor desc; |
392 | ssize_t result = -EINVAL; | 411 | ssize_t result = -EINVAL; |
@@ -400,7 +419,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, | |||
400 | 419 | ||
401 | for (seg = 0; seg < nr_segs; seg++) { | 420 | for (seg = 0; seg < nr_segs; seg++) { |
402 | const struct iovec *vec = &iov[seg]; | 421 | const struct iovec *vec = &iov[seg]; |
403 | result = nfs_direct_read_schedule_segment(&desc, vec, pos); | 422 | result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio); |
404 | if (result < 0) | 423 | if (result < 0) |
405 | break; | 424 | break; |
406 | requested_bytes += result; | 425 | requested_bytes += result; |
@@ -426,7 +445,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, | |||
426 | } | 445 | } |
427 | 446 | ||
428 | static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, | 447 | static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, |
429 | unsigned long nr_segs, loff_t pos) | 448 | unsigned long nr_segs, loff_t pos, bool uio) |
430 | { | 449 | { |
431 | ssize_t result = -ENOMEM; | 450 | ssize_t result = -ENOMEM; |
432 | struct inode *inode = iocb->ki_filp->f_mapping->host; | 451 | struct inode *inode = iocb->ki_filp->f_mapping->host; |
@@ -444,7 +463,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, | |||
444 | if (!is_sync_kiocb(iocb)) | 463 | if (!is_sync_kiocb(iocb)) |
445 | dreq->iocb = iocb; | 464 | dreq->iocb = iocb; |
446 | 465 | ||
447 | result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); | 466 | result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); |
448 | if (!result) | 467 | if (!result) |
449 | result = nfs_direct_wait(dreq); | 468 | result = nfs_direct_wait(dreq); |
450 | NFS_I(inode)->read_io += result; | 469 | NFS_I(inode)->read_io += result; |
@@ -610,7 +629,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode | |||
610 | */ | 629 | */ |
611 | static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc, | 630 | static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc, |
612 | const struct iovec *iov, | 631 | const struct iovec *iov, |
613 | loff_t pos) | 632 | loff_t pos, bool uio) |
614 | { | 633 | { |
615 | struct nfs_direct_req *dreq = desc->pg_dreq; | 634 | struct nfs_direct_req *dreq = desc->pg_dreq; |
616 | struct nfs_open_context *ctx = dreq->ctx; | 635 | struct nfs_open_context *ctx = dreq->ctx; |
@@ -638,12 +657,19 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d | |||
638 | if (!pagevec) | 657 | if (!pagevec) |
639 | break; | 658 | break; |
640 | 659 | ||
641 | down_read(¤t->mm->mmap_sem); | 660 | if (uio) { |
642 | result = get_user_pages(current, current->mm, user_addr, | 661 | down_read(¤t->mm->mmap_sem); |
643 | npages, 0, 0, pagevec, NULL); | 662 | result = get_user_pages(current, current->mm, user_addr, |
644 | up_read(¤t->mm->mmap_sem); | 663 | npages, 0, 0, pagevec, NULL); |
645 | if (result < 0) | 664 | up_read(¤t->mm->mmap_sem); |
646 | break; | 665 | if (result < 0) |
666 | break; | ||
667 | } else { | ||
668 | WARN_ON(npages != 1); | ||
669 | result = get_kernel_page(user_addr, 0, pagevec); | ||
670 | if (WARN_ON(result != 1)) | ||
671 | break; | ||
672 | } | ||
647 | 673 | ||
648 | if ((unsigned)result < npages) { | 674 | if ((unsigned)result < npages) { |
649 | bytes = result * PAGE_SIZE; | 675 | bytes = result * PAGE_SIZE; |
@@ -774,7 +800,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { | |||
774 | static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, | 800 | static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, |
775 | const struct iovec *iov, | 801 | const struct iovec *iov, |
776 | unsigned long nr_segs, | 802 | unsigned long nr_segs, |
777 | loff_t pos) | 803 | loff_t pos, bool uio) |
778 | { | 804 | { |
779 | struct nfs_pageio_descriptor desc; | 805 | struct nfs_pageio_descriptor desc; |
780 | struct inode *inode = dreq->inode; | 806 | struct inode *inode = dreq->inode; |
@@ -790,7 +816,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, | |||
790 | 816 | ||
791 | for (seg = 0; seg < nr_segs; seg++) { | 817 | for (seg = 0; seg < nr_segs; seg++) { |
792 | const struct iovec *vec = &iov[seg]; | 818 | const struct iovec *vec = &iov[seg]; |
793 | result = nfs_direct_write_schedule_segment(&desc, vec, pos); | 819 | result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); |
794 | if (result < 0) | 820 | if (result < 0) |
795 | break; | 821 | break; |
796 | requested_bytes += result; | 822 | requested_bytes += result; |
@@ -818,7 +844,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, | |||
818 | 844 | ||
819 | static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, | 845 | static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, |
820 | unsigned long nr_segs, loff_t pos, | 846 | unsigned long nr_segs, loff_t pos, |
821 | size_t count) | 847 | size_t count, bool uio) |
822 | { | 848 | { |
823 | ssize_t result = -ENOMEM; | 849 | ssize_t result = -ENOMEM; |
824 | struct inode *inode = iocb->ki_filp->f_mapping->host; | 850 | struct inode *inode = iocb->ki_filp->f_mapping->host; |
@@ -836,7 +862,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
836 | if (!is_sync_kiocb(iocb)) | 862 | if (!is_sync_kiocb(iocb)) |
837 | dreq->iocb = iocb; | 863 | dreq->iocb = iocb; |
838 | 864 | ||
839 | result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos); | 865 | result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); |
840 | if (!result) | 866 | if (!result) |
841 | result = nfs_direct_wait(dreq); | 867 | result = nfs_direct_wait(dreq); |
842 | out_release: | 868 | out_release: |
@@ -867,7 +893,7 @@ out: | |||
867 | * cache. | 893 | * cache. |
868 | */ | 894 | */ |
869 | ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, | 895 | ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, |
870 | unsigned long nr_segs, loff_t pos) | 896 | unsigned long nr_segs, loff_t pos, bool uio) |
871 | { | 897 | { |
872 | ssize_t retval = -EINVAL; | 898 | ssize_t retval = -EINVAL; |
873 | struct file *file = iocb->ki_filp; | 899 | struct file *file = iocb->ki_filp; |
@@ -892,7 +918,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, | |||
892 | 918 | ||
893 | task_io_account_read(count); | 919 | task_io_account_read(count); |
894 | 920 | ||
895 | retval = nfs_direct_read(iocb, iov, nr_segs, pos); | 921 | retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio); |
896 | if (retval > 0) | 922 | if (retval > 0) |
897 | iocb->ki_pos = pos + retval; | 923 | iocb->ki_pos = pos + retval; |
898 | 924 | ||
@@ -923,7 +949,7 @@ out: | |||
923 | * is no atomic O_APPEND write facility in the NFS protocol. | 949 | * is no atomic O_APPEND write facility in the NFS protocol. |
924 | */ | 950 | */ |
925 | ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | 951 | ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, |
926 | unsigned long nr_segs, loff_t pos) | 952 | unsigned long nr_segs, loff_t pos, bool uio) |
927 | { | 953 | { |
928 | ssize_t retval = -EINVAL; | 954 | ssize_t retval = -EINVAL; |
929 | struct file *file = iocb->ki_filp; | 955 | struct file *file = iocb->ki_filp; |
@@ -955,7 +981,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
955 | 981 | ||
956 | task_io_account_write(count); | 982 | task_io_account_write(count); |
957 | 983 | ||
958 | retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); | 984 | retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio); |
959 | if (retval > 0) { | 985 | if (retval > 0) { |
960 | struct inode *inode = mapping->host; | 986 | struct inode *inode = mapping->host; |
961 | 987 | ||
diff --git a/fs/nfs/file.c b/fs/nfs/file.c index b039a17ee941..75d6d0a3d32e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c | |||
@@ -180,7 +180,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, | |||
180 | ssize_t result; | 180 | ssize_t result; |
181 | 181 | ||
182 | if (iocb->ki_filp->f_flags & O_DIRECT) | 182 | if (iocb->ki_filp->f_flags & O_DIRECT) |
183 | return nfs_file_direct_read(iocb, iov, nr_segs, pos); | 183 | return nfs_file_direct_read(iocb, iov, nr_segs, pos, true); |
184 | 184 | ||
185 | dprintk("NFS: read(%s/%s, %lu@%lu)\n", | 185 | dprintk("NFS: read(%s/%s, %lu@%lu)\n", |
186 | dentry->d_parent->d_name.name, dentry->d_name.name, | 186 | dentry->d_parent->d_name.name, dentry->d_name.name, |
@@ -439,7 +439,7 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) | |||
439 | if (offset != 0) | 439 | if (offset != 0) |
440 | return; | 440 | return; |
441 | /* Cancel any unstarted writes on this page */ | 441 | /* Cancel any unstarted writes on this page */ |
442 | nfs_wb_page_cancel(page->mapping->host, page); | 442 | nfs_wb_page_cancel(page_file_mapping(page)->host, page); |
443 | 443 | ||
444 | nfs_fscache_invalidate_page(page, page->mapping->host); | 444 | nfs_fscache_invalidate_page(page, page->mapping->host); |
445 | } | 445 | } |
@@ -484,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp) | |||
484 | */ | 484 | */ |
485 | static int nfs_launder_page(struct page *page) | 485 | static int nfs_launder_page(struct page *page) |
486 | { | 486 | { |
487 | struct inode *inode = page->mapping->host; | 487 | struct inode *inode = page_file_mapping(page)->host; |
488 | struct nfs_inode *nfsi = NFS_I(inode); | 488 | struct nfs_inode *nfsi = NFS_I(inode); |
489 | 489 | ||
490 | dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", | 490 | dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", |
@@ -494,6 +494,20 @@ static int nfs_launder_page(struct page *page) | |||
494 | return nfs_wb_page(inode, page); | 494 | return nfs_wb_page(inode, page); |
495 | } | 495 | } |
496 | 496 | ||
497 | #ifdef CONFIG_NFS_SWAP | ||
498 | static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, | ||
499 | sector_t *span) | ||
500 | { | ||
501 | *span = sis->pages; | ||
502 | return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); | ||
503 | } | ||
504 | |||
505 | static void nfs_swap_deactivate(struct file *file) | ||
506 | { | ||
507 | xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); | ||
508 | } | ||
509 | #endif | ||
510 | |||
497 | const struct address_space_operations nfs_file_aops = { | 511 | const struct address_space_operations nfs_file_aops = { |
498 | .readpage = nfs_readpage, | 512 | .readpage = nfs_readpage, |
499 | .readpages = nfs_readpages, | 513 | .readpages = nfs_readpages, |
@@ -508,6 +522,10 @@ const struct address_space_operations nfs_file_aops = { | |||
508 | .migratepage = nfs_migrate_page, | 522 | .migratepage = nfs_migrate_page, |
509 | .launder_page = nfs_launder_page, | 523 | .launder_page = nfs_launder_page, |
510 | .error_remove_page = generic_error_remove_page, | 524 | .error_remove_page = generic_error_remove_page, |
525 | #ifdef CONFIG_NFS_SWAP | ||
526 | .swap_activate = nfs_swap_activate, | ||
527 | .swap_deactivate = nfs_swap_deactivate, | ||
528 | #endif | ||
511 | }; | 529 | }; |
512 | 530 | ||
513 | /* | 531 | /* |
@@ -533,7 +551,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
533 | nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); | 551 | nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); |
534 | 552 | ||
535 | lock_page(page); | 553 | lock_page(page); |
536 | mapping = page->mapping; | 554 | mapping = page_file_mapping(page); |
537 | if (mapping != dentry->d_inode->i_mapping) | 555 | if (mapping != dentry->d_inode->i_mapping) |
538 | goto out_unlock; | 556 | goto out_unlock; |
539 | 557 | ||
@@ -582,7 +600,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, | |||
582 | size_t count = iov_length(iov, nr_segs); | 600 | size_t count = iov_length(iov, nr_segs); |
583 | 601 | ||
584 | if (iocb->ki_filp->f_flags & O_DIRECT) | 602 | if (iocb->ki_filp->f_flags & O_DIRECT) |
585 | return nfs_file_direct_write(iocb, iov, nr_segs, pos); | 603 | return nfs_file_direct_write(iocb, iov, nr_segs, pos, true); |
586 | 604 | ||
587 | dprintk("NFS: write(%s/%s, %lu@%Ld)\n", | 605 | dprintk("NFS: write(%s/%s, %lu@%Ld)\n", |
588 | dentry->d_parent->d_name.name, dentry->d_name.name, | 606 | dentry->d_parent->d_name.name, dentry->d_name.name, |
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2ed6138f32ad..c6e895f0fbf3 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c | |||
@@ -897,6 +897,10 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) | |||
897 | struct nfs_inode *nfsi = NFS_I(inode); | 897 | struct nfs_inode *nfsi = NFS_I(inode); |
898 | int ret = 0; | 898 | int ret = 0; |
899 | 899 | ||
900 | /* swapfiles are not supposed to be shared. */ | ||
901 | if (IS_SWAPFILE(inode)) | ||
902 | goto out; | ||
903 | |||
900 | if (nfs_mapping_need_revalidate_inode(inode)) { | 904 | if (nfs_mapping_need_revalidate_inode(inode)) { |
901 | ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); | 905 | ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); |
902 | if (ret < 0) | 906 | if (ret < 0) |
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 8865538b26b6..31fdb03225cd 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h | |||
@@ -554,13 +554,14 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) | |||
554 | static inline | 554 | static inline |
555 | unsigned int nfs_page_length(struct page *page) | 555 | unsigned int nfs_page_length(struct page *page) |
556 | { | 556 | { |
557 | loff_t i_size = i_size_read(page->mapping->host); | 557 | loff_t i_size = i_size_read(page_file_mapping(page)->host); |
558 | 558 | ||
559 | if (i_size > 0) { | 559 | if (i_size > 0) { |
560 | pgoff_t page_index = page_file_index(page); | ||
560 | pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; | 561 | pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; |
561 | if (page->index < end_index) | 562 | if (page_index < end_index) |
562 | return PAGE_CACHE_SIZE; | 563 | return PAGE_CACHE_SIZE; |
563 | if (page->index == end_index) | 564 | if (page_index == end_index) |
564 | return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; | 565 | return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; |
565 | } | 566 | } |
566 | return 0; | 567 | return 0; |
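The only change to nfs_page_length() is swapping page->mapping and page->index for their page_file_*() equivalents so that swapcache pages resolve to the right inode and offset; the length arithmetic itself is untouched. As an illustration of that arithmetic, the stand-alone sketch below mirrors it in userspace, assuming 4096-byte pages (the constants are assumptions for the example, not taken from a particular kernel configuration).

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)
#define PAGE_CACHE_MASK  (~(PAGE_CACHE_SIZE - 1))

/* Mirrors the logic of nfs_page_length(): full pages before the EOF page,
 * a partial length on the EOF page, zero beyond it. */
static unsigned int page_length(long long i_size, unsigned long page_index)
{
        if (i_size > 0) {
                unsigned long end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
                if (page_index < end_index)
                        return PAGE_CACHE_SIZE;
                if (page_index == end_index)
                        return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
        }
        return 0;
}

int main(void)
{
        /* 10000-byte file: pages 0 and 1 are full, page 2 holds the tail. */
        printf("%u %u %u %u\n", page_length(10000, 0), page_length(10000, 1),
               page_length(10000, 2), page_length(10000, 3));
        return 0;
}

For a 10000-byte file this prints "4096 4096 1808 0": two full pages, a 1808-byte tail on the page containing EOF, and nothing past it.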
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 1e7d8879dae6..1a6732ed04a4 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c | |||
@@ -71,7 +71,7 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) | |||
71 | static inline struct nfs_page * | 71 | static inline struct nfs_page * |
72 | nfs_page_alloc(void) | 72 | nfs_page_alloc(void) |
73 | { | 73 | { |
74 | struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL); | 74 | struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO); |
75 | if (p) | 75 | if (p) |
76 | INIT_LIST_HEAD(&p->wb_list); | 76 | INIT_LIST_HEAD(&p->wb_list); |
77 | return p; | 77 | return p; |
@@ -118,7 +118,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, | |||
118 | * long write-back delay. This will be adjusted in | 118 | * long write-back delay. This will be adjusted in |
119 | * update_nfs_request below if the region is not locked. */ | 119 | * update_nfs_request below if the region is not locked. */ |
120 | req->wb_page = page; | 120 | req->wb_page = page; |
121 | req->wb_index = page->index; | 121 | req->wb_index = page_file_index(page); |
122 | page_cache_get(page); | 122 | page_cache_get(page); |
123 | req->wb_offset = offset; | 123 | req->wb_offset = offset; |
124 | req->wb_pgbase = offset; | 124 | req->wb_pgbase = offset; |
diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 6935e401ad76..b6bdb18e892c 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c | |||
@@ -527,11 +527,11 @@ static const struct rpc_call_ops nfs_read_common_ops = { | |||
527 | int nfs_readpage(struct file *file, struct page *page) | 527 | int nfs_readpage(struct file *file, struct page *page) |
528 | { | 528 | { |
529 | struct nfs_open_context *ctx; | 529 | struct nfs_open_context *ctx; |
530 | struct inode *inode = page->mapping->host; | 530 | struct inode *inode = page_file_mapping(page)->host; |
531 | int error; | 531 | int error; |
532 | 532 | ||
533 | dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", | 533 | dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", |
534 | page, PAGE_CACHE_SIZE, page->index); | 534 | page, PAGE_CACHE_SIZE, page_file_index(page)); |
535 | nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); | 535 | nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); |
536 | nfs_add_stats(inode, NFSIOS_READPAGES, 1); | 536 | nfs_add_stats(inode, NFSIOS_READPAGES, 1); |
537 | 537 | ||
@@ -585,7 +585,7 @@ static int | |||
585 | readpage_async_filler(void *data, struct page *page) | 585 | readpage_async_filler(void *data, struct page *page) |
586 | { | 586 | { |
587 | struct nfs_readdesc *desc = (struct nfs_readdesc *)data; | 587 | struct nfs_readdesc *desc = (struct nfs_readdesc *)data; |
588 | struct inode *inode = page->mapping->host; | 588 | struct inode *inode = page_file_mapping(page)->host; |
589 | struct nfs_page *new; | 589 | struct nfs_page *new; |
590 | unsigned int len; | 590 | unsigned int len; |
591 | int error; | 591 | int error; |
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e4a2ad2059bd..5829d0ce7cfb 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
@@ -52,7 +52,7 @@ static mempool_t *nfs_commit_mempool; | |||
52 | 52 | ||
53 | struct nfs_commit_data *nfs_commitdata_alloc(void) | 53 | struct nfs_commit_data *nfs_commitdata_alloc(void) |
54 | { | 54 | { |
55 | struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); | 55 | struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); |
56 | 56 | ||
57 | if (p) { | 57 | if (p) { |
58 | memset(p, 0, sizeof(*p)); | 58 | memset(p, 0, sizeof(*p)); |
@@ -70,7 +70,7 @@ EXPORT_SYMBOL_GPL(nfs_commit_free); | |||
70 | 70 | ||
71 | struct nfs_write_header *nfs_writehdr_alloc(void) | 71 | struct nfs_write_header *nfs_writehdr_alloc(void) |
72 | { | 72 | { |
73 | struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); | 73 | struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); |
74 | 74 | ||
75 | if (p) { | 75 | if (p) { |
76 | struct nfs_pgio_header *hdr = &p->header; | 76 | struct nfs_pgio_header *hdr = &p->header; |
@@ -142,25 +142,38 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) | |||
142 | set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); | 142 | set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); |
143 | } | 143 | } |
144 | 144 | ||
145 | static struct nfs_page *nfs_page_find_request_locked(struct page *page) | 145 | static struct nfs_page * |
146 | nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) | ||
146 | { | 147 | { |
147 | struct nfs_page *req = NULL; | 148 | struct nfs_page *req = NULL; |
148 | 149 | ||
149 | if (PagePrivate(page)) { | 150 | if (PagePrivate(page)) |
150 | req = (struct nfs_page *)page_private(page); | 151 | req = (struct nfs_page *)page_private(page); |
151 | if (req != NULL) | 152 | else if (unlikely(PageSwapCache(page))) { |
152 | kref_get(&req->wb_kref); | 153 | struct nfs_page *freq, *t; |
154 | |||
155 | /* Linearly search the commit list for the correct req */ | ||
156 | list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) { | ||
157 | if (freq->wb_page == page) { | ||
158 | req = freq; | ||
159 | break; | ||
160 | } | ||
161 | } | ||
153 | } | 162 | } |
163 | |||
164 | if (req) | ||
165 | kref_get(&req->wb_kref); | ||
166 | |||
154 | return req; | 167 | return req; |
155 | } | 168 | } |
156 | 169 | ||
157 | static struct nfs_page *nfs_page_find_request(struct page *page) | 170 | static struct nfs_page *nfs_page_find_request(struct page *page) |
158 | { | 171 | { |
159 | struct inode *inode = page->mapping->host; | 172 | struct inode *inode = page_file_mapping(page)->host; |
160 | struct nfs_page *req = NULL; | 173 | struct nfs_page *req = NULL; |
161 | 174 | ||
162 | spin_lock(&inode->i_lock); | 175 | spin_lock(&inode->i_lock); |
163 | req = nfs_page_find_request_locked(page); | 176 | req = nfs_page_find_request_locked(NFS_I(inode), page); |
164 | spin_unlock(&inode->i_lock); | 177 | spin_unlock(&inode->i_lock); |
165 | return req; | 178 | return req; |
166 | } | 179 | } |
@@ -168,16 +181,16 @@ static struct nfs_page *nfs_page_find_request(struct page *page) | |||
168 | /* Adjust the file length if we're writing beyond the end */ | 181 | /* Adjust the file length if we're writing beyond the end */ |
169 | static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) | 182 | static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) |
170 | { | 183 | { |
171 | struct inode *inode = page->mapping->host; | 184 | struct inode *inode = page_file_mapping(page)->host; |
172 | loff_t end, i_size; | 185 | loff_t end, i_size; |
173 | pgoff_t end_index; | 186 | pgoff_t end_index; |
174 | 187 | ||
175 | spin_lock(&inode->i_lock); | 188 | spin_lock(&inode->i_lock); |
176 | i_size = i_size_read(inode); | 189 | i_size = i_size_read(inode); |
177 | end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; | 190 | end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; |
178 | if (i_size > 0 && page->index < end_index) | 191 | if (i_size > 0 && page_file_index(page) < end_index) |
179 | goto out; | 192 | goto out; |
180 | end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); | 193 | end = page_file_offset(page) + ((loff_t)offset+count); |
181 | if (i_size >= end) | 194 | if (i_size >= end) |
182 | goto out; | 195 | goto out; |
183 | i_size_write(inode, end); | 196 | i_size_write(inode, end); |
@@ -190,7 +203,7 @@ out: | |||
190 | static void nfs_set_pageerror(struct page *page) | 203 | static void nfs_set_pageerror(struct page *page) |
191 | { | 204 | { |
192 | SetPageError(page); | 205 | SetPageError(page); |
193 | nfs_zap_mapping(page->mapping->host, page->mapping); | 206 | nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); |
194 | } | 207 | } |
195 | 208 | ||
196 | /* We can set the PG_uptodate flag if we see that a write request | 209 | /* We can set the PG_uptodate flag if we see that a write request |
@@ -231,7 +244,7 @@ static int nfs_set_page_writeback(struct page *page) | |||
231 | int ret = test_set_page_writeback(page); | 244 | int ret = test_set_page_writeback(page); |
232 | 245 | ||
233 | if (!ret) { | 246 | if (!ret) { |
234 | struct inode *inode = page->mapping->host; | 247 | struct inode *inode = page_file_mapping(page)->host; |
235 | struct nfs_server *nfss = NFS_SERVER(inode); | 248 | struct nfs_server *nfss = NFS_SERVER(inode); |
236 | 249 | ||
237 | if (atomic_long_inc_return(&nfss->writeback) > | 250 | if (atomic_long_inc_return(&nfss->writeback) > |
@@ -245,7 +258,7 @@ static int nfs_set_page_writeback(struct page *page) | |||
245 | 258 | ||
246 | static void nfs_end_page_writeback(struct page *page) | 259 | static void nfs_end_page_writeback(struct page *page) |
247 | { | 260 | { |
248 | struct inode *inode = page->mapping->host; | 261 | struct inode *inode = page_file_mapping(page)->host; |
249 | struct nfs_server *nfss = NFS_SERVER(inode); | 262 | struct nfs_server *nfss = NFS_SERVER(inode); |
250 | 263 | ||
251 | end_page_writeback(page); | 264 | end_page_writeback(page); |
@@ -255,13 +268,13 @@ static void nfs_end_page_writeback(struct page *page) | |||
255 | 268 | ||
256 | static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) | 269 | static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) |
257 | { | 270 | { |
258 | struct inode *inode = page->mapping->host; | 271 | struct inode *inode = page_file_mapping(page)->host; |
259 | struct nfs_page *req; | 272 | struct nfs_page *req; |
260 | int ret; | 273 | int ret; |
261 | 274 | ||
262 | spin_lock(&inode->i_lock); | 275 | spin_lock(&inode->i_lock); |
263 | for (;;) { | 276 | for (;;) { |
264 | req = nfs_page_find_request_locked(page); | 277 | req = nfs_page_find_request_locked(NFS_I(inode), page); |
265 | if (req == NULL) | 278 | if (req == NULL) |
266 | break; | 279 | break; |
267 | if (nfs_lock_request(req)) | 280 | if (nfs_lock_request(req)) |
@@ -316,13 +329,13 @@ out: | |||
316 | 329 | ||
317 | static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) | 330 | static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) |
318 | { | 331 | { |
319 | struct inode *inode = page->mapping->host; | 332 | struct inode *inode = page_file_mapping(page)->host; |
320 | int ret; | 333 | int ret; |
321 | 334 | ||
322 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); | 335 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); |
323 | nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); | 336 | nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); |
324 | 337 | ||
325 | nfs_pageio_cond_complete(pgio, page->index); | 338 | nfs_pageio_cond_complete(pgio, page_file_index(page)); |
326 | ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); | 339 | ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); |
327 | if (ret == -EAGAIN) { | 340 | if (ret == -EAGAIN) { |
328 | redirty_page_for_writepage(wbc, page); | 341 | redirty_page_for_writepage(wbc, page); |
@@ -339,7 +352,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc | |||
339 | struct nfs_pageio_descriptor pgio; | 352 | struct nfs_pageio_descriptor pgio; |
340 | int err; | 353 | int err; |
341 | 354 | ||
342 | NFS_PROTO(page->mapping->host)->write_pageio_init(&pgio, | 355 | NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio, |
343 | page->mapping->host, | 356 | page->mapping->host, |
344 | wb_priority(wbc), | 357 | wb_priority(wbc), |
345 | &nfs_async_write_completion_ops); | 358 | &nfs_async_write_completion_ops); |
@@ -416,9 +429,15 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) | |||
416 | spin_lock(&inode->i_lock); | 429 | spin_lock(&inode->i_lock); |
417 | if (!nfsi->npages && NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) | 430 | if (!nfsi->npages && NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) |
418 | inode->i_version++; | 431 | inode->i_version++; |
419 | set_bit(PG_MAPPED, &req->wb_flags); | 432 | /* |
420 | SetPagePrivate(req->wb_page); | 433 | * Swap-space should not get truncated. Hence no need to plug the race |
421 | set_page_private(req->wb_page, (unsigned long)req); | 434 | * with invalidate/truncate. |
435 | */ | ||
436 | if (likely(!PageSwapCache(req->wb_page))) { | ||
437 | set_bit(PG_MAPPED, &req->wb_flags); | ||
438 | SetPagePrivate(req->wb_page); | ||
439 | set_page_private(req->wb_page, (unsigned long)req); | ||
440 | } | ||
422 | nfsi->npages++; | 441 | nfsi->npages++; |
423 | kref_get(&req->wb_kref); | 442 | kref_get(&req->wb_kref); |
424 | spin_unlock(&inode->i_lock); | 443 | spin_unlock(&inode->i_lock); |
@@ -435,9 +454,11 @@ static void nfs_inode_remove_request(struct nfs_page *req) | |||
435 | BUG_ON (!NFS_WBACK_BUSY(req)); | 454 | BUG_ON (!NFS_WBACK_BUSY(req)); |
436 | 455 | ||
437 | spin_lock(&inode->i_lock); | 456 | spin_lock(&inode->i_lock); |
438 | set_page_private(req->wb_page, 0); | 457 | if (likely(!PageSwapCache(req->wb_page))) { |
439 | ClearPagePrivate(req->wb_page); | 458 | set_page_private(req->wb_page, 0); |
440 | clear_bit(PG_MAPPED, &req->wb_flags); | 459 | ClearPagePrivate(req->wb_page); |
460 | clear_bit(PG_MAPPED, &req->wb_flags); | ||
461 | } | ||
441 | nfsi->npages--; | 462 | nfsi->npages--; |
442 | spin_unlock(&inode->i_lock); | 463 | spin_unlock(&inode->i_lock); |
443 | nfs_release_request(req); | 464 | nfs_release_request(req); |
@@ -474,7 +495,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, | |||
474 | spin_unlock(cinfo->lock); | 495 | spin_unlock(cinfo->lock); |
475 | if (!cinfo->dreq) { | 496 | if (!cinfo->dreq) { |
476 | inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); | 497 | inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); |
477 | inc_bdi_stat(req->wb_page->mapping->backing_dev_info, | 498 | inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, |
478 | BDI_RECLAIMABLE); | 499 | BDI_RECLAIMABLE); |
479 | __mark_inode_dirty(req->wb_context->dentry->d_inode, | 500 | __mark_inode_dirty(req->wb_context->dentry->d_inode, |
480 | I_DIRTY_DATASYNC); | 501 | I_DIRTY_DATASYNC); |
@@ -541,7 +562,7 @@ static void | |||
541 | nfs_clear_page_commit(struct page *page) | 562 | nfs_clear_page_commit(struct page *page) |
542 | { | 563 | { |
543 | dec_zone_page_state(page, NR_UNSTABLE_NFS); | 564 | dec_zone_page_state(page, NR_UNSTABLE_NFS); |
544 | dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); | 565 | dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); |
545 | } | 566 | } |
546 | 567 | ||
547 | static void | 568 | static void |
@@ -733,7 +754,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, | |||
733 | spin_lock(&inode->i_lock); | 754 | spin_lock(&inode->i_lock); |
734 | 755 | ||
735 | for (;;) { | 756 | for (;;) { |
736 | req = nfs_page_find_request_locked(page); | 757 | req = nfs_page_find_request_locked(NFS_I(inode), page); |
737 | if (req == NULL) | 758 | if (req == NULL) |
738 | goto out_unlock; | 759 | goto out_unlock; |
739 | 760 | ||
@@ -792,7 +813,7 @@ out_err: | |||
792 | static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, | 813 | static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, |
793 | struct page *page, unsigned int offset, unsigned int bytes) | 814 | struct page *page, unsigned int offset, unsigned int bytes) |
794 | { | 815 | { |
795 | struct inode *inode = page->mapping->host; | 816 | struct inode *inode = page_file_mapping(page)->host; |
796 | struct nfs_page *req; | 817 | struct nfs_page *req; |
797 | 818 | ||
798 | req = nfs_try_to_update_request(inode, page, offset, bytes); | 819 | req = nfs_try_to_update_request(inode, page, offset, bytes); |
@@ -845,7 +866,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) | |||
845 | nfs_release_request(req); | 866 | nfs_release_request(req); |
846 | if (!do_flush) | 867 | if (!do_flush) |
847 | return 0; | 868 | return 0; |
848 | status = nfs_wb_page(page->mapping->host, page); | 869 | status = nfs_wb_page(page_file_mapping(page)->host, page); |
849 | } while (status == 0); | 870 | } while (status == 0); |
850 | return status; | 871 | return status; |
851 | } | 872 | } |
@@ -875,7 +896,7 @@ int nfs_updatepage(struct file *file, struct page *page, | |||
875 | unsigned int offset, unsigned int count) | 896 | unsigned int offset, unsigned int count) |
876 | { | 897 | { |
877 | struct nfs_open_context *ctx = nfs_file_open_context(file); | 898 | struct nfs_open_context *ctx = nfs_file_open_context(file); |
878 | struct inode *inode = page->mapping->host; | 899 | struct inode *inode = page_file_mapping(page)->host; |
879 | int status = 0; | 900 | int status = 0; |
880 | 901 | ||
881 | nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); | 902 | nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); |
@@ -883,7 +904,7 @@ int nfs_updatepage(struct file *file, struct page *page, | |||
883 | dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", | 904 | dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", |
884 | file->f_path.dentry->d_parent->d_name.name, | 905 | file->f_path.dentry->d_parent->d_name.name, |
885 | file->f_path.dentry->d_name.name, count, | 906 | file->f_path.dentry->d_name.name, count, |
886 | (long long)(page_offset(page) + offset)); | 907 | (long long)(page_file_offset(page) + offset)); |
887 | 908 | ||
888 | /* If we're not using byte range locks, and we know the page | 909 | /* If we're not using byte range locks, and we know the page |
889 | * is up to date, it may be more efficient to extend the write | 910 | * is up to date, it may be more efficient to extend the write |
@@ -1474,7 +1495,7 @@ void nfs_retry_commit(struct list_head *page_list, | |||
1474 | nfs_mark_request_commit(req, lseg, cinfo); | 1495 | nfs_mark_request_commit(req, lseg, cinfo); |
1475 | if (!cinfo->dreq) { | 1496 | if (!cinfo->dreq) { |
1476 | dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); | 1497 | dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); |
1477 | dec_bdi_stat(req->wb_page->mapping->backing_dev_info, | 1498 | dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, |
1478 | BDI_RECLAIMABLE); | 1499 | BDI_RECLAIMABLE); |
1479 | } | 1500 | } |
1480 | nfs_unlock_and_release_request(req); | 1501 | nfs_unlock_and_release_request(req); |
@@ -1731,7 +1752,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) | |||
1731 | */ | 1752 | */ |
1732 | int nfs_wb_page(struct inode *inode, struct page *page) | 1753 | int nfs_wb_page(struct inode *inode, struct page *page) |
1733 | { | 1754 | { |
1734 | loff_t range_start = page_offset(page); | 1755 | loff_t range_start = page_file_offset(page); |
1735 | loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); | 1756 | loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); |
1736 | struct writeback_control wbc = { | 1757 | struct writeback_control wbc = { |
1737 | .sync_mode = WB_SYNC_ALL, | 1758 | .sync_mode = WB_SYNC_ALL, |
diff --git a/fs/super.c b/fs/super.c index 4c5d82f56ec4..4bf714459a4b 100644 --- a/fs/super.c +++ b/fs/super.c | |||
@@ -62,7 +62,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) | |||
62 | return -1; | 62 | return -1; |
63 | 63 | ||
64 | if (!grab_super_passive(sb)) | 64 | if (!grab_super_passive(sb)) |
65 | return !sc->nr_to_scan ? 0 : -1; | 65 | return -1; |
66 | 66 | ||
67 | if (sb->s_op && sb->s_op->nr_cached_objects) | 67 | if (sb->s_op && sb->s_op->nr_cached_objects) |
68 | fs_objects = sb->s_op->nr_cached_objects(sb); | 68 | fs_objects = sb->s_op->nr_cached_objects(sb); |
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 489de625cd25..c97c6b9cd38e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/timer.h> | 17 | #include <linux/timer.h> |
18 | #include <linux/writeback.h> | 18 | #include <linux/writeback.h> |
19 | #include <linux/atomic.h> | 19 | #include <linux/atomic.h> |
20 | #include <linux/sysctl.h> | ||
20 | 21 | ||
21 | struct page; | 22 | struct page; |
22 | struct device; | 23 | struct device; |
@@ -304,6 +305,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync); | |||
304 | void set_bdi_congested(struct backing_dev_info *bdi, int sync); | 305 | void set_bdi_congested(struct backing_dev_info *bdi, int sync); |
305 | long congestion_wait(int sync, long timeout); | 306 | long congestion_wait(int sync, long timeout); |
306 | long wait_iff_congested(struct zone *zone, int sync, long timeout); | 307 | long wait_iff_congested(struct zone *zone, int sync, long timeout); |
308 | int pdflush_proc_obsolete(struct ctl_table *table, int write, | ||
309 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
307 | 310 | ||
308 | static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) | 311 | static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) |
309 | { | 312 | { |
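pdflush_proc_obsolete() provides a handler for the old pdflush sysctls that no longer control anything, so the files can keep existing for old userspace instead of vanishing abruptly. A hedged sketch of how such a handler is wired into a sysctl table follows; the table is illustrative and is not the actual kernel/sysctl.c vm_table entry.

#include <linux/sysctl.h>
#include <linux/backing-dev.h>

/* Illustrative only: keep a legacy, read-only sysctl visible while routing
 * accesses through the "this knob is obsolete" handler declared above. */
static struct ctl_table legacy_vm_table[] = {
        {
                .procname     = "nr_pdflush_threads",
                .mode         = 0444,
                .proc_handler = pdflush_proc_obsolete,
        },
        { }
};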
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0edb65dd8edd..7b7ac9ccec7a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h | |||
@@ -160,6 +160,7 @@ enum rq_flag_bits { | |||
160 | __REQ_FLUSH_SEQ, /* request for flush sequence */ | 160 | __REQ_FLUSH_SEQ, /* request for flush sequence */ |
161 | __REQ_IO_STAT, /* account I/O stat */ | 161 | __REQ_IO_STAT, /* account I/O stat */ |
162 | __REQ_MIXED_MERGE, /* merge of different types, fail separately */ | 162 | __REQ_MIXED_MERGE, /* merge of different types, fail separately */ |
163 | __REQ_KERNEL, /* direct IO to kernel pages */ | ||
163 | __REQ_NR_BITS, /* stops here */ | 164 | __REQ_NR_BITS, /* stops here */ |
164 | }; | 165 | }; |
165 | 166 | ||
@@ -201,5 +202,6 @@ enum rq_flag_bits { | |||
201 | #define REQ_IO_STAT (1 << __REQ_IO_STAT) | 202 | #define REQ_IO_STAT (1 << __REQ_IO_STAT) |
202 | #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) | 203 | #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) |
203 | #define REQ_SECURE (1 << __REQ_SECURE) | 204 | #define REQ_SECURE (1 << __REQ_SECURE) |
205 | #define REQ_KERNEL (1 << __REQ_KERNEL) | ||
204 | 206 | ||
205 | #endif /* __LINUX_BLK_TYPES_H */ | 207 | #endif /* __LINUX_BLK_TYPES_H */ |
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 0bd390ce98b2..dfae957398c3 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h | |||
@@ -31,7 +31,7 @@ SUBSYS(cpuacct) | |||
31 | 31 | ||
32 | /* */ | 32 | /* */ |
33 | 33 | ||
34 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 34 | #ifdef CONFIG_MEMCG |
35 | SUBSYS(mem_cgroup) | 35 | SUBSYS(mem_cgroup) |
36 | #endif | 36 | #endif |
37 | 37 | ||
@@ -72,3 +72,9 @@ SUBSYS(net_prio) | |||
72 | #endif | 72 | #endif |
73 | 73 | ||
74 | /* */ | 74 | /* */ |
75 | |||
76 | #ifdef CONFIG_CGROUP_HUGETLB | ||
77 | SUBSYS(hugetlb) | ||
78 | #endif | ||
79 | |||
80 | /* */ | ||
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 51a90b7f2d60..133ddcf83397 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -58,7 +58,7 @@ static inline bool compaction_deferred(struct zone *zone, int order) | |||
58 | if (++zone->compact_considered > defer_limit) | 58 | if (++zone->compact_considered > defer_limit) |
59 | zone->compact_considered = defer_limit; | 59 | zone->compact_considered = defer_limit; |
60 | 60 | ||
61 | return zone->compact_considered < (1UL << zone->compact_defer_shift); | 61 | return zone->compact_considered < defer_limit; |
62 | } | 62 | } |
63 | 63 | ||
64 | #else | 64 | #else |
@@ -85,7 +85,7 @@ static inline void defer_compaction(struct zone *zone, int order) | |||
85 | 85 | ||
86 | static inline bool compaction_deferred(struct zone *zone, int order) | 86 | static inline bool compaction_deferred(struct zone *zone, int order) |
87 | { | 87 | { |
88 | return 1; | 88 | return true; |
89 | } | 89 | } |
90 | 90 | ||
91 | #endif /* CONFIG_COMPACTION */ | 91 | #endif /* CONFIG_COMPACTION */ |
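compaction_deferred() implements an exponential backoff: each time compaction fails, defer_compaction() (not shown in this hunk) resets compact_considered and bumps compact_defer_shift, so defer_limit doubles and more allocation attempts are skipped before compaction is retried. The change above simply reuses the defer_limit the function has already computed instead of re-deriving it from the shift. The stand-alone sketch below reproduces the backoff in userspace; the struct, the cap of 6 and the omission of the per-order failure tracking are simplifications for illustration.

#include <stdio.h>
#include <stdbool.h>

#define MAX_DEFER_SHIFT 6

struct zone_sketch {
        unsigned int compact_considered;
        unsigned int compact_defer_shift;
};

/* Called when compaction fails: double the backoff window. */
static void defer_compaction(struct zone_sketch *z)
{
        z->compact_considered = 0;
        if (++z->compact_defer_shift > MAX_DEFER_SHIFT)
                z->compact_defer_shift = MAX_DEFER_SHIFT;
}

/* Mirrors the patched compaction_deferred(): true while still backing off. */
static bool compaction_deferred(struct zone_sketch *z)
{
        unsigned long defer_limit = 1UL << z->compact_defer_shift;

        if (++z->compact_considered > defer_limit)
                z->compact_considered = defer_limit;
        return z->compact_considered < defer_limit;
}

int main(void)
{
        struct zone_sketch z = { 0, 0 };

        defer_compaction(&z);   /* shift becomes 1, so limit is 2 */
        printf("%d %d\n", compaction_deferred(&z), compaction_deferred(&z));
        return 0;
}

It prints "1 0": after one failure the next attempt is deferred, and the attempt after that is allowed to try compaction again.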
diff --git a/include/linux/fs.h b/include/linux/fs.h index b178f9e91e23..d7eed5b98ae2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -165,6 +165,8 @@ struct inodes_stat_t { | |||
165 | #define READ 0 | 165 | #define READ 0 |
166 | #define WRITE RW_MASK | 166 | #define WRITE RW_MASK |
167 | #define READA RWA_MASK | 167 | #define READA RWA_MASK |
168 | #define KERNEL_READ (READ|REQ_KERNEL) | ||
169 | #define KERNEL_WRITE (WRITE|REQ_KERNEL) | ||
168 | 170 | ||
169 | #define READ_SYNC (READ | REQ_SYNC) | 171 | #define READ_SYNC (READ | REQ_SYNC) |
170 | #define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE) | 172 | #define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE) |
@@ -427,6 +429,7 @@ struct kstatfs; | |||
427 | struct vm_area_struct; | 429 | struct vm_area_struct; |
428 | struct vfsmount; | 430 | struct vfsmount; |
429 | struct cred; | 431 | struct cred; |
432 | struct swap_info_struct; | ||
430 | 433 | ||
431 | extern void __init inode_init(void); | 434 | extern void __init inode_init(void); |
432 | extern void __init inode_init_early(void); | 435 | extern void __init inode_init_early(void); |
@@ -636,6 +639,11 @@ struct address_space_operations { | |||
636 | int (*is_partially_uptodate) (struct page *, read_descriptor_t *, | 639 | int (*is_partially_uptodate) (struct page *, read_descriptor_t *, |
637 | unsigned long); | 640 | unsigned long); |
638 | int (*error_remove_page)(struct address_space *, struct page *); | 641 | int (*error_remove_page)(struct address_space *, struct page *); |
642 | |||
643 | /* swapfile support */ | ||
644 | int (*swap_activate)(struct swap_info_struct *sis, struct file *file, | ||
645 | sector_t *span); | ||
646 | void (*swap_deactivate)(struct file *file); | ||
639 | }; | 647 | }; |
640 | 648 | ||
641 | extern const struct address_space_operations empty_aops; | 649 | extern const struct address_space_operations empty_aops; |
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 1e49be49d324..4883f393f50a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
@@ -23,6 +23,7 @@ struct vm_area_struct; | |||
23 | #define ___GFP_REPEAT 0x400u | 23 | #define ___GFP_REPEAT 0x400u |
24 | #define ___GFP_NOFAIL 0x800u | 24 | #define ___GFP_NOFAIL 0x800u |
25 | #define ___GFP_NORETRY 0x1000u | 25 | #define ___GFP_NORETRY 0x1000u |
26 | #define ___GFP_MEMALLOC 0x2000u | ||
26 | #define ___GFP_COMP 0x4000u | 27 | #define ___GFP_COMP 0x4000u |
27 | #define ___GFP_ZERO 0x8000u | 28 | #define ___GFP_ZERO 0x8000u |
28 | #define ___GFP_NOMEMALLOC 0x10000u | 29 | #define ___GFP_NOMEMALLOC 0x10000u |
@@ -76,9 +77,14 @@ struct vm_area_struct; | |||
76 | #define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) /* See above */ | 77 | #define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) /* See above */ |
77 | #define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) /* See above */ | 78 | #define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) /* See above */ |
78 | #define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* See above */ | 79 | #define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* See above */ |
80 | #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves */ | ||
79 | #define __GFP_COMP ((__force gfp_t)___GFP_COMP) /* Add compound page metadata */ | 81 | #define __GFP_COMP ((__force gfp_t)___GFP_COMP) /* Add compound page metadata */ |
80 | #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Return zeroed page on success */ | 82 | #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Return zeroed page on success */ |
81 | #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */ | 83 | #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves. |
84 | * This takes precedence over the | ||
85 | * __GFP_MEMALLOC flag if both are | ||
86 | * set | ||
87 | */ | ||
82 | #define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */ | 88 | #define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */ |
83 | #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */ | 89 | #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */ |
84 | #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ | 90 | #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ |
@@ -129,7 +135,7 @@ struct vm_area_struct; | |||
129 | /* Control page allocator reclaim behavior */ | 135 | /* Control page allocator reclaim behavior */ |
130 | #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\ | 136 | #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\ |
131 | __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ | 137 | __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ |
132 | __GFP_NORETRY|__GFP_NOMEMALLOC) | 138 | __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) |
133 | 139 | ||
134 | /* Control slab gfp mask during early boot */ | 140 | /* Control slab gfp mask during early boot */ |
135 | #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)) | 141 | #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)) |
@@ -379,6 +385,9 @@ void drain_local_pages(void *dummy); | |||
379 | */ | 385 | */ |
380 | extern gfp_t gfp_allowed_mask; | 386 | extern gfp_t gfp_allowed_mask; |
381 | 387 | ||
388 | /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ | ||
389 | bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); | ||
390 | |||
382 | extern void pm_restrict_gfp_mask(void); | 391 | extern void pm_restrict_gfp_mask(void); |
383 | extern void pm_restore_gfp_mask(void); | 392 | extern void pm_restore_gfp_mask(void); |
384 | 393 | ||
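__GFP_MEMALLOC lets one specific allocation dip into the reserves below the watermarks without the whole task carrying PF_MEMALLOC, which is what the swap-over-NFS path needs for the network buffers that carry swap traffic; __GFP_NOMEMALLOC still takes precedence when both are set, and adding the bit to GFP_RECLAIM_MASK keeps it from being filtered away. A minimal, hedged sketch of the calling convention (the receive-buffer scenario is illustrative, not taken from a particular driver):

#include <linux/gfp.h>

/* Illustrative only: an atomic allocation on a path that must make progress
 * for memory to be freed (e.g. receiving swapped-out data back from the
 * network) may request access to the reserves. */
static struct page *alloc_reserve_page_sketch(void)
{
        return alloc_page(GFP_ATOMIC | __GFP_MEMALLOC);
}

Whether the reserves were actually used can be read back from the page->pfmemalloc bit added in the mm_types.h hunk further down.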
diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 774fa47b3b5b..ef788b5b4a35 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h | |||
@@ -39,10 +39,17 @@ extern unsigned long totalhigh_pages; | |||
39 | 39 | ||
40 | void kmap_flush_unused(void); | 40 | void kmap_flush_unused(void); |
41 | 41 | ||
42 | struct page *kmap_to_page(void *addr); | ||
43 | |||
42 | #else /* CONFIG_HIGHMEM */ | 44 | #else /* CONFIG_HIGHMEM */ |
43 | 45 | ||
44 | static inline unsigned int nr_free_highpages(void) { return 0; } | 46 | static inline unsigned int nr_free_highpages(void) { return 0; } |
45 | 47 | ||
48 | static inline struct page *kmap_to_page(void *addr) | ||
49 | { | ||
50 | return virt_to_page(addr); | ||
51 | } | ||
52 | |||
46 | #define totalhigh_pages 0UL | 53 | #define totalhigh_pages 0UL |
47 | 54 | ||
48 | #ifndef ARCH_HAS_KMAP | 55 | #ifndef ARCH_HAS_KMAP |
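kmap_to_page() is the inverse of kmap(): given a kernel virtual address that may live in the persistent kmap area, it returns the owning struct page, and the !CONFIG_HIGHMEM stub above degenerates to virt_to_page(). A minimal usage sketch (the caller is hypothetical):

#include <linux/highmem.h>
#include <linux/string.h>

/* Illustrative: code handed only a mapped address can recover the page. */
static void zero_and_identify(struct page *page)
{
        void *addr = kmap(page);
        struct page *owner = kmap_to_page(addr);        /* owner == page */

        memset(addr, 0, PAGE_SIZE);
        kunmap(page);
        (void)owner;
}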
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d5d6bbe2259e..225164842ab6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -4,9 +4,11 @@ | |||
4 | #include <linux/mm_types.h> | 4 | #include <linux/mm_types.h> |
5 | #include <linux/fs.h> | 5 | #include <linux/fs.h> |
6 | #include <linux/hugetlb_inline.h> | 6 | #include <linux/hugetlb_inline.h> |
7 | #include <linux/cgroup.h> | ||
7 | 8 | ||
8 | struct ctl_table; | 9 | struct ctl_table; |
9 | struct user_struct; | 10 | struct user_struct; |
11 | struct mmu_gather; | ||
10 | 12 | ||
11 | #ifdef CONFIG_HUGETLB_PAGE | 13 | #ifdef CONFIG_HUGETLB_PAGE |
12 | 14 | ||
@@ -20,6 +22,11 @@ struct hugepage_subpool { | |||
20 | long max_hpages, used_hpages; | 22 | long max_hpages, used_hpages; |
21 | }; | 23 | }; |
22 | 24 | ||
25 | extern spinlock_t hugetlb_lock; | ||
26 | extern int hugetlb_max_hstate __read_mostly; | ||
27 | #define for_each_hstate(h) \ | ||
28 | for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) | ||
29 | |||
23 | struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); | 30 | struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); |
24 | void hugepage_put_subpool(struct hugepage_subpool *spool); | 31 | void hugepage_put_subpool(struct hugepage_subpool *spool); |
25 | 32 | ||
@@ -40,9 +47,14 @@ int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, | |||
40 | struct page **, struct vm_area_struct **, | 47 | struct page **, struct vm_area_struct **, |
41 | unsigned long *, int *, int, unsigned int flags); | 48 | unsigned long *, int *, int, unsigned int flags); |
42 | void unmap_hugepage_range(struct vm_area_struct *, | 49 | void unmap_hugepage_range(struct vm_area_struct *, |
43 | unsigned long, unsigned long, struct page *); | 50 | unsigned long, unsigned long, struct page *); |
44 | void __unmap_hugepage_range(struct vm_area_struct *, | 51 | void __unmap_hugepage_range_final(struct mmu_gather *tlb, |
45 | unsigned long, unsigned long, struct page *); | 52 | struct vm_area_struct *vma, |
53 | unsigned long start, unsigned long end, | ||
54 | struct page *ref_page); | ||
55 | void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | ||
56 | unsigned long start, unsigned long end, | ||
57 | struct page *ref_page); | ||
46 | int hugetlb_prefault(struct address_space *, struct vm_area_struct *); | 58 | int hugetlb_prefault(struct address_space *, struct vm_area_struct *); |
47 | void hugetlb_report_meminfo(struct seq_file *); | 59 | void hugetlb_report_meminfo(struct seq_file *); |
48 | int hugetlb_report_node_meminfo(int, char *); | 60 | int hugetlb_report_node_meminfo(int, char *); |
@@ -98,7 +110,6 @@ static inline unsigned long hugetlb_total_pages(void) | |||
98 | #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) | 110 | #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) |
99 | #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) | 111 | #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) |
100 | #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) | 112 | #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) |
101 | #define unmap_hugepage_range(vma, start, end, page) BUG() | ||
102 | static inline void hugetlb_report_meminfo(struct seq_file *m) | 113 | static inline void hugetlb_report_meminfo(struct seq_file *m) |
103 | { | 114 | { |
104 | } | 115 | } |
@@ -112,13 +123,31 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) | |||
112 | #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) | 123 | #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) |
113 | #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) | 124 | #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) |
114 | #define huge_pte_offset(mm, address) 0 | 125 | #define huge_pte_offset(mm, address) 0 |
115 | #define dequeue_hwpoisoned_huge_page(page) 0 | 126 | static inline int dequeue_hwpoisoned_huge_page(struct page *page) |
127 | { | ||
128 | return 0; | ||
129 | } | ||
130 | |||
116 | static inline void copy_huge_page(struct page *dst, struct page *src) | 131 | static inline void copy_huge_page(struct page *dst, struct page *src) |
117 | { | 132 | { |
118 | } | 133 | } |
119 | 134 | ||
120 | #define hugetlb_change_protection(vma, address, end, newprot) | 135 | #define hugetlb_change_protection(vma, address, end, newprot) |
121 | 136 | ||
137 | static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, | ||
138 | struct vm_area_struct *vma, unsigned long start, | ||
139 | unsigned long end, struct page *ref_page) | ||
140 | { | ||
141 | BUG(); | ||
142 | } | ||
143 | |||
144 | static inline void __unmap_hugepage_range(struct mmu_gather *tlb, | ||
145 | struct vm_area_struct *vma, unsigned long start, | ||
146 | unsigned long end, struct page *ref_page) | ||
147 | { | ||
148 | BUG(); | ||
149 | } | ||
150 | |||
122 | #endif /* !CONFIG_HUGETLB_PAGE */ | 151 | #endif /* !CONFIG_HUGETLB_PAGE */ |
123 | 152 | ||
124 | #define HUGETLB_ANON_FILE "anon_hugepage" | 153 | #define HUGETLB_ANON_FILE "anon_hugepage" |
@@ -199,10 +228,15 @@ struct hstate { | |||
199 | unsigned long resv_huge_pages; | 228 | unsigned long resv_huge_pages; |
200 | unsigned long surplus_huge_pages; | 229 | unsigned long surplus_huge_pages; |
201 | unsigned long nr_overcommit_huge_pages; | 230 | unsigned long nr_overcommit_huge_pages; |
231 | struct list_head hugepage_activelist; | ||
202 | struct list_head hugepage_freelists[MAX_NUMNODES]; | 232 | struct list_head hugepage_freelists[MAX_NUMNODES]; |
203 | unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 233 | unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
204 | unsigned int free_huge_pages_node[MAX_NUMNODES]; | 234 | unsigned int free_huge_pages_node[MAX_NUMNODES]; |
205 | unsigned int surplus_huge_pages_node[MAX_NUMNODES]; | 235 | unsigned int surplus_huge_pages_node[MAX_NUMNODES]; |
236 | #ifdef CONFIG_CGROUP_HUGETLB | ||
237 | /* cgroup control files */ | ||
238 | struct cftype cgroup_files[5]; | ||
239 | #endif | ||
206 | char name[HSTATE_NAME_LEN]; | 240 | char name[HSTATE_NAME_LEN]; |
207 | }; | 241 | }; |
208 | 242 | ||
@@ -302,6 +336,11 @@ static inline unsigned hstate_index_to_shift(unsigned index) | |||
302 | return hstates[index].order + PAGE_SHIFT; | 336 | return hstates[index].order + PAGE_SHIFT; |
303 | } | 337 | } |
304 | 338 | ||
339 | static inline int hstate_index(struct hstate *h) | ||
340 | { | ||
341 | return h - hstates; | ||
342 | } | ||
343 | |||
305 | #else | 344 | #else |
306 | struct hstate {}; | 345 | struct hstate {}; |
307 | #define alloc_huge_page_node(h, nid) NULL | 346 | #define alloc_huge_page_node(h, nid) NULL |
@@ -320,6 +359,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h) | |||
320 | return 1; | 359 | return 1; |
321 | } | 360 | } |
322 | #define hstate_index_to_shift(index) 0 | 361 | #define hstate_index_to_shift(index) 0 |
362 | #define hstate_index(h) 0 | ||
323 | #endif | 363 | #endif |
324 | 364 | ||
325 | #endif /* _LINUX_HUGETLB_H */ | 365 | #endif /* _LINUX_HUGETLB_H */ |
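hstate_index() and for_each_hstate() give each supported hugepage size a stable small integer, which is the idx the hugetlb_cgroup_* functions in the next file use to key their counters and control files. A small sketch of how the two combine (the printing helper is illustrative):

#include <linux/hugetlb.h>
#include <linux/kernel.h>

/* Illustrative: walk every registered hugepage size and report the index
 * used to address its cgroup control files. */
static void dump_hstates_sketch(void)
{
        struct hstate *h;

        for_each_hstate(h)
                pr_info("hstate %d: order %u\n", hstate_index(h), h->order);
}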
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h new file mode 100644 index 000000000000..d73878c694b3 --- /dev/null +++ b/include/linux/hugetlb_cgroup.h | |||
@@ -0,0 +1,126 @@ | |||
1 | /* | ||
2 | * Copyright IBM Corporation, 2012 | ||
3 | * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms of version 2.1 of the GNU Lesser General Public License | ||
7 | * as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, but | ||
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | #ifndef _LINUX_HUGETLB_CGROUP_H | ||
16 | #define _LINUX_HUGETLB_CGROUP_H | ||
17 | |||
18 | #include <linux/res_counter.h> | ||
19 | |||
20 | struct hugetlb_cgroup; | ||
21 | /* | ||
22 | * Minimum page order trackable by hugetlb cgroup. | ||
23 | * At least 3 pages are necessary for all the tracking information. | ||
24 | */ | ||
25 | #define HUGETLB_CGROUP_MIN_ORDER 2 | ||
26 | |||
27 | #ifdef CONFIG_CGROUP_HUGETLB | ||
28 | |||
29 | static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) | ||
30 | { | ||
31 | VM_BUG_ON(!PageHuge(page)); | ||
32 | |||
33 | if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) | ||
34 | return NULL; | ||
35 | return (struct hugetlb_cgroup *)page[2].lru.next; | ||
36 | } | ||
37 | |||
38 | static inline | ||
39 | int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) | ||
40 | { | ||
41 | VM_BUG_ON(!PageHuge(page)); | ||
42 | |||
43 | if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) | ||
44 | return -1; | ||
45 | page[2].lru.next = (void *)h_cg; | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | static inline bool hugetlb_cgroup_disabled(void) | ||
50 | { | ||
51 | if (hugetlb_subsys.disabled) | ||
52 | return true; | ||
53 | return false; | ||
54 | } | ||
55 | |||
56 | extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, | ||
57 | struct hugetlb_cgroup **ptr); | ||
58 | extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, | ||
59 | struct hugetlb_cgroup *h_cg, | ||
60 | struct page *page); | ||
61 | extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, | ||
62 | struct page *page); | ||
63 | extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, | ||
64 | struct hugetlb_cgroup *h_cg); | ||
65 | extern int hugetlb_cgroup_file_init(int idx) __init; | ||
66 | extern void hugetlb_cgroup_migrate(struct page *oldhpage, | ||
67 | struct page *newhpage); | ||
68 | |||
69 | #else | ||
70 | static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) | ||
71 | { | ||
72 | return NULL; | ||
73 | } | ||
74 | |||
75 | static inline | ||
76 | int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) | ||
77 | { | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | static inline bool hugetlb_cgroup_disabled(void) | ||
82 | { | ||
83 | return true; | ||
84 | } | ||
85 | |||
86 | static inline int | ||
87 | hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, | ||
88 | struct hugetlb_cgroup **ptr) | ||
89 | { | ||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | static inline void | ||
94 | hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, | ||
95 | struct hugetlb_cgroup *h_cg, | ||
96 | struct page *page) | ||
97 | { | ||
98 | return; | ||
99 | } | ||
100 | |||
101 | static inline void | ||
102 | hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page) | ||
103 | { | ||
104 | return; | ||
105 | } | ||
106 | |||
107 | static inline void | ||
108 | hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, | ||
109 | struct hugetlb_cgroup *h_cg) | ||
110 | { | ||
111 | return; | ||
112 | } | ||
113 | |||
114 | static inline int __init hugetlb_cgroup_file_init(int idx) | ||
115 | { | ||
116 | return 0; | ||
117 | } | ||
118 | |||
119 | static inline void hugetlb_cgroup_migrate(struct page *oldhpage, | ||
120 | struct page *newhpage) | ||
121 | { | ||
122 | return; | ||
123 | } | ||
124 | |||
125 | #endif /* CONFIG_CGROUP_HUGETLB */ | ||
126 | #endif | ||
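The charge API mirrors memcg's two-phase scheme: reserve the charge against the cgroup first, then either commit it to a specific huge page or hand it back if the page cannot be allocated. Below is a hedged sketch of the expected calling sequence from an allocator's point of view; it is not the actual mm/hugetlb.c code, and locking around the commit is elided.

#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

/* Illustrative calling sequence for the API declared above. */
static struct page *alloc_charged_huge_page_sketch(struct hstate *h, int nid)
{
        struct hugetlb_cgroup *h_cg;
        struct page *page;
        int idx = hstate_index(h);
        unsigned long nr = pages_per_huge_page(h);

        if (hugetlb_cgroup_charge_cgroup(idx, nr, &h_cg))
                return NULL;                    /* over the cgroup limit */

        page = alloc_huge_page_node(h, nid);    /* any allocation path works */
        if (!page) {
                hugetlb_cgroup_uncharge_cgroup(idx, nr, h_cg);
                return NULL;
        }

        hugetlb_cgroup_commit_charge(idx, nr, h_cg, page);
        return page;
}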
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 83e7ba90d6e5..8d9489fdab2e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -38,7 +38,7 @@ struct mem_cgroup_reclaim_cookie { | |||
38 | unsigned int generation; | 38 | unsigned int generation; |
39 | }; | 39 | }; |
40 | 40 | ||
41 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 41 | #ifdef CONFIG_MEMCG |
42 | /* | 42 | /* |
43 | * All "charge" functions with gfp_mask should use GFP_KERNEL or | 43 | * All "charge" functions with gfp_mask should use GFP_KERNEL or |
44 | * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't | 44 | * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't |
@@ -72,8 +72,6 @@ extern void mem_cgroup_uncharge_end(void); | |||
72 | extern void mem_cgroup_uncharge_page(struct page *page); | 72 | extern void mem_cgroup_uncharge_page(struct page *page); |
73 | extern void mem_cgroup_uncharge_cache_page(struct page *page); | 73 | extern void mem_cgroup_uncharge_cache_page(struct page *page); |
74 | 74 | ||
75 | extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | ||
76 | int order); | ||
77 | bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | 75 | bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, |
78 | struct mem_cgroup *memcg); | 76 | struct mem_cgroup *memcg); |
79 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); | 77 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); |
@@ -100,9 +98,9 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) | |||
100 | 98 | ||
101 | extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); | 99 | extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); |
102 | 100 | ||
103 | extern int | 101 | extern void |
104 | mem_cgroup_prepare_migration(struct page *page, | 102 | mem_cgroup_prepare_migration(struct page *page, struct page *newpage, |
105 | struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask); | 103 | struct mem_cgroup **memcgp); |
106 | extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, | 104 | extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, |
107 | struct page *oldpage, struct page *newpage, bool migration_ok); | 105 | struct page *oldpage, struct page *newpage, bool migration_ok); |
108 | 106 | ||
@@ -124,7 +122,7 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, | |||
124 | extern void mem_cgroup_replace_page_cache(struct page *oldpage, | 122 | extern void mem_cgroup_replace_page_cache(struct page *oldpage, |
125 | struct page *newpage); | 123 | struct page *newpage); |
126 | 124 | ||
127 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 125 | #ifdef CONFIG_MEMCG_SWAP |
128 | extern int do_swap_account; | 126 | extern int do_swap_account; |
129 | #endif | 127 | #endif |
130 | 128 | ||
@@ -182,7 +180,6 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, | |||
182 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | 180 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, |
183 | gfp_t gfp_mask, | 181 | gfp_t gfp_mask, |
184 | unsigned long *total_scanned); | 182 | unsigned long *total_scanned); |
185 | u64 mem_cgroup_get_limit(struct mem_cgroup *memcg); | ||
186 | 183 | ||
187 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); | 184 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); |
188 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 185 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
@@ -193,7 +190,7 @@ void mem_cgroup_split_huge_fixup(struct page *head); | |||
193 | bool mem_cgroup_bad_page_check(struct page *page); | 190 | bool mem_cgroup_bad_page_check(struct page *page); |
194 | void mem_cgroup_print_bad_page(struct page *page); | 191 | void mem_cgroup_print_bad_page(struct page *page); |
195 | #endif | 192 | #endif |
196 | #else /* CONFIG_CGROUP_MEM_RES_CTLR */ | 193 | #else /* CONFIG_MEMCG */ |
197 | struct mem_cgroup; | 194 | struct mem_cgroup; |
198 | 195 | ||
199 | static inline int mem_cgroup_newpage_charge(struct page *page, | 196 | static inline int mem_cgroup_newpage_charge(struct page *page, |
@@ -279,11 +276,10 @@ static inline struct cgroup_subsys_state | |||
279 | return NULL; | 276 | return NULL; |
280 | } | 277 | } |
281 | 278 | ||
282 | static inline int | 279 | static inline void |
283 | mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | 280 | mem_cgroup_prepare_migration(struct page *page, struct page *newpage, |
284 | struct mem_cgroup **memcgp, gfp_t gfp_mask) | 281 | struct mem_cgroup **memcgp) |
285 | { | 282 | { |
286 | return 0; | ||
287 | } | 283 | } |
288 | 284 | ||
289 | static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, | 285 | static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, |
@@ -366,12 +362,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
366 | return 0; | 362 | return 0; |
367 | } | 363 | } |
368 | 364 | ||
369 | static inline | ||
370 | u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | ||
371 | { | ||
372 | return 0; | ||
373 | } | ||
374 | |||
375 | static inline void mem_cgroup_split_huge_fixup(struct page *head) | 365 | static inline void mem_cgroup_split_huge_fixup(struct page *head) |
376 | { | 366 | { |
377 | } | 367 | } |
@@ -384,9 +374,9 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
384 | struct page *newpage) | 374 | struct page *newpage) |
385 | { | 375 | { |
386 | } | 376 | } |
387 | #endif /* CONFIG_CGROUP_MEM_RES_CTLR */ | 377 | #endif /* CONFIG_MEMCG */ |
388 | 378 | ||
389 | #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) | 379 | #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM) |
390 | static inline bool | 380 | static inline bool |
391 | mem_cgroup_bad_page_check(struct page *page) | 381 | mem_cgroup_bad_page_check(struct page *page) |
392 | { | 382 | { |
@@ -406,7 +396,7 @@ enum { | |||
406 | }; | 396 | }; |
407 | 397 | ||
408 | struct sock; | 398 | struct sock; |
409 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 399 | #ifdef CONFIG_MEMCG_KMEM |
410 | void sock_update_memcg(struct sock *sk); | 400 | void sock_update_memcg(struct sock *sk); |
411 | void sock_release_memcg(struct sock *sk); | 401 | void sock_release_memcg(struct sock *sk); |
412 | #else | 402 | #else |
@@ -416,6 +406,6 @@ static inline void sock_update_memcg(struct sock *sk) | |||
416 | static inline void sock_release_memcg(struct sock *sk) | 406 | static inline void sock_release_memcg(struct sock *sk) |
417 | { | 407 | { |
418 | } | 408 | } |
419 | #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ | 409 | #endif /* CONFIG_MEMCG_KMEM */ |
420 | #endif /* _LINUX_MEMCONTROL_H */ | 410 | #endif /* _LINUX_MEMCONTROL_H */ |
421 | 411 | ||
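mem_cgroup_prepare_migration() no longer returns an error or takes a gfp mask, so callers simply bracket the page copy with a prepare/end pair. A sketch of the new calling shape, grounded only in the prototypes above; the actual unmap/copy/remap steps of migration are elided.

#include <linux/memcontrol.h>
#include <linux/mm.h>

/* Illustrative: the charge is moved speculatively before the copy and
 * finalized (or rolled back) once we know whether migration worked. */
static void migrate_one_page_sketch(struct page *page, struct page *newpage,
                                    bool migration_ok)
{
        struct mem_cgroup *memcg;

        mem_cgroup_prepare_migration(page, newpage, &memcg);
        /* ... unmap 'page', copy it into 'newpage', remap ... */
        mem_cgroup_end_migration(memcg, page, newpage, migration_ok);
}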
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 855c337b20c3..ce7e6671968b 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
@@ -15,7 +15,7 @@ extern int migrate_page(struct address_space *, | |||
15 | extern int migrate_pages(struct list_head *l, new_page_t x, | 15 | extern int migrate_pages(struct list_head *l, new_page_t x, |
16 | unsigned long private, bool offlining, | 16 | unsigned long private, bool offlining, |
17 | enum migrate_mode mode); | 17 | enum migrate_mode mode); |
18 | extern int migrate_huge_pages(struct list_head *l, new_page_t x, | 18 | extern int migrate_huge_page(struct page *, new_page_t x, |
19 | unsigned long private, bool offlining, | 19 | unsigned long private, bool offlining, |
20 | enum migrate_mode mode); | 20 | enum migrate_mode mode); |
21 | 21 | ||
@@ -36,7 +36,7 @@ static inline void putback_lru_pages(struct list_head *l) {} | |||
36 | static inline int migrate_pages(struct list_head *l, new_page_t x, | 36 | static inline int migrate_pages(struct list_head *l, new_page_t x, |
37 | unsigned long private, bool offlining, | 37 | unsigned long private, bool offlining, |
38 | enum migrate_mode mode) { return -ENOSYS; } | 38 | enum migrate_mode mode) { return -ENOSYS; } |
39 | static inline int migrate_huge_pages(struct list_head *l, new_page_t x, | 39 | static inline int migrate_huge_page(struct page *page, new_page_t x, |
40 | unsigned long private, bool offlining, | 40 | unsigned long private, bool offlining, |
41 | enum migrate_mode mode) { return -ENOSYS; } | 41 | enum migrate_mode mode) { return -ENOSYS; } |
42 | 42 | ||
diff --git a/include/linux/mm.h b/include/linux/mm.h index f9f279cf5b1b..bd079a1b0fdc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -805,6 +805,17 @@ static inline void *page_rmapping(struct page *page) | |||
805 | return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); | 805 | return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); |
806 | } | 806 | } |
807 | 807 | ||
808 | extern struct address_space *__page_file_mapping(struct page *); | ||
809 | |||
810 | static inline | ||
811 | struct address_space *page_file_mapping(struct page *page) | ||
812 | { | ||
813 | if (unlikely(PageSwapCache(page))) | ||
814 | return __page_file_mapping(page); | ||
815 | |||
816 | return page->mapping; | ||
817 | } | ||
818 | |||
808 | static inline int PageAnon(struct page *page) | 819 | static inline int PageAnon(struct page *page) |
809 | { | 820 | { |
810 | return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; | 821 | return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; |
@@ -821,6 +832,20 @@ static inline pgoff_t page_index(struct page *page) | |||
821 | return page->index; | 832 | return page->index; |
822 | } | 833 | } |
823 | 834 | ||
835 | extern pgoff_t __page_file_index(struct page *page); | ||
836 | |||
837 | /* | ||
838 | * Return the file index of the page. Regular pagecache pages use ->index | ||
839 | * whereas swapcache pages use swp_offset(->private) | ||
840 | */ | ||
841 | static inline pgoff_t page_file_index(struct page *page) | ||
842 | { | ||
843 | if (unlikely(PageSwapCache(page))) | ||
844 | return __page_file_index(page); | ||
845 | |||
846 | return page->index; | ||
847 | } | ||
848 | |||
824 | /* | 849 | /* |
825 | * Return true if this page is mapped into pagetables. | 850 | * Return true if this page is mapped into pagetables. |
826 | */ | 851 | */ |
@@ -994,6 +1019,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
994 | struct page **pages, struct vm_area_struct **vmas); | 1019 | struct page **pages, struct vm_area_struct **vmas); |
995 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | 1020 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, |
996 | struct page **pages); | 1021 | struct page **pages); |
1022 | struct kvec; | ||
1023 | int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, | ||
1024 | struct page **pages); | ||
1025 | int get_kernel_page(unsigned long start, int write, struct page **pages); | ||
997 | struct page *get_dump_page(unsigned long addr); | 1026 | struct page *get_dump_page(unsigned long addr); |
998 | 1027 | ||
999 | extern int try_to_release_page(struct page * page, gfp_t gfp_mask); | 1028 | extern int try_to_release_page(struct page * page, gfp_t gfp_mask); |
@@ -1331,6 +1360,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...); | |||
1331 | extern void setup_per_cpu_pageset(void); | 1360 | extern void setup_per_cpu_pageset(void); |
1332 | 1361 | ||
1333 | extern void zone_pcp_update(struct zone *zone); | 1362 | extern void zone_pcp_update(struct zone *zone); |
1363 | extern void zone_pcp_reset(struct zone *zone); | ||
1334 | 1364 | ||
1335 | /* nommu.c */ | 1365 | /* nommu.c */ |
1336 | extern atomic_long_t mmap_pages_allocated; | 1366 | extern atomic_long_t mmap_pages_allocated; |
@@ -1528,6 +1558,7 @@ void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); | |||
1528 | static inline void vm_stat_account(struct mm_struct *mm, | 1558 | static inline void vm_stat_account(struct mm_struct *mm, |
1529 | unsigned long flags, struct file *file, long pages) | 1559 | unsigned long flags, struct file *file, long pages) |
1530 | { | 1560 | { |
1561 | mm->total_vm += pages; | ||
1531 | } | 1562 | } |
1532 | #endif /* CONFIG_PROC_FS */ | 1563 | #endif /* CONFIG_PROC_FS */ |
1533 | 1564 | ||
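page_file_index() is needed because a swapcache page's ->private already holds its swp_entry_t, so the offset into the swap area has to be derived from that entry rather than read from ->index; that is what the out-of-line __page_file_index() does. A hedged sketch of the idea, following the comment above rather than copying the mm/swapfile.c implementation:

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>

/* Illustrative: recover the swap offset backing a PageSwapCache page. */
static pgoff_t swapcache_index_sketch(struct page *page)
{
        swp_entry_t entry = { .val = page_private(page) };

        return swp_offset(entry);
}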
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 074eb98fe15d..bf7867200b95 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -54,6 +54,15 @@ struct page { | |||
54 | union { | 54 | union { |
55 | pgoff_t index; /* Our offset within mapping. */ | 55 | pgoff_t index; /* Our offset within mapping. */ |
56 | void *freelist; /* slub/slob first free object */ | 56 | void *freelist; /* slub/slob first free object */ |
57 | bool pfmemalloc; /* If set by the page allocator, | ||
58 | * ALLOC_NO_WATERMARKS was set | ||
59 | * and the low watermark was not | ||
60 | * met implying that the system | ||
61 | * is under some pressure. The | ||
62 | * caller should try ensure | ||
63 | * this page is only used to | ||
64 | * free other pages. | ||
65 | */ | ||
57 | }; | 66 | }; |
58 | 67 | ||
59 | union { | 68 | union { |
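Because pfmemalloc shares a union with ->index and ->freelist, the bit is only meaningful right after allocation, before the page is handed to the page cache or a slab allocator; consumers that care must snapshot it immediately. A hedged sketch of that pattern (the buffer structure and the policy are illustrative, not taken from a specific subsystem):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm_types.h>

/* Illustrative consumer policy: buffers built from pfmemalloc pages are
 * reserved for work that helps reclaim make progress. */
struct rx_buffer_sketch {
        struct page *page;
        bool from_reserves;
};

static int fill_rx_buffer_sketch(struct rx_buffer_sketch *buf, gfp_t gfp)
{
        buf->page = alloc_page(gfp);
        if (!buf->page)
                return -ENOMEM;
        /* Snapshot now: once the page gains a mapping, the union member
         * is reused for ->index. */
        buf->from_reserves = buf->page->pfmemalloc;
        return 0;
}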
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 458988bd55a1..2daa54f55db7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -201,7 +201,7 @@ struct zone_reclaim_stat { | |||
201 | struct lruvec { | 201 | struct lruvec { |
202 | struct list_head lists[NR_LRU_LISTS]; | 202 | struct list_head lists[NR_LRU_LISTS]; |
203 | struct zone_reclaim_stat reclaim_stat; | 203 | struct zone_reclaim_stat reclaim_stat; |
204 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 204 | #ifdef CONFIG_MEMCG |
205 | struct zone *zone; | 205 | struct zone *zone; |
206 | #endif | 206 | #endif |
207 | }; | 207 | }; |
@@ -209,7 +209,6 @@ struct lruvec { | |||
209 | /* Mask used at gathering information at once (see memcontrol.c) */ | 209 | /* Mask used at gathering information at once (see memcontrol.c) */ |
210 | #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) | 210 | #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) |
211 | #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) | 211 | #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) |
212 | #define LRU_ALL_EVICTABLE (LRU_ALL_FILE | LRU_ALL_ANON) | ||
213 | #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) | 212 | #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) |
214 | 213 | ||
215 | /* Isolate clean file */ | 214 | /* Isolate clean file */ |
@@ -369,6 +368,10 @@ struct zone { | |||
369 | */ | 368 | */ |
370 | spinlock_t lock; | 369 | spinlock_t lock; |
371 | int all_unreclaimable; /* All pages pinned */ | 370 | int all_unreclaimable; /* All pages pinned */ |
371 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
372 | /* pfn where the last incremental compaction isolated free pages */ | ||
373 | unsigned long compact_cached_free_pfn; | ||
374 | #endif | ||
372 | #ifdef CONFIG_MEMORY_HOTPLUG | 375 | #ifdef CONFIG_MEMORY_HOTPLUG |
373 | /* see spanned/present_pages for more description */ | 376 | /* see spanned/present_pages for more description */ |
374 | seqlock_t span_seqlock; | 377 | seqlock_t span_seqlock; |
@@ -475,6 +478,14 @@ struct zone { | |||
475 | * rarely used fields: | 478 | * rarely used fields: |
476 | */ | 479 | */ |
477 | const char *name; | 480 | const char *name; |
481 | #ifdef CONFIG_MEMORY_ISOLATION | ||
482 | /* | ||
483 | * the number of MIGRATE_ISOLATE *pageblocks*. | ||
484 | * We need this for free page counting. Look at zone_watermark_ok_safe. | ||
485 | * It's protected by zone->lock | ||
486 | */ | ||
487 | int nr_pageblock_isolate; | ||
488 | #endif | ||
478 | } ____cacheline_internodealigned_in_smp; | 489 | } ____cacheline_internodealigned_in_smp; |
479 | 490 | ||
480 | typedef enum { | 491 | typedef enum { |
@@ -671,7 +682,7 @@ typedef struct pglist_data { | |||
671 | int nr_zones; | 682 | int nr_zones; |
672 | #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ | 683 | #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ |
673 | struct page *node_mem_map; | 684 | struct page *node_mem_map; |
674 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 685 | #ifdef CONFIG_MEMCG |
675 | struct page_cgroup *node_page_cgroup; | 686 | struct page_cgroup *node_page_cgroup; |
676 | #endif | 687 | #endif |
677 | #endif | 688 | #endif |
@@ -694,6 +705,7 @@ typedef struct pglist_data { | |||
694 | range, including holes */ | 705 | range, including holes */ |
695 | int node_id; | 706 | int node_id; |
696 | wait_queue_head_t kswapd_wait; | 707 | wait_queue_head_t kswapd_wait; |
708 | wait_queue_head_t pfmemalloc_wait; | ||
697 | struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ | 709 | struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ |
698 | int kswapd_max_order; | 710 | int kswapd_max_order; |
699 | enum zone_type classzone_idx; | 711 | enum zone_type classzone_idx; |
@@ -718,7 +730,7 @@ typedef struct pglist_data { | |||
718 | #include <linux/memory_hotplug.h> | 730 | #include <linux/memory_hotplug.h> |
719 | 731 | ||
720 | extern struct mutex zonelists_mutex; | 732 | extern struct mutex zonelists_mutex; |
721 | void build_all_zonelists(void *data); | 733 | void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); |
722 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); | 734 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); |
723 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 735 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
724 | int classzone_idx, int alloc_flags); | 736 | int classzone_idx, int alloc_flags); |
@@ -736,7 +748,7 @@ extern void lruvec_init(struct lruvec *lruvec, struct zone *zone); | |||
736 | 748 | ||
737 | static inline struct zone *lruvec_zone(struct lruvec *lruvec) | 749 | static inline struct zone *lruvec_zone(struct lruvec *lruvec) |
738 | { | 750 | { |
739 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 751 | #ifdef CONFIG_MEMCG |
740 | return lruvec->zone; | 752 | return lruvec->zone; |
741 | #else | 753 | #else |
742 | return container_of(lruvec, struct zone, lruvec); | 754 | return container_of(lruvec, struct zone, lruvec); |
@@ -773,7 +785,7 @@ extern int movable_zone; | |||
773 | 785 | ||
774 | static inline int zone_movable_is_highmem(void) | 786 | static inline int zone_movable_is_highmem(void) |
775 | { | 787 | { |
776 | #if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE) | 788 | #if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) |
777 | return movable_zone == ZONE_HIGHMEM; | 789 | return movable_zone == ZONE_HIGHMEM; |
778 | #else | 790 | #else |
779 | return 0; | 791 | return 0; |
@@ -1052,7 +1064,7 @@ struct mem_section { | |||
1052 | 1064 | ||
1053 | /* See declaration of similar field in struct zone */ | 1065 | /* See declaration of similar field in struct zone */ |
1054 | unsigned long *pageblock_flags; | 1066 | unsigned long *pageblock_flags; |
1055 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 1067 | #ifdef CONFIG_MEMCG |
1056 | /* | 1068 | /* |
1057 | * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use | 1069 | * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use |
1058 | * section. (see memcontrol.h/page_cgroup.h about this.) | 1070 | * section. (see memcontrol.h/page_cgroup.h about this.) |
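The new pgdat->pfmemalloc_wait queue gives direct reclaimers a place to sleep when the PFMEMALLOC reserves are under pressure; kswapd is then expected to wake them once watermarks recover. A sketch of the wakeup side using standard waitqueue helpers (the function name is illustrative, not from this series):

        /* Illustrative wakeup of direct reclaimers throttled on pfmemalloc_wait. */
        static void wake_throttled_reclaimers(pg_data_t *pgdat)
        {
                if (waitqueue_active(&pgdat->pfmemalloc_wait))
                        wake_up_all(&pgdat->pfmemalloc_wait);
        }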
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 2889877318bc..1f8fc7f9bcd8 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h | |||
@@ -473,10 +473,10 @@ extern ssize_t nfs_direct_IO(int, struct kiocb *, const struct iovec *, loff_t, | |||
473 | unsigned long); | 473 | unsigned long); |
474 | extern ssize_t nfs_file_direct_read(struct kiocb *iocb, | 474 | extern ssize_t nfs_file_direct_read(struct kiocb *iocb, |
475 | const struct iovec *iov, unsigned long nr_segs, | 475 | const struct iovec *iov, unsigned long nr_segs, |
476 | loff_t pos); | 476 | loff_t pos, bool uio); |
477 | extern ssize_t nfs_file_direct_write(struct kiocb *iocb, | 477 | extern ssize_t nfs_file_direct_write(struct kiocb *iocb, |
478 | const struct iovec *iov, unsigned long nr_segs, | 478 | const struct iovec *iov, unsigned long nr_segs, |
479 | loff_t pos); | 479 | loff_t pos, bool uio); |
480 | 480 | ||
481 | /* | 481 | /* |
482 | * linux/fs/nfs/dir.c | 482 | * linux/fs/nfs/dir.c |
diff --git a/include/linux/oom.h b/include/linux/oom.h index e4c29bc72e70..49a3031fda50 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
@@ -40,15 +40,36 @@ enum oom_constraint { | |||
40 | CONSTRAINT_MEMCG, | 40 | CONSTRAINT_MEMCG, |
41 | }; | 41 | }; |
42 | 42 | ||
43 | enum oom_scan_t { | ||
44 | OOM_SCAN_OK, /* scan thread and find its badness */ | ||
45 | OOM_SCAN_CONTINUE, /* do not consider thread for oom kill */ | ||
46 | OOM_SCAN_ABORT, /* abort the iteration and return */ | ||
47 | OOM_SCAN_SELECT, /* always select this thread first */ | ||
48 | }; | ||
49 | |||
43 | extern void compare_swap_oom_score_adj(int old_val, int new_val); | 50 | extern void compare_swap_oom_score_adj(int old_val, int new_val); |
44 | extern int test_set_oom_score_adj(int new_val); | 51 | extern int test_set_oom_score_adj(int new_val); |
45 | 52 | ||
46 | extern unsigned long oom_badness(struct task_struct *p, | 53 | extern unsigned long oom_badness(struct task_struct *p, |
47 | struct mem_cgroup *memcg, const nodemask_t *nodemask, | 54 | struct mem_cgroup *memcg, const nodemask_t *nodemask, |
48 | unsigned long totalpages); | 55 | unsigned long totalpages); |
56 | extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | ||
57 | unsigned int points, unsigned long totalpages, | ||
58 | struct mem_cgroup *memcg, nodemask_t *nodemask, | ||
59 | const char *message); | ||
60 | |||
49 | extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); | 61 | extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); |
50 | extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); | 62 | extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); |
51 | 63 | ||
64 | extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | ||
65 | int order, const nodemask_t *nodemask); | ||
66 | |||
67 | extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | ||
68 | unsigned long totalpages, const nodemask_t *nodemask, | ||
69 | bool force_kill); | ||
70 | extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | ||
71 | int order); | ||
72 | |||
52 | extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 73 | extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
53 | int order, nodemask_t *mask, bool force_kill); | 74 | int order, nodemask_t *mask, bool force_kill); |
54 | extern int register_oom_notifier(struct notifier_block *nb); | 75 | extern int register_oom_notifier(struct notifier_block *nb); |
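The oom_scan_t codes let every OOM path (global, memcg, sysrq) share one victim-selection loop: SELECT short-circuits to a preferred victim, CONTINUE skips the thread, ABORT stops the scan because a kill is already in flight, and OK falls through to badness scoring. A simplified sketch of such a loop, with locking and the final oom_kill_process() call omitted (the wrapper name is illustrative):

        /* Illustrative victim selection driven by oom_scan_process_thread(). */
        static struct task_struct *pick_oom_victim(unsigned long totalpages,
                                                   const nodemask_t *nodemask)
        {
                struct task_struct *p, *chosen = NULL;
                unsigned long points, chosen_points = 0;

                for_each_process(p) {
                        switch (oom_scan_process_thread(p, totalpages, nodemask, false)) {
                        case OOM_SCAN_SELECT:
                                chosen = p;
                                chosen_points = ULONG_MAX;
                                continue;
                        case OOM_SCAN_CONTINUE:
                                continue;
                        case OOM_SCAN_ABORT:
                                return ERR_PTR(-1UL);   /* a kill is already in progress */
                        case OOM_SCAN_OK:
                                break;
                        }
                        points = oom_badness(p, NULL, nodemask, totalpages);
                        if (points > chosen_points) {
                                chosen = p;
                                chosen_points = points;
                        }
                }
                return chosen;
        }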
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c88d2a9451af..b5d13841604e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
@@ -7,6 +7,7 @@ | |||
7 | 7 | ||
8 | #include <linux/types.h> | 8 | #include <linux/types.h> |
9 | #include <linux/bug.h> | 9 | #include <linux/bug.h> |
10 | #include <linux/mmdebug.h> | ||
10 | #ifndef __GENERATING_BOUNDS_H | 11 | #ifndef __GENERATING_BOUNDS_H |
11 | #include <linux/mm_types.h> | 12 | #include <linux/mm_types.h> |
12 | #include <generated/bounds.h> | 13 | #include <generated/bounds.h> |
@@ -453,6 +454,34 @@ static inline int PageTransTail(struct page *page) | |||
453 | } | 454 | } |
454 | #endif | 455 | #endif |
455 | 456 | ||
457 | /* | ||
458 | * If network-based swap is enabled, sl*b must keep track of whether pages | ||
459 | * were allocated from pfmemalloc reserves. | ||
460 | */ | ||
461 | static inline int PageSlabPfmemalloc(struct page *page) | ||
462 | { | ||
463 | VM_BUG_ON(!PageSlab(page)); | ||
464 | return PageActive(page); | ||
465 | } | ||
466 | |||
467 | static inline void SetPageSlabPfmemalloc(struct page *page) | ||
468 | { | ||
469 | VM_BUG_ON(!PageSlab(page)); | ||
470 | SetPageActive(page); | ||
471 | } | ||
472 | |||
473 | static inline void __ClearPageSlabPfmemalloc(struct page *page) | ||
474 | { | ||
475 | VM_BUG_ON(!PageSlab(page)); | ||
476 | __ClearPageActive(page); | ||
477 | } | ||
478 | |||
479 | static inline void ClearPageSlabPfmemalloc(struct page *page) | ||
480 | { | ||
481 | VM_BUG_ON(!PageSlab(page)); | ||
482 | ClearPageActive(page); | ||
483 | } | ||
484 | |||
456 | #ifdef CONFIG_MMU | 485 | #ifdef CONFIG_MMU |
457 | #define __PG_MLOCKED (1 << PG_mlocked) | 486 | #define __PG_MLOCKED (1 << PG_mlocked) |
458 | #else | 487 | #else |
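Since slab pages are never on the LRU, PG_active is free to double as the pfmemalloc marker; that is why every helper asserts PageSlab() first. A sketch of how a slab allocator might record the hint when it grows a cache (the function name is illustrative, not from this series):

        /* Illustrative: remember that a freshly allocated slab came from the reserves. */
        static void account_slab_pfmemalloc(struct page *page)
        {
                __SetPageSlab(page);            /* must precede the helpers' VM_BUG_ON check */
                if (page->pfmemalloc)
                        SetPageSlabPfmemalloc(page);
        }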
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 3bdcab30ca41..105077aa7685 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h | |||
@@ -1,6 +1,11 @@ | |||
1 | #ifndef __LINUX_PAGEISOLATION_H | 1 | #ifndef __LINUX_PAGEISOLATION_H |
2 | #define __LINUX_PAGEISOLATION_H | 2 | #define __LINUX_PAGEISOLATION_H |
3 | 3 | ||
4 | |||
5 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count); | ||
6 | void set_pageblock_migratetype(struct page *page, int migratetype); | ||
7 | int move_freepages_block(struct zone *zone, struct page *page, | ||
8 | int migratetype); | ||
4 | /* | 9 | /* |
5 | * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. | 10 | * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. |
6 | * If specified range includes migrate types other than MOVABLE or CMA, | 11 | * If specified range includes migrate types other than MOVABLE or CMA, |
@@ -10,7 +15,7 @@ | |||
10 | * free all pages in the range. test_page_isolated() can be used for | 15 | * free all pages in the range. test_page_isolated() can be used for |
11 | * test it. | 16 | * test it. |
12 | */ | 17 | */ |
13 | extern int | 18 | int |
14 | start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | 19 | start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, |
15 | unsigned migratetype); | 20 | unsigned migratetype); |
16 | 21 | ||
@@ -18,7 +23,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | |||
18 | * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. | 23 | * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. |
19 | * target range is [start_pfn, end_pfn) | 24 | * target range is [start_pfn, end_pfn) |
20 | */ | 25 | */ |
21 | extern int | 26 | int |
22 | undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | 27 | undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, |
23 | unsigned migratetype); | 28 | unsigned migratetype); |
24 | 29 | ||
@@ -30,8 +35,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn); | |||
30 | /* | 35 | /* |
31 | * Internal functions. Changes pageblock's migrate type. | 36 | * Internal functions. Changes pageblock's migrate type. |
32 | */ | 37 | */ |
33 | extern int set_migratetype_isolate(struct page *page); | 38 | int set_migratetype_isolate(struct page *page); |
34 | extern void unset_migratetype_isolate(struct page *page, unsigned migratetype); | 39 | void unset_migratetype_isolate(struct page *page, unsigned migratetype); |
35 | 40 | ||
36 | 41 | ||
37 | #endif | 42 | #endif |
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index a88cdba27809..777a524716db 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h | |||
@@ -12,7 +12,7 @@ enum { | |||
12 | #ifndef __GENERATING_BOUNDS_H | 12 | #ifndef __GENERATING_BOUNDS_H |
13 | #include <generated/bounds.h> | 13 | #include <generated/bounds.h> |
14 | 14 | ||
15 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 15 | #ifdef CONFIG_MEMCG |
16 | #include <linux/bit_spinlock.h> | 16 | #include <linux/bit_spinlock.h> |
17 | 17 | ||
18 | /* | 18 | /* |
@@ -82,7 +82,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc) | |||
82 | bit_spin_unlock(PCG_LOCK, &pc->flags); | 82 | bit_spin_unlock(PCG_LOCK, &pc->flags); |
83 | } | 83 | } |
84 | 84 | ||
85 | #else /* CONFIG_CGROUP_MEM_RES_CTLR */ | 85 | #else /* CONFIG_MEMCG */ |
86 | struct page_cgroup; | 86 | struct page_cgroup; |
87 | 87 | ||
88 | static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | 88 | static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) |
@@ -102,11 +102,11 @@ static inline void __init page_cgroup_init_flatmem(void) | |||
102 | { | 102 | { |
103 | } | 103 | } |
104 | 104 | ||
105 | #endif /* CONFIG_CGROUP_MEM_RES_CTLR */ | 105 | #endif /* CONFIG_MEMCG */ |
106 | 106 | ||
107 | #include <linux/swap.h> | 107 | #include <linux/swap.h> |
108 | 108 | ||
109 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 109 | #ifdef CONFIG_MEMCG_SWAP |
110 | extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | 110 | extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, |
111 | unsigned short old, unsigned short new); | 111 | unsigned short old, unsigned short new); |
112 | extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); | 112 | extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); |
@@ -138,7 +138,7 @@ static inline void swap_cgroup_swapoff(int type) | |||
138 | return; | 138 | return; |
139 | } | 139 | } |
140 | 140 | ||
141 | #endif /* CONFIG_CGROUP_MEM_RES_CTLR_SWAP */ | 141 | #endif /* CONFIG_MEMCG_SWAP */ |
142 | 142 | ||
143 | #endif /* !__GENERATING_BOUNDS_H */ | 143 | #endif /* !__GENERATING_BOUNDS_H */ |
144 | 144 | ||
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7cfad3bbb0cc..e42c762f0dc7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h | |||
@@ -286,6 +286,11 @@ static inline loff_t page_offset(struct page *page) | |||
286 | return ((loff_t)page->index) << PAGE_CACHE_SHIFT; | 286 | return ((loff_t)page->index) << PAGE_CACHE_SHIFT; |
287 | } | 287 | } |
288 | 288 | ||
289 | static inline loff_t page_file_offset(struct page *page) | ||
290 | { | ||
291 | return ((loff_t)page_file_index(page)) << PAGE_CACHE_SHIFT; | ||
292 | } | ||
293 | |||
289 | extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, | 294 | extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, |
290 | unsigned long address); | 295 | unsigned long address); |
291 | 296 | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h index 68dcffaa62a0..c147e7024f11 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1584,7 +1584,7 @@ struct task_struct { | |||
1584 | /* bitmask and counter of trace recursion */ | 1584 | /* bitmask and counter of trace recursion */ |
1585 | unsigned long trace_recursion; | 1585 | unsigned long trace_recursion; |
1586 | #endif /* CONFIG_TRACING */ | 1586 | #endif /* CONFIG_TRACING */ |
1587 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ | 1587 | #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ |
1588 | struct memcg_batch_info { | 1588 | struct memcg_batch_info { |
1589 | int do_batch; /* incremented when batch uncharge started */ | 1589 | int do_batch; /* incremented when batch uncharge started */ |
1590 | struct mem_cgroup *memcg; /* target memcg of uncharge */ | 1590 | struct mem_cgroup *memcg; /* target memcg of uncharge */ |
@@ -1894,6 +1894,13 @@ static inline void rcu_copy_process(struct task_struct *p) | |||
1894 | 1894 | ||
1895 | #endif | 1895 | #endif |
1896 | 1896 | ||
1897 | static inline void tsk_restore_flags(struct task_struct *task, | ||
1898 | unsigned long orig_flags, unsigned long flags) | ||
1899 | { | ||
1900 | task->flags &= ~flags; | ||
1901 | task->flags |= orig_flags & flags; | ||
1902 | } | ||
1903 | |||
1897 | #ifdef CONFIG_SMP | 1904 | #ifdef CONFIG_SMP |
1898 | extern void do_set_cpus_allowed(struct task_struct *p, | 1905 | extern void do_set_cpus_allowed(struct task_struct *p, |
1899 | const struct cpumask *new_mask); | 1906 | const struct cpumask *new_mask); |
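tsk_restore_flags() restores only the bits named in 'flags', leaving every other bit of task->flags as the callee left it. The usual pattern is to snapshot current->flags, set the temporary bit, and restore afterwards, as in this sketch (the kernel/softirq.c hunk later in this section uses the same idiom for PF_MEMALLOC):

        /* Sketch: temporarily run with PF_MEMALLOC, then restore the caller's state. */
        static void do_reserve_work(void)
        {
                unsigned long pflags = current->flags;

                current->flags |= PF_MEMALLOC;
                /* ... allocations here may dip into the emergency reserves ... */
                tsk_restore_flags(current, pflags, PF_MEMALLOC);
        }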
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 07ceb97d53fa..ac6b8ee07825 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h | |||
@@ -20,7 +20,6 @@ struct shrink_control { | |||
20 | * 'nr_to_scan' entries and attempt to free them up. It should return | 20 | * 'nr_to_scan' entries and attempt to free them up. It should return |
21 | * the number of objects which remain in the cache. If it returns -1, it means | 21 | * the number of objects which remain in the cache. If it returns -1, it means |
22 | * it cannot do any scanning at this time (eg. there is a risk of deadlock). | 22 | * it cannot do any scanning at this time (eg. there is a risk of deadlock). |
23 | * The callback must not return -1 if nr_to_scan is zero. | ||
24 | * | 23 | * |
25 | * The 'gfpmask' refers to the allocation we are currently trying to | 24 | * The 'gfpmask' refers to the allocation we are currently trying to |
26 | * fulfil. | 25 | * fulfil. |
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d205c4be7f5b..7632c87da2c9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h | |||
@@ -462,6 +462,7 @@ struct sk_buff { | |||
462 | #ifdef CONFIG_IPV6_NDISC_NODETYPE | 462 | #ifdef CONFIG_IPV6_NDISC_NODETYPE |
463 | __u8 ndisc_nodetype:2; | 463 | __u8 ndisc_nodetype:2; |
464 | #endif | 464 | #endif |
465 | __u8 pfmemalloc:1; | ||
465 | __u8 ooo_okay:1; | 466 | __u8 ooo_okay:1; |
466 | __u8 l4_rxhash:1; | 467 | __u8 l4_rxhash:1; |
467 | __u8 wifi_acked_valid:1; | 468 | __u8 wifi_acked_valid:1; |
@@ -502,6 +503,15 @@ struct sk_buff { | |||
502 | #include <linux/slab.h> | 503 | #include <linux/slab.h> |
503 | 504 | ||
504 | 505 | ||
506 | #define SKB_ALLOC_FCLONE 0x01 | ||
507 | #define SKB_ALLOC_RX 0x02 | ||
508 | |||
509 | /* Returns true if the skb was allocated from PFMEMALLOC reserves */ | ||
510 | static inline bool skb_pfmemalloc(const struct sk_buff *skb) | ||
511 | { | ||
512 | return unlikely(skb->pfmemalloc); | ||
513 | } | ||
514 | |||
505 | /* | 515 | /* |
506 | * skb might have a dst pointer attached, refcounted or not. | 516 | * skb might have a dst pointer attached, refcounted or not. |
507 | * _skb_refdst low order bit is set if refcount was _not_ taken | 517 | * _skb_refdst low order bit is set if refcount was _not_ taken |
@@ -565,7 +575,7 @@ extern bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, | |||
565 | bool *fragstolen, int *delta_truesize); | 575 | bool *fragstolen, int *delta_truesize); |
566 | 576 | ||
567 | extern struct sk_buff *__alloc_skb(unsigned int size, | 577 | extern struct sk_buff *__alloc_skb(unsigned int size, |
568 | gfp_t priority, int fclone, int node); | 578 | gfp_t priority, int flags, int node); |
569 | extern struct sk_buff *build_skb(void *data, unsigned int frag_size); | 579 | extern struct sk_buff *build_skb(void *data, unsigned int frag_size); |
570 | static inline struct sk_buff *alloc_skb(unsigned int size, | 580 | static inline struct sk_buff *alloc_skb(unsigned int size, |
571 | gfp_t priority) | 581 | gfp_t priority) |
@@ -576,7 +586,7 @@ static inline struct sk_buff *alloc_skb(unsigned int size, | |||
576 | static inline struct sk_buff *alloc_skb_fclone(unsigned int size, | 586 | static inline struct sk_buff *alloc_skb_fclone(unsigned int size, |
577 | gfp_t priority) | 587 | gfp_t priority) |
578 | { | 588 | { |
579 | return __alloc_skb(size, priority, 1, NUMA_NO_NODE); | 589 | return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); |
580 | } | 590 | } |
581 | 591 | ||
582 | extern void skb_recycle(struct sk_buff *skb); | 592 | extern void skb_recycle(struct sk_buff *skb); |
@@ -1237,6 +1247,17 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, | |||
1237 | { | 1247 | { |
1238 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; | 1248 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
1239 | 1249 | ||
1250 | /* | ||
1251 | * Propagate page->pfmemalloc to the skb if we can. The problem is | ||
1252 | * that not all callers have unique ownership of the page. If | ||
1253 | * pfmemalloc is set, we check the mapping as a mapping implies | ||
1254 | * page->index is set (index and pfmemalloc share space). | ||
1255 | * If it's a valid mapping, we cannot use page->pfmemalloc but we | ||
1256 | * do not lose pfmemalloc information as the pages would not be | ||
1257 | * allocated using __GFP_MEMALLOC. | ||
1258 | */ | ||
1259 | if (page->pfmemalloc && !page->mapping) | ||
1260 | skb->pfmemalloc = true; | ||
1240 | frag->page.p = page; | 1261 | frag->page.p = page; |
1241 | frag->page_offset = off; | 1262 | frag->page_offset = off; |
1242 | skb_frag_size_set(frag, size); | 1263 | skb_frag_size_set(frag, size); |
@@ -1753,6 +1774,61 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, | |||
1753 | return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC); | 1774 | return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC); |
1754 | } | 1775 | } |
1755 | 1776 | ||
1777 | /* | ||
1778 | * __skb_alloc_pages - allocate pages for ps-rx on a skb and preserve pfmemalloc data | ||
1779 | * @gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX | ||
1780 | * @skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used | ||
1781 | * @order: size of the allocation | ||
1782 | * | ||
1783 | * Allocate a new page. | ||
1784 | * | ||
1785 | * %NULL is returned if there is no free memory. | ||
1786 | */ | ||
1787 | static inline struct page *__skb_alloc_pages(gfp_t gfp_mask, | ||
1788 | struct sk_buff *skb, | ||
1789 | unsigned int order) | ||
1790 | { | ||
1791 | struct page *page; | ||
1792 | |||
1793 | gfp_mask |= __GFP_COLD; | ||
1794 | |||
1795 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | ||
1796 | gfp_mask |= __GFP_MEMALLOC; | ||
1797 | |||
1798 | page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); | ||
1799 | if (skb && page && page->pfmemalloc) | ||
1800 | skb->pfmemalloc = true; | ||
1801 | |||
1802 | return page; | ||
1803 | } | ||
1804 | |||
1805 | /** | ||
1806 | * __skb_alloc_page - allocate a page for ps-rx for a given skb and preserve pfmemalloc data | ||
1807 | * @gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX | ||
1808 | * @skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used | ||
1809 | * | ||
1810 | * Allocate a new page. | ||
1811 | * | ||
1812 | * %NULL is returned if there is no free memory. | ||
1813 | */ | ||
1814 | static inline struct page *__skb_alloc_page(gfp_t gfp_mask, | ||
1815 | struct sk_buff *skb) | ||
1816 | { | ||
1817 | return __skb_alloc_pages(gfp_mask, skb, 0); | ||
1818 | } | ||
1819 | |||
1820 | /** | ||
1821 | * skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page | ||
1822 | * @page: The page that was allocated from __skb_alloc_page | ||
1823 | * @skb: The skb that may need pfmemalloc set | ||
1824 | */ | ||
1825 | static inline void skb_propagate_pfmemalloc(struct page *page, | ||
1826 | struct sk_buff *skb) | ||
1827 | { | ||
1828 | if (page && page->pfmemalloc) | ||
1829 | skb->pfmemalloc = true; | ||
1830 | } | ||
1831 | |||
1756 | /** | 1832 | /** |
1757 | * skb_frag_page - retrieve the page referred to by a paged fragment | 1833 | * skb_frag_page - retrieve the page referred to by a paged fragment |
1758 | * @frag: the paged fragment | 1834 | * @frag: the paged fragment |
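For receive paths that allocate data pages before the skb that will own them, skb_propagate_pfmemalloc() carries the hint across; attaching the page as a fragment via skb_fill_page_desc() propagates it as well through __skb_fill_page_desc(). A hypothetical driver sketch (the mydrv_* name is not from this series):

        /* Hypothetical RX path: allocate the page first, then hand the hint to the skb. */
        static struct sk_buff *mydrv_rx_build_skb(struct net_device *dev,
                                                  unsigned int len)
        {
                struct page *page = __skb_alloc_page(GFP_ATOMIC, NULL);
                struct sk_buff *skb;

                if (!page)
                        return NULL;

                skb = netdev_alloc_skb(dev, len);
                if (!skb) {
                        put_page(page);
                        return NULL;
                }

                skb_propagate_pfmemalloc(page, skb);    /* page was allocated before the skb */
                skb_fill_page_desc(skb, 0, page, 0, len);
                return skb;
        }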
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 77d278defa70..cff40aa7db62 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h | |||
@@ -174,6 +174,8 @@ struct rpc_xprt { | |||
174 | unsigned long state; /* transport state */ | 174 | unsigned long state; /* transport state */ |
175 | unsigned char shutdown : 1, /* being shut down */ | 175 | unsigned char shutdown : 1, /* being shut down */ |
176 | resvport : 1; /* use a reserved port */ | 176 | resvport : 1; /* use a reserved port */ |
177 | unsigned int swapper; /* we're swapping over this | ||
178 | transport */ | ||
177 | unsigned int bind_index; /* bind function index */ | 179 | unsigned int bind_index; /* bind function index */ |
178 | 180 | ||
179 | /* | 181 | /* |
@@ -316,6 +318,7 @@ void xprt_release_rqst_cong(struct rpc_task *task); | |||
316 | void xprt_disconnect_done(struct rpc_xprt *xprt); | 318 | void xprt_disconnect_done(struct rpc_xprt *xprt); |
317 | void xprt_force_disconnect(struct rpc_xprt *xprt); | 319 | void xprt_force_disconnect(struct rpc_xprt *xprt); |
318 | void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); | 320 | void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); |
321 | int xs_swapper(struct rpc_xprt *xprt, int enable); | ||
319 | 322 | ||
320 | /* | 323 | /* |
321 | * Reserved bit positions in xprt->state | 324 | * Reserved bit positions in xprt->state |
diff --git a/include/linux/swap.h b/include/linux/swap.h index c84ec68eaec9..388e70601413 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -151,6 +151,7 @@ enum { | |||
151 | SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ | 151 | SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ |
152 | SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ | 152 | SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ |
153 | SWP_BLKDEV = (1 << 6), /* its a block device */ | 153 | SWP_BLKDEV = (1 << 6), /* its a block device */ |
154 | SWP_FILE = (1 << 7), /* set after swap_activate success */ | ||
154 | /* add others here before... */ | 155 | /* add others here before... */ |
155 | SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ | 156 | SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ |
156 | }; | 157 | }; |
@@ -301,7 +302,7 @@ static inline void scan_unevictable_unregister_node(struct node *node) | |||
301 | 302 | ||
302 | extern int kswapd_run(int nid); | 303 | extern int kswapd_run(int nid); |
303 | extern void kswapd_stop(int nid); | 304 | extern void kswapd_stop(int nid); |
304 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 305 | #ifdef CONFIG_MEMCG |
305 | extern int mem_cgroup_swappiness(struct mem_cgroup *mem); | 306 | extern int mem_cgroup_swappiness(struct mem_cgroup *mem); |
306 | #else | 307 | #else |
307 | static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) | 308 | static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) |
@@ -309,7 +310,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) | |||
309 | return vm_swappiness; | 310 | return vm_swappiness; |
310 | } | 311 | } |
311 | #endif | 312 | #endif |
312 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 313 | #ifdef CONFIG_MEMCG_SWAP |
313 | extern void mem_cgroup_uncharge_swap(swp_entry_t ent); | 314 | extern void mem_cgroup_uncharge_swap(swp_entry_t ent); |
314 | #else | 315 | #else |
315 | static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) | 316 | static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) |
@@ -320,8 +321,14 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
320 | /* linux/mm/page_io.c */ | 321 | /* linux/mm/page_io.c */ |
321 | extern int swap_readpage(struct page *); | 322 | extern int swap_readpage(struct page *); |
322 | extern int swap_writepage(struct page *page, struct writeback_control *wbc); | 323 | extern int swap_writepage(struct page *page, struct writeback_control *wbc); |
324 | extern int swap_set_page_dirty(struct page *page); | ||
323 | extern void end_swap_bio_read(struct bio *bio, int err); | 325 | extern void end_swap_bio_read(struct bio *bio, int err); |
324 | 326 | ||
327 | int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | ||
328 | unsigned long nr_pages, sector_t start_block); | ||
329 | int generic_swapfile_activate(struct swap_info_struct *, struct file *, | ||
330 | sector_t *); | ||
331 | |||
325 | /* linux/mm/swap_state.c */ | 332 | /* linux/mm/swap_state.c */ |
326 | extern struct address_space swapper_space; | 333 | extern struct address_space swapper_space; |
327 | #define total_swapcache_pages swapper_space.nrpages | 334 | #define total_swapcache_pages swapper_space.nrpages |
@@ -356,11 +363,12 @@ extern unsigned int count_swap_pages(int, int); | |||
356 | extern sector_t map_swap_page(struct page *, struct block_device **); | 363 | extern sector_t map_swap_page(struct page *, struct block_device **); |
357 | extern sector_t swapdev_block(int, pgoff_t); | 364 | extern sector_t swapdev_block(int, pgoff_t); |
358 | extern int page_swapcount(struct page *); | 365 | extern int page_swapcount(struct page *); |
366 | extern struct swap_info_struct *page_swap_info(struct page *); | ||
359 | extern int reuse_swap_page(struct page *); | 367 | extern int reuse_swap_page(struct page *); |
360 | extern int try_to_free_swap(struct page *); | 368 | extern int try_to_free_swap(struct page *); |
361 | struct backing_dev_info; | 369 | struct backing_dev_info; |
362 | 370 | ||
363 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 371 | #ifdef CONFIG_MEMCG |
364 | extern void | 372 | extern void |
365 | mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); | 373 | mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); |
366 | #else | 374 | #else |
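SWP_FILE marks a swap area whose I/O goes through the owning filesystem rather than raw block extents, and generic_swapfile_activate() builds the extent map for filesystems that can hand back block mappings. Assuming the swap_activate address_space operation introduced elsewhere in this series, a trivial implementation could look like this (the myfs_* name is hypothetical):

        /* Hypothetical filesystem hook: map the swap file's extents for swap I/O. */
        static int myfs_swap_activate(struct swap_info_struct *sis,
                                      struct file *swap_file, sector_t *span)
        {
                return generic_swapfile_activate(sis, swap_file, span);
        }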
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 06f8e3858251..57f7b1091511 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
@@ -30,6 +30,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
30 | FOR_ALL_ZONES(PGSTEAL_DIRECT), | 30 | FOR_ALL_ZONES(PGSTEAL_DIRECT), |
31 | FOR_ALL_ZONES(PGSCAN_KSWAPD), | 31 | FOR_ALL_ZONES(PGSCAN_KSWAPD), |
32 | FOR_ALL_ZONES(PGSCAN_DIRECT), | 32 | FOR_ALL_ZONES(PGSCAN_DIRECT), |
33 | PGSCAN_DIRECT_THROTTLE, | ||
33 | #ifdef CONFIG_NUMA | 34 | #ifdef CONFIG_NUMA |
34 | PGSCAN_ZONE_RECLAIM_FAILED, | 35 | PGSCAN_ZONE_RECLAIM_FAILED, |
35 | #endif | 36 | #endif |
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 65efb92da996..ad2cfd53dadc 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h | |||
@@ -179,11 +179,6 @@ extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); | |||
179 | #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) | 179 | #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) |
180 | #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) | 180 | #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) |
181 | 181 | ||
182 | static inline void zap_zone_vm_stats(struct zone *zone) | ||
183 | { | ||
184 | memset(zone->vm_stat, 0, sizeof(zone->vm_stat)); | ||
185 | } | ||
186 | |||
187 | extern void inc_zone_state(struct zone *, enum zone_stat_item); | 182 | extern void inc_zone_state(struct zone *, enum zone_stat_item); |
188 | 183 | ||
189 | #ifdef CONFIG_SMP | 184 | #ifdef CONFIG_SMP |
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 6d0a0fcd80e7..c66fe3332d83 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
@@ -189,9 +189,4 @@ void tag_pages_for_writeback(struct address_space *mapping, | |||
189 | 189 | ||
190 | void account_page_redirty(struct page *page); | 190 | void account_page_redirty(struct page *page); |
191 | 191 | ||
192 | /* pdflush.c */ | ||
193 | extern int nr_pdflush_threads; /* Global so it can be exported to sysctl | ||
194 | read-only. */ | ||
195 | |||
196 | |||
197 | #endif /* WRITEBACK_H */ | 192 | #endif /* WRITEBACK_H */ |
diff --git a/include/net/sock.h b/include/net/sock.h index e067f8c18f88..b3730239bf18 100644 --- a/include/net/sock.h +++ b/include/net/sock.h | |||
@@ -621,6 +621,7 @@ enum sock_flags { | |||
621 | SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ | 621 | SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ |
622 | SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ | 622 | SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ |
623 | SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ | 623 | SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ |
624 | SOCK_MEMALLOC, /* VM depends on this socket for swapping */ | ||
624 | SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */ | 625 | SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */ |
625 | SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */ | 626 | SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */ |
626 | SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */ | 627 | SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */ |
@@ -658,6 +659,26 @@ static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) | |||
658 | return test_bit(flag, &sk->sk_flags); | 659 | return test_bit(flag, &sk->sk_flags); |
659 | } | 660 | } |
660 | 661 | ||
662 | #ifdef CONFIG_NET | ||
663 | extern struct static_key memalloc_socks; | ||
664 | static inline int sk_memalloc_socks(void) | ||
665 | { | ||
666 | return static_key_false(&memalloc_socks); | ||
667 | } | ||
668 | #else | ||
669 | |||
670 | static inline int sk_memalloc_socks(void) | ||
671 | { | ||
672 | return 0; | ||
673 | } | ||
674 | |||
675 | #endif | ||
676 | |||
677 | static inline gfp_t sk_gfp_atomic(struct sock *sk, gfp_t gfp_mask) | ||
678 | { | ||
679 | return GFP_ATOMIC | (sk->sk_allocation & __GFP_MEMALLOC); | ||
680 | } | ||
681 | |||
661 | static inline void sk_acceptq_removed(struct sock *sk) | 682 | static inline void sk_acceptq_removed(struct sock *sk) |
662 | { | 683 | { |
663 | sk->sk_ack_backlog--; | 684 | sk->sk_ack_backlog--; |
@@ -733,8 +754,13 @@ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s | |||
733 | return 0; | 754 | return 0; |
734 | } | 755 | } |
735 | 756 | ||
757 | extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); | ||
758 | |||
736 | static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) | 759 | static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) |
737 | { | 760 | { |
761 | if (sk_memalloc_socks() && skb_pfmemalloc(skb)) | ||
762 | return __sk_backlog_rcv(sk, skb); | ||
763 | |||
738 | return sk->sk_backlog_rcv(sk, skb); | 764 | return sk->sk_backlog_rcv(sk, skb); |
739 | } | 765 | } |
740 | 766 | ||
@@ -798,6 +824,8 @@ extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); | |||
798 | extern void sk_stream_wait_close(struct sock *sk, long timeo_p); | 824 | extern void sk_stream_wait_close(struct sock *sk, long timeo_p); |
799 | extern int sk_stream_error(struct sock *sk, int flags, int err); | 825 | extern int sk_stream_error(struct sock *sk, int flags, int err); |
800 | extern void sk_stream_kill_queues(struct sock *sk); | 826 | extern void sk_stream_kill_queues(struct sock *sk); |
827 | extern void sk_set_memalloc(struct sock *sk); | ||
828 | extern void sk_clear_memalloc(struct sock *sk); | ||
801 | 829 | ||
802 | extern int sk_wait_data(struct sock *sk, long *timeo); | 830 | extern int sk_wait_data(struct sock *sk, long *timeo); |
803 | 831 | ||
@@ -913,7 +941,7 @@ struct proto { | |||
913 | #ifdef SOCK_REFCNT_DEBUG | 941 | #ifdef SOCK_REFCNT_DEBUG |
914 | atomic_t socks; | 942 | atomic_t socks; |
915 | #endif | 943 | #endif |
916 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 944 | #ifdef CONFIG_MEMCG_KMEM |
917 | /* | 945 | /* |
918 | * cgroup specific init/deinit functions. Called once for all | 946 | * cgroup specific init/deinit functions. Called once for all |
919 | * protocols that implement it, from cgroups populate function. | 947 | * protocols that implement it, from cgroups populate function. |
@@ -994,7 +1022,7 @@ inline void sk_refcnt_debug_release(const struct sock *sk) | |||
994 | #define sk_refcnt_debug_release(sk) do { } while (0) | 1022 | #define sk_refcnt_debug_release(sk) do { } while (0) |
995 | #endif /* SOCK_REFCNT_DEBUG */ | 1023 | #endif /* SOCK_REFCNT_DEBUG */ |
996 | 1024 | ||
997 | #if defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) && defined(CONFIG_NET) | 1025 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_NET) |
998 | extern struct static_key memcg_socket_limit_enabled; | 1026 | extern struct static_key memcg_socket_limit_enabled; |
999 | static inline struct cg_proto *parent_cg_proto(struct proto *proto, | 1027 | static inline struct cg_proto *parent_cg_proto(struct proto *proto, |
1000 | struct cg_proto *cg_proto) | 1028 | struct cg_proto *cg_proto) |
@@ -1301,12 +1329,14 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size) | |||
1301 | __sk_mem_schedule(sk, size, SK_MEM_SEND); | 1329 | __sk_mem_schedule(sk, size, SK_MEM_SEND); |
1302 | } | 1330 | } |
1303 | 1331 | ||
1304 | static inline bool sk_rmem_schedule(struct sock *sk, int size) | 1332 | static inline bool |
1333 | sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size) | ||
1305 | { | 1334 | { |
1306 | if (!sk_has_account(sk)) | 1335 | if (!sk_has_account(sk)) |
1307 | return true; | 1336 | return true; |
1308 | return size <= sk->sk_forward_alloc || | 1337 | return size <= sk->sk_forward_alloc ||
1309 | __sk_mem_schedule(sk, size, SK_MEM_RECV); | 1338 | __sk_mem_schedule(sk, size, SK_MEM_RECV) || |
1339 | skb_pfmemalloc(skb); | ||
1310 | } | 1340 | } |
1311 | 1341 | ||
1312 | static inline void sk_mem_reclaim(struct sock *sk) | 1342 | static inline void sk_mem_reclaim(struct sock *sk) |
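A subsystem that swaps over a socket marks it with sk_set_memalloc() so its traffic may be processed from the PFMEMALLOC reserves, and uses sk_gfp_atomic() for atomic allocations made on that socket's behalf. A sketch under those assumptions (the mydrv_* names are illustrative):

        /* Illustrative swap-over-network setup and allocation on a reserved socket. */
        static void mydrv_setup_swap_socket(struct sock *sk)
        {
                sk_set_memalloc(sk);    /* sets SOCK_MEMALLOC for this socket */
        }

        static struct sk_buff *mydrv_alloc_ctrl_skb(struct sock *sk, unsigned int len)
        {
                return alloc_skb(len, sk_gfp_atomic(sk, GFP_ATOMIC));
        }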
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index 9fe3a36646e9..d6fd8e5b14b7 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h | |||
@@ -30,6 +30,7 @@ | |||
30 | {(unsigned long)__GFP_COMP, "GFP_COMP"}, \ | 30 | {(unsigned long)__GFP_COMP, "GFP_COMP"}, \ |
31 | {(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \ | 31 | {(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \ |
32 | {(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \ | 32 | {(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \ |
33 | {(unsigned long)__GFP_MEMALLOC, "GFP_MEMALLOC"}, \ | ||
33 | {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ | 34 | {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ |
34 | {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ | 35 | {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ |
35 | {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ | 36 | {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ |
diff --git a/init/Kconfig b/init/Kconfig index b3f55f15e107..af6c7f8ba019 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -686,7 +686,7 @@ config RESOURCE_COUNTERS | |||
686 | This option enables controller independent resource accounting | 686 | This option enables controller independent resource accounting |
687 | infrastructure that works with cgroups. | 687 | infrastructure that works with cgroups. |
688 | 688 | ||
689 | config CGROUP_MEM_RES_CTLR | 689 | config MEMCG |
690 | bool "Memory Resource Controller for Control Groups" | 690 | bool "Memory Resource Controller for Control Groups" |
691 | depends on RESOURCE_COUNTERS | 691 | depends on RESOURCE_COUNTERS |
692 | select MM_OWNER | 692 | select MM_OWNER |
@@ -709,9 +709,9 @@ config CGROUP_MEM_RES_CTLR | |||
709 | This config option also selects MM_OWNER config option, which | 709 | This config option also selects MM_OWNER config option, which |
710 | could in turn add some fork/exit overhead. | 710 | could in turn add some fork/exit overhead. |
711 | 711 | ||
712 | config CGROUP_MEM_RES_CTLR_SWAP | 712 | config MEMCG_SWAP |
713 | bool "Memory Resource Controller Swap Extension" | 713 | bool "Memory Resource Controller Swap Extension" |
714 | depends on CGROUP_MEM_RES_CTLR && SWAP | 714 | depends on MEMCG && SWAP |
715 | help | 715 | help |
716 | Add swap management feature to memory resource controller. When you | 716 | Add swap management feature to memory resource controller. When you |
717 | enable this, you can limit mem+swap usage per cgroup. In other words, | 717 | enable this, you can limit mem+swap usage per cgroup. In other words, |
@@ -726,9 +726,9 @@ config CGROUP_MEM_RES_CTLR_SWAP | |||
726 | if boot option "swapaccount=0" is set, swap will not be accounted. | 726 | if boot option "swapaccount=0" is set, swap will not be accounted. |
727 | Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page | 727 | Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page |
728 | size is 4096bytes, 512k per 1Gbytes of swap. | 728 | size is 4096bytes, 512k per 1Gbytes of swap. |
729 | config CGROUP_MEM_RES_CTLR_SWAP_ENABLED | 729 | config MEMCG_SWAP_ENABLED |
730 | bool "Memory Resource Controller Swap Extension enabled by default" | 730 | bool "Memory Resource Controller Swap Extension enabled by default" |
731 | depends on CGROUP_MEM_RES_CTLR_SWAP | 731 | depends on MEMCG_SWAP |
732 | default y | 732 | default y |
733 | help | 733 | help |
734 | Memory Resource Controller Swap Extension comes with its price in | 734 | Memory Resource Controller Swap Extension comes with its price in |
@@ -739,9 +739,9 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED | |||
739 | For those who want to have the feature enabled by default should | 739 | For those who want to have the feature enabled by default should |
740 | select this option (if, for some reason, they need to disable it | 740 | select this option (if, for some reason, they need to disable it |
741 | then swapaccount=0 does the trick). | 741 | then swapaccount=0 does the trick). |
742 | config CGROUP_MEM_RES_CTLR_KMEM | 742 | config MEMCG_KMEM |
743 | bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)" | 743 | bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)" |
744 | depends on CGROUP_MEM_RES_CTLR && EXPERIMENTAL | 744 | depends on MEMCG && EXPERIMENTAL |
745 | default n | 745 | default n |
746 | help | 746 | help |
747 | The Kernel Memory extension for Memory Resource Controller can limit | 747 | The Kernel Memory extension for Memory Resource Controller can limit |
@@ -751,6 +751,21 @@ config CGROUP_MEM_RES_CTLR_KMEM | |||
751 | the kmem extension can use it to guarantee that no group of processes | 751 | the kmem extension can use it to guarantee that no group of processes |
752 | will ever exhaust kernel resources alone. | 752 | will ever exhaust kernel resources alone. |
753 | 753 | ||
754 | config CGROUP_HUGETLB | ||
755 | bool "HugeTLB Resource Controller for Control Groups" | ||
756 | depends on RESOURCE_COUNTERS && HUGETLB_PAGE && EXPERIMENTAL | ||
757 | default n | ||
758 | help | ||
759 | Provides a cgroup Resource Controller for HugeTLB pages. | ||
760 | When you enable this, you can put a per cgroup limit on HugeTLB usage. | ||
761 | The limit is enforced during page fault. Since HugeTLB doesn't | ||
762 | support page reclaim, enforcing the limit at page fault time implies | ||
763 | that the application will get a SIGBUS signal if it tries to access | ||
764 | HugeTLB pages beyond its limit. This requires the application to know | ||
765 | beforehand how many HugeTLB pages it would require for its use. The | ||
766 | control group is tracked in the third page lru pointer. This means | ||
767 | that we cannot use the controller with huge pages smaller than 3 pages. | ||
768 | |||
754 | config CGROUP_PERF | 769 | config CGROUP_PERF |
755 | bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" | 770 | bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" |
756 | depends on PERF_EVENTS && CGROUPS | 771 | depends on PERF_EVENTS && CGROUPS |
diff --git a/init/main.c b/init/main.c index 95316a1b4a76..e60679de61c3 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -506,7 +506,7 @@ asmlinkage void __init start_kernel(void) | |||
506 | setup_per_cpu_areas(); | 506 | setup_per_cpu_areas(); |
507 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ | 507 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ |
508 | 508 | ||
509 | build_all_zonelists(NULL); | 509 | build_all_zonelists(NULL, NULL); |
510 | page_alloc_init(); | 510 | page_alloc_init(); |
511 | 511 | ||
512 | printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); | 512 | printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); |
diff --git a/kernel/cpu.c b/kernel/cpu.c index a4eb5227a19e..14d32588cccd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
416 | 416 | ||
417 | if (pgdat->node_zonelists->_zonerefs->zone == NULL) { | 417 | if (pgdat->node_zonelists->_zonerefs->zone == NULL) { |
418 | mutex_lock(&zonelists_mutex); | 418 | mutex_lock(&zonelists_mutex); |
419 | build_all_zonelists(NULL); | 419 | build_all_zonelists(NULL, NULL); |
420 | mutex_unlock(&zonelists_mutex); | 420 | mutex_unlock(&zonelists_mutex); |
421 | } | 421 | } |
422 | #endif | 422 | #endif |
diff --git a/kernel/fork.c b/kernel/fork.c index 8efac1fe56bc..3bd2280d79f6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -381,10 +381,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
381 | struct file *file; | 381 | struct file *file; |
382 | 382 | ||
383 | if (mpnt->vm_flags & VM_DONTCOPY) { | 383 | if (mpnt->vm_flags & VM_DONTCOPY) { |
384 | long pages = vma_pages(mpnt); | ||
385 | mm->total_vm -= pages; | ||
386 | vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, | 384 | vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, |
387 | -pages); | 385 | -vma_pages(mpnt)); |
388 | continue; | 386 | continue; |
389 | } | 387 | } |
390 | charge = 0; | 388 | charge = 0; |
@@ -1308,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1308 | #ifdef CONFIG_DEBUG_MUTEXES | 1306 | #ifdef CONFIG_DEBUG_MUTEXES |
1309 | p->blocked_on = NULL; /* not blocked yet */ | 1307 | p->blocked_on = NULL; /* not blocked yet */ |
1310 | #endif | 1308 | #endif |
1311 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 1309 | #ifdef CONFIG_MEMCG |
1312 | p->memcg_batch.do_batch = 0; | 1310 | p->memcg_batch.do_batch = 0; |
1313 | p->memcg_batch.memcg = NULL; | 1311 | p->memcg_batch.memcg = NULL; |
1314 | #endif | 1312 | #endif |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 671f9594e368..b73e681df09e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void) | |||
210 | __u32 pending; | 210 | __u32 pending; |
211 | int max_restart = MAX_SOFTIRQ_RESTART; | 211 | int max_restart = MAX_SOFTIRQ_RESTART; |
212 | int cpu; | 212 | int cpu; |
213 | unsigned long old_flags = current->flags; | ||
214 | |||
215 | /* | ||
216 | * Mask out PF_MEMALLOC as the current task context is borrowed for the | ||
217 | * softirq. A softirq handler such as network RX might set PF_MEMALLOC | ||
218 | * again if the socket is related to swap. | ||
219 | */ | ||
220 | current->flags &= ~PF_MEMALLOC; | ||
213 | 221 | ||
214 | pending = local_softirq_pending(); | 222 | pending = local_softirq_pending(); |
215 | account_system_vtime(current); | 223 | account_system_vtime(current); |
@@ -265,6 +273,7 @@ restart: | |||
265 | 273 | ||
266 | account_system_vtime(current); | 274 | account_system_vtime(current); |
267 | __local_bh_enable(SOFTIRQ_OFFSET); | 275 | __local_bh_enable(SOFTIRQ_OFFSET); |
276 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | ||
268 | } | 277 | } |
269 | 278 | ||
270 | #ifndef __ARCH_HAS_DO_SOFTIRQ | 279 | #ifndef __ARCH_HAS_DO_SOFTIRQ |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97186b99b0e4..6502d35a25ba 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1101,11 +1101,9 @@ static struct ctl_table vm_table[] = { | |||
1101 | .extra1 = &zero, | 1101 | .extra1 = &zero, |
1102 | }, | 1102 | }, |
1103 | { | 1103 | { |
1104 | .procname = "nr_pdflush_threads", | 1104 | .procname = "nr_pdflush_threads", |
1105 | .data = &nr_pdflush_threads, | 1105 | .mode = 0444 /* read-only */, |
1106 | .maxlen = sizeof nr_pdflush_threads, | 1106 | .proc_handler = pdflush_proc_obsolete, |
1107 | .mode = 0444 /* read-only*/, | ||
1108 | .proc_handler = proc_dointvec, | ||
1109 | }, | 1107 | }, |
1110 | { | 1108 | { |
1111 | .procname = "swappiness", | 1109 | .procname = "swappiness", |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index a650694883a1..65bdcf198d4e 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = { | |||
147 | { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, | 147 | { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, |
148 | /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ | 148 | /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ |
149 | /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ | 149 | /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ |
150 | { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, | 150 | /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */ |
151 | { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, | 151 | { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, |
152 | /* VM_PAGEBUF unused */ | 152 | /* VM_PAGEBUF unused */ |
153 | /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ | 153 | /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ |
diff --git a/mm/Kconfig b/mm/Kconfig index 82fed4eb2b6f..d5c8019c6627 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK | |||
140 | config NO_BOOTMEM | 140 | config NO_BOOTMEM |
141 | boolean | 141 | boolean |
142 | 142 | ||
143 | config MEMORY_ISOLATION | ||
144 | boolean | ||
145 | |||
143 | # eventually, we can have this option just 'select SPARSEMEM' | 146 | # eventually, we can have this option just 'select SPARSEMEM' |
144 | config MEMORY_HOTPLUG | 147 | config MEMORY_HOTPLUG |
145 | bool "Allow for memory hot-add" | 148 | bool "Allow for memory hot-add" |
149 | select MEMORY_ISOLATION | ||
146 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 150 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
147 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG | 151 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG |
148 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) | 152 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) |
@@ -272,6 +276,7 @@ config MEMORY_FAILURE | |||
272 | depends on MMU | 276 | depends on MMU |
273 | depends on ARCH_SUPPORTS_MEMORY_FAILURE | 277 | depends on ARCH_SUPPORTS_MEMORY_FAILURE |
274 | bool "Enable recovery from hardware memory errors" | 278 | bool "Enable recovery from hardware memory errors" |
279 | select MEMORY_ISOLATION | ||
275 | help | 280 | help |
276 | Enables code to recover from some memory failures on systems | 281 | Enables code to recover from some memory failures on systems |
277 | with MCA recovery. This allows a system to continue running | 282 | with MCA recovery. This allows a system to continue running |
diff --git a/mm/Makefile b/mm/Makefile index 8e81fe263c94..92753e2d82da 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -15,8 +15,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | |||
15 | maccess.o page_alloc.o page-writeback.o \ | 15 | maccess.o page_alloc.o page-writeback.o \ |
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 17 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
18 | page_isolation.o mm_init.o mmu_context.o percpu.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o slab_common.o $(mmu-y) | 19 | compaction.o $(mmu-y) |
20 | 20 | ||
21 | obj-y += init-mm.o | 21 | obj-y += init-mm.o |
22 | 22 | ||
@@ -49,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o | |||
49 | obj-$(CONFIG_MIGRATION) += migrate.o | 49 | obj-$(CONFIG_MIGRATION) += migrate.o |
50 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 50 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
51 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | 51 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o |
52 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 52 | obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o |
53 | obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o | ||
53 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | 54 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o |
54 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | 55 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o |
55 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | 56 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o |
56 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | 57 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o |
57 | obj-$(CONFIG_CLEANCACHE) += cleancache.o | 58 | obj-$(CONFIG_CLEANCACHE) += cleancache.o |
59 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o | ||
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 3387aea11209..6b4718e2ee34 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -886,3 +886,23 @@ out: | |||
886 | return ret; | 886 | return ret; |
887 | } | 887 | } |
888 | EXPORT_SYMBOL(wait_iff_congested); | 888 | EXPORT_SYMBOL(wait_iff_congested); |
889 | |||
890 | int pdflush_proc_obsolete(struct ctl_table *table, int write, | ||
891 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
892 | { | ||
893 | char kbuf[] = "0\n"; | ||
894 | |||
895 | if (*ppos) { | ||
896 | *lenp = 0; | ||
897 | return 0; | ||
898 | } | ||
899 | |||
900 | if (copy_to_user(buffer, kbuf, sizeof(kbuf))) | ||
901 | return -EFAULT; | ||
902 | printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n", | ||
903 | table->procname); | ||
904 | |||
905 | *lenp = 2; | ||
906 | *ppos += *lenp; | ||
907 | return 2; | ||
908 | } | ||
diff --git a/mm/compaction.c b/mm/compaction.c index 2f42d9528539..e78cb9688421 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -422,6 +422,17 @@ static void isolate_freepages(struct zone *zone, | |||
422 | pfn -= pageblock_nr_pages) { | 422 | pfn -= pageblock_nr_pages) { |
423 | unsigned long isolated; | 423 | unsigned long isolated; |
424 | 424 | ||
425 | /* | ||
426 | * Skip ahead if another thread is compacting in the area | ||
427 | * simultaneously. If we wrapped around, we can only skip | ||
428 | * ahead if zone->compact_cached_free_pfn also wrapped to | ||
429 | * above our starting point. | ||
430 | */ | ||
431 | if (cc->order > 0 && (!cc->wrapped || | ||
432 | zone->compact_cached_free_pfn > | ||
433 | cc->start_free_pfn)) | ||
434 | pfn = min(pfn, zone->compact_cached_free_pfn); | ||
435 | |||
425 | if (!pfn_valid(pfn)) | 436 | if (!pfn_valid(pfn)) |
426 | continue; | 437 | continue; |
427 | 438 | ||
@@ -461,8 +472,11 @@ static void isolate_freepages(struct zone *zone, | |||
461 | * looking for free pages, the search will restart here as | 472 | * looking for free pages, the search will restart here as |
462 | * page migration may have returned some pages to the allocator | 473 | * page migration may have returned some pages to the allocator |
463 | */ | 474 | */ |
464 | if (isolated) | 475 | if (isolated) { |
465 | high_pfn = max(high_pfn, pfn); | 476 | high_pfn = max(high_pfn, pfn); |
477 | if (cc->order > 0) | ||
478 | zone->compact_cached_free_pfn = high_pfn; | ||
479 | } | ||
466 | } | 480 | } |
467 | 481 | ||
468 | /* split_free_page does not map the pages */ | 482 | /* split_free_page does not map the pages */ |
@@ -556,6 +570,20 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
556 | return ISOLATE_SUCCESS; | 570 | return ISOLATE_SUCCESS; |
557 | } | 571 | } |
558 | 572 | ||
573 | /* | ||
574 | * Returns the start pfn of the last page block in a zone. This is the starting | ||
575 | * point for full compaction of a zone. Compaction searches for free pages from | ||
576 | * the end of each zone, while isolate_freepages_block scans forward inside each | ||
577 | * page block. | ||
578 | */ | ||
579 | static unsigned long start_free_pfn(struct zone *zone) | ||
580 | { | ||
581 | unsigned long free_pfn; | ||
582 | free_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
583 | free_pfn &= ~(pageblock_nr_pages-1); | ||
584 | return free_pfn; | ||
585 | } | ||
586 | |||
559 | static int compact_finished(struct zone *zone, | 587 | static int compact_finished(struct zone *zone, |
560 | struct compact_control *cc) | 588 | struct compact_control *cc) |
561 | { | 589 | { |
@@ -565,8 +593,26 @@ static int compact_finished(struct zone *zone, | |||
565 | if (fatal_signal_pending(current)) | 593 | if (fatal_signal_pending(current)) |
566 | return COMPACT_PARTIAL; | 594 | return COMPACT_PARTIAL; |
567 | 595 | ||
568 | /* Compaction run completes if the migrate and free scanner meet */ | 596 | /* |
569 | if (cc->free_pfn <= cc->migrate_pfn) | 597 | * A full (order == -1) compaction run starts at the beginning and |
598 | * end of a zone; it completes when the migrate and free scanner meet. | ||
599 | * A partial (order > 0) compaction can start with the free scanner | ||
600 | * at a random point in the zone, and may have to restart. | ||
601 | */ | ||
602 | if (cc->free_pfn <= cc->migrate_pfn) { | ||
603 | if (cc->order > 0 && !cc->wrapped) { | ||
604 | /* We started partway through; restart at the end. */ | ||
605 | unsigned long free_pfn = start_free_pfn(zone); | ||
606 | zone->compact_cached_free_pfn = free_pfn; | ||
607 | cc->free_pfn = free_pfn; | ||
608 | cc->wrapped = 1; | ||
609 | return COMPACT_CONTINUE; | ||
610 | } | ||
611 | return COMPACT_COMPLETE; | ||
612 | } | ||
613 | |||
614 | /* We wrapped around and ended up where we started. */ | ||
615 | if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) | ||
570 | return COMPACT_COMPLETE; | 616 | return COMPACT_COMPLETE; |
571 | 617 | ||
572 | /* | 618 | /* |
@@ -664,8 +710,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
664 | 710 | ||
665 | /* Setup to move all movable pages to the end of the zone */ | 711 | /* Setup to move all movable pages to the end of the zone */ |
666 | cc->migrate_pfn = zone->zone_start_pfn; | 712 | cc->migrate_pfn = zone->zone_start_pfn; |
667 | cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; | 713 | |
668 | cc->free_pfn &= ~(pageblock_nr_pages-1); | 714 | if (cc->order > 0) { |
715 | /* Incremental compaction. Start where the last one stopped. */ | ||
716 | cc->free_pfn = zone->compact_cached_free_pfn; | ||
717 | cc->start_free_pfn = cc->free_pfn; | ||
718 | } else { | ||
719 | /* Order == -1 starts at the end of the zone. */ | ||
720 | cc->free_pfn = start_free_pfn(zone); | ||
721 | } | ||
669 | 722 | ||
670 | migrate_prep_local(); | 723 | migrate_prep_local(); |
671 | 724 | ||
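
The compaction changes above cache the free scanner's position in zone->compact_cached_free_pfn so that order > 0 (partial) compaction can resume where the previous run left off, wrapping back to the zone end once the scanners meet. start_free_pfn() simply rounds the end of the zone down to a pageblock boundary; a stand-alone illustration of that rounding, with an assumed pageblock size, is:

	#include <stdio.h>

	/* Illustration only: mirrors start_free_pfn()'s arithmetic with a
	 * made-up pageblock size; the kernel derives pageblock_nr_pages
	 * from the huge page order or MAX_ORDER. */
	#define PAGEBLOCK_NR_PAGES 512UL

	static unsigned long start_free_pfn(unsigned long zone_start_pfn,
					    unsigned long spanned_pages)
	{
		unsigned long free_pfn = zone_start_pfn + spanned_pages;
		return free_pfn & ~(PAGEBLOCK_NR_PAGES - 1);
	}

	int main(void)
	{
		/* The free scanner starts at the last pageblock boundary
		 * below zone_start_pfn + spanned_pages: prints 266240. */
		printf("%lu\n", start_free_pfn(4096, 262244));
		return 0;
	}
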
diff --git a/mm/fadvise.c b/mm/fadvise.c index 469491e0af79..9b75a045dbf4 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
93 | spin_unlock(&file->f_lock); | 93 | spin_unlock(&file->f_lock); |
94 | break; | 94 | break; |
95 | case POSIX_FADV_WILLNEED: | 95 | case POSIX_FADV_WILLNEED: |
96 | if (!mapping->a_ops->readpage) { | ||
97 | ret = -EINVAL; | ||
98 | break; | ||
99 | } | ||
100 | |||
101 | /* First and last PARTIAL page! */ | 96 | /* First and last PARTIAL page! */ |
102 | start_index = offset >> PAGE_CACHE_SHIFT; | 97 | start_index = offset >> PAGE_CACHE_SHIFT; |
103 | end_index = endbyte >> PAGE_CACHE_SHIFT; | 98 | end_index = endbyte >> PAGE_CACHE_SHIFT; |
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
106 | nrpages = end_index - start_index + 1; | 101 | nrpages = end_index - start_index + 1; |
107 | if (!nrpages) | 102 | if (!nrpages) |
108 | nrpages = ~0UL; | 103 | nrpages = ~0UL; |
109 | 104 | ||
110 | ret = force_page_cache_readahead(mapping, file, | 105 | /* |
111 | start_index, | 106 | * Ignore return value because fadvise() shall return |
112 | nrpages); | 107 | * success even if the filesystem can't retrieve a hint. |
113 | if (ret > 0) | 108 | */ |
114 | ret = 0; | 109 | force_page_cache_readahead(mapping, file, start_index, |
110 | nrpages); | ||
115 | break; | 111 | break; |
116 | case POSIX_FADV_NOREUSE: | 112 | case POSIX_FADV_NOREUSE: |
117 | break; | 113 | break; |
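
With the ->readpage check removed and the readahead return value ignored, POSIX_FADV_WILLNEED becomes purely advisory: it succeeds even when the filesystem cannot act on the hint, where it previously could fail with EINVAL. A minimal user-space caller (any readable path works; /etc/hostname is just an example):

	#define _XOPEN_SOURCE 600
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/etc/hostname", O_RDONLY);
		if (fd < 0)
			return 1;
		/* Hint that the whole file (len 0 == to EOF) will be needed
		 * soon; with the change above this no longer fails just
		 * because the filesystem lacks readahead support. */
		int err = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
		printf("posix_fadvise: %d\n", err);	/* 0 on success */
		close(fd);
		return 0;
	}
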
diff --git a/mm/highmem.c b/mm/highmem.c index 57d82c6250c3..d517cd16a6eb 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); | |||
94 | do { spin_unlock(&kmap_lock); (void)(flags); } while (0) | 94 | do { spin_unlock(&kmap_lock); (void)(flags); } while (0) |
95 | #endif | 95 | #endif |
96 | 96 | ||
97 | struct page *kmap_to_page(void *vaddr) | ||
98 | { | ||
99 | unsigned long addr = (unsigned long)vaddr; | ||
100 | |||
101 | if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) { | ||
102 | int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; | ||
103 | return pte_page(pkmap_page_table[i]); | ||
104 | } | ||
105 | |||
106 | return virt_to_page(addr); | ||
107 | } | ||
108 | |||
97 | static void flush_all_zero_pkmaps(void) | 109 | static void flush_all_zero_pkmaps(void) |
98 | { | 110 | { |
99 | int i; | 111 | int i; |
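
kmap_to_page() gives code that only holds a kmap()ed virtual address a way back to the underlying struct page, falling back to virt_to_page() for lowmem addresses. A minimal kernel-context sketch of the intended use (not a standalone program):

	#include <linux/highmem.h>

	/* Sketch: map a page, then resolve the mapped address back to the
	 * original struct page via the new helper. */
	static void kmap_to_page_demo(struct page *page)
	{
		void *vaddr = kmap(page);

		WARN_ON(kmap_to_page(vaddr) != page);
		kunmap(page);
	}
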
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e198831276a3..bc727122dd44 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -24,17 +24,20 @@ | |||
24 | 24 | ||
25 | #include <asm/page.h> | 25 | #include <asm/page.h> |
26 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
27 | #include <linux/io.h> | 27 | #include <asm/tlb.h> |
28 | 28 | ||
29 | #include <linux/io.h> | ||
29 | #include <linux/hugetlb.h> | 30 | #include <linux/hugetlb.h> |
31 | #include <linux/hugetlb_cgroup.h> | ||
30 | #include <linux/node.h> | 32 | #include <linux/node.h> |
33 | #include <linux/hugetlb_cgroup.h> | ||
31 | #include "internal.h" | 34 | #include "internal.h" |
32 | 35 | ||
33 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 36 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
34 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 37 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
35 | unsigned long hugepages_treat_as_movable; | 38 | unsigned long hugepages_treat_as_movable; |
36 | 39 | ||
37 | static int max_hstate; | 40 | int hugetlb_max_hstate __read_mostly; |
38 | unsigned int default_hstate_idx; | 41 | unsigned int default_hstate_idx; |
39 | struct hstate hstates[HUGE_MAX_HSTATE]; | 42 | struct hstate hstates[HUGE_MAX_HSTATE]; |
40 | 43 | ||
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate; | |||
45 | static unsigned long __initdata default_hstate_max_huge_pages; | 48 | static unsigned long __initdata default_hstate_max_huge_pages; |
46 | static unsigned long __initdata default_hstate_size; | 49 | static unsigned long __initdata default_hstate_size; |
47 | 50 | ||
48 | #define for_each_hstate(h) \ | ||
49 | for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) | ||
50 | |||
51 | /* | 51 | /* |
52 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 52 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
53 | */ | 53 | */ |
54 | static DEFINE_SPINLOCK(hugetlb_lock); | 54 | DEFINE_SPINLOCK(hugetlb_lock); |
55 | 55 | ||
56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) | 56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) |
57 | { | 57 | { |
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src) | |||
509 | static void enqueue_huge_page(struct hstate *h, struct page *page) | 509 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
510 | { | 510 | { |
511 | int nid = page_to_nid(page); | 511 | int nid = page_to_nid(page); |
512 | list_add(&page->lru, &h->hugepage_freelists[nid]); | 512 | list_move(&page->lru, &h->hugepage_freelists[nid]); |
513 | h->free_huge_pages++; | 513 | h->free_huge_pages++; |
514 | h->free_huge_pages_node[nid]++; | 514 | h->free_huge_pages_node[nid]++; |
515 | } | 515 | } |
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | |||
521 | if (list_empty(&h->hugepage_freelists[nid])) | 521 | if (list_empty(&h->hugepage_freelists[nid])) |
522 | return NULL; | 522 | return NULL; |
523 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); | 523 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); |
524 | list_del(&page->lru); | 524 | list_move(&page->lru, &h->hugepage_activelist); |
525 | set_page_refcounted(page); | 525 | set_page_refcounted(page); |
526 | h->free_huge_pages--; | 526 | h->free_huge_pages--; |
527 | h->free_huge_pages_node[nid]--; | 527 | h->free_huge_pages_node[nid]--; |
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) | |||
593 | 1 << PG_active | 1 << PG_reserved | | 593 | 1 << PG_active | 1 << PG_reserved | |
594 | 1 << PG_private | 1 << PG_writeback); | 594 | 1 << PG_private | 1 << PG_writeback); |
595 | } | 595 | } |
596 | VM_BUG_ON(hugetlb_cgroup_from_page(page)); | ||
596 | set_compound_page_dtor(page, NULL); | 597 | set_compound_page_dtor(page, NULL); |
597 | set_page_refcounted(page); | 598 | set_page_refcounted(page); |
598 | arch_release_hugepage(page); | 599 | arch_release_hugepage(page); |
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page) | |||
625 | page->mapping = NULL; | 626 | page->mapping = NULL; |
626 | BUG_ON(page_count(page)); | 627 | BUG_ON(page_count(page)); |
627 | BUG_ON(page_mapcount(page)); | 628 | BUG_ON(page_mapcount(page)); |
628 | INIT_LIST_HEAD(&page->lru); | ||
629 | 629 | ||
630 | spin_lock(&hugetlb_lock); | 630 | spin_lock(&hugetlb_lock); |
631 | hugetlb_cgroup_uncharge_page(hstate_index(h), | ||
632 | pages_per_huge_page(h), page); | ||
631 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { | 633 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { |
634 | /* remove the page from active list */ | ||
635 | list_del(&page->lru); | ||
632 | update_and_free_page(h, page); | 636 | update_and_free_page(h, page); |
633 | h->surplus_huge_pages--; | 637 | h->surplus_huge_pages--; |
634 | h->surplus_huge_pages_node[nid]--; | 638 | h->surplus_huge_pages_node[nid]--; |
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page) | |||
641 | 645 | ||
642 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 646 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
643 | { | 647 | { |
648 | INIT_LIST_HEAD(&page->lru); | ||
644 | set_compound_page_dtor(page, free_huge_page); | 649 | set_compound_page_dtor(page, free_huge_page); |
645 | spin_lock(&hugetlb_lock); | 650 | spin_lock(&hugetlb_lock); |
651 | set_hugetlb_cgroup(page, NULL); | ||
646 | h->nr_huge_pages++; | 652 | h->nr_huge_pages++; |
647 | h->nr_huge_pages_node[nid]++; | 653 | h->nr_huge_pages_node[nid]++; |
648 | spin_unlock(&hugetlb_lock); | 654 | spin_unlock(&hugetlb_lock); |
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
889 | 895 | ||
890 | spin_lock(&hugetlb_lock); | 896 | spin_lock(&hugetlb_lock); |
891 | if (page) { | 897 | if (page) { |
898 | INIT_LIST_HEAD(&page->lru); | ||
892 | r_nid = page_to_nid(page); | 899 | r_nid = page_to_nid(page); |
893 | set_compound_page_dtor(page, free_huge_page); | 900 | set_compound_page_dtor(page, free_huge_page); |
901 | set_hugetlb_cgroup(page, NULL); | ||
894 | /* | 902 | /* |
895 | * We incremented the global counters already | 903 | * We incremented the global counters already |
896 | */ | 904 | */ |
@@ -993,7 +1001,6 @@ retry: | |||
993 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1001 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
994 | if ((--needed) < 0) | 1002 | if ((--needed) < 0) |
995 | break; | 1003 | break; |
996 | list_del(&page->lru); | ||
997 | /* | 1004 | /* |
998 | * This page is now managed by the hugetlb allocator and has | 1005 | * This page is now managed by the hugetlb allocator and has |
999 | * no users -- drop the buddy allocator's reference. | 1006 | * no users -- drop the buddy allocator's reference. |
@@ -1008,7 +1015,6 @@ free: | |||
1008 | /* Free unnecessary surplus pages to the buddy allocator */ | 1015 | /* Free unnecessary surplus pages to the buddy allocator */ |
1009 | if (!list_empty(&surplus_list)) { | 1016 | if (!list_empty(&surplus_list)) { |
1010 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1017 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
1011 | list_del(&page->lru); | ||
1012 | put_page(page); | 1018 | put_page(page); |
1013 | } | 1019 | } |
1014 | } | 1020 | } |
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1112 | struct hstate *h = hstate_vma(vma); | 1118 | struct hstate *h = hstate_vma(vma); |
1113 | struct page *page; | 1119 | struct page *page; |
1114 | long chg; | 1120 | long chg; |
1121 | int ret, idx; | ||
1122 | struct hugetlb_cgroup *h_cg; | ||
1115 | 1123 | ||
1124 | idx = hstate_index(h); | ||
1116 | /* | 1125 | /* |
1117 | * Processes that did not create the mapping will have no | 1126 | * Processes that did not create the mapping will have no |
1118 | * reserves and will not have accounted against subpool | 1127 | * reserves and will not have accounted against subpool |
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1123 | */ | 1132 | */ |
1124 | chg = vma_needs_reservation(h, vma, addr); | 1133 | chg = vma_needs_reservation(h, vma, addr); |
1125 | if (chg < 0) | 1134 | if (chg < 0) |
1126 | return ERR_PTR(-VM_FAULT_OOM); | 1135 | return ERR_PTR(-ENOMEM); |
1127 | if (chg) | 1136 | if (chg) |
1128 | if (hugepage_subpool_get_pages(spool, chg)) | 1137 | if (hugepage_subpool_get_pages(spool, chg)) |
1129 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1138 | return ERR_PTR(-ENOSPC); |
1130 | 1139 | ||
1140 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); | ||
1141 | if (ret) { | ||
1142 | hugepage_subpool_put_pages(spool, chg); | ||
1143 | return ERR_PTR(-ENOSPC); | ||
1144 | } | ||
1131 | spin_lock(&hugetlb_lock); | 1145 | spin_lock(&hugetlb_lock); |
1132 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); | 1146 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); |
1133 | spin_unlock(&hugetlb_lock); | 1147 | if (page) { |
1134 | 1148 | /* update page cgroup details */ | |
1135 | if (!page) { | 1149 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), |
1150 | h_cg, page); | ||
1151 | spin_unlock(&hugetlb_lock); | ||
1152 | } else { | ||
1153 | spin_unlock(&hugetlb_lock); | ||
1136 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1154 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1137 | if (!page) { | 1155 | if (!page) { |
1156 | hugetlb_cgroup_uncharge_cgroup(idx, | ||
1157 | pages_per_huge_page(h), | ||
1158 | h_cg); | ||
1138 | hugepage_subpool_put_pages(spool, chg); | 1159 | hugepage_subpool_put_pages(spool, chg); |
1139 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1160 | return ERR_PTR(-ENOSPC); |
1140 | } | 1161 | } |
1162 | spin_lock(&hugetlb_lock); | ||
1163 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), | ||
1164 | h_cg, page); | ||
1165 | list_move(&page->lru, &h->hugepage_activelist); | ||
1166 | spin_unlock(&hugetlb_lock); | ||
1141 | } | 1167 | } |
1142 | 1168 | ||
1143 | set_page_private(page, (unsigned long)spool); | 1169 | set_page_private(page, (unsigned long)spool); |
1144 | 1170 | ||
1145 | vma_commit_reservation(h, vma, addr); | 1171 | vma_commit_reservation(h, vma, addr); |
1146 | |||
1147 | return page; | 1172 | return page; |
1148 | } | 1173 | } |
1149 | 1174 | ||
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, | |||
1646 | struct attribute_group *hstate_attr_group) | 1671 | struct attribute_group *hstate_attr_group) |
1647 | { | 1672 | { |
1648 | int retval; | 1673 | int retval; |
1649 | int hi = h - hstates; | 1674 | int hi = hstate_index(h); |
1650 | 1675 | ||
1651 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); | 1676 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); |
1652 | if (!hstate_kobjs[hi]) | 1677 | if (!hstate_kobjs[hi]) |
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node) | |||
1741 | if (!nhs->hugepages_kobj) | 1766 | if (!nhs->hugepages_kobj) |
1742 | return; /* no hstate attributes */ | 1767 | return; /* no hstate attributes */ |
1743 | 1768 | ||
1744 | for_each_hstate(h) | 1769 | for_each_hstate(h) { |
1745 | if (nhs->hstate_kobjs[h - hstates]) { | 1770 | int idx = hstate_index(h); |
1746 | kobject_put(nhs->hstate_kobjs[h - hstates]); | 1771 | if (nhs->hstate_kobjs[idx]) { |
1747 | nhs->hstate_kobjs[h - hstates] = NULL; | 1772 | kobject_put(nhs->hstate_kobjs[idx]); |
1773 | nhs->hstate_kobjs[idx] = NULL; | ||
1748 | } | 1774 | } |
1775 | } | ||
1749 | 1776 | ||
1750 | kobject_put(nhs->hugepages_kobj); | 1777 | kobject_put(nhs->hugepages_kobj); |
1751 | nhs->hugepages_kobj = NULL; | 1778 | nhs->hugepages_kobj = NULL; |
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void) | |||
1848 | hugetlb_unregister_all_nodes(); | 1875 | hugetlb_unregister_all_nodes(); |
1849 | 1876 | ||
1850 | for_each_hstate(h) { | 1877 | for_each_hstate(h) { |
1851 | kobject_put(hstate_kobjs[h - hstates]); | 1878 | kobject_put(hstate_kobjs[hstate_index(h)]); |
1852 | } | 1879 | } |
1853 | 1880 | ||
1854 | kobject_put(hugepages_kobj); | 1881 | kobject_put(hugepages_kobj); |
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void) | |||
1869 | if (!size_to_hstate(default_hstate_size)) | 1896 | if (!size_to_hstate(default_hstate_size)) |
1870 | hugetlb_add_hstate(HUGETLB_PAGE_ORDER); | 1897 | hugetlb_add_hstate(HUGETLB_PAGE_ORDER); |
1871 | } | 1898 | } |
1872 | default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; | 1899 | default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); |
1873 | if (default_hstate_max_huge_pages) | 1900 | if (default_hstate_max_huge_pages) |
1874 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; | 1901 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; |
1875 | 1902 | ||
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1897 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); | 1924 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); |
1898 | return; | 1925 | return; |
1899 | } | 1926 | } |
1900 | BUG_ON(max_hstate >= HUGE_MAX_HSTATE); | 1927 | BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); |
1901 | BUG_ON(order == 0); | 1928 | BUG_ON(order == 0); |
1902 | h = &hstates[max_hstate++]; | 1929 | h = &hstates[hugetlb_max_hstate++]; |
1903 | h->order = order; | 1930 | h->order = order; |
1904 | h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); | 1931 | h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); |
1905 | h->nr_huge_pages = 0; | 1932 | h->nr_huge_pages = 0; |
1906 | h->free_huge_pages = 0; | 1933 | h->free_huge_pages = 0; |
1907 | for (i = 0; i < MAX_NUMNODES; ++i) | 1934 | for (i = 0; i < MAX_NUMNODES; ++i) |
1908 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1935 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1936 | INIT_LIST_HEAD(&h->hugepage_activelist); | ||
1909 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); | 1937 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); |
1910 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); | 1938 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); |
1911 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1939 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1912 | huge_page_size(h)/1024); | 1940 | huge_page_size(h)/1024); |
1941 | /* | ||
1942 | * Add cgroup control files only if the huge page consists | ||
1943 | * of more than two normal pages. This is because we use | ||
1944 | * page[2].lru.next for storing cgroup details. | ||
1945 | */ | ||
1946 | if (order >= HUGETLB_CGROUP_MIN_ORDER) | ||
1947 | hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); | ||
1913 | 1948 | ||
1914 | parsed_hstate = h; | 1949 | parsed_hstate = h; |
1915 | } | 1950 | } |
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s) | |||
1920 | static unsigned long *last_mhp; | 1955 | static unsigned long *last_mhp; |
1921 | 1956 | ||
1922 | /* | 1957 | /* |
1923 | * !max_hstate means we haven't parsed a hugepagesz= parameter yet, | 1958 | * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, |
1924 | * so this hugepages= parameter goes to the "default hstate". | 1959 | * so this hugepages= parameter goes to the "default hstate". |
1925 | */ | 1960 | */ |
1926 | if (!max_hstate) | 1961 | if (!hugetlb_max_hstate) |
1927 | mhp = &default_hstate_max_huge_pages; | 1962 | mhp = &default_hstate_max_huge_pages; |
1928 | else | 1963 | else |
1929 | mhp = &parsed_hstate->max_huge_pages; | 1964 | mhp = &parsed_hstate->max_huge_pages; |
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s) | |||
1942 | * But we need to allocate >= MAX_ORDER hstates here early to still | 1977 | * But we need to allocate >= MAX_ORDER hstates here early to still |
1943 | * use the bootmem allocator. | 1978 | * use the bootmem allocator. |
1944 | */ | 1979 | */ |
1945 | if (max_hstate && parsed_hstate->order >= MAX_ORDER) | 1980 | if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) |
1946 | hugetlb_hstate_alloc_pages(parsed_hstate); | 1981 | hugetlb_hstate_alloc_pages(parsed_hstate); |
1947 | 1982 | ||
1948 | last_mhp = mhp; | 1983 | last_mhp = mhp; |
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) | |||
2308 | return 0; | 2343 | return 0; |
2309 | } | 2344 | } |
2310 | 2345 | ||
2311 | void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 2346 | void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, |
2312 | unsigned long end, struct page *ref_page) | 2347 | unsigned long start, unsigned long end, |
2348 | struct page *ref_page) | ||
2313 | { | 2349 | { |
2350 | int force_flush = 0; | ||
2314 | struct mm_struct *mm = vma->vm_mm; | 2351 | struct mm_struct *mm = vma->vm_mm; |
2315 | unsigned long address; | 2352 | unsigned long address; |
2316 | pte_t *ptep; | 2353 | pte_t *ptep; |
2317 | pte_t pte; | 2354 | pte_t pte; |
2318 | struct page *page; | 2355 | struct page *page; |
2319 | struct page *tmp; | ||
2320 | struct hstate *h = hstate_vma(vma); | 2356 | struct hstate *h = hstate_vma(vma); |
2321 | unsigned long sz = huge_page_size(h); | 2357 | unsigned long sz = huge_page_size(h); |
2322 | 2358 | ||
2323 | /* | ||
2324 | * A page gathering list, protected by per file i_mmap_mutex. The | ||
2325 | * lock is used to avoid list corruption from multiple unmapping | ||
2326 | * of the same page since we are using page->lru. | ||
2327 | */ | ||
2328 | LIST_HEAD(page_list); | ||
2329 | |||
2330 | WARN_ON(!is_vm_hugetlb_page(vma)); | 2359 | WARN_ON(!is_vm_hugetlb_page(vma)); |
2331 | BUG_ON(start & ~huge_page_mask(h)); | 2360 | BUG_ON(start & ~huge_page_mask(h)); |
2332 | BUG_ON(end & ~huge_page_mask(h)); | 2361 | BUG_ON(end & ~huge_page_mask(h)); |
2333 | 2362 | ||
2363 | tlb_start_vma(tlb, vma); | ||
2334 | mmu_notifier_invalidate_range_start(mm, start, end); | 2364 | mmu_notifier_invalidate_range_start(mm, start, end); |
2365 | again: | ||
2335 | spin_lock(&mm->page_table_lock); | 2366 | spin_lock(&mm->page_table_lock); |
2336 | for (address = start; address < end; address += sz) { | 2367 | for (address = start; address < end; address += sz) { |
2337 | ptep = huge_pte_offset(mm, address); | 2368 | ptep = huge_pte_offset(mm, address); |
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2370 | } | 2401 | } |
2371 | 2402 | ||
2372 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 2403 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
2404 | tlb_remove_tlb_entry(tlb, ptep, address); | ||
2373 | if (pte_dirty(pte)) | 2405 | if (pte_dirty(pte)) |
2374 | set_page_dirty(page); | 2406 | set_page_dirty(page); |
2375 | list_add(&page->lru, &page_list); | ||
2376 | 2407 | ||
2408 | page_remove_rmap(page); | ||
2409 | force_flush = !__tlb_remove_page(tlb, page); | ||
2410 | if (force_flush) | ||
2411 | break; | ||
2377 | /* Bail out after unmapping reference page if supplied */ | 2412 | /* Bail out after unmapping reference page if supplied */ |
2378 | if (ref_page) | 2413 | if (ref_page) |
2379 | break; | 2414 | break; |
2380 | } | 2415 | } |
2381 | flush_tlb_range(vma, start, end); | ||
2382 | spin_unlock(&mm->page_table_lock); | 2416 | spin_unlock(&mm->page_table_lock); |
2383 | mmu_notifier_invalidate_range_end(mm, start, end); | 2417 | /* |
2384 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 2418 | * mmu_gather ran out of room to batch pages, we break out of |
2385 | page_remove_rmap(page); | 2419 | * the PTE lock to avoid doing the potentially expensive TLB invalidate |
2386 | list_del(&page->lru); | 2420 | * and page-free while holding it. |
2387 | put_page(page); | 2421 | */ |
2422 | if (force_flush) { | ||
2423 | force_flush = 0; | ||
2424 | tlb_flush_mmu(tlb); | ||
2425 | if (address < end && !ref_page) | ||
2426 | goto again; | ||
2388 | } | 2427 | } |
2428 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
2429 | tlb_end_vma(tlb, vma); | ||
2430 | } | ||
2431 | |||
2432 | void __unmap_hugepage_range_final(struct mmu_gather *tlb, | ||
2433 | struct vm_area_struct *vma, unsigned long start, | ||
2434 | unsigned long end, struct page *ref_page) | ||
2435 | { | ||
2436 | __unmap_hugepage_range(tlb, vma, start, end, ref_page); | ||
2437 | |||
2438 | /* | ||
2439 | * Clear this flag so that x86's huge_pmd_share page_table_shareable | ||
2440 | * test will fail on a vma being torn down, and not grab a page table | ||
2441 | * on its way out. We're lucky that the flag has such an appropriate | ||
2442 | * name, and can in fact be safely cleared here. We could clear it | ||
2443 | * before the __unmap_hugepage_range above, but all that's necessary | ||
2444 | * is to clear it before releasing the i_mmap_mutex. This works | ||
2445 | * because in the context this is called, the VMA is about to be | ||
2446 | * destroyed and the i_mmap_mutex is held. | ||
2447 | */ | ||
2448 | vma->vm_flags &= ~VM_MAYSHARE; | ||
2389 | } | 2449 | } |
2390 | 2450 | ||
2391 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 2451 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, |
2392 | unsigned long end, struct page *ref_page) | 2452 | unsigned long end, struct page *ref_page) |
2393 | { | 2453 | { |
2394 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); | 2454 | struct mm_struct *mm; |
2395 | __unmap_hugepage_range(vma, start, end, ref_page); | 2455 | struct mmu_gather tlb; |
2396 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 2456 | |
2457 | mm = vma->vm_mm; | ||
2458 | |||
2459 | tlb_gather_mmu(&tlb, mm, 0); | ||
2460 | __unmap_hugepage_range(&tlb, vma, start, end, ref_page); | ||
2461 | tlb_finish_mmu(&tlb, start, end); | ||
2397 | } | 2462 | } |
2398 | 2463 | ||
2399 | /* | 2464 | /* |
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2438 | * from the time of fork. This would look like data corruption | 2503 | * from the time of fork. This would look like data corruption |
2439 | */ | 2504 | */ |
2440 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | 2505 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) |
2441 | __unmap_hugepage_range(iter_vma, | 2506 | unmap_hugepage_range(iter_vma, address, |
2442 | address, address + huge_page_size(h), | 2507 | address + huge_page_size(h), page); |
2443 | page); | ||
2444 | } | 2508 | } |
2445 | mutex_unlock(&mapping->i_mmap_mutex); | 2509 | mutex_unlock(&mapping->i_mmap_mutex); |
2446 | 2510 | ||
@@ -2496,6 +2560,7 @@ retry_avoidcopy: | |||
2496 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2560 | new_page = alloc_huge_page(vma, address, outside_reserve); |
2497 | 2561 | ||
2498 | if (IS_ERR(new_page)) { | 2562 | if (IS_ERR(new_page)) { |
2563 | long err = PTR_ERR(new_page); | ||
2499 | page_cache_release(old_page); | 2564 | page_cache_release(old_page); |
2500 | 2565 | ||
2501 | /* | 2566 | /* |
@@ -2524,7 +2589,10 @@ retry_avoidcopy: | |||
2524 | 2589 | ||
2525 | /* Caller expects lock to be held */ | 2590 | /* Caller expects lock to be held */ |
2526 | spin_lock(&mm->page_table_lock); | 2591 | spin_lock(&mm->page_table_lock); |
2527 | return -PTR_ERR(new_page); | 2592 | if (err == -ENOMEM) |
2593 | return VM_FAULT_OOM; | ||
2594 | else | ||
2595 | return VM_FAULT_SIGBUS; | ||
2528 | } | 2596 | } |
2529 | 2597 | ||
2530 | /* | 2598 | /* |
@@ -2642,7 +2710,11 @@ retry: | |||
2642 | goto out; | 2710 | goto out; |
2643 | page = alloc_huge_page(vma, address, 0); | 2711 | page = alloc_huge_page(vma, address, 0); |
2644 | if (IS_ERR(page)) { | 2712 | if (IS_ERR(page)) { |
2645 | ret = -PTR_ERR(page); | 2713 | ret = PTR_ERR(page); |
2714 | if (ret == -ENOMEM) | ||
2715 | ret = VM_FAULT_OOM; | ||
2716 | else | ||
2717 | ret = VM_FAULT_SIGBUS; | ||
2646 | goto out; | 2718 | goto out; |
2647 | } | 2719 | } |
2648 | clear_huge_page(page, address, pages_per_huge_page(h)); | 2720 | clear_huge_page(page, address, pages_per_huge_page(h)); |
@@ -2679,7 +2751,7 @@ retry: | |||
2679 | */ | 2751 | */ |
2680 | if (unlikely(PageHWPoison(page))) { | 2752 | if (unlikely(PageHWPoison(page))) { |
2681 | ret = VM_FAULT_HWPOISON | | 2753 | ret = VM_FAULT_HWPOISON | |
2682 | VM_FAULT_SET_HINDEX(h - hstates); | 2754 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
2683 | goto backout_unlocked; | 2755 | goto backout_unlocked; |
2684 | } | 2756 | } |
2685 | } | 2757 | } |
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2752 | return 0; | 2824 | return 0; |
2753 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 2825 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) |
2754 | return VM_FAULT_HWPOISON_LARGE | | 2826 | return VM_FAULT_HWPOISON_LARGE | |
2755 | VM_FAULT_SET_HINDEX(h - hstates); | 2827 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
2756 | } | 2828 | } |
2757 | 2829 | ||
2758 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); | 2830 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
2959 | } | 3031 | } |
2960 | } | 3032 | } |
2961 | spin_unlock(&mm->page_table_lock); | 3033 | spin_unlock(&mm->page_table_lock); |
2962 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3034 | /* |
2963 | 3035 | * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare | |
3036 | * may have cleared our pud entry and done put_page on the page table: | ||
3037 | * once we release i_mmap_mutex, another task can do the final put_page | ||
3038 | * and that page table be reused and filled with junk. | ||
3039 | */ | ||
2964 | flush_tlb_range(vma, start, end); | 3040 | flush_tlb_range(vma, start, end); |
3041 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | ||
2965 | } | 3042 | } |
2966 | 3043 | ||
2967 | int hugetlb_reserve_pages(struct inode *inode, | 3044 | int hugetlb_reserve_pages(struct inode *inode, |
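
alloc_huge_page() now returns plain -ENOMEM/-ENOSPC instead of negated VM_FAULT codes, so the two fault-path call sites patched above translate the error themselves. The repeated pattern amounts to a helper along these lines (hypothetical, shown only to summarize the mapping; it is not part of the patch):

	/* Hypothetical kernel-context helper: the error-to-fault mapping
	 * now open-coded at both alloc_huge_page() call sites. */
	static inline int hugetlb_alloc_err_to_fault(long err)
	{
		return (err == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
	}
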
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c new file mode 100644 index 000000000000..a3f358fb8a0c --- /dev/null +++ b/mm/hugetlb_cgroup.c | |||
@@ -0,0 +1,418 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Copyright IBM Corporation, 2012 | ||
4 | * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms of version 2.1 of the GNU Lesser General Public License | ||
8 | * as published by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope that it would be useful, but | ||
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/cgroup.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <linux/hugetlb.h> | ||
19 | #include <linux/hugetlb_cgroup.h> | ||
20 | |||
21 | struct hugetlb_cgroup { | ||
22 | struct cgroup_subsys_state css; | ||
23 | /* | ||
24 | * the counter to account for hugepages from hugetlb. | ||
25 | */ | ||
26 | struct res_counter hugepage[HUGE_MAX_HSTATE]; | ||
27 | }; | ||
28 | |||
29 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | ||
30 | #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) | ||
31 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | ||
32 | |||
33 | struct cgroup_subsys hugetlb_subsys __read_mostly; | ||
34 | static struct hugetlb_cgroup *root_h_cgroup __read_mostly; | ||
35 | |||
36 | static inline | ||
37 | struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) | ||
38 | { | ||
39 | return container_of(s, struct hugetlb_cgroup, css); | ||
40 | } | ||
41 | |||
42 | static inline | ||
43 | struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup) | ||
44 | { | ||
45 | return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup, | ||
46 | hugetlb_subsys_id)); | ||
47 | } | ||
48 | |||
49 | static inline | ||
50 | struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) | ||
51 | { | ||
52 | return hugetlb_cgroup_from_css(task_subsys_state(task, | ||
53 | hugetlb_subsys_id)); | ||
54 | } | ||
55 | |||
56 | static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) | ||
57 | { | ||
58 | return (h_cg == root_h_cgroup); | ||
59 | } | ||
60 | |||
61 | static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg) | ||
62 | { | ||
63 | if (!cg->parent) | ||
64 | return NULL; | ||
65 | return hugetlb_cgroup_from_cgroup(cg->parent); | ||
66 | } | ||
67 | |||
68 | static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) | ||
69 | { | ||
70 | int idx; | ||
71 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg); | ||
72 | |||
73 | for (idx = 0; idx < hugetlb_max_hstate; idx++) { | ||
74 | if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) | ||
75 | return true; | ||
76 | } | ||
77 | return false; | ||
78 | } | ||
79 | |||
80 | static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) | ||
81 | { | ||
82 | int idx; | ||
83 | struct cgroup *parent_cgroup; | ||
84 | struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup; | ||
85 | |||
86 | h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); | ||
87 | if (!h_cgroup) | ||
88 | return ERR_PTR(-ENOMEM); | ||
89 | |||
90 | parent_cgroup = cgroup->parent; | ||
91 | if (parent_cgroup) { | ||
92 | parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup); | ||
93 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | ||
94 | res_counter_init(&h_cgroup->hugepage[idx], | ||
95 | &parent_h_cgroup->hugepage[idx]); | ||
96 | } else { | ||
97 | root_h_cgroup = h_cgroup; | ||
98 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | ||
99 | res_counter_init(&h_cgroup->hugepage[idx], NULL); | ||
100 | } | ||
101 | return &h_cgroup->css; | ||
102 | } | ||
103 | |||
104 | static void hugetlb_cgroup_destroy(struct cgroup *cgroup) | ||
105 | { | ||
106 | struct hugetlb_cgroup *h_cgroup; | ||
107 | |||
108 | h_cgroup = hugetlb_cgroup_from_cgroup(cgroup); | ||
109 | kfree(h_cgroup); | ||
110 | } | ||
111 | |||
112 | |||
113 | /* | ||
114 | * Should be called with hugetlb_lock held. | ||
115 | * Since we are holding hugetlb_lock, pages cannot get moved from | ||
116 | * the active list or uncharged from the cgroup, so no need to get | ||
117 | * page reference and test for page active here. This function | ||
118 | * cannot fail. | ||
119 | */ | ||
120 | static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup, | ||
121 | struct page *page) | ||
122 | { | ||
123 | int csize; | ||
124 | struct res_counter *counter; | ||
125 | struct res_counter *fail_res; | ||
126 | struct hugetlb_cgroup *page_hcg; | ||
127 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
128 | struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup); | ||
129 | |||
130 | page_hcg = hugetlb_cgroup_from_page(page); | ||
131 | /* | ||
132 | * We can have pages in active list without any cgroup | ||
133 | * i.e., a hugepage with fewer than 3 pages. We can safely | ||
134 | * ignore those pages. | ||
135 | */ | ||
136 | if (!page_hcg || page_hcg != h_cg) | ||
137 | goto out; | ||
138 | |||
139 | csize = PAGE_SIZE << compound_order(page); | ||
140 | if (!parent) { | ||
141 | parent = root_h_cgroup; | ||
142 | /* root has no limit */ | ||
143 | res_counter_charge_nofail(&parent->hugepage[idx], | ||
144 | csize, &fail_res); | ||
145 | } | ||
146 | counter = &h_cg->hugepage[idx]; | ||
147 | res_counter_uncharge_until(counter, counter->parent, csize); | ||
148 | |||
149 | set_hugetlb_cgroup(page, parent); | ||
150 | out: | ||
151 | return; | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * Force the hugetlb cgroup to empty the hugetlb resources by moving them to | ||
156 | * the parent cgroup. | ||
157 | */ | ||
158 | static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) | ||
159 | { | ||
160 | struct hstate *h; | ||
161 | struct page *page; | ||
162 | int ret = 0, idx = 0; | ||
163 | |||
164 | do { | ||
165 | if (cgroup_task_count(cgroup) || | ||
166 | !list_empty(&cgroup->children)) { | ||
167 | ret = -EBUSY; | ||
168 | goto out; | ||
169 | } | ||
170 | for_each_hstate(h) { | ||
171 | spin_lock(&hugetlb_lock); | ||
172 | list_for_each_entry(page, &h->hugepage_activelist, lru) | ||
173 | hugetlb_cgroup_move_parent(idx, cgroup, page); | ||
174 | |||
175 | spin_unlock(&hugetlb_lock); | ||
176 | idx++; | ||
177 | } | ||
178 | cond_resched(); | ||
179 | } while (hugetlb_cgroup_have_usage(cgroup)); | ||
180 | out: | ||
181 | return ret; | ||
182 | } | ||
183 | |||
184 | int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, | ||
185 | struct hugetlb_cgroup **ptr) | ||
186 | { | ||
187 | int ret = 0; | ||
188 | struct res_counter *fail_res; | ||
189 | struct hugetlb_cgroup *h_cg = NULL; | ||
190 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
191 | |||
192 | if (hugetlb_cgroup_disabled()) | ||
193 | goto done; | ||
194 | /* | ||
194 | * We don't charge any cgroup if the compound page has fewer | ||
196 | * than 3 pages. | ||
197 | */ | ||
198 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) | ||
199 | goto done; | ||
200 | again: | ||
201 | rcu_read_lock(); | ||
202 | h_cg = hugetlb_cgroup_from_task(current); | ||
203 | if (!css_tryget(&h_cg->css)) { | ||
204 | rcu_read_unlock(); | ||
205 | goto again; | ||
206 | } | ||
207 | rcu_read_unlock(); | ||
208 | |||
209 | ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); | ||
210 | css_put(&h_cg->css); | ||
211 | done: | ||
212 | *ptr = h_cg; | ||
213 | return ret; | ||
214 | } | ||
215 | |||
216 | /* Should be called with hugetlb_lock held */ | ||
217 | void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, | ||
218 | struct hugetlb_cgroup *h_cg, | ||
219 | struct page *page) | ||
220 | { | ||
221 | if (hugetlb_cgroup_disabled() || !h_cg) | ||
222 | return; | ||
223 | |||
224 | set_hugetlb_cgroup(page, h_cg); | ||
225 | return; | ||
226 | } | ||
227 | |||
228 | /* | ||
229 | * Should be called with hugetlb_lock held | ||
230 | */ | ||
231 | void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, | ||
232 | struct page *page) | ||
233 | { | ||
234 | struct hugetlb_cgroup *h_cg; | ||
235 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
236 | |||
237 | if (hugetlb_cgroup_disabled()) | ||
238 | return; | ||
239 | VM_BUG_ON(!spin_is_locked(&hugetlb_lock)); | ||
240 | h_cg = hugetlb_cgroup_from_page(page); | ||
241 | if (unlikely(!h_cg)) | ||
242 | return; | ||
243 | set_hugetlb_cgroup(page, NULL); | ||
244 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | ||
245 | return; | ||
246 | } | ||
247 | |||
248 | void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, | ||
249 | struct hugetlb_cgroup *h_cg) | ||
250 | { | ||
251 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
252 | |||
253 | if (hugetlb_cgroup_disabled() || !h_cg) | ||
254 | return; | ||
255 | |||
256 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) | ||
257 | return; | ||
258 | |||
259 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | ||
260 | return; | ||
261 | } | ||
262 | |||
263 | static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, | ||
264 | struct file *file, char __user *buf, | ||
265 | size_t nbytes, loff_t *ppos) | ||
266 | { | ||
267 | u64 val; | ||
268 | char str[64]; | ||
269 | int idx, name, len; | ||
270 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
271 | |||
272 | idx = MEMFILE_IDX(cft->private); | ||
273 | name = MEMFILE_ATTR(cft->private); | ||
274 | |||
275 | val = res_counter_read_u64(&h_cg->hugepage[idx], name); | ||
276 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); | ||
277 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); | ||
278 | } | ||
279 | |||
280 | static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, | ||
281 | const char *buffer) | ||
282 | { | ||
283 | int idx, name, ret; | ||
284 | unsigned long long val; | ||
285 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
286 | |||
287 | idx = MEMFILE_IDX(cft->private); | ||
288 | name = MEMFILE_ATTR(cft->private); | ||
289 | |||
290 | switch (name) { | ||
291 | case RES_LIMIT: | ||
292 | if (hugetlb_cgroup_is_root(h_cg)) { | ||
293 | /* Can't set limit on root */ | ||
294 | ret = -EINVAL; | ||
295 | break; | ||
296 | } | ||
297 | /* This function does all the necessary parsing; reuse it */ | ||
298 | ret = res_counter_memparse_write_strategy(buffer, &val); | ||
299 | if (ret) | ||
300 | break; | ||
301 | ret = res_counter_set_limit(&h_cg->hugepage[idx], val); | ||
302 | break; | ||
303 | default: | ||
304 | ret = -EINVAL; | ||
305 | break; | ||
306 | } | ||
307 | return ret; | ||
308 | } | ||
309 | |||
310 | static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event) | ||
311 | { | ||
312 | int idx, name, ret = 0; | ||
313 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
314 | |||
315 | idx = MEMFILE_IDX(event); | ||
316 | name = MEMFILE_ATTR(event); | ||
317 | |||
318 | switch (name) { | ||
319 | case RES_MAX_USAGE: | ||
320 | res_counter_reset_max(&h_cg->hugepage[idx]); | ||
321 | break; | ||
322 | case RES_FAILCNT: | ||
323 | res_counter_reset_failcnt(&h_cg->hugepage[idx]); | ||
324 | break; | ||
325 | default: | ||
326 | ret = -EINVAL; | ||
327 | break; | ||
328 | } | ||
329 | return ret; | ||
330 | } | ||
331 | |||
332 | static char *mem_fmt(char *buf, int size, unsigned long hsize) | ||
333 | { | ||
334 | if (hsize >= (1UL << 30)) | ||
335 | snprintf(buf, size, "%luGB", hsize >> 30); | ||
336 | else if (hsize >= (1UL << 20)) | ||
337 | snprintf(buf, size, "%luMB", hsize >> 20); | ||
338 | else | ||
339 | snprintf(buf, size, "%luKB", hsize >> 10); | ||
340 | return buf; | ||
341 | } | ||
342 | |||
343 | int __init hugetlb_cgroup_file_init(int idx) | ||
344 | { | ||
345 | char buf[32]; | ||
346 | struct cftype *cft; | ||
347 | struct hstate *h = &hstates[idx]; | ||
348 | |||
349 | /* format the size */ | ||
350 | mem_fmt(buf, 32, huge_page_size(h)); | ||
351 | |||
352 | /* Add the limit file */ | ||
353 | cft = &h->cgroup_files[0]; | ||
354 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); | ||
355 | cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); | ||
356 | cft->read = hugetlb_cgroup_read; | ||
357 | cft->write_string = hugetlb_cgroup_write; | ||
358 | |||
359 | /* Add the usage file */ | ||
360 | cft = &h->cgroup_files[1]; | ||
361 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); | ||
362 | cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); | ||
363 | cft->read = hugetlb_cgroup_read; | ||
364 | |||
365 | /* Add the MAX usage file */ | ||
366 | cft = &h->cgroup_files[2]; | ||
367 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); | ||
368 | cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); | ||
369 | cft->trigger = hugetlb_cgroup_reset; | ||
370 | cft->read = hugetlb_cgroup_read; | ||
371 | |||
372 | /* Add the failcnt file */ | ||
373 | cft = &h->cgroup_files[3]; | ||
374 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); | ||
375 | cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); | ||
376 | cft->trigger = hugetlb_cgroup_reset; | ||
377 | cft->read = hugetlb_cgroup_read; | ||
378 | |||
379 | /* NULL terminate the last cft */ | ||
380 | cft = &h->cgroup_files[4]; | ||
381 | memset(cft, 0, sizeof(*cft)); | ||
382 | |||
383 | WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); | ||
384 | |||
385 | return 0; | ||
386 | } | ||
387 | |||
388 | /* | ||
389 | * hugetlb_lock will make sure a parallel cgroup rmdir won't happen | ||
390 | * when we migrate hugepages | ||
391 | */ | ||
392 | void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) | ||
393 | { | ||
394 | struct hugetlb_cgroup *h_cg; | ||
395 | struct hstate *h = page_hstate(oldhpage); | ||
396 | |||
397 | if (hugetlb_cgroup_disabled()) | ||
398 | return; | ||
399 | |||
400 | VM_BUG_ON(!PageHuge(oldhpage)); | ||
401 | spin_lock(&hugetlb_lock); | ||
402 | h_cg = hugetlb_cgroup_from_page(oldhpage); | ||
403 | set_hugetlb_cgroup(oldhpage, NULL); | ||
404 | |||
405 | /* move the h_cg details to new cgroup */ | ||
406 | set_hugetlb_cgroup(newhpage, h_cg); | ||
407 | list_move(&newhpage->lru, &h->hugepage_activelist); | ||
408 | spin_unlock(&hugetlb_lock); | ||
409 | return; | ||
410 | } | ||
411 | |||
412 | struct cgroup_subsys hugetlb_subsys = { | ||
413 | .name = "hugetlb", | ||
414 | .create = hugetlb_cgroup_create, | ||
415 | .pre_destroy = hugetlb_cgroup_pre_destroy, | ||
416 | .destroy = hugetlb_cgroup_destroy, | ||
417 | .subsys_id = hugetlb_subsys_id, | ||
418 | }; | ||
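
Each hstate's cftype entries pack the hstate index and the resource attribute into the single private field via the MEMFILE_* macros, mirroring memcg. A small stand-alone illustration of the packing (the numeric RES_* value below is a placeholder, not the kernel's enum value):

	#include <assert.h>
	#include <stdio.h>

	#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
	#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
	#define MEMFILE_ATTR(val)	((val) & 0xffff)

	int main(void)
	{
		int res_max_usage = 2;	/* placeholder for RES_MAX_USAGE */
		int priv = MEMFILE_PRIVATE(1, res_max_usage);	/* hstate index 1 */

		assert(MEMFILE_IDX(priv) == 1);
		assert(MEMFILE_ATTR(priv) == res_max_usage);
		printf("idx=%d attr=%d\n", MEMFILE_IDX(priv), MEMFILE_ATTR(priv));
		return 0;
	}
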
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index cc448bb983ba..3a61efc518d5 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
@@ -123,7 +123,7 @@ static int pfn_inject_init(void) | |||
123 | if (!dentry) | 123 | if (!dentry) |
124 | goto fail; | 124 | goto fail; |
125 | 125 | ||
126 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 126 | #ifdef CONFIG_MEMCG_SWAP |
127 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, | 127 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, |
128 | hwpoison_dir, &hwpoison_filter_memcg); | 128 | hwpoison_dir, &hwpoison_filter_memcg); |
129 | if (!dentry) | 129 | if (!dentry) |
diff --git a/mm/internal.h b/mm/internal.h index 2ba87fbfb75b..3314f79d775a 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -118,8 +118,14 @@ struct compact_control { | |||
118 | unsigned long nr_freepages; /* Number of isolated free pages */ | 118 | unsigned long nr_freepages; /* Number of isolated free pages */ |
119 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 119 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
120 | unsigned long free_pfn; /* isolate_freepages search base */ | 120 | unsigned long free_pfn; /* isolate_freepages search base */ |
121 | unsigned long start_free_pfn; /* where we started the search */ | ||
121 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 122 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
122 | bool sync; /* Synchronous migration */ | 123 | bool sync; /* Synchronous migration */ |
124 | bool wrapped; /* Order > 0 compactions are | ||
125 | incremental; once free_pfn | ||
126 | and migrate_pfn meet, we restart | ||
127 | from the top of the zone; | ||
128 | remember we wrapped around. */ | ||
123 | 129 | ||
124 | int order; /* order a direct compactor needs */ | 130 | int order; /* order a direct compactor needs */ |
125 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 131 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
@@ -347,3 +353,5 @@ extern u32 hwpoison_filter_enable; | |||
347 | extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, | 353 | extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, |
348 | unsigned long, unsigned long, | 354 | unsigned long, unsigned long, |
349 | unsigned long, unsigned long); | 355 | unsigned long, unsigned long); |
356 | |||
357 | extern void set_pageblock_order(void); | ||
diff --git a/mm/memblock.c b/mm/memblock.c index 5cc6731b00cc..4d9393c7edc9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -222,13 +222,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, | |||
222 | /* Try to find some space for it. | 222 | /* Try to find some space for it. |
223 | * | 223 | * |
224 | * WARNING: We assume that either slab_is_available() and we use it or | 224 | * WARNING: We assume that either slab_is_available() and we use it or |
225 | * we use MEMBLOCK for allocations. That means that this is unsafe to use | 225 | * we use MEMBLOCK for allocations. That means that this is unsafe to |
226 | * when bootmem is currently active (unless bootmem itself is implemented | 226 | * use when bootmem is currently active (unless bootmem itself is |
227 | * on top of MEMBLOCK which isn't the case yet) | 227 | * implemented on top of MEMBLOCK which isn't the case yet) |
228 | * | 228 | * |
229 | * This should however not be an issue for now, as we currently only | 229 | * This should however not be an issue for now, as we currently only |
230 | * call into MEMBLOCK while it's still active, or much later when slab is | 230 | * call into MEMBLOCK while it's still active, or much later when slab |
231 | * active for memory hotplug operations | 231 | * is active for memory hotplug operations |
232 | */ | 232 | */ |
233 | if (use_slab) { | 233 | if (use_slab) { |
234 | new_array = kmalloc(new_size, GFP_KERNEL); | 234 | new_array = kmalloc(new_size, GFP_KERNEL); |
@@ -243,8 +243,8 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, | |||
243 | new_alloc_size, PAGE_SIZE); | 243 | new_alloc_size, PAGE_SIZE); |
244 | if (!addr && new_area_size) | 244 | if (!addr && new_area_size) |
245 | addr = memblock_find_in_range(0, | 245 | addr = memblock_find_in_range(0, |
246 | min(new_area_start, memblock.current_limit), | 246 | min(new_area_start, memblock.current_limit), |
247 | new_alloc_size, PAGE_SIZE); | 247 | new_alloc_size, PAGE_SIZE); |
248 | 248 | ||
249 | new_array = addr ? __va(addr) : 0; | 249 | new_array = addr ? __va(addr) : 0; |
250 | } | 250 | } |
@@ -254,12 +254,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, | |||
254 | return -1; | 254 | return -1; |
255 | } | 255 | } |
256 | 256 | ||
257 | memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", | 257 | memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", |
258 | memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); | 258 | memblock_type_name(type), type->max * 2, (u64)addr, |
259 | (u64)addr + new_size - 1); | ||
259 | 260 | ||
260 | /* Found space, we now need to move the array over before | 261 | /* |
261 | * we add the reserved region since it may be our reserved | 262 | * Found space, we now need to move the array over before we add the |
262 | * array itself that is full. | 263 | * reserved region since it may be our reserved array itself that is |
264 | * full. | ||
263 | */ | 265 | */ |
264 | memcpy(new_array, type->regions, old_size); | 266 | memcpy(new_array, type->regions, old_size); |
265 | memset(new_array + type->max, 0, old_size); | 267 | memset(new_array + type->max, 0, old_size); |
@@ -267,17 +269,16 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, | |||
267 | type->regions = new_array; | 269 | type->regions = new_array; |
268 | type->max <<= 1; | 270 | type->max <<= 1; |
269 | 271 | ||
270 | /* Free old array. We needn't free it if the array is the | 272 | /* Free old array. We needn't free it if the array is the static one */ |
271 | * static one | ||
272 | */ | ||
273 | if (*in_slab) | 273 | if (*in_slab) |
274 | kfree(old_array); | 274 | kfree(old_array); |
275 | else if (old_array != memblock_memory_init_regions && | 275 | else if (old_array != memblock_memory_init_regions && |
276 | old_array != memblock_reserved_init_regions) | 276 | old_array != memblock_reserved_init_regions) |
277 | memblock_free(__pa(old_array), old_alloc_size); | 277 | memblock_free(__pa(old_array), old_alloc_size); |
278 | 278 | ||
279 | /* Reserve the new array if that comes from the memblock. | 279 | /* |
280 | * Otherwise, we needn't do it | 280 | * Reserve the new array if that comes from the memblock. Otherwise, we |
281 | * needn't do it | ||
281 | */ | 282 | */ |
282 | if (!use_slab) | 283 | if (!use_slab) |
283 | BUG_ON(memblock_reserve(addr, new_alloc_size)); | 284 | BUG_ON(memblock_reserve(addr, new_alloc_size)); |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f72b5e52451a..795e525afaba 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -61,12 +61,12 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; | |||
61 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 61 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
62 | static struct mem_cgroup *root_mem_cgroup __read_mostly; | 62 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
63 | 63 | ||
64 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 64 | #ifdef CONFIG_MEMCG_SWAP |
65 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 65 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
66 | int do_swap_account __read_mostly; | 66 | int do_swap_account __read_mostly; |
67 | 67 | ||
68 | /* for remember boot option*/ | 68 | /* for remember boot option*/ |
69 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED | 69 | #ifdef CONFIG_MEMCG_SWAP_ENABLED |
70 | static int really_do_swap_account __initdata = 1; | 70 | static int really_do_swap_account __initdata = 1; |
71 | #else | 71 | #else |
72 | static int really_do_swap_account __initdata = 0; | 72 | static int really_do_swap_account __initdata = 0; |
@@ -87,7 +87,7 @@ enum mem_cgroup_stat_index { | |||
87 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 87 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
88 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 88 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
90 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 90 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ |
91 | MEM_CGROUP_STAT_NSTATS, | 91 | MEM_CGROUP_STAT_NSTATS, |
92 | }; | 92 | }; |
93 | 93 | ||
@@ -378,9 +378,7 @@ static bool move_file(void) | |||
378 | 378 | ||
379 | enum charge_type { | 379 | enum charge_type { |
380 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 380 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
381 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 381 | MEM_CGROUP_CHARGE_TYPE_ANON, |
382 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | ||
383 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | ||
384 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ | 382 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ |
385 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ | 383 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ |
386 | NR_CHARGE_TYPE, | 384 | NR_CHARGE_TYPE, |
@@ -407,8 +405,14 @@ enum charge_type { | |||
407 | static void mem_cgroup_get(struct mem_cgroup *memcg); | 405 | static void mem_cgroup_get(struct mem_cgroup *memcg); |
408 | static void mem_cgroup_put(struct mem_cgroup *memcg); | 406 | static void mem_cgroup_put(struct mem_cgroup *memcg); |
409 | 407 | ||
408 | static inline | ||
409 | struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) | ||
410 | { | ||
411 | return container_of(s, struct mem_cgroup, css); | ||
412 | } | ||
413 | |||
410 | /* Writing them here to avoid exposing memcg's inner layout */ | 414 | /* Writing them here to avoid exposing memcg's inner layout */ |
411 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 415 | #ifdef CONFIG_MEMCG_KMEM |
412 | #include <net/sock.h> | 416 | #include <net/sock.h> |
413 | #include <net/ip.h> | 417 | #include <net/ip.h> |
414 | 418 | ||
@@ -467,9 +471,9 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | |||
467 | } | 471 | } |
468 | EXPORT_SYMBOL(tcp_proto_cgroup); | 472 | EXPORT_SYMBOL(tcp_proto_cgroup); |
469 | #endif /* CONFIG_INET */ | 473 | #endif /* CONFIG_INET */ |
470 | #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ | 474 | #endif /* CONFIG_MEMCG_KMEM */ |
471 | 475 | ||
472 | #if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) | 476 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) |
473 | static void disarm_sock_keys(struct mem_cgroup *memcg) | 477 | static void disarm_sock_keys(struct mem_cgroup *memcg) |
474 | { | 478 | { |
475 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) | 479 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) |
@@ -703,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | |||
703 | bool charge) | 707 | bool charge) |
704 | { | 708 | { |
705 | int val = (charge) ? 1 : -1; | 709 | int val = (charge) ? 1 : -1; |
706 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); | 710 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); |
707 | } | 711 | } |
708 | 712 | ||
709 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | 713 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, |
@@ -864,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
864 | 868 | ||
865 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 869 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
866 | { | 870 | { |
867 | return container_of(cgroup_subsys_state(cont, | 871 | return mem_cgroup_from_css( |
868 | mem_cgroup_subsys_id), struct mem_cgroup, | 872 | cgroup_subsys_state(cont, mem_cgroup_subsys_id)); |
869 | css); | ||
870 | } | 873 | } |
871 | 874 | ||
872 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | 875 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
@@ -879,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
879 | if (unlikely(!p)) | 882 | if (unlikely(!p)) |
880 | return NULL; | 883 | return NULL; |
881 | 884 | ||
882 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | 885 | return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); |
883 | struct mem_cgroup, css); | ||
884 | } | 886 | } |
885 | 887 | ||
886 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 888 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
@@ -966,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
966 | css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); | 968 | css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); |
967 | if (css) { | 969 | if (css) { |
968 | if (css == &root->css || css_tryget(css)) | 970 | if (css == &root->css || css_tryget(css)) |
969 | memcg = container_of(css, | 971 | memcg = mem_cgroup_from_css(css); |
970 | struct mem_cgroup, css); | ||
971 | } else | 972 | } else |
972 | id = 0; | 973 | id = 0; |
973 | rcu_read_unlock(); | 974 | rcu_read_unlock(); |
@@ -1454,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) | |||
1454 | /* | 1455 | /* |
1455 | * Return the memory (and swap, if configured) limit for a memcg. | 1456 | * Return the memory (and swap, if configured) limit for a memcg. |
1456 | */ | 1457 | */ |
1457 | u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | 1458 | static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) |
1458 | { | 1459 | { |
1459 | u64 limit; | 1460 | u64 limit; |
1460 | u64 memsw; | 1461 | u64 memsw; |
@@ -1470,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1470 | return min(limit, memsw); | 1471 | return min(limit, memsw); |
1471 | } | 1472 | } |
1472 | 1473 | ||
1474 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | ||
1475 | int order) | ||
1476 | { | ||
1477 | struct mem_cgroup *iter; | ||
1478 | unsigned long chosen_points = 0; | ||
1479 | unsigned long totalpages; | ||
1480 | unsigned int points = 0; | ||
1481 | struct task_struct *chosen = NULL; | ||
1482 | |||
1483 | /* | ||
1484 | * If current has a pending SIGKILL, then automatically select it. The | ||
1485 | * goal is to allow it to allocate so that it may quickly exit and free | ||
1486 | * its memory. | ||
1487 | */ | ||
1488 | if (fatal_signal_pending(current)) { | ||
1489 | set_thread_flag(TIF_MEMDIE); | ||
1490 | return; | ||
1491 | } | ||
1492 | |||
1493 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | ||
1494 | totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; | ||
1495 | for_each_mem_cgroup_tree(iter, memcg) { | ||
1496 | struct cgroup *cgroup = iter->css.cgroup; | ||
1497 | struct cgroup_iter it; | ||
1498 | struct task_struct *task; | ||
1499 | |||
1500 | cgroup_iter_start(cgroup, &it); | ||
1501 | while ((task = cgroup_iter_next(cgroup, &it))) { | ||
1502 | switch (oom_scan_process_thread(task, totalpages, NULL, | ||
1503 | false)) { | ||
1504 | case OOM_SCAN_SELECT: | ||
1505 | if (chosen) | ||
1506 | put_task_struct(chosen); | ||
1507 | chosen = task; | ||
1508 | chosen_points = ULONG_MAX; | ||
1509 | get_task_struct(chosen); | ||
1510 | /* fall through */ | ||
1511 | case OOM_SCAN_CONTINUE: | ||
1512 | continue; | ||
1513 | case OOM_SCAN_ABORT: | ||
1514 | cgroup_iter_end(cgroup, &it); | ||
1515 | mem_cgroup_iter_break(memcg, iter); | ||
1516 | if (chosen) | ||
1517 | put_task_struct(chosen); | ||
1518 | return; | ||
1519 | case OOM_SCAN_OK: | ||
1520 | break; | ||
1521 | }; | ||
1522 | points = oom_badness(task, memcg, NULL, totalpages); | ||
1523 | if (points > chosen_points) { | ||
1524 | if (chosen) | ||
1525 | put_task_struct(chosen); | ||
1526 | chosen = task; | ||
1527 | chosen_points = points; | ||
1528 | get_task_struct(chosen); | ||
1529 | } | ||
1530 | } | ||
1531 | cgroup_iter_end(cgroup, &it); | ||
1532 | } | ||
1533 | |||
1534 | if (!chosen) | ||
1535 | return; | ||
1536 | points = chosen_points * 1000 / totalpages; | ||
1537 | oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, | ||
1538 | NULL, "Memory cgroup out of memory"); | ||
1539 | } | ||
1540 | |||
1473 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | 1541 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, |
1474 | gfp_t gfp_mask, | 1542 | gfp_t gfp_mask, |
1475 | unsigned long flags) | 1543 | unsigned long flags) |
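A quick worked example of how the new mem_cgroup_out_of_memory() above scales its verdict (numbers invented purely for illustration): totalpages is the memcg limit converted to pages, and the chosen task's badness is normalised to the usual 0..1000 range before being handed to oom_kill_process().

	totalpages    = (1 GiB limit) >> PAGE_SHIFT       = 262144 pages
	chosen_points = 131072   /* returned by oom_badness() for the chosen task */
	points        = chosen_points * 1000 / totalpages = 500   /* of a possible 1000 */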
@@ -1899,7 +1967,7 @@ again: | |||
1899 | return; | 1967 | return; |
1900 | /* | 1968 | /* |
1901 | * If this memory cgroup is not under account moving, we don't | 1969 | * If this memory cgroup is not under account moving, we don't |
1902 | * need to take move_lock_page_cgroup(). Because we already hold | 1970 | * need to take move_lock_mem_cgroup(). Because we already hold |
1903 | * rcu_read_lock(), any calls to move_account will be delayed until | 1971 | * rcu_read_lock(), any calls to move_account will be delayed until |
1904 | * rcu_read_unlock() if mem_cgroup_stolen() == true. | 1972 | * rcu_read_unlock() if mem_cgroup_stolen() == true. |
1905 | */ | 1973 | */ |
@@ -1921,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) | |||
1921 | /* | 1989 | /* |
1922 | * It's guaranteed that pc->mem_cgroup never changes while | 1990 | * It's guaranteed that pc->mem_cgroup never changes while |
1923 | * lock is held because a routine that modifies pc->mem_cgroup | 1991 |
1924 | * should take move_lock_page_cgroup(). | 1992 | * should take move_lock_mem_cgroup(). |
1925 | */ | 1993 | */ |
1926 | move_unlock_mem_cgroup(pc->mem_cgroup, flags); | 1994 | move_unlock_mem_cgroup(pc->mem_cgroup, flags); |
1927 | } | 1995 | } |
@@ -2268,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2268 | * We always charge the cgroup the mm_struct belongs to. | 2336 | * We always charge the cgroup the mm_struct belongs to. |
2269 | * The mm_struct's mem_cgroup changes on task migration if the | 2337 | * The mm_struct's mem_cgroup changes on task migration if the |
2270 | * thread group leader migrates. It's possible that mm is not | 2338 | * thread group leader migrates. It's possible that mm is not |
2271 | * set, if so charge the init_mm (happens for pagecache usage). | 2339 | * set, if so charge the root memcg (happens for pagecache usage). |
2272 | */ | 2340 | */ |
2273 | if (!*ptr && !mm) | 2341 | if (!*ptr && !mm) |
2274 | *ptr = root_mem_cgroup; | 2342 | *ptr = root_mem_cgroup; |
@@ -2429,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
2429 | css = css_lookup(&mem_cgroup_subsys, id); | 2497 | css = css_lookup(&mem_cgroup_subsys, id); |
2430 | if (!css) | 2498 | if (!css) |
2431 | return NULL; | 2499 | return NULL; |
2432 | return container_of(css, struct mem_cgroup, css); | 2500 | return mem_cgroup_from_css(css); |
2433 | } | 2501 | } |
2434 | 2502 | ||
2435 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | 2503 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
@@ -2473,11 +2541,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2473 | bool anon; | 2541 | bool anon; |
2474 | 2542 | ||
2475 | lock_page_cgroup(pc); | 2543 | lock_page_cgroup(pc); |
2476 | if (unlikely(PageCgroupUsed(pc))) { | 2544 | VM_BUG_ON(PageCgroupUsed(pc)); |
2477 | unlock_page_cgroup(pc); | ||
2478 | __mem_cgroup_cancel_charge(memcg, nr_pages); | ||
2479 | return; | ||
2480 | } | ||
2481 | /* | 2545 | /* |
2482 | * we don't need page_cgroup_lock for tail pages, because they are not | 2546 |
2483 | * accessed by any other context at this point. | 2547 | * accessed by any other context at this point. |
@@ -2519,7 +2583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2519 | spin_unlock_irq(&zone->lru_lock); | 2583 | spin_unlock_irq(&zone->lru_lock); |
2520 | } | 2584 | } |
2521 | 2585 | ||
2522 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 2586 | if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) |
2523 | anon = true; | 2587 | anon = true; |
2524 | else | 2588 | else |
2525 | anon = false; | 2589 | anon = false; |
@@ -2644,8 +2708,7 @@ out: | |||
2644 | 2708 | ||
2645 | static int mem_cgroup_move_parent(struct page *page, | 2709 | static int mem_cgroup_move_parent(struct page *page, |
2646 | struct page_cgroup *pc, | 2710 | struct page_cgroup *pc, |
2647 | struct mem_cgroup *child, | 2711 | struct mem_cgroup *child) |
2648 | gfp_t gfp_mask) | ||
2649 | { | 2712 | { |
2650 | struct mem_cgroup *parent; | 2713 | struct mem_cgroup *parent; |
2651 | unsigned int nr_pages; | 2714 | unsigned int nr_pages; |
@@ -2728,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
2728 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 2791 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
2729 | VM_BUG_ON(!mm); | 2792 | VM_BUG_ON(!mm); |
2730 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 2793 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
2731 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2794 | MEM_CGROUP_CHARGE_TYPE_ANON); |
2732 | } | ||
2733 | |||
2734 | static void | ||
2735 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | ||
2736 | enum charge_type ctype); | ||
2737 | |||
2738 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | ||
2739 | gfp_t gfp_mask) | ||
2740 | { | ||
2741 | struct mem_cgroup *memcg = NULL; | ||
2742 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2743 | int ret; | ||
2744 | |||
2745 | if (mem_cgroup_disabled()) | ||
2746 | return 0; | ||
2747 | if (PageCompound(page)) | ||
2748 | return 0; | ||
2749 | |||
2750 | if (unlikely(!mm)) | ||
2751 | mm = &init_mm; | ||
2752 | if (!page_is_file_cache(page)) | ||
2753 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
2754 | |||
2755 | if (!PageSwapCache(page)) | ||
2756 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); | ||
2757 | else { /* page is swapcache/shmem */ | ||
2758 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); | ||
2759 | if (!ret) | ||
2760 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | ||
2761 | } | ||
2762 | return ret; | ||
2763 | } | 2795 | } |
2764 | 2796 | ||
2765 | /* | 2797 | /* |
@@ -2768,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2768 | * struct page_cgroup is acquired. This refcnt will be consumed by | 2800 | * struct page_cgroup is acquired. This refcnt will be consumed by |
2769 | * "commit()" or removed by "cancel()" | 2801 | * "commit()" or removed by "cancel()" |
2770 | */ | 2802 | */ |
2771 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 2803 | static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
2772 | struct page *page, | 2804 | struct page *page, |
2773 | gfp_t mask, struct mem_cgroup **memcgp) | 2805 | gfp_t mask, |
2806 | struct mem_cgroup **memcgp) | ||
2774 | { | 2807 | { |
2775 | struct mem_cgroup *memcg; | 2808 | struct mem_cgroup *memcg; |
2809 | struct page_cgroup *pc; | ||
2776 | int ret; | 2810 | int ret; |
2777 | 2811 | ||
2778 | *memcgp = NULL; | 2812 | pc = lookup_page_cgroup(page); |
2779 | |||
2780 | if (mem_cgroup_disabled()) | ||
2781 | return 0; | ||
2782 | |||
2783 | if (!do_swap_account) | ||
2784 | goto charge_cur_mm; | ||
2785 | /* | 2813 | /* |
2786 | * A racing thread's fault, or swapoff, may have already updated | 2814 | * Every swap fault against a single page tries to charge the |
2787 | * the pte, and even removed page from swap cache: in those cases | 2815 | * page, bail as early as possible. shmem_unuse() encounters |
2788 | * do_swap_page()'s pte_same() test will fail; but there's also a | 2816 | * already charged pages, too. The USED bit is protected by |
2789 | * KSM case which does need to charge the page. | 2817 | * the page lock, which serializes swap cache removal, which |
2818 | * in turn serializes uncharging. | ||
2790 | */ | 2819 | */ |
2791 | if (!PageSwapCache(page)) | 2820 | if (PageCgroupUsed(pc)) |
2821 | return 0; | ||
2822 | if (!do_swap_account) | ||
2792 | goto charge_cur_mm; | 2823 | goto charge_cur_mm; |
2793 | memcg = try_get_mem_cgroup_from_page(page); | 2824 | memcg = try_get_mem_cgroup_from_page(page); |
2794 | if (!memcg) | 2825 | if (!memcg) |
@@ -2800,14 +2831,44 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2800 | ret = 0; | 2831 | ret = 0; |
2801 | return ret; | 2832 | return ret; |
2802 | charge_cur_mm: | 2833 | charge_cur_mm: |
2803 | if (unlikely(!mm)) | ||
2804 | mm = &init_mm; | ||
2805 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); | 2834 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); |
2806 | if (ret == -EINTR) | 2835 | if (ret == -EINTR) |
2807 | ret = 0; | 2836 | ret = 0; |
2808 | return ret; | 2837 | return ret; |
2809 | } | 2838 | } |
2810 | 2839 | ||
2840 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, | ||
2841 | gfp_t gfp_mask, struct mem_cgroup **memcgp) | ||
2842 | { | ||
2843 | *memcgp = NULL; | ||
2844 | if (mem_cgroup_disabled()) | ||
2845 | return 0; | ||
2846 | /* | ||
2847 | * A racing thread's fault, or swapoff, may have already | ||
2848 | * updated the pte, and even removed page from swap cache: in | ||
2849 | * those cases unuse_pte()'s pte_same() test will fail; but | ||
2850 | * there's also a KSM case which does need to charge the page. | ||
2851 | */ | ||
2852 | if (!PageSwapCache(page)) { | ||
2853 | int ret; | ||
2854 | |||
2855 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); | ||
2856 | if (ret == -EINTR) | ||
2857 | ret = 0; | ||
2858 | return ret; | ||
2859 | } | ||
2860 | return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); | ||
2861 | } | ||
2862 | |||
2863 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) | ||
2864 | { | ||
2865 | if (mem_cgroup_disabled()) | ||
2866 | return; | ||
2867 | if (!memcg) | ||
2868 | return; | ||
2869 | __mem_cgroup_cancel_charge(memcg, 1); | ||
2870 | } | ||
2871 | |||
2811 | static void | 2872 | static void |
2812 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | 2873 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, |
2813 | enum charge_type ctype) | 2874 | enum charge_type ctype) |
@@ -2842,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page, | |||
2842 | struct mem_cgroup *memcg) | 2903 | struct mem_cgroup *memcg) |
2843 | { | 2904 | { |
2844 | __mem_cgroup_commit_charge_swapin(page, memcg, | 2905 | __mem_cgroup_commit_charge_swapin(page, memcg, |
2845 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2906 | MEM_CGROUP_CHARGE_TYPE_ANON); |
2846 | } | 2907 | } |
2847 | 2908 | ||
2848 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) | 2909 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
2910 | gfp_t gfp_mask) | ||
2849 | { | 2911 | { |
2912 | struct mem_cgroup *memcg = NULL; | ||
2913 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2914 | int ret; | ||
2915 | |||
2850 | if (mem_cgroup_disabled()) | 2916 | if (mem_cgroup_disabled()) |
2851 | return; | 2917 | return 0; |
2852 | if (!memcg) | 2918 | if (PageCompound(page)) |
2853 | return; | 2919 | return 0; |
2854 | __mem_cgroup_cancel_charge(memcg, 1); | 2920 | |
2921 | if (!PageSwapCache(page)) | ||
2922 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); | ||
2923 | else { /* page is swapcache/shmem */ | ||
2924 | ret = __mem_cgroup_try_charge_swapin(mm, page, | ||
2925 | gfp_mask, &memcg); | ||
2926 | if (!ret) | ||
2927 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | ||
2928 | } | ||
2929 | return ret; | ||
2855 | } | 2930 | } |
2856 | 2931 | ||
2857 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, | 2932 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, |
@@ -2911,7 +2986,8 @@ direct_uncharge: | |||
2911 | * uncharge if !page_mapped(page) | 2986 | * uncharge if !page_mapped(page) |
2912 | */ | 2987 | */ |
2913 | static struct mem_cgroup * | 2988 | static struct mem_cgroup * |
2914 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2989 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, |
2990 | bool end_migration) | ||
2915 | { | 2991 | { |
2916 | struct mem_cgroup *memcg = NULL; | 2992 | struct mem_cgroup *memcg = NULL; |
2917 | unsigned int nr_pages = 1; | 2993 | unsigned int nr_pages = 1; |
@@ -2921,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2921 | if (mem_cgroup_disabled()) | 2997 | if (mem_cgroup_disabled()) |
2922 | return NULL; | 2998 | return NULL; |
2923 | 2999 | ||
2924 | if (PageSwapCache(page)) | 3000 | VM_BUG_ON(PageSwapCache(page)); |
2925 | return NULL; | ||
2926 | 3001 | ||
2927 | if (PageTransHuge(page)) { | 3002 | if (PageTransHuge(page)) { |
2928 | nr_pages <<= compound_order(page); | 3003 | nr_pages <<= compound_order(page); |
@@ -2945,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2945 | anon = PageAnon(page); | 3020 | anon = PageAnon(page); |
2946 | 3021 | ||
2947 | switch (ctype) { | 3022 | switch (ctype) { |
2948 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 3023 | case MEM_CGROUP_CHARGE_TYPE_ANON: |
2949 | /* | 3024 | /* |
2950 | * Generally PageAnon tells if it's the anon statistics to be | 3025 | * Generally PageAnon tells if it's the anon statistics to be |
2951 | * updated; but sometimes e.g. mem_cgroup_uncharge_page() is | 3026 | * updated; but sometimes e.g. mem_cgroup_uncharge_page() is |
@@ -2955,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2955 | /* fallthrough */ | 3030 | /* fallthrough */ |
2956 | case MEM_CGROUP_CHARGE_TYPE_DROP: | 3031 | case MEM_CGROUP_CHARGE_TYPE_DROP: |
2957 | /* See mem_cgroup_prepare_migration() */ | 3032 | /* See mem_cgroup_prepare_migration() */ |
2958 | if (page_mapped(page) || PageCgroupMigration(pc)) | 3033 | if (page_mapped(page)) |
3034 | goto unlock_out; | ||
3035 | /* | ||
3036 | * Pages under migration may not be uncharged. But | ||
3037 | * end_migration() /must/ be the one uncharging the | ||
3038 | * unused post-migration page and so it has to call | ||
3039 | * here with the migration bit still set. See the | ||
3040 | * res_counter handling below. | ||
3041 | */ | ||
3042 | if (!end_migration && PageCgroupMigration(pc)) | ||
2959 | goto unlock_out; | 3043 | goto unlock_out; |
2960 | break; | 3044 | break; |
2961 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: | 3045 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: |
@@ -2989,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2989 | mem_cgroup_swap_statistics(memcg, true); | 3073 | mem_cgroup_swap_statistics(memcg, true); |
2990 | mem_cgroup_get(memcg); | 3074 | mem_cgroup_get(memcg); |
2991 | } | 3075 | } |
2992 | if (!mem_cgroup_is_root(memcg)) | 3076 | /* |
3077 | * Migration does not charge the res_counter for the | ||
3078 | * replacement page, so leave it alone when phasing out the | ||
3079 | * page that is unused after the migration. | ||
3080 | */ | ||
3081 | if (!end_migration && !mem_cgroup_is_root(memcg)) | ||
2993 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); | 3082 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); |
2994 | 3083 | ||
2995 | return memcg; | 3084 | return memcg; |
@@ -3005,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
3005 | if (page_mapped(page)) | 3094 | if (page_mapped(page)) |
3006 | return; | 3095 | return; |
3007 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 3096 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
3008 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 3097 | if (PageSwapCache(page)) |
3098 | return; | ||
3099 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); | ||
3009 | } | 3100 | } |
3010 | 3101 | ||
3011 | void mem_cgroup_uncharge_cache_page(struct page *page) | 3102 | void mem_cgroup_uncharge_cache_page(struct page *page) |
3012 | { | 3103 | { |
3013 | VM_BUG_ON(page_mapped(page)); | 3104 | VM_BUG_ON(page_mapped(page)); |
3014 | VM_BUG_ON(page->mapping); | 3105 | VM_BUG_ON(page->mapping); |
3015 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 3106 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); |
3016 | } | 3107 | } |
3017 | 3108 | ||
3018 | /* | 3109 | /* |
@@ -3076,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
3076 | if (!swapout) /* this was a swap cache but the swap is unused ! */ | 3167 | if (!swapout) /* this was a swap cache but the swap is unused ! */ |
3077 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; | 3168 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; |
3078 | 3169 | ||
3079 | memcg = __mem_cgroup_uncharge_common(page, ctype); | 3170 | memcg = __mem_cgroup_uncharge_common(page, ctype, false); |
3080 | 3171 | ||
3081 | /* | 3172 | /* |
3082 | * record memcg information, if swapout && memcg != NULL, | 3173 | * record memcg information, if swapout && memcg != NULL, |
@@ -3087,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
3087 | } | 3178 | } |
3088 | #endif | 3179 | #endif |
3089 | 3180 | ||
3090 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3181 | #ifdef CONFIG_MEMCG_SWAP |
3091 | /* | 3182 | /* |
3092 | * called from swap_entry_free(). remove record in swap_cgroup and | 3183 | * called from swap_entry_free(). remove record in swap_cgroup and |
3093 | * uncharge "memsw" account. | 3184 | * uncharge "memsw" account. |
@@ -3166,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3166 | * Before starting migration, account PAGE_SIZE to the mem_cgroup that the old | 3257 |
3167 | * page belongs to. | 3258 | * page belongs to. |
3168 | */ | 3259 | */ |
3169 | int mem_cgroup_prepare_migration(struct page *page, | 3260 | void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, |
3170 | struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) | 3261 | struct mem_cgroup **memcgp) |
3171 | { | 3262 | { |
3172 | struct mem_cgroup *memcg = NULL; | 3263 | struct mem_cgroup *memcg = NULL; |
3173 | struct page_cgroup *pc; | 3264 | struct page_cgroup *pc; |
3174 | enum charge_type ctype; | 3265 | enum charge_type ctype; |
3175 | int ret = 0; | ||
3176 | 3266 | ||
3177 | *memcgp = NULL; | 3267 | *memcgp = NULL; |
3178 | 3268 | ||
3179 | VM_BUG_ON(PageTransHuge(page)); | 3269 | VM_BUG_ON(PageTransHuge(page)); |
3180 | if (mem_cgroup_disabled()) | 3270 | if (mem_cgroup_disabled()) |
3181 | return 0; | 3271 | return; |
3182 | 3272 | ||
3183 | pc = lookup_page_cgroup(page); | 3273 | pc = lookup_page_cgroup(page); |
3184 | lock_page_cgroup(pc); | 3274 | lock_page_cgroup(pc); |
@@ -3223,24 +3313,9 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3223 | * we return here. | 3313 | * we return here. |
3224 | */ | 3314 | */ |
3225 | if (!memcg) | 3315 | if (!memcg) |
3226 | return 0; | 3316 | return; |
3227 | 3317 | ||
3228 | *memcgp = memcg; | 3318 | *memcgp = memcg; |
3229 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); | ||
3230 | css_put(&memcg->css);/* drop extra refcnt */ | ||
3231 | if (ret) { | ||
3232 | if (PageAnon(page)) { | ||
3233 | lock_page_cgroup(pc); | ||
3234 | ClearPageCgroupMigration(pc); | ||
3235 | unlock_page_cgroup(pc); | ||
3236 | /* | ||
3237 | * The old page may be fully unmapped while we kept it. | ||
3238 | */ | ||
3239 | mem_cgroup_uncharge_page(page); | ||
3240 | } | ||
3241 | /* we'll need to revisit this error code (we have -EINTR) */ | ||
3242 | return -ENOMEM; | ||
3243 | } | ||
3244 | /* | 3319 | /* |
3245 | * We charge new page before it's used/mapped. So, even if unlock_page() | 3320 | * We charge new page before it's used/mapped. So, even if unlock_page() |
3246 | * is called before end_migration, we can catch all events on this new | 3321 | * is called before end_migration, we can catch all events on this new |
@@ -3248,13 +3323,15 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3248 | * mapcount will be finally 0 and we call uncharge in end_migration(). | 3323 | * mapcount will be finally 0 and we call uncharge in end_migration(). |
3249 | */ | 3324 | */ |
3250 | if (PageAnon(page)) | 3325 | if (PageAnon(page)) |
3251 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | 3326 | ctype = MEM_CGROUP_CHARGE_TYPE_ANON; |
3252 | else if (page_is_file_cache(page)) | ||
3253 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
3254 | else | 3327 | else |
3255 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3328 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3329 | /* | ||
3330 | * The page is committed to the memcg, but it's not actually | ||
3331 | * charged to the res_counter since we plan on replacing the | ||
3332 | * old one and only one page is going to be left afterwards. | ||
3333 | */ | ||
3256 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); | 3334 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); |
3257 | return ret; | ||
3258 | } | 3335 | } |
3259 | 3336 | ||
3260 | /* remove redundant charge if migration failed*/ | 3337 | /* remove redundant charge if migration failed*/ |
@@ -3276,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3276 | used = newpage; | 3353 | used = newpage; |
3277 | unused = oldpage; | 3354 | unused = oldpage; |
3278 | } | 3355 | } |
3356 | anon = PageAnon(used); | ||
3357 | __mem_cgroup_uncharge_common(unused, | ||
3358 | anon ? MEM_CGROUP_CHARGE_TYPE_ANON | ||
3359 | : MEM_CGROUP_CHARGE_TYPE_CACHE, | ||
3360 | true); | ||
3361 | css_put(&memcg->css); | ||
3279 | /* | 3362 | /* |
3280 | * We disallowed uncharge of pages under migration because mapcount | 3363 | * We disallowed uncharge of pages under migration because mapcount |
3281 | * of the page goes down to zero, temporarily. | 3364 |
@@ -3285,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3285 | lock_page_cgroup(pc); | 3368 | lock_page_cgroup(pc); |
3286 | ClearPageCgroupMigration(pc); | 3369 | ClearPageCgroupMigration(pc); |
3287 | unlock_page_cgroup(pc); | 3370 | unlock_page_cgroup(pc); |
3288 | anon = PageAnon(used); | ||
3289 | __mem_cgroup_uncharge_common(unused, | ||
3290 | anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED | ||
3291 | : MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
3292 | 3371 | ||
3293 | /* | 3372 | /* |
3294 | * If a page is a file cache, radix-tree replacement is very atomic | 3373 | * If a page is a file cache, radix-tree replacement is very atomic |
@@ -3340,10 +3419,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
3340 | */ | 3419 | */ |
3341 | if (!memcg) | 3420 | if (!memcg) |
3342 | return; | 3421 | return; |
3343 | |||
3344 | if (PageSwapBacked(oldpage)) | ||
3345 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
3346 | |||
3347 | /* | 3422 | /* |
3348 | * Even if newpage->mapping was NULL before starting replacement, | 3423 | * Even if newpage->mapping was NULL before starting replacement, |
3349 | * the newpage may be on LRU(or pagevec for LRU) already. We lock | 3424 | * the newpage may be on LRU(or pagevec for LRU) already. We lock |
@@ -3418,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3418 | /* | 3493 | /* |
3419 | * Rather than hide all in some function, I do this in | 3494 | * Rather than hide all in some function, I do this in |
3420 | * open coded manner. You see what this really does. | 3495 | * open coded manner. You see what this really does. |
3421 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. | 3496 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3422 | */ | 3497 | */ |
3423 | mutex_lock(&set_limit_mutex); | 3498 | mutex_lock(&set_limit_mutex); |
3424 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3499 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
@@ -3479,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3479 | /* | 3554 | /* |
3480 | * Rather than hide all in some function, I do this in | 3555 | * Rather than hide all in some function, I do this in |
3481 | * open coded manner. You see what this really does. | 3556 | * open coded manner. You see what this really does. |
3482 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. | 3557 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3483 | */ | 3558 | */ |
3484 | mutex_lock(&set_limit_mutex); | 3559 | mutex_lock(&set_limit_mutex); |
3485 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3560 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
@@ -3611,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3611 | } | 3686 | } |
3612 | 3687 | ||
3613 | /* | 3688 | /* |
3614 | * This routine traverse page_cgroup in given list and drop them all. | 3689 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't |
3615 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 3690 | * reclaim the pages themselves - it just removes the page_cgroups. |
3691 | * Returns true if some page_cgroups were not freed, indicating that the caller | ||
3692 | * must retry this operation. | ||
3616 | */ | 3693 | */ |
3617 | static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | 3694 | static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, |
3618 | int node, int zid, enum lru_list lru) | 3695 | int node, int zid, enum lru_list lru) |
3619 | { | 3696 | { |
3620 | struct mem_cgroup_per_zone *mz; | 3697 | struct mem_cgroup_per_zone *mz; |
@@ -3622,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3622 | struct list_head *list; | 3699 | struct list_head *list; |
3623 | struct page *busy; | 3700 | struct page *busy; |
3624 | struct zone *zone; | 3701 | struct zone *zone; |
3625 | int ret = 0; | ||
3626 | 3702 | ||
3627 | zone = &NODE_DATA(node)->node_zones[zid]; | 3703 | zone = &NODE_DATA(node)->node_zones[zid]; |
3628 | mz = mem_cgroup_zoneinfo(memcg, node, zid); | 3704 | mz = mem_cgroup_zoneinfo(memcg, node, zid); |
@@ -3636,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3636 | struct page_cgroup *pc; | 3712 | struct page_cgroup *pc; |
3637 | struct page *page; | 3713 | struct page *page; |
3638 | 3714 | ||
3639 | ret = 0; | ||
3640 | spin_lock_irqsave(&zone->lru_lock, flags); | 3715 | spin_lock_irqsave(&zone->lru_lock, flags); |
3641 | if (list_empty(list)) { | 3716 | if (list_empty(list)) { |
3642 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3717 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
@@ -3653,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3653 | 3728 | ||
3654 | pc = lookup_page_cgroup(page); | 3729 | pc = lookup_page_cgroup(page); |
3655 | 3730 | ||
3656 | ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); | 3731 | if (mem_cgroup_move_parent(page, pc, memcg)) { |
3657 | if (ret == -ENOMEM || ret == -EINTR) | ||
3658 | break; | ||
3659 | |||
3660 | if (ret == -EBUSY || ret == -EINVAL) { | ||
3661 | /* found lock contention or "pc" is obsolete. */ | 3732 | /* found lock contention or "pc" is obsolete. */ |
3662 | busy = page; | 3733 | busy = page; |
3663 | cond_resched(); | 3734 | cond_resched(); |
3664 | } else | 3735 | } else |
3665 | busy = NULL; | 3736 | busy = NULL; |
3666 | } | 3737 | } |
3667 | 3738 | return !list_empty(list); | |
3668 | if (!ret && !list_empty(list)) | ||
3669 | return -EBUSY; | ||
3670 | return ret; | ||
3671 | } | 3739 | } |
3672 | 3740 | ||
3673 | /* | 3741 | /* |
@@ -3692,9 +3760,6 @@ move_account: | |||
3692 | ret = -EBUSY; | 3760 | ret = -EBUSY; |
3693 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | 3761 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
3694 | goto out; | 3762 | goto out; |
3695 | ret = -EINTR; | ||
3696 | if (signal_pending(current)) | ||
3697 | goto out; | ||
3698 | /* This is for making all *used* pages to be on LRU. */ | 3763 | /* This is for making all *used* pages to be on LRU. */ |
3699 | lru_add_drain_all(); | 3764 | lru_add_drain_all(); |
3700 | drain_all_stock_sync(memcg); | 3765 | drain_all_stock_sync(memcg); |
@@ -3715,9 +3780,6 @@ move_account: | |||
3715 | } | 3780 | } |
3716 | mem_cgroup_end_move(memcg); | 3781 | mem_cgroup_end_move(memcg); |
3717 | memcg_oom_recover(memcg); | 3782 | memcg_oom_recover(memcg); |
3718 | /* it seems parent cgroup doesn't have enough mem */ | ||
3719 | if (ret == -ENOMEM) | ||
3720 | goto try_to_free; | ||
3721 | cond_resched(); | 3783 | cond_resched(); |
3722 | /* "ret" should also be checked to ensure all lists are empty. */ | 3784 | /* "ret" should also be checked to ensure all lists are empty. */ |
3723 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); | 3785 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); |
@@ -3779,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3779 | parent_memcg = mem_cgroup_from_cont(parent); | 3841 | parent_memcg = mem_cgroup_from_cont(parent); |
3780 | 3842 | ||
3781 | cgroup_lock(); | 3843 | cgroup_lock(); |
3844 | |||
3845 | if (memcg->use_hierarchy == val) | ||
3846 | goto out; | ||
3847 | |||
3782 | /* | 3848 | /* |
3783 | * If parent's use_hierarchy is set, we can't make any modifications | 3849 | * If parent's use_hierarchy is set, we can't make any modifications |
3784 | * in the child subtrees. If it is unset, then the change can | 3850 | * in the child subtrees. If it is unset, then the change can |
@@ -3795,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3795 | retval = -EBUSY; | 3861 | retval = -EBUSY; |
3796 | } else | 3862 | } else |
3797 | retval = -EINVAL; | 3863 | retval = -EINVAL; |
3864 | |||
3865 | out: | ||
3798 | cgroup_unlock(); | 3866 | cgroup_unlock(); |
3799 | 3867 | ||
3800 | return retval; | 3868 | return retval; |
@@ -3831,7 +3899,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
3831 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); | 3899 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); |
3832 | 3900 | ||
3833 | if (swap) | 3901 | if (swap) |
3834 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); | 3902 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); |
3835 | 3903 | ||
3836 | return val << PAGE_SHIFT; | 3904 | return val << PAGE_SHIFT; |
3837 | } | 3905 | } |
@@ -4015,7 +4083,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | |||
4015 | #endif | 4083 | #endif |
4016 | 4084 | ||
4017 | #ifdef CONFIG_NUMA | 4085 | #ifdef CONFIG_NUMA |
4018 | static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft, | 4086 | static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, |
4019 | struct seq_file *m) | 4087 | struct seq_file *m) |
4020 | { | 4088 | { |
4021 | int nid; | 4089 | int nid; |
@@ -4074,7 +4142,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) | |||
4074 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 4142 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
4075 | } | 4143 | } |
4076 | 4144 | ||
4077 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 4145 | static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, |
4078 | struct seq_file *m) | 4146 | struct seq_file *m) |
4079 | { | 4147 | { |
4080 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4148 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
@@ -4082,7 +4150,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4082 | unsigned int i; | 4150 | unsigned int i; |
4083 | 4151 | ||
4084 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 4152 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4085 | if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) | 4153 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
4086 | continue; | 4154 | continue; |
4087 | seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], | 4155 | seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], |
4088 | mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); | 4156 | mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); |
@@ -4109,7 +4177,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4109 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 4177 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4110 | long long val = 0; | 4178 | long long val = 0; |
4111 | 4179 | ||
4112 | if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) | 4180 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
4113 | continue; | 4181 | continue; |
4114 | for_each_mem_cgroup_tree(mi, memcg) | 4182 | for_each_mem_cgroup_tree(mi, memcg) |
4115 | val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; | 4183 | val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; |
@@ -4533,7 +4601,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
4533 | return 0; | 4601 | return 0; |
4534 | } | 4602 | } |
4535 | 4603 | ||
4536 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 4604 | #ifdef CONFIG_MEMCG_KMEM |
4537 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 4605 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4538 | { | 4606 | { |
4539 | return mem_cgroup_sockets_init(memcg, ss); | 4607 | return mem_cgroup_sockets_init(memcg, ss); |
@@ -4588,7 +4656,7 @@ static struct cftype mem_cgroup_files[] = { | |||
4588 | }, | 4656 | }, |
4589 | { | 4657 | { |
4590 | .name = "stat", | 4658 | .name = "stat", |
4591 | .read_seq_string = mem_control_stat_show, | 4659 | .read_seq_string = memcg_stat_show, |
4592 | }, | 4660 | }, |
4593 | { | 4661 | { |
4594 | .name = "force_empty", | 4662 | .name = "force_empty", |
@@ -4620,10 +4688,10 @@ static struct cftype mem_cgroup_files[] = { | |||
4620 | #ifdef CONFIG_NUMA | 4688 | #ifdef CONFIG_NUMA |
4621 | { | 4689 | { |
4622 | .name = "numa_stat", | 4690 | .name = "numa_stat", |
4623 | .read_seq_string = mem_control_numa_stat_show, | 4691 | .read_seq_string = memcg_numa_stat_show, |
4624 | }, | 4692 | }, |
4625 | #endif | 4693 | #endif |
4626 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4694 | #ifdef CONFIG_MEMCG_SWAP |
4627 | { | 4695 | { |
4628 | .name = "memsw.usage_in_bytes", | 4696 | .name = "memsw.usage_in_bytes", |
4629 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 4697 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
@@ -4810,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
4810 | } | 4878 | } |
4811 | EXPORT_SYMBOL(parent_mem_cgroup); | 4879 | EXPORT_SYMBOL(parent_mem_cgroup); |
4812 | 4880 | ||
4813 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4881 | #ifdef CONFIG_MEMCG_SWAP |
4814 | static void __init enable_swap_cgroup(void) | 4882 | static void __init enable_swap_cgroup(void) |
4815 | { | 4883 | { |
4816 | if (!mem_cgroup_disabled() && really_do_swap_account) | 4884 | if (!mem_cgroup_disabled() && really_do_swap_account) |
@@ -5541,7 +5609,7 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
5541 | .__DEPRECATED_clear_css_refs = true, | 5609 | .__DEPRECATED_clear_css_refs = true, |
5542 | }; | 5610 | }; |
5543 | 5611 | ||
5544 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 5612 | #ifdef CONFIG_MEMCG_SWAP |
5545 | static int __init enable_swap_account(char *s) | 5613 | static int __init enable_swap_account(char *s) |
5546 | { | 5614 | { |
5547 | /* consider enabled if no parameter or 1 is given */ | 5615 | /* consider enabled if no parameter or 1 is given */ |
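The memcontrol.c changes above reorganise the swap-in charging internals but keep the three-step try/commit/cancel protocol that callers use. A minimal caller-side sketch of that protocol, modelled loosely on the do_swap_page() flow (locking, pte handling and error paths are elided, and the local variable names are invented, so this only illustrates the API shape):

	struct mem_cgroup *memcg;

	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &memcg))
		goto out_fail;				/* charge refused */

	/* ... install the pte; if it changed underneath us, back out ... */
	if (!pte_same(*pte, orig_pte)) {
		mem_cgroup_cancel_charge_swapin(memcg);	/* undo the reservation */
		goto out_unlock;
	}

	mem_cgroup_commit_charge_swapin(page, memcg);	/* make the charge stick */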
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6de0d613bbe6..a6e2141a6610 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p) | |||
128 | * can only guarantee that the page either belongs to the memcg tasks, or is | 128 | * can only guarantee that the page either belongs to the memcg tasks, or is |
129 | * a freed page. | 129 | * a freed page. |
130 | */ | 130 | */ |
131 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 131 | #ifdef CONFIG_MEMCG_SWAP |
132 | u64 hwpoison_filter_memcg; | 132 | u64 hwpoison_filter_memcg; |
133 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); | 133 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); |
134 | static int hwpoison_filter_task(struct page *p) | 134 | static int hwpoison_filter_task(struct page *p) |
@@ -1416,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1416 | int ret; | 1416 | int ret; |
1417 | unsigned long pfn = page_to_pfn(page); | 1417 | unsigned long pfn = page_to_pfn(page); |
1418 | struct page *hpage = compound_head(page); | 1418 | struct page *hpage = compound_head(page); |
1419 | LIST_HEAD(pagelist); | ||
1420 | 1419 | ||
1421 | ret = get_any_page(page, pfn, flags); | 1420 | ret = get_any_page(page, pfn, flags); |
1422 | if (ret < 0) | 1421 | if (ret < 0) |
@@ -1431,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1431 | } | 1430 | } |
1432 | 1431 | ||
1433 | /* Keep page count to indicate a given hugepage is isolated. */ | 1432 | /* Keep page count to indicate a given hugepage is isolated. */ |
1434 | 1433 | ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, | |
1435 | list_add(&hpage->lru, &pagelist); | ||
1436 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false, | ||
1437 | MIGRATE_SYNC); | 1434 | MIGRATE_SYNC); |
1435 | put_page(hpage); | ||
1438 | if (ret) { | 1436 | if (ret) { |
1439 | struct page *page1, *page2; | ||
1440 | list_for_each_entry_safe(page1, page2, &pagelist, lru) | ||
1441 | put_page(page1); | ||
1442 | |||
1443 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1437 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1444 | pfn, ret, page->flags); | 1438 | pfn, ret, page->flags); |
1445 | if (ret > 0) | ||
1446 | ret = -EIO; | ||
1447 | return ret; | 1439 | return ret; |
1448 | } | 1440 | } |
1449 | done: | 1441 | done: |
1450 | if (!PageHWPoison(hpage)) | 1442 | if (!PageHWPoison(hpage)) |
1451 | atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); | 1443 | atomic_long_add(1 << compound_trans_order(hpage), |
1444 | &mce_bad_pages); | ||
1452 | set_page_hwpoison_huge_page(hpage); | 1445 | set_page_hwpoison_huge_page(hpage); |
1453 | dequeue_hwpoisoned_huge_page(hpage); | 1446 | dequeue_hwpoisoned_huge_page(hpage); |
1454 | /* keep elevated page count for bad page */ | 1447 | /* keep elevated page count for bad page */ |
diff --git a/mm/memory.c b/mm/memory.c index 91f69459d3e8..482f089765ff 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1343,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1343 | * Since no pte has actually been setup, it is | 1343 | * Since no pte has actually been setup, it is |
1344 | * safe to do nothing in this case. | 1344 | * safe to do nothing in this case. |
1345 | */ | 1345 | */ |
1346 | if (vma->vm_file) | 1346 | if (vma->vm_file) { |
1347 | unmap_hugepage_range(vma, start, end, NULL); | 1347 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); |
1348 | __unmap_hugepage_range_final(tlb, vma, start, end, NULL); | ||
1349 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | ||
1350 | } | ||
1348 | } else | 1351 | } else |
1349 | unmap_page_range(tlb, vma, start, end, details); | 1352 | unmap_page_range(tlb, vma, start, end, details); |
1350 | } | 1353 | } |
@@ -3938,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip) | |||
3938 | free_page((unsigned long)buf); | 3941 | free_page((unsigned long)buf); |
3939 | } | 3942 | } |
3940 | } | 3943 | } |
3941 | up_read(&current->mm->mmap_sem); | 3944 | up_read(&mm->mmap_sem); |
3942 | } | 3945 | } |
3943 | 3946 | ||
3944 | #ifdef CONFIG_PROVE_LOCKING | 3947 | #ifdef CONFIG_PROVE_LOCKING |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 427bb291dd0f..3ad25f9d1fc1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -512,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
512 | 512 | ||
513 | zone->present_pages += onlined_pages; | 513 | zone->present_pages += onlined_pages; |
514 | zone->zone_pgdat->node_present_pages += onlined_pages; | 514 | zone->zone_pgdat->node_present_pages += onlined_pages; |
515 | if (need_zonelists_rebuild) | 515 | if (onlined_pages) { |
516 | build_all_zonelists(zone); | 516 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
517 | else | 517 | if (need_zonelists_rebuild) |
518 | zone_pcp_update(zone); | 518 | build_all_zonelists(NULL, zone); |
519 | else | ||
520 | zone_pcp_update(zone); | ||
521 | } | ||
519 | 522 | ||
520 | mutex_unlock(&zonelists_mutex); | 523 | mutex_unlock(&zonelists_mutex); |
521 | 524 | ||
522 | init_per_zone_wmark_min(); | 525 | init_per_zone_wmark_min(); |
523 | 526 | ||
524 | if (onlined_pages) { | 527 | if (onlined_pages) |
525 | kswapd_run(zone_to_nid(zone)); | 528 | kswapd_run(zone_to_nid(zone)); |
526 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | ||
527 | } | ||
528 | 529 | ||
529 | vm_total_pages = nr_free_pagecache_pages(); | 530 | vm_total_pages = nr_free_pagecache_pages(); |
530 | 531 | ||
@@ -562,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
562 | * to access not-initialized zonelist, build here. | 563 | * to access not-initialized zonelist, build here. |
563 | */ | 564 | */ |
564 | mutex_lock(&zonelists_mutex); | 565 | mutex_lock(&zonelists_mutex); |
565 | build_all_zonelists(NULL); | 566 | build_all_zonelists(pgdat, NULL); |
566 | mutex_unlock(&zonelists_mutex); | 567 | mutex_unlock(&zonelists_mutex); |
567 | 568 | ||
568 | return pgdat; | 569 | return pgdat; |
@@ -965,6 +966,9 @@ repeat: | |||
965 | 966 | ||
966 | init_per_zone_wmark_min(); | 967 | init_per_zone_wmark_min(); |
967 | 968 | ||
969 | if (!populated_zone(zone)) | ||
970 | zone_pcp_reset(zone); | ||
971 | |||
968 | if (!node_present_pages(node)) { | 972 | if (!node_present_pages(node)) { |
969 | node_clear_state(node, N_HIGH_MEMORY); | 973 | node_clear_state(node, N_HIGH_MEMORY); |
970 | kswapd_stop(node); | 974 | kswapd_stop(node); |
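The memory_hotplug.c hunks above also pick up the new two-argument build_all_zonelists(). Summarising just the call sites visible in this diff (the function body itself is not shown here):

	build_all_zonelists(NULL, zone);	/* onlining pages into an existing node */
	build_all_zonelists(pgdat, NULL);	/* hot-adding a node whose pgdat was just initialised */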
diff --git a/mm/migrate.c b/mm/migrate.c index be26d5cbe56b..77ed2d773705 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/memcontrol.h> | 33 | #include <linux/memcontrol.h> |
34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/hugetlb.h> | 35 | #include <linux/hugetlb.h> |
36 | #include <linux/hugetlb_cgroup.h> | ||
36 | #include <linux/gfp.h> | 37 | #include <linux/gfp.h> |
37 | 38 | ||
38 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
@@ -682,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
682 | { | 683 | { |
683 | int rc = -EAGAIN; | 684 | int rc = -EAGAIN; |
684 | int remap_swapcache = 1; | 685 | int remap_swapcache = 1; |
685 | int charge = 0; | ||
686 | struct mem_cgroup *mem; | 686 | struct mem_cgroup *mem; |
687 | struct anon_vma *anon_vma = NULL; | 687 | struct anon_vma *anon_vma = NULL; |
688 | 688 | ||
@@ -724,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
724 | } | 724 | } |
725 | 725 | ||
726 | /* charge against new page */ | 726 | /* charge against new page */ |
727 | charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); | 727 | mem_cgroup_prepare_migration(page, newpage, &mem); |
728 | if (charge == -ENOMEM) { | ||
729 | rc = -ENOMEM; | ||
730 | goto unlock; | ||
731 | } | ||
732 | BUG_ON(charge); | ||
733 | 728 | ||
734 | if (PageWriteback(page)) { | 729 | if (PageWriteback(page)) { |
735 | /* | 730 | /* |
@@ -819,8 +814,7 @@ skip_unmap: | |||
819 | put_anon_vma(anon_vma); | 814 | put_anon_vma(anon_vma); |
820 | 815 | ||
821 | uncharge: | 816 | uncharge: |
822 | if (!charge) | 817 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); |
823 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); | ||
824 | unlock: | 818 | unlock: |
825 | unlock_page(page); | 819 | unlock_page(page); |
826 | out: | 820 | out: |
@@ -931,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
931 | 925 | ||
932 | if (anon_vma) | 926 | if (anon_vma) |
933 | put_anon_vma(anon_vma); | 927 | put_anon_vma(anon_vma); |
934 | unlock_page(hpage); | ||
935 | 928 | ||
936 | out: | 929 | if (!rc) |
937 | if (rc != -EAGAIN) { | 930 | hugetlb_cgroup_migrate(hpage, new_hpage); |
938 | list_del(&hpage->lru); | ||
939 | put_page(hpage); | ||
940 | } | ||
941 | 931 | ||
932 | unlock_page(hpage); | ||
933 | out: | ||
942 | put_page(new_hpage); | 934 | put_page(new_hpage); |
943 | |||
944 | if (result) { | 935 | if (result) { |
945 | if (rc) | 936 | if (rc) |
946 | *result = rc; | 937 | *result = rc; |
@@ -1016,48 +1007,32 @@ out: | |||
1016 | return nr_failed + retry; | 1007 | return nr_failed + retry; |
1017 | } | 1008 | } |
1018 | 1009 | ||
1019 | int migrate_huge_pages(struct list_head *from, | 1010 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, |
1020 | new_page_t get_new_page, unsigned long private, bool offlining, | 1011 | unsigned long private, bool offlining, |
1021 | enum migrate_mode mode) | 1012 | enum migrate_mode mode) |
1022 | { | 1013 | { |
1023 | int retry = 1; | 1014 | int pass, rc; |
1024 | int nr_failed = 0; | 1015 | |
1025 | int pass = 0; | 1016 | for (pass = 0; pass < 10; pass++) { |
1026 | struct page *page; | 1017 | rc = unmap_and_move_huge_page(get_new_page, |
1027 | struct page *page2; | 1018 | private, hpage, pass > 2, offlining, |
1028 | int rc; | 1019 | mode); |
1029 | 1020 | switch (rc) { | |
1030 | for (pass = 0; pass < 10 && retry; pass++) { | 1021 | case -ENOMEM: |
1031 | retry = 0; | 1022 | goto out; |
1032 | 1023 | case -EAGAIN: | |
1033 | list_for_each_entry_safe(page, page2, from, lru) { | 1024 | /* try again */ |
1034 | cond_resched(); | 1025 | cond_resched(); |
1035 | 1026 | break; | |
1036 | rc = unmap_and_move_huge_page(get_new_page, | 1027 | case 0: |
1037 | private, page, pass > 2, offlining, | 1028 | goto out; |
1038 | mode); | 1029 | default: |
1039 | 1030 | rc = -EIO; | |
1040 | switch(rc) { | 1031 | goto out; |
1041 | case -ENOMEM: | ||
1042 | goto out; | ||
1043 | case -EAGAIN: | ||
1044 | retry++; | ||
1045 | break; | ||
1046 | case 0: | ||
1047 | break; | ||
1048 | default: | ||
1049 | /* Permanent failure */ | ||
1050 | nr_failed++; | ||
1051 | break; | ||
1052 | } | ||
1053 | } | 1032 | } |
1054 | } | 1033 | } |
1055 | rc = 0; | ||
1056 | out: | 1034 | out: |
1057 | if (rc) | 1035 | return rc; |
1058 | return rc; | ||
1059 | |||
1060 | return nr_failed + retry; | ||
1061 | } | 1036 | } |
1062 | 1037 | ||
1063 | #ifdef CONFIG_NUMA | 1038 | #ifdef CONFIG_NUMA |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -943,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
943 | const unsigned long stack_flags | 943 | const unsigned long stack_flags |
944 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); | 944 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); |
945 | 945 | ||
946 | mm->total_vm += pages; | ||
947 | |||
946 | if (file) { | 948 | if (file) { |
947 | mm->shared_vm += pages; | 949 | mm->shared_vm += pages; |
948 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) | 950 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) |
@@ -1347,7 +1349,6 @@ munmap_back: | |||
1347 | out: | 1349 | out: |
1348 | perf_event_mmap(vma); | 1350 | perf_event_mmap(vma); |
1349 | 1351 | ||
1350 | mm->total_vm += len >> PAGE_SHIFT; | ||
1351 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1352 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1352 | if (vm_flags & VM_LOCKED) { | 1353 | if (vm_flags & VM_LOCKED) { |
1353 | if (!mlock_vma_pages_range(vma, addr, addr + len)) | 1354 | if (!mlock_vma_pages_range(vma, addr, addr + len)) |
@@ -1707,7 +1708,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
1707 | return -ENOMEM; | 1708 | return -ENOMEM; |
1708 | 1709 | ||
1709 | /* Ok, everything looks good - let it rip */ | 1710 | /* Ok, everything looks good - let it rip */ |
1710 | mm->total_vm += grow; | ||
1711 | if (vma->vm_flags & VM_LOCKED) | 1711 | if (vma->vm_flags & VM_LOCKED) |
1712 | mm->locked_vm += grow; | 1712 | mm->locked_vm += grow; |
1713 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); | 1713 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); |
@@ -1889,7 +1889,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | |||
1889 | 1889 | ||
1890 | if (vma->vm_flags & VM_ACCOUNT) | 1890 | if (vma->vm_flags & VM_ACCOUNT) |
1891 | nr_accounted += nrpages; | 1891 | nr_accounted += nrpages; |
1892 | mm->total_vm -= nrpages; | ||
1893 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | 1892 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); |
1894 | vma = remove_vma(vma); | 1893 | vma = remove_vma(vma); |
1895 | } while (vma); | 1894 | } while (vma); |
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9a611d3a1848..862b60822d9f 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -33,6 +33,24 @@ | |||
33 | void __mmu_notifier_release(struct mm_struct *mm) | 33 | void __mmu_notifier_release(struct mm_struct *mm) |
34 | { | 34 | { |
35 | struct mmu_notifier *mn; | 35 | struct mmu_notifier *mn; |
36 | struct hlist_node *n; | ||
37 | |||
38 | /* | ||
39 | * RCU here will block mmu_notifier_unregister until | ||
40 | * ->release returns. | ||
41 | */ | ||
42 | rcu_read_lock(); | ||
43 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | ||
44 | /* | ||
45 | * if ->release runs before mmu_notifier_unregister it | ||
46 | * must be handled as it's the only way for the driver | ||
47 | * to flush all existing sptes and stop the driver | ||
48 | * from establishing any more sptes before all the | ||
49 | * pages in the mm are freed. | ||
50 | */ | ||
51 | if (mn->ops->release) | ||
52 | mn->ops->release(mn, mm); | ||
53 | rcu_read_unlock(); | ||
36 | 54 | ||
37 | spin_lock(&mm->mmu_notifier_mm->lock); | 55 | spin_lock(&mm->mmu_notifier_mm->lock); |
38 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 56 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
46 | * mmu_notifier_unregister to return. | 64 | * mmu_notifier_unregister to return. |
47 | */ | 65 | */ |
48 | hlist_del_init_rcu(&mn->hlist); | 66 | hlist_del_init_rcu(&mn->hlist); |
49 | /* | ||
50 | * RCU here will block mmu_notifier_unregister until | ||
51 | * ->release returns. | ||
52 | */ | ||
53 | rcu_read_lock(); | ||
54 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
55 | /* | ||
56 | * if ->release runs before mmu_notifier_unregister it | ||
57 | * must be handled as it's the only way for the driver | ||
58 | * to flush all existing sptes and stop the driver | ||
59 | * from establishing any more sptes before all the | ||
60 | * pages in the mm are freed. | ||
61 | */ | ||
62 | if (mn->ops->release) | ||
63 | mn->ops->release(mn, mm); | ||
64 | rcu_read_unlock(); | ||
65 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
66 | } | 67 | } |
67 | spin_unlock(&mm->mmu_notifier_mm->lock); | 68 | spin_unlock(&mm->mmu_notifier_mm->lock); |
68 | 69 | ||
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
284 | { | 285 | { |
285 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 286 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
286 | 287 | ||
287 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
288 | if (!hlist_unhashed(&mn->hlist)) { | 288 | if (!hlist_unhashed(&mn->hlist)) { |
289 | hlist_del_rcu(&mn->hlist); | ||
290 | |||
291 | /* | 289 | /* |
292 | * RCU here will force exit_mmap to wait ->release to finish | 290 | * RCU here will force exit_mmap to wait ->release to finish |
293 | * before freeing the pages. | 291 | * before freeing the pages. |
294 | */ | 292 | */ |
295 | rcu_read_lock(); | 293 | rcu_read_lock(); |
296 | spin_unlock(&mm->mmu_notifier_mm->lock); | 294 | |
297 | /* | 295 | /* |
298 | * exit_mmap will block in mmu_notifier_release to | 296 | * exit_mmap will block in mmu_notifier_release to |
299 | * guarantee ->release is called before freeing the | 297 | * guarantee ->release is called before freeing the |
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
302 | if (mn->ops->release) | 300 | if (mn->ops->release) |
303 | mn->ops->release(mn, mm); | 301 | mn->ops->release(mn, mm); |
304 | rcu_read_unlock(); | 302 | rcu_read_unlock(); |
305 | } else | 303 | |
304 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
305 | hlist_del_rcu(&mn->hlist); | ||
306 | spin_unlock(&mm->mmu_notifier_mm->lock); | 306 | spin_unlock(&mm->mmu_notifier_mm->lock); |
307 | } | ||
307 | 308 | ||
308 | /* | 309 | /* |
309 | * Wait any running method to finish, of course including | 310 | * Wait any running method to finish, of course including |
diff --git a/mm/mmzone.c b/mm/mmzone.c index 6830eab5bf09..3cef80f6ac79 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -96,7 +96,7 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone) | |||
96 | for_each_lru(lru) | 96 | for_each_lru(lru) |
97 | INIT_LIST_HEAD(&lruvec->lists[lru]); | 97 | INIT_LIST_HEAD(&lruvec->lists[lru]); |
98 | 98 | ||
99 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 99 | #ifdef CONFIG_MEMCG |
100 | lruvec->zone = zone; | 100 | lruvec->zone = zone; |
101 | #endif | 101 | #endif |
102 | } | 102 | } |
diff --git a/mm/mremap.c b/mm/mremap.c index 21fed202ddad..cc06d0e48d05 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
260 | * If this were a serious issue, we'd add a flag to do_munmap(). | 260 | * If this were a serious issue, we'd add a flag to do_munmap(). |
261 | */ | 261 | */ |
262 | hiwater_vm = mm->hiwater_vm; | 262 | hiwater_vm = mm->hiwater_vm; |
263 | mm->total_vm += new_len >> PAGE_SHIFT; | ||
264 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); | 263 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); |
265 | 264 | ||
266 | if (do_munmap(mm, old_addr, old_len) < 0) { | 265 | if (do_munmap(mm, old_addr, old_len) < 0) { |
@@ -497,7 +496,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
497 | goto out; | 496 | goto out; |
498 | } | 497 | } |
499 | 498 | ||
500 | mm->total_vm += pages; | ||
501 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 499 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
502 | if (vma->vm_flags & VM_LOCKED) { | 500 | if (vma->vm_flags & VM_LOCKED) { |
503 | mm->locked_vm += pages; | 501 | mm->locked_vm += pages; |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ac300c99baf6..198600861638 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -288,76 +288,93 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
288 | } | 288 | } |
289 | #endif | 289 | #endif |
290 | 290 | ||
291 | enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | ||
292 | unsigned long totalpages, const nodemask_t *nodemask, | ||
293 | bool force_kill) | ||
294 | { | ||
295 | if (task->exit_state) | ||
296 | return OOM_SCAN_CONTINUE; | ||
297 | if (oom_unkillable_task(task, NULL, nodemask)) | ||
298 | return OOM_SCAN_CONTINUE; | ||
299 | |||
300 | /* | ||
301 | * This task already has access to memory reserves and is being killed. | ||
302 | * Don't allow any other task to have access to the reserves. | ||
303 | */ | ||
304 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { | ||
305 | if (unlikely(frozen(task))) | ||
306 | __thaw_task(task); | ||
307 | if (!force_kill) | ||
308 | return OOM_SCAN_ABORT; | ||
309 | } | ||
310 | if (!task->mm) | ||
311 | return OOM_SCAN_CONTINUE; | ||
312 | |||
313 | if (task->flags & PF_EXITING) { | ||
314 | /* | ||
315 | * If task is current and is in the process of releasing memory, | ||
316 | * allow the "kill" to set TIF_MEMDIE, which will allow it to | ||
317 | * access memory reserves. Otherwise, it may stall forever. | ||
318 | * | ||
319 | * The iteration isn't broken here, however, in case other | ||
320 | * threads are found to have already been oom killed. | ||
321 | */ | ||
322 | if (task == current) | ||
323 | return OOM_SCAN_SELECT; | ||
324 | else if (!force_kill) { | ||
325 | /* | ||
326 | * If this task is not being ptraced on exit, then wait | ||
327 | * for it to finish before killing some other task | ||
328 | * unnecessarily. | ||
329 | */ | ||
330 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) | ||
331 | return OOM_SCAN_ABORT; | ||
332 | } | ||
333 | } | ||
334 | return OOM_SCAN_OK; | ||
335 | } | ||
336 | |||
291 | /* | 337 | /* |
292 | * Simple selection loop. We choose the process with the highest | 338 | * Simple selection loop. We choose the process with the highest |
293 | * number of 'points'. We expect the caller will lock the tasklist. | 339 | * number of 'points'. |
294 | * | 340 | * |
295 | * (not docbooked, we don't want this one cluttering up the manual) | 341 | * (not docbooked, we don't want this one cluttering up the manual) |
296 | */ | 342 | */ |
297 | static struct task_struct *select_bad_process(unsigned int *ppoints, | 343 | static struct task_struct *select_bad_process(unsigned int *ppoints, |
298 | unsigned long totalpages, struct mem_cgroup *memcg, | 344 | unsigned long totalpages, const nodemask_t *nodemask, |
299 | const nodemask_t *nodemask, bool force_kill) | 345 | bool force_kill) |
300 | { | 346 | { |
301 | struct task_struct *g, *p; | 347 | struct task_struct *g, *p; |
302 | struct task_struct *chosen = NULL; | 348 | struct task_struct *chosen = NULL; |
303 | unsigned long chosen_points = 0; | 349 | unsigned long chosen_points = 0; |
304 | 350 | ||
351 | rcu_read_lock(); | ||
305 | do_each_thread(g, p) { | 352 | do_each_thread(g, p) { |
306 | unsigned int points; | 353 | unsigned int points; |
307 | 354 | ||
308 | if (p->exit_state) | 355 | switch (oom_scan_process_thread(p, totalpages, nodemask, |
309 | continue; | 356 | force_kill)) { |
310 | if (oom_unkillable_task(p, memcg, nodemask)) | 357 | case OOM_SCAN_SELECT: |
311 | continue; | 358 | chosen = p; |
312 | 359 | chosen_points = ULONG_MAX; | |
313 | /* | 360 | /* fall through */ |
314 | * This task already has access to memory reserves and is | 361 | case OOM_SCAN_CONTINUE: |
315 | * being killed. Don't allow any other task access to the | ||
316 | * memory reserve. | ||
317 | * | ||
318 | * Note: this may have a chance of deadlock if it gets | ||
319 | * blocked waiting for another task which itself is waiting | ||
320 | * for memory. Is there a better alternative? | ||
321 | */ | ||
322 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) { | ||
323 | if (unlikely(frozen(p))) | ||
324 | __thaw_task(p); | ||
325 | if (!force_kill) | ||
326 | return ERR_PTR(-1UL); | ||
327 | } | ||
328 | if (!p->mm) | ||
329 | continue; | 362 | continue; |
330 | 363 | case OOM_SCAN_ABORT: | |
331 | if (p->flags & PF_EXITING) { | 364 | rcu_read_unlock(); |
332 | /* | 365 | return ERR_PTR(-1UL); |
333 | * If p is the current task and is in the process of | 366 | case OOM_SCAN_OK: |
334 | * releasing memory, we allow the "kill" to set | 367 | break; |
335 | * TIF_MEMDIE, which will allow it to gain access to | 368 | }; |
336 | * memory reserves. Otherwise, it may stall forever. | 369 | points = oom_badness(p, NULL, nodemask, totalpages); |
337 | * | ||
338 | * The loop isn't broken here, however, in case other | ||
339 | * threads are found to have already been oom killed. | ||
340 | */ | ||
341 | if (p == current) { | ||
342 | chosen = p; | ||
343 | chosen_points = ULONG_MAX; | ||
344 | } else if (!force_kill) { | ||
345 | /* | ||
346 | * If this task is not being ptraced on exit, | ||
347 | * then wait for it to finish before killing | ||
348 | * some other task unnecessarily. | ||
349 | */ | ||
350 | if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) | ||
351 | return ERR_PTR(-1UL); | ||
352 | } | ||
353 | } | ||
354 | |||
355 | points = oom_badness(p, memcg, nodemask, totalpages); | ||
356 | if (points > chosen_points) { | 370 | if (points > chosen_points) { |
357 | chosen = p; | 371 | chosen = p; |
358 | chosen_points = points; | 372 | chosen_points = points; |
359 | } | 373 | } |
360 | } while_each_thread(g, p); | 374 | } while_each_thread(g, p); |
375 | if (chosen) | ||
376 | get_task_struct(chosen); | ||
377 | rcu_read_unlock(); | ||
361 | 378 | ||
362 | *ppoints = chosen_points * 1000 / totalpages; | 379 | *ppoints = chosen_points * 1000 / totalpages; |
363 | return chosen; | 380 | return chosen; |
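select_bad_process() now delegates the per-thread policy to oom_scan_process_thread() and simply switches on its verdict under RCU. Below is a self-contained model of that split with synthetic task data; the field names and scores are invented for the demo and do not mirror struct task_struct.

#include <stdbool.h>
#include <stdio.h>

enum oom_scan_t { OOM_SCAN_OK, OOM_SCAN_CONTINUE, OOM_SCAN_ABORT, OOM_SCAN_SELECT };

struct task_model {
	const char *name;
	bool has_mm;		/* kernel threads have no mm: skip them */
	bool exiting;		/* PF_EXITING-like */
	bool is_current;
	bool memdie;		/* TIF_MEMDIE-like: already being killed */
	unsigned int points;	/* precomputed badness for the demo */
};

static enum oom_scan_t scan_one(const struct task_model *t)
{
	if (t->memdie)
		return OOM_SCAN_ABORT;		/* a kill is already in flight */
	if (!t->has_mm)
		return OOM_SCAN_CONTINUE;	/* nothing to free here */
	if (t->exiting)
		return t->is_current ? OOM_SCAN_SELECT : OOM_SCAN_ABORT;
	return OOM_SCAN_OK;
}

static const struct task_model *select_victim(const struct task_model *tasks, int n)
{
	const struct task_model *chosen = NULL;
	unsigned int chosen_points = 0;

	for (int i = 0; i < n; i++) {
		switch (scan_one(&tasks[i])) {
		case OOM_SCAN_SELECT:
			return &tasks[i];	/* exiting current task: pick it */
		case OOM_SCAN_CONTINUE:
			continue;
		case OOM_SCAN_ABORT:
			return NULL;		/* back off, someone is dying */
		case OOM_SCAN_OK:
			break;
		}
		if (tasks[i].points > chosen_points) {
			chosen = &tasks[i];
			chosen_points = tasks[i].points;
		}
	}
	return chosen;
}

int main(void)
{
	struct task_model tasks[] = {
		{ "kworker", false, false, false, false, 0 },
		{ "daemon",  true,  false, false, false, 120 },
		{ "hog",     true,  false, false, false, 900 },
	};
	const struct task_model *v = select_victim(tasks, 3);

	printf("victim: %s\n", v ? v->name : "(none)");
	return 0;
}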
@@ -371,17 +388,16 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
371 | * Dumps the current memory state of all eligible tasks. Tasks not in the same | 388 | * Dumps the current memory state of all eligible tasks. Tasks not in the same |
372 | * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes | 389 | * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes |
373 | * are not shown. | 390 | * are not shown. |
374 | * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj | 391 | * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, |
375 | * value, oom_score_adj value, and name. | 392 | * swapents, oom_score_adj value, and name. |
376 | * | ||
377 | * Call with tasklist_lock read-locked. | ||
378 | */ | 393 | */ |
379 | static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) | 394 | static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) |
380 | { | 395 | { |
381 | struct task_struct *p; | 396 | struct task_struct *p; |
382 | struct task_struct *task; | 397 | struct task_struct *task; |
383 | 398 | ||
384 | pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); | 399 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); |
400 | rcu_read_lock(); | ||
385 | for_each_process(p) { | 401 | for_each_process(p) { |
386 | if (oom_unkillable_task(p, memcg, nodemask)) | 402 | if (oom_unkillable_task(p, memcg, nodemask)) |
387 | continue; | 403 | continue; |
@@ -396,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas | |||
396 | continue; | 412 | continue; |
397 | } | 413 | } |
398 | 414 | ||
399 | pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", | 415 | pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", |
400 | task->pid, from_kuid(&init_user_ns, task_uid(task)), | 416 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
401 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), | 417 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
402 | task_cpu(task), task->signal->oom_adj, | 418 | task->mm->nr_ptes, |
419 | get_mm_counter(task->mm, MM_SWAPENTS), | ||
403 | task->signal->oom_score_adj, task->comm); | 420 | task->signal->oom_score_adj, task->comm); |
404 | task_unlock(task); | 421 | task_unlock(task); |
405 | } | 422 | } |
423 | rcu_read_unlock(); | ||
406 | } | 424 | } |
407 | 425 | ||
408 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | 426 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, |
@@ -423,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
423 | } | 441 | } |
424 | 442 | ||
425 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 443 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
426 | static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | 444 | /* |
427 | unsigned int points, unsigned long totalpages, | 445 | * Must be called while holding a reference to p, which will be released upon |
428 | struct mem_cgroup *memcg, nodemask_t *nodemask, | 446 | * returning. |
429 | const char *message) | 447 | */ |
448 | void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | ||
449 | unsigned int points, unsigned long totalpages, | ||
450 | struct mem_cgroup *memcg, nodemask_t *nodemask, | ||
451 | const char *message) | ||
430 | { | 452 | { |
431 | struct task_struct *victim = p; | 453 | struct task_struct *victim = p; |
432 | struct task_struct *child; | 454 | struct task_struct *child; |
@@ -442,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
442 | */ | 464 | */ |
443 | if (p->flags & PF_EXITING) { | 465 | if (p->flags & PF_EXITING) { |
444 | set_tsk_thread_flag(p, TIF_MEMDIE); | 466 | set_tsk_thread_flag(p, TIF_MEMDIE); |
467 | put_task_struct(p); | ||
445 | return; | 468 | return; |
446 | } | 469 | } |
447 | 470 | ||
@@ -459,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
459 | * parent. This attempts to lose the minimal amount of work done while | 482 | * parent. This attempts to lose the minimal amount of work done while |
460 | * still freeing memory. | 483 | * still freeing memory. |
461 | */ | 484 | */ |
485 | read_lock(&tasklist_lock); | ||
462 | do { | 486 | do { |
463 | list_for_each_entry(child, &t->children, sibling) { | 487 | list_for_each_entry(child, &t->children, sibling) { |
464 | unsigned int child_points; | 488 | unsigned int child_points; |
@@ -471,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
471 | child_points = oom_badness(child, memcg, nodemask, | 495 | child_points = oom_badness(child, memcg, nodemask, |
472 | totalpages); | 496 | totalpages); |
473 | if (child_points > victim_points) { | 497 | if (child_points > victim_points) { |
498 | put_task_struct(victim); | ||
474 | victim = child; | 499 | victim = child; |
475 | victim_points = child_points; | 500 | victim_points = child_points; |
501 | get_task_struct(victim); | ||
476 | } | 502 | } |
477 | } | 503 | } |
478 | } while_each_thread(p, t); | 504 | } while_each_thread(p, t); |
505 | read_unlock(&tasklist_lock); | ||
479 | 506 | ||
480 | victim = find_lock_task_mm(victim); | 507 | rcu_read_lock(); |
481 | if (!victim) | 508 | p = find_lock_task_mm(victim); |
509 | if (!p) { | ||
510 | rcu_read_unlock(); | ||
511 | put_task_struct(victim); | ||
482 | return; | 512 | return; |
513 | } else if (victim != p) { | ||
514 | get_task_struct(p); | ||
515 | put_task_struct(victim); | ||
516 | victim = p; | ||
517 | } | ||
483 | 518 | ||
484 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | 519 | /* mm cannot safely be dereferenced after task_unlock(victim) */ |
485 | mm = victim->mm; | 520 | mm = victim->mm; |
@@ -510,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
510 | task_unlock(p); | 545 | task_unlock(p); |
511 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); | 546 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); |
512 | } | 547 | } |
548 | rcu_read_unlock(); | ||
513 | 549 | ||
514 | set_tsk_thread_flag(victim, TIF_MEMDIE); | 550 | set_tsk_thread_flag(victim, TIF_MEMDIE); |
515 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); | 551 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); |
552 | put_task_struct(victim); | ||
516 | } | 553 | } |
517 | #undef K | 554 | #undef K |
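oom_kill_process() now runs without the caller holding the tasklist lock, so it pins the chosen task with get_task_struct() and hands that reference over whenever a better child is picked. A toy refcounting sketch of the handoff pattern, not the kernel task API:

#include <assert.h>
#include <stdio.h>

struct obj { const char *name; int refs; };

static void get_obj(struct obj *o) { o->refs++; }

static void put_obj(struct obj *o)
{
	assert(o->refs > 0);
	if (--o->refs == 0)
		printf("%s freed\n", o->name);
}

int main(void)
{
	struct obj a = { "victim-a", 1 }, b = { "victim-b", 1 };
	struct obj *victim = &a;

	get_obj(victim);		/* pin the initial choice */

	/* a better candidate shows up: move the pin before switching */
	put_obj(victim);
	victim = &b;
	get_obj(victim);

	printf("%s refs=%d, %s refs=%d\n", a.name, a.refs, b.name, b.refs);
	put_obj(victim);		/* drop the pin once the kill path is done */
	return 0;
}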
518 | 555 | ||
519 | /* | 556 | /* |
520 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. | 557 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. |
521 | */ | 558 | */ |
522 | static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | 559 | void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, |
523 | int order, const nodemask_t *nodemask) | 560 | int order, const nodemask_t *nodemask) |
524 | { | 561 | { |
525 | if (likely(!sysctl_panic_on_oom)) | 562 | if (likely(!sysctl_panic_on_oom)) |
526 | return; | 563 | return; |
@@ -533,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
533 | if (constraint != CONSTRAINT_NONE) | 570 | if (constraint != CONSTRAINT_NONE) |
534 | return; | 571 | return; |
535 | } | 572 | } |
536 | read_lock(&tasklist_lock); | ||
537 | dump_header(NULL, gfp_mask, order, NULL, nodemask); | 573 | dump_header(NULL, gfp_mask, order, NULL, nodemask); |
538 | read_unlock(&tasklist_lock); | ||
539 | panic("Out of memory: %s panic_on_oom is enabled\n", | 574 | panic("Out of memory: %s panic_on_oom is enabled\n", |
540 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); | 575 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); |
541 | } | 576 | } |
542 | 577 | ||
543 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
544 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | ||
545 | int order) | ||
546 | { | ||
547 | unsigned long limit; | ||
548 | unsigned int points = 0; | ||
549 | struct task_struct *p; | ||
550 | |||
551 | /* | ||
552 | * If current has a pending SIGKILL, then automatically select it. The | ||
553 | * goal is to allow it to allocate so that it may quickly exit and free | ||
554 | * its memory. | ||
555 | */ | ||
556 | if (fatal_signal_pending(current)) { | ||
557 | set_thread_flag(TIF_MEMDIE); | ||
558 | return; | ||
559 | } | ||
560 | |||
561 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | ||
562 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; | ||
563 | read_lock(&tasklist_lock); | ||
564 | p = select_bad_process(&points, limit, memcg, NULL, false); | ||
565 | if (p && PTR_ERR(p) != -1UL) | ||
566 | oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL, | ||
567 | "Memory cgroup out of memory"); | ||
568 | read_unlock(&tasklist_lock); | ||
569 | } | ||
570 | #endif | ||
571 | |||
572 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); | 578 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); |
573 | 579 | ||
574 | int register_oom_notifier(struct notifier_block *nb) | 580 | int register_oom_notifier(struct notifier_block *nb) |
@@ -690,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
690 | struct task_struct *p; | 696 | struct task_struct *p; |
691 | unsigned long totalpages; | 697 | unsigned long totalpages; |
692 | unsigned long freed = 0; | 698 | unsigned long freed = 0; |
693 | unsigned int points; | 699 | unsigned int uninitialized_var(points); |
694 | enum oom_constraint constraint = CONSTRAINT_NONE; | 700 | enum oom_constraint constraint = CONSTRAINT_NONE; |
695 | int killed = 0; | 701 | int killed = 0; |
696 | 702 | ||
@@ -718,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
718 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; | 724 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; |
719 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); | 725 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); |
720 | 726 | ||
721 | read_lock(&tasklist_lock); | 727 | if (sysctl_oom_kill_allocating_task && current->mm && |
722 | if (sysctl_oom_kill_allocating_task && | ||
723 | !oom_unkillable_task(current, NULL, nodemask) && | 728 | !oom_unkillable_task(current, NULL, nodemask) && |
724 | current->mm) { | 729 | current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { |
730 | get_task_struct(current); | ||
725 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, | 731 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, |
726 | nodemask, | 732 | nodemask, |
727 | "Out of memory (oom_kill_allocating_task)"); | 733 | "Out of memory (oom_kill_allocating_task)"); |
728 | goto out; | 734 | goto out; |
729 | } | 735 | } |
730 | 736 | ||
731 | p = select_bad_process(&points, totalpages, NULL, mpol_mask, | 737 | p = select_bad_process(&points, totalpages, mpol_mask, force_kill); |
732 | force_kill); | ||
733 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 738 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
734 | if (!p) { | 739 | if (!p) { |
735 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); | 740 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); |
736 | read_unlock(&tasklist_lock); | ||
737 | panic("Out of memory and no killable processes...\n"); | 741 | panic("Out of memory and no killable processes...\n"); |
738 | } | 742 | } |
739 | if (PTR_ERR(p) != -1UL) { | 743 | if (PTR_ERR(p) != -1UL) { |
@@ -742,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
742 | killed = 1; | 746 | killed = 1; |
743 | } | 747 | } |
744 | out: | 748 | out: |
745 | read_unlock(&tasklist_lock); | ||
746 | |||
747 | /* | 749 | /* |
748 | * Give "p" a good chance of killing itself before we | 750 | * Give the killed threads a good chance of exiting before trying to |
749 | * retry to allocate memory unless "p" is current | 751 | * allocate memory again. |
750 | */ | 752 | */ |
751 | if (killed && !test_thread_flag(TIF_MEMDIE)) | 753 | if (killed) |
752 | schedule_timeout_uninterruptible(1); | 754 | schedule_timeout_killable(1); |
753 | } | 755 | } |
754 | 756 | ||
755 | /* | 757 | /* |
@@ -764,6 +766,5 @@ void pagefault_out_of_memory(void) | |||
764 | out_of_memory(NULL, 0, 0, NULL, false); | 766 | out_of_memory(NULL, 0, 0, NULL, false); |
765 | clear_system_oom(); | 767 | clear_system_oom(); |
766 | } | 768 | } |
767 | if (!test_thread_flag(TIF_MEMDIE)) | 769 | schedule_timeout_killable(1); |
768 | schedule_timeout_uninterruptible(1); | ||
769 | } | 770 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4a4f9219683f..889532b8e6c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -51,7 +51,6 @@ | |||
51 | #include <linux/page_cgroup.h> | 51 | #include <linux/page_cgroup.h> |
52 | #include <linux/debugobjects.h> | 52 | #include <linux/debugobjects.h> |
53 | #include <linux/kmemleak.h> | 53 | #include <linux/kmemleak.h> |
54 | #include <linux/memory.h> | ||
55 | #include <linux/compaction.h> | 54 | #include <linux/compaction.h> |
56 | #include <trace/events/kmem.h> | 55 | #include <trace/events/kmem.h> |
57 | #include <linux/ftrace_event.h> | 56 | #include <linux/ftrace_event.h> |
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes); | |||
219 | 218 | ||
220 | int page_group_by_mobility_disabled __read_mostly; | 219 | int page_group_by_mobility_disabled __read_mostly; |
221 | 220 | ||
222 | static void set_pageblock_migratetype(struct page *page, int migratetype) | 221 | /* |
222 | * NOTE: | ||
223 | * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. | ||
224 | * Instead, use {un}set_pageblock_isolate. | ||
225 | */ | ||
226 | void set_pageblock_migratetype(struct page *page, int migratetype) | ||
223 | { | 227 | { |
224 | 228 | ||
225 | if (unlikely(page_group_by_mobility_disabled)) | 229 | if (unlikely(page_group_by_mobility_disabled)) |
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone, | |||
954 | return pages_moved; | 958 | return pages_moved; |
955 | } | 959 | } |
956 | 960 | ||
957 | static int move_freepages_block(struct zone *zone, struct page *page, | 961 | int move_freepages_block(struct zone *zone, struct page *page, |
958 | int migratetype) | 962 | int migratetype) |
959 | { | 963 | { |
960 | unsigned long start_pfn, end_pfn; | 964 | unsigned long start_pfn, end_pfn; |
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
1158 | to_drain = pcp->batch; | 1162 | to_drain = pcp->batch; |
1159 | else | 1163 | else |
1160 | to_drain = pcp->count; | 1164 | to_drain = pcp->count; |
1161 | free_pcppages_bulk(zone, to_drain, pcp); | 1165 | if (to_drain > 0) { |
1162 | pcp->count -= to_drain; | 1166 | free_pcppages_bulk(zone, to_drain, pcp); |
1167 | pcp->count -= to_drain; | ||
1168 | } | ||
1163 | local_irq_restore(flags); | 1169 | local_irq_restore(flags); |
1164 | } | 1170 | } |
1165 | #endif | 1171 | #endif |
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str) | |||
1529 | } | 1535 | } |
1530 | __setup("fail_page_alloc=", setup_fail_page_alloc); | 1536 | __setup("fail_page_alloc=", setup_fail_page_alloc); |
1531 | 1537 | ||
1532 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1538 | static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1533 | { | 1539 | { |
1534 | if (order < fail_page_alloc.min_order) | 1540 | if (order < fail_page_alloc.min_order) |
1535 | return 0; | 1541 | return false; |
1536 | if (gfp_mask & __GFP_NOFAIL) | 1542 | if (gfp_mask & __GFP_NOFAIL) |
1537 | return 0; | 1543 | return false; |
1538 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | 1544 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) |
1539 | return 0; | 1545 | return false; |
1540 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | 1546 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) |
1541 | return 0; | 1547 | return false; |
1542 | 1548 | ||
1543 | return should_fail(&fail_page_alloc.attr, 1 << order); | 1549 | return should_fail(&fail_page_alloc.attr, 1 << order); |
1544 | } | 1550 | } |
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs); | |||
1578 | 1584 | ||
1579 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | 1585 | #else /* CONFIG_FAIL_PAGE_ALLOC */ |
1580 | 1586 | ||
1581 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1587 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1582 | { | 1588 | { |
1583 | return 0; | 1589 | return false; |
1584 | } | 1590 | } |
1585 | 1591 | ||
1586 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1592 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1594 | { | 1600 | { |
1595 | /* free_pages may go negative - that's OK */ | 1601 | /* free_pages may go negative - that's OK */ |
1596 | long min = mark; | 1602 | long min = mark; |
1603 | long lowmem_reserve = z->lowmem_reserve[classzone_idx]; | ||
1597 | int o; | 1604 | int o; |
1598 | 1605 | ||
1599 | free_pages -= (1 << order) - 1; | 1606 | free_pages -= (1 << order) - 1; |
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1602 | if (alloc_flags & ALLOC_HARDER) | 1609 | if (alloc_flags & ALLOC_HARDER) |
1603 | min -= min / 4; | 1610 | min -= min / 4; |
1604 | 1611 | ||
1605 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1612 | if (free_pages <= min + lowmem_reserve) |
1606 | return false; | 1613 | return false; |
1607 | for (o = 0; o < order; o++) { | 1614 | for (o = 0; o < order; o++) { |
1608 | /* At the next order, this order's pages become unavailable */ | 1615 | /* At the next order, this order's pages become unavailable */ |
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1617 | return true; | 1624 | return true; |
1618 | } | 1625 | } |
1619 | 1626 | ||
1627 | #ifdef CONFIG_MEMORY_ISOLATION | ||
1628 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1629 | { | ||
1630 | if (unlikely(zone->nr_pageblock_isolate)) | ||
1631 | return zone->nr_pageblock_isolate * pageblock_nr_pages; | ||
1632 | return 0; | ||
1633 | } | ||
1634 | #else | ||
1635 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1636 | { | ||
1637 | return 0; | ||
1638 | } | ||
1639 | #endif | ||
1640 | |||
1620 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1641 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1621 | int classzone_idx, int alloc_flags) | 1642 | int classzone_idx, int alloc_flags) |
1622 | { | 1643 | { |
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
1632 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | 1653 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) |
1633 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | 1654 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); |
1634 | 1655 | ||
1656 | /* | ||
1657 | * If the zone has MIGRATE_ISOLATE type free pages, we should consider | ||
1658 | * it. nr_zone_isolate_freepages is never accurate so kswapd might not | ||
1659 | * sleep although it could do so. But this is more desirable for memory | ||
1660 | * hotplug than sleeping which can cause a livelock in the direct | ||
1661 | * reclaim path. | ||
1662 | */ | ||
1663 | free_pages -= nr_zone_isolate_freepages(z); | ||
1635 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1664 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1636 | free_pages); | 1665 | free_pages); |
1637 | } | 1666 | } |
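The watermark hunks above cache the classzone's lowmem_reserve and subtract MIGRATE_ISOLATE free pages before testing the mark, while higher orders progressively lose the pages tied up in lower orders. A standalone approximation of that check, with a mocked-up zone rather than struct zone:

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

struct zone_model {
	long free_pages;
	long free_area[MAX_ORDER];	/* free pages per order */
	long lowmem_reserve;
	long isolated_free;		/* roughly, MIGRATE_ISOLATE free pages */
};

static bool watermark_ok(const struct zone_model *z, int order, long mark,
			 bool alloc_high, bool alloc_harder)
{
	long min = mark;
	long free = z->free_pages - z->isolated_free - ((1L << order) - 1);

	if (alloc_high)
		min -= min / 2;
	if (alloc_harder)
		min -= min / 4;

	if (free <= min + z->lowmem_reserve)
		return false;
	for (int o = 0; o < order; o++) {
		free -= z->free_area[o];	/* unusable at this order */
		min >>= 1;
		if (free <= min)
			return false;
	}
	return true;
}

int main(void)
{
	struct zone_model z = {
		.free_pages = 4096,
		.free_area = { 1024, 512, 256 },
		.lowmem_reserve = 256,
		.isolated_free = 512,
	};
	printf("order-0 ok: %d\n", watermark_ok(&z, 0, 1024, false, false));
	printf("order-3 ok: %d\n", watermark_ok(&z, 3, 1024, false, false));
	return 0;
}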
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2087 | 2116 | ||
2088 | page = get_page_from_freelist(gfp_mask, nodemask, | 2117 | page = get_page_from_freelist(gfp_mask, nodemask, |
2089 | order, zonelist, high_zoneidx, | 2118 | order, zonelist, high_zoneidx, |
2090 | alloc_flags, preferred_zone, | 2119 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2091 | migratetype); | 2120 | preferred_zone, migratetype); |
2092 | if (page) { | 2121 | if (page) { |
2093 | preferred_zone->compact_considered = 0; | 2122 | preferred_zone->compact_considered = 0; |
2094 | preferred_zone->compact_defer_shift = 0; | 2123 | preferred_zone->compact_defer_shift = 0; |
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2180 | retry: | 2209 | retry: |
2181 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2210 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2182 | zonelist, high_zoneidx, | 2211 | zonelist, high_zoneidx, |
2183 | alloc_flags, preferred_zone, | 2212 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2184 | migratetype); | 2213 | preferred_zone, migratetype); |
2185 | 2214 | ||
2186 | /* | 2215 | /* |
2187 | * If an allocation failed after direct reclaim, it could be because | 2216 | * If an allocation failed after direct reclaim, it could be because |
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
2265 | alloc_flags |= ALLOC_HARDER; | 2294 | alloc_flags |= ALLOC_HARDER; |
2266 | 2295 | ||
2267 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | 2296 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { |
2268 | if (!in_interrupt() && | 2297 | if (gfp_mask & __GFP_MEMALLOC) |
2269 | ((current->flags & PF_MEMALLOC) || | 2298 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2270 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 2299 | else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) |
2300 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
2301 | else if (!in_interrupt() && | ||
2302 | ((current->flags & PF_MEMALLOC) || | ||
2303 | unlikely(test_thread_flag(TIF_MEMDIE)))) | ||
2271 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2304 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2272 | } | 2305 | } |
2273 | 2306 | ||
2274 | return alloc_flags; | 2307 | return alloc_flags; |
2275 | } | 2308 | } |
2276 | 2309 | ||
2310 | bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | ||
2311 | { | ||
2312 | return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); | ||
2313 | } | ||
2314 | |||
2277 | static inline struct page * | 2315 | static inline struct page * |
2278 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2316 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2279 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2317 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
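gfp_to_alloc_flags() now grants ALLOC_NO_WATERMARKS for __GFP_MEMALLOC and for PF_MEMALLOC tasks serving softirqs, and gfp_pfmemalloc_allowed() exposes that decision. The sketch below models the same precedence with invented flag values and a toy context struct; the in_interrupt()/in_serving_softirq() semantics are only approximated here.

#include <stdbool.h>
#include <stdio.h>

#define GFP_NOMEMALLOC	0x1	/* illustrative values, not the kernel's */
#define GFP_MEMALLOC	0x2

struct ctx {
	bool pf_memalloc;	/* like current->flags & PF_MEMALLOC */
	bool in_softirq;
	bool in_hardirq;
	bool tif_memdie;	/* OOM victim */
};

static bool pfmemalloc_allowed(unsigned int gfp, const struct ctx *c)
{
	if (gfp & GFP_NOMEMALLOC)
		return false;		/* caller explicitly refuses reserves */
	if (gfp & GFP_MEMALLOC)
		return true;		/* caller explicitly requests reserves */
	if (c->in_softirq && c->pf_memalloc)
		return true;		/* e.g. swap-over-network softirq work */
	if (!c->in_softirq && !c->in_hardirq && (c->pf_memalloc || c->tif_memdie))
		return true;		/* reclaimer or OOM victim in task context */
	return false;
}

int main(void)
{
	struct ctx swap_softirq = { .pf_memalloc = true, .in_softirq = true };
	struct ctx plain = { 0 };

	printf("softirq + PF_MEMALLOC: %d\n", pfmemalloc_allowed(0, &swap_softirq));
	printf("plain allocation:      %d\n", pfmemalloc_allowed(0, &plain));
	return 0;
}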
@@ -2340,11 +2378,27 @@ rebalance: | |||
2340 | 2378 | ||
2341 | /* Allocate without watermarks if the context allows */ | 2379 | /* Allocate without watermarks if the context allows */ |
2342 | if (alloc_flags & ALLOC_NO_WATERMARKS) { | 2380 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
2381 | /* | ||
2382 | * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds | ||
2383 | * the allocation is high priority and these types of | ||
2384 | * allocations are system rather than user oriented | ||
2385 | */ | ||
2386 | zonelist = node_zonelist(numa_node_id(), gfp_mask); | ||
2387 | |||
2343 | page = __alloc_pages_high_priority(gfp_mask, order, | 2388 | page = __alloc_pages_high_priority(gfp_mask, order, |
2344 | zonelist, high_zoneidx, nodemask, | 2389 | zonelist, high_zoneidx, nodemask, |
2345 | preferred_zone, migratetype); | 2390 | preferred_zone, migratetype); |
2346 | if (page) | 2391 | if (page) { |
2392 | /* | ||
2393 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | ||
2394 | * necessary to allocate the page. The expectation is | ||
2395 | * that the caller is taking steps that will free more | ||
2396 | * memory. The caller should avoid the page being used | ||
2397 | * for !PFMEMALLOC purposes. | ||
2398 | */ | ||
2399 | page->pfmemalloc = true; | ||
2347 | goto got_pg; | 2400 | goto got_pg; |
2401 | } | ||
2348 | } | 2402 | } |
2349 | 2403 | ||
2350 | /* Atomic allocations - we can't balance anything */ | 2404 | /* Atomic allocations - we can't balance anything */ |
@@ -2463,8 +2517,8 @@ nopage: | |||
2463 | got_pg: | 2517 | got_pg: |
2464 | if (kmemcheck_enabled) | 2518 | if (kmemcheck_enabled) |
2465 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | 2519 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); |
2466 | return page; | ||
2467 | 2520 | ||
2521 | return page; | ||
2468 | } | 2522 | } |
2469 | 2523 | ||
2470 | /* | 2524 | /* |
@@ -2515,6 +2569,8 @@ retry_cpuset: | |||
2515 | page = __alloc_pages_slowpath(gfp_mask, order, | 2569 | page = __alloc_pages_slowpath(gfp_mask, order, |
2516 | zonelist, high_zoneidx, nodemask, | 2570 | zonelist, high_zoneidx, nodemask, |
2517 | preferred_zone, migratetype); | 2571 | preferred_zone, migratetype); |
2572 | else | ||
2573 | page->pfmemalloc = false; | ||
2518 | 2574 | ||
2519 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2575 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2520 | 2576 | ||
@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
3030 | user_zonelist_order = oldval; | 3086 | user_zonelist_order = oldval; |
3031 | } else if (oldval != user_zonelist_order) { | 3087 | } else if (oldval != user_zonelist_order) { |
3032 | mutex_lock(&zonelists_mutex); | 3088 | mutex_lock(&zonelists_mutex); |
3033 | build_all_zonelists(NULL); | 3089 | build_all_zonelists(NULL, NULL); |
3034 | mutex_unlock(&zonelists_mutex); | 3090 | mutex_unlock(&zonelists_mutex); |
3035 | } | 3091 | } |
3036 | } | 3092 | } |
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone); | |||
3409 | DEFINE_MUTEX(zonelists_mutex); | 3465 | DEFINE_MUTEX(zonelists_mutex); |
3410 | 3466 | ||
3411 | /* return values int ....just for stop_machine() */ | 3467 | /* return values int ....just for stop_machine() */ |
3412 | static __init_refok int __build_all_zonelists(void *data) | 3468 | static int __build_all_zonelists(void *data) |
3413 | { | 3469 | { |
3414 | int nid; | 3470 | int nid; |
3415 | int cpu; | 3471 | int cpu; |
3472 | pg_data_t *self = data; | ||
3416 | 3473 | ||
3417 | #ifdef CONFIG_NUMA | 3474 | #ifdef CONFIG_NUMA |
3418 | memset(node_load, 0, sizeof(node_load)); | 3475 | memset(node_load, 0, sizeof(node_load)); |
3419 | #endif | 3476 | #endif |
3477 | |||
3478 | if (self && !node_online(self->node_id)) { | ||
3479 | build_zonelists(self); | ||
3480 | build_zonelist_cache(self); | ||
3481 | } | ||
3482 | |||
3420 | for_each_online_node(nid) { | 3483 | for_each_online_node(nid) { |
3421 | pg_data_t *pgdat = NODE_DATA(nid); | 3484 | pg_data_t *pgdat = NODE_DATA(nid); |
3422 | 3485 | ||
@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data) | |||
3461 | * Called with zonelists_mutex held always | 3524 | * Called with zonelists_mutex held always |
3462 | * unless system_state == SYSTEM_BOOTING. | 3525 | * unless system_state == SYSTEM_BOOTING. |
3463 | */ | 3526 | */ |
3464 | void __ref build_all_zonelists(void *data) | 3527 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) |
3465 | { | 3528 | { |
3466 | set_zonelist_order(); | 3529 | set_zonelist_order(); |
3467 | 3530 | ||
@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data) | |||
3473 | /* we have to stop all cpus to guarantee there is no user | 3536 | /* we have to stop all cpus to guarantee there is no user |
3474 | of zonelist */ | 3537 | of zonelist */ |
3475 | #ifdef CONFIG_MEMORY_HOTPLUG | 3538 | #ifdef CONFIG_MEMORY_HOTPLUG |
3476 | if (data) | 3539 | if (zone) |
3477 | setup_zone_pageset((struct zone *)data); | 3540 | setup_zone_pageset(zone); |
3478 | #endif | 3541 | #endif |
3479 | stop_machine(__build_all_zonelists, NULL, NULL); | 3542 | stop_machine(__build_all_zonelists, pgdat, NULL); |
3480 | /* cpuset refresh routine should be here */ | 3543 | /* cpuset refresh routine should be here */ |
3481 | } | 3544 | } |
3482 | vm_total_pages = nr_free_pagecache_pages(); | 3545 | vm_total_pages = nr_free_pagecache_pages(); |
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) | |||
3746 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) | 3809 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) |
3747 | #endif | 3810 | #endif |
3748 | 3811 | ||
3749 | static int zone_batchsize(struct zone *zone) | 3812 | static int __meminit zone_batchsize(struct zone *zone) |
3750 | { | 3813 | { |
3751 | #ifdef CONFIG_MMU | 3814 | #ifdef CONFIG_MMU |
3752 | int batch; | 3815 | int batch; |
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
3828 | pcp->batch = PAGE_SHIFT * 8; | 3891 | pcp->batch = PAGE_SHIFT * 8; |
3829 | } | 3892 | } |
3830 | 3893 | ||
3831 | static void setup_zone_pageset(struct zone *zone) | 3894 | static void __meminit setup_zone_pageset(struct zone *zone) |
3832 | { | 3895 | { |
3833 | int cpu; | 3896 | int cpu; |
3834 | 3897 | ||
@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
3901 | return 0; | 3964 | return 0; |
3902 | } | 3965 | } |
3903 | 3966 | ||
3904 | static int __zone_pcp_update(void *data) | ||
3905 | { | ||
3906 | struct zone *zone = data; | ||
3907 | int cpu; | ||
3908 | unsigned long batch = zone_batchsize(zone), flags; | ||
3909 | |||
3910 | for_each_possible_cpu(cpu) { | ||
3911 | struct per_cpu_pageset *pset; | ||
3912 | struct per_cpu_pages *pcp; | ||
3913 | |||
3914 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
3915 | pcp = &pset->pcp; | ||
3916 | |||
3917 | local_irq_save(flags); | ||
3918 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
3919 | setup_pageset(pset, batch); | ||
3920 | local_irq_restore(flags); | ||
3921 | } | ||
3922 | return 0; | ||
3923 | } | ||
3924 | |||
3925 | void zone_pcp_update(struct zone *zone) | ||
3926 | { | ||
3927 | stop_machine(__zone_pcp_update, zone, NULL); | ||
3928 | } | ||
3929 | |||
3930 | static __meminit void zone_pcp_init(struct zone *zone) | 3967 | static __meminit void zone_pcp_init(struct zone *zone) |
3931 | { | 3968 | { |
3932 | /* | 3969 | /* |
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
3942 | zone_batchsize(zone)); | 3979 | zone_batchsize(zone)); |
3943 | } | 3980 | } |
3944 | 3981 | ||
3945 | __meminit int init_currently_empty_zone(struct zone *zone, | 3982 | int __meminit init_currently_empty_zone(struct zone *zone, |
3946 | unsigned long zone_start_pfn, | 3983 | unsigned long zone_start_pfn, |
3947 | unsigned long size, | 3984 | unsigned long size, |
3948 | enum memmap_context context) | 3985 | enum memmap_context context) |
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, | |||
4301 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 4338 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
4302 | 4339 | ||
4303 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | 4340 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ |
4304 | static inline void __init set_pageblock_order(void) | 4341 | void __init set_pageblock_order(void) |
4305 | { | 4342 | { |
4306 | unsigned int order; | 4343 | unsigned int order; |
4307 | 4344 | ||
@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void) | |||
4329 | * include/linux/pageblock-flags.h for the values of pageblock_order based on | 4366 | * include/linux/pageblock-flags.h for the values of pageblock_order based on |
4330 | * the kernel config | 4367 | * the kernel config |
4331 | */ | 4368 | */ |
4332 | static inline void set_pageblock_order(void) | 4369 | void __init set_pageblock_order(void) |
4333 | { | 4370 | { |
4334 | } | 4371 | } |
4335 | 4372 | ||
@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void) | |||
4340 | * - mark all pages reserved | 4377 | * - mark all pages reserved |
4341 | * - mark all memory queues empty | 4378 | * - mark all memory queues empty |
4342 | * - clear the memory bitmaps | 4379 | * - clear the memory bitmaps |
4380 | * | ||
4381 | * NOTE: pgdat should get zeroed by caller. | ||
4343 | */ | 4382 | */ |
4344 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, | 4383 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, |
4345 | unsigned long *zones_size, unsigned long *zholes_size) | 4384 | unsigned long *zones_size, unsigned long *zholes_size) |
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4350 | int ret; | 4389 | int ret; |
4351 | 4390 | ||
4352 | pgdat_resize_init(pgdat); | 4391 | pgdat_resize_init(pgdat); |
4353 | pgdat->nr_zones = 0; | ||
4354 | init_waitqueue_head(&pgdat->kswapd_wait); | 4392 | init_waitqueue_head(&pgdat->kswapd_wait); |
4355 | pgdat->kswapd_max_order = 0; | 4393 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4356 | pgdat_page_cgroup_init(pgdat); | 4394 | pgdat_page_cgroup_init(pgdat); |
4357 | 4395 | ||
4358 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4396 | for (j = 0; j < MAX_NR_ZONES; j++) { |
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4394 | 4432 | ||
4395 | zone->spanned_pages = size; | 4433 | zone->spanned_pages = size; |
4396 | zone->present_pages = realsize; | 4434 | zone->present_pages = realsize; |
4435 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
4436 | zone->compact_cached_free_pfn = zone->zone_start_pfn + | ||
4437 | zone->spanned_pages; | ||
4438 | zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); | ||
4439 | #endif | ||
4397 | #ifdef CONFIG_NUMA | 4440 | #ifdef CONFIG_NUMA |
4398 | zone->node = nid; | 4441 | zone->node = nid; |
4399 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4442 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4408 | 4451 | ||
4409 | zone_pcp_init(zone); | 4452 | zone_pcp_init(zone); |
4410 | lruvec_init(&zone->lruvec, zone); | 4453 | lruvec_init(&zone->lruvec, zone); |
4411 | zap_zone_vm_stats(zone); | ||
4412 | zone->flags = 0; | ||
4413 | if (!size) | 4454 | if (!size) |
4414 | continue; | 4455 | continue; |
4415 | 4456 | ||
@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4469 | { | 4510 | { |
4470 | pg_data_t *pgdat = NODE_DATA(nid); | 4511 | pg_data_t *pgdat = NODE_DATA(nid); |
4471 | 4512 | ||
4513 | /* pg_data_t should be reset to zero when it's allocated */ | ||
4514 | WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx); | ||
4515 | |||
4472 | pgdat->node_id = nid; | 4516 | pgdat->node_id = nid; |
4473 | pgdat->node_start_pfn = node_start_pfn; | 4517 | pgdat->node_start_pfn = node_start_pfn; |
4474 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 4518 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
@@ -4750,7 +4794,7 @@ out: | |||
4750 | } | 4794 | } |
4751 | 4795 | ||
4752 | /* Any regular memory on that node ? */ | 4796 | /* Any regular memory on that node ? */ |
4753 | static void check_for_regular_memory(pg_data_t *pgdat) | 4797 | static void __init check_for_regular_memory(pg_data_t *pgdat) |
4754 | { | 4798 | { |
4755 | #ifdef CONFIG_HIGHMEM | 4799 | #ifdef CONFIG_HIGHMEM |
4756 | enum zone_type zone_type; | 4800 | enum zone_type zone_type; |
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5468 | } | 5512 | } |
5469 | 5513 | ||
5470 | /* | 5514 | /* |
5471 | * This is designed as sub function...plz see page_isolation.c also. | 5515 | * This function checks whether pageblock includes unmovable pages or not. |
5472 | * set/clear page block's type to be ISOLATE. | 5516 | * If @count is not zero, it is okay to include less @count unmovable pages |
5473 | * page allocater never alloc memory from ISOLATE block. | 5517 | * |
5518 | * PageLRU check without isolation or lru_lock could race so that | ||
5519 | * MIGRATE_MOVABLE block might include unmovable pages. It means you can't | ||
5520 | * expect this function should be exact. | ||
5474 | */ | 5521 | */ |
5475 | 5522 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count) | |
5476 | static int | ||
5477 | __count_immobile_pages(struct zone *zone, struct page *page, int count) | ||
5478 | { | 5523 | { |
5479 | unsigned long pfn, iter, found; | 5524 | unsigned long pfn, iter, found; |
5480 | int mt; | 5525 | int mt; |
5481 | 5526 | ||
5482 | /* | 5527 | /* |
5483 | * For avoiding noise data, lru_add_drain_all() should be called | 5528 | * For avoiding noise data, lru_add_drain_all() should be called |
5484 | * If ZONE_MOVABLE, the zone never contains immobile pages | 5529 | * If ZONE_MOVABLE, the zone never contains unmovable pages |
5485 | */ | 5530 | */ |
5486 | if (zone_idx(zone) == ZONE_MOVABLE) | 5531 | if (zone_idx(zone) == ZONE_MOVABLE) |
5487 | return true; | 5532 | return false; |
5488 | mt = get_pageblock_migratetype(page); | 5533 | mt = get_pageblock_migratetype(page); |
5489 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) | 5534 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) |
5490 | return true; | 5535 | return false; |
5491 | 5536 | ||
5492 | pfn = page_to_pfn(page); | 5537 | pfn = page_to_pfn(page); |
5493 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { | 5538 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { |
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) | |||
5497 | continue; | 5542 | continue; |
5498 | 5543 | ||
5499 | page = pfn_to_page(check); | 5544 | page = pfn_to_page(check); |
5500 | if (!page_count(page)) { | 5545 | /* |
5546 | * We can't use page_count without pinning the page | ||
5547 | * because another CPU can free a compound page. | ||
5548 | * This check already skips compound tails of THP | ||
5549 | * because their page->_count is zero at all times. | ||
5550 | */ | ||
5551 | if (!atomic_read(&page->_count)) { | ||
5501 | if (PageBuddy(page)) | 5552 | if (PageBuddy(page)) |
5502 | iter += (1 << page_order(page)) - 1; | 5553 | iter += (1 << page_order(page)) - 1; |
5503 | continue; | 5554 | continue; |
5504 | } | 5555 | } |
5556 | |||
5505 | if (!PageLRU(page)) | 5557 | if (!PageLRU(page)) |
5506 | found++; | 5558 | found++; |
5507 | /* | 5559 | /* |
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) | |||
5518 | * page at boot. | 5570 | * page at boot. |
5519 | */ | 5571 | */ |
5520 | if (found > count) | 5572 | if (found > count) |
5521 | return false; | 5573 | return true; |
5522 | } | 5574 | } |
5523 | return true; | 5575 | return false; |
5524 | } | 5576 | } |
5525 | 5577 | ||
5526 | bool is_pageblock_removable_nolock(struct page *page) | 5578 | bool is_pageblock_removable_nolock(struct page *page) |
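__count_immobile_pages() becomes has_unmovable_pages() with the boolean sense inverted: it now answers "does this pageblock pin anything unmovable?". A simplified userspace scan illustrating that semantic against a synthetic page array; there is no real struct page here.

#include <stdbool.h>
#include <stdio.h>

struct page_model {
	int refcount;	/* 0 means free */
	bool on_lru;	/* movable via reclaim/migration */
};

static bool has_unmovable_pages(const struct page_model *blk, int nr, int allowance)
{
	int found = 0;

	for (int i = 0; i < nr; i++) {
		if (blk[i].refcount == 0)
			continue;		/* free page: nothing pinned here */
		if (!blk[i].on_lru)
			found++;		/* pinned and not migratable */
		if (found > allowance)
			return true;
	}
	return false;
}

int main(void)
{
	struct page_model blk[8] = {
		{ 1, true }, { 0, false }, { 1, false }, { 1, true },
	};
	printf("unmovable (allowance 0): %d\n", has_unmovable_pages(blk, 8, 0));
	printf("unmovable (allowance 1): %d\n", has_unmovable_pages(blk, 8, 1));
	return 0;
}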
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page) | |||
5544 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | 5596 | zone->zone_start_pfn + zone->spanned_pages <= pfn) |
5545 | return false; | 5597 | return false; |
5546 | 5598 | ||
5547 | return __count_immobile_pages(zone, page, 0); | 5599 | return !has_unmovable_pages(zone, page, 0); |
5548 | } | ||
5549 | |||
5550 | int set_migratetype_isolate(struct page *page) | ||
5551 | { | ||
5552 | struct zone *zone; | ||
5553 | unsigned long flags, pfn; | ||
5554 | struct memory_isolate_notify arg; | ||
5555 | int notifier_ret; | ||
5556 | int ret = -EBUSY; | ||
5557 | |||
5558 | zone = page_zone(page); | ||
5559 | |||
5560 | spin_lock_irqsave(&zone->lock, flags); | ||
5561 | |||
5562 | pfn = page_to_pfn(page); | ||
5563 | arg.start_pfn = pfn; | ||
5564 | arg.nr_pages = pageblock_nr_pages; | ||
5565 | arg.pages_found = 0; | ||
5566 | |||
5567 | /* | ||
5568 | * It may be possible to isolate a pageblock even if the | ||
5569 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
5570 | * notifier chain is used by balloon drivers to return the | ||
5571 | * number of pages in a range that are held by the balloon | ||
5572 | * driver to shrink memory. If all the pages are accounted for | ||
5573 | * by balloons, are free, or on the LRU, isolation can continue. | ||
5574 | * Later, for example, when memory hotplug notifier runs, these | ||
5575 | * pages reported as "can be isolated" should be isolated(freed) | ||
5576 | * by the balloon driver through the memory notifier chain. | ||
5577 | */ | ||
5578 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); | ||
5579 | notifier_ret = notifier_to_errno(notifier_ret); | ||
5580 | if (notifier_ret) | ||
5581 | goto out; | ||
5582 | /* | ||
5583 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | ||
5584 | * We just check MOVABLE pages. | ||
5585 | */ | ||
5586 | if (__count_immobile_pages(zone, page, arg.pages_found)) | ||
5587 | ret = 0; | ||
5588 | |||
5589 | /* | ||
5590 | * immobile means "not-on-lru" paes. If immobile is larger than | ||
5591 | * removable-by-driver pages reported by notifier, we'll fail. | ||
5592 | */ | ||
5593 | |||
5594 | out: | ||
5595 | if (!ret) { | ||
5596 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
5597 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
5598 | } | ||
5599 | |||
5600 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5601 | if (!ret) | ||
5602 | drain_all_pages(); | ||
5603 | return ret; | ||
5604 | } | ||
5605 | |||
5606 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | ||
5607 | { | ||
5608 | struct zone *zone; | ||
5609 | unsigned long flags; | ||
5610 | zone = page_zone(page); | ||
5611 | spin_lock_irqsave(&zone->lock, flags); | ||
5612 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
5613 | goto out; | ||
5614 | set_pageblock_migratetype(page, migratetype); | ||
5615 | move_freepages_block(zone, page, migratetype); | ||
5616 | out: | ||
5617 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5618 | } | 5600 | } |
5619 | 5601 | ||
5620 | #ifdef CONFIG_CMA | 5602 | #ifdef CONFIG_CMA |
@@ -5869,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) | |||
5869 | } | 5851 | } |
5870 | #endif | 5852 | #endif |
5871 | 5853 | ||
5854 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
5855 | static int __meminit __zone_pcp_update(void *data) | ||
5856 | { | ||
5857 | struct zone *zone = data; | ||
5858 | int cpu; | ||
5859 | unsigned long batch = zone_batchsize(zone), flags; | ||
5860 | |||
5861 | for_each_possible_cpu(cpu) { | ||
5862 | struct per_cpu_pageset *pset; | ||
5863 | struct per_cpu_pages *pcp; | ||
5864 | |||
5865 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
5866 | pcp = &pset->pcp; | ||
5867 | |||
5868 | local_irq_save(flags); | ||
5869 | if (pcp->count > 0) | ||
5870 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
5871 | setup_pageset(pset, batch); | ||
5872 | local_irq_restore(flags); | ||
5873 | } | ||
5874 | return 0; | ||
5875 | } | ||
5876 | |||
5877 | void __meminit zone_pcp_update(struct zone *zone) | ||
5878 | { | ||
5879 | stop_machine(__zone_pcp_update, zone, NULL); | ||
5880 | } | ||
5881 | #endif | ||
5882 | |||
5872 | #ifdef CONFIG_MEMORY_HOTREMOVE | 5883 | #ifdef CONFIG_MEMORY_HOTREMOVE |
5884 | void zone_pcp_reset(struct zone *zone) | ||
5885 | { | ||
5886 | unsigned long flags; | ||
5887 | |||
5888 | /* avoid races with drain_pages() */ | ||
5889 | local_irq_save(flags); | ||
5890 | if (zone->pageset != &boot_pageset) { | ||
5891 | free_percpu(zone->pageset); | ||
5892 | zone->pageset = &boot_pageset; | ||
5893 | } | ||
5894 | local_irq_restore(flags); | ||
5895 | } | ||
5896 | |||
5873 | /* | 5897 | /* |
5874 | * All pages in the range must be isolated before calling this. | 5898 | * All pages in the range must be isolated before calling this. |
5875 | */ | 5899 | */ |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index eb750f851395..5ddad0c6daa6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | |||
317 | #endif | 317 | #endif |
318 | 318 | ||
319 | 319 | ||
320 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 320 | #ifdef CONFIG_MEMCG_SWAP |
321 | 321 | ||
322 | static DEFINE_MUTEX(swap_cgroup_mutex); | 322 | static DEFINE_MUTEX(swap_cgroup_mutex); |
323 | struct swap_cgroup_ctrl { | 323 | struct swap_cgroup_ctrl { |
diff --git a/mm/page_io.c b/mm/page_io.c index 34f02923744c..78eee32ee486 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
18 | #include <linux/bio.h> | 18 | #include <linux/bio.h> |
19 | #include <linux/swapops.h> | 19 | #include <linux/swapops.h> |
20 | #include <linux/buffer_head.h> | ||
20 | #include <linux/writeback.h> | 21 | #include <linux/writeback.h> |
21 | #include <linux/frontswap.h> | 22 | #include <linux/frontswap.h> |
22 | #include <asm/pgtable.h> | 23 | #include <asm/pgtable.h> |
@@ -86,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err) | |||
86 | bio_put(bio); | 87 | bio_put(bio); |
87 | } | 88 | } |
88 | 89 | ||
90 | int generic_swapfile_activate(struct swap_info_struct *sis, | ||
91 | struct file *swap_file, | ||
92 | sector_t *span) | ||
93 | { | ||
94 | struct address_space *mapping = swap_file->f_mapping; | ||
95 | struct inode *inode = mapping->host; | ||
96 | unsigned blocks_per_page; | ||
97 | unsigned long page_no; | ||
98 | unsigned blkbits; | ||
99 | sector_t probe_block; | ||
100 | sector_t last_block; | ||
101 | sector_t lowest_block = -1; | ||
102 | sector_t highest_block = 0; | ||
103 | int nr_extents = 0; | ||
104 | int ret; | ||
105 | |||
106 | blkbits = inode->i_blkbits; | ||
107 | blocks_per_page = PAGE_SIZE >> blkbits; | ||
108 | |||
109 | /* | ||
110 | * Map all the blocks into the extent list. This code doesn't try | ||
111 | * to be very smart. | ||
112 | */ | ||
113 | probe_block = 0; | ||
114 | page_no = 0; | ||
115 | last_block = i_size_read(inode) >> blkbits; | ||
116 | while ((probe_block + blocks_per_page) <= last_block && | ||
117 | page_no < sis->max) { | ||
118 | unsigned block_in_page; | ||
119 | sector_t first_block; | ||
120 | |||
121 | first_block = bmap(inode, probe_block); | ||
122 | if (first_block == 0) | ||
123 | goto bad_bmap; | ||
124 | |||
125 | /* | ||
126 | * It must be PAGE_SIZE aligned on-disk | ||
127 | */ | ||
128 | if (first_block & (blocks_per_page - 1)) { | ||
129 | probe_block++; | ||
130 | goto reprobe; | ||
131 | } | ||
132 | |||
133 | for (block_in_page = 1; block_in_page < blocks_per_page; | ||
134 | block_in_page++) { | ||
135 | sector_t block; | ||
136 | |||
137 | block = bmap(inode, probe_block + block_in_page); | ||
138 | if (block == 0) | ||
139 | goto bad_bmap; | ||
140 | if (block != first_block + block_in_page) { | ||
141 | /* Discontiguity */ | ||
142 | probe_block++; | ||
143 | goto reprobe; | ||
144 | } | ||
145 | } | ||
146 | |||
147 | first_block >>= (PAGE_SHIFT - blkbits); | ||
148 | if (page_no) { /* exclude the header page */ | ||
149 | if (first_block < lowest_block) | ||
150 | lowest_block = first_block; | ||
151 | if (first_block > highest_block) | ||
152 | highest_block = first_block; | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks | ||
157 | */ | ||
158 | ret = add_swap_extent(sis, page_no, 1, first_block); | ||
159 | if (ret < 0) | ||
160 | goto out; | ||
161 | nr_extents += ret; | ||
162 | page_no++; | ||
163 | probe_block += blocks_per_page; | ||
164 | reprobe: | ||
165 | continue; | ||
166 | } | ||
167 | ret = nr_extents; | ||
168 | *span = 1 + highest_block - lowest_block; | ||
169 | if (page_no == 0) | ||
170 | page_no = 1; /* force Empty message */ | ||
171 | sis->max = page_no; | ||
172 | sis->pages = page_no - 1; | ||
173 | sis->highest_bit = page_no - 1; | ||
174 | out: | ||
175 | return ret; | ||
176 | bad_bmap: | ||
177 | printk(KERN_ERR "swapon: swapfile has holes\n"); | ||
178 | ret = -EINVAL; | ||
179 | goto out; | ||
180 | } | ||
181 | |||
89 | /* | 182 | /* |
90 | * We may have stale swap cache pages in memory: notice | 183 | * We may have stale swap cache pages in memory: notice |
91 | * them here and get rid of the unnecessary final write. | 184 | * them here and get rid of the unnecessary final write. |
@@ -94,6 +187,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
94 | { | 187 | { |
95 | struct bio *bio; | 188 | struct bio *bio; |
96 | int ret = 0, rw = WRITE; | 189 | int ret = 0, rw = WRITE; |
190 | struct swap_info_struct *sis = page_swap_info(page); | ||
97 | 191 | ||
98 | if (try_to_free_swap(page)) { | 192 | if (try_to_free_swap(page)) { |
99 | unlock_page(page); | 193 | unlock_page(page); |
@@ -105,6 +199,33 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
105 | end_page_writeback(page); | 199 | end_page_writeback(page); |
106 | goto out; | 200 | goto out; |
107 | } | 201 | } |
202 | |||
203 | if (sis->flags & SWP_FILE) { | ||
204 | struct kiocb kiocb; | ||
205 | struct file *swap_file = sis->swap_file; | ||
206 | struct address_space *mapping = swap_file->f_mapping; | ||
207 | struct iovec iov = { | ||
208 | .iov_base = kmap(page), | ||
209 | .iov_len = PAGE_SIZE, | ||
210 | }; | ||
211 | |||
212 | init_sync_kiocb(&kiocb, swap_file); | ||
213 | kiocb.ki_pos = page_file_offset(page); | ||
214 | kiocb.ki_left = PAGE_SIZE; | ||
215 | kiocb.ki_nbytes = PAGE_SIZE; | ||
216 | |||
217 | unlock_page(page); | ||
218 | ret = mapping->a_ops->direct_IO(KERNEL_WRITE, | ||
219 | &kiocb, &iov, | ||
220 | kiocb.ki_pos, 1); | ||
221 | kunmap(page); | ||
222 | if (ret == PAGE_SIZE) { | ||
223 | count_vm_event(PSWPOUT); | ||
224 | ret = 0; | ||
225 | } | ||
226 | return ret; | ||
227 | } | ||
228 | |||
108 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); | 229 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); |
109 | if (bio == NULL) { | 230 | if (bio == NULL) { |
110 | set_page_dirty(page); | 231 | set_page_dirty(page); |
@@ -126,6 +247,7 @@ int swap_readpage(struct page *page) | |||
126 | { | 247 | { |
127 | struct bio *bio; | 248 | struct bio *bio; |
128 | int ret = 0; | 249 | int ret = 0; |
250 | struct swap_info_struct *sis = page_swap_info(page); | ||
129 | 251 | ||
130 | VM_BUG_ON(!PageLocked(page)); | 252 | VM_BUG_ON(!PageLocked(page)); |
131 | VM_BUG_ON(PageUptodate(page)); | 253 | VM_BUG_ON(PageUptodate(page)); |
@@ -134,6 +256,17 @@ int swap_readpage(struct page *page) | |||
134 | unlock_page(page); | 256 | unlock_page(page); |
135 | goto out; | 257 | goto out; |
136 | } | 258 | } |
259 | |||
260 | if (sis->flags & SWP_FILE) { | ||
261 | struct file *swap_file = sis->swap_file; | ||
262 | struct address_space *mapping = swap_file->f_mapping; | ||
263 | |||
264 | ret = mapping->a_ops->readpage(swap_file, page); | ||
265 | if (!ret) | ||
266 | count_vm_event(PSWPIN); | ||
267 | return ret; | ||
268 | } | ||
269 | |||
137 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); | 270 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
138 | if (bio == NULL) { | 271 | if (bio == NULL) { |
139 | unlock_page(page); | 272 | unlock_page(page); |
@@ -145,3 +278,15 @@ int swap_readpage(struct page *page) | |||
145 | out: | 278 | out: |
146 | return ret; | 279 | return ret; |
147 | } | 280 | } |
281 | |||
282 | int swap_set_page_dirty(struct page *page) | ||
283 | { | ||
284 | struct swap_info_struct *sis = page_swap_info(page); | ||
285 | |||
286 | if (sis->flags & SWP_FILE) { | ||
287 | struct address_space *mapping = sis->swap_file->f_mapping; | ||
288 | return mapping->a_ops->set_page_dirty(page); | ||
289 | } else { | ||
290 | return __set_page_dirty_no_writeback(page); | ||
291 | } | ||
292 | } | ||
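The SWP_FILE branches added to swap_writepage(), swap_readpage() and swap_set_page_dirty() above route swap I/O through the backing file's address_space_operations instead of submitting bios. As a rough, hypothetical sketch only (the filesystem name and hook bodies are invented, not part of this patch), a filesystem wanting this behaviour would install swap_activate/swap_deactivate hooks along these lines:

/* Hypothetical "myfs" hooks, installed as .swap_activate / .swap_deactivate
 * in the filesystem's address_space_operations. Returning 0 here makes the
 * rewritten setup_swap_extents() (see the mm/swapfile.c hunk later in this
 * patch) set SWP_FILE and add a single extent covering the whole swapfile,
 * so the paths above then use this mapping's ->readpage, ->direct_IO and
 * ->set_page_dirty. */
static int myfs_swap_activate(struct swap_info_struct *sis,
			      struct file *file, sector_t *span)
{
	/* validate the file and pin whatever ->direct_IO will need later */
	return 0;
}

static void myfs_swap_deactivate(struct file *file)
{
	/* release anything pinned in myfs_swap_activate() */
}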
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c9f04774f2b8..247d1f175739 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -5,8 +5,101 @@ | |||
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/page-isolation.h> | 6 | #include <linux/page-isolation.h> |
7 | #include <linux/pageblock-flags.h> | 7 | #include <linux/pageblock-flags.h> |
8 | #include <linux/memory.h> | ||
8 | #include "internal.h" | 9 | #include "internal.h" |
9 | 10 | ||
11 | /* called while holding zone->lock */ | ||
12 | static void set_pageblock_isolate(struct page *page) | ||
13 | { | ||
14 | if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) | ||
15 | return; | ||
16 | |||
17 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
18 | page_zone(page)->nr_pageblock_isolate++; | ||
19 | } | ||
20 | |||
21 | /* called while holding zone->lock */ | ||
22 | static void restore_pageblock_isolate(struct page *page, int migratetype) | ||
23 | { | ||
24 | struct zone *zone = page_zone(page); | ||
25 | if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) | ||
26 | return; | ||
27 | |||
28 | BUG_ON(zone->nr_pageblock_isolate <= 0); | ||
29 | set_pageblock_migratetype(page, migratetype); | ||
30 | zone->nr_pageblock_isolate--; | ||
31 | } | ||
32 | |||
33 | int set_migratetype_isolate(struct page *page) | ||
34 | { | ||
35 | struct zone *zone; | ||
36 | unsigned long flags, pfn; | ||
37 | struct memory_isolate_notify arg; | ||
38 | int notifier_ret; | ||
39 | int ret = -EBUSY; | ||
40 | |||
41 | zone = page_zone(page); | ||
42 | |||
43 | spin_lock_irqsave(&zone->lock, flags); | ||
44 | |||
45 | pfn = page_to_pfn(page); | ||
46 | arg.start_pfn = pfn; | ||
47 | arg.nr_pages = pageblock_nr_pages; | ||
48 | arg.pages_found = 0; | ||
49 | |||
50 | /* | ||
51 | * It may be possible to isolate a pageblock even if the | ||
52 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
53 | * notifier chain is used by balloon drivers to return the | ||
54 | * number of pages in a range that are held by the balloon | ||
55 | * driver to shrink memory. If all the pages are accounted for | ||
56 | * by balloons, are free, or on the LRU, isolation can continue. | ||
57 | * Later, for example, when memory hotplug notifier runs, these | ||
58 | * pages reported as "can be isolated" should be isolated (freed) | ||
59 | * by the balloon driver through the memory notifier chain. | ||
60 | */ | ||
61 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); | ||
62 | notifier_ret = notifier_to_errno(notifier_ret); | ||
63 | if (notifier_ret) | ||
64 | goto out; | ||
65 | /* | ||
66 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | ||
67 | * We just check MOVABLE pages. | ||
68 | */ | ||
69 | if (!has_unmovable_pages(zone, page, arg.pages_found)) | ||
70 | ret = 0; | ||
71 | |||
72 | /* | ||
73 | * immobile means "not-on-lru" pages. If immobile is larger than | ||
74 | * removable-by-driver pages reported by notifier, we'll fail. | ||
75 | */ | ||
76 | |||
77 | out: | ||
78 | if (!ret) { | ||
79 | set_pageblock_isolate(page); | ||
80 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
81 | } | ||
82 | |||
83 | spin_unlock_irqrestore(&zone->lock, flags); | ||
84 | if (!ret) | ||
85 | drain_all_pages(); | ||
86 | return ret; | ||
87 | } | ||
88 | |||
89 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | ||
90 | { | ||
91 | struct zone *zone; | ||
92 | unsigned long flags; | ||
93 | zone = page_zone(page); | ||
94 | spin_lock_irqsave(&zone->lock, flags); | ||
95 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
96 | goto out; | ||
97 | move_freepages_block(zone, page, migratetype); | ||
98 | restore_pageblock_isolate(page, migratetype); | ||
99 | out: | ||
100 | spin_unlock_irqrestore(&zone->lock, flags); | ||
101 | } | ||
102 | |||
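For orientation, the two functions above are intended to bracket work on a pageblock; the caller below is a hypothetical sketch (not taken from this patch) showing the expected pairing, which also keeps zone->nr_pageblock_isolate balanced:

/* Hypothetical caller: isolate, operate, then restore the pageblock. */
if (set_migratetype_isolate(page) == 0) {
	/* ... migrate or offline the pages in this pageblock ... */
	unset_migratetype_isolate(page, MIGRATE_MOVABLE);
}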
10 | static inline struct page * | 103 | static inline struct page * |
11 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) | 104 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) |
12 | { | 105 | { |
diff --git a/mm/shmem.c b/mm/shmem.c index c15b998e5a86..d4e184e2a38e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -929,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, | |||
929 | 929 | ||
930 | /* Create a pseudo vma that just contains the policy */ | 930 | /* Create a pseudo vma that just contains the policy */ |
931 | pvma.vm_start = 0; | 931 | pvma.vm_start = 0; |
932 | pvma.vm_pgoff = index; | 932 | /* Bias interleave by inode number to distribute better across nodes */ |
933 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; | ||
933 | pvma.vm_ops = NULL; | 934 | pvma.vm_ops = NULL; |
934 | pvma.vm_policy = spol; | 935 | pvma.vm_policy = spol; |
935 | return swapin_readahead(swap, gfp, &pvma, 0); | 936 | return swapin_readahead(swap, gfp, &pvma, 0); |
@@ -942,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
942 | 943 | ||
943 | /* Create a pseudo vma that just contains the policy */ | 944 | /* Create a pseudo vma that just contains the policy */ |
944 | pvma.vm_start = 0; | 945 | pvma.vm_start = 0; |
945 | pvma.vm_pgoff = index; | 946 | /* Bias interleave by inode number to distribute better across nodes */ |
947 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; | ||
946 | pvma.vm_ops = NULL; | 948 | pvma.vm_ops = NULL; |
947 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); | 949 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); |
948 | 950 | ||
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -118,12 +118,16 @@ | |||
118 | #include <linux/memory.h> | 118 | #include <linux/memory.h> |
119 | #include <linux/prefetch.h> | 119 | #include <linux/prefetch.h> |
120 | 120 | ||
121 | #include <net/sock.h> | ||
122 | |||
121 | #include <asm/cacheflush.h> | 123 | #include <asm/cacheflush.h> |
122 | #include <asm/tlbflush.h> | 124 | #include <asm/tlbflush.h> |
123 | #include <asm/page.h> | 125 | #include <asm/page.h> |
124 | 126 | ||
125 | #include <trace/events/kmem.h> | 127 | #include <trace/events/kmem.h> |
126 | 128 | ||
129 | #include "internal.h" | ||
130 | |||
127 | /* | 131 | /* |
128 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. | 132 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. |
129 | * 0 for faster, smaller code (especially in the critical paths). | 133 | * 0 for faster, smaller code (especially in the critical paths). |
@@ -152,6 +156,12 @@ | |||
152 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN | 156 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN |
153 | #endif | 157 | #endif |
154 | 158 | ||
159 | /* | ||
160 | * true if a page was allocated from pfmemalloc reserves for network-based | ||
161 | * swap | ||
162 | */ | ||
163 | static bool pfmemalloc_active __read_mostly; | ||
164 | |||
155 | /* Legal flag mask for kmem_cache_create(). */ | 165 | /* Legal flag mask for kmem_cache_create(). */ |
156 | #if DEBUG | 166 | #if DEBUG |
157 | # define CREATE_MASK (SLAB_RED_ZONE | \ | 167 | # define CREATE_MASK (SLAB_RED_ZONE | \ |
@@ -257,9 +267,30 @@ struct array_cache { | |||
257 | * Must have this definition in here for the proper | 267 | * Must have this definition in here for the proper |
258 | * alignment of array_cache. Also simplifies accessing | 268 | * alignment of array_cache. Also simplifies accessing |
259 | * the entries. | 269 | * the entries. |
270 | * | ||
271 | * Entries should not be directly dereferenced as | ||
272 | * entries belonging to slabs marked pfmemalloc will | ||
273 | * have the low bit SLAB_OBJ_PFMEMALLOC set | ||
260 | */ | 274 | */ |
261 | }; | 275 | }; |
262 | 276 | ||
277 | #define SLAB_OBJ_PFMEMALLOC 1 | ||
278 | static inline bool is_obj_pfmemalloc(void *objp) | ||
279 | { | ||
280 | return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; | ||
281 | } | ||
282 | |||
283 | static inline void set_obj_pfmemalloc(void **objp) | ||
284 | { | ||
285 | *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); | ||
286 | return; | ||
287 | } | ||
288 | |||
289 | static inline void clear_obj_pfmemalloc(void **objp) | ||
290 | { | ||
291 | *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); | ||
292 | } | ||
293 | |||
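These helpers tag an object pointer by setting its low bit, which is free because slab objects are at least word-aligned. The following standalone user-space sketch (plain C, no kernel APIs; the names are invented) shows the same low-bit tagging trick in isolation:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define OBJ_TAG 1UL			/* plays the role of SLAB_OBJ_PFMEMALLOC */

int main(void)
{
	void *obj = malloc(64);		/* malloc returns suitably aligned memory */
	void *tagged;

	assert(((uintptr_t)obj & OBJ_TAG) == 0);	/* low bit is free for the tag */

	tagged = (void *)((uintptr_t)obj | OBJ_TAG);	/* set_obj_pfmemalloc() analogue */
	assert((uintptr_t)tagged & OBJ_TAG);		/* is_obj_pfmemalloc() analogue */

	obj = (void *)((uintptr_t)tagged & ~OBJ_TAG);	/* clear_obj_pfmemalloc() analogue */
	free(obj);
	return 0;
}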
263 | /* | 294 | /* |
264 | * bootstrap: The caches do not work without cpuarrays anymore, but the | 295 | * bootstrap: The caches do not work without cpuarrays anymore, but the |
265 | * cpuarrays are allocated from the generic caches... | 296 | * cpuarrays are allocated from the generic caches... |
@@ -900,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
900 | return nc; | 931 | return nc; |
901 | } | 932 | } |
902 | 933 | ||
934 | static inline bool is_slab_pfmemalloc(struct slab *slabp) | ||
935 | { | ||
936 | struct page *page = virt_to_page(slabp->s_mem); | ||
937 | |||
938 | return PageSlabPfmemalloc(page); | ||
939 | } | ||
940 | |||
941 | /* Clears pfmemalloc_active if no slabs have pfmemalloc set */ | ||
942 | static void recheck_pfmemalloc_active(struct kmem_cache *cachep, | ||
943 | struct array_cache *ac) | ||
944 | { | ||
945 | struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()]; | ||
946 | struct slab *slabp; | ||
947 | unsigned long flags; | ||
948 | |||
949 | if (!pfmemalloc_active) | ||
950 | return; | ||
951 | |||
952 | spin_lock_irqsave(&l3->list_lock, flags); | ||
953 | list_for_each_entry(slabp, &l3->slabs_full, list) | ||
954 | if (is_slab_pfmemalloc(slabp)) | ||
955 | goto out; | ||
956 | |||
957 | list_for_each_entry(slabp, &l3->slabs_partial, list) | ||
958 | if (is_slab_pfmemalloc(slabp)) | ||
959 | goto out; | ||
960 | |||
961 | list_for_each_entry(slabp, &l3->slabs_free, list) | ||
962 | if (is_slab_pfmemalloc(slabp)) | ||
963 | goto out; | ||
964 | |||
965 | pfmemalloc_active = false; | ||
966 | out: | ||
967 | spin_unlock_irqrestore(&l3->list_lock, flags); | ||
968 | } | ||
969 | |||
970 | static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
971 | gfp_t flags, bool force_refill) | ||
972 | { | ||
973 | int i; | ||
974 | void *objp = ac->entry[--ac->avail]; | ||
975 | |||
976 | /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ | ||
977 | if (unlikely(is_obj_pfmemalloc(objp))) { | ||
978 | struct kmem_list3 *l3; | ||
979 | |||
980 | if (gfp_pfmemalloc_allowed(flags)) { | ||
981 | clear_obj_pfmemalloc(&objp); | ||
982 | return objp; | ||
983 | } | ||
984 | |||
985 | /* The caller cannot use PFMEMALLOC objects, find another one */ | ||
986 | for (i = 1; i < ac->avail; i++) { | ||
987 | /* If a !PFMEMALLOC object is found, swap them */ | ||
988 | if (!is_obj_pfmemalloc(ac->entry[i])) { | ||
989 | objp = ac->entry[i]; | ||
990 | ac->entry[i] = ac->entry[ac->avail]; | ||
991 | ac->entry[ac->avail] = objp; | ||
992 | return objp; | ||
993 | } | ||
994 | } | ||
995 | |||
996 | /* | ||
997 | * If there are empty slabs on the slabs_free list and we are | ||
998 | * being forced to refill the cache, mark this one !pfmemalloc. | ||
999 | */ | ||
1000 | l3 = cachep->nodelists[numa_mem_id()]; | ||
1001 | if (!list_empty(&l3->slabs_free) && force_refill) { | ||
1002 | struct slab *slabp = virt_to_slab(objp); | ||
1003 | ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem)); | ||
1004 | clear_obj_pfmemalloc(&objp); | ||
1005 | recheck_pfmemalloc_active(cachep, ac); | ||
1006 | return objp; | ||
1007 | } | ||
1008 | |||
1009 | /* No !PFMEMALLOC objects available */ | ||
1010 | ac->avail++; | ||
1011 | objp = NULL; | ||
1012 | } | ||
1013 | |||
1014 | return objp; | ||
1015 | } | ||
1016 | |||
1017 | static inline void *ac_get_obj(struct kmem_cache *cachep, | ||
1018 | struct array_cache *ac, gfp_t flags, bool force_refill) | ||
1019 | { | ||
1020 | void *objp; | ||
1021 | |||
1022 | if (unlikely(sk_memalloc_socks())) | ||
1023 | objp = __ac_get_obj(cachep, ac, flags, force_refill); | ||
1024 | else | ||
1025 | objp = ac->entry[--ac->avail]; | ||
1026 | |||
1027 | return objp; | ||
1028 | } | ||
1029 | |||
1030 | static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
1031 | void *objp) | ||
1032 | { | ||
1033 | if (unlikely(pfmemalloc_active)) { | ||
1034 | /* Some pfmemalloc slabs exist, check if this is one */ | ||
1035 | struct page *page = virt_to_page(objp); | ||
1036 | if (PageSlabPfmemalloc(page)) | ||
1037 | set_obj_pfmemalloc(&objp); | ||
1038 | } | ||
1039 | |||
1040 | return objp; | ||
1041 | } | ||
1042 | |||
1043 | static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
1044 | void *objp) | ||
1045 | { | ||
1046 | if (unlikely(sk_memalloc_socks())) | ||
1047 | objp = __ac_put_obj(cachep, ac, objp); | ||
1048 | |||
1049 | ac->entry[ac->avail++] = objp; | ||
1050 | } | ||
1051 | |||
903 | /* | 1052 | /* |
904 | * Transfer objects in one arraycache to another. | 1053 | * Transfer objects in one arraycache to another. |
905 | * Locking must be handled by the caller. | 1054 | * Locking must be handled by the caller. |
@@ -1076,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1076 | STATS_INC_ACOVERFLOW(cachep); | 1225 | STATS_INC_ACOVERFLOW(cachep); |
1077 | __drain_alien_cache(cachep, alien, nodeid); | 1226 | __drain_alien_cache(cachep, alien, nodeid); |
1078 | } | 1227 | } |
1079 | alien->entry[alien->avail++] = objp; | 1228 | ac_put_obj(cachep, alien, objp); |
1080 | spin_unlock(&alien->lock); | 1229 | spin_unlock(&alien->lock); |
1081 | } else { | 1230 | } else { |
1082 | spin_lock(&(cachep->nodelists[nodeid])->list_lock); | 1231 | spin_lock(&(cachep->nodelists[nodeid])->list_lock); |
@@ -1759,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1759 | return NULL; | 1908 | return NULL; |
1760 | } | 1909 | } |
1761 | 1910 | ||
1911 | /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ | ||
1912 | if (unlikely(page->pfmemalloc)) | ||
1913 | pfmemalloc_active = true; | ||
1914 | |||
1762 | nr_pages = (1 << cachep->gfporder); | 1915 | nr_pages = (1 << cachep->gfporder); |
1763 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1916 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1764 | add_zone_page_state(page_zone(page), | 1917 | add_zone_page_state(page_zone(page), |
@@ -1766,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1766 | else | 1919 | else |
1767 | add_zone_page_state(page_zone(page), | 1920 | add_zone_page_state(page_zone(page), |
1768 | NR_SLAB_UNRECLAIMABLE, nr_pages); | 1921 | NR_SLAB_UNRECLAIMABLE, nr_pages); |
1769 | for (i = 0; i < nr_pages; i++) | 1922 | for (i = 0; i < nr_pages; i++) { |
1770 | __SetPageSlab(page + i); | 1923 | __SetPageSlab(page + i); |
1771 | 1924 | ||
1925 | if (page->pfmemalloc) | ||
1926 | SetPageSlabPfmemalloc(page + i); | ||
1927 | } | ||
1928 | |||
1772 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | 1929 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { |
1773 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | 1930 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); |
1774 | 1931 | ||
@@ -1800,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1800 | NR_SLAB_UNRECLAIMABLE, nr_freed); | 1957 | NR_SLAB_UNRECLAIMABLE, nr_freed); |
1801 | while (i--) { | 1958 | while (i--) { |
1802 | BUG_ON(!PageSlab(page)); | 1959 | BUG_ON(!PageSlab(page)); |
1960 | __ClearPageSlabPfmemalloc(page); | ||
1803 | __ClearPageSlab(page); | 1961 | __ClearPageSlab(page); |
1804 | page++; | 1962 | page++; |
1805 | } | 1963 | } |
@@ -3015,16 +3173,19 @@ bad: | |||
3015 | #define check_slabp(x,y) do { } while(0) | 3173 | #define check_slabp(x,y) do { } while(0) |
3016 | #endif | 3174 | #endif |
3017 | 3175 | ||
3018 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | 3176 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, |
3177 | bool force_refill) | ||
3019 | { | 3178 | { |
3020 | int batchcount; | 3179 | int batchcount; |
3021 | struct kmem_list3 *l3; | 3180 | struct kmem_list3 *l3; |
3022 | struct array_cache *ac; | 3181 | struct array_cache *ac; |
3023 | int node; | 3182 | int node; |
3024 | 3183 | ||
3025 | retry: | ||
3026 | check_irq_off(); | 3184 | check_irq_off(); |
3027 | node = numa_mem_id(); | 3185 | node = numa_mem_id(); |
3186 | if (unlikely(force_refill)) | ||
3187 | goto force_grow; | ||
3188 | retry: | ||
3028 | ac = cpu_cache_get(cachep); | 3189 | ac = cpu_cache_get(cachep); |
3029 | batchcount = ac->batchcount; | 3190 | batchcount = ac->batchcount; |
3030 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { | 3191 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { |
@@ -3074,8 +3235,8 @@ retry: | |||
3074 | STATS_INC_ACTIVE(cachep); | 3235 | STATS_INC_ACTIVE(cachep); |
3075 | STATS_SET_HIGH(cachep); | 3236 | STATS_SET_HIGH(cachep); |
3076 | 3237 | ||
3077 | ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, | 3238 | ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, |
3078 | node); | 3239 | node)); |
3079 | } | 3240 | } |
3080 | check_slabp(cachep, slabp); | 3241 | check_slabp(cachep, slabp); |
3081 | 3242 | ||
@@ -3094,18 +3255,22 @@ alloc_done: | |||
3094 | 3255 | ||
3095 | if (unlikely(!ac->avail)) { | 3256 | if (unlikely(!ac->avail)) { |
3096 | int x; | 3257 | int x; |
3258 | force_grow: | ||
3097 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); | 3259 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); |
3098 | 3260 | ||
3099 | /* cache_grow can reenable interrupts, then ac could change. */ | 3261 | /* cache_grow can reenable interrupts, then ac could change. */ |
3100 | ac = cpu_cache_get(cachep); | 3262 | ac = cpu_cache_get(cachep); |
3101 | if (!x && ac->avail == 0) /* no objects in sight? abort */ | 3263 | |
3264 | /* no objects in sight? abort */ | ||
3265 | if (!x && (ac->avail == 0 || force_refill)) | ||
3102 | return NULL; | 3266 | return NULL; |
3103 | 3267 | ||
3104 | if (!ac->avail) /* objects refilled by interrupt? */ | 3268 | if (!ac->avail) /* objects refilled by interrupt? */ |
3105 | goto retry; | 3269 | goto retry; |
3106 | } | 3270 | } |
3107 | ac->touched = 1; | 3271 | ac->touched = 1; |
3108 | return ac->entry[--ac->avail]; | 3272 | |
3273 | return ac_get_obj(cachep, ac, flags, force_refill); | ||
3109 | } | 3274 | } |
3110 | 3275 | ||
3111 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, | 3276 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, |
@@ -3187,23 +3352,35 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3187 | { | 3352 | { |
3188 | void *objp; | 3353 | void *objp; |
3189 | struct array_cache *ac; | 3354 | struct array_cache *ac; |
3355 | bool force_refill = false; | ||
3190 | 3356 | ||
3191 | check_irq_off(); | 3357 | check_irq_off(); |
3192 | 3358 | ||
3193 | ac = cpu_cache_get(cachep); | 3359 | ac = cpu_cache_get(cachep); |
3194 | if (likely(ac->avail)) { | 3360 | if (likely(ac->avail)) { |
3195 | STATS_INC_ALLOCHIT(cachep); | ||
3196 | ac->touched = 1; | 3361 | ac->touched = 1; |
3197 | objp = ac->entry[--ac->avail]; | 3362 | objp = ac_get_obj(cachep, ac, flags, false); |
3198 | } else { | 3363 | |
3199 | STATS_INC_ALLOCMISS(cachep); | ||
3200 | objp = cache_alloc_refill(cachep, flags); | ||
3201 | /* | 3364 | /* |
3202 | * the 'ac' may be updated by cache_alloc_refill(), | 3365 | * Allow for the possibility all avail objects are not allowed |
3203 | * and kmemleak_erase() requires its correct value. | 3366 | * by the current flags |
3204 | */ | 3367 | */ |
3205 | ac = cpu_cache_get(cachep); | 3368 | if (objp) { |
3369 | STATS_INC_ALLOCHIT(cachep); | ||
3370 | goto out; | ||
3371 | } | ||
3372 | force_refill = true; | ||
3206 | } | 3373 | } |
3374 | |||
3375 | STATS_INC_ALLOCMISS(cachep); | ||
3376 | objp = cache_alloc_refill(cachep, flags, force_refill); | ||
3377 | /* | ||
3378 | * the 'ac' may be updated by cache_alloc_refill(), | ||
3379 | * and kmemleak_erase() requires its correct value. | ||
3380 | */ | ||
3381 | ac = cpu_cache_get(cachep); | ||
3382 | |||
3383 | out: | ||
3207 | /* | 3384 | /* |
3208 | * To avoid a false negative, if an object that is in one of the | 3385 | * To avoid a false negative, if an object that is in one of the |
3209 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't | 3386 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't |
@@ -3525,9 +3702,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, | |||
3525 | struct kmem_list3 *l3; | 3702 | struct kmem_list3 *l3; |
3526 | 3703 | ||
3527 | for (i = 0; i < nr_objects; i++) { | 3704 | for (i = 0; i < nr_objects; i++) { |
3528 | void *objp = objpp[i]; | 3705 | void *objp; |
3529 | struct slab *slabp; | 3706 | struct slab *slabp; |
3530 | 3707 | ||
3708 | clear_obj_pfmemalloc(&objpp[i]); | ||
3709 | objp = objpp[i]; | ||
3710 | |||
3531 | slabp = virt_to_slab(objp); | 3711 | slabp = virt_to_slab(objp); |
3532 | l3 = cachep->nodelists[node]; | 3712 | l3 = cachep->nodelists[node]; |
3533 | list_del(&slabp->list); | 3713 | list_del(&slabp->list); |
@@ -3645,7 +3825,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3645 | cache_flusharray(cachep, ac); | 3825 | cache_flusharray(cachep, ac); |
3646 | } | 3826 | } |
3647 | 3827 | ||
3648 | ac->entry[ac->avail++] = objp; | 3828 | ac_put_obj(cachep, ac, objp); |
3649 | } | 3829 | } |
3650 | 3830 | ||
3651 | /** | 3831 | /** |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -34,6 +34,8 @@ | |||
34 | 34 | ||
35 | #include <trace/events/kmem.h> | 35 | #include <trace/events/kmem.h> |
36 | 36 | ||
37 | #include "internal.h" | ||
38 | |||
37 | /* | 39 | /* |
38 | * Lock order: | 40 | * Lock order: |
39 | * 1. slab_mutex (Global Mutex) | 41 | * 1. slab_mutex (Global Mutex) |
@@ -1354,6 +1356,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1354 | inc_slabs_node(s, page_to_nid(page), page->objects); | 1356 | inc_slabs_node(s, page_to_nid(page), page->objects); |
1355 | page->slab = s; | 1357 | page->slab = s; |
1356 | __SetPageSlab(page); | 1358 | __SetPageSlab(page); |
1359 | if (page->pfmemalloc) | ||
1360 | SetPageSlabPfmemalloc(page); | ||
1357 | 1361 | ||
1358 | start = page_address(page); | 1362 | start = page_address(page); |
1359 | 1363 | ||
@@ -1397,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1397 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1401 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
1398 | -pages); | 1402 | -pages); |
1399 | 1403 | ||
1404 | __ClearPageSlabPfmemalloc(page); | ||
1400 | __ClearPageSlab(page); | 1405 | __ClearPageSlab(page); |
1401 | reset_page_mapcount(page); | 1406 | reset_page_mapcount(page); |
1402 | if (current->reclaim_state) | 1407 | if (current->reclaim_state) |
@@ -2126,6 +2131,14 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | |||
2126 | return freelist; | 2131 | return freelist; |
2127 | } | 2132 | } |
2128 | 2133 | ||
2134 | static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) | ||
2135 | { | ||
2136 | if (unlikely(PageSlabPfmemalloc(page))) | ||
2137 | return gfp_pfmemalloc_allowed(gfpflags); | ||
2138 | |||
2139 | return true; | ||
2140 | } | ||
2141 | |||
2129 | /* | 2142 | /* |
2130 | * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist | 2143 | * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist |
2131 | * or deactivate the page. | 2144 | * or deactivate the page. |
@@ -2206,6 +2219,18 @@ redo: | |||
2206 | goto new_slab; | 2219 | goto new_slab; |
2207 | } | 2220 | } |
2208 | 2221 | ||
2222 | /* | ||
2223 | * By rights, we should be searching for a slab page that was | ||
2224 | * PFMEMALLOC but right now, we are losing the pfmemalloc | ||
2225 | * information when the page leaves the per-cpu allocator | ||
2226 | */ | ||
2227 | if (unlikely(!pfmemalloc_match(page, gfpflags))) { | ||
2228 | deactivate_slab(s, page, c->freelist); | ||
2229 | c->page = NULL; | ||
2230 | c->freelist = NULL; | ||
2231 | goto new_slab; | ||
2232 | } | ||
2233 | |||
2209 | /* must check again c->freelist in case of cpu migration or IRQ */ | 2234 | /* must check again c->freelist in case of cpu migration or IRQ */ |
2210 | freelist = c->freelist; | 2235 | freelist = c->freelist; |
2211 | if (freelist) | 2236 | if (freelist) |
@@ -2256,11 +2281,11 @@ new_slab: | |||
2256 | } | 2281 | } |
2257 | 2282 | ||
2258 | page = c->page; | 2283 | page = c->page; |
2259 | if (likely(!kmem_cache_debug(s))) | 2284 | if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) |
2260 | goto load_freelist; | 2285 | goto load_freelist; |
2261 | 2286 | ||
2262 | /* Only entered in the debug case */ | 2287 | /* Only entered in the debug case */ |
2263 | if (!alloc_debug_processing(s, page, freelist, addr)) | 2288 | if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr)) |
2264 | goto new_slab; /* Slab failed checks. Next slab needed */ | 2289 | goto new_slab; /* Slab failed checks. Next slab needed */ |
2265 | 2290 | ||
2266 | deactivate_slab(s, page, get_freepointer(s, freelist)); | 2291 | deactivate_slab(s, page, get_freepointer(s, freelist)); |
@@ -2313,7 +2338,6 @@ redo: | |||
2313 | object = c->freelist; | 2338 | object = c->freelist; |
2314 | page = c->page; | 2339 | page = c->page; |
2315 | if (unlikely(!object || !node_match(page, node))) | 2340 | if (unlikely(!object || !node_match(page, node))) |
2316 | |||
2317 | object = __slab_alloc(s, gfpflags, node, addr, c); | 2341 | object = __slab_alloc(s, gfpflags, node, addr, c); |
2318 | 2342 | ||
2319 | else { | 2343 | else { |
diff --git a/mm/sparse.c b/mm/sparse.c index c7bb952400c8..fac95f2888f2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) | |||
65 | 65 | ||
66 | if (slab_is_available()) { | 66 | if (slab_is_available()) { |
67 | if (node_state(nid, N_HIGH_MEMORY)) | 67 | if (node_state(nid, N_HIGH_MEMORY)) |
68 | section = kmalloc_node(array_size, GFP_KERNEL, nid); | 68 | section = kzalloc_node(array_size, GFP_KERNEL, nid); |
69 | else | 69 | else |
70 | section = kmalloc(array_size, GFP_KERNEL); | 70 | section = kzalloc(array_size, GFP_KERNEL); |
71 | } else | 71 | } else { |
72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); | 72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); |
73 | 73 | } | |
74 | if (section) | ||
75 | memset(section, 0, array_size); | ||
76 | 74 | ||
77 | return section; | 75 | return section; |
78 | } | 76 | } |
79 | 77 | ||
80 | static int __meminit sparse_index_init(unsigned long section_nr, int nid) | 78 | static int __meminit sparse_index_init(unsigned long section_nr, int nid) |
81 | { | 79 | { |
82 | static DEFINE_SPINLOCK(index_init_lock); | ||
83 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); | 80 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); |
84 | struct mem_section *section; | 81 | struct mem_section *section; |
85 | int ret = 0; | 82 | int ret = 0; |
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) | |||
90 | section = sparse_index_alloc(nid); | 87 | section = sparse_index_alloc(nid); |
91 | if (!section) | 88 | if (!section) |
92 | return -ENOMEM; | 89 | return -ENOMEM; |
93 | /* | ||
94 | * This lock keeps two different sections from | ||
95 | * reallocating for the same index | ||
96 | */ | ||
97 | spin_lock(&index_init_lock); | ||
98 | |||
99 | if (mem_section[root]) { | ||
100 | ret = -EEXIST; | ||
101 | goto out; | ||
102 | } | ||
103 | 90 | ||
104 | mem_section[root] = section; | 91 | mem_section[root] = section; |
105 | out: | 92 | |
106 | spin_unlock(&index_init_lock); | ||
107 | return ret; | 93 | return ret; |
108 | } | 94 | } |
109 | #else /* !SPARSEMEM_EXTREME */ | 95 | #else /* !SPARSEMEM_EXTREME */ |
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms) | |||
132 | break; | 118 | break; |
133 | } | 119 | } |
134 | 120 | ||
121 | VM_BUG_ON(root_nr == NR_SECTION_ROOTS); | ||
122 | |||
135 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); | 123 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); |
136 | } | 124 | } |
137 | 125 | ||
@@ -493,6 +481,9 @@ void __init sparse_init(void) | |||
493 | struct page **map_map; | 481 | struct page **map_map; |
494 | #endif | 482 | #endif |
495 | 483 | ||
484 | /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ | ||
485 | set_pageblock_order(); | ||
486 | |||
496 | /* | 487 | /* |
497 | * map is using big page (aka 2M in x86 64 bit) | 488 | * map is using big page (aka 2M in x86 64 bit) |
498 | * usemap is less one page (aka 24 bytes) | 489 | * usemap is less one page (aka 24 bytes) |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -236,6 +236,58 @@ void put_pages_list(struct list_head *pages) | |||
236 | } | 236 | } |
237 | EXPORT_SYMBOL(put_pages_list); | 237 | EXPORT_SYMBOL(put_pages_list); |
238 | 238 | ||
239 | /* | ||
240 | * get_kernel_pages() - pin kernel pages in memory | ||
241 | * @kiov: An array of struct kvec structures | ||
242 | * @nr_segs: number of segments to pin | ||
243 | * @write: pinning for read/write, currently ignored | ||
244 | * @pages: array that receives pointers to the pages pinned. | ||
245 | * Should be at least nr_segs long. | ||
246 | * | ||
247 | * Returns number of pages pinned. This may be fewer than the number | ||
248 | * requested. If nr_segs is 0 or negative, returns 0. If no pages | ||
249 | * were pinned, returns -errno. Each page returned must be released | ||
250 | * with a put_page() call when it is finished with. | ||
251 | */ | ||
252 | int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, | ||
253 | struct page **pages) | ||
254 | { | ||
255 | int seg; | ||
256 | |||
257 | for (seg = 0; seg < nr_segs; seg++) { | ||
258 | if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) | ||
259 | return seg; | ||
260 | |||
261 | pages[seg] = kmap_to_page(kiov[seg].iov_base); | ||
262 | page_cache_get(pages[seg]); | ||
263 | } | ||
264 | |||
265 | return seg; | ||
266 | } | ||
267 | EXPORT_SYMBOL_GPL(get_kernel_pages); | ||
268 | |||
269 | /* | ||
270 | * get_kernel_page() - pin a kernel page in memory | ||
271 | * @start: starting kernel address | ||
272 | * @write: pinning for read/write, currently ignored | ||
273 | * @pages: array that receives pointer to the page pinned. | ||
274 | * Must have room for one page pointer. | ||
275 | * | ||
276 | * Returns 1 if page is pinned. If the page was not pinned, returns | ||
277 | * -errno. The page returned must be released with a put_page() call | ||
278 | * when it is finished with. | ||
279 | */ | ||
280 | int get_kernel_page(unsigned long start, int write, struct page **pages) | ||
281 | { | ||
282 | const struct kvec kiov = { | ||
283 | .iov_base = (void *)start, | ||
284 | .iov_len = PAGE_SIZE | ||
285 | }; | ||
286 | |||
287 | return get_kernel_pages(&kiov, 1, write, pages); | ||
288 | } | ||
289 | EXPORT_SYMBOL_GPL(get_kernel_page); | ||
290 | |||
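A minimal usage sketch for the new helper follows; the buffer variable and the follow-up I/O are assumptions for illustration, not code from this patch:

/* Hypothetical caller: pin the single page behind a page-sized, page-aligned
 * kernel buffer "buf" so it can be handed to block I/O. */
struct page *page;
int ret;

ret = get_kernel_page((unsigned long)buf, 0, &page);
if (ret == 1) {
	/* ... use the page, e.g. attach it to a bio ... */
	put_page(page);		/* drop the reference taken by get_kernel_page() */
}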
239 | static void pagevec_lru_move_fn(struct pagevec *pvec, | 291 | static void pagevec_lru_move_fn(struct pagevec *pvec, |
240 | void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), | 292 | void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), |
241 | void *arg) | 293 | void *arg) |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 4c5ff7f284d9..0cb36fb1f61c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
17 | #include <linux/blkdev.h> | ||
17 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
18 | #include <linux/migrate.h> | 19 | #include <linux/migrate.h> |
19 | #include <linux/page_cgroup.h> | 20 | #include <linux/page_cgroup.h> |
@@ -26,7 +27,7 @@ | |||
26 | */ | 27 | */ |
27 | static const struct address_space_operations swap_aops = { | 28 | static const struct address_space_operations swap_aops = { |
28 | .writepage = swap_writepage, | 29 | .writepage = swap_writepage, |
29 | .set_page_dirty = __set_page_dirty_no_writeback, | 30 | .set_page_dirty = swap_set_page_dirty, |
30 | .migratepage = migrate_page, | 31 | .migratepage = migrate_page, |
31 | }; | 32 | }; |
32 | 33 | ||
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
376 | unsigned long offset = swp_offset(entry); | 377 | unsigned long offset = swp_offset(entry); |
377 | unsigned long start_offset, end_offset; | 378 | unsigned long start_offset, end_offset; |
378 | unsigned long mask = (1UL << page_cluster) - 1; | 379 | unsigned long mask = (1UL << page_cluster) - 1; |
380 | struct blk_plug plug; | ||
379 | 381 | ||
380 | /* Read a page_cluster sized and aligned cluster around offset. */ | 382 | /* Read a page_cluster sized and aligned cluster around offset. */ |
381 | start_offset = offset & ~mask; | 383 | start_offset = offset & ~mask; |
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
383 | if (!start_offset) /* First page is swap header. */ | 385 | if (!start_offset) /* First page is swap header. */ |
384 | start_offset++; | 386 | start_offset++; |
385 | 387 | ||
388 | blk_start_plug(&plug); | ||
386 | for (offset = start_offset; offset <= end_offset ; offset++) { | 389 | for (offset = start_offset; offset <= end_offset ; offset++) { |
387 | /* Ok, do the async read-ahead now */ | 390 | /* Ok, do the async read-ahead now */ |
388 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), | 391 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), |
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
391 | continue; | 394 | continue; |
392 | page_cache_release(page); | 395 | page_cache_release(page); |
393 | } | 396 | } |
397 | blk_finish_plug(&plug); | ||
398 | |||
394 | lru_add_drain(); /* Push any new pages onto the LRU now */ | 399 | lru_add_drain(); /* Push any new pages onto the LRU now */ |
395 | return read_swap_cache_async(entry, gfp_mask, vma, addr); | 400 | return read_swap_cache_async(entry, gfp_mask, vma, addr); |
396 | } | 401 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 71373d03fcee..14e254c768fc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/oom.h> | 33 | #include <linux/oom.h> |
34 | #include <linux/frontswap.h> | 34 | #include <linux/frontswap.h> |
35 | #include <linux/swapfile.h> | 35 | #include <linux/swapfile.h> |
36 | #include <linux/export.h> | ||
36 | 37 | ||
37 | #include <asm/pgtable.h> | 38 | #include <asm/pgtable.h> |
38 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
@@ -548,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
548 | 549 | ||
549 | /* free if no reference */ | 550 | /* free if no reference */ |
550 | if (!usage) { | 551 | if (!usage) { |
551 | struct gendisk *disk = p->bdev->bd_disk; | ||
552 | if (offset < p->lowest_bit) | 552 | if (offset < p->lowest_bit) |
553 | p->lowest_bit = offset; | 553 | p->lowest_bit = offset; |
554 | if (offset > p->highest_bit) | 554 | if (offset > p->highest_bit) |
@@ -559,9 +559,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
559 | nr_swap_pages++; | 559 | nr_swap_pages++; |
560 | p->inuse_pages--; | 560 | p->inuse_pages--; |
561 | frontswap_invalidate_page(p->type, offset); | 561 | frontswap_invalidate_page(p->type, offset); |
562 | if ((p->flags & SWP_BLKDEV) && | 562 | if (p->flags & SWP_BLKDEV) { |
563 | disk->fops->swap_slot_free_notify) | 563 | struct gendisk *disk = p->bdev->bd_disk; |
564 | disk->fops->swap_slot_free_notify(p->bdev, offset); | 564 | if (disk->fops->swap_slot_free_notify) |
565 | disk->fops->swap_slot_free_notify(p->bdev, | ||
566 | offset); | ||
567 | } | ||
565 | } | 568 | } |
566 | 569 | ||
567 | return usage; | 570 | return usage; |
@@ -832,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
832 | 835 | ||
833 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 836 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
834 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { | 837 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { |
835 | if (ret > 0) | 838 | mem_cgroup_cancel_charge_swapin(memcg); |
836 | mem_cgroup_cancel_charge_swapin(memcg); | ||
837 | ret = 0; | 839 | ret = 0; |
838 | goto out; | 840 | goto out; |
839 | } | 841 | } |
@@ -1328,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis) | |||
1328 | list_del(&se->list); | 1330 | list_del(&se->list); |
1329 | kfree(se); | 1331 | kfree(se); |
1330 | } | 1332 | } |
1333 | |||
1334 | if (sis->flags & SWP_FILE) { | ||
1335 | struct file *swap_file = sis->swap_file; | ||
1336 | struct address_space *mapping = swap_file->f_mapping; | ||
1337 | |||
1338 | sis->flags &= ~SWP_FILE; | ||
1339 | mapping->a_ops->swap_deactivate(swap_file); | ||
1340 | } | ||
1331 | } | 1341 | } |
1332 | 1342 | ||
1333 | /* | 1343 | /* |
@@ -1336,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis) | |||
1336 | * | 1346 | * |
1337 | * This function rather assumes that it is called in ascending page order. | 1347 | * This function rather assumes that it is called in ascending page order. |
1338 | */ | 1348 | */ |
1339 | static int | 1349 | int |
1340 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | 1350 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, |
1341 | unsigned long nr_pages, sector_t start_block) | 1351 | unsigned long nr_pages, sector_t start_block) |
1342 | { | 1352 | { |
@@ -1409,98 +1419,28 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
1409 | */ | 1419 | */ |
1410 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | 1420 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) |
1411 | { | 1421 | { |
1412 | struct inode *inode; | 1422 | struct file *swap_file = sis->swap_file; |
1413 | unsigned blocks_per_page; | 1423 | struct address_space *mapping = swap_file->f_mapping; |
1414 | unsigned long page_no; | 1424 | struct inode *inode = mapping->host; |
1415 | unsigned blkbits; | ||
1416 | sector_t probe_block; | ||
1417 | sector_t last_block; | ||
1418 | sector_t lowest_block = -1; | ||
1419 | sector_t highest_block = 0; | ||
1420 | int nr_extents = 0; | ||
1421 | int ret; | 1425 | int ret; |
1422 | 1426 | ||
1423 | inode = sis->swap_file->f_mapping->host; | ||
1424 | if (S_ISBLK(inode->i_mode)) { | 1427 | if (S_ISBLK(inode->i_mode)) { |
1425 | ret = add_swap_extent(sis, 0, sis->max, 0); | 1428 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1426 | *span = sis->pages; | 1429 | *span = sis->pages; |
1427 | goto out; | 1430 | return ret; |
1428 | } | 1431 | } |
1429 | 1432 | ||
1430 | blkbits = inode->i_blkbits; | 1433 | if (mapping->a_ops->swap_activate) { |
1431 | blocks_per_page = PAGE_SIZE >> blkbits; | 1434 | ret = mapping->a_ops->swap_activate(sis, swap_file, span); |
1432 | 1435 | if (!ret) { | |
1433 | /* | 1436 | sis->flags |= SWP_FILE; |
1434 | * Map all the blocks into the extent list. This code doesn't try | 1437 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1435 | * to be very smart. | 1438 | *span = sis->pages; |
1436 | */ | ||
1437 | probe_block = 0; | ||
1438 | page_no = 0; | ||
1439 | last_block = i_size_read(inode) >> blkbits; | ||
1440 | while ((probe_block + blocks_per_page) <= last_block && | ||
1441 | page_no < sis->max) { | ||
1442 | unsigned block_in_page; | ||
1443 | sector_t first_block; | ||
1444 | |||
1445 | first_block = bmap(inode, probe_block); | ||
1446 | if (first_block == 0) | ||
1447 | goto bad_bmap; | ||
1448 | |||
1449 | /* | ||
1450 | * It must be PAGE_SIZE aligned on-disk | ||
1451 | */ | ||
1452 | if (first_block & (blocks_per_page - 1)) { | ||
1453 | probe_block++; | ||
1454 | goto reprobe; | ||
1455 | } | ||
1456 | |||
1457 | for (block_in_page = 1; block_in_page < blocks_per_page; | ||
1458 | block_in_page++) { | ||
1459 | sector_t block; | ||
1460 | |||
1461 | block = bmap(inode, probe_block + block_in_page); | ||
1462 | if (block == 0) | ||
1463 | goto bad_bmap; | ||
1464 | if (block != first_block + block_in_page) { | ||
1465 | /* Discontiguity */ | ||
1466 | probe_block++; | ||
1467 | goto reprobe; | ||
1468 | } | ||
1469 | } | ||
1470 | |||
1471 | first_block >>= (PAGE_SHIFT - blkbits); | ||
1472 | if (page_no) { /* exclude the header page */ | ||
1473 | if (first_block < lowest_block) | ||
1474 | lowest_block = first_block; | ||
1475 | if (first_block > highest_block) | ||
1476 | highest_block = first_block; | ||
1477 | } | 1439 | } |
1440 | return ret; | ||
1441 | } | ||
1478 | 1442 | ||
1479 | /* | 1443 | return generic_swapfile_activate(sis, swap_file, span); |
1480 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks | ||
1481 | */ | ||
1482 | ret = add_swap_extent(sis, page_no, 1, first_block); | ||
1483 | if (ret < 0) | ||
1484 | goto out; | ||
1485 | nr_extents += ret; | ||
1486 | page_no++; | ||
1487 | probe_block += blocks_per_page; | ||
1488 | reprobe: | ||
1489 | continue; | ||
1490 | } | ||
1491 | ret = nr_extents; | ||
1492 | *span = 1 + highest_block - lowest_block; | ||
1493 | if (page_no == 0) | ||
1494 | page_no = 1; /* force Empty message */ | ||
1495 | sis->max = page_no; | ||
1496 | sis->pages = page_no - 1; | ||
1497 | sis->highest_bit = page_no - 1; | ||
1498 | out: | ||
1499 | return ret; | ||
1500 | bad_bmap: | ||
1501 | printk(KERN_ERR "swapon: swapfile has holes\n"); | ||
1502 | ret = -EINVAL; | ||
1503 | goto out; | ||
1504 | } | 1444 | } |
1505 | 1445 | ||
1506 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1446 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
@@ -2285,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry) | |||
2285 | return __swap_duplicate(entry, SWAP_HAS_CACHE); | 2225 | return __swap_duplicate(entry, SWAP_HAS_CACHE); |
2286 | } | 2226 | } |
2287 | 2227 | ||
2228 | struct swap_info_struct *page_swap_info(struct page *page) | ||
2229 | { | ||
2230 | swp_entry_t swap = { .val = page_private(page) }; | ||
2231 | BUG_ON(!PageSwapCache(page)); | ||
2232 | return swap_info[swp_type(swap)]; | ||
2233 | } | ||
2234 | |||
2235 | /* | ||
2236 | * out-of-line __page_file_ methods to avoid include hell. | ||
2237 | */ | ||
2238 | struct address_space *__page_file_mapping(struct page *page) | ||
2239 | { | ||
2240 | VM_BUG_ON(!PageSwapCache(page)); | ||
2241 | return page_swap_info(page)->swap_file->f_mapping; | ||
2242 | } | ||
2243 | EXPORT_SYMBOL_GPL(__page_file_mapping); | ||
2244 | |||
2245 | pgoff_t __page_file_index(struct page *page) | ||
2246 | { | ||
2247 | swp_entry_t swap = { .val = page_private(page) }; | ||
2248 | VM_BUG_ON(!PageSwapCache(page)); | ||
2249 | return swp_offset(swap); | ||
2250 | } | ||
2251 | EXPORT_SYMBOL_GPL(__page_file_index); | ||
2252 | |||
2288 | /* | 2253 | /* |
2289 | * add_swap_count_continuation - called when a swap count is duplicated | 2254 | * add_swap_count_continuation - called when a swap count is duplicated |
2290 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's | 2255 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e03f4c7307a5..2bb90b1d241c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -413,11 +413,11 @@ nocache: | |||
413 | if (addr + size - 1 < addr) | 413 | if (addr + size - 1 < addr) |
414 | goto overflow; | 414 | goto overflow; |
415 | 415 | ||
416 | n = rb_next(&first->rb_node); | 416 | if (list_is_last(&first->list, &vmap_area_list)) |
417 | if (n) | ||
418 | first = rb_entry(n, struct vmap_area, rb_node); | ||
419 | else | ||
420 | goto found; | 417 | goto found; |
418 | |||
419 | first = list_entry(first->list.next, | ||
420 | struct vmap_area, list); | ||
421 | } | 421 | } |
422 | 422 | ||
423 | found: | 423 | found: |
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | |||
904 | 904 | ||
905 | BUG_ON(size & ~PAGE_MASK); | 905 | BUG_ON(size & ~PAGE_MASK); |
906 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | 906 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); |
907 | if (WARN_ON(size == 0)) { | ||
908 | /* | ||
909 | * Allocating 0 bytes isn't what the caller wants, since | ||
910 | * get_order(0) returns a funny result. Just warn and terminate | ||
911 | * early. | ||
912 | */ | ||
913 | return NULL; | ||
914 | } | ||
907 | order = get_order(size); | 915 | order = get_order(size); |
908 | 916 | ||
909 | again: | 917 | again: |
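The "funny result" mentioned in the comment above comes from get_order() operating on size - 1, which underflows for size == 0. A standalone user-space re-implementation (for illustration only; it assumes 4 KB pages and 64-bit longs, and is not the kernel header) shows the effect:

#include <stdio.h>

/* Mirrors the classic get_order() loop, with PAGE_SHIFT assumed to be 12. */
static int sketch_get_order(unsigned long size)
{
	int order = -1;

	size = (size - 1) >> (12 - 1);
	do {
		size >>= 1;
		order++;
	} while (size);
	return order;
}

int main(void)
{
	printf("get_order(0)    = %d\n", sketch_get_order(0));	/* 52 here: nonsense */
	printf("get_order(4096) = %d\n", sketch_get_order(4096));	/* 0, as expected */
	return 0;
}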
diff --git a/mm/vmscan.c b/mm/vmscan.c index 347b3ff2a478..8d01243d9560 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */ | |||
133 | static LIST_HEAD(shrinker_list); | 133 | static LIST_HEAD(shrinker_list); |
134 | static DECLARE_RWSEM(shrinker_rwsem); | 134 | static DECLARE_RWSEM(shrinker_rwsem); |
135 | 135 | ||
136 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 136 | #ifdef CONFIG_MEMCG |
137 | static bool global_reclaim(struct scan_control *sc) | 137 | static bool global_reclaim(struct scan_control *sc) |
138 | { | 138 | { |
139 | return !sc->target_mem_cgroup; | 139 | return !sc->target_mem_cgroup; |
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
687 | 687 | ||
688 | cond_resched(); | 688 | cond_resched(); |
689 | 689 | ||
690 | mem_cgroup_uncharge_start(); | ||
690 | while (!list_empty(page_list)) { | 691 | while (!list_empty(page_list)) { |
691 | enum page_references references; | 692 | enum page_references references; |
692 | struct address_space *mapping; | 693 | struct address_space *mapping; |
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
720 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); | 721 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); |
721 | 722 | ||
722 | if (PageWriteback(page)) { | 723 | if (PageWriteback(page)) { |
723 | nr_writeback++; | 724 | /* |
724 | unlock_page(page); | 725 | * memcg doesn't have any dirty pages throttling so we |
725 | goto keep; | 726 | * could easily OOM just because too many pages are in |
727 | * writeback and there is nothing else to reclaim. | ||
728 | * | ||
729 | * Check __GFP_IO, certainly because a loop driver | ||
730 | * thread might enter reclaim, and deadlock if it waits | ||
731 | * on a page for which it is needed to do the write | ||
732 | * (loop masks off __GFP_IO|__GFP_FS for this reason); | ||
733 | * but more thought would probably show more reasons. | ||
734 | * | ||
735 | * Don't require __GFP_FS, since we're not going into | ||
736 | * the FS, just waiting on its writeback completion. | ||
737 | * Worryingly, ext4 gfs2 and xfs allocate pages with | ||
738 | * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so | ||
739 | * testing may_enter_fs here is liable to OOM on them. | ||
740 | */ | ||
741 | if (global_reclaim(sc) || | ||
742 | !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { | ||
743 | /* | ||
744 | * This is slightly racy - end_page_writeback() | ||
745 | * might have just cleared PageReclaim, then | ||
746 | * setting PageReclaim here ends up interpreted | ||
747 | * as PageReadahead - but that does not matter | ||
748 | * enough to care. What we do want is for this | ||
749 | * page to have PageReclaim set next time memcg | ||
750 | * reclaim reaches the tests above, so it will | ||
751 | * then wait_on_page_writeback() to avoid OOM; | ||
752 | * and it's also appropriate in global reclaim. | ||
753 | */ | ||
754 | SetPageReclaim(page); | ||
755 | nr_writeback++; | ||
756 | goto keep_locked; | ||
757 | } | ||
758 | wait_on_page_writeback(page); | ||
726 | } | 759 | } |
727 | 760 | ||
728 | references = page_check_references(page, sc); | 761 | references = page_check_references(page, sc); |
@@ -921,6 +954,7 @@ keep: | |||
921 | 954 | ||
922 | list_splice(&ret_pages, page_list); | 955 | list_splice(&ret_pages, page_list); |
923 | count_vm_events(PGACTIVATE, pgactivate); | 956 | count_vm_events(PGACTIVATE, pgactivate); |
957 | mem_cgroup_uncharge_end(); | ||
924 | *ret_nr_dirty += nr_dirty; | 958 | *ret_nr_dirty += nr_dirty; |
925 | *ret_nr_writeback += nr_writeback; | 959 | *ret_nr_writeback += nr_writeback; |
926 | return nr_reclaimed; | 960 | return nr_reclaimed; |
@@ -2112,6 +2146,83 @@ out: | |||
2112 | return 0; | 2146 | return 0; |
2113 | } | 2147 | } |
2114 | 2148 | ||
2149 | static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | ||
2150 | { | ||
2151 | struct zone *zone; | ||
2152 | unsigned long pfmemalloc_reserve = 0; | ||
2153 | unsigned long free_pages = 0; | ||
2154 | int i; | ||
2155 | bool wmark_ok; | ||
2156 | |||
2157 | for (i = 0; i <= ZONE_NORMAL; i++) { | ||
2158 | zone = &pgdat->node_zones[i]; | ||
2159 | pfmemalloc_reserve += min_wmark_pages(zone); | ||
2160 | free_pages += zone_page_state(zone, NR_FREE_PAGES); | ||
2161 | } | ||
2162 | |||
2163 | wmark_ok = free_pages > pfmemalloc_reserve / 2; | ||
2164 | |||
2165 | /* kswapd must be awake if processes are being throttled */ | ||
2166 | if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { | ||
2167 | pgdat->classzone_idx = min(pgdat->classzone_idx, | ||
2168 | (enum zone_type)ZONE_NORMAL); | ||
2169 | wake_up_interruptible(&pgdat->kswapd_wait); | ||
2170 | } | ||
2171 | |||
2172 | return wmark_ok; | ||
2173 | } | ||
2174 | |||
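To make the threshold concrete, here is a standalone sketch with invented per-zone numbers; it reproduces only the free_pages > pfmemalloc_reserve / 2 test above, summed over the zones up to ZONE_NORMAL:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	/* Invented example values for two zones (e.g. DMA and NORMAL). */
	unsigned long min_wmark[] = { 128, 1024 };	/* per-zone min watermarks */
	unsigned long nr_free[]   = {  90,  500 };	/* per-zone NR_FREE_PAGES  */
	unsigned long reserve = 0, free_pages = 0;
	bool wmark_ok;
	int i;

	for (i = 0; i < 2; i++) {
		reserve    += min_wmark[i];
		free_pages += nr_free[i];
	}
	wmark_ok = free_pages > reserve / 2;	/* 590 > 576, so no throttling */
	printf("reserve=%lu free=%lu wmark_ok=%d\n", reserve, free_pages, wmark_ok);
	return 0;
}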
2175 | /* | ||
2176 | * Throttle direct reclaimers if backing storage is backed by the network | ||
2177 | * and the PFMEMALLOC reserve for the preferred node is getting dangerously | ||
2178 | * depleted. kswapd will continue to make progress and wake the processes | ||
2179 | * when the low watermark is reached. | ||
2180 | */ | ||
2181 | static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | ||
2182 | nodemask_t *nodemask) | ||
2183 | { | ||
2184 | struct zone *zone; | ||
2185 | int high_zoneidx = gfp_zone(gfp_mask); | ||
2186 | pg_data_t *pgdat; | ||
2187 | |||
2188 | /* | ||
2189 | * Kernel threads should not be throttled as they may be indirectly | ||
2190 | * responsible for cleaning pages necessary for reclaim to make forward | ||
2191 | * progress. kjournald for example may enter direct reclaim while | ||
2192 | * committing a transaction where throttling it could forcing other | ||
2193 | * processes to block on log_wait_commit(). | ||
2194 | */ | ||
2195 | if (current->flags & PF_KTHREAD) | ||
2196 | return; | ||
2197 | |||
2198 | /* Check if the pfmemalloc reserves are ok */ | ||
2199 | first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); | ||
2200 | pgdat = zone->zone_pgdat; | ||
2201 | if (pfmemalloc_watermark_ok(pgdat)) | ||
2202 | return; | ||
2203 | |||
2204 | /* Account for the throttling */ | ||
2205 | count_vm_event(PGSCAN_DIRECT_THROTTLE); | ||
2206 | |||
2207 | /* | ||
2208 | * If the caller cannot enter the filesystem, it's possible that it | ||
2209 | * is due to the caller holding an FS lock or performing a journal | ||
2210 | * transaction in the case of a filesystem like ext[3|4]. In this case, | ||
2211 | * it is not safe to block on pfmemalloc_wait as kswapd could be | ||
2212 | * blocked waiting on the same lock. Instead, throttle for up to a | ||
2213 | * second before continuing. | ||
2214 | */ | ||
2215 | if (!(gfp_mask & __GFP_FS)) { | ||
2216 | wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, | ||
2217 | pfmemalloc_watermark_ok(pgdat), HZ); | ||
2218 | return; | ||
2219 | } | ||
2220 | |||
2221 | /* Throttle until kswapd wakes the process */ | ||
2222 | wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, | ||
2223 | pfmemalloc_watermark_ok(pgdat)); | ||
2224 | } | ||
2225 | |||
2115 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | 2226 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
2116 | gfp_t gfp_mask, nodemask_t *nodemask) | 2227 | gfp_t gfp_mask, nodemask_t *nodemask) |
2117 | { | 2228 | { |
@@ -2131,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2131 | .gfp_mask = sc.gfp_mask, | 2242 | .gfp_mask = sc.gfp_mask, |
2132 | }; | 2243 | }; |
2133 | 2244 | ||
2245 | throttle_direct_reclaim(gfp_mask, zonelist, nodemask); | ||
2246 | |||
2247 | /* | ||
2248 | * Do not enter reclaim if fatal signal is pending. 1 is returned so | ||
2249 | * that the page allocator does not consider triggering OOM | ||
2250 | */ | ||
2251 | if (fatal_signal_pending(current)) | ||
2252 | return 1; | ||
2253 | |||
2134 | trace_mm_vmscan_direct_reclaim_begin(order, | 2254 | trace_mm_vmscan_direct_reclaim_begin(order, |
2135 | sc.may_writepage, | 2255 | sc.may_writepage, |
2136 | gfp_mask); | 2256 | gfp_mask); |
@@ -2142,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2142 | return nr_reclaimed; | 2262 | return nr_reclaimed; |
2143 | } | 2263 | } |
2144 | 2264 | ||
2145 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 2265 | #ifdef CONFIG_MEMCG |
2146 | 2266 | ||
2147 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | 2267 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, |
2148 | gfp_t gfp_mask, bool noswap, | 2268 | gfp_t gfp_mask, bool noswap, |
@@ -2275,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | |||
2275 | return balanced_pages >= (present_pages >> 2); | 2395 | return balanced_pages >= (present_pages >> 2); |
2276 | } | 2396 | } |
2277 | 2397 | ||
2278 | /* is kswapd sleeping prematurely? */ | 2398 | /* |
2279 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | 2399 | * Prepare kswapd for sleeping. This verifies that there are no processes |
2400 | * waiting in throttle_direct_reclaim() and that watermarks have been met. | ||
2401 | * | ||
2402 | * Returns true if kswapd is ready to sleep | ||
2403 | */ | ||
2404 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | ||
2280 | int classzone_idx) | 2405 | int classzone_idx) |
2281 | { | 2406 | { |
2282 | int i; | 2407 | int i; |
@@ -2285,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2285 | 2410 | ||
2286 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2411 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
2287 | if (remaining) | 2412 | if (remaining) |
2288 | return true; | 2413 | return false; |
2414 | |||
2415 | /* | ||
2416 | * There is a potential race between when kswapd checks its watermarks | ||
2417 | * and a process gets throttled. There is also a potential race if | ||
2418 | * processes get throttled, kswapd wakes, a large process exits, thereby | ||
2419 | * balancing the zones, which causes kswapd to miss a wakeup. If kswapd | ||
2420 | * is going to sleep, no process should be sleeping on pfmemalloc_wait, | ||
2421 | * so wake them now if necessary. If required, processes will wake | ||
2422 | * kswapd and get throttled again | ||
2423 | */ | ||
2424 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) { | ||
2425 | wake_up(&pgdat->pfmemalloc_wait); | ||
2426 | return false; | ||
2427 | } | ||
2289 | 2428 | ||
2290 | /* Check the watermark levels */ | 2429 | /* Check the watermark levels */ |
2291 | for (i = 0; i <= classzone_idx; i++) { | 2430 | for (i = 0; i <= classzone_idx; i++) { |
@@ -2318,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2318 | * must be balanced | 2457 | * must be balanced |
2319 | */ | 2458 | */ |
2320 | if (order) | 2459 | if (order) |
2321 | return !pgdat_balanced(pgdat, balanced, classzone_idx); | 2460 | return pgdat_balanced(pgdat, balanced, classzone_idx); |
2322 | else | 2461 | else |
2323 | return !all_zones_ok; | 2462 | return all_zones_ok; |
2324 | } | 2463 | } |
2325 | 2464 | ||
2326 | /* | 2465 | /* |
@@ -2546,6 +2685,16 @@ loop_again: | |||
2546 | } | 2685 | } |
2547 | 2686 | ||
2548 | } | 2687 | } |
2688 | |||
2689 | /* | ||
2690 | * If the low watermark is met there is no need for processes | ||
2691 | * to be throttled on pfmemalloc_wait as they should now be | ||
2692 | * able to safely make forward progress. Wake them. | ||
2693 | */ | ||
2694 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && | ||
2695 | pfmemalloc_watermark_ok(pgdat)) | ||
2696 | wake_up(&pgdat->pfmemalloc_wait); | ||
2697 | |||
2549 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) | 2698 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
2550 | break; /* kswapd: all done */ | 2699 | break; /* kswapd: all done */ |
2551 | /* | 2700 | /* |
@@ -2647,7 +2796,7 @@ out: | |||
2647 | } | 2796 | } |
2648 | 2797 | ||
2649 | /* | 2798 | /* |
2650 | * Return the order we were reclaiming at so sleeping_prematurely() | 2799 | * Return the order we were reclaiming at so prepare_kswapd_sleep() |
2651 | * makes a decision on the order we were last reclaiming at. However, | 2800 | * makes a decision on the order we were last reclaiming at. However, |
2652 | * if another caller entered the allocator slow path while kswapd | 2801 | * if another caller entered the allocator slow path while kswapd |
2653 | * was awake, order will remain at the higher level | 2802 | * was awake, order will remain at the higher level |
@@ -2667,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2667 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2816 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
2668 | 2817 | ||
2669 | /* Try to sleep for a short interval */ | 2818 | /* Try to sleep for a short interval */ |
2670 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2819 | if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { |
2671 | remaining = schedule_timeout(HZ/10); | 2820 | remaining = schedule_timeout(HZ/10); |
2672 | finish_wait(&pgdat->kswapd_wait, &wait); | 2821 | finish_wait(&pgdat->kswapd_wait, &wait); |
2673 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2822 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
@@ -2677,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2677 | * After a short sleep, check if it was a premature sleep. If not, then | 2826 | * After a short sleep, check if it was a premature sleep. If not, then |
2678 | * go fully to sleep until explicitly woken up. | 2827 | * go fully to sleep until explicitly woken up. |
2679 | */ | 2828 | */ |
2680 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2829 | if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { |
2681 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | 2830 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); |
2682 | 2831 | ||
2683 | /* | 2832 | /* |
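The throttling decision above comes down to simple arithmetic: the min watermarks of every zone up to ZONE_NORMAL are summed as the pfmemalloc reserve, and a direct reclaimer is throttled once the free pages in those zones drop below half of that sum. A minimal user-space sketch of the same check, with invented zone values used purely for illustration:

#include <stdbool.h>
#include <stdio.h>

/* Invented per-zone samples, for illustration only (not kernel data). */
struct zone_sample {
	unsigned long min_wmark;	/* min watermark, in pages */
	unsigned long nr_free;		/* free pages in the zone */
};

static bool watermark_ok_sketch(const struct zone_sample *z, int nr_zones)
{
	unsigned long reserve = 0, free_pages = 0;
	int i;

	for (i = 0; i < nr_zones; i++) {	/* zones up to ZONE_NORMAL */
		reserve += z[i].min_wmark;
		free_pages += z[i].nr_free;
	}

	/* Same test as pfmemalloc_watermark_ok(): free > reserve / 2 */
	return free_pages > reserve / 2;
}

int main(void)
{
	struct zone_sample zones[] = { { 128, 40 }, { 1024, 500 } };

	printf("wmark_ok = %d\n", watermark_ok_sketch(zones, 2));
	return 0;
}

With these numbers the reserve is 1152 pages and only 540 are free, so the check fails and the reclaimer would sleep on pfmemalloc_wait until kswapd refills the reserve.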
diff --git a/mm/vmstat.c b/mm/vmstat.c index 1bbbbd9776ad..df7a6748231d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -745,6 +745,7 @@ const char * const vmstat_text[] = { | |||
745 | TEXTS_FOR_ZONES("pgsteal_direct") | 745 | TEXTS_FOR_ZONES("pgsteal_direct") |
746 | TEXTS_FOR_ZONES("pgscan_kswapd") | 746 | TEXTS_FOR_ZONES("pgscan_kswapd") |
747 | TEXTS_FOR_ZONES("pgscan_direct") | 747 | TEXTS_FOR_ZONES("pgscan_direct") |
748 | "pgscan_direct_throttle", | ||
748 | 749 | ||
749 | #ifdef CONFIG_NUMA | 750 | #ifdef CONFIG_NUMA |
750 | "zone_reclaim_failed", | 751 | "zone_reclaim_failed", |
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 78f1cdad5b33..095259f83902 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c | |||
@@ -141,7 +141,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | |||
141 | err = sk_filter(sk, skb); | 141 | err = sk_filter(sk, skb); |
142 | if (err) | 142 | if (err) |
143 | return err; | 143 | return err; |
144 | if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) { | 144 | if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) { |
145 | set_rx_flow_off(cf_sk); | 145 | set_rx_flow_off(cf_sk); |
146 | net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n"); | 146 | net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n"); |
147 | caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); | 147 | caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); |
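sk_rmem_schedule() gains an skb argument throughout this series so receive-side accounting can see whether the buffer was allocated from the pfmemalloc reserves. The header change itself is not part of this hunk; the sketch below is only a plausible shape of the helper, assuming the usual __sk_mem_schedule() accounting, not a quote of it:

/* Sketch: an skb backed by the emergency reserves is always accepted, so a
 * SOCK_MEMALLOC socket is never refused memory that was already committed
 * to keep reclaim moving. */
static inline bool sk_rmem_schedule_sketch(struct sock *sk, struct sk_buff *skb,
					   unsigned int size)
{
	if (!sk_has_account(sk))	/* protocol does no memory accounting */
		return true;

	return size <= sk->sk_forward_alloc ||
	       __sk_mem_schedule(sk, size, SK_MEM_RECV) ||
	       skb_pfmemalloc(skb);
}

The extra skb_pfmemalloc() test is what lets the caif, sock, tcp and sctp call sites in this patch pass the skb through while still enforcing the normal rmem limits for ordinary traffic.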
diff --git a/net/core/dev.c b/net/core/dev.c index c8569f826b71..0cb3fe8d8e72 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -3156,6 +3156,23 @@ void netdev_rx_handler_unregister(struct net_device *dev) | |||
3156 | } | 3156 | } |
3157 | EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); | 3157 | EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); |
3158 | 3158 | ||
3159 | /* | ||
3160 | * Limit the use of PFMEMALLOC reserves to those protocols that implement | ||
3161 | * the special handling of PFMEMALLOC skbs. | ||
3162 | */ | ||
3163 | static bool skb_pfmemalloc_protocol(struct sk_buff *skb) | ||
3164 | { | ||
3165 | switch (skb->protocol) { | ||
3166 | case __constant_htons(ETH_P_ARP): | ||
3167 | case __constant_htons(ETH_P_IP): | ||
3168 | case __constant_htons(ETH_P_IPV6): | ||
3169 | case __constant_htons(ETH_P_8021Q): | ||
3170 | return true; | ||
3171 | default: | ||
3172 | return false; | ||
3173 | } | ||
3174 | } | ||
3175 | |||
3159 | static int __netif_receive_skb(struct sk_buff *skb) | 3176 | static int __netif_receive_skb(struct sk_buff *skb) |
3160 | { | 3177 | { |
3161 | struct packet_type *ptype, *pt_prev; | 3178 | struct packet_type *ptype, *pt_prev; |
@@ -3165,14 +3182,27 @@ static int __netif_receive_skb(struct sk_buff *skb) | |||
3165 | bool deliver_exact = false; | 3182 | bool deliver_exact = false; |
3166 | int ret = NET_RX_DROP; | 3183 | int ret = NET_RX_DROP; |
3167 | __be16 type; | 3184 | __be16 type; |
3185 | unsigned long pflags = current->flags; | ||
3168 | 3186 | ||
3169 | net_timestamp_check(!netdev_tstamp_prequeue, skb); | 3187 | net_timestamp_check(!netdev_tstamp_prequeue, skb); |
3170 | 3188 | ||
3171 | trace_netif_receive_skb(skb); | 3189 | trace_netif_receive_skb(skb); |
3172 | 3190 | ||
3191 | /* | ||
3192 | * PFMEMALLOC skbs are special, they should | ||
3193 | * - be delivered to SOCK_MEMALLOC sockets only | ||
3194 | * - stay away from userspace | ||
3195 | * - have bounded memory usage | ||
3196 | * | ||
3197 | * Use PF_MEMALLOC as this saves us from propagating the allocation | ||
3198 | * context down to all allocation sites. | ||
3199 | */ | ||
3200 | if (sk_memalloc_socks() && skb_pfmemalloc(skb)) | ||
3201 | current->flags |= PF_MEMALLOC; | ||
3202 | |||
3173 | /* if we've gotten here through NAPI, check netpoll */ | 3203 | /* if we've gotten here through NAPI, check netpoll */ |
3174 | if (netpoll_receive_skb(skb)) | 3204 | if (netpoll_receive_skb(skb)) |
3175 | return NET_RX_DROP; | 3205 | goto out; |
3176 | 3206 | ||
3177 | orig_dev = skb->dev; | 3207 | orig_dev = skb->dev; |
3178 | 3208 | ||
@@ -3192,7 +3222,7 @@ another_round: | |||
3192 | if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { | 3222 | if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { |
3193 | skb = vlan_untag(skb); | 3223 | skb = vlan_untag(skb); |
3194 | if (unlikely(!skb)) | 3224 | if (unlikely(!skb)) |
3195 | goto out; | 3225 | goto unlock; |
3196 | } | 3226 | } |
3197 | 3227 | ||
3198 | #ifdef CONFIG_NET_CLS_ACT | 3228 | #ifdef CONFIG_NET_CLS_ACT |
@@ -3202,6 +3232,9 @@ another_round: | |||
3202 | } | 3232 | } |
3203 | #endif | 3233 | #endif |
3204 | 3234 | ||
3235 | if (sk_memalloc_socks() && skb_pfmemalloc(skb)) | ||
3236 | goto skip_taps; | ||
3237 | |||
3205 | list_for_each_entry_rcu(ptype, &ptype_all, list) { | 3238 | list_for_each_entry_rcu(ptype, &ptype_all, list) { |
3206 | if (!ptype->dev || ptype->dev == skb->dev) { | 3239 | if (!ptype->dev || ptype->dev == skb->dev) { |
3207 | if (pt_prev) | 3240 | if (pt_prev) |
@@ -3210,13 +3243,18 @@ another_round: | |||
3210 | } | 3243 | } |
3211 | } | 3244 | } |
3212 | 3245 | ||
3246 | skip_taps: | ||
3213 | #ifdef CONFIG_NET_CLS_ACT | 3247 | #ifdef CONFIG_NET_CLS_ACT |
3214 | skb = handle_ing(skb, &pt_prev, &ret, orig_dev); | 3248 | skb = handle_ing(skb, &pt_prev, &ret, orig_dev); |
3215 | if (!skb) | 3249 | if (!skb) |
3216 | goto out; | 3250 | goto unlock; |
3217 | ncls: | 3251 | ncls: |
3218 | #endif | 3252 | #endif |
3219 | 3253 | ||
3254 | if (sk_memalloc_socks() && skb_pfmemalloc(skb) | ||
3255 | && !skb_pfmemalloc_protocol(skb)) | ||
3256 | goto drop; | ||
3257 | |||
3220 | rx_handler = rcu_dereference(skb->dev->rx_handler); | 3258 | rx_handler = rcu_dereference(skb->dev->rx_handler); |
3221 | if (vlan_tx_tag_present(skb)) { | 3259 | if (vlan_tx_tag_present(skb)) { |
3222 | if (pt_prev) { | 3260 | if (pt_prev) { |
@@ -3226,7 +3264,7 @@ ncls: | |||
3226 | if (vlan_do_receive(&skb, !rx_handler)) | 3264 | if (vlan_do_receive(&skb, !rx_handler)) |
3227 | goto another_round; | 3265 | goto another_round; |
3228 | else if (unlikely(!skb)) | 3266 | else if (unlikely(!skb)) |
3229 | goto out; | 3267 | goto unlock; |
3230 | } | 3268 | } |
3231 | 3269 | ||
3232 | if (rx_handler) { | 3270 | if (rx_handler) { |
@@ -3236,7 +3274,7 @@ ncls: | |||
3236 | } | 3274 | } |
3237 | switch (rx_handler(&skb)) { | 3275 | switch (rx_handler(&skb)) { |
3238 | case RX_HANDLER_CONSUMED: | 3276 | case RX_HANDLER_CONSUMED: |
3239 | goto out; | 3277 | goto unlock; |
3240 | case RX_HANDLER_ANOTHER: | 3278 | case RX_HANDLER_ANOTHER: |
3241 | goto another_round; | 3279 | goto another_round; |
3242 | case RX_HANDLER_EXACT: | 3280 | case RX_HANDLER_EXACT: |
@@ -3269,6 +3307,7 @@ ncls: | |||
3269 | else | 3307 | else |
3270 | ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); | 3308 | ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); |
3271 | } else { | 3309 | } else { |
3310 | drop: | ||
3272 | atomic_long_inc(&skb->dev->rx_dropped); | 3311 | atomic_long_inc(&skb->dev->rx_dropped); |
3273 | kfree_skb(skb); | 3312 | kfree_skb(skb); |
3274 | /* Jamal, now you will not able to escape explaining | 3313 | /* Jamal, now you will not able to escape explaining |
@@ -3277,8 +3316,10 @@ ncls: | |||
3277 | ret = NET_RX_DROP; | 3316 | ret = NET_RX_DROP; |
3278 | } | 3317 | } |
3279 | 3318 | ||
3280 | out: | 3319 | unlock: |
3281 | rcu_read_unlock(); | 3320 | rcu_read_unlock(); |
3321 | out: | ||
3322 | tsk_restore_flags(current, pflags, PF_MEMALLOC); | ||
3282 | return ret; | 3323 | return ret; |
3283 | } | 3324 | } |
3284 | 3325 | ||
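tsk_restore_flags() is introduced elsewhere in the series; the PF_MEMALLOC scoping in __netif_receive_skb() above only works if the helper restores just the named flag bits to whatever they were on entry, leaving the rest of current->flags untouched. A sketch of that assumed behaviour:

/* Assumed behaviour of the helper used above: restore only the bits in
 * @flags from the snapshot in @orig_flags, leave all other flags alone. */
static inline void tsk_restore_flags_sketch(struct task_struct *task,
					    unsigned long orig_flags,
					    unsigned long flags)
{
	task->flags &= ~flags;			/* clear the scoped bits */
	task->flags |= orig_flags & flags;	/* put back their saved state */
}

This way a task that already ran with PF_MEMALLOC keeps it after the receive path returns, and one that did not has it cleared again.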
diff --git a/net/core/filter.c b/net/core/filter.c index d4ce2dc712e3..907efd27ec77 100644 --- a/net/core/filter.c +++ b/net/core/filter.c | |||
@@ -83,6 +83,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) | |||
83 | int err; | 83 | int err; |
84 | struct sk_filter *filter; | 84 | struct sk_filter *filter; |
85 | 85 | ||
86 | /* | ||
87 | * If the skb was allocated from pfmemalloc reserves, only | ||
88 | * allow SOCK_MEMALLOC sockets to use it as this socket is | ||
89 | * helping free memory | ||
90 | */ | ||
91 | if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) | ||
92 | return -ENOMEM; | ||
93 | |||
86 | err = security_sock_rcv_skb(sk, skb); | 94 | err = security_sock_rcv_skb(sk, skb); |
87 | if (err) | 95 | if (err) |
88 | return err; | 96 | return err; |
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 368f65c15e4f..fe00d1208167 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c | |||
@@ -145,6 +145,43 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) | |||
145 | BUG(); | 145 | BUG(); |
146 | } | 146 | } |
147 | 147 | ||
148 | |||
149 | /* | ||
150 | * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells | ||
151 | * the caller if emergency pfmemalloc reserves are being used. If they are and | ||
152 | * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves | ||
153 | * may be used. Otherwise, the packet data may be discarded until enough | ||
154 | * memory is free | ||
155 | */ | ||
156 | #define kmalloc_reserve(size, gfp, node, pfmemalloc) \ | ||
157 | __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc) | ||
158 | void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip, | ||
159 | bool *pfmemalloc) | ||
160 | { | ||
161 | void *obj; | ||
162 | bool ret_pfmemalloc = false; | ||
163 | |||
164 | /* | ||
165 | * Try a regular allocation; if that fails and we're not entitled | ||
166 | * to the reserves, fail. | ||
167 | */ | ||
168 | obj = kmalloc_node_track_caller(size, | ||
169 | flags | __GFP_NOMEMALLOC | __GFP_NOWARN, | ||
170 | node); | ||
171 | if (obj || !(gfp_pfmemalloc_allowed(flags))) | ||
172 | goto out; | ||
173 | |||
174 | /* Try again but now we are using pfmemalloc reserves */ | ||
175 | ret_pfmemalloc = true; | ||
176 | obj = kmalloc_node_track_caller(size, flags, node); | ||
177 | |||
178 | out: | ||
179 | if (pfmemalloc) | ||
180 | *pfmemalloc = ret_pfmemalloc; | ||
181 | |||
182 | return obj; | ||
183 | } | ||
184 | |||
148 | /* Allocate a new skbuff. We do this ourselves so we can fill in a few | 185 | /* Allocate a new skbuff. We do this ourselves so we can fill in a few |
149 | * 'private' fields and also do memory statistics to find all the | 186 | * 'private' fields and also do memory statistics to find all the |
150 | * [BEEP] leaks. | 187 | * [BEEP] leaks. |
@@ -155,8 +192,10 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) | |||
155 | * __alloc_skb - allocate a network buffer | 192 | * __alloc_skb - allocate a network buffer |
156 | * @size: size to allocate | 193 | * @size: size to allocate |
157 | * @gfp_mask: allocation mask | 194 | * @gfp_mask: allocation mask |
158 | * @fclone: allocate from fclone cache instead of head cache | 195 | * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache |
159 | * and allocate a cloned (child) skb | 196 | * instead of head cache and allocate a cloned (child) skb. |
197 | * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for | ||
198 | * allocations in case the data is required for writeback | ||
160 | * @node: numa node to allocate memory on | 199 | * @node: numa node to allocate memory on |
161 | * | 200 | * |
162 | * Allocate a new &sk_buff. The returned buffer has no headroom and a | 201 | * Allocate a new &sk_buff. The returned buffer has no headroom and a |
@@ -167,14 +206,19 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) | |||
167 | * %GFP_ATOMIC. | 206 | * %GFP_ATOMIC. |
168 | */ | 207 | */ |
169 | struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, | 208 | struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, |
170 | int fclone, int node) | 209 | int flags, int node) |
171 | { | 210 | { |
172 | struct kmem_cache *cache; | 211 | struct kmem_cache *cache; |
173 | struct skb_shared_info *shinfo; | 212 | struct skb_shared_info *shinfo; |
174 | struct sk_buff *skb; | 213 | struct sk_buff *skb; |
175 | u8 *data; | 214 | u8 *data; |
215 | bool pfmemalloc; | ||
176 | 216 | ||
177 | cache = fclone ? skbuff_fclone_cache : skbuff_head_cache; | 217 | cache = (flags & SKB_ALLOC_FCLONE) |
218 | ? skbuff_fclone_cache : skbuff_head_cache; | ||
219 | |||
220 | if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) | ||
221 | gfp_mask |= __GFP_MEMALLOC; | ||
178 | 222 | ||
179 | /* Get the HEAD */ | 223 | /* Get the HEAD */ |
180 | skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); | 224 | skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); |
@@ -189,7 +233,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, | |||
189 | */ | 233 | */ |
190 | size = SKB_DATA_ALIGN(size); | 234 | size = SKB_DATA_ALIGN(size); |
191 | size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); | 235 | size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); |
192 | data = kmalloc_node_track_caller(size, gfp_mask, node); | 236 | data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); |
193 | if (!data) | 237 | if (!data) |
194 | goto nodata; | 238 | goto nodata; |
195 | /* kmalloc(size) might give us more room than requested. | 239 | /* kmalloc(size) might give us more room than requested. |
@@ -207,6 +251,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, | |||
207 | memset(skb, 0, offsetof(struct sk_buff, tail)); | 251 | memset(skb, 0, offsetof(struct sk_buff, tail)); |
208 | /* Account for allocated memory : skb + skb->head */ | 252 | /* Account for allocated memory : skb + skb->head */ |
209 | skb->truesize = SKB_TRUESIZE(size); | 253 | skb->truesize = SKB_TRUESIZE(size); |
254 | skb->pfmemalloc = pfmemalloc; | ||
210 | atomic_set(&skb->users, 1); | 255 | atomic_set(&skb->users, 1); |
211 | skb->head = data; | 256 | skb->head = data; |
212 | skb->data = data; | 257 | skb->data = data; |
@@ -222,7 +267,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, | |||
222 | atomic_set(&shinfo->dataref, 1); | 267 | atomic_set(&shinfo->dataref, 1); |
223 | kmemcheck_annotate_variable(shinfo->destructor_arg); | 268 | kmemcheck_annotate_variable(shinfo->destructor_arg); |
224 | 269 | ||
225 | if (fclone) { | 270 | if (flags & SKB_ALLOC_FCLONE) { |
226 | struct sk_buff *child = skb + 1; | 271 | struct sk_buff *child = skb + 1; |
227 | atomic_t *fclone_ref = (atomic_t *) (child + 1); | 272 | atomic_t *fclone_ref = (atomic_t *) (child + 1); |
228 | 273 | ||
@@ -232,6 +277,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, | |||
232 | atomic_set(fclone_ref, 1); | 277 | atomic_set(fclone_ref, 1); |
233 | 278 | ||
234 | child->fclone = SKB_FCLONE_UNAVAILABLE; | 279 | child->fclone = SKB_FCLONE_UNAVAILABLE; |
280 | child->pfmemalloc = pfmemalloc; | ||
235 | } | 281 | } |
236 | out: | 282 | out: |
237 | return skb; | 283 | return skb; |
@@ -302,14 +348,7 @@ static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); | |||
302 | 348 | ||
303 | #define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES) | 349 | #define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES) |
304 | 350 | ||
305 | /** | 351 | static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) |
306 | * netdev_alloc_frag - allocate a page fragment | ||
307 | * @fragsz: fragment size | ||
308 | * | ||
309 | * Allocates a frag from a page for receive buffer. | ||
310 | * Uses GFP_ATOMIC allocations. | ||
311 | */ | ||
312 | void *netdev_alloc_frag(unsigned int fragsz) | ||
313 | { | 352 | { |
314 | struct netdev_alloc_cache *nc; | 353 | struct netdev_alloc_cache *nc; |
315 | void *data = NULL; | 354 | void *data = NULL; |
@@ -319,7 +358,7 @@ void *netdev_alloc_frag(unsigned int fragsz) | |||
319 | nc = &__get_cpu_var(netdev_alloc_cache); | 358 | nc = &__get_cpu_var(netdev_alloc_cache); |
320 | if (unlikely(!nc->page)) { | 359 | if (unlikely(!nc->page)) { |
321 | refill: | 360 | refill: |
322 | nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD); | 361 | nc->page = alloc_page(gfp_mask); |
323 | if (unlikely(!nc->page)) | 362 | if (unlikely(!nc->page)) |
324 | goto end; | 363 | goto end; |
325 | recycle: | 364 | recycle: |
@@ -343,6 +382,18 @@ end: | |||
343 | local_irq_restore(flags); | 382 | local_irq_restore(flags); |
344 | return data; | 383 | return data; |
345 | } | 384 | } |
385 | |||
386 | /** | ||
387 | * netdev_alloc_frag - allocate a page fragment | ||
388 | * @fragsz: fragment size | ||
389 | * | ||
390 | * Allocates a frag from a page for receive buffer. | ||
391 | * Uses GFP_ATOMIC allocations. | ||
392 | */ | ||
393 | void *netdev_alloc_frag(unsigned int fragsz) | ||
394 | { | ||
395 | return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); | ||
396 | } | ||
346 | EXPORT_SYMBOL(netdev_alloc_frag); | 397 | EXPORT_SYMBOL(netdev_alloc_frag); |
347 | 398 | ||
348 | /** | 399 | /** |
@@ -366,7 +417,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, | |||
366 | SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); | 417 | SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); |
367 | 418 | ||
368 | if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { | 419 | if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { |
369 | void *data = netdev_alloc_frag(fragsz); | 420 | void *data; |
421 | |||
422 | if (sk_memalloc_socks()) | ||
423 | gfp_mask |= __GFP_MEMALLOC; | ||
424 | |||
425 | data = __netdev_alloc_frag(fragsz, gfp_mask); | ||
370 | 426 | ||
371 | if (likely(data)) { | 427 | if (likely(data)) { |
372 | skb = build_skb(data, fragsz); | 428 | skb = build_skb(data, fragsz); |
@@ -374,7 +430,8 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, | |||
374 | put_page(virt_to_head_page(data)); | 430 | put_page(virt_to_head_page(data)); |
375 | } | 431 | } |
376 | } else { | 432 | } else { |
377 | skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); | 433 | skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, |
434 | SKB_ALLOC_RX, NUMA_NO_NODE); | ||
378 | } | 435 | } |
379 | if (likely(skb)) { | 436 | if (likely(skb)) { |
380 | skb_reserve(skb, NET_SKB_PAD); | 437 | skb_reserve(skb, NET_SKB_PAD); |
@@ -656,6 +713,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) | |||
656 | #if IS_ENABLED(CONFIG_IP_VS) | 713 | #if IS_ENABLED(CONFIG_IP_VS) |
657 | new->ipvs_property = old->ipvs_property; | 714 | new->ipvs_property = old->ipvs_property; |
658 | #endif | 715 | #endif |
716 | new->pfmemalloc = old->pfmemalloc; | ||
659 | new->protocol = old->protocol; | 717 | new->protocol = old->protocol; |
660 | new->mark = old->mark; | 718 | new->mark = old->mark; |
661 | new->skb_iif = old->skb_iif; | 719 | new->skb_iif = old->skb_iif; |
@@ -814,6 +872,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) | |||
814 | n->fclone = SKB_FCLONE_CLONE; | 872 | n->fclone = SKB_FCLONE_CLONE; |
815 | atomic_inc(fclone_ref); | 873 | atomic_inc(fclone_ref); |
816 | } else { | 874 | } else { |
875 | if (skb_pfmemalloc(skb)) | ||
876 | gfp_mask |= __GFP_MEMALLOC; | ||
877 | |||
817 | n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); | 878 | n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); |
818 | if (!n) | 879 | if (!n) |
819 | return NULL; | 880 | return NULL; |
@@ -850,6 +911,13 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) | |||
850 | skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; | 911 | skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; |
851 | } | 912 | } |
852 | 913 | ||
914 | static inline int skb_alloc_rx_flag(const struct sk_buff *skb) | ||
915 | { | ||
916 | if (skb_pfmemalloc(skb)) | ||
917 | return SKB_ALLOC_RX; | ||
918 | return 0; | ||
919 | } | ||
920 | |||
853 | /** | 921 | /** |
854 | * skb_copy - create private copy of an sk_buff | 922 | * skb_copy - create private copy of an sk_buff |
855 | * @skb: buffer to copy | 923 | * @skb: buffer to copy |
@@ -871,7 +939,8 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) | |||
871 | { | 939 | { |
872 | int headerlen = skb_headroom(skb); | 940 | int headerlen = skb_headroom(skb); |
873 | unsigned int size = skb_end_offset(skb) + skb->data_len; | 941 | unsigned int size = skb_end_offset(skb) + skb->data_len; |
874 | struct sk_buff *n = alloc_skb(size, gfp_mask); | 942 | struct sk_buff *n = __alloc_skb(size, gfp_mask, |
943 | skb_alloc_rx_flag(skb), NUMA_NO_NODE); | ||
875 | 944 | ||
876 | if (!n) | 945 | if (!n) |
877 | return NULL; | 946 | return NULL; |
@@ -906,7 +975,8 @@ EXPORT_SYMBOL(skb_copy); | |||
906 | struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) | 975 | struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) |
907 | { | 976 | { |
908 | unsigned int size = skb_headlen(skb) + headroom; | 977 | unsigned int size = skb_headlen(skb) + headroom; |
909 | struct sk_buff *n = alloc_skb(size, gfp_mask); | 978 | struct sk_buff *n = __alloc_skb(size, gfp_mask, |
979 | skb_alloc_rx_flag(skb), NUMA_NO_NODE); | ||
910 | 980 | ||
911 | if (!n) | 981 | if (!n) |
912 | goto out; | 982 | goto out; |
@@ -979,8 +1049,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, | |||
979 | 1049 | ||
980 | size = SKB_DATA_ALIGN(size); | 1050 | size = SKB_DATA_ALIGN(size); |
981 | 1051 | ||
982 | data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), | 1052 | if (skb_pfmemalloc(skb)) |
983 | gfp_mask); | 1053 | gfp_mask |= __GFP_MEMALLOC; |
1054 | data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), | ||
1055 | gfp_mask, NUMA_NO_NODE, NULL); | ||
984 | if (!data) | 1056 | if (!data) |
985 | goto nodata; | 1057 | goto nodata; |
986 | size = SKB_WITH_OVERHEAD(ksize(data)); | 1058 | size = SKB_WITH_OVERHEAD(ksize(data)); |
@@ -1092,8 +1164,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, | |||
1092 | /* | 1164 | /* |
1093 | * Allocate the copy buffer | 1165 | * Allocate the copy buffer |
1094 | */ | 1166 | */ |
1095 | struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, | 1167 | struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, |
1096 | gfp_mask); | 1168 | gfp_mask, skb_alloc_rx_flag(skb), |
1169 | NUMA_NO_NODE); | ||
1097 | int oldheadroom = skb_headroom(skb); | 1170 | int oldheadroom = skb_headroom(skb); |
1098 | int head_copy_len, head_copy_off; | 1171 | int head_copy_len, head_copy_off; |
1099 | int off; | 1172 | int off; |
@@ -2775,8 +2848,9 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) | |||
2775 | skb_release_head_state(nskb); | 2848 | skb_release_head_state(nskb); |
2776 | __skb_push(nskb, doffset); | 2849 | __skb_push(nskb, doffset); |
2777 | } else { | 2850 | } else { |
2778 | nskb = alloc_skb(hsize + doffset + headroom, | 2851 | nskb = __alloc_skb(hsize + doffset + headroom, |
2779 | GFP_ATOMIC); | 2852 | GFP_ATOMIC, skb_alloc_rx_flag(skb), |
2853 | NUMA_NO_NODE); | ||
2780 | 2854 | ||
2781 | if (unlikely(!nskb)) | 2855 | if (unlikely(!nskb)) |
2782 | goto err; | 2856 | goto err; |
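Putting the skbuff changes together: a driver receive allocation can now quietly fall back to the emergency reserves, and the resulting skb carries the pfmemalloc flag all the way to the delivery decision in __netif_receive_skb(). A hypothetical driver-side sketch, not part of this patch:

/* Hypothetical RX allocation in a driver; dev and len are whatever the
 * driver has at hand, GFP_ATOMIC is the usual interrupt-context mask. */
static struct sk_buff *rx_alloc_example(struct net_device *dev, unsigned int len)
{
	struct sk_buff *skb = __netdev_alloc_skb(dev, len, GFP_ATOMIC);

	if (!skb)
		return NULL;

	/* If the data came out of the reserves, __alloc_skb() set the flag;
	 * the stack will later deliver it to SOCK_MEMALLOC sockets only. */
	if (skb_pfmemalloc(skb))
		pr_debug("%s: rx skb backed by emergency reserves\n", dev->name);

	return skb;
}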
diff --git a/net/core/sock.c b/net/core/sock.c index 2676a88f533e..6b654b3ddfda 100644 --- a/net/core/sock.c +++ b/net/core/sock.c | |||
@@ -142,7 +142,7 @@ | |||
142 | static DEFINE_MUTEX(proto_list_mutex); | 142 | static DEFINE_MUTEX(proto_list_mutex); |
143 | static LIST_HEAD(proto_list); | 143 | static LIST_HEAD(proto_list); |
144 | 144 | ||
145 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 145 | #ifdef CONFIG_MEMCG_KMEM |
146 | int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 146 | int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
147 | { | 147 | { |
148 | struct proto *proto; | 148 | struct proto *proto; |
@@ -271,6 +271,61 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; | |||
271 | int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); | 271 | int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); |
272 | EXPORT_SYMBOL(sysctl_optmem_max); | 272 | EXPORT_SYMBOL(sysctl_optmem_max); |
273 | 273 | ||
274 | struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; | ||
275 | EXPORT_SYMBOL_GPL(memalloc_socks); | ||
276 | |||
277 | /** | ||
278 | * sk_set_memalloc - sets %SOCK_MEMALLOC | ||
279 | * @sk: socket to set it on | ||
280 | * | ||
281 | * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. | ||
282 | * It's the responsibility of the admin to adjust min_free_kbytes | ||
283 | * to meet the requirements | ||
284 | */ | ||
285 | void sk_set_memalloc(struct sock *sk) | ||
286 | { | ||
287 | sock_set_flag(sk, SOCK_MEMALLOC); | ||
288 | sk->sk_allocation |= __GFP_MEMALLOC; | ||
289 | static_key_slow_inc(&memalloc_socks); | ||
290 | } | ||
291 | EXPORT_SYMBOL_GPL(sk_set_memalloc); | ||
292 | |||
293 | void sk_clear_memalloc(struct sock *sk) | ||
294 | { | ||
295 | sock_reset_flag(sk, SOCK_MEMALLOC); | ||
296 | sk->sk_allocation &= ~__GFP_MEMALLOC; | ||
297 | static_key_slow_dec(&memalloc_socks); | ||
298 | |||
299 | /* | ||
300 | * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward | ||
301 | * progress of swapping. However, if SOCK_MEMALLOC is cleared while | ||
302 | * it has rmem allocations there is a risk that the user of the | ||
303 | * socket cannot make forward progress due to exceeding the rmem | ||
304 | * limits. By rights, sk_clear_memalloc() should only be called | ||
305 | * on sockets being torn down but warn and reset the accounting if | ||
306 | * that assumption breaks. | ||
307 | */ | ||
308 | if (WARN_ON(sk->sk_forward_alloc)) | ||
309 | sk_mem_reclaim(sk); | ||
310 | } | ||
311 | EXPORT_SYMBOL_GPL(sk_clear_memalloc); | ||
312 | |||
313 | int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) | ||
314 | { | ||
315 | int ret; | ||
316 | unsigned long pflags = current->flags; | ||
317 | |||
318 | /* these should have been dropped before queueing */ | ||
319 | BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); | ||
320 | |||
321 | current->flags |= PF_MEMALLOC; | ||
322 | ret = sk->sk_backlog_rcv(sk, skb); | ||
323 | tsk_restore_flags(current, pflags, PF_MEMALLOC); | ||
324 | |||
325 | return ret; | ||
326 | } | ||
327 | EXPORT_SYMBOL(__sk_backlog_rcv); | ||
328 | |||
274 | #if defined(CONFIG_CGROUPS) | 329 | #if defined(CONFIG_CGROUPS) |
275 | #if !defined(CONFIG_NET_CLS_CGROUP) | 330 | #if !defined(CONFIG_NET_CLS_CGROUP) |
276 | int net_cls_subsys_id = -1; | 331 | int net_cls_subsys_id = -1; |
@@ -353,7 +408,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | |||
353 | if (err) | 408 | if (err) |
354 | return err; | 409 | return err; |
355 | 410 | ||
356 | if (!sk_rmem_schedule(sk, skb->truesize)) { | 411 | if (!sk_rmem_schedule(sk, skb, skb->truesize)) { |
357 | atomic_inc(&sk->sk_drops); | 412 | atomic_inc(&sk->sk_drops); |
358 | return -ENOBUFS; | 413 | return -ENOBUFS; |
359 | } | 414 | } |
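sk_set_memalloc() and sk_clear_memalloc() are meant to be called by a transport whose socket carries swap traffic; the real callers in this series are in net/sunrpc/xprtsock.c further down. A hypothetical, heavily simplified caller for illustration:

/* Hypothetical: mark a transport's kernel socket as a reclaim channel while
 * a swap file is active on it, and unmark it afterwards. */
static void example_transport_set_swapping(struct sock *sk, bool swapping)
{
	if (swapping)
		sk_set_memalloc(sk);	/* SOCK_MEMALLOC + __GFP_MEMALLOC */
	else
		sk_clear_memalloc(sk);	/* warns if rmem is still charged */
}

As the comment in sk_clear_memalloc() notes, the flag is expected to be cleared only on sockets being torn down, or at least only once no rmem is outstanding.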
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ae2ccf2890e4..15ca63ec604e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile | |||
@@ -49,7 +49,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o | |||
49 | obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o | 49 | obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o |
50 | obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o | 50 | obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o |
51 | obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o | 51 | obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o |
52 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcontrol.o | 52 | obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o |
53 | obj-$(CONFIG_NETLABEL) += cipso_ipv4.o | 53 | obj-$(CONFIG_NETLABEL) += cipso_ipv4.o |
54 | 54 | ||
55 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ | 55 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4b6487a68279..1b5ce96707a3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -184,7 +184,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write, | |||
184 | int ret; | 184 | int ret; |
185 | unsigned long vec[3]; | 185 | unsigned long vec[3]; |
186 | struct net *net = current->nsproxy->net_ns; | 186 | struct net *net = current->nsproxy->net_ns; |
187 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 187 | #ifdef CONFIG_MEMCG_KMEM |
188 | struct mem_cgroup *memcg; | 188 | struct mem_cgroup *memcg; |
189 | #endif | 189 | #endif |
190 | 190 | ||
@@ -203,7 +203,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write, | |||
203 | if (ret) | 203 | if (ret) |
204 | return ret; | 204 | return ret; |
205 | 205 | ||
206 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 206 | #ifdef CONFIG_MEMCG_KMEM |
207 | rcu_read_lock(); | 207 | rcu_read_lock(); |
208 | memcg = mem_cgroup_from_task(current); | 208 | memcg = mem_cgroup_from_task(current); |
209 | 209 | ||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9be30b039ae3..2fd2bc9e3c64 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -4351,19 +4351,20 @@ static void tcp_ofo_queue(struct sock *sk) | |||
4351 | static bool tcp_prune_ofo_queue(struct sock *sk); | 4351 | static bool tcp_prune_ofo_queue(struct sock *sk); |
4352 | static int tcp_prune_queue(struct sock *sk); | 4352 | static int tcp_prune_queue(struct sock *sk); |
4353 | 4353 | ||
4354 | static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) | 4354 | static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, |
4355 | unsigned int size) | ||
4355 | { | 4356 | { |
4356 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || | 4357 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || |
4357 | !sk_rmem_schedule(sk, size)) { | 4358 | !sk_rmem_schedule(sk, skb, size)) { |
4358 | 4359 | ||
4359 | if (tcp_prune_queue(sk) < 0) | 4360 | if (tcp_prune_queue(sk) < 0) |
4360 | return -1; | 4361 | return -1; |
4361 | 4362 | ||
4362 | if (!sk_rmem_schedule(sk, size)) { | 4363 | if (!sk_rmem_schedule(sk, skb, size)) { |
4363 | if (!tcp_prune_ofo_queue(sk)) | 4364 | if (!tcp_prune_ofo_queue(sk)) |
4364 | return -1; | 4365 | return -1; |
4365 | 4366 | ||
4366 | if (!sk_rmem_schedule(sk, size)) | 4367 | if (!sk_rmem_schedule(sk, skb, size)) |
4367 | return -1; | 4368 | return -1; |
4368 | } | 4369 | } |
4369 | } | 4370 | } |
@@ -4418,7 +4419,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) | |||
4418 | 4419 | ||
4419 | TCP_ECN_check_ce(tp, skb); | 4420 | TCP_ECN_check_ce(tp, skb); |
4420 | 4421 | ||
4421 | if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) { | 4422 | if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { |
4422 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); | 4423 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); |
4423 | __kfree_skb(skb); | 4424 | __kfree_skb(skb); |
4424 | return; | 4425 | return; |
@@ -4552,17 +4553,17 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int | |||
4552 | 4553 | ||
4553 | int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) | 4554 | int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) |
4554 | { | 4555 | { |
4555 | struct sk_buff *skb; | 4556 | struct sk_buff *skb = NULL; |
4556 | struct tcphdr *th; | 4557 | struct tcphdr *th; |
4557 | bool fragstolen; | 4558 | bool fragstolen; |
4558 | 4559 | ||
4559 | if (tcp_try_rmem_schedule(sk, size + sizeof(*th))) | ||
4560 | goto err; | ||
4561 | |||
4562 | skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); | 4560 | skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); |
4563 | if (!skb) | 4561 | if (!skb) |
4564 | goto err; | 4562 | goto err; |
4565 | 4563 | ||
4564 | if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) | ||
4565 | goto err_free; | ||
4566 | |||
4566 | th = (struct tcphdr *)skb_put(skb, sizeof(*th)); | 4567 | th = (struct tcphdr *)skb_put(skb, sizeof(*th)); |
4567 | skb_reset_transport_header(skb); | 4568 | skb_reset_transport_header(skb); |
4568 | memset(th, 0, sizeof(*th)); | 4569 | memset(th, 0, sizeof(*th)); |
@@ -4633,7 +4634,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) | |||
4633 | if (eaten <= 0) { | 4634 | if (eaten <= 0) { |
4634 | queue_and_out: | 4635 | queue_and_out: |
4635 | if (eaten < 0 && | 4636 | if (eaten < 0 && |
4636 | tcp_try_rmem_schedule(sk, skb->truesize)) | 4637 | tcp_try_rmem_schedule(sk, skb, skb->truesize)) |
4637 | goto drop; | 4638 | goto drop; |
4638 | 4639 | ||
4639 | eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); | 4640 | eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7f91e5ac8277..42b2a6a73092 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -2633,7 +2633,7 @@ struct proto tcp_prot = { | |||
2633 | .compat_setsockopt = compat_tcp_setsockopt, | 2633 | .compat_setsockopt = compat_tcp_setsockopt, |
2634 | .compat_getsockopt = compat_tcp_getsockopt, | 2634 | .compat_getsockopt = compat_tcp_getsockopt, |
2635 | #endif | 2635 | #endif |
2636 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 2636 | #ifdef CONFIG_MEMCG_KMEM |
2637 | .init_cgroup = tcp_init_cgroup, | 2637 | .init_cgroup = tcp_init_cgroup, |
2638 | .destroy_cgroup = tcp_destroy_cgroup, | 2638 | .destroy_cgroup = tcp_destroy_cgroup, |
2639 | .proto_cgroup = tcp_proto_cgroup, | 2639 | .proto_cgroup = tcp_proto_cgroup, |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 33cd065cfbd8..3f1bcff0b10b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -2045,7 +2045,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, | |||
2045 | if (unlikely(sk->sk_state == TCP_CLOSE)) | 2045 | if (unlikely(sk->sk_state == TCP_CLOSE)) |
2046 | return; | 2046 | return; |
2047 | 2047 | ||
2048 | if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC)) | 2048 | if (tcp_write_xmit(sk, cur_mss, nonagle, 0, |
2049 | sk_gfp_atomic(sk, GFP_ATOMIC))) | ||
2049 | tcp_check_probe_timer(sk); | 2050 | tcp_check_probe_timer(sk); |
2050 | } | 2051 | } |
2051 | 2052 | ||
@@ -2666,7 +2667,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2666 | 2667 | ||
2667 | if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) | 2668 | if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) |
2668 | s_data_desired = cvp->s_data_desired; | 2669 | s_data_desired = cvp->s_data_desired; |
2669 | skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC); | 2670 | skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, |
2671 | sk_gfp_atomic(sk, GFP_ATOMIC)); | ||
2670 | if (unlikely(!skb)) { | 2672 | if (unlikely(!skb)) { |
2671 | dst_release(dst); | 2673 | dst_release(dst); |
2672 | return NULL; | 2674 | return NULL; |
@@ -3064,7 +3066,7 @@ void tcp_send_ack(struct sock *sk) | |||
3064 | * tcp_transmit_skb() will set the ownership to this | 3066 | * tcp_transmit_skb() will set the ownership to this |
3065 | * sock. | 3067 | * sock. |
3066 | */ | 3068 | */ |
3067 | buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); | 3069 | buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); |
3068 | if (buff == NULL) { | 3070 | if (buff == NULL) { |
3069 | inet_csk_schedule_ack(sk); | 3071 | inet_csk_schedule_ack(sk); |
3070 | inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; | 3072 | inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; |
@@ -3079,7 +3081,7 @@ void tcp_send_ack(struct sock *sk) | |||
3079 | 3081 | ||
3080 | /* Send it off, this clears delayed acks for us. */ | 3082 | /* Send it off, this clears delayed acks for us. */ |
3081 | TCP_SKB_CB(buff)->when = tcp_time_stamp; | 3083 | TCP_SKB_CB(buff)->when = tcp_time_stamp; |
3082 | tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); | 3084 | tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); |
3083 | } | 3085 | } |
3084 | 3086 | ||
3085 | /* This routine sends a packet with an out of date sequence | 3087 | /* This routine sends a packet with an out of date sequence |
@@ -3099,7 +3101,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) | |||
3099 | struct sk_buff *skb; | 3101 | struct sk_buff *skb; |
3100 | 3102 | ||
3101 | /* We don't queue it, tcp_transmit_skb() sets ownership. */ | 3103 | /* We don't queue it, tcp_transmit_skb() sets ownership. */ |
3102 | skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); | 3104 | skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); |
3103 | if (skb == NULL) | 3105 | if (skb == NULL) |
3104 | return -1; | 3106 | return -1; |
3105 | 3107 | ||
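sk_gfp_atomic() is defined outside this hunk; the conversions above only make sense if it folds the socket's __GFP_MEMALLOC entitlement into the atomic mask, roughly as sketched below. This is an assumption about the helper, not a quote of it:

/* Assumed shape: GFP_ATOMIC, plus access to the emergency reserves when the
 * socket itself was flagged via sk_set_memalloc(). The gfp_mask argument is
 * kept only to match the call signature used above. */
static inline gfp_t sk_gfp_atomic_sketch(struct sock *sk, gfp_t gfp_mask)
{
	return GFP_ATOMIC | (sk->sk_allocation & __GFP_MEMALLOC);
}

That keeps ACKs, probes and SYN-ACKs for a SOCK_MEMALLOC connection flowing even when the system is deep into reclaim, while ordinary sockets see plain GFP_ATOMIC as before.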
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 221224e72507..c66b90f71c9b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -1299,7 +1299,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1299 | /* Clone pktoptions received with SYN */ | 1299 | /* Clone pktoptions received with SYN */ |
1300 | newnp->pktoptions = NULL; | 1300 | newnp->pktoptions = NULL; |
1301 | if (treq->pktopts != NULL) { | 1301 | if (treq->pktopts != NULL) { |
1302 | newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC); | 1302 | newnp->pktoptions = skb_clone(treq->pktopts, |
1303 | sk_gfp_atomic(sk, GFP_ATOMIC)); | ||
1303 | consume_skb(treq->pktopts); | 1304 | consume_skb(treq->pktopts); |
1304 | treq->pktopts = NULL; | 1305 | treq->pktopts = NULL; |
1305 | if (newnp->pktoptions) | 1306 | if (newnp->pktoptions) |
@@ -1349,7 +1350,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1349 | * across. Shucks. | 1350 | * across. Shucks. |
1350 | */ | 1351 | */ |
1351 | tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newnp->daddr, | 1352 | tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newnp->daddr, |
1352 | AF_INET6, key->key, key->keylen, GFP_ATOMIC); | 1353 | AF_INET6, key->key, key->keylen, |
1354 | sk_gfp_atomic(sk, GFP_ATOMIC)); | ||
1353 | } | 1355 | } |
1354 | #endif | 1356 | #endif |
1355 | 1357 | ||
@@ -1442,7 +1444,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) | |||
1442 | --ANK (980728) | 1444 | --ANK (980728) |
1443 | */ | 1445 | */ |
1444 | if (np->rxopt.all) | 1446 | if (np->rxopt.all) |
1445 | opt_skb = skb_clone(skb, GFP_ATOMIC); | 1447 | opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC)); |
1446 | 1448 | ||
1447 | if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ | 1449 | if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ |
1448 | sock_rps_save_rxhash(sk, skb); | 1450 | sock_rps_save_rxhash(sk, skb); |
@@ -2015,7 +2017,7 @@ struct proto tcpv6_prot = { | |||
2015 | .compat_setsockopt = compat_tcp_setsockopt, | 2017 | .compat_setsockopt = compat_tcp_setsockopt, |
2016 | .compat_getsockopt = compat_tcp_getsockopt, | 2018 | .compat_getsockopt = compat_tcp_getsockopt, |
2017 | #endif | 2019 | #endif |
2018 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 2020 | #ifdef CONFIG_MEMCG_KMEM |
2019 | .proto_cgroup = tcp_proto_cgroup, | 2021 | .proto_cgroup = tcp_proto_cgroup, |
2020 | #endif | 2022 | #endif |
2021 | }; | 2023 | }; |
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 33d894776192..10c018a5b9fe 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c | |||
@@ -702,7 +702,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, | |||
702 | if (rx_count >= asoc->base.sk->sk_rcvbuf) { | 702 | if (rx_count >= asoc->base.sk->sk_rcvbuf) { |
703 | 703 | ||
704 | if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) || | 704 | if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) || |
705 | (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize))) | 705 | (!sk_rmem_schedule(asoc->base.sk, chunk->skb, |
706 | chunk->skb->truesize))) | ||
706 | goto fail; | 707 | goto fail; |
707 | } | 708 | } |
708 | 709 | ||
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 9fe8857d8d59..03d03e37a7d5 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig | |||
@@ -21,6 +21,11 @@ config SUNRPC_XPRT_RDMA | |||
21 | 21 | ||
22 | If unsure, say N. | 22 | If unsure, say N. |
23 | 23 | ||
24 | config SUNRPC_SWAP | ||
25 | bool | ||
26 | depends on SUNRPC | ||
27 | select NETVM | ||
28 | |||
24 | config RPCSEC_GSS_KRB5 | 29 | config RPCSEC_GSS_KRB5 |
25 | tristate "Secure RPC: Kerberos V mechanism" | 30 | tristate "Secure RPC: Kerberos V mechanism" |
26 | depends on SUNRPC && CRYPTO | 31 | depends on SUNRPC && CRYPTO |
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b05df36692ff..fa48c60aef23 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c | |||
@@ -717,6 +717,15 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) | |||
717 | atomic_inc(&clnt->cl_count); | 717 | atomic_inc(&clnt->cl_count); |
718 | if (clnt->cl_softrtry) | 718 | if (clnt->cl_softrtry) |
719 | task->tk_flags |= RPC_TASK_SOFT; | 719 | task->tk_flags |= RPC_TASK_SOFT; |
720 | if (sk_memalloc_socks()) { | ||
721 | struct rpc_xprt *xprt; | ||
722 | |||
723 | rcu_read_lock(); | ||
724 | xprt = rcu_dereference(clnt->cl_xprt); | ||
725 | if (xprt->swapper) | ||
726 | task->tk_flags |= RPC_TASK_SWAPPER; | ||
727 | rcu_read_unlock(); | ||
728 | } | ||
720 | /* Add to the client's list of all tasks */ | 729 | /* Add to the client's list of all tasks */ |
721 | spin_lock(&clnt->cl_lock); | 730 | spin_lock(&clnt->cl_lock); |
722 | list_add_tail(&task->tk_task, &clnt->cl_tasks); | 731 | list_add_tail(&task->tk_task, &clnt->cl_tasks); |
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 1f19aa15f89b..128494ec9a64 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c | |||
@@ -815,7 +815,10 @@ static void rpc_async_schedule(struct work_struct *work) | |||
815 | void *rpc_malloc(struct rpc_task *task, size_t size) | 815 | void *rpc_malloc(struct rpc_task *task, size_t size) |
816 | { | 816 | { |
817 | struct rpc_buffer *buf; | 817 | struct rpc_buffer *buf; |
818 | gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT; | 818 | gfp_t gfp = GFP_NOWAIT; |
819 | |||
820 | if (RPC_IS_SWAPPER(task)) | ||
821 | gfp |= __GFP_MEMALLOC; | ||
819 | 822 | ||
820 | size += sizeof(struct rpc_buffer); | 823 | size += sizeof(struct rpc_buffer); |
821 | if (size <= RPC_BUFFER_MAXSIZE) | 824 | if (size <= RPC_BUFFER_MAXSIZE) |
@@ -889,7 +892,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta | |||
889 | static struct rpc_task * | 892 | static struct rpc_task * |
890 | rpc_alloc_task(void) | 893 | rpc_alloc_task(void) |
891 | { | 894 | { |
892 | return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); | 895 | return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO); |
893 | } | 896 | } |
894 | 897 | ||
895 | /* | 898 | /* |
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 926679459e71..400567243f84 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c | |||
@@ -1930,6 +1930,45 @@ out: | |||
1930 | current->flags &= ~PF_FSTRANS; | 1930 | current->flags &= ~PF_FSTRANS; |
1931 | } | 1931 | } |
1932 | 1932 | ||
1933 | #ifdef CONFIG_SUNRPC_SWAP | ||
1934 | static void xs_set_memalloc(struct rpc_xprt *xprt) | ||
1935 | { | ||
1936 | struct sock_xprt *transport = container_of(xprt, struct sock_xprt, | ||
1937 | xprt); | ||
1938 | |||
1939 | if (xprt->swapper) | ||
1940 | sk_set_memalloc(transport->inet); | ||
1941 | } | ||
1942 | |||
1943 | /** | ||
1944 | * xs_swapper - Tag this transport as being used for swap. | ||
1945 | * @xprt: transport to tag | ||
1946 | * @enable: enable/disable | ||
1947 | * | ||
1948 | */ | ||
1949 | int xs_swapper(struct rpc_xprt *xprt, int enable) | ||
1950 | { | ||
1951 | struct sock_xprt *transport = container_of(xprt, struct sock_xprt, | ||
1952 | xprt); | ||
1953 | int err = 0; | ||
1954 | |||
1955 | if (enable) { | ||
1956 | xprt->swapper++; | ||
1957 | xs_set_memalloc(xprt); | ||
1958 | } else if (xprt->swapper) { | ||
1959 | xprt->swapper--; | ||
1960 | sk_clear_memalloc(transport->inet); | ||
1961 | } | ||
1962 | |||
1963 | return err; | ||
1964 | } | ||
1965 | EXPORT_SYMBOL_GPL(xs_swapper); | ||
1966 | #else | ||
1967 | static void xs_set_memalloc(struct rpc_xprt *xprt) | ||
1968 | { | ||
1969 | } | ||
1970 | #endif | ||
1971 | |||
1933 | static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) | 1972 | static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) |
1934 | { | 1973 | { |
1935 | struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); | 1974 | struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); |
@@ -1954,6 +1993,8 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) | |||
1954 | transport->sock = sock; | 1993 | transport->sock = sock; |
1955 | transport->inet = sk; | 1994 | transport->inet = sk; |
1956 | 1995 | ||
1996 | xs_set_memalloc(xprt); | ||
1997 | |||
1957 | write_unlock_bh(&sk->sk_callback_lock); | 1998 | write_unlock_bh(&sk->sk_callback_lock); |
1958 | } | 1999 | } |
1959 | xs_udp_do_set_buffer_size(xprt); | 2000 | xs_udp_do_set_buffer_size(xprt); |
@@ -2081,6 +2122,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) | |||
2081 | if (!xprt_bound(xprt)) | 2122 | if (!xprt_bound(xprt)) |
2082 | goto out; | 2123 | goto out; |
2083 | 2124 | ||
2125 | xs_set_memalloc(xprt); | ||
2126 | |||
2084 | /* Tell the socket layer to start connecting... */ | 2127 | /* Tell the socket layer to start connecting... */ |
2085 | xprt->stat.connect_count++; | 2128 | xprt->stat.connect_count++; |
2086 | xprt->stat.connect_start = jiffies; | 2129 | xprt->stat.connect_start = jiffies; |
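xs_swapper() is the hook the NFS swap-activation path is expected to call when a swap file reached over an RPC transport is enabled or disabled. A hypothetical call site, simplified for illustration:

/* Hypothetical: called when swapon()/swapoff() targets a file reached over
 * this RPC transport. */
static int example_rpc_swap_activate(struct rpc_xprt *xprt, int enable)
{
	/* Tags or untags the underlying socket with SOCK_MEMALLOC so swap
	 * writeback can still allocate from the reserves. */
	return xs_swapper(xprt, enable);
}

Together with the RPC_TASK_SWAPPER handling in net/sunrpc/clnt.c and sched.c above, this is what lets rpc_malloc() and the transport socket dip into __GFP_MEMALLOC for swap-out I/O.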
diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 68d82daed257..4d3fab47e643 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c | |||
@@ -274,7 +274,7 @@ static struct avc_node *avc_alloc_node(void) | |||
274 | { | 274 | { |
275 | struct avc_node *node; | 275 | struct avc_node *node; |
276 | 276 | ||
277 | node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC); | 277 | node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC|__GFP_NOMEMALLOC); |
278 | if (!node) | 278 | if (!node) |
279 | goto out; | 279 | goto out; |
280 | 280 | ||
diff --git a/tools/testing/fault-injection/failcmd.sh b/tools/testing/fault-injection/failcmd.sh index 1776e924b202..78a9ed7fecdb 100644 --- a/tools/testing/fault-injection/failcmd.sh +++ b/tools/testing/fault-injection/failcmd.sh | |||
@@ -206,7 +206,7 @@ while true; do | |||
206 | esac | 206 | esac |
207 | done | 207 | done |
208 | 208 | ||
209 | [ -z "$@" ] && exit 0 | 209 | [ -z "$1" ] && exit 0 |
210 | 210 | ||
211 | echo $oom_kill_allocating_task > /proc/sys/vm/oom_kill_allocating_task | 211 | echo $oom_kill_allocating_task > /proc/sys/vm/oom_kill_allocating_task |
212 | echo $task_filter > $FAULTATTR/task-filter | 212 | echo $task_filter > $FAULTATTR/task-filter |