author | Ingo Molnar <mingo@elte.hu> | 2008-10-28 11:26:12 -0400
committer | Ingo Molnar <mingo@elte.hu> | 2008-10-28 11:26:12 -0400
commit | 7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch)
tree | e730a4565e0318140d2fbd2f0415d18a339d7336 /mm
parent | 41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff)
parent | 0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 23
-rw-r--r-- | mm/Makefile | 6
-rw-r--r-- | mm/allocpercpu.c | 24
-rw-r--r-- | mm/bootmem.c | 948
-rw-r--r-- | mm/bounce.c | 2
-rw-r--r-- | mm/fadvise.c | 2
-rw-r--r-- | mm/filemap.c | 474
-rw-r--r-- | mm/filemap_xip.c | 70
-rw-r--r-- | mm/fremap.c | 30
-rw-r--r-- | mm/highmem.c | 6
-rw-r--r-- | mm/hugetlb.c | 1733
-rw-r--r-- | mm/internal.h | 192
-rw-r--r-- | mm/madvise.c | 4
-rw-r--r-- | mm/memcontrol.c | 772
-rw-r--r-- | mm/memory.c | 421
-rw-r--r-- | mm/memory_hotplug.c | 99
-rw-r--r-- | mm/mempolicy.c | 21
-rw-r--r-- | mm/migrate.c | 323
-rw-r--r-- | mm/mlock.c | 445
-rw-r--r-- | mm/mm_init.c | 152
-rw-r--r-- | mm/mmap.c | 267
-rw-r--r-- | mm/mmu_notifier.c | 277
-rw-r--r-- | mm/mmzone.c | 2
-rw-r--r-- | mm/mprotect.c | 9
-rw-r--r-- | mm/mremap.c | 14
-rw-r--r-- | mm/nommu.c | 69
-rw-r--r-- | mm/oom_kill.c | 6
-rw-r--r-- | mm/page-writeback.c | 34
-rw-r--r-- | mm/page_alloc.c | 304
-rw-r--r-- | mm/page_cgroup.c | 256
-rw-r--r-- | mm/page_isolation.c | 13
-rw-r--r-- | mm/pdflush.c | 6
-rw-r--r-- | mm/quicklist.c | 9
-rw-r--r-- | mm/readahead.c | 10
-rw-r--r-- | mm/rmap.c | 380
-rw-r--r-- | mm/shmem.c | 118
-rw-r--r-- | mm/shmem_acl.c | 2
-rw-r--r-- | mm/slab.c | 64
-rw-r--r-- | mm/slob.c | 28
-rw-r--r-- | mm/slub.c | 149
-rw-r--r-- | mm/sparse.c | 116
-rw-r--r-- | mm/swap.c | 183
-rw-r--r-- | mm/swap_state.c | 47
-rw-r--r-- | mm/swapfile.c | 90
-rw-r--r-- | mm/tiny-shmem.c | 27
-rw-r--r-- | mm/truncate.c | 22
-rw-r--r-- | mm/util.c | 70
-rw-r--r-- | mm/vmalloc.c | 1056
-rw-r--r-- | mm/vmscan.c | 1117
-rw-r--r-- | mm/vmstat.c | 124
50 files changed, 7869 insertions, 2747 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index c4de85285bb4..5b5790f8a816 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -101,7 +101,7 @@ config HAVE_MEMORY_PRESENT | |||
101 | # with gcc 3.4 and later. | 101 | # with gcc 3.4 and later. |
102 | # | 102 | # |
103 | config SPARSEMEM_STATIC | 103 | config SPARSEMEM_STATIC |
104 | def_bool n | 104 | bool |
105 | 105 | ||
106 | # | 106 | # |
107 | # Architecture platforms which require a two level mem_section in SPARSEMEM | 107 | # Architecture platforms which require a two level mem_section in SPARSEMEM |
@@ -113,7 +113,7 @@ config SPARSEMEM_EXTREME | |||
113 | depends on SPARSEMEM && !SPARSEMEM_STATIC | 113 | depends on SPARSEMEM && !SPARSEMEM_STATIC |
114 | 114 | ||
115 | config SPARSEMEM_VMEMMAP_ENABLE | 115 | config SPARSEMEM_VMEMMAP_ENABLE |
116 | def_bool n | 116 | bool |
117 | 117 | ||
118 | config SPARSEMEM_VMEMMAP | 118 | config SPARSEMEM_VMEMMAP |
119 | bool "Sparse Memory virtual memmap" | 119 | bool "Sparse Memory virtual memmap" |
@@ -174,7 +174,7 @@ config SPLIT_PTLOCK_CPUS | |||
174 | config MIGRATION | 174 | config MIGRATION |
175 | bool "Page migration" | 175 | bool "Page migration" |
176 | def_bool y | 176 | def_bool y |
177 | depends on NUMA | 177 | depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE |
178 | help | 178 | help |
179 | Allows the migration of the physical location of pages of processes | 179 | Allows the migration of the physical location of pages of processes |
180 | while the virtual addresses are not changed. This is useful for | 180 | while the virtual addresses are not changed. This is useful for |
@@ -187,6 +187,9 @@ config RESOURCES_64BIT | |||
187 | help | 187 | help |
188 | This option allows memory and IO resources to be 64 bit. | 188 | This option allows memory and IO resources to be 64 bit. |
189 | 189 | ||
190 | config PHYS_ADDR_T_64BIT | ||
191 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT | ||
192 | |||
190 | config ZONE_DMA_FLAG | 193 | config ZONE_DMA_FLAG |
191 | int | 194 | int |
192 | default "0" if !ZONE_DMA | 195 | default "0" if !ZONE_DMA |
@@ -205,3 +208,17 @@ config NR_QUICK | |||
205 | config VIRT_TO_BUS | 208 | config VIRT_TO_BUS |
206 | def_bool y | 209 | def_bool y |
207 | depends on !ARCH_NO_VIRT_TO_BUS | 210 | depends on !ARCH_NO_VIRT_TO_BUS |
211 | |||
212 | config UNEVICTABLE_LRU | ||
213 | bool "Add LRU list to track non-evictable pages" | ||
214 | default y | ||
215 | depends on MMU | ||
216 | help | ||
217 | Keeps unevictable pages off of the active and inactive pageout | ||
218 | lists, so kswapd will not waste CPU time or have its balancing | ||
219 | algorithms thrown off by scanning these pages. Selecting this | ||
220 | will use one page flag and increase the code size a little, | ||
221 | say Y unless you know what you are doing. | ||
222 | |||
223 | config MMU_NOTIFIER | ||
224 | bool | ||
diff --git a/mm/Makefile b/mm/Makefile index 18c143b3c46c..c06b45a1ff5f 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | |||
11 | maccess.o page_alloc.o page-writeback.o pdflush.o \ | 11 | maccess.o page_alloc.o page-writeback.o pdflush.o \ |
12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o \ |
13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
14 | page_isolation.o $(mmu-y) | 14 | page_isolation.o mm_init.o $(mmu-y) |
15 | 15 | ||
16 | obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o | 16 | obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o |
17 | obj-$(CONFIG_BOUNCE) += bounce.o | 17 | obj-$(CONFIG_BOUNCE) += bounce.o |
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o | |||
25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | 25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o |
26 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 26 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o |
27 | obj-$(CONFIG_SLOB) += slob.o | 27 | obj-$(CONFIG_SLOB) += slob.o |
28 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | ||
28 | obj-$(CONFIG_SLAB) += slab.o | 29 | obj-$(CONFIG_SLAB) += slab.o |
29 | obj-$(CONFIG_SLUB) += slub.o | 30 | obj-$(CONFIG_SLUB) += slub.o |
30 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 31 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
@@ -32,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o | |||
32 | obj-$(CONFIG_MIGRATION) += migrate.o | 33 | obj-$(CONFIG_MIGRATION) += migrate.o |
33 | obj-$(CONFIG_SMP) += allocpercpu.o | 34 | obj-$(CONFIG_SMP) += allocpercpu.o |
34 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 35 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
35 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o | 36 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o |
36 | |||
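The Makefile changes wire three new objects into the build: mm_init.o unconditionally, mmu_notifier.o behind CONFIG_MMU_NOTIFIER, and page_cgroup.o alongside memcontrol.o for the memory controller. For orientation, here is a hedged sketch of how a secondary-MMU consumer would typically attach to the notifier machinery; the callback layout and registration signature reflect the 2.6.27/2.6.28-era API as best I recall, so treat them as assumptions and check include/linux/mmu_notifier.h rather than this snippet.

```c
/*
 * Hedged sketch of an mmu_notifier consumer.  The demo_* names are
 * invented; mmu_notifier_register() and the .release callback are the
 * real API, but verify the signatures against include/linux/mmu_notifier.h.
 */
#include <linux/mmu_notifier.h>

static void demo_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* The address space is being torn down: drop secondary mappings. */
}

static const struct mmu_notifier_ops demo_ops = {
	.release = demo_release,
};

static struct mmu_notifier demo_notifier = {
	.ops = &demo_ops,
};

static int demo_attach(struct mm_struct *mm)
{
	/* Hook @mm so demo_release() fires when the mm goes away. */
	return mmu_notifier_register(&demo_notifier, mm);
}
```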
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 05f2b4009ccc..4297bc41bfd2 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c | |||
@@ -18,27 +18,28 @@ | |||
18 | * Depopulating per-cpu data for a cpu going offline would be a typical | 18 | * Depopulating per-cpu data for a cpu going offline would be a typical |
19 | * use case. You need to register a cpu hotplug handler for that purpose. | 19 | * use case. You need to register a cpu hotplug handler for that purpose. |
20 | */ | 20 | */ |
21 | void percpu_depopulate(void *__pdata, int cpu) | 21 | static void percpu_depopulate(void *__pdata, int cpu) |
22 | { | 22 | { |
23 | struct percpu_data *pdata = __percpu_disguise(__pdata); | 23 | struct percpu_data *pdata = __percpu_disguise(__pdata); |
24 | 24 | ||
25 | kfree(pdata->ptrs[cpu]); | 25 | kfree(pdata->ptrs[cpu]); |
26 | pdata->ptrs[cpu] = NULL; | 26 | pdata->ptrs[cpu] = NULL; |
27 | } | 27 | } |
28 | EXPORT_SYMBOL_GPL(percpu_depopulate); | ||
29 | 28 | ||
30 | /** | 29 | /** |
31 | * percpu_depopulate_mask - depopulate per-cpu data for some cpu's | 30 | * percpu_depopulate_mask - depopulate per-cpu data for some cpu's |
32 | * @__pdata: per-cpu data to depopulate | 31 | * @__pdata: per-cpu data to depopulate |
33 | * @mask: depopulate per-cpu data for cpu's selected through mask bits | 32 | * @mask: depopulate per-cpu data for cpu's selected through mask bits |
34 | */ | 33 | */ |
35 | void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) | 34 | static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) |
36 | { | 35 | { |
37 | int cpu; | 36 | int cpu; |
38 | for_each_cpu_mask(cpu, *mask) | 37 | for_each_cpu_mask_nr(cpu, *mask) |
39 | percpu_depopulate(__pdata, cpu); | 38 | percpu_depopulate(__pdata, cpu); |
40 | } | 39 | } |
41 | EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); | 40 | |
41 | #define percpu_depopulate_mask(__pdata, mask) \ | ||
42 | __percpu_depopulate_mask((__pdata), &(mask)) | ||
42 | 43 | ||
43 | /** | 44 | /** |
44 | * percpu_populate - populate per-cpu data for given cpu | 45 | * percpu_populate - populate per-cpu data for given cpu |
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); | |||
51 | * use case. You need to register a cpu hotplug handler for that purpose. | 52 | * use case. You need to register a cpu hotplug handler for that purpose. |
52 | * Per-cpu object is populated with zeroed buffer. | 53 | * Per-cpu object is populated with zeroed buffer. |
53 | */ | 54 | */ |
54 | void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) | 55 | static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) |
55 | { | 56 | { |
56 | struct percpu_data *pdata = __percpu_disguise(__pdata); | 57 | struct percpu_data *pdata = __percpu_disguise(__pdata); |
57 | int node = cpu_to_node(cpu); | 58 | int node = cpu_to_node(cpu); |
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) | |||
68 | pdata->ptrs[cpu] = kzalloc(size, gfp); | 69 | pdata->ptrs[cpu] = kzalloc(size, gfp); |
69 | return pdata->ptrs[cpu]; | 70 | return pdata->ptrs[cpu]; |
70 | } | 71 | } |
71 | EXPORT_SYMBOL_GPL(percpu_populate); | ||
72 | 72 | ||
73 | /** | 73 | /** |
74 | * percpu_populate_mask - populate per-cpu data for more cpu's | 74 | * percpu_populate_mask - populate per-cpu data for more cpu's |
@@ -79,14 +79,14 @@ EXPORT_SYMBOL_GPL(percpu_populate); | |||
79 | * | 79 | * |
80 | * Per-cpu objects are populated with zeroed buffers. | 80 | * Per-cpu objects are populated with zeroed buffers. |
81 | */ | 81 | */ |
82 | int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, | 82 | static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, |
83 | cpumask_t *mask) | 83 | cpumask_t *mask) |
84 | { | 84 | { |
85 | cpumask_t populated; | 85 | cpumask_t populated; |
86 | int cpu; | 86 | int cpu; |
87 | 87 | ||
88 | cpus_clear(populated); | 88 | cpus_clear(populated); |
89 | for_each_cpu_mask(cpu, *mask) | 89 | for_each_cpu_mask_nr(cpu, *mask) |
90 | if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { | 90 | if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { |
91 | __percpu_depopulate_mask(__pdata, &populated); | 91 | __percpu_depopulate_mask(__pdata, &populated); |
92 | return -ENOMEM; | 92 | return -ENOMEM; |
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, | |||
94 | cpu_set(cpu, populated); | 94 | cpu_set(cpu, populated); |
95 | return 0; | 95 | return 0; |
96 | } | 96 | } |
97 | EXPORT_SYMBOL_GPL(__percpu_populate_mask); | 97 | |
98 | #define percpu_populate_mask(__pdata, size, gfp, mask) \ | ||
99 | __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) | ||
98 | 100 | ||
99 | /** | 101 | /** |
100 | * percpu_alloc_mask - initial setup of per-cpu data | 102 | * percpu_alloc_mask - initial setup of per-cpu data |
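With percpu_depopulate(), __percpu_depopulate_mask(), percpu_populate() and __percpu_populate_mask() now static (their *_mask convenience wrappers become file-local macros), the rest of the kernel is expected to go through the higher-level per-cpu interface only. A hedged usage sketch of that remaining public surface follows; the struct and counter are invented for illustration, while alloc_percpu(), per_cpu_ptr() and free_percpu() are the long-standing wrappers around this file.

```c
/*
 * Hedged usage sketch of the public per-cpu allocation API that remains
 * after this cleanup.  struct demo_stats is invented for the example.
 */
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

struct demo_stats {
	unsigned long events;
};

static struct demo_stats *stats;

static int demo_init(void)
{
	int cpu;

	stats = alloc_percpu(struct demo_stats);	/* zeroed per-CPU copies */
	if (!stats)
		return -ENOMEM;

	/* Only online CPUs are populated by this allocator. */
	for_each_online_cpu(cpu)
		per_cpu_ptr(stats, cpu)->events = 0;	/* redundant, shows per_cpu_ptr() */

	return 0;
}

static void demo_exit(void)
{
	free_percpu(stats);
}
```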
diff --git a/mm/bootmem.c b/mm/bootmem.c index 8d9f60e06f62..ac5a891f142a 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -1,12 +1,12 @@ | |||
1 | /* | 1 | /* |
2 | * linux/mm/bootmem.c | 2 | * bootmem - A boot-time physical memory allocator and configurator |
3 | * | 3 | * |
4 | * Copyright (C) 1999 Ingo Molnar | 4 | * Copyright (C) 1999 Ingo Molnar |
5 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 | 5 | * 1999 Kanoj Sarcar, SGI |
6 | * 2008 Johannes Weiner | ||
6 | * | 7 | * |
7 | * simple boot-time physical memory area allocator and | 8 | * Access to this subsystem has to be serialized externally (which is true |
8 | * free memory collector. It's used to deal with reserved | 9 | * for the boot process anyway). |
9 | * system memory and memory holes as well. | ||
10 | */ | 10 | */ |
11 | #include <linux/init.h> | 11 | #include <linux/init.h> |
12 | #include <linux/pfn.h> | 12 | #include <linux/pfn.h> |
@@ -19,15 +19,10 @@ | |||
19 | 19 | ||
20 | #include "internal.h" | 20 | #include "internal.h" |
21 | 21 | ||
22 | /* | ||
23 | * Access to this subsystem has to be serialized externally. (this is | ||
24 | * true for the boot process anyway) | ||
25 | */ | ||
26 | unsigned long max_low_pfn; | 22 | unsigned long max_low_pfn; |
27 | unsigned long min_low_pfn; | 23 | unsigned long min_low_pfn; |
28 | unsigned long max_pfn; | 24 | unsigned long max_pfn; |
29 | 25 | ||
30 | static LIST_HEAD(bdata_list); | ||
31 | #ifdef CONFIG_CRASH_DUMP | 26 | #ifdef CONFIG_CRASH_DUMP |
32 | /* | 27 | /* |
33 | * If we have booted due to a crash, max_pfn will be a very low value. We need | 28 | * If we have booted due to a crash, max_pfn will be a very low value. We need |
@@ -36,63 +31,72 @@ static LIST_HEAD(bdata_list); | |||
36 | unsigned long saved_max_pfn; | 31 | unsigned long saved_max_pfn; |
37 | #endif | 32 | #endif |
38 | 33 | ||
39 | /* return the number of _pages_ that will be allocated for the boot bitmap */ | 34 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; |
40 | unsigned long __init bootmem_bootmap_pages(unsigned long pages) | 35 | |
36 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); | ||
37 | |||
38 | static int bootmem_debug; | ||
39 | |||
40 | static int __init bootmem_debug_setup(char *buf) | ||
41 | { | 41 | { |
42 | unsigned long mapsize; | 42 | bootmem_debug = 1; |
43 | return 0; | ||
44 | } | ||
45 | early_param("bootmem_debug", bootmem_debug_setup); | ||
43 | 46 | ||
44 | mapsize = (pages+7)/8; | 47 | #define bdebug(fmt, args...) ({ \ |
45 | mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; | 48 | if (unlikely(bootmem_debug)) \ |
46 | mapsize >>= PAGE_SHIFT; | 49 | printk(KERN_INFO \ |
50 | "bootmem::%s " fmt, \ | ||
51 | __func__, ## args); \ | ||
52 | }) | ||
47 | 53 | ||
48 | return mapsize; | 54 | static unsigned long __init bootmap_bytes(unsigned long pages) |
55 | { | ||
56 | unsigned long bytes = (pages + 7) / 8; | ||
57 | |||
58 | return ALIGN(bytes, sizeof(long)); | ||
49 | } | 59 | } |
50 | 60 | ||
51 | /* | 61 | /** |
52 | * link bdata in order | 62 | * bootmem_bootmap_pages - calculate bitmap size in pages |
63 | * @pages: number of pages the bitmap has to represent | ||
53 | */ | 64 | */ |
54 | static void __init link_bootmem(bootmem_data_t *bdata) | 65 | unsigned long __init bootmem_bootmap_pages(unsigned long pages) |
55 | { | 66 | { |
56 | bootmem_data_t *ent; | 67 | unsigned long bytes = bootmap_bytes(pages); |
57 | 68 | ||
58 | if (list_empty(&bdata_list)) { | 69 | return PAGE_ALIGN(bytes) >> PAGE_SHIFT; |
59 | list_add(&bdata->list, &bdata_list); | ||
60 | return; | ||
61 | } | ||
62 | /* insert in order */ | ||
63 | list_for_each_entry(ent, &bdata_list, list) { | ||
64 | if (bdata->node_boot_start < ent->node_boot_start) { | ||
65 | list_add_tail(&bdata->list, &ent->list); | ||
66 | return; | ||
67 | } | ||
68 | } | ||
69 | list_add_tail(&bdata->list, &bdata_list); | ||
70 | } | 70 | } |
71 | 71 | ||
72 | /* | 72 | /* |
73 | * Given an initialised bdata, it returns the size of the boot bitmap | 73 | * link bdata in order |
74 | */ | 74 | */ |
75 | static unsigned long __init get_mapsize(bootmem_data_t *bdata) | 75 | static void __init link_bootmem(bootmem_data_t *bdata) |
76 | { | 76 | { |
77 | unsigned long mapsize; | 77 | struct list_head *iter; |
78 | unsigned long start = PFN_DOWN(bdata->node_boot_start); | ||
79 | unsigned long end = bdata->node_low_pfn; | ||
80 | 78 | ||
81 | mapsize = ((end - start) + 7) / 8; | 79 | list_for_each(iter, &bdata_list) { |
82 | return ALIGN(mapsize, sizeof(long)); | 80 | bootmem_data_t *ent; |
81 | |||
82 | ent = list_entry(iter, bootmem_data_t, list); | ||
83 | if (bdata->node_min_pfn < ent->node_min_pfn) | ||
84 | break; | ||
85 | } | ||
86 | list_add_tail(&bdata->list, iter); | ||
83 | } | 87 | } |
84 | 88 | ||
85 | /* | 89 | /* |
86 | * Called once to set up the allocator itself. | 90 | * Called once to set up the allocator itself. |
87 | */ | 91 | */ |
88 | static unsigned long __init init_bootmem_core(pg_data_t *pgdat, | 92 | static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, |
89 | unsigned long mapstart, unsigned long start, unsigned long end) | 93 | unsigned long mapstart, unsigned long start, unsigned long end) |
90 | { | 94 | { |
91 | bootmem_data_t *bdata = pgdat->bdata; | ||
92 | unsigned long mapsize; | 95 | unsigned long mapsize; |
93 | 96 | ||
97 | mminit_validate_memmodel_limits(&start, &end); | ||
94 | bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); | 98 | bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); |
95 | bdata->node_boot_start = PFN_PHYS(start); | 99 | bdata->node_min_pfn = start; |
96 | bdata->node_low_pfn = end; | 100 | bdata->node_low_pfn = end; |
97 | link_bootmem(bdata); | 101 | link_bootmem(bdata); |
98 | 102 | ||
@@ -100,429 +104,484 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat, | |||
100 | * Initially all pages are reserved - setup_arch() has to | 104 | * Initially all pages are reserved - setup_arch() has to |
101 | * register free RAM areas explicitly. | 105 | * register free RAM areas explicitly. |
102 | */ | 106 | */ |
103 | mapsize = get_mapsize(bdata); | 107 | mapsize = bootmap_bytes(end - start); |
104 | memset(bdata->node_bootmem_map, 0xff, mapsize); | 108 | memset(bdata->node_bootmem_map, 0xff, mapsize); |
105 | 109 | ||
110 | bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n", | ||
111 | bdata - bootmem_node_data, start, mapstart, end, mapsize); | ||
112 | |||
106 | return mapsize; | 113 | return mapsize; |
107 | } | 114 | } |
108 | 115 | ||
109 | /* | 116 | /** |
110 | * Marks a particular physical memory range as unallocatable. Usable RAM | 117 | * init_bootmem_node - register a node as boot memory |
111 | * might be used for boot-time allocations - or it might get added | 118 | * @pgdat: node to register |
112 | * to the free page pool later on. | 119 | * @freepfn: pfn where the bitmap for this node is to be placed |
120 | * @startpfn: first pfn on the node | ||
121 | * @endpfn: first pfn after the node | ||
122 | * | ||
123 | * Returns the number of bytes needed to hold the bitmap for this node. | ||
113 | */ | 124 | */ |
114 | static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, | 125 | unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, |
115 | unsigned long addr, unsigned long size, int flags) | 126 | unsigned long startpfn, unsigned long endpfn) |
116 | { | 127 | { |
117 | unsigned long sidx, eidx; | 128 | return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn); |
118 | unsigned long i; | 129 | } |
119 | 130 | ||
120 | BUG_ON(!size); | 131 | /** |
132 | * init_bootmem - register boot memory | ||
133 | * @start: pfn where the bitmap is to be placed | ||
134 | * @pages: number of available physical pages | ||
135 | * | ||
136 | * Returns the number of bytes needed to hold the bitmap. | ||
137 | */ | ||
138 | unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | ||
139 | { | ||
140 | max_low_pfn = pages; | ||
141 | min_low_pfn = start; | ||
142 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); | ||
143 | } | ||
144 | |||
145 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | ||
146 | { | ||
147 | int aligned; | ||
148 | struct page *page; | ||
149 | unsigned long start, end, pages, count = 0; | ||
121 | 150 | ||
122 | /* out of range, don't hold other */ | 151 | if (!bdata->node_bootmem_map) |
123 | if (addr + size < bdata->node_boot_start || | ||
124 | PFN_DOWN(addr) > bdata->node_low_pfn) | ||
125 | return 0; | 152 | return 0; |
126 | 153 | ||
154 | start = bdata->node_min_pfn; | ||
155 | end = bdata->node_low_pfn; | ||
156 | |||
127 | /* | 157 | /* |
128 | * Round up to index to the range. | 158 | * If the start is aligned to the machines wordsize, we might |
159 | * be able to free pages in bulks of that order. | ||
129 | */ | 160 | */ |
130 | if (addr > bdata->node_boot_start) | 161 | aligned = !(start & (BITS_PER_LONG - 1)); |
131 | sidx= PFN_DOWN(addr - bdata->node_boot_start); | ||
132 | else | ||
133 | sidx = 0; | ||
134 | 162 | ||
135 | eidx = PFN_UP(addr + size - bdata->node_boot_start); | 163 | bdebug("nid=%td start=%lx end=%lx aligned=%d\n", |
136 | if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) | 164 | bdata - bootmem_node_data, start, end, aligned); |
137 | eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); | ||
138 | 165 | ||
139 | for (i = sidx; i < eidx; i++) { | 166 | while (start < end) { |
140 | if (test_bit(i, bdata->node_bootmem_map)) { | 167 | unsigned long *map, idx, vec; |
141 | if (flags & BOOTMEM_EXCLUSIVE) | 168 | |
142 | return -EBUSY; | 169 | map = bdata->node_bootmem_map; |
170 | idx = start - bdata->node_min_pfn; | ||
171 | vec = ~map[idx / BITS_PER_LONG]; | ||
172 | |||
173 | if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { | ||
174 | int order = ilog2(BITS_PER_LONG); | ||
175 | |||
176 | __free_pages_bootmem(pfn_to_page(start), order); | ||
177 | count += BITS_PER_LONG; | ||
178 | } else { | ||
179 | unsigned long off = 0; | ||
180 | |||
181 | while (vec && off < BITS_PER_LONG) { | ||
182 | if (vec & 1) { | ||
183 | page = pfn_to_page(start + off); | ||
184 | __free_pages_bootmem(page, 0); | ||
185 | count++; | ||
186 | } | ||
187 | vec >>= 1; | ||
188 | off++; | ||
189 | } | ||
143 | } | 190 | } |
191 | start += BITS_PER_LONG; | ||
144 | } | 192 | } |
145 | 193 | ||
146 | return 0; | 194 | page = virt_to_page(bdata->node_bootmem_map); |
195 | pages = bdata->node_low_pfn - bdata->node_min_pfn; | ||
196 | pages = bootmem_bootmap_pages(pages); | ||
197 | count += pages; | ||
198 | while (pages--) | ||
199 | __free_pages_bootmem(page++, 0); | ||
147 | 200 | ||
201 | bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); | ||
202 | |||
203 | return count; | ||
148 | } | 204 | } |
149 | 205 | ||
150 | static void __init reserve_bootmem_core(bootmem_data_t *bdata, | 206 | /** |
151 | unsigned long addr, unsigned long size, int flags) | 207 | * free_all_bootmem_node - release a node's free pages to the buddy allocator |
208 | * @pgdat: node to be released | ||
209 | * | ||
210 | * Returns the number of pages actually released. | ||
211 | */ | ||
212 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | ||
152 | { | 213 | { |
153 | unsigned long sidx, eidx; | 214 | register_page_bootmem_info_node(pgdat); |
154 | unsigned long i; | 215 | return free_all_bootmem_core(pgdat->bdata); |
216 | } | ||
155 | 217 | ||
156 | BUG_ON(!size); | 218 | /** |
219 | * free_all_bootmem - release free pages to the buddy allocator | ||
220 | * | ||
221 | * Returns the number of pages actually released. | ||
222 | */ | ||
223 | unsigned long __init free_all_bootmem(void) | ||
224 | { | ||
225 | return free_all_bootmem_core(NODE_DATA(0)->bdata); | ||
226 | } | ||
157 | 227 | ||
158 | /* out of range */ | 228 | static void __init __free(bootmem_data_t *bdata, |
159 | if (addr + size < bdata->node_boot_start || | 229 | unsigned long sidx, unsigned long eidx) |
160 | PFN_DOWN(addr) > bdata->node_low_pfn) | 230 | { |
161 | return; | 231 | unsigned long idx; |
162 | 232 | ||
163 | /* | 233 | bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, |
164 | * Round up to index to the range. | 234 | sidx + bdata->node_min_pfn, |
165 | */ | 235 | eidx + bdata->node_min_pfn); |
166 | if (addr > bdata->node_boot_start) | ||
167 | sidx= PFN_DOWN(addr - bdata->node_boot_start); | ||
168 | else | ||
169 | sidx = 0; | ||
170 | 236 | ||
171 | eidx = PFN_UP(addr + size - bdata->node_boot_start); | 237 | if (bdata->hint_idx > sidx) |
172 | if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) | 238 | bdata->hint_idx = sidx; |
173 | eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); | ||
174 | 239 | ||
175 | for (i = sidx; i < eidx; i++) { | 240 | for (idx = sidx; idx < eidx; idx++) |
176 | if (test_and_set_bit(i, bdata->node_bootmem_map)) { | 241 | if (!test_and_clear_bit(idx, bdata->node_bootmem_map)) |
177 | #ifdef CONFIG_DEBUG_BOOTMEM | 242 | BUG(); |
178 | printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); | 243 | } |
179 | #endif | 244 | |
245 | static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, | ||
246 | unsigned long eidx, int flags) | ||
247 | { | ||
248 | unsigned long idx; | ||
249 | int exclusive = flags & BOOTMEM_EXCLUSIVE; | ||
250 | |||
251 | bdebug("nid=%td start=%lx end=%lx flags=%x\n", | ||
252 | bdata - bootmem_node_data, | ||
253 | sidx + bdata->node_min_pfn, | ||
254 | eidx + bdata->node_min_pfn, | ||
255 | flags); | ||
256 | |||
257 | for (idx = sidx; idx < eidx; idx++) | ||
258 | if (test_and_set_bit(idx, bdata->node_bootmem_map)) { | ||
259 | if (exclusive) { | ||
260 | __free(bdata, sidx, idx); | ||
261 | return -EBUSY; | ||
262 | } | ||
263 | bdebug("silent double reserve of PFN %lx\n", | ||
264 | idx + bdata->node_min_pfn); | ||
180 | } | 265 | } |
181 | } | 266 | return 0; |
182 | } | 267 | } |
183 | 268 | ||
184 | static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, | 269 | static int __init mark_bootmem_node(bootmem_data_t *bdata, |
185 | unsigned long size) | 270 | unsigned long start, unsigned long end, |
271 | int reserve, int flags) | ||
186 | { | 272 | { |
187 | unsigned long sidx, eidx; | 273 | unsigned long sidx, eidx; |
188 | unsigned long i; | ||
189 | 274 | ||
190 | BUG_ON(!size); | 275 | bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n", |
276 | bdata - bootmem_node_data, start, end, reserve, flags); | ||
191 | 277 | ||
192 | /* out range */ | 278 | BUG_ON(start < bdata->node_min_pfn); |
193 | if (addr + size < bdata->node_boot_start || | 279 | BUG_ON(end > bdata->node_low_pfn); |
194 | PFN_DOWN(addr) > bdata->node_low_pfn) | ||
195 | return; | ||
196 | /* | ||
197 | * round down end of usable mem, partially free pages are | ||
198 | * considered reserved. | ||
199 | */ | ||
200 | 280 | ||
201 | if (addr >= bdata->node_boot_start && addr < bdata->last_success) | 281 | sidx = start - bdata->node_min_pfn; |
202 | bdata->last_success = addr; | 282 | eidx = end - bdata->node_min_pfn; |
203 | 283 | ||
204 | /* | 284 | if (reserve) |
205 | * Round up to index to the range. | 285 | return __reserve(bdata, sidx, eidx, flags); |
206 | */ | ||
207 | if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start)) | ||
208 | sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start); | ||
209 | else | 286 | else |
210 | sidx = 0; | 287 | __free(bdata, sidx, eidx); |
288 | return 0; | ||
289 | } | ||
211 | 290 | ||
212 | eidx = PFN_DOWN(addr + size - bdata->node_boot_start); | 291 | static int __init mark_bootmem(unsigned long start, unsigned long end, |
213 | if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) | 292 | int reserve, int flags) |
214 | eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); | 293 | { |
294 | unsigned long pos; | ||
295 | bootmem_data_t *bdata; | ||
215 | 296 | ||
216 | for (i = sidx; i < eidx; i++) { | 297 | pos = start; |
217 | if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) | 298 | list_for_each_entry(bdata, &bdata_list, list) { |
218 | BUG(); | 299 | int err; |
300 | unsigned long max; | ||
301 | |||
302 | if (pos < bdata->node_min_pfn || | ||
303 | pos >= bdata->node_low_pfn) { | ||
304 | BUG_ON(pos != start); | ||
305 | continue; | ||
306 | } | ||
307 | |||
308 | max = min(bdata->node_low_pfn, end); | ||
309 | |||
310 | err = mark_bootmem_node(bdata, pos, max, reserve, flags); | ||
311 | if (reserve && err) { | ||
312 | mark_bootmem(start, pos, 0, 0); | ||
313 | return err; | ||
314 | } | ||
315 | |||
316 | if (max == end) | ||
317 | return 0; | ||
318 | pos = bdata->node_low_pfn; | ||
219 | } | 319 | } |
320 | BUG(); | ||
220 | } | 321 | } |
221 | 322 | ||
222 | /* | 323 | /** |
223 | * We 'merge' subsequent allocations to save space. We might 'lose' | 324 | * free_bootmem_node - mark a page range as usable |
224 | * some fraction of a page if allocations cannot be satisfied due to | 325 | * @pgdat: node the range resides on |
225 | * size constraints on boxes where there is physical RAM space | 326 | * @physaddr: starting address of the range |
226 | * fragmentation - in these cases (mostly large memory boxes) this | 327 | * @size: size of the range in bytes |
227 | * is not a problem. | ||
228 | * | ||
229 | * On low memory boxes we get it right in 100% of the cases. | ||
230 | * | 328 | * |
231 | * alignment has to be a power of 2 value. | 329 | * Partial pages will be considered reserved and left as they are. |
232 | * | 330 | * |
233 | * NOTE: This function is _not_ reentrant. | 331 | * The range must reside completely on the specified node. |
234 | */ | 332 | */ |
235 | void * __init | 333 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
236 | __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | 334 | unsigned long size) |
237 | unsigned long align, unsigned long goal, unsigned long limit) | ||
238 | { | 335 | { |
239 | unsigned long areasize, preferred; | 336 | unsigned long start, end; |
240 | unsigned long i, start = 0, incr, eidx, end_pfn; | ||
241 | void *ret; | ||
242 | unsigned long node_boot_start; | ||
243 | void *node_bootmem_map; | ||
244 | |||
245 | if (!size) { | ||
246 | printk("__alloc_bootmem_core(): zero-sized request\n"); | ||
247 | BUG(); | ||
248 | } | ||
249 | BUG_ON(align & (align-1)); | ||
250 | 337 | ||
251 | /* on nodes without memory - bootmem_map is NULL */ | 338 | start = PFN_UP(physaddr); |
252 | if (!bdata->node_bootmem_map) | 339 | end = PFN_DOWN(physaddr + size); |
253 | return NULL; | ||
254 | |||
255 | /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */ | ||
256 | node_boot_start = bdata->node_boot_start; | ||
257 | node_bootmem_map = bdata->node_bootmem_map; | ||
258 | if (align) { | ||
259 | node_boot_start = ALIGN(bdata->node_boot_start, align); | ||
260 | if (node_boot_start > bdata->node_boot_start) | ||
261 | node_bootmem_map = (unsigned long *)bdata->node_bootmem_map + | ||
262 | PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG; | ||
263 | } | ||
264 | |||
265 | if (limit && node_boot_start >= limit) | ||
266 | return NULL; | ||
267 | 340 | ||
268 | end_pfn = bdata->node_low_pfn; | 341 | mark_bootmem_node(pgdat->bdata, start, end, 0, 0); |
269 | limit = PFN_DOWN(limit); | 342 | } |
270 | if (limit && end_pfn > limit) | ||
271 | end_pfn = limit; | ||
272 | 343 | ||
273 | eidx = end_pfn - PFN_DOWN(node_boot_start); | 344 | /** |
345 | * free_bootmem - mark a page range as usable | ||
346 | * @addr: starting address of the range | ||
347 | * @size: size of the range in bytes | ||
348 | * | ||
349 | * Partial pages will be considered reserved and left as they are. | ||
350 | * | ||
351 | * The range must be contiguous but may span node boundaries. | ||
352 | */ | ||
353 | void __init free_bootmem(unsigned long addr, unsigned long size) | ||
354 | { | ||
355 | unsigned long start, end; | ||
274 | 356 | ||
275 | /* | 357 | start = PFN_UP(addr); |
276 | * We try to allocate bootmem pages above 'goal' | 358 | end = PFN_DOWN(addr + size); |
277 | * first, then we try to allocate lower pages. | ||
278 | */ | ||
279 | preferred = 0; | ||
280 | if (goal && PFN_DOWN(goal) < end_pfn) { | ||
281 | if (goal > node_boot_start) | ||
282 | preferred = goal - node_boot_start; | ||
283 | |||
284 | if (bdata->last_success > node_boot_start && | ||
285 | bdata->last_success - node_boot_start >= preferred) | ||
286 | if (!limit || (limit && limit > bdata->last_success)) | ||
287 | preferred = bdata->last_success - node_boot_start; | ||
288 | } | ||
289 | 359 | ||
290 | preferred = PFN_DOWN(ALIGN(preferred, align)); | 360 | mark_bootmem(start, end, 0, 0); |
291 | areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; | 361 | } |
292 | incr = align >> PAGE_SHIFT ? : 1; | ||
293 | 362 | ||
294 | restart_scan: | 363 | /** |
295 | for (i = preferred; i < eidx;) { | 364 | * reserve_bootmem_node - mark a page range as reserved |
296 | unsigned long j; | 365 | * @pgdat: node the range resides on |
366 | * @physaddr: starting address of the range | ||
367 | * @size: size of the range in bytes | ||
368 | * @flags: reservation flags (see linux/bootmem.h) | ||
369 | * | ||
370 | * Partial pages will be reserved. | ||
371 | * | ||
372 | * The range must reside completely on the specified node. | ||
373 | */ | ||
374 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | ||
375 | unsigned long size, int flags) | ||
376 | { | ||
377 | unsigned long start, end; | ||
297 | 378 | ||
298 | i = find_next_zero_bit(node_bootmem_map, eidx, i); | 379 | start = PFN_DOWN(physaddr); |
299 | i = ALIGN(i, incr); | 380 | end = PFN_UP(physaddr + size); |
300 | if (i >= eidx) | ||
301 | break; | ||
302 | if (test_bit(i, node_bootmem_map)) { | ||
303 | i += incr; | ||
304 | continue; | ||
305 | } | ||
306 | for (j = i + 1; j < i + areasize; ++j) { | ||
307 | if (j >= eidx) | ||
308 | goto fail_block; | ||
309 | if (test_bit(j, node_bootmem_map)) | ||
310 | goto fail_block; | ||
311 | } | ||
312 | start = i; | ||
313 | goto found; | ||
314 | fail_block: | ||
315 | i = ALIGN(j, incr); | ||
316 | if (i == j) | ||
317 | i += incr; | ||
318 | } | ||
319 | 381 | ||
320 | if (preferred > 0) { | 382 | return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); |
321 | preferred = 0; | 383 | } |
322 | goto restart_scan; | ||
323 | } | ||
324 | return NULL; | ||
325 | 384 | ||
326 | found: | 385 | #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE |
327 | bdata->last_success = PFN_PHYS(start) + node_boot_start; | 386 | /** |
328 | BUG_ON(start >= eidx); | 387 | * reserve_bootmem - mark a page range as usable |
388 | * @addr: starting address of the range | ||
389 | * @size: size of the range in bytes | ||
390 | * @flags: reservation flags (see linux/bootmem.h) | ||
391 | * | ||
392 | * Partial pages will be reserved. | ||
393 | * | ||
394 | * The range must be contiguous but may span node boundaries. | ||
395 | */ | ||
396 | int __init reserve_bootmem(unsigned long addr, unsigned long size, | ||
397 | int flags) | ||
398 | { | ||
399 | unsigned long start, end; | ||
329 | 400 | ||
330 | /* | 401 | start = PFN_DOWN(addr); |
331 | * Is the next page of the previous allocation-end the start | 402 | end = PFN_UP(addr + size); |
332 | * of this allocation's buffer? If yes then we can 'merge' | ||
333 | * the previous partial page with this allocation. | ||
334 | */ | ||
335 | if (align < PAGE_SIZE && | ||
336 | bdata->last_offset && bdata->last_pos+1 == start) { | ||
337 | unsigned long offset, remaining_size; | ||
338 | offset = ALIGN(bdata->last_offset, align); | ||
339 | BUG_ON(offset > PAGE_SIZE); | ||
340 | remaining_size = PAGE_SIZE - offset; | ||
341 | if (size < remaining_size) { | ||
342 | areasize = 0; | ||
343 | /* last_pos unchanged */ | ||
344 | bdata->last_offset = offset + size; | ||
345 | ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + | ||
346 | offset + node_boot_start); | ||
347 | } else { | ||
348 | remaining_size = size - remaining_size; | ||
349 | areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE; | ||
350 | ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + | ||
351 | offset + node_boot_start); | ||
352 | bdata->last_pos = start + areasize - 1; | ||
353 | bdata->last_offset = remaining_size; | ||
354 | } | ||
355 | bdata->last_offset &= ~PAGE_MASK; | ||
356 | } else { | ||
357 | bdata->last_pos = start + areasize - 1; | ||
358 | bdata->last_offset = size & ~PAGE_MASK; | ||
359 | ret = phys_to_virt(start * PAGE_SIZE + node_boot_start); | ||
360 | } | ||
361 | 403 | ||
362 | /* | 404 | return mark_bootmem(start, end, 1, flags); |
363 | * Reserve the area now: | ||
364 | */ | ||
365 | for (i = start; i < start + areasize; i++) | ||
366 | if (unlikely(test_and_set_bit(i, node_bootmem_map))) | ||
367 | BUG(); | ||
368 | memset(ret, 0, size); | ||
369 | return ret; | ||
370 | } | 405 | } |
406 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ | ||
371 | 407 | ||
372 | static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | 408 | static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, |
409 | unsigned long step) | ||
373 | { | 410 | { |
374 | struct page *page; | 411 | unsigned long base = bdata->node_min_pfn; |
375 | unsigned long pfn; | ||
376 | bootmem_data_t *bdata = pgdat->bdata; | ||
377 | unsigned long i, count, total = 0; | ||
378 | unsigned long idx; | ||
379 | unsigned long *map; | ||
380 | int gofast = 0; | ||
381 | |||
382 | BUG_ON(!bdata->node_bootmem_map); | ||
383 | |||
384 | count = 0; | ||
385 | /* first extant page of the node */ | ||
386 | pfn = PFN_DOWN(bdata->node_boot_start); | ||
387 | idx = bdata->node_low_pfn - pfn; | ||
388 | map = bdata->node_bootmem_map; | ||
389 | /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ | ||
390 | if (bdata->node_boot_start == 0 || | ||
391 | ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG)) | ||
392 | gofast = 1; | ||
393 | for (i = 0; i < idx; ) { | ||
394 | unsigned long v = ~map[i / BITS_PER_LONG]; | ||
395 | |||
396 | if (gofast && v == ~0UL) { | ||
397 | int order; | ||
398 | |||
399 | page = pfn_to_page(pfn); | ||
400 | count += BITS_PER_LONG; | ||
401 | order = ffs(BITS_PER_LONG) - 1; | ||
402 | __free_pages_bootmem(page, order); | ||
403 | i += BITS_PER_LONG; | ||
404 | page += BITS_PER_LONG; | ||
405 | } else if (v) { | ||
406 | unsigned long m; | ||
407 | |||
408 | page = pfn_to_page(pfn); | ||
409 | for (m = 1; m && i < idx; m<<=1, page++, i++) { | ||
410 | if (v & m) { | ||
411 | count++; | ||
412 | __free_pages_bootmem(page, 0); | ||
413 | } | ||
414 | } | ||
415 | } else { | ||
416 | i += BITS_PER_LONG; | ||
417 | } | ||
418 | pfn += BITS_PER_LONG; | ||
419 | } | ||
420 | total += count; | ||
421 | 412 | ||
422 | /* | 413 | /* |
423 | * Now free the allocator bitmap itself, it's not | 414 | * Align the index with respect to the node start so that the |
424 | * needed anymore: | 415 | * combination of both satisfies the requested alignment. |
425 | */ | 416 | */ |
426 | page = virt_to_page(bdata->node_bootmem_map); | ||
427 | count = 0; | ||
428 | idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT; | ||
429 | for (i = 0; i < idx; i++, page++) { | ||
430 | __free_pages_bootmem(page, 0); | ||
431 | count++; | ||
432 | } | ||
433 | total += count; | ||
434 | bdata->node_bootmem_map = NULL; | ||
435 | 417 | ||
436 | return total; | 418 | return ALIGN(base + idx, step) - base; |
437 | } | 419 | } |
438 | 420 | ||
439 | unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, | 421 | static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, |
440 | unsigned long startpfn, unsigned long endpfn) | 422 | unsigned long align) |
441 | { | 423 | { |
442 | return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); | 424 | unsigned long base = PFN_PHYS(bdata->node_min_pfn); |
425 | |||
426 | /* Same as align_idx for byte offsets */ | ||
427 | |||
428 | return ALIGN(base + off, align) - base; | ||
443 | } | 429 | } |
444 | 430 | ||
445 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 431 | static void * __init alloc_bootmem_core(struct bootmem_data *bdata, |
446 | unsigned long size, int flags) | 432 | unsigned long size, unsigned long align, |
433 | unsigned long goal, unsigned long limit) | ||
447 | { | 434 | { |
448 | int ret; | 435 | unsigned long fallback = 0; |
436 | unsigned long min, max, start, sidx, midx, step; | ||
449 | 437 | ||
450 | ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); | 438 | BUG_ON(!size); |
451 | if (ret < 0) | 439 | BUG_ON(align & (align - 1)); |
452 | return -ENOMEM; | 440 | BUG_ON(limit && goal + size > limit); |
453 | reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); | ||
454 | 441 | ||
455 | return 0; | 442 | if (!bdata->node_bootmem_map) |
456 | } | 443 | return NULL; |
457 | 444 | ||
458 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 445 | bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", |
459 | unsigned long size) | 446 | bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, |
460 | { | 447 | align, goal, limit); |
461 | free_bootmem_core(pgdat->bdata, physaddr, size); | ||
462 | } | ||
463 | 448 | ||
464 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 449 | min = bdata->node_min_pfn; |
465 | { | 450 | max = bdata->node_low_pfn; |
466 | register_page_bootmem_info_node(pgdat); | ||
467 | return free_all_bootmem_core(pgdat); | ||
468 | } | ||
469 | 451 | ||
470 | unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | 452 | goal >>= PAGE_SHIFT; |
471 | { | 453 | limit >>= PAGE_SHIFT; |
472 | max_low_pfn = pages; | ||
473 | min_low_pfn = start; | ||
474 | return init_bootmem_core(NODE_DATA(0), start, 0, pages); | ||
475 | } | ||
476 | 454 | ||
477 | #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE | 455 | if (limit && max > limit) |
478 | int __init reserve_bootmem(unsigned long addr, unsigned long size, | 456 | max = limit; |
479 | int flags) | 457 | if (max <= min) |
480 | { | 458 | return NULL; |
481 | bootmem_data_t *bdata; | ||
482 | int ret; | ||
483 | 459 | ||
484 | list_for_each_entry(bdata, &bdata_list, list) { | 460 | step = max(align >> PAGE_SHIFT, 1UL); |
485 | ret = can_reserve_bootmem_core(bdata, addr, size, flags); | 461 | |
486 | if (ret < 0) | 462 | if (goal && min < goal && goal < max) |
487 | return ret; | 463 | start = ALIGN(goal, step); |
464 | else | ||
465 | start = ALIGN(min, step); | ||
466 | |||
467 | sidx = start - bdata->node_min_pfn; | ||
468 | midx = max - bdata->node_min_pfn; | ||
469 | |||
470 | if (bdata->hint_idx > sidx) { | ||
471 | /* | ||
472 | * Handle the valid case of sidx being zero and still | ||
473 | * catch the fallback below. | ||
474 | */ | ||
475 | fallback = sidx + 1; | ||
476 | sidx = align_idx(bdata, bdata->hint_idx, step); | ||
488 | } | 477 | } |
489 | list_for_each_entry(bdata, &bdata_list, list) | ||
490 | reserve_bootmem_core(bdata, addr, size, flags); | ||
491 | 478 | ||
492 | return 0; | 479 | while (1) { |
493 | } | 480 | int merge; |
494 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ | 481 | void *region; |
482 | unsigned long eidx, i, start_off, end_off; | ||
483 | find_block: | ||
484 | sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); | ||
485 | sidx = align_idx(bdata, sidx, step); | ||
486 | eidx = sidx + PFN_UP(size); | ||
495 | 487 | ||
496 | void __init free_bootmem(unsigned long addr, unsigned long size) | 488 | if (sidx >= midx || eidx > midx) |
497 | { | 489 | break; |
498 | bootmem_data_t *bdata; | ||
499 | list_for_each_entry(bdata, &bdata_list, list) | ||
500 | free_bootmem_core(bdata, addr, size); | ||
501 | } | ||
502 | 490 | ||
503 | unsigned long __init free_all_bootmem(void) | 491 | for (i = sidx; i < eidx; i++) |
504 | { | 492 | if (test_bit(i, bdata->node_bootmem_map)) { |
505 | return free_all_bootmem_core(NODE_DATA(0)); | 493 | sidx = align_idx(bdata, i, step); |
494 | if (sidx == i) | ||
495 | sidx += step; | ||
496 | goto find_block; | ||
497 | } | ||
498 | |||
499 | if (bdata->last_end_off & (PAGE_SIZE - 1) && | ||
500 | PFN_DOWN(bdata->last_end_off) + 1 == sidx) | ||
501 | start_off = align_off(bdata, bdata->last_end_off, align); | ||
502 | else | ||
503 | start_off = PFN_PHYS(sidx); | ||
504 | |||
505 | merge = PFN_DOWN(start_off) < sidx; | ||
506 | end_off = start_off + size; | ||
507 | |||
508 | bdata->last_end_off = end_off; | ||
509 | bdata->hint_idx = PFN_UP(end_off); | ||
510 | |||
511 | /* | ||
512 | * Reserve the area now: | ||
513 | */ | ||
514 | if (__reserve(bdata, PFN_DOWN(start_off) + merge, | ||
515 | PFN_UP(end_off), BOOTMEM_EXCLUSIVE)) | ||
516 | BUG(); | ||
517 | |||
518 | region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + | ||
519 | start_off); | ||
520 | memset(region, 0, size); | ||
521 | return region; | ||
522 | } | ||
523 | |||
524 | if (fallback) { | ||
525 | sidx = align_idx(bdata, fallback - 1, step); | ||
526 | fallback = 0; | ||
527 | goto find_block; | ||
528 | } | ||
529 | |||
530 | return NULL; | ||
506 | } | 531 | } |
507 | 532 | ||
508 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, | 533 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, |
509 | unsigned long goal) | 534 | unsigned long align, |
535 | unsigned long goal, | ||
536 | unsigned long limit) | ||
510 | { | 537 | { |
511 | bootmem_data_t *bdata; | 538 | bootmem_data_t *bdata; |
512 | void *ptr; | ||
513 | 539 | ||
540 | restart: | ||
514 | list_for_each_entry(bdata, &bdata_list, list) { | 541 | list_for_each_entry(bdata, &bdata_list, list) { |
515 | ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); | 542 | void *region; |
516 | if (ptr) | 543 | |
517 | return ptr; | 544 | if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) |
545 | continue; | ||
546 | if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) | ||
547 | break; | ||
548 | |||
549 | region = alloc_bootmem_core(bdata, size, align, goal, limit); | ||
550 | if (region) | ||
551 | return region; | ||
552 | } | ||
553 | |||
554 | if (goal) { | ||
555 | goal = 0; | ||
556 | goto restart; | ||
518 | } | 557 | } |
558 | |||
519 | return NULL; | 559 | return NULL; |
520 | } | 560 | } |
521 | 561 | ||
522 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, | 562 | /** |
523 | unsigned long goal) | 563 | * __alloc_bootmem_nopanic - allocate boot memory without panicking |
564 | * @size: size of the request in bytes | ||
565 | * @align: alignment of the region | ||
566 | * @goal: preferred starting address of the region | ||
567 | * | ||
568 | * The goal is dropped if it can not be satisfied and the allocation will | ||
569 | * fall back to memory below @goal. | ||
570 | * | ||
571 | * Allocation may happen on any node in the system. | ||
572 | * | ||
573 | * Returns NULL on failure. | ||
574 | */ | ||
575 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, | ||
576 | unsigned long goal) | ||
524 | { | 577 | { |
525 | void *mem = __alloc_bootmem_nopanic(size,align,goal); | 578 | return ___alloc_bootmem_nopanic(size, align, goal, 0); |
579 | } | ||
580 | |||
581 | static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, | ||
582 | unsigned long goal, unsigned long limit) | ||
583 | { | ||
584 | void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); | ||
526 | 585 | ||
527 | if (mem) | 586 | if (mem) |
528 | return mem; | 587 | return mem; |
@@ -534,78 +593,135 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
534 | return NULL; | 593 | return NULL; |
535 | } | 594 | } |
536 | 595 | ||
596 | /** | ||
597 | * __alloc_bootmem - allocate boot memory | ||
598 | * @size: size of the request in bytes | ||
599 | * @align: alignment of the region | ||
600 | * @goal: preferred starting address of the region | ||
601 | * | ||
602 | * The goal is dropped if it can not be satisfied and the allocation will | ||
603 | * fall back to memory below @goal. | ||
604 | * | ||
605 | * Allocation may happen on any node in the system. | ||
606 | * | ||
607 | * The function panics if the request can not be satisfied. | ||
608 | */ | ||
609 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, | ||
610 | unsigned long goal) | ||
611 | { | ||
612 | return ___alloc_bootmem(size, align, goal, 0); | ||
613 | } | ||
537 | 614 | ||
538 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | 615 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, |
539 | unsigned long align, unsigned long goal) | 616 | unsigned long size, unsigned long align, |
617 | unsigned long goal, unsigned long limit) | ||
540 | { | 618 | { |
541 | void *ptr; | 619 | void *ptr; |
542 | 620 | ||
543 | ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); | 621 | ptr = alloc_bootmem_core(bdata, size, align, goal, limit); |
544 | if (ptr) | 622 | if (ptr) |
545 | return ptr; | 623 | return ptr; |
546 | 624 | ||
547 | return __alloc_bootmem(size, align, goal); | 625 | return ___alloc_bootmem(size, align, goal, limit); |
626 | } | ||
627 | |||
628 | /** | ||
629 | * __alloc_bootmem_node - allocate boot memory from a specific node | ||
630 | * @pgdat: node to allocate from | ||
631 | * @size: size of the request in bytes | ||
632 | * @align: alignment of the region | ||
633 | * @goal: preferred starting address of the region | ||
634 | * | ||
635 | * The goal is dropped if it can not be satisfied and the allocation will | ||
636 | * fall back to memory below @goal. | ||
637 | * | ||
638 | * Allocation may fall back to any node in the system if the specified node | ||
639 | * can not hold the requested memory. | ||
640 | * | ||
641 | * The function panics if the request can not be satisfied. | ||
642 | */ | ||
643 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | ||
644 | unsigned long align, unsigned long goal) | ||
645 | { | ||
646 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); | ||
548 | } | 647 | } |
549 | 648 | ||
550 | #ifdef CONFIG_SPARSEMEM | 649 | #ifdef CONFIG_SPARSEMEM |
650 | /** | ||
651 | * alloc_bootmem_section - allocate boot memory from a specific section | ||
652 | * @size: size of the request in bytes | ||
653 | * @section_nr: sparse map section to allocate from | ||
654 | * | ||
655 | * Return NULL on failure. | ||
656 | */ | ||
551 | void * __init alloc_bootmem_section(unsigned long size, | 657 | void * __init alloc_bootmem_section(unsigned long size, |
552 | unsigned long section_nr) | 658 | unsigned long section_nr) |
553 | { | 659 | { |
554 | void *ptr; | 660 | bootmem_data_t *bdata; |
555 | unsigned long limit, goal, start_nr, end_nr, pfn; | 661 | unsigned long pfn, goal, limit; |
556 | struct pglist_data *pgdat; | ||
557 | 662 | ||
558 | pfn = section_nr_to_pfn(section_nr); | 663 | pfn = section_nr_to_pfn(section_nr); |
559 | goal = PFN_PHYS(pfn); | 664 | goal = pfn << PAGE_SHIFT; |
560 | limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; | 665 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; |
561 | pgdat = NODE_DATA(early_pfn_to_nid(pfn)); | 666 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; |
562 | ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal, | ||
563 | limit); | ||
564 | 667 | ||
565 | if (!ptr) | 668 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); |
566 | return NULL; | 669 | } |
670 | #endif | ||
567 | 671 | ||
568 | start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); | 672 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, |
569 | end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); | 673 | unsigned long align, unsigned long goal) |
570 | if (start_nr != section_nr || end_nr != section_nr) { | 674 | { |
571 | printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", | 675 | void *ptr; |
572 | section_nr); | ||
573 | free_bootmem_core(pgdat->bdata, __pa(ptr), size); | ||
574 | ptr = NULL; | ||
575 | } | ||
576 | 676 | ||
577 | return ptr; | 677 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); |
678 | if (ptr) | ||
679 | return ptr; | ||
680 | |||
681 | return __alloc_bootmem_nopanic(size, align, goal); | ||
578 | } | 682 | } |
579 | #endif | ||
580 | 683 | ||
581 | #ifndef ARCH_LOW_ADDRESS_LIMIT | 684 | #ifndef ARCH_LOW_ADDRESS_LIMIT |
582 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL | 685 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL |
583 | #endif | 686 | #endif |
584 | 687 | ||
688 | /** | ||
689 | * __alloc_bootmem_low - allocate low boot memory | ||
690 | * @size: size of the request in bytes | ||
691 | * @align: alignment of the region | ||
692 | * @goal: preferred starting address of the region | ||
693 | * | ||
694 | * The goal is dropped if it can not be satisfied and the allocation will | ||
695 | * fall back to memory below @goal. | ||
696 | * | ||
697 | * Allocation may happen on any node in the system. | ||
698 | * | ||
699 | * The function panics if the request can not be satisfied. | ||
700 | */ | ||
585 | void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, | 701 | void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, |
586 | unsigned long goal) | 702 | unsigned long goal) |
587 | { | 703 | { |
588 | bootmem_data_t *bdata; | 704 | return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); |
589 | void *ptr; | ||
590 | |||
591 | list_for_each_entry(bdata, &bdata_list, list) { | ||
592 | ptr = __alloc_bootmem_core(bdata, size, align, goal, | ||
593 | ARCH_LOW_ADDRESS_LIMIT); | ||
594 | if (ptr) | ||
595 | return ptr; | ||
596 | } | ||
597 | |||
598 | /* | ||
599 | * Whoops, we cannot satisfy the allocation request. | ||
600 | */ | ||
601 | printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size); | ||
602 | panic("Out of low memory"); | ||
603 | return NULL; | ||
604 | } | 705 | } |
605 | 706 | ||
707 | /** | ||
708 | * __alloc_bootmem_low_node - allocate low boot memory from a specific node | ||
709 | * @pgdat: node to allocate from | ||
710 | * @size: size of the request in bytes | ||
711 | * @align: alignment of the region | ||
712 | * @goal: preferred starting address of the region | ||
713 | * | ||
714 | * The goal is dropped if it can not be satisfied and the allocation will | ||
715 | * fall back to memory below @goal. | ||
716 | * | ||
717 | * Allocation may fall back to any node in the system if the specified node | ||
718 | * can not hold the requested memory. | ||
719 | * | ||
720 | * The function panics if the request can not be satisfied. | ||
721 | */ | ||
606 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | 722 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, |
607 | unsigned long align, unsigned long goal) | 723 | unsigned long align, unsigned long goal) |
608 | { | 724 | { |
609 | return __alloc_bootmem_core(pgdat->bdata, size, align, goal, | 725 | return ___alloc_bootmem_node(pgdat->bdata, size, align, |
610 | ARCH_LOW_ADDRESS_LIMIT); | 726 | goal, ARCH_LOW_ADDRESS_LIMIT); |
611 | } | 727 | } |
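Taken together, the rewritten bootmem.c keeps one central piece of per-node state: a bitmap with one bit per page frame, plus hint_idx/last_end_off for merging partial pages. bootmap_bytes() sizes that map rounded up to whole longs, __reserve()/__free() set and clear index ranges, and alloc_bootmem_core() scans for a free run starting from the hint. The stand-alone userspace model below (not kernel code: MODEL_PAGE_SIZE, ALIGN_UP and the node size are local stand-ins) mirrors just the sizing and mark/clear arithmetic so it can be sanity-checked in isolation:

```c
/*
 * Userspace model of the bootmem bitmap bookkeeping: one bit per page
 * frame, padded to whole longs, with reserve/free as set/clear over
 * [sidx, eidx).  All names here are local stand-ins, not kernel symbols.
 */
#include <stdio.h>
#include <string.h>
#include <limits.h>

#define MODEL_PAGE_SHIFT 12
#define MODEL_PAGE_SIZE  (1UL << MODEL_PAGE_SHIFT)
#define MODEL_PAGES      (1UL << 20)			/* a 4 GiB node of 4 KiB pages */
#define BITS_PER_LONG_   (sizeof(long) * CHAR_BIT)
#define ALIGN_UP(x, a)   (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

/* Mirrors bootmap_bytes(): one bit per page, padded to a whole long. */
static unsigned long model_bootmap_bytes(unsigned long pages)
{
	unsigned long bytes = (pages + 7) / 8;

	return ALIGN_UP(bytes, sizeof(long));
}

/* Mirrors bootmem_bootmap_pages(): bitmap size expressed in pages. */
static unsigned long model_bootmap_pages(unsigned long pages)
{
	return ALIGN_UP(model_bootmap_bytes(pages), MODEL_PAGE_SIZE) >> MODEL_PAGE_SHIFT;
}

/* Set (reserve) or clear (free) the bits for page indices [sidx, eidx). */
static void mark_range(unsigned long *map, unsigned long sidx,
		       unsigned long eidx, int reserve)
{
	for (unsigned long idx = sidx; idx < eidx; idx++) {
		unsigned long *word = &map[idx / BITS_PER_LONG_];
		unsigned long bit = 1UL << (idx % BITS_PER_LONG_);

		if (reserve)
			*word |= bit;	/* cf. __reserve() */
		else
			*word &= ~bit;	/* cf. __free() */
	}
}

int main(void)
{
	static unsigned long map[MODEL_PAGES / BITS_PER_LONG_ + 1];

	printf("bitmap: %lu bytes, %lu page(s)\n",
	       model_bootmap_bytes(MODEL_PAGES), model_bootmap_pages(MODEL_PAGES));

	memset(map, 0xff, model_bootmap_bytes(MODEL_PAGES));	/* all reserved at init */
	mark_range(map, 256, 1024, 0);		/* setup_arch() registers free RAM */
	mark_range(map, 512, 516, 1);		/* then reserves, say, an initrd */
	return 0;
}
```

For this 4 GiB node the model reports a 131072-byte (32-page) bitmap, which matches what bootmem_bootmap_pages() would hand back for init_bootmem_node() to place.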
diff --git a/mm/bounce.c b/mm/bounce.c index b6d2d0f1019b..06722c403058 100644 --- a/mm/bounce.c +++ b/mm/bounce.c | |||
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) | |||
267 | /* | 267 | /* |
268 | * Data-less bio, nothing to bounce | 268 | * Data-less bio, nothing to bounce |
269 | */ | 269 | */ |
270 | if (bio_empty_barrier(*bio_orig)) | 270 | if (!bio_has_data(*bio_orig)) |
271 | return; | 271 | return; |
272 | 272 | ||
273 | /* | 273 | /* |
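The single functional change in mm/bounce.c replaces the bio_empty_barrier() test with !bio_has_data(), so every bio without a payload, not only an empty barrier, skips bounce buffering. A hedged sketch of the resulting check pattern; demo_needs_bounce() is invented for illustration, while bio_has_data() is the real helper from <linux/bio.h>.

```c
/*
 * Hedged illustration of the new early-out: data-less bios never need
 * bounce pages.  demo_needs_bounce() is not a kernel function.
 */
#include <linux/bio.h>

static bool demo_needs_bounce(struct bio *bio)
{
	if (!bio_has_data(bio))		/* nothing to copy into bounce pages */
		return false;

	/* ... the real code then checks the queue's highmem/ISA-DMA limit ... */
	return true;
}
```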
diff --git a/mm/fadvise.c b/mm/fadvise.c index 343cfdfebd9e..a1da969bd980 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Copyright (C) 2002, Linus Torvalds | 4 | * Copyright (C) 2002, Linus Torvalds |
5 | * | 5 | * |
6 | * 11Jan2003 akpm@digeo.com | 6 | * 11Jan2003 Andrew Morton |
7 | * Initial version. | 7 | * Initial version. |
8 | */ | 8 | */ |
9 | 9 | ||
diff --git a/mm/filemap.c b/mm/filemap.c index 65d9d9e2b755..ab8553658af3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/cpuset.h> | 33 | #include <linux/cpuset.h> |
34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ | 34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
35 | #include <linux/memcontrol.h> | 35 | #include <linux/memcontrol.h> |
36 | #include <linux/mm_inline.h> /* for page_is_file_cache() */ | ||
36 | #include "internal.h" | 37 | #include "internal.h" |
37 | 38 | ||
38 | /* | 39 | /* |
@@ -42,9 +43,6 @@ | |||
42 | 43 | ||
43 | #include <asm/mman.h> | 44 | #include <asm/mman.h> |
44 | 45 | ||
45 | static ssize_t | ||
46 | generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | ||
47 | loff_t offset, unsigned long nr_segs); | ||
48 | 46 | ||
49 | /* | 47 | /* |
50 | * Shared mappings implemented 30.11.1994. It's not fully working yet, | 48 | * Shared mappings implemented 30.11.1994. It's not fully working yet, |
@@ -112,18 +110,18 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
112 | /* | 110 | /* |
113 | * Remove a page from the page cache and free it. Caller has to make | 111 | * Remove a page from the page cache and free it. Caller has to make |
114 | * sure the page is locked and that nobody else uses it - or that usage | 112 | * sure the page is locked and that nobody else uses it - or that usage |
115 | * is safe. The caller must hold a write_lock on the mapping's tree_lock. | 113 | * is safe. The caller must hold the mapping's tree_lock. |
116 | */ | 114 | */ |
117 | void __remove_from_page_cache(struct page *page) | 115 | void __remove_from_page_cache(struct page *page) |
118 | { | 116 | { |
119 | struct address_space *mapping = page->mapping; | 117 | struct address_space *mapping = page->mapping; |
120 | 118 | ||
121 | mem_cgroup_uncharge_page(page); | ||
122 | radix_tree_delete(&mapping->page_tree, page->index); | 119 | radix_tree_delete(&mapping->page_tree, page->index); |
123 | page->mapping = NULL; | 120 | page->mapping = NULL; |
124 | mapping->nrpages--; | 121 | mapping->nrpages--; |
125 | __dec_zone_page_state(page, NR_FILE_PAGES); | 122 | __dec_zone_page_state(page, NR_FILE_PAGES); |
126 | BUG_ON(page_mapped(page)); | 123 | BUG_ON(page_mapped(page)); |
124 | mem_cgroup_uncharge_cache_page(page); | ||
127 | 125 | ||
128 | /* | 126 | /* |
129 | * Some filesystems seem to re-dirty the page even after | 127 | * Some filesystems seem to re-dirty the page even after |
@@ -144,9 +142,9 @@ void remove_from_page_cache(struct page *page) | |||
144 | 142 | ||
145 | BUG_ON(!PageLocked(page)); | 143 | BUG_ON(!PageLocked(page)); |
146 | 144 | ||
147 | write_lock_irq(&mapping->tree_lock); | 145 | spin_lock_irq(&mapping->tree_lock); |
148 | __remove_from_page_cache(page); | 146 | __remove_from_page_cache(page); |
149 | write_unlock_irq(&mapping->tree_lock); | 147 | spin_unlock_irq(&mapping->tree_lock); |
150 | } | 148 | } |
151 | 149 | ||
152 | static int sync_page(void *word) | 150 | static int sync_page(void *word) |
@@ -445,55 +443,74 @@ int filemap_write_and_wait_range(struct address_space *mapping, | |||
445 | } | 443 | } |
446 | 444 | ||
447 | /** | 445 | /** |
448 | * add_to_page_cache - add newly allocated pagecache pages | 446 | * add_to_page_cache_locked - add a locked page to the pagecache |
449 | * @page: page to add | 447 | * @page: page to add |
450 | * @mapping: the page's address_space | 448 | * @mapping: the page's address_space |
451 | * @offset: page index | 449 | * @offset: page index |
452 | * @gfp_mask: page allocation mode | 450 | * @gfp_mask: page allocation mode |
453 | * | 451 | * |
454 | * This function is used to add newly allocated pagecache pages; | 452 | * This function is used to add a page to the pagecache. It must be locked. |
455 | * the page is new, so we can just run SetPageLocked() against it. | ||
456 | * The other page state flags were set by rmqueue(). | ||
457 | * | ||
458 | * This function does not add the page to the LRU. The caller must do that. | 453 | * This function does not add the page to the LRU. The caller must do that. |
459 | */ | 454 | */ |
460 | int add_to_page_cache(struct page *page, struct address_space *mapping, | 455 | int add_to_page_cache_locked(struct page *page, struct address_space *mapping, |
461 | pgoff_t offset, gfp_t gfp_mask) | 456 | pgoff_t offset, gfp_t gfp_mask) |
462 | { | 457 | { |
463 | int error = mem_cgroup_cache_charge(page, current->mm, | 458 | int error; |
459 | |||
460 | VM_BUG_ON(!PageLocked(page)); | ||
461 | |||
462 | error = mem_cgroup_cache_charge(page, current->mm, | ||
464 | gfp_mask & ~__GFP_HIGHMEM); | 463 | gfp_mask & ~__GFP_HIGHMEM); |
465 | if (error) | 464 | if (error) |
466 | goto out; | 465 | goto out; |
467 | 466 | ||
468 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); | 467 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); |
469 | if (error == 0) { | 468 | if (error == 0) { |
470 | write_lock_irq(&mapping->tree_lock); | 469 | page_cache_get(page); |
470 | page->mapping = mapping; | ||
471 | page->index = offset; | ||
472 | |||
473 | spin_lock_irq(&mapping->tree_lock); | ||
471 | error = radix_tree_insert(&mapping->page_tree, offset, page); | 474 | error = radix_tree_insert(&mapping->page_tree, offset, page); |
472 | if (!error) { | 475 | if (likely(!error)) { |
473 | page_cache_get(page); | ||
474 | SetPageLocked(page); | ||
475 | page->mapping = mapping; | ||
476 | page->index = offset; | ||
477 | mapping->nrpages++; | 476 | mapping->nrpages++; |
478 | __inc_zone_page_state(page, NR_FILE_PAGES); | 477 | __inc_zone_page_state(page, NR_FILE_PAGES); |
479 | } else | 478 | } else { |
480 | mem_cgroup_uncharge_page(page); | 479 | page->mapping = NULL; |
480 | mem_cgroup_uncharge_cache_page(page); | ||
481 | page_cache_release(page); | ||
482 | } | ||
481 | 483 | ||
482 | write_unlock_irq(&mapping->tree_lock); | 484 | spin_unlock_irq(&mapping->tree_lock); |
483 | radix_tree_preload_end(); | 485 | radix_tree_preload_end(); |
484 | } else | 486 | } else |
485 | mem_cgroup_uncharge_page(page); | 487 | mem_cgroup_uncharge_cache_page(page); |
486 | out: | 488 | out: |
487 | return error; | 489 | return error; |
488 | } | 490 | } |
489 | EXPORT_SYMBOL(add_to_page_cache); | 491 | EXPORT_SYMBOL(add_to_page_cache_locked); |
490 | 492 | ||
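[Editor's note] The reworked add_to_page_cache_locked() takes the page reference and fills in page->mapping and page->index before acquiring tree_lock, and undoes all of that if the radix-tree insert fails, because the moment the page becomes visible in the tree a lockless reader may pick it up. A minimal userspace model of that publish-fully-initialized-or-roll-back pattern (C11 atomics; all names are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdio.h>

struct object {
	int mapping;		/* stands in for page->mapping */
	long index;		/* stands in for page->index */
	atomic_int refcount;	/* stands in for the page refcount */
};

static _Atomic(struct object *) slot;	/* stands in for one radix-tree slot */

/* Publish 'obj' at 'slot'. All fields are initialized *before* publication,
 * because a lockless reader may see the object the instant it is visible.
 * On failure everything is rolled back, as add_to_page_cache_locked() does. */
static int publish(struct object *obj, int mapping, long index)
{
	obj->mapping = mapping;
	obj->index = index;
	atomic_fetch_add(&obj->refcount, 1);		/* page_cache_get() */

	struct object *expected = NULL;
	if (atomic_compare_exchange_strong(&slot, &expected, obj))
		return 0;				/* now visible to readers */

	/* Insertion failed: undo the setup above. */
	obj->mapping = 0;
	atomic_fetch_sub(&obj->refcount, 1);		/* page_cache_release() */
	return -1;
}

int main(void)
{
	struct object a = { .refcount = 1 }, b = { .refcount = 1 };

	printf("first insert:  %d\n", publish(&a, 42, 0));	/* succeeds */
	printf("second insert: %d\n", publish(&b, 42, 0));	/* slot busy, rolled back */
	return 0;
}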
491 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | 493 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, |
492 | pgoff_t offset, gfp_t gfp_mask) | 494 | pgoff_t offset, gfp_t gfp_mask) |
493 | { | 495 | { |
494 | int ret = add_to_page_cache(page, mapping, offset, gfp_mask); | 496 | int ret; |
495 | if (ret == 0) | 497 | |
496 | lru_cache_add(page); | 498 | /* |
499 | * Splice_read and readahead add shmem/tmpfs pages into the page cache | ||
500 | * before shmem_readpage has a chance to mark them as SwapBacked: they | ||
501 | * need to go on the active_anon lru below, and mem_cgroup_cache_charge | ||
502 | * (called in add_to_page_cache) needs to know where they're going too. | ||
503 | */ | ||
504 | if (mapping_cap_swap_backed(mapping)) | ||
505 | SetPageSwapBacked(page); | ||
506 | |||
507 | ret = add_to_page_cache(page, mapping, offset, gfp_mask); | ||
508 | if (ret == 0) { | ||
509 | if (page_is_file_cache(page)) | ||
510 | lru_cache_add_file(page); | ||
511 | else | ||
512 | lru_cache_add_active_anon(page); | ||
513 | } | ||
497 | return ret; | 514 | return ret; |
498 | } | 515 | } |
499 | 516 | ||
@@ -556,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit); | |||
556 | * mechanism between PageLocked pages and PageWriteback pages is shared. | 573 | * mechanism between PageLocked pages and PageWriteback pages is shared. |
557 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. | 574 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. |
558 | * | 575 | * |
559 | * The first mb is necessary to safely close the critical section opened by the | 576 | * The mb is necessary to enforce ordering between the clear_bit and the read |
560 | * TestSetPageLocked(), the second mb is necessary to enforce ordering between | 577 | * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). |
561 | * the clear_bit and the read of the waitqueue (to avoid SMP races with a | ||
562 | * parallel wait_on_page_locked()). | ||
563 | */ | 578 | */ |
564 | void unlock_page(struct page *page) | 579 | void unlock_page(struct page *page) |
565 | { | 580 | { |
566 | smp_mb__before_clear_bit(); | 581 | VM_BUG_ON(!PageLocked(page)); |
567 | if (!TestClearPageLocked(page)) | 582 | clear_bit_unlock(PG_locked, &page->flags); |
568 | BUG(); | 583 | smp_mb__after_clear_bit(); |
569 | smp_mb__after_clear_bit(); | ||
570 | wake_up_page(page, PG_locked); | 584 | wake_up_page(page, PG_locked); |
571 | } | 585 | } |
572 | EXPORT_SYMBOL(unlock_page); | 586 | EXPORT_SYMBOL(unlock_page); |
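[Editor's note] The rewritten unlock_page() depends on clear_bit_unlock() having release semantics and on smp_mb__after_clear_bit() ordering the bit clear before the wait-queue check, so a task that queued itself just before the unlock cannot be missed. A hedged userspace analogue of that "release-clear, full barrier, then look for waiters" ordering (C11 atomics; simplified, with no real sleeping):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint flags;		/* bit 0: locked, stands in for PG_locked */
static atomic_int waiters;		/* stands in for the hashed wait queue */

static void lock_object(void)
{
	/* acquire semantics when we take the bit, like the old TestSetPageLocked() */
	while (atomic_fetch_or_explicit(&flags, 1u, memory_order_acquire) & 1u) {
		atomic_fetch_add(&waiters, 1);	/* register before sleeping */
		/* a real implementation would block here (wait_on_page_bit) */
		atomic_fetch_sub(&waiters, 1);
	}
}

static void unlock_object(void)
{
	/* clear_bit_unlock(): release, so prior writes are visible to the next owner */
	atomic_fetch_and_explicit(&flags, ~1u, memory_order_release);

	/* smp_mb__after_clear_bit(): the bit clear must be ordered before the
	 * waiter check, otherwise we could read a stale "no waiters" value and
	 * miss waking a sleeper that registered itself just before the clear. */
	atomic_thread_fence(memory_order_seq_cst);

	if (atomic_load(&waiters) > 0)
		puts("wake_up_page(page, PG_locked)");	/* would wake sleepers */
}

int main(void)
{
	lock_object();
	unlock_object();
	return 0;
}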
@@ -636,15 +650,35 @@ void __lock_page_nosync(struct page *page) | |||
636 | * Is there a pagecache struct page at the given (mapping, offset) tuple? | 650 | * Is there a pagecache struct page at the given (mapping, offset) tuple? |
637 | * If yes, increment its refcount and return it; if no, return NULL. | 651 | * If yes, increment its refcount and return it; if no, return NULL. |
638 | */ | 652 | */ |
639 | struct page * find_get_page(struct address_space *mapping, pgoff_t offset) | 653 | struct page *find_get_page(struct address_space *mapping, pgoff_t offset) |
640 | { | 654 | { |
655 | void **pagep; | ||
641 | struct page *page; | 656 | struct page *page; |
642 | 657 | ||
643 | read_lock_irq(&mapping->tree_lock); | 658 | rcu_read_lock(); |
644 | page = radix_tree_lookup(&mapping->page_tree, offset); | 659 | repeat: |
645 | if (page) | 660 | page = NULL; |
646 | page_cache_get(page); | 661 | pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); |
647 | read_unlock_irq(&mapping->tree_lock); | 662 | if (pagep) { |
663 | page = radix_tree_deref_slot(pagep); | ||
664 | if (unlikely(!page || page == RADIX_TREE_RETRY)) | ||
665 | goto repeat; | ||
666 | |||
667 | if (!page_cache_get_speculative(page)) | ||
668 | goto repeat; | ||
669 | |||
670 | /* | ||
671 | * Has the page moved? | ||
672 | * This is part of the lockless pagecache protocol. See | ||
673 | * include/linux/pagemap.h for details. | ||
674 | */ | ||
675 | if (unlikely(page != *pagep)) { | ||
676 | page_cache_release(page); | ||
677 | goto repeat; | ||
678 | } | ||
679 | } | ||
680 | rcu_read_unlock(); | ||
681 | |||
648 | return page; | 682 | return page; |
649 | } | 683 | } |
650 | EXPORT_SYMBOL(find_get_page); | 684 | EXPORT_SYMBOL(find_get_page); |
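[Editor's note] find_get_page() now runs locklessly: under rcu_read_lock() it dereferences the slot, takes a reference only if page_cache_get_speculative() sees a non-zero count, and then re-checks that the slot still points at the same page, retrying otherwise. The userspace model below shows just the speculative-get-and-recheck step; the RCU grace period that keeps the struct itself valid is assumed and omitted, and all names are illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct object {
	atomic_int refcount;	/* 0 means the object is being freed */
	long value;
};

static _Atomic(struct object *) slot;	/* one "radix tree" slot */

/* page_cache_get_speculative() analogue: only succeed if the object is
 * still live, i.e. its refcount has not already dropped to zero. */
static bool get_speculative(struct object *obj)
{
	int c = atomic_load(&obj->refcount);
	while (c > 0) {
		if (atomic_compare_exchange_weak(&obj->refcount, &c, c + 1))
			return true;
	}
	return false;		/* raced with the final put; caller must retry */
}

/* find_get_page() analogue: read the slot, take a speculative reference,
 * then confirm the slot still points at the same object ("has it moved?"). */
static struct object *lookup(void)
{
	struct object *obj;

	do {
		obj = atomic_load(&slot);
		if (!obj)
			return NULL;
		if (!get_speculative(obj))
			continue;			/* retry the lookup */
		if (obj == atomic_load(&slot))
			return obj;			/* stable: safe to use */
		atomic_fetch_sub(&obj->refcount, 1);	/* moved under us: drop and retry */
	} while (1);
}

int main(void)
{
	struct object o = { .refcount = 1, .value = 7 };
	atomic_store(&slot, &o);

	struct object *found = lookup();
	printf("found value %ld, refcount now %d\n",
	       found->value, atomic_load(&found->refcount));
	return 0;
}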
@@ -659,32 +693,22 @@ EXPORT_SYMBOL(find_get_page); | |||
659 | * | 693 | * |
660 | * Returns zero if the page was not present. find_lock_page() may sleep. | 694 | * Returns zero if the page was not present. find_lock_page() may sleep. |
661 | */ | 695 | */ |
662 | struct page *find_lock_page(struct address_space *mapping, | 696 | struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) |
663 | pgoff_t offset) | ||
664 | { | 697 | { |
665 | struct page *page; | 698 | struct page *page; |
666 | 699 | ||
667 | repeat: | 700 | repeat: |
668 | read_lock_irq(&mapping->tree_lock); | 701 | page = find_get_page(mapping, offset); |
669 | page = radix_tree_lookup(&mapping->page_tree, offset); | ||
670 | if (page) { | 702 | if (page) { |
671 | page_cache_get(page); | 703 | lock_page(page); |
672 | if (TestSetPageLocked(page)) { | 704 | /* Has the page been truncated? */ |
673 | read_unlock_irq(&mapping->tree_lock); | 705 | if (unlikely(page->mapping != mapping)) { |
674 | __lock_page(page); | 706 | unlock_page(page); |
675 | 707 | page_cache_release(page); | |
676 | /* Has the page been truncated while we slept? */ | 708 | goto repeat; |
677 | if (unlikely(page->mapping != mapping)) { | ||
678 | unlock_page(page); | ||
679 | page_cache_release(page); | ||
680 | goto repeat; | ||
681 | } | ||
682 | VM_BUG_ON(page->index != offset); | ||
683 | goto out; | ||
684 | } | 709 | } |
710 | VM_BUG_ON(page->index != offset); | ||
685 | } | 711 | } |
686 | read_unlock_irq(&mapping->tree_lock); | ||
687 | out: | ||
688 | return page; | 712 | return page; |
689 | } | 713 | } |
690 | EXPORT_SYMBOL(find_lock_page); | 714 | EXPORT_SYMBOL(find_lock_page); |
@@ -750,13 +774,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | |||
750 | { | 774 | { |
751 | unsigned int i; | 775 | unsigned int i; |
752 | unsigned int ret; | 776 | unsigned int ret; |
777 | unsigned int nr_found; | ||
778 | |||
779 | rcu_read_lock(); | ||
780 | restart: | ||
781 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | ||
782 | (void ***)pages, start, nr_pages); | ||
783 | ret = 0; | ||
784 | for (i = 0; i < nr_found; i++) { | ||
785 | struct page *page; | ||
786 | repeat: | ||
787 | page = radix_tree_deref_slot((void **)pages[i]); | ||
788 | if (unlikely(!page)) | ||
789 | continue; | ||
790 | /* | ||
791 | * this can only trigger if nr_found == 1, making livelock | ||
792 | * a non issue. | ||
793 | */ | ||
794 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
795 | goto restart; | ||
753 | 796 | ||
754 | read_lock_irq(&mapping->tree_lock); | 797 | if (!page_cache_get_speculative(page)) |
755 | ret = radix_tree_gang_lookup(&mapping->page_tree, | 798 | goto repeat; |
756 | (void **)pages, start, nr_pages); | 799 | |
757 | for (i = 0; i < ret; i++) | 800 | /* Has the page moved? */ |
758 | page_cache_get(pages[i]); | 801 | if (unlikely(page != *((void **)pages[i]))) { |
759 | read_unlock_irq(&mapping->tree_lock); | 802 | page_cache_release(page); |
803 | goto repeat; | ||
804 | } | ||
805 | |||
806 | pages[ret] = page; | ||
807 | ret++; | ||
808 | } | ||
809 | rcu_read_unlock(); | ||
760 | return ret; | 810 | return ret; |
761 | } | 811 | } |
762 | 812 | ||
@@ -777,19 +827,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
777 | { | 827 | { |
778 | unsigned int i; | 828 | unsigned int i; |
779 | unsigned int ret; | 829 | unsigned int ret; |
830 | unsigned int nr_found; | ||
831 | |||
832 | rcu_read_lock(); | ||
833 | restart: | ||
834 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | ||
835 | (void ***)pages, index, nr_pages); | ||
836 | ret = 0; | ||
837 | for (i = 0; i < nr_found; i++) { | ||
838 | struct page *page; | ||
839 | repeat: | ||
840 | page = radix_tree_deref_slot((void **)pages[i]); | ||
841 | if (unlikely(!page)) | ||
842 | continue; | ||
843 | /* | ||
844 | * this can only trigger if nr_found == 1, making livelock | ||
845 | * a non issue. | ||
846 | */ | ||
847 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
848 | goto restart; | ||
780 | 849 | ||
781 | read_lock_irq(&mapping->tree_lock); | 850 | if (page->mapping == NULL || page->index != index) |
782 | ret = radix_tree_gang_lookup(&mapping->page_tree, | ||
783 | (void **)pages, index, nr_pages); | ||
784 | for (i = 0; i < ret; i++) { | ||
785 | if (pages[i]->mapping == NULL || pages[i]->index != index) | ||
786 | break; | 851 | break; |
787 | 852 | ||
788 | page_cache_get(pages[i]); | 853 | if (!page_cache_get_speculative(page)) |
854 | goto repeat; | ||
855 | |||
856 | /* Has the page moved? */ | ||
857 | if (unlikely(page != *((void **)pages[i]))) { | ||
858 | page_cache_release(page); | ||
859 | goto repeat; | ||
860 | } | ||
861 | |||
862 | pages[ret] = page; | ||
863 | ret++; | ||
789 | index++; | 864 | index++; |
790 | } | 865 | } |
791 | read_unlock_irq(&mapping->tree_lock); | 866 | rcu_read_unlock(); |
792 | return i; | 867 | return ret; |
793 | } | 868 | } |
794 | EXPORT_SYMBOL(find_get_pages_contig); | 869 | EXPORT_SYMBOL(find_get_pages_contig); |
795 | 870 | ||
@@ -809,15 +884,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | |||
809 | { | 884 | { |
810 | unsigned int i; | 885 | unsigned int i; |
811 | unsigned int ret; | 886 | unsigned int ret; |
887 | unsigned int nr_found; | ||
888 | |||
889 | rcu_read_lock(); | ||
890 | restart: | ||
891 | nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, | ||
892 | (void ***)pages, *index, nr_pages, tag); | ||
893 | ret = 0; | ||
894 | for (i = 0; i < nr_found; i++) { | ||
895 | struct page *page; | ||
896 | repeat: | ||
897 | page = radix_tree_deref_slot((void **)pages[i]); | ||
898 | if (unlikely(!page)) | ||
899 | continue; | ||
900 | /* | ||
901 | * this can only trigger if nr_found == 1, making livelock | ||
902 | * a non issue. | ||
903 | */ | ||
904 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
905 | goto restart; | ||
906 | |||
907 | if (!page_cache_get_speculative(page)) | ||
908 | goto repeat; | ||
909 | |||
910 | /* Has the page moved? */ | ||
911 | if (unlikely(page != *((void **)pages[i]))) { | ||
912 | page_cache_release(page); | ||
913 | goto repeat; | ||
914 | } | ||
915 | |||
916 | pages[ret] = page; | ||
917 | ret++; | ||
918 | } | ||
919 | rcu_read_unlock(); | ||
812 | 920 | ||
813 | read_lock_irq(&mapping->tree_lock); | ||
814 | ret = radix_tree_gang_lookup_tag(&mapping->page_tree, | ||
815 | (void **)pages, *index, nr_pages, tag); | ||
816 | for (i = 0; i < ret; i++) | ||
817 | page_cache_get(pages[i]); | ||
818 | if (ret) | 921 | if (ret) |
819 | *index = pages[ret - 1]->index + 1; | 922 | *index = pages[ret - 1]->index + 1; |
820 | read_unlock_irq(&mapping->tree_lock); | 923 | |
821 | return ret; | 924 | return ret; |
822 | } | 925 | } |
823 | EXPORT_SYMBOL(find_get_pages_tag); | 926 | EXPORT_SYMBOL(find_get_pages_tag); |
@@ -841,7 +944,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) | |||
841 | struct page *page = find_get_page(mapping, index); | 944 | struct page *page = find_get_page(mapping, index); |
842 | 945 | ||
843 | if (page) { | 946 | if (page) { |
844 | if (!TestSetPageLocked(page)) | 947 | if (trylock_page(page)) |
845 | return page; | 948 | return page; |
846 | page_cache_release(page); | 949 | page_cache_release(page); |
847 | return NULL; | 950 | return NULL; |
@@ -933,8 +1036,17 @@ find_page: | |||
933 | ra, filp, page, | 1036 | ra, filp, page, |
934 | index, last_index - index); | 1037 | index, last_index - index); |
935 | } | 1038 | } |
936 | if (!PageUptodate(page)) | 1039 | if (!PageUptodate(page)) { |
937 | goto page_not_up_to_date; | 1040 | if (inode->i_blkbits == PAGE_CACHE_SHIFT || |
1041 | !mapping->a_ops->is_partially_uptodate) | ||
1042 | goto page_not_up_to_date; | ||
1043 | if (!trylock_page(page)) | ||
1044 | goto page_not_up_to_date; | ||
1045 | if (!mapping->a_ops->is_partially_uptodate(page, | ||
1046 | desc, offset)) | ||
1047 | goto page_not_up_to_date_locked; | ||
1048 | unlock_page(page); | ||
1049 | } | ||
938 | page_ok: | 1050 | page_ok: |
939 | /* | 1051 | /* |
940 | * i_size must be checked after we know the page is Uptodate. | 1052 | * i_size must be checked after we know the page is Uptodate. |
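[Editor's note] The new branch lets a filesystem short-circuit the "page not uptodate" path through the ->is_partially_uptodate address-space operation when the blocks covering the requested bytes are already uptodate; this only matters when the block size is smaller than the page size. A hedged sketch of such a predicate over a per-block uptodate bitmap (names and geometry are assumptions, not the kernel interface):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE   4096u
#define BLOCK_SIZE  1024u	/* model of a filesystem block smaller than a page */

/* One bit per block in the page: set means that block's data is uptodate. */
struct page_state {
	uint8_t uptodate_bits;
};

/* Model of ->is_partially_uptodate(): report whether every block overlapping
 * the byte range [offset, offset + count) within the page is uptodate. */
static bool blocks_uptodate(const struct page_state *ps,
			    unsigned int offset, unsigned int count)
{
	unsigned int first = offset / BLOCK_SIZE;
	unsigned int last = (offset + count - 1) / BLOCK_SIZE;

	for (unsigned int b = first; b <= last; b++)
		if (!(ps->uptodate_bits & (1u << b)))
			return false;
	return true;
}

int main(void)
{
	/* blocks 0 and 1 have been read in, blocks 2 and 3 are still missing */
	struct page_state ps = { .uptodate_bits = 0x3 };

	printf("read of [0, 512):     %s\n",
	       blocks_uptodate(&ps, 0, 512) ? "served from cache" : "must read page");
	printf("read of [1500, 2600): %s\n",
	       blocks_uptodate(&ps, 1500, 1100) ? "served from cache" : "must read page");
	return 0;
}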
@@ -1001,9 +1113,11 @@ page_ok: | |||
1001 | 1113 | ||
1002 | page_not_up_to_date: | 1114 | page_not_up_to_date: |
1003 | /* Get exclusive access to the page ... */ | 1115 | /* Get exclusive access to the page ... */ |
1004 | if (lock_page_killable(page)) | 1116 | error = lock_page_killable(page); |
1005 | goto readpage_eio; | 1117 | if (unlikely(error)) |
1118 | goto readpage_error; | ||
1006 | 1119 | ||
1120 | page_not_up_to_date_locked: | ||
1007 | /* Did it get truncated before we got the lock? */ | 1121 | /* Did it get truncated before we got the lock? */ |
1008 | if (!page->mapping) { | 1122 | if (!page->mapping) { |
1009 | unlock_page(page); | 1123 | unlock_page(page); |
@@ -1030,8 +1144,9 @@ readpage: | |||
1030 | } | 1144 | } |
1031 | 1145 | ||
1032 | if (!PageUptodate(page)) { | 1146 | if (!PageUptodate(page)) { |
1033 | if (lock_page_killable(page)) | 1147 | error = lock_page_killable(page); |
1034 | goto readpage_eio; | 1148 | if (unlikely(error)) |
1149 | goto readpage_error; | ||
1035 | if (!PageUptodate(page)) { | 1150 | if (!PageUptodate(page)) { |
1036 | if (page->mapping == NULL) { | 1151 | if (page->mapping == NULL) { |
1037 | /* | 1152 | /* |
@@ -1043,15 +1158,14 @@ readpage: | |||
1043 | } | 1158 | } |
1044 | unlock_page(page); | 1159 | unlock_page(page); |
1045 | shrink_readahead_size_eio(filp, ra); | 1160 | shrink_readahead_size_eio(filp, ra); |
1046 | goto readpage_eio; | 1161 | error = -EIO; |
1162 | goto readpage_error; | ||
1047 | } | 1163 | } |
1048 | unlock_page(page); | 1164 | unlock_page(page); |
1049 | } | 1165 | } |
1050 | 1166 | ||
1051 | goto page_ok; | 1167 | goto page_ok; |
1052 | 1168 | ||
1053 | readpage_eio: | ||
1054 | error = -EIO; | ||
1055 | readpage_error: | 1169 | readpage_error: |
1056 | /* UHHUH! A synchronous read error occurred. Report it */ | 1170 | /* UHHUH! A synchronous read error occurred. Report it */ |
1057 | desc->error = error; | 1171 | desc->error = error; |
@@ -1086,8 +1200,7 @@ out: | |||
1086 | ra->prev_pos |= prev_offset; | 1200 | ra->prev_pos |= prev_offset; |
1087 | 1201 | ||
1088 | *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; | 1202 | *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; |
1089 | if (filp) | 1203 | file_accessed(filp); |
1090 | file_accessed(filp); | ||
1091 | } | 1204 | } |
1092 | 1205 | ||
1093 | int file_read_actor(read_descriptor_t *desc, struct page *page, | 1206 | int file_read_actor(read_descriptor_t *desc, struct page *page, |
@@ -1200,42 +1313,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1200 | 1313 | ||
1201 | mapping = filp->f_mapping; | 1314 | mapping = filp->f_mapping; |
1202 | inode = mapping->host; | 1315 | inode = mapping->host; |
1203 | retval = 0; | ||
1204 | if (!count) | 1316 | if (!count) |
1205 | goto out; /* skip atime */ | 1317 | goto out; /* skip atime */ |
1206 | size = i_size_read(inode); | 1318 | size = i_size_read(inode); |
1207 | if (pos < size) { | 1319 | if (pos < size) { |
1208 | retval = generic_file_direct_IO(READ, iocb, | 1320 | retval = filemap_write_and_wait(mapping); |
1209 | iov, pos, nr_segs); | 1321 | if (!retval) { |
1322 | retval = mapping->a_ops->direct_IO(READ, iocb, | ||
1323 | iov, pos, nr_segs); | ||
1324 | } | ||
1210 | if (retval > 0) | 1325 | if (retval > 0) |
1211 | *ppos = pos + retval; | 1326 | *ppos = pos + retval; |
1212 | } | 1327 | if (retval) { |
1213 | if (likely(retval != 0)) { | 1328 | file_accessed(filp); |
1214 | file_accessed(filp); | 1329 | goto out; |
1215 | goto out; | 1330 | } |
1216 | } | 1331 | } |
1217 | } | 1332 | } |
1218 | 1333 | ||
1219 | retval = 0; | 1334 | for (seg = 0; seg < nr_segs; seg++) { |
1220 | if (count) { | 1335 | read_descriptor_t desc; |
1221 | for (seg = 0; seg < nr_segs; seg++) { | ||
1222 | read_descriptor_t desc; | ||
1223 | 1336 | ||
1224 | desc.written = 0; | 1337 | desc.written = 0; |
1225 | desc.arg.buf = iov[seg].iov_base; | 1338 | desc.arg.buf = iov[seg].iov_base; |
1226 | desc.count = iov[seg].iov_len; | 1339 | desc.count = iov[seg].iov_len; |
1227 | if (desc.count == 0) | 1340 | if (desc.count == 0) |
1228 | continue; | 1341 | continue; |
1229 | desc.error = 0; | 1342 | desc.error = 0; |
1230 | do_generic_file_read(filp,ppos,&desc,file_read_actor); | 1343 | do_generic_file_read(filp, ppos, &desc, file_read_actor); |
1231 | retval += desc.written; | 1344 | retval += desc.written; |
1232 | if (desc.error) { | 1345 | if (desc.error) { |
1233 | retval = retval ?: desc.error; | 1346 | retval = retval ?: desc.error; |
1234 | break; | 1347 | break; |
1235 | } | ||
1236 | if (desc.count > 0) | ||
1237 | break; | ||
1238 | } | 1348 | } |
1349 | if (desc.count > 0) | ||
1350 | break; | ||
1239 | } | 1351 | } |
1240 | out: | 1352 | out: |
1241 | return retval; | 1353 | return retval; |
@@ -1669,8 +1781,9 @@ static int __remove_suid(struct dentry *dentry, int kill) | |||
1669 | return notify_change(dentry, &newattrs); | 1781 | return notify_change(dentry, &newattrs); |
1670 | } | 1782 | } |
1671 | 1783 | ||
1672 | int remove_suid(struct dentry *dentry) | 1784 | int file_remove_suid(struct file *file) |
1673 | { | 1785 | { |
1786 | struct dentry *dentry = file->f_path.dentry; | ||
1674 | int killsuid = should_remove_suid(dentry); | 1787 | int killsuid = should_remove_suid(dentry); |
1675 | int killpriv = security_inode_need_killpriv(dentry); | 1788 | int killpriv = security_inode_need_killpriv(dentry); |
1676 | int error = 0; | 1789 | int error = 0; |
@@ -1684,7 +1797,7 @@ int remove_suid(struct dentry *dentry) | |||
1684 | 1797 | ||
1685 | return error; | 1798 | return error; |
1686 | } | 1799 | } |
1687 | EXPORT_SYMBOL(remove_suid); | 1800 | EXPORT_SYMBOL(file_remove_suid); |
1688 | 1801 | ||
1689 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, | 1802 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, |
1690 | const struct iovec *iov, size_t base, size_t bytes) | 1803 | const struct iovec *iov, size_t base, size_t bytes) |
@@ -1779,7 +1892,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) | |||
1779 | * The !iov->iov_len check ensures we skip over unlikely | 1892 | * The !iov->iov_len check ensures we skip over unlikely |
1780 | * zero-length segments (without overrunning the iovec). | 1893 | * zero-length segments (without overrunning the iovec). |
1781 | */ | 1894 | */ |
1782 | while (bytes || unlikely(!iov->iov_len && i->count)) { | 1895 | while (bytes || unlikely(i->count && !iov->iov_len)) { |
1783 | int copy; | 1896 | int copy; |
1784 | 1897 | ||
1785 | copy = min(bytes, iov->iov_len - base); | 1898 | copy = min(bytes, iov->iov_len - base); |
@@ -2004,11 +2117,62 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
2004 | struct address_space *mapping = file->f_mapping; | 2117 | struct address_space *mapping = file->f_mapping; |
2005 | struct inode *inode = mapping->host; | 2118 | struct inode *inode = mapping->host; |
2006 | ssize_t written; | 2119 | ssize_t written; |
2120 | size_t write_len; | ||
2121 | pgoff_t end; | ||
2007 | 2122 | ||
2008 | if (count != ocount) | 2123 | if (count != ocount) |
2009 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); | 2124 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); |
2010 | 2125 | ||
2011 | written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); | 2126 | /* |
2127 | * Unmap all mmappings of the file up-front. | ||
2128 | * | ||
2129 | * This will cause any pte dirty bits to be propagated into the | ||
2130 | * pageframes for the subsequent filemap_write_and_wait(). | ||
2131 | */ | ||
2132 | write_len = iov_length(iov, *nr_segs); | ||
2133 | end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; | ||
2134 | if (mapping_mapped(mapping)) | ||
2135 | unmap_mapping_range(mapping, pos, write_len, 0); | ||
2136 | |||
2137 | written = filemap_write_and_wait(mapping); | ||
2138 | if (written) | ||
2139 | goto out; | ||
2140 | |||
2141 | /* | ||
2142 | * After a write we want buffered reads to be sure to go to disk to get | ||
2143 | * the new data. We invalidate clean cached pages from the region we're | ||
2144 | * about to write. We do this *before* the write so that we can return | ||
2145 | * without clobbering -EIOCBQUEUED from ->direct_IO(). | ||
2146 | */ | ||
2147 | if (mapping->nrpages) { | ||
2148 | written = invalidate_inode_pages2_range(mapping, | ||
2149 | pos >> PAGE_CACHE_SHIFT, end); | ||
2150 | /* | ||
2151 | * If a page can not be invalidated, return 0 to fall back | ||
2152 | * to buffered write. | ||
2153 | */ | ||
2154 | if (written) { | ||
2155 | if (written == -EBUSY) | ||
2156 | return 0; | ||
2157 | goto out; | ||
2158 | } | ||
2159 | } | ||
2160 | |||
2161 | written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); | ||
2162 | |||
2163 | /* | ||
2164 | * Finally, try again to invalidate clean pages which might have been | ||
2165 | * cached by non-direct readahead, or faulted in by get_user_pages() | ||
2166 | * if the source of the write was an mmap'ed region of the file | ||
2167 | * we're writing. Either one is a pretty crazy thing to do, | ||
2168 | * so we don't support it 100%. If this invalidation | ||
2169 | * fails, tough, the write still worked... | ||
2170 | */ | ||
2171 | if (mapping->nrpages) { | ||
2172 | invalidate_inode_pages2_range(mapping, | ||
2173 | pos >> PAGE_CACHE_SHIFT, end); | ||
2174 | } | ||
2175 | |||
2012 | if (written > 0) { | 2176 | if (written > 0) { |
2013 | loff_t end = pos + written; | 2177 | loff_t end = pos + written; |
2014 | if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { | 2178 | if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { |
@@ -2024,6 +2188,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
2024 | * i_mutex is held, which protects generic_osync_inode() from | 2188 | * i_mutex is held, which protects generic_osync_inode() from |
2025 | * livelocking. AIO O_DIRECT ops attempt to sync metadata here. | 2189 | * livelocking. AIO O_DIRECT ops attempt to sync metadata here. |
2026 | */ | 2190 | */ |
2191 | out: | ||
2027 | if ((written >= 0 || written == -EIOCBQUEUED) && | 2192 | if ((written >= 0 || written == -EIOCBQUEUED) && |
2028 | ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2193 | ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
2029 | int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); | 2194 | int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); |
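[Editor's note] With generic_file_direct_IO() folded in (its removal appears further down), the direct-write path performs a fixed sequence: unmap file mappings so dirty PTE bits reach the pageframes, flush and wait on dirty pagecache, invalidate the target range (returning 0 on -EBUSY so the caller falls back to buffered writes), call ->direct_IO(), then invalidate once more in case readahead or get_user_pages() repopulated the range. The stub-based sketch below reproduces only that control flow; the helpers are placeholders, not kernel calls:

#include <stdio.h>

/* Stubs standing in for the kernel helpers; each returns 0 on success. */
static int unmap_range(void)        { puts("unmap_mapping_range");           return 0; }
static int flush_and_wait(void)     { puts("filemap_write_and_wait");        return 0; }
static int invalidate_range(void)   { puts("invalidate_inode_pages2_range"); return 0; }
static long do_direct_io(long len)  { puts("->direct_IO(WRITE)");            return len; }

#define MODEL_EBUSY (-16)	/* models -EBUSY from a page that cannot be invalidated */

/* Control-flow skeleton of generic_file_direct_write() after this change. */
static long direct_write(long len)
{
	long ret;

	unmap_range();			/* propagate pte dirty bits first */

	ret = flush_and_wait();		/* push dirty pagecache to disk */
	if (ret)
		return ret;

	ret = invalidate_range();	/* pre-invalidate before the write */
	if (ret == MODEL_EBUSY)
		return 0;		/* caller falls back to a buffered write */
	if (ret)
		return ret;

	ret = do_direct_io(len);	/* the actual O_DIRECT submission */

	invalidate_range();		/* best effort: drop repopulated clean pages */
	return ret;
}

int main(void)
{
	printf("wrote %ld bytes\n", direct_write(4096));
	return 0;
}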
@@ -2395,7 +2560,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2395 | if (count == 0) | 2560 | if (count == 0) |
2396 | goto out; | 2561 | goto out; |
2397 | 2562 | ||
2398 | err = remove_suid(file->f_path.dentry); | 2563 | err = file_remove_suid(file); |
2399 | if (err) | 2564 | if (err) |
2400 | goto out; | 2565 | goto out; |
2401 | 2566 | ||
@@ -2511,66 +2676,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2511 | } | 2676 | } |
2512 | EXPORT_SYMBOL(generic_file_aio_write); | 2677 | EXPORT_SYMBOL(generic_file_aio_write); |
2513 | 2678 | ||
2514 | /* | ||
2515 | * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something | ||
2516 | * went wrong during pagecache shootdown. | ||
2517 | */ | ||
2518 | static ssize_t | ||
2519 | generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | ||
2520 | loff_t offset, unsigned long nr_segs) | ||
2521 | { | ||
2522 | struct file *file = iocb->ki_filp; | ||
2523 | struct address_space *mapping = file->f_mapping; | ||
2524 | ssize_t retval; | ||
2525 | size_t write_len; | ||
2526 | pgoff_t end = 0; /* silence gcc */ | ||
2527 | |||
2528 | /* | ||
2529 | * If it's a write, unmap all mmappings of the file up-front. This | ||
2530 | * will cause any pte dirty bits to be propagated into the pageframes | ||
2531 | * for the subsequent filemap_write_and_wait(). | ||
2532 | */ | ||
2533 | if (rw == WRITE) { | ||
2534 | write_len = iov_length(iov, nr_segs); | ||
2535 | end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT; | ||
2536 | if (mapping_mapped(mapping)) | ||
2537 | unmap_mapping_range(mapping, offset, write_len, 0); | ||
2538 | } | ||
2539 | |||
2540 | retval = filemap_write_and_wait(mapping); | ||
2541 | if (retval) | ||
2542 | goto out; | ||
2543 | |||
2544 | /* | ||
2545 | * After a write we want buffered reads to be sure to go to disk to get | ||
2546 | * the new data. We invalidate clean cached page from the region we're | ||
2547 | * about to write. We do this *before* the write so that we can return | ||
2548 | * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). | ||
2549 | */ | ||
2550 | if (rw == WRITE && mapping->nrpages) { | ||
2551 | retval = invalidate_inode_pages2_range(mapping, | ||
2552 | offset >> PAGE_CACHE_SHIFT, end); | ||
2553 | if (retval) | ||
2554 | goto out; | ||
2555 | } | ||
2556 | |||
2557 | retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); | ||
2558 | |||
2559 | /* | ||
2560 | * Finally, try again to invalidate clean pages which might have been | ||
2561 | * cached by non-direct readahead, or faulted in by get_user_pages() | ||
2562 | * if the source of the write was an mmap'ed region of the file | ||
2563 | * we're writing. Either one is a pretty crazy thing to do, | ||
2564 | * so we don't support it 100%. If this invalidation | ||
2565 | * fails, tough, the write still worked... | ||
2566 | */ | ||
2567 | if (rw == WRITE && mapping->nrpages) { | ||
2568 | invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); | ||
2569 | } | ||
2570 | out: | ||
2571 | return retval; | ||
2572 | } | ||
2573 | |||
2574 | /** | 2679 | /** |
2575 | * try_to_release_page() - release old fs-specific metadata on a page | 2680 | * try_to_release_page() - release old fs-specific metadata on a page |
2576 | * | 2681 | * |
@@ -2582,9 +2687,8 @@ out: | |||
2582 | * Otherwise return zero. | 2687 | * Otherwise return zero. |
2583 | * | 2688 | * |
2584 | * The @gfp_mask argument specifies whether I/O may be performed to release | 2689 | * The @gfp_mask argument specifies whether I/O may be performed to release |
2585 | * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). | 2690 | * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). |
2586 | * | 2691 | * |
2587 | * NOTE: @gfp_mask may go away, and this function may become non-blocking. | ||
2588 | */ | 2692 | */ |
2589 | int try_to_release_page(struct page *page, gfp_t gfp_mask) | 2693 | int try_to_release_page(struct page *page, gfp_t gfp_mask) |
2590 | { | 2694 | { |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 3e744abcce9d..b5167dfb2f2d 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -13,7 +13,10 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/uio.h> | 14 | #include <linux/uio.h> |
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/mmu_notifier.h> | ||
16 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
18 | #include <linux/seqlock.h> | ||
19 | #include <linux/mutex.h> | ||
17 | #include <asm/tlbflush.h> | 20 | #include <asm/tlbflush.h> |
18 | #include <asm/io.h> | 21 | #include <asm/io.h> |
19 | 22 | ||
@@ -21,22 +24,18 @@ | |||
21 | * We do use our own empty page to avoid interference with other users | 24 | * We do use our own empty page to avoid interference with other users |
22 | * of ZERO_PAGE(), such as /dev/zero | 25 | * of ZERO_PAGE(), such as /dev/zero |
23 | */ | 26 | */ |
27 | static DEFINE_MUTEX(xip_sparse_mutex); | ||
28 | static seqcount_t xip_sparse_seq = SEQCNT_ZERO; | ||
24 | static struct page *__xip_sparse_page; | 29 | static struct page *__xip_sparse_page; |
25 | 30 | ||
31 | /* called under xip_sparse_mutex */ | ||
26 | static struct page *xip_sparse_page(void) | 32 | static struct page *xip_sparse_page(void) |
27 | { | 33 | { |
28 | if (!__xip_sparse_page) { | 34 | if (!__xip_sparse_page) { |
29 | struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); | 35 | struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); |
30 | 36 | ||
31 | if (page) { | 37 | if (page) |
32 | static DEFINE_SPINLOCK(xip_alloc_lock); | 38 | __xip_sparse_page = page; |
33 | spin_lock(&xip_alloc_lock); | ||
34 | if (!__xip_sparse_page) | ||
35 | __xip_sparse_page = page; | ||
36 | else | ||
37 | __free_page(page); | ||
38 | spin_unlock(&xip_alloc_lock); | ||
39 | } | ||
40 | } | 39 | } |
41 | return __xip_sparse_page; | 40 | return __xip_sparse_page; |
42 | } | 41 | } |
@@ -173,22 +172,27 @@ __xip_unmap (struct address_space * mapping, | |||
173 | pte_t pteval; | 172 | pte_t pteval; |
174 | spinlock_t *ptl; | 173 | spinlock_t *ptl; |
175 | struct page *page; | 174 | struct page *page; |
175 | unsigned count; | ||
176 | int locked = 0; | ||
177 | |||
178 | count = read_seqcount_begin(&xip_sparse_seq); | ||
176 | 179 | ||
177 | page = __xip_sparse_page; | 180 | page = __xip_sparse_page; |
178 | if (!page) | 181 | if (!page) |
179 | return; | 182 | return; |
180 | 183 | ||
184 | retry: | ||
181 | spin_lock(&mapping->i_mmap_lock); | 185 | spin_lock(&mapping->i_mmap_lock); |
182 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 186 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
183 | mm = vma->vm_mm; | 187 | mm = vma->vm_mm; |
184 | address = vma->vm_start + | 188 | address = vma->vm_start + |
185 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 189 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
186 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 190 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
187 | pte = page_check_address(page, mm, address, &ptl); | 191 | pte = page_check_address(page, mm, address, &ptl, 1); |
188 | if (pte) { | 192 | if (pte) { |
189 | /* Nuke the page table entry. */ | 193 | /* Nuke the page table entry. */ |
190 | flush_cache_page(vma, address, pte_pfn(*pte)); | 194 | flush_cache_page(vma, address, pte_pfn(*pte)); |
191 | pteval = ptep_clear_flush(vma, address, pte); | 195 | pteval = ptep_clear_flush_notify(vma, address, pte); |
192 | page_remove_rmap(page, vma); | 196 | page_remove_rmap(page, vma); |
193 | dec_mm_counter(mm, file_rss); | 197 | dec_mm_counter(mm, file_rss); |
194 | BUG_ON(pte_dirty(pteval)); | 198 | BUG_ON(pte_dirty(pteval)); |
@@ -197,6 +201,14 @@ __xip_unmap (struct address_space * mapping, | |||
197 | } | 201 | } |
198 | } | 202 | } |
199 | spin_unlock(&mapping->i_mmap_lock); | 203 | spin_unlock(&mapping->i_mmap_lock); |
204 | |||
205 | if (locked) { | ||
206 | mutex_unlock(&xip_sparse_mutex); | ||
207 | } else if (read_seqcount_retry(&xip_sparse_seq, count)) { | ||
208 | mutex_lock(&xip_sparse_mutex); | ||
209 | locked = 1; | ||
210 | goto retry; | ||
211 | } | ||
200 | } | 212 | } |
201 | 213 | ||
202 | /* | 214 | /* |
@@ -217,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
217 | int error; | 229 | int error; |
218 | 230 | ||
219 | /* XXX: are VM_FAULT_ codes OK? */ | 231 | /* XXX: are VM_FAULT_ codes OK? */ |
220 | 232 | again: | |
221 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 233 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
222 | if (vmf->pgoff >= size) | 234 | if (vmf->pgoff >= size) |
223 | return VM_FAULT_SIGBUS; | 235 | return VM_FAULT_SIGBUS; |
@@ -236,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
236 | int err; | 248 | int err; |
237 | 249 | ||
238 | /* maybe shared writable, allocate new block */ | 250 | /* maybe shared writable, allocate new block */ |
251 | mutex_lock(&xip_sparse_mutex); | ||
239 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, | 252 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, |
240 | &xip_mem, &xip_pfn); | 253 | &xip_mem, &xip_pfn); |
254 | mutex_unlock(&xip_sparse_mutex); | ||
241 | if (error) | 255 | if (error) |
242 | return VM_FAULT_SIGBUS; | 256 | return VM_FAULT_SIGBUS; |
243 | /* unmap sparse mappings at pgoff from all other vmas */ | 257 | /* unmap sparse mappings at pgoff from all other vmas */ |
@@ -251,14 +265,34 @@ found: | |||
251 | BUG_ON(err); | 265 | BUG_ON(err); |
252 | return VM_FAULT_NOPAGE; | 266 | return VM_FAULT_NOPAGE; |
253 | } else { | 267 | } else { |
268 | int err, ret = VM_FAULT_OOM; | ||
269 | |||
270 | mutex_lock(&xip_sparse_mutex); | ||
271 | write_seqcount_begin(&xip_sparse_seq); | ||
272 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, | ||
273 | &xip_mem, &xip_pfn); | ||
274 | if (unlikely(!error)) { | ||
275 | write_seqcount_end(&xip_sparse_seq); | ||
276 | mutex_unlock(&xip_sparse_mutex); | ||
277 | goto again; | ||
278 | } | ||
279 | if (error != -ENODATA) | ||
280 | goto out; | ||
254 | /* not shared and writable, use xip_sparse_page() */ | 281 | /* not shared and writable, use xip_sparse_page() */ |
255 | page = xip_sparse_page(); | 282 | page = xip_sparse_page(); |
256 | if (!page) | 283 | if (!page) |
257 | return VM_FAULT_OOM; | 284 | goto out; |
285 | err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, | ||
286 | page); | ||
287 | if (err == -ENOMEM) | ||
288 | goto out; | ||
258 | 289 | ||
259 | page_cache_get(page); | 290 | ret = VM_FAULT_NOPAGE; |
260 | vmf->page = page; | 291 | out: |
261 | return 0; | 292 | write_seqcount_end(&xip_sparse_seq); |
293 | mutex_unlock(&xip_sparse_mutex); | ||
294 | |||
295 | return ret; | ||
262 | } | 296 | } |
263 | } | 297 | } |
264 | 298 | ||
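[Editor's note] The filemap_xip changes replace the old spinlock with a seqcount/mutex pair: the fault and write paths serialize on xip_sparse_mutex, with the fault path additionally bumping xip_sparse_seq around the part that instantiates or replaces the sparse page, while __xip_unmap() first does a lockless pass under a seqcount read and retries under the mutex only if a writer intervened. A hedged userspace model of that read-retry protocol (pthread mutex plus a C11 atomic sequence counter; memory ordering simplified):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t sparse_mutex = PTHREAD_MUTEX_INITIALIZER;
static atomic_uint sparse_seq;		/* seqcount: odd while a writer is active */
static int shared_state;		/* stands in for the sparse-page mappings */

/* Writer side (models the fault path): serialize on the mutex and bump the
 * sequence around the update so lockless readers can detect it. */
static void writer_update(int v)
{
	pthread_mutex_lock(&sparse_mutex);
	atomic_fetch_add(&sparse_seq, 1);	/* sequence becomes odd */
	shared_state = v;
	atomic_fetch_add(&sparse_seq, 1);	/* sequence becomes even again */
	pthread_mutex_unlock(&sparse_mutex);
}

/* Reader side (models __xip_unmap): try a lockless pass first; if the
 * sequence changed underneath us, redo the pass while holding the mutex. */
static int reader_scan(void)
{
	unsigned int seq = atomic_load(&sparse_seq);
	int snapshot = shared_state;		/* the lockless pass */

	if ((seq & 1) || atomic_load(&sparse_seq) != seq) {
		pthread_mutex_lock(&sparse_mutex);	/* slow path: retry locked */
		snapshot = shared_state;
		pthread_mutex_unlock(&sparse_mutex);
	}
	return snapshot;
}

int main(void)
{
	writer_update(42);
	printf("reader saw %d\n", reader_scan());
	return 0;
}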
@@ -307,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
307 | &xip_mem, &xip_pfn); | 341 | &xip_mem, &xip_pfn); |
308 | if (status == -ENODATA) { | 342 | if (status == -ENODATA) { |
309 | /* we allocate a new page and unmap it */ | 343 | /* we allocate a new page and unmap it */ |
344 | mutex_lock(&xip_sparse_mutex); | ||
310 | status = a_ops->get_xip_mem(mapping, index, 1, | 345 | status = a_ops->get_xip_mem(mapping, index, 1, |
311 | &xip_mem, &xip_pfn); | 346 | &xip_mem, &xip_pfn); |
347 | mutex_unlock(&xip_sparse_mutex); | ||
312 | if (!status) | 348 | if (!status) |
313 | /* unmap page at pgoff from all other vmas */ | 349 | /* unmap page at pgoff from all other vmas */ |
314 | __xip_unmap(mapping, index); | 350 | __xip_unmap(mapping, index); |
@@ -380,7 +416,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
380 | if (count == 0) | 416 | if (count == 0) |
381 | goto out_backing; | 417 | goto out_backing; |
382 | 418 | ||
383 | ret = remove_suid(filp->f_path.dentry); | 419 | ret = file_remove_suid(filp); |
384 | if (ret) | 420 | if (ret) |
385 | goto out_backing; | 421 | goto out_backing; |
386 | 422 | ||
diff --git a/mm/fremap.c b/mm/fremap.c index 07a9c82ce1a3..7d12ca70ef7b 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -15,11 +15,14 @@ | |||
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/syscalls.h> | 17 | #include <linux/syscalls.h> |
18 | #include <linux/mmu_notifier.h> | ||
18 | 19 | ||
19 | #include <asm/mmu_context.h> | 20 | #include <asm/mmu_context.h> |
20 | #include <asm/cacheflush.h> | 21 | #include <asm/cacheflush.h> |
21 | #include <asm/tlbflush.h> | 22 | #include <asm/tlbflush.h> |
22 | 23 | ||
24 | #include "internal.h" | ||
25 | |||
23 | static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | 26 | static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, |
24 | unsigned long addr, pte_t *ptep) | 27 | unsigned long addr, pte_t *ptep) |
25 | { | 28 | { |
@@ -214,13 +217,31 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, | |||
214 | spin_unlock(&mapping->i_mmap_lock); | 217 | spin_unlock(&mapping->i_mmap_lock); |
215 | } | 218 | } |
216 | 219 | ||
220 | if (vma->vm_flags & VM_LOCKED) { | ||
221 | /* | ||
222 | * drop PG_Mlocked flag for over-mapped range | ||
223 | */ | ||
224 | unsigned int saved_flags = vma->vm_flags; | ||
225 | munlock_vma_pages_range(vma, start, start + size); | ||
226 | vma->vm_flags = saved_flags; | ||
227 | } | ||
228 | |||
229 | mmu_notifier_invalidate_range_start(mm, start, start + size); | ||
217 | err = populate_range(mm, vma, start, size, pgoff); | 230 | err = populate_range(mm, vma, start, size, pgoff); |
231 | mmu_notifier_invalidate_range_end(mm, start, start + size); | ||
218 | if (!err && !(flags & MAP_NONBLOCK)) { | 232 | if (!err && !(flags & MAP_NONBLOCK)) { |
219 | if (unlikely(has_write_lock)) { | 233 | if (vma->vm_flags & VM_LOCKED) { |
220 | downgrade_write(&mm->mmap_sem); | 234 | /* |
221 | has_write_lock = 0; | 235 | * might be mapping previously unmapped range of file |
236 | */ | ||
237 | mlock_vma_pages_range(vma, start, start + size); | ||
238 | } else { | ||
239 | if (unlikely(has_write_lock)) { | ||
240 | downgrade_write(&mm->mmap_sem); | ||
241 | has_write_lock = 0; | ||
242 | } | ||
243 | make_pages_present(start, start+size); | ||
222 | } | 244 | } |
223 | make_pages_present(start, start+size); | ||
224 | } | 245 | } |
225 | 246 | ||
226 | /* | 247 | /* |
@@ -237,4 +258,3 @@ out: | |||
237 | 258 | ||
238 | return err; | 259 | return err; |
239 | } | 260 | } |
240 | |||
diff --git a/mm/highmem.c b/mm/highmem.c index 7da4a7b6af11..b36b83b920ff 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #ifdef CONFIG_HIGHMEM | 40 | #ifdef CONFIG_HIGHMEM |
41 | 41 | ||
42 | unsigned long totalhigh_pages __read_mostly; | 42 | unsigned long totalhigh_pages __read_mostly; |
43 | EXPORT_SYMBOL(totalhigh_pages); | ||
43 | 44 | ||
44 | unsigned int nr_free_highpages (void) | 45 | unsigned int nr_free_highpages (void) |
45 | { | 46 | { |
@@ -69,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); | |||
69 | static void flush_all_zero_pkmaps(void) | 70 | static void flush_all_zero_pkmaps(void) |
70 | { | 71 | { |
71 | int i; | 72 | int i; |
73 | int need_flush = 0; | ||
72 | 74 | ||
73 | flush_cache_kmaps(); | 75 | flush_cache_kmaps(); |
74 | 76 | ||
@@ -100,8 +102,10 @@ static void flush_all_zero_pkmaps(void) | |||
100 | &pkmap_page_table[i]); | 102 | &pkmap_page_table[i]); |
101 | 103 | ||
102 | set_page_address(page, NULL); | 104 | set_page_address(page, NULL); |
105 | need_flush = 1; | ||
103 | } | 106 | } |
104 | flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); | 107 | if (need_flush) |
108 | flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); | ||
105 | } | 109 | } |
106 | 110 | ||
107 | /** | 111 | /** |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab171274ef21..421aee99b84a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -7,45 +7,360 @@ | |||
7 | #include <linux/init.h> | 7 | #include <linux/init.h> |
8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/seq_file.h> | ||
10 | #include <linux/sysctl.h> | 11 | #include <linux/sysctl.h> |
11 | #include <linux/highmem.h> | 12 | #include <linux/highmem.h> |
13 | #include <linux/mmu_notifier.h> | ||
12 | #include <linux/nodemask.h> | 14 | #include <linux/nodemask.h> |
13 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
14 | #include <linux/mempolicy.h> | 16 | #include <linux/mempolicy.h> |
15 | #include <linux/cpuset.h> | 17 | #include <linux/cpuset.h> |
16 | #include <linux/mutex.h> | 18 | #include <linux/mutex.h> |
19 | #include <linux/bootmem.h> | ||
20 | #include <linux/sysfs.h> | ||
17 | 21 | ||
18 | #include <asm/page.h> | 22 | #include <asm/page.h> |
19 | #include <asm/pgtable.h> | 23 | #include <asm/pgtable.h> |
24 | #include <asm/io.h> | ||
20 | 25 | ||
21 | #include <linux/hugetlb.h> | 26 | #include <linux/hugetlb.h> |
22 | #include "internal.h" | 27 | #include "internal.h" |
23 | 28 | ||
24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 29 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; | ||
26 | static unsigned long surplus_huge_pages; | ||
27 | static unsigned long nr_overcommit_huge_pages; | ||
28 | unsigned long max_huge_pages; | ||
29 | unsigned long sysctl_overcommit_huge_pages; | ||
30 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | ||
31 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | ||
32 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; | ||
33 | static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; | ||
34 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 30 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
35 | unsigned long hugepages_treat_as_movable; | 31 | unsigned long hugepages_treat_as_movable; |
36 | static int hugetlb_next_nid; | 32 | |
33 | static int max_hstate; | ||
34 | unsigned int default_hstate_idx; | ||
35 | struct hstate hstates[HUGE_MAX_HSTATE]; | ||
36 | |||
37 | __initdata LIST_HEAD(huge_boot_pages); | ||
38 | |||
39 | /* for command line parsing */ | ||
40 | static struct hstate * __initdata parsed_hstate; | ||
41 | static unsigned long __initdata default_hstate_max_huge_pages; | ||
42 | static unsigned long __initdata default_hstate_size; | ||
43 | |||
44 | #define for_each_hstate(h) \ | ||
45 | for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) | ||
37 | 46 | ||
38 | /* | 47 | /* |
39 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 48 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
40 | */ | 49 | */ |
41 | static DEFINE_SPINLOCK(hugetlb_lock); | 50 | static DEFINE_SPINLOCK(hugetlb_lock); |
42 | 51 | ||
43 | static void clear_huge_page(struct page *page, unsigned long addr) | 52 | /* |
53 | * Region tracking -- allows tracking of reservations and instantiated pages | ||
54 | * across the pages in a mapping. | ||
55 | * | ||
56 | * The region data structures are protected by a combination of the mmap_sem | ||
57 | * and the hugetlb_instantiation_mutex. To access or modify a region the caller | ||
58 | * must either hold the mmap_sem for write, or the mmap_sem for read and | ||
59 | * the hugetlb_instantiation mutex: | ||
60 | * | ||
61 | * down_write(&mm->mmap_sem); | ||
62 | * or | ||
63 | * down_read(&mm->mmap_sem); | ||
64 | * mutex_lock(&hugetlb_instantiation_mutex); | ||
65 | */ | ||
66 | struct file_region { | ||
67 | struct list_head link; | ||
68 | long from; | ||
69 | long to; | ||
70 | }; | ||
71 | |||
72 | static long region_add(struct list_head *head, long f, long t) | ||
73 | { | ||
74 | struct file_region *rg, *nrg, *trg; | ||
75 | |||
76 | /* Locate the region we are either in or before. */ | ||
77 | list_for_each_entry(rg, head, link) | ||
78 | if (f <= rg->to) | ||
79 | break; | ||
80 | |||
81 | /* Round our left edge to the current segment if it encloses us. */ | ||
82 | if (f > rg->from) | ||
83 | f = rg->from; | ||
84 | |||
85 | /* Check for and consume any regions we now overlap with. */ | ||
86 | nrg = rg; | ||
87 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
88 | if (&rg->link == head) | ||
89 | break; | ||
90 | if (rg->from > t) | ||
91 | break; | ||
92 | |||
93 | /* If this area reaches higher, then extend our area to | ||
94 | * include it completely. If this is not the first area | ||
95 | * which we intend to reuse, free it. */ | ||
96 | if (rg->to > t) | ||
97 | t = rg->to; | ||
98 | if (rg != nrg) { | ||
99 | list_del(&rg->link); | ||
100 | kfree(rg); | ||
101 | } | ||
102 | } | ||
103 | nrg->from = f; | ||
104 | nrg->to = t; | ||
105 | return 0; | ||
106 | } | ||
107 | |||
108 | static long region_chg(struct list_head *head, long f, long t) | ||
109 | { | ||
110 | struct file_region *rg, *nrg; | ||
111 | long chg = 0; | ||
112 | |||
113 | /* Locate the region we are before or in. */ | ||
114 | list_for_each_entry(rg, head, link) | ||
115 | if (f <= rg->to) | ||
116 | break; | ||
117 | |||
118 | /* If we are below the current region then a new region is required. | ||
119 | * Subtle: allocate a new region at the position but make it zero | ||
120 | * size such that we can guarantee to record the reservation. */ | ||
121 | if (&rg->link == head || t < rg->from) { | ||
122 | nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); | ||
123 | if (!nrg) | ||
124 | return -ENOMEM; | ||
125 | nrg->from = f; | ||
126 | nrg->to = f; | ||
127 | INIT_LIST_HEAD(&nrg->link); | ||
128 | list_add(&nrg->link, rg->link.prev); | ||
129 | |||
130 | return t - f; | ||
131 | } | ||
132 | |||
133 | /* Round our left edge to the current segment if it encloses us. */ | ||
134 | if (f > rg->from) | ||
135 | f = rg->from; | ||
136 | chg = t - f; | ||
137 | |||
138 | /* Check for and consume any regions we now overlap with. */ | ||
139 | list_for_each_entry(rg, rg->link.prev, link) { | ||
140 | if (&rg->link == head) | ||
141 | break; | ||
142 | if (rg->from > t) | ||
143 | return chg; | ||
144 | |||
145 | /* We overlap with this area; if it extends further than | ||
146 | * us then we must extend ourselves. Account for its | ||
147 | * existing reservation. */ | ||
148 | if (rg->to > t) { | ||
149 | chg += rg->to - t; | ||
150 | t = rg->to; | ||
151 | } | ||
152 | chg -= rg->to - rg->from; | ||
153 | } | ||
154 | return chg; | ||
155 | } | ||
156 | |||
157 | static long region_truncate(struct list_head *head, long end) | ||
158 | { | ||
159 | struct file_region *rg, *trg; | ||
160 | long chg = 0; | ||
161 | |||
162 | /* Locate the region we are either in or before. */ | ||
163 | list_for_each_entry(rg, head, link) | ||
164 | if (end <= rg->to) | ||
165 | break; | ||
166 | if (&rg->link == head) | ||
167 | return 0; | ||
168 | |||
169 | /* If we are in the middle of a region then adjust it. */ | ||
170 | if (end > rg->from) { | ||
171 | chg = rg->to - end; | ||
172 | rg->to = end; | ||
173 | rg = list_entry(rg->link.next, typeof(*rg), link); | ||
174 | } | ||
175 | |||
176 | /* Drop any remaining regions. */ | ||
177 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
178 | if (&rg->link == head) | ||
179 | break; | ||
180 | chg += rg->to - rg->from; | ||
181 | list_del(&rg->link); | ||
182 | kfree(rg); | ||
183 | } | ||
184 | return chg; | ||
185 | } | ||
186 | |||
187 | static long region_count(struct list_head *head, long f, long t) | ||
188 | { | ||
189 | struct file_region *rg; | ||
190 | long chg = 0; | ||
191 | |||
192 | /* Locate each segment we overlap with, and count that overlap. */ | ||
193 | list_for_each_entry(rg, head, link) { | ||
194 | int seg_from; | ||
195 | int seg_to; | ||
196 | |||
197 | if (rg->to <= f) | ||
198 | continue; | ||
199 | if (rg->from >= t) | ||
200 | break; | ||
201 | |||
202 | seg_from = max(rg->from, f); | ||
203 | seg_to = min(rg->to, t); | ||
204 | |||
205 | chg += seg_to - seg_from; | ||
206 | } | ||
207 | |||
208 | return chg; | ||
209 | } | ||
210 | |||
211 | /* | ||
212 | * Convert the address within this vma to the page offset within | ||
213 | * the mapping, in pagecache page units; huge pages here. | ||
214 | */ | ||
215 | static pgoff_t vma_hugecache_offset(struct hstate *h, | ||
216 | struct vm_area_struct *vma, unsigned long address) | ||
217 | { | ||
218 | return ((address - vma->vm_start) >> huge_page_shift(h)) + | ||
219 | (vma->vm_pgoff >> huge_page_order(h)); | ||
220 | } | ||
221 | |||
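[Editor's note] vma_hugecache_offset() converts a faulting address into a pagecache index measured in huge pages: the offset from vm_start is shifted down by huge_page_shift(h), and vm_pgoff, which stays in base-page units, is shifted down by huge_page_order(h). A small worked example, assuming 4 KiB base pages and 2 MiB huge pages:

#include <stdio.h>

int main(void)
{
	/* assumed geometry: 4 KiB base pages, 2 MiB huge pages */
	unsigned int page_shift = 12;					/* 4 KiB */
	unsigned int huge_page_shift = 21;				/* 2 MiB */
	unsigned int huge_page_order = huge_page_shift - page_shift;	/* 9 */

	unsigned long vm_start = 0x40000000UL;			/* mapping starts here */
	unsigned long vm_pgoff = 1UL << huge_page_order;	/* file offset 2 MiB, in 4 KiB units */
	unsigned long address = vm_start + (5UL << huge_page_shift);	/* 6th huge page of the VMA */

	/* vma_hugecache_offset(): index into the file, in huge-page units */
	unsigned long idx = ((address - vm_start) >> huge_page_shift)
			    + (vm_pgoff >> huge_page_order);

	printf("huge-page pagecache index = %lu\n", idx);	/* prints 6 */
	return 0;
}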
222 | /* | ||
223 | * Flags for MAP_PRIVATE reservations. These are stored in the bottom | ||
224 | * bits of the reservation map pointer, which are always clear due to | ||
225 | * alignment. | ||
226 | */ | ||
227 | #define HPAGE_RESV_OWNER (1UL << 0) | ||
228 | #define HPAGE_RESV_UNMAPPED (1UL << 1) | ||
229 | #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) | ||
230 | |||
231 | /* | ||
232 | * These helpers are used to track how many pages are reserved for | ||
233 | * faults in a MAP_PRIVATE mapping. Only the process that called mmap() | ||
234 | * is guaranteed to have their future faults succeed. | ||
235 | * | ||
236 | * With the exception of reset_vma_resv_huge_pages() which is called at fork(), | ||
237 | * the reserve counters are updated with the hugetlb_lock held. It is safe | ||
238 | * to reset the VMA at fork() time as it is not in use yet and there is no | ||
239 | * chance of the global counters getting corrupted as a result of the values. | ||
240 | * | ||
241 | * The private mapping reservation is represented in a subtly different | ||
242 | * manner to a shared mapping. A shared mapping has a region map associated | ||
243 | * with the underlying file; this region map represents the backing file | ||
244 | * pages which have ever had a reservation assigned, and this persists even | ||
245 | * after the page is instantiated. A private mapping has a region map | ||
246 | * associated with the original mmap which is attached to all VMAs which | ||
247 | * reference it; this region map represents those offsets which have consumed | ||
248 | * reservation, i.e. where pages have been instantiated. | ||
249 | */ | ||
250 | static unsigned long get_vma_private_data(struct vm_area_struct *vma) | ||
251 | { | ||
252 | return (unsigned long)vma->vm_private_data; | ||
253 | } | ||
254 | |||
255 | static void set_vma_private_data(struct vm_area_struct *vma, | ||
256 | unsigned long value) | ||
257 | { | ||
258 | vma->vm_private_data = (void *)value; | ||
259 | } | ||
260 | |||
261 | struct resv_map { | ||
262 | struct kref refs; | ||
263 | struct list_head regions; | ||
264 | }; | ||
265 | |||
266 | static struct resv_map *resv_map_alloc(void) | ||
267 | { | ||
268 | struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); | ||
269 | if (!resv_map) | ||
270 | return NULL; | ||
271 | |||
272 | kref_init(&resv_map->refs); | ||
273 | INIT_LIST_HEAD(&resv_map->regions); | ||
274 | |||
275 | return resv_map; | ||
276 | } | ||
277 | |||
278 | static void resv_map_release(struct kref *ref) | ||
279 | { | ||
280 | struct resv_map *resv_map = container_of(ref, struct resv_map, refs); | ||
281 | |||
282 | /* Clear out any active regions before we release the map. */ | ||
283 | region_truncate(&resv_map->regions, 0); | ||
284 | kfree(resv_map); | ||
285 | } | ||
286 | |||
287 | static struct resv_map *vma_resv_map(struct vm_area_struct *vma) | ||
288 | { | ||
289 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | ||
290 | if (!(vma->vm_flags & VM_SHARED)) | ||
291 | return (struct resv_map *)(get_vma_private_data(vma) & | ||
292 | ~HPAGE_RESV_MASK); | ||
293 | return NULL; | ||
294 | } | ||
295 | |||
296 | static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) | ||
297 | { | ||
298 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | ||
299 | VM_BUG_ON(vma->vm_flags & VM_SHARED); | ||
300 | |||
301 | set_vma_private_data(vma, (get_vma_private_data(vma) & | ||
302 | HPAGE_RESV_MASK) | (unsigned long)map); | ||
303 | } | ||
304 | |||
305 | static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) | ||
306 | { | ||
307 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | ||
308 | VM_BUG_ON(vma->vm_flags & VM_SHARED); | ||
309 | |||
310 | set_vma_private_data(vma, get_vma_private_data(vma) | flags); | ||
311 | } | ||
312 | |||
313 | static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) | ||
314 | { | ||
315 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | ||
316 | |||
317 | return (get_vma_private_data(vma) & flag) != 0; | ||
318 | } | ||
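Because the resv_map is kmalloc()ed, its address is at least word aligned, so the two low bits of vm_private_data are free to carry the HPAGE_RESV_* flags alongside the map pointer. A stand-alone sketch of that tagging scheme; the struct and helper names below are invented for illustration:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define RESV_OWNER	(1UL << 0)
#define RESV_UNMAPPED	(1UL << 1)
#define RESV_MASK	(RESV_OWNER | RESV_UNMAPPED)

struct fake_vma { void *private_data; };
struct fake_resv_map { int dummy; };

static void set_resv_map(struct fake_vma *vma, struct fake_resv_map *map)
{
	unsigned long v = (unsigned long)vma->private_data;

	/* Heap pointers are aligned, so the low two bits are always clear. */
	assert(((unsigned long)map & RESV_MASK) == 0);
	vma->private_data = (void *)((v & RESV_MASK) | (unsigned long)map);
}

static void set_resv_flags(struct fake_vma *vma, unsigned long flags)
{
	vma->private_data = (void *)((unsigned long)vma->private_data | flags);
}

static struct fake_resv_map *get_resv_map(struct fake_vma *vma)
{
	return (struct fake_resv_map *)
		((unsigned long)vma->private_data & ~RESV_MASK);
}

int main(void)
{
	struct fake_vma vma = { 0 };
	struct fake_resv_map *map = malloc(sizeof(*map));

	set_resv_map(&vma, map);
	set_resv_flags(&vma, RESV_OWNER);
	/* Both pieces of state coexist in the one pointer-sized field. */
	printf("map ok: %d, owner flag: %lu\n", get_resv_map(&vma) == map,
	       (unsigned long)vma.private_data & RESV_OWNER);
	free(map);
	return 0;
}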
319 | |||
320 | /* Decrement the reserved pages in the hugepage pool by one */ | ||
321 | static void decrement_hugepage_resv_vma(struct hstate *h, | ||
322 | struct vm_area_struct *vma) | ||
323 | { | ||
324 | if (vma->vm_flags & VM_NORESERVE) | ||
325 | return; | ||
326 | |||
327 | if (vma->vm_flags & VM_SHARED) { | ||
328 | /* Shared mappings always use reserves */ | ||
329 | h->resv_huge_pages--; | ||
330 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | ||
331 | /* | ||
332 | * Only the process that called mmap() has reserves for | ||
333 | * private mappings. | ||
334 | */ | ||
335 | h->resv_huge_pages--; | ||
336 | } | ||
337 | } | ||
338 | |||
339 | /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ | ||
340 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma) | ||
341 | { | ||
342 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | ||
343 | if (!(vma->vm_flags & VM_SHARED)) | ||
344 | vma->vm_private_data = (void *)0; | ||
345 | } | ||
346 | |||
347 | /* Returns true if the VMA has associated reserve pages */ | ||
348 | static int vma_has_reserves(struct vm_area_struct *vma) | ||
349 | { | ||
350 | if (vma->vm_flags & VM_SHARED) | ||
351 | return 1; | ||
352 | if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) | ||
353 | return 1; | ||
354 | return 0; | ||
355 | } | ||
356 | |||
357 | static void clear_huge_page(struct page *page, | ||
358 | unsigned long addr, unsigned long sz) | ||
44 | { | 359 | { |
45 | int i; | 360 | int i; |
46 | 361 | ||
47 | might_sleep(); | 362 | might_sleep(); |
48 | for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { | 363 | for (i = 0; i < sz/PAGE_SIZE; i++) { |
49 | cond_resched(); | 364 | cond_resched(); |
50 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | 365 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); |
51 | } | 366 | } |
@@ -55,42 +370,44 @@ static void copy_huge_page(struct page *dst, struct page *src, | |||
55 | unsigned long addr, struct vm_area_struct *vma) | 370 | unsigned long addr, struct vm_area_struct *vma) |
56 | { | 371 | { |
57 | int i; | 372 | int i; |
373 | struct hstate *h = hstate_vma(vma); | ||
58 | 374 | ||
59 | might_sleep(); | 375 | might_sleep(); |
60 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { | 376 | for (i = 0; i < pages_per_huge_page(h); i++) { |
61 | cond_resched(); | 377 | cond_resched(); |
62 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); | 378 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); |
63 | } | 379 | } |
64 | } | 380 | } |
65 | 381 | ||
66 | static void enqueue_huge_page(struct page *page) | 382 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
67 | { | 383 | { |
68 | int nid = page_to_nid(page); | 384 | int nid = page_to_nid(page); |
69 | list_add(&page->lru, &hugepage_freelists[nid]); | 385 | list_add(&page->lru, &h->hugepage_freelists[nid]); |
70 | free_huge_pages++; | 386 | h->free_huge_pages++; |
71 | free_huge_pages_node[nid]++; | 387 | h->free_huge_pages_node[nid]++; |
72 | } | 388 | } |
73 | 389 | ||
74 | static struct page *dequeue_huge_page(void) | 390 | static struct page *dequeue_huge_page(struct hstate *h) |
75 | { | 391 | { |
76 | int nid; | 392 | int nid; |
77 | struct page *page = NULL; | 393 | struct page *page = NULL; |
78 | 394 | ||
79 | for (nid = 0; nid < MAX_NUMNODES; ++nid) { | 395 | for (nid = 0; nid < MAX_NUMNODES; ++nid) { |
80 | if (!list_empty(&hugepage_freelists[nid])) { | 396 | if (!list_empty(&h->hugepage_freelists[nid])) { |
81 | page = list_entry(hugepage_freelists[nid].next, | 397 | page = list_entry(h->hugepage_freelists[nid].next, |
82 | struct page, lru); | 398 | struct page, lru); |
83 | list_del(&page->lru); | 399 | list_del(&page->lru); |
84 | free_huge_pages--; | 400 | h->free_huge_pages--; |
85 | free_huge_pages_node[nid]--; | 401 | h->free_huge_pages_node[nid]--; |
86 | break; | 402 | break; |
87 | } | 403 | } |
88 | } | 404 | } |
89 | return page; | 405 | return page; |
90 | } | 406 | } |
91 | 407 | ||
92 | static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | 408 | static struct page *dequeue_huge_page_vma(struct hstate *h, |
93 | unsigned long address) | 409 | struct vm_area_struct *vma, |
410 | unsigned long address, int avoid_reserve) | ||
94 | { | 411 | { |
95 | int nid; | 412 | int nid; |
96 | struct page *page = NULL; | 413 | struct page *page = NULL; |
@@ -101,18 +418,33 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | |||
101 | struct zone *zone; | 418 | struct zone *zone; |
102 | struct zoneref *z; | 419 | struct zoneref *z; |
103 | 420 | ||
421 | /* | ||
422 | * A child process with MAP_PRIVATE mappings created by its parent | ||
423 | * has no page reserves. This check ensures that reservations are | ||
424 | * not "stolen". The child may still get SIGKILLed. | ||
425 | */ | ||
426 | if (!vma_has_reserves(vma) && | ||
427 | h->free_huge_pages - h->resv_huge_pages == 0) | ||
428 | return NULL; | ||
429 | |||
430 | /* If reserves cannot be used, ensure enough pages are in the pool */ | ||
431 | if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) | ||
432 | return NULL; | ||
433 | |||
104 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 434 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
105 | MAX_NR_ZONES - 1, nodemask) { | 435 | MAX_NR_ZONES - 1, nodemask) { |
106 | nid = zone_to_nid(zone); | 436 | nid = zone_to_nid(zone); |
107 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && | 437 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && |
108 | !list_empty(&hugepage_freelists[nid])) { | 438 | !list_empty(&h->hugepage_freelists[nid])) { |
109 | page = list_entry(hugepage_freelists[nid].next, | 439 | page = list_entry(h->hugepage_freelists[nid].next, |
110 | struct page, lru); | 440 | struct page, lru); |
111 | list_del(&page->lru); | 441 | list_del(&page->lru); |
112 | free_huge_pages--; | 442 | h->free_huge_pages--; |
113 | free_huge_pages_node[nid]--; | 443 | h->free_huge_pages_node[nid]--; |
114 | if (vma && vma->vm_flags & VM_MAYSHARE) | 444 | |
115 | resv_huge_pages--; | 445 | if (!avoid_reserve) |
446 | decrement_hugepage_resv_vma(h, vma); | ||
447 | |||
116 | break; | 448 | break; |
117 | } | 449 | } |
118 | } | 450 | } |
@@ -120,12 +452,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | |||
120 | return page; | 452 | return page; |
121 | } | 453 | } |
122 | 454 | ||
123 | static void update_and_free_page(struct page *page) | 455 | static void update_and_free_page(struct hstate *h, struct page *page) |
124 | { | 456 | { |
125 | int i; | 457 | int i; |
126 | nr_huge_pages--; | 458 | |
127 | nr_huge_pages_node[page_to_nid(page)]--; | 459 | h->nr_huge_pages--; |
128 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | 460 | h->nr_huge_pages_node[page_to_nid(page)]--; |
461 | for (i = 0; i < pages_per_huge_page(h); i++) { | ||
129 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | 462 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | |
130 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | 463 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | |
131 | 1 << PG_private | 1<< PG_writeback); | 464 | 1 << PG_private | 1<< PG_writeback); |
@@ -133,11 +466,27 @@ static void update_and_free_page(struct page *page) | |||
133 | set_compound_page_dtor(page, NULL); | 466 | set_compound_page_dtor(page, NULL); |
134 | set_page_refcounted(page); | 467 | set_page_refcounted(page); |
135 | arch_release_hugepage(page); | 468 | arch_release_hugepage(page); |
136 | __free_pages(page, HUGETLB_PAGE_ORDER); | 469 | __free_pages(page, huge_page_order(h)); |
470 | } | ||
471 | |||
472 | struct hstate *size_to_hstate(unsigned long size) | ||
473 | { | ||
474 | struct hstate *h; | ||
475 | |||
476 | for_each_hstate(h) { | ||
477 | if (huge_page_size(h) == size) | ||
478 | return h; | ||
479 | } | ||
480 | return NULL; | ||
137 | } | 481 | } |
138 | 482 | ||
139 | static void free_huge_page(struct page *page) | 483 | static void free_huge_page(struct page *page) |
140 | { | 484 | { |
485 | /* | ||
486 | * Can't pass hstate in here because it is called from the | ||
487 | * compound page destructor. | ||
488 | */ | ||
489 | struct hstate *h = page_hstate(page); | ||
141 | int nid = page_to_nid(page); | 490 | int nid = page_to_nid(page); |
142 | struct address_space *mapping; | 491 | struct address_space *mapping; |
143 | 492 | ||
@@ -147,12 +496,12 @@ static void free_huge_page(struct page *page) | |||
147 | INIT_LIST_HEAD(&page->lru); | 496 | INIT_LIST_HEAD(&page->lru); |
148 | 497 | ||
149 | spin_lock(&hugetlb_lock); | 498 | spin_lock(&hugetlb_lock); |
150 | if (surplus_huge_pages_node[nid]) { | 499 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { |
151 | update_and_free_page(page); | 500 | update_and_free_page(h, page); |
152 | surplus_huge_pages--; | 501 | h->surplus_huge_pages--; |
153 | surplus_huge_pages_node[nid]--; | 502 | h->surplus_huge_pages_node[nid]--; |
154 | } else { | 503 | } else { |
155 | enqueue_huge_page(page); | 504 | enqueue_huge_page(h, page); |
156 | } | 505 | } |
157 | spin_unlock(&hugetlb_lock); | 506 | spin_unlock(&hugetlb_lock); |
158 | if (mapping) | 507 | if (mapping) |
@@ -164,7 +513,7 @@ static void free_huge_page(struct page *page) | |||
164 | * balanced by operating on them in a round-robin fashion. | 513 | * balanced by operating on them in a round-robin fashion. |
165 | * Returns 1 if an adjustment was made. | 514 | * Returns 1 if an adjustment was made. |
166 | */ | 515 | */ |
167 | static int adjust_pool_surplus(int delta) | 516 | static int adjust_pool_surplus(struct hstate *h, int delta) |
168 | { | 517 | { |
169 | static int prev_nid; | 518 | static int prev_nid; |
170 | int nid = prev_nid; | 519 | int nid = prev_nid; |
@@ -177,15 +526,15 @@ static int adjust_pool_surplus(int delta) | |||
177 | nid = first_node(node_online_map); | 526 | nid = first_node(node_online_map); |
178 | 527 | ||
179 | /* To shrink on this node, there must be a surplus page */ | 528 | /* To shrink on this node, there must be a surplus page */ |
180 | if (delta < 0 && !surplus_huge_pages_node[nid]) | 529 | if (delta < 0 && !h->surplus_huge_pages_node[nid]) |
181 | continue; | 530 | continue; |
182 | /* Surplus cannot exceed the total number of pages */ | 531 | /* Surplus cannot exceed the total number of pages */ |
183 | if (delta > 0 && surplus_huge_pages_node[nid] >= | 532 | if (delta > 0 && h->surplus_huge_pages_node[nid] >= |
184 | nr_huge_pages_node[nid]) | 533 | h->nr_huge_pages_node[nid]) |
185 | continue; | 534 | continue; |
186 | 535 | ||
187 | surplus_huge_pages += delta; | 536 | h->surplus_huge_pages += delta; |
188 | surplus_huge_pages_node[nid] += delta; | 537 | h->surplus_huge_pages_node[nid] += delta; |
189 | ret = 1; | 538 | ret = 1; |
190 | break; | 539 | break; |
191 | } while (nid != prev_nid); | 540 | } while (nid != prev_nid); |
@@ -194,59 +543,74 @@ static int adjust_pool_surplus(int delta) | |||
194 | return ret; | 543 | return ret; |
195 | } | 544 | } |
196 | 545 | ||
197 | static struct page *alloc_fresh_huge_page_node(int nid) | 546 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
547 | { | ||
548 | set_compound_page_dtor(page, free_huge_page); | ||
549 | spin_lock(&hugetlb_lock); | ||
550 | h->nr_huge_pages++; | ||
551 | h->nr_huge_pages_node[nid]++; | ||
552 | spin_unlock(&hugetlb_lock); | ||
553 | put_page(page); /* free it into the hugepage allocator */ | ||
554 | } | ||
555 | |||
556 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | ||
198 | { | 557 | { |
199 | struct page *page; | 558 | struct page *page; |
200 | 559 | ||
560 | if (h->order >= MAX_ORDER) | ||
561 | return NULL; | ||
562 | |||
201 | page = alloc_pages_node(nid, | 563 | page = alloc_pages_node(nid, |
202 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | 564 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| |
203 | __GFP_REPEAT|__GFP_NOWARN, | 565 | __GFP_REPEAT|__GFP_NOWARN, |
204 | HUGETLB_PAGE_ORDER); | 566 | huge_page_order(h)); |
205 | if (page) { | 567 | if (page) { |
206 | if (arch_prepare_hugepage(page)) { | 568 | if (arch_prepare_hugepage(page)) { |
207 | __free_pages(page, HUGETLB_PAGE_ORDER); | 569 | __free_pages(page, huge_page_order(h)); |
208 | return NULL; | 570 | return NULL; |
209 | } | 571 | } |
210 | set_compound_page_dtor(page, free_huge_page); | 572 | prep_new_huge_page(h, page, nid); |
211 | spin_lock(&hugetlb_lock); | ||
212 | nr_huge_pages++; | ||
213 | nr_huge_pages_node[nid]++; | ||
214 | spin_unlock(&hugetlb_lock); | ||
215 | put_page(page); /* free it into the hugepage allocator */ | ||
216 | } | 573 | } |
217 | 574 | ||
218 | return page; | 575 | return page; |
219 | } | 576 | } |
220 | 577 | ||
221 | static int alloc_fresh_huge_page(void) | 578 | /* |
579 | * Use a helper variable to find the next node and then | ||
580 | * copy it back to hugetlb_next_nid afterwards: | ||
581 | * otherwise there's a window in which a racer might | ||
582 | * pass invalid nid MAX_NUMNODES to alloc_pages_node. | ||
583 | * But we don't need to use a spin_lock here: it really | ||
584 | * doesn't matter if occasionally a racer chooses the | ||
585 | * same nid as we do. Move nid forward in the mask even | ||
586 | * if we just successfully allocated a hugepage so that | ||
587 | * the next caller gets hugepages on the next node. | ||
588 | */ | ||
589 | static int hstate_next_node(struct hstate *h) | ||
590 | { | ||
591 | int next_nid; | ||
592 | next_nid = next_node(h->hugetlb_next_nid, node_online_map); | ||
593 | if (next_nid == MAX_NUMNODES) | ||
594 | next_nid = first_node(node_online_map); | ||
595 | h->hugetlb_next_nid = next_nid; | ||
596 | return next_nid; | ||
597 | } | ||
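A small user-space approximation of that wrap-around walk, using a plain array in place of node_online_map (the node layout below is made up, and at least one node is assumed online):

#include <stdio.h>

#define MAX_NODES 4

/* 1 = node online, 0 = offline; purely an example layout. */
static const int node_online[MAX_NODES] = { 1, 0, 1, 1 };

/* Advance to the next online node, wrapping past the end of the map,
 * in the spirit of hstate_next_node(). */
static int next_online_node(int nid)
{
	do {
		nid = (nid + 1) % MAX_NODES;
	} while (!node_online[nid]);
	return nid;
}

int main(void)
{
	int nid = 0;

	for (int i = 0; i < 5; i++) {
		nid = next_online_node(nid);
		printf("allocate on node %d\n", nid);	/* 2 3 0 2 3 */
	}
	return 0;
}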
598 | |||
599 | static int alloc_fresh_huge_page(struct hstate *h) | ||
222 | { | 600 | { |
223 | struct page *page; | 601 | struct page *page; |
224 | int start_nid; | 602 | int start_nid; |
225 | int next_nid; | 603 | int next_nid; |
226 | int ret = 0; | 604 | int ret = 0; |
227 | 605 | ||
228 | start_nid = hugetlb_next_nid; | 606 | start_nid = h->hugetlb_next_nid; |
229 | 607 | ||
230 | do { | 608 | do { |
231 | page = alloc_fresh_huge_page_node(hugetlb_next_nid); | 609 | page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); |
232 | if (page) | 610 | if (page) |
233 | ret = 1; | 611 | ret = 1; |
234 | /* | 612 | next_nid = hstate_next_node(h); |
235 | * Use a helper variable to find the next node and then | 613 | } while (!page && h->hugetlb_next_nid != start_nid); |
236 | * copy it back to hugetlb_next_nid afterwards: | ||
237 | * otherwise there's a window in which a racer might | ||
238 | * pass invalid nid MAX_NUMNODES to alloc_pages_node. | ||
239 | * But we don't need to use a spin_lock here: it really | ||
240 | * doesn't matter if occasionally a racer chooses the | ||
241 | * same nid as we do. Move nid forward in the mask even | ||
242 | * if we just successfully allocated a hugepage so that | ||
243 | * the next caller gets hugepages on the next node. | ||
244 | */ | ||
245 | next_nid = next_node(hugetlb_next_nid, node_online_map); | ||
246 | if (next_nid == MAX_NUMNODES) | ||
247 | next_nid = first_node(node_online_map); | ||
248 | hugetlb_next_nid = next_nid; | ||
249 | } while (!page && hugetlb_next_nid != start_nid); | ||
250 | 614 | ||
251 | if (ret) | 615 | if (ret) |
252 | count_vm_event(HTLB_BUDDY_PGALLOC); | 616 | count_vm_event(HTLB_BUDDY_PGALLOC); |
@@ -256,12 +620,15 @@ static int alloc_fresh_huge_page(void) | |||
256 | return ret; | 620 | return ret; |
257 | } | 621 | } |
258 | 622 | ||
259 | static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | 623 | static struct page *alloc_buddy_huge_page(struct hstate *h, |
260 | unsigned long address) | 624 | struct vm_area_struct *vma, unsigned long address) |
261 | { | 625 | { |
262 | struct page *page; | 626 | struct page *page; |
263 | unsigned int nid; | 627 | unsigned int nid; |
264 | 628 | ||
629 | if (h->order >= MAX_ORDER) | ||
630 | return NULL; | ||
631 | |||
265 | /* | 632 | /* |
266 | * Assume we will successfully allocate the surplus page to | 633 | * Assume we will successfully allocate the surplus page to |
267 | * prevent racing processes from causing the surplus to exceed | 634 | * prevent racing processes from causing the surplus to exceed |
@@ -286,18 +653,23 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | |||
286 | * per-node value is checked there. | 653 | * per-node value is checked there. |
287 | */ | 654 | */ |
288 | spin_lock(&hugetlb_lock); | 655 | spin_lock(&hugetlb_lock); |
289 | if (surplus_huge_pages >= nr_overcommit_huge_pages) { | 656 | if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { |
290 | spin_unlock(&hugetlb_lock); | 657 | spin_unlock(&hugetlb_lock); |
291 | return NULL; | 658 | return NULL; |
292 | } else { | 659 | } else { |
293 | nr_huge_pages++; | 660 | h->nr_huge_pages++; |
294 | surplus_huge_pages++; | 661 | h->surplus_huge_pages++; |
295 | } | 662 | } |
296 | spin_unlock(&hugetlb_lock); | 663 | spin_unlock(&hugetlb_lock); |
297 | 664 | ||
298 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| | 665 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| |
299 | __GFP_REPEAT|__GFP_NOWARN, | 666 | __GFP_REPEAT|__GFP_NOWARN, |
300 | HUGETLB_PAGE_ORDER); | 667 | huge_page_order(h)); |
668 | |||
669 | if (page && arch_prepare_hugepage(page)) { | ||
670 | __free_pages(page, huge_page_order(h)); | ||
671 | return NULL; | ||
672 | } | ||
301 | 673 | ||
302 | spin_lock(&hugetlb_lock); | 674 | spin_lock(&hugetlb_lock); |
303 | if (page) { | 675 | if (page) { |
@@ -312,12 +684,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | |||
312 | /* | 684 | /* |
313 | * We incremented the global counters already | 685 | * We incremented the global counters already |
314 | */ | 686 | */ |
315 | nr_huge_pages_node[nid]++; | 687 | h->nr_huge_pages_node[nid]++; |
316 | surplus_huge_pages_node[nid]++; | 688 | h->surplus_huge_pages_node[nid]++; |
317 | __count_vm_event(HTLB_BUDDY_PGALLOC); | 689 | __count_vm_event(HTLB_BUDDY_PGALLOC); |
318 | } else { | 690 | } else { |
319 | nr_huge_pages--; | 691 | h->nr_huge_pages--; |
320 | surplus_huge_pages--; | 692 | h->surplus_huge_pages--; |
321 | __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); | 693 | __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); |
322 | } | 694 | } |
323 | spin_unlock(&hugetlb_lock); | 695 | spin_unlock(&hugetlb_lock); |
@@ -329,16 +701,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | |||
329 | * Increase the hugetlb pool such that it can accommodate a reservation | 701 | * Increase the hugetlb pool such that it can accommodate a reservation
330 | * of size 'delta'. | 702 | * of size 'delta'. |
331 | */ | 703 | */ |
332 | static int gather_surplus_pages(int delta) | 704 | static int gather_surplus_pages(struct hstate *h, int delta) |
333 | { | 705 | { |
334 | struct list_head surplus_list; | 706 | struct list_head surplus_list; |
335 | struct page *page, *tmp; | 707 | struct page *page, *tmp; |
336 | int ret, i; | 708 | int ret, i; |
337 | int needed, allocated; | 709 | int needed, allocated; |
338 | 710 | ||
339 | needed = (resv_huge_pages + delta) - free_huge_pages; | 711 | needed = (h->resv_huge_pages + delta) - h->free_huge_pages; |
340 | if (needed <= 0) { | 712 | if (needed <= 0) { |
341 | resv_huge_pages += delta; | 713 | h->resv_huge_pages += delta; |
342 | return 0; | 714 | return 0; |
343 | } | 715 | } |
344 | 716 | ||
@@ -349,7 +721,7 @@ static int gather_surplus_pages(int delta) | |||
349 | retry: | 721 | retry: |
350 | spin_unlock(&hugetlb_lock); | 722 | spin_unlock(&hugetlb_lock); |
351 | for (i = 0; i < needed; i++) { | 723 | for (i = 0; i < needed; i++) { |
352 | page = alloc_buddy_huge_page(NULL, 0); | 724 | page = alloc_buddy_huge_page(h, NULL, 0); |
353 | if (!page) { | 725 | if (!page) { |
354 | /* | 726 | /* |
355 | * We were not able to allocate enough pages to | 727 | * We were not able to allocate enough pages to |
@@ -370,7 +742,8 @@ retry: | |||
370 | * because either resv_huge_pages or free_huge_pages may have changed. | 742 | * because either resv_huge_pages or free_huge_pages may have changed. |
371 | */ | 743 | */ |
372 | spin_lock(&hugetlb_lock); | 744 | spin_lock(&hugetlb_lock); |
373 | needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); | 745 | needed = (h->resv_huge_pages + delta) - |
746 | (h->free_huge_pages + allocated); | ||
374 | if (needed > 0) | 747 | if (needed > 0) |
375 | goto retry; | 748 | goto retry; |
376 | 749 | ||
@@ -383,7 +756,7 @@ retry: | |||
383 | * before they are reserved. | 756 | * before they are reserved. |
384 | */ | 757 | */ |
385 | needed += allocated; | 758 | needed += allocated; |
386 | resv_huge_pages += delta; | 759 | h->resv_huge_pages += delta; |
387 | ret = 0; | 760 | ret = 0; |
388 | free: | 761 | free: |
389 | /* Free the needed pages to the hugetlb pool */ | 762 | /* Free the needed pages to the hugetlb pool */ |
@@ -391,7 +764,7 @@ free: | |||
391 | if ((--needed) < 0) | 764 | if ((--needed) < 0) |
392 | break; | 765 | break; |
393 | list_del(&page->lru); | 766 | list_del(&page->lru); |
394 | enqueue_huge_page(page); | 767 | enqueue_huge_page(h, page); |
395 | } | 768 | } |
396 | 769 | ||
397 | /* Free unnecessary surplus pages to the buddy allocator */ | 770 | /* Free unnecessary surplus pages to the buddy allocator */ |
@@ -419,7 +792,8 @@ free: | |||
419 | * allocated to satisfy the reservation must be explicitly freed if they were | 792 | * allocated to satisfy the reservation must be explicitly freed if they were |
420 | * never used. | 793 | * never used. |
421 | */ | 794 | */ |
422 | static void return_unused_surplus_pages(unsigned long unused_resv_pages) | 795 | static void return_unused_surplus_pages(struct hstate *h, |
796 | unsigned long unused_resv_pages) | ||
423 | { | 797 | { |
424 | static int nid = -1; | 798 | static int nid = -1; |
425 | struct page *page; | 799 | struct page *page; |
@@ -434,157 +808,269 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) | |||
434 | unsigned long remaining_iterations = num_online_nodes(); | 808 | unsigned long remaining_iterations = num_online_nodes(); |
435 | 809 | ||
436 | /* Uncommit the reservation */ | 810 | /* Uncommit the reservation */ |
437 | resv_huge_pages -= unused_resv_pages; | 811 | h->resv_huge_pages -= unused_resv_pages; |
438 | 812 | ||
439 | nr_pages = min(unused_resv_pages, surplus_huge_pages); | 813 | /* Cannot return gigantic pages currently */ |
814 | if (h->order >= MAX_ORDER) | ||
815 | return; | ||
816 | |||
817 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); | ||
440 | 818 | ||
441 | while (remaining_iterations-- && nr_pages) { | 819 | while (remaining_iterations-- && nr_pages) { |
442 | nid = next_node(nid, node_online_map); | 820 | nid = next_node(nid, node_online_map); |
443 | if (nid == MAX_NUMNODES) | 821 | if (nid == MAX_NUMNODES) |
444 | nid = first_node(node_online_map); | 822 | nid = first_node(node_online_map); |
445 | 823 | ||
446 | if (!surplus_huge_pages_node[nid]) | 824 | if (!h->surplus_huge_pages_node[nid]) |
447 | continue; | 825 | continue; |
448 | 826 | ||
449 | if (!list_empty(&hugepage_freelists[nid])) { | 827 | if (!list_empty(&h->hugepage_freelists[nid])) { |
450 | page = list_entry(hugepage_freelists[nid].next, | 828 | page = list_entry(h->hugepage_freelists[nid].next, |
451 | struct page, lru); | 829 | struct page, lru); |
452 | list_del(&page->lru); | 830 | list_del(&page->lru); |
453 | update_and_free_page(page); | 831 | update_and_free_page(h, page); |
454 | free_huge_pages--; | 832 | h->free_huge_pages--; |
455 | free_huge_pages_node[nid]--; | 833 | h->free_huge_pages_node[nid]--; |
456 | surplus_huge_pages--; | 834 | h->surplus_huge_pages--; |
457 | surplus_huge_pages_node[nid]--; | 835 | h->surplus_huge_pages_node[nid]--; |
458 | nr_pages--; | 836 | nr_pages--; |
459 | remaining_iterations = num_online_nodes(); | 837 | remaining_iterations = num_online_nodes(); |
460 | } | 838 | } |
461 | } | 839 | } |
462 | } | 840 | } |
463 | 841 | ||
842 | /* | ||
843 | * Determine if the huge page at addr within the vma has an associated | ||
844 | * reservation. Where it does not, we will need to logically increase the | ||
845 | * reservation and actually increase the quota before an allocation can occur. | ||
846 | * Where any new reservation would be required, the reservation change is | ||
847 | * prepared, but not committed. Once the page has been quota'd, allocated | ||
848 | * and instantiated, the change should be committed via vma_commit_reservation. | ||
849 | * No action is required on failure. | ||
850 | */ | ||
851 | static int vma_needs_reservation(struct hstate *h, | ||
852 | struct vm_area_struct *vma, unsigned long addr) | ||
853 | { | ||
854 | struct address_space *mapping = vma->vm_file->f_mapping; | ||
855 | struct inode *inode = mapping->host; | ||
856 | |||
857 | if (vma->vm_flags & VM_SHARED) { | ||
858 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); | ||
859 | return region_chg(&inode->i_mapping->private_list, | ||
860 | idx, idx + 1); | ||
464 | 861 | ||
465 | static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, | 862 | } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { |
466 | unsigned long addr) | 863 | return 1; |
864 | |||
865 | } else { | ||
866 | int err; | ||
867 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); | ||
868 | struct resv_map *reservations = vma_resv_map(vma); | ||
869 | |||
870 | err = region_chg(&reservations->regions, idx, idx + 1); | ||
871 | if (err < 0) | ||
872 | return err; | ||
873 | return 0; | ||
874 | } | ||
875 | } | ||
876 | static void vma_commit_reservation(struct hstate *h, | ||
877 | struct vm_area_struct *vma, unsigned long addr) | ||
467 | { | 878 | { |
468 | struct page *page; | 879 | struct address_space *mapping = vma->vm_file->f_mapping; |
880 | struct inode *inode = mapping->host; | ||
469 | 881 | ||
470 | spin_lock(&hugetlb_lock); | 882 | if (vma->vm_flags & VM_SHARED) { |
471 | page = dequeue_huge_page_vma(vma, addr); | 883 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
472 | spin_unlock(&hugetlb_lock); | 884 | region_add(&inode->i_mapping->private_list, idx, idx + 1); |
473 | return page ? page : ERR_PTR(-VM_FAULT_OOM); | 885 | |
886 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | ||
887 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); | ||
888 | struct resv_map *reservations = vma_resv_map(vma); | ||
889 | |||
890 | /* Mark this page used in the map. */ | ||
891 | region_add(&reservations->regions, idx, idx + 1); | ||
892 | } | ||
474 | } | 893 | } |
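vma_needs_reservation() and vma_commit_reservation() form a two-phase pattern: first probe how much new reservation (and quota) a fault would consume, then allocate, and only commit the region change once the page is safely instantiated. A schematic stand-alone sketch of that pattern; every helper below is an invented stand-in, not a kernel interface:

#include <stdio.h>

static long quota_left = 8;	/* pretend filesystem quota */
static long committed;

static long reservation_needed(long idx)
{
	(void)idx;
	return 1;		/* pretend this offset is not yet reserved */
}

static int take_quota(long pages)
{
	if (quota_left < pages)
		return -1;
	quota_left -= pages;
	return 0;
}

static void commit_reservation(long idx)
{
	(void)idx;
	committed++;		/* only after the page is instantiated */
}

static int fault_in(long idx)
{
	long chg = reservation_needed(idx);	/* phase 1: probe */

	if (chg && take_quota(chg))
		return -1;
	/* ... allocate and map the huge page here ... */
	commit_reservation(idx);		/* phase 2: commit */
	return 0;
}

int main(void)
{
	int ret = fault_in(3);

	printf("fault: %d, committed: %ld, quota left: %ld\n",
	       ret, committed, quota_left);
	return 0;
}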
475 | 894 | ||
476 | static struct page *alloc_huge_page_private(struct vm_area_struct *vma, | 895 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
477 | unsigned long addr) | 896 | unsigned long addr, int avoid_reserve) |
478 | { | 897 | { |
479 | struct page *page = NULL; | 898 | struct hstate *h = hstate_vma(vma); |
899 | struct page *page; | ||
900 | struct address_space *mapping = vma->vm_file->f_mapping; | ||
901 | struct inode *inode = mapping->host; | ||
902 | unsigned int chg; | ||
480 | 903 | ||
481 | if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) | 904 | /* |
482 | return ERR_PTR(-VM_FAULT_SIGBUS); | 905 | * Processes that did not create the mapping will have no reserves and |
906 | * will not have been accounted against quota. Check that the quota can be | ||
907 | * made before satisfying the allocation. | ||
908 | * MAP_NORESERVE mappings may also need pages and quota allocated | ||
909 | * if no reserve mapping overlaps. | ||
910 | */ | ||
911 | chg = vma_needs_reservation(h, vma, addr); | ||
912 | if (chg < 0) | ||
913 | return ERR_PTR(chg); | ||
914 | if (chg) | ||
915 | if (hugetlb_get_quota(inode->i_mapping, chg)) | ||
916 | return ERR_PTR(-ENOSPC); | ||
483 | 917 | ||
484 | spin_lock(&hugetlb_lock); | 918 | spin_lock(&hugetlb_lock); |
485 | if (free_huge_pages > resv_huge_pages) | 919 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); |
486 | page = dequeue_huge_page_vma(vma, addr); | ||
487 | spin_unlock(&hugetlb_lock); | 920 | spin_unlock(&hugetlb_lock); |
921 | |||
488 | if (!page) { | 922 | if (!page) { |
489 | page = alloc_buddy_huge_page(vma, addr); | 923 | page = alloc_buddy_huge_page(h, vma, addr); |
490 | if (!page) { | 924 | if (!page) { |
491 | hugetlb_put_quota(vma->vm_file->f_mapping, 1); | 925 | hugetlb_put_quota(inode->i_mapping, chg); |
492 | return ERR_PTR(-VM_FAULT_OOM); | 926 | return ERR_PTR(-VM_FAULT_OOM); |
493 | } | 927 | } |
494 | } | 928 | } |
929 | |||
930 | set_page_refcounted(page); | ||
931 | set_page_private(page, (unsigned long) mapping); | ||
932 | |||
933 | vma_commit_reservation(h, vma, addr); | ||
934 | |||
495 | return page; | 935 | return page; |
496 | } | 936 | } |
497 | 937 | ||
498 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 938 | __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) |
499 | unsigned long addr) | ||
500 | { | 939 | { |
501 | struct page *page; | 940 | struct huge_bootmem_page *m; |
502 | struct address_space *mapping = vma->vm_file->f_mapping; | 941 | int nr_nodes = nodes_weight(node_online_map); |
503 | 942 | ||
504 | if (vma->vm_flags & VM_MAYSHARE) | 943 | while (nr_nodes) { |
505 | page = alloc_huge_page_shared(vma, addr); | 944 | void *addr; |
506 | else | 945 | |
507 | page = alloc_huge_page_private(vma, addr); | 946 | addr = __alloc_bootmem_node_nopanic( |
947 | NODE_DATA(h->hugetlb_next_nid), | ||
948 | huge_page_size(h), huge_page_size(h), 0); | ||
508 | 949 | ||
509 | if (!IS_ERR(page)) { | 950 | if (addr) { |
510 | set_page_refcounted(page); | 951 | /* |
511 | set_page_private(page, (unsigned long) mapping); | 952 | * Use the beginning of the huge page to store the |
953 | * huge_bootmem_page struct (until gather_bootmem | ||
954 | * puts them into the mem_map). | ||
955 | */ | ||
956 | m = addr; | ||
957 | if (m) | ||
958 | goto found; | ||
959 | } | ||
960 | hstate_next_node(h); | ||
961 | nr_nodes--; | ||
512 | } | 962 | } |
513 | return page; | 963 | return 0; |
964 | |||
965 | found: | ||
966 | BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); | ||
967 | /* Put them into a private list first because mem_map is not up yet */ | ||
968 | list_add(&m->list, &huge_boot_pages); | ||
969 | m->hstate = h; | ||
970 | return 1; | ||
514 | } | 971 | } |
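Since gigantic pages are carved out before the mem_map exists, the head of each freshly allocated page is reused to hold the huge_bootmem_page record describing it. A user-space sketch of that trick, with C11 aligned_alloc() standing in for the bootmem allocator (the size and struct layout are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define HUGE_SZ (2UL * 1024 * 1024)	/* assumed 2 MB huge page */

/* Bookkeeping record stored inside the page it describes. */
struct boot_page {
	struct boot_page *next;
	size_t size;
};

int main(void)
{
	struct boot_page *list = NULL;

	for (int i = 0; i < 3; i++) {
		void *page = aligned_alloc(HUGE_SZ, HUGE_SZ);

		if (!page)
			break;
		/* Reuse the start of the page as its own list node until
		 * "real" bookkeeping structures exist. */
		struct boot_page *m = page;
		m->size = HUGE_SZ;
		m->next = list;
		list = m;
	}

	for (struct boot_page *m = list; m; m = m->next)
		printf("boot huge page of %zu bytes at %p\n", m->size, (void *)m);

	while (list) {			/* tear down: free while walking */
		struct boot_page *m = list;
		list = list->next;
		free(m);
	}
	return 0;
}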
515 | 972 | ||
516 | static int __init hugetlb_init(void) | 973 | /* Put bootmem huge pages into the standard lists after mem_map is up */ |
974 | static void __init gather_bootmem_prealloc(void) | ||
517 | { | 975 | { |
518 | unsigned long i; | 976 | struct huge_bootmem_page *m; |
519 | 977 | ||
520 | if (HPAGE_SHIFT == 0) | 978 | list_for_each_entry(m, &huge_boot_pages, list) { |
521 | return 0; | 979 | struct page *page = virt_to_page(m); |
522 | 980 | struct hstate *h = m->hstate; | |
523 | for (i = 0; i < MAX_NUMNODES; ++i) | 981 | __ClearPageReserved(page); |
524 | INIT_LIST_HEAD(&hugepage_freelists[i]); | 982 | WARN_ON(page_count(page) != 1); |
983 | prep_compound_page(page, h->order); | ||
984 | prep_new_huge_page(h, page, page_to_nid(page)); | ||
985 | } | ||
986 | } | ||
525 | 987 | ||
526 | hugetlb_next_nid = first_node(node_online_map); | 988 | static void __init hugetlb_hstate_alloc_pages(struct hstate *h) |
989 | { | ||
990 | unsigned long i; | ||
527 | 991 | ||
528 | for (i = 0; i < max_huge_pages; ++i) { | 992 | for (i = 0; i < h->max_huge_pages; ++i) { |
529 | if (!alloc_fresh_huge_page()) | 993 | if (h->order >= MAX_ORDER) { |
994 | if (!alloc_bootmem_huge_page(h)) | ||
995 | break; | ||
996 | } else if (!alloc_fresh_huge_page(h)) | ||
530 | break; | 997 | break; |
531 | } | 998 | } |
532 | max_huge_pages = free_huge_pages = nr_huge_pages = i; | 999 | h->max_huge_pages = i; |
533 | printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); | ||
534 | return 0; | ||
535 | } | 1000 | } |
536 | module_init(hugetlb_init); | ||
537 | 1001 | ||
538 | static int __init hugetlb_setup(char *s) | 1002 | static void __init hugetlb_init_hstates(void) |
539 | { | 1003 | { |
540 | if (sscanf(s, "%lu", &max_huge_pages) <= 0) | 1004 | struct hstate *h; |
541 | max_huge_pages = 0; | 1005 | |
542 | return 1; | 1006 | for_each_hstate(h) { |
1007 | /* oversize hugepages were init'ed in early boot */ | ||
1008 | if (h->order < MAX_ORDER) | ||
1009 | hugetlb_hstate_alloc_pages(h); | ||
1010 | } | ||
543 | } | 1011 | } |
544 | __setup("hugepages=", hugetlb_setup); | ||
545 | 1012 | ||
546 | static unsigned int cpuset_mems_nr(unsigned int *array) | 1013 | static char * __init memfmt(char *buf, unsigned long n) |
547 | { | 1014 | { |
548 | int node; | 1015 | if (n >= (1UL << 30)) |
549 | unsigned int nr = 0; | 1016 | sprintf(buf, "%lu GB", n >> 30); |
550 | 1017 | else if (n >= (1UL << 20)) | |
551 | for_each_node_mask(node, cpuset_current_mems_allowed) | 1018 | sprintf(buf, "%lu MB", n >> 20); |
552 | nr += array[node]; | 1019 | else |
1020 | sprintf(buf, "%lu KB", n >> 10); | ||
1021 | return buf; | ||
1022 | } | ||
553 | 1023 | ||
554 | return nr; | 1024 | static void __init report_hugepages(void) |
1025 | { | ||
1026 | struct hstate *h; | ||
1027 | |||
1028 | for_each_hstate(h) { | ||
1029 | char buf[32]; | ||
1030 | printk(KERN_INFO "HugeTLB registered %s page size, " | ||
1031 | "pre-allocated %ld pages\n", | ||
1032 | memfmt(buf, huge_page_size(h)), | ||
1033 | h->free_huge_pages); | ||
1034 | } | ||
555 | } | 1035 | } |
556 | 1036 | ||
557 | #ifdef CONFIG_SYSCTL | ||
558 | #ifdef CONFIG_HIGHMEM | 1037 | #ifdef CONFIG_HIGHMEM |
559 | static void try_to_free_low(unsigned long count) | 1038 | static void try_to_free_low(struct hstate *h, unsigned long count) |
560 | { | 1039 | { |
561 | int i; | 1040 | int i; |
562 | 1041 | ||
1042 | if (h->order >= MAX_ORDER) | ||
1043 | return; | ||
1044 | |||
563 | for (i = 0; i < MAX_NUMNODES; ++i) { | 1045 | for (i = 0; i < MAX_NUMNODES; ++i) { |
564 | struct page *page, *next; | 1046 | struct page *page, *next; |
565 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { | 1047 | struct list_head *freel = &h->hugepage_freelists[i]; |
566 | if (count >= nr_huge_pages) | 1048 | list_for_each_entry_safe(page, next, freel, lru) { |
1049 | if (count >= h->nr_huge_pages) | ||
567 | return; | 1050 | return; |
568 | if (PageHighMem(page)) | 1051 | if (PageHighMem(page)) |
569 | continue; | 1052 | continue; |
570 | list_del(&page->lru); | 1053 | list_del(&page->lru); |
571 | update_and_free_page(page); | 1054 | update_and_free_page(h, page); |
572 | free_huge_pages--; | 1055 | h->free_huge_pages--; |
573 | free_huge_pages_node[page_to_nid(page)]--; | 1056 | h->free_huge_pages_node[page_to_nid(page)]--; |
574 | } | 1057 | } |
575 | } | 1058 | } |
576 | } | 1059 | } |
577 | #else | 1060 | #else |
578 | static inline void try_to_free_low(unsigned long count) | 1061 | static inline void try_to_free_low(struct hstate *h, unsigned long count) |
579 | { | 1062 | { |
580 | } | 1063 | } |
581 | #endif | 1064 | #endif |
582 | 1065 | ||
583 | #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) | 1066 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
584 | static unsigned long set_max_huge_pages(unsigned long count) | 1067 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) |
585 | { | 1068 | { |
586 | unsigned long min_count, ret; | 1069 | unsigned long min_count, ret; |
587 | 1070 | ||
1071 | if (h->order >= MAX_ORDER) | ||
1072 | return h->max_huge_pages; | ||
1073 | |||
588 | /* | 1074 | /* |
589 | * Increase the pool size | 1075 | * Increase the pool size |
590 | * First take pages out of surplus state. Then make up the | 1076 | * First take pages out of surplus state. Then make up the |
@@ -597,20 +1083,19 @@ static unsigned long set_max_huge_pages(unsigned long count) | |||
597 | * within all the constraints specified by the sysctls. | 1083 | * within all the constraints specified by the sysctls. |
598 | */ | 1084 | */ |
599 | spin_lock(&hugetlb_lock); | 1085 | spin_lock(&hugetlb_lock); |
600 | while (surplus_huge_pages && count > persistent_huge_pages) { | 1086 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { |
601 | if (!adjust_pool_surplus(-1)) | 1087 | if (!adjust_pool_surplus(h, -1)) |
602 | break; | 1088 | break; |
603 | } | 1089 | } |
604 | 1090 | ||
605 | while (count > persistent_huge_pages) { | 1091 | while (count > persistent_huge_pages(h)) { |
606 | int ret; | ||
607 | /* | 1092 | /* |
608 | * If this allocation races such that we no longer need the | 1093 | * If this allocation races such that we no longer need the |
609 | * page, free_huge_page will handle it by freeing the page | 1094 | * page, free_huge_page will handle it by freeing the page |
610 | * and reducing the surplus. | 1095 | * and reducing the surplus. |
611 | */ | 1096 | */ |
612 | spin_unlock(&hugetlb_lock); | 1097 | spin_unlock(&hugetlb_lock); |
613 | ret = alloc_fresh_huge_page(); | 1098 | ret = alloc_fresh_huge_page(h); |
614 | spin_lock(&hugetlb_lock); | 1099 | spin_lock(&hugetlb_lock); |
615 | if (!ret) | 1100 | if (!ret) |
616 | goto out; | 1101 | goto out; |
@@ -632,31 +1117,305 @@ static unsigned long set_max_huge_pages(unsigned long count) | |||
632 | * and won't grow the pool anywhere else. Not until one of the | 1117 | * and won't grow the pool anywhere else. Not until one of the |
633 | * sysctls are changed, or the surplus pages go out of use. | 1118 | * sysctls are changed, or the surplus pages go out of use. |
634 | */ | 1119 | */ |
635 | min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; | 1120 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; |
636 | min_count = max(count, min_count); | 1121 | min_count = max(count, min_count); |
637 | try_to_free_low(min_count); | 1122 | try_to_free_low(h, min_count); |
638 | while (min_count < persistent_huge_pages) { | 1123 | while (min_count < persistent_huge_pages(h)) { |
639 | struct page *page = dequeue_huge_page(); | 1124 | struct page *page = dequeue_huge_page(h); |
640 | if (!page) | 1125 | if (!page) |
641 | break; | 1126 | break; |
642 | update_and_free_page(page); | 1127 | update_and_free_page(h, page); |
643 | } | 1128 | } |
644 | while (count < persistent_huge_pages) { | 1129 | while (count < persistent_huge_pages(h)) { |
645 | if (!adjust_pool_surplus(1)) | 1130 | if (!adjust_pool_surplus(h, 1)) |
646 | break; | 1131 | break; |
647 | } | 1132 | } |
648 | out: | 1133 | out: |
649 | ret = persistent_huge_pages; | 1134 | ret = persistent_huge_pages(h); |
650 | spin_unlock(&hugetlb_lock); | 1135 | spin_unlock(&hugetlb_lock); |
651 | return ret; | 1136 | return ret; |
652 | } | 1137 | } |
653 | 1138 | ||
1139 | #define HSTATE_ATTR_RO(_name) \ | ||
1140 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | ||
1141 | |||
1142 | #define HSTATE_ATTR(_name) \ | ||
1143 | static struct kobj_attribute _name##_attr = \ | ||
1144 | __ATTR(_name, 0644, _name##_show, _name##_store) | ||
1145 | |||
1146 | static struct kobject *hugepages_kobj; | ||
1147 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | ||
1148 | |||
1149 | static struct hstate *kobj_to_hstate(struct kobject *kobj) | ||
1150 | { | ||
1151 | int i; | ||
1152 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | ||
1153 | if (hstate_kobjs[i] == kobj) | ||
1154 | return &hstates[i]; | ||
1155 | BUG(); | ||
1156 | return NULL; | ||
1157 | } | ||
1158 | |||
1159 | static ssize_t nr_hugepages_show(struct kobject *kobj, | ||
1160 | struct kobj_attribute *attr, char *buf) | ||
1161 | { | ||
1162 | struct hstate *h = kobj_to_hstate(kobj); | ||
1163 | return sprintf(buf, "%lu\n", h->nr_huge_pages); | ||
1164 | } | ||
1165 | static ssize_t nr_hugepages_store(struct kobject *kobj, | ||
1166 | struct kobj_attribute *attr, const char *buf, size_t count) | ||
1167 | { | ||
1168 | int err; | ||
1169 | unsigned long input; | ||
1170 | struct hstate *h = kobj_to_hstate(kobj); | ||
1171 | |||
1172 | err = strict_strtoul(buf, 10, &input); | ||
1173 | if (err) | ||
1174 | return 0; | ||
1175 | |||
1176 | h->max_huge_pages = set_max_huge_pages(h, input); | ||
1177 | |||
1178 | return count; | ||
1179 | } | ||
1180 | HSTATE_ATTR(nr_hugepages); | ||
1181 | |||
1182 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, | ||
1183 | struct kobj_attribute *attr, char *buf) | ||
1184 | { | ||
1185 | struct hstate *h = kobj_to_hstate(kobj); | ||
1186 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); | ||
1187 | } | ||
1188 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | ||
1189 | struct kobj_attribute *attr, const char *buf, size_t count) | ||
1190 | { | ||
1191 | int err; | ||
1192 | unsigned long input; | ||
1193 | struct hstate *h = kobj_to_hstate(kobj); | ||
1194 | |||
1195 | err = strict_strtoul(buf, 10, &input); | ||
1196 | if (err) | ||
1197 | return 0; | ||
1198 | |||
1199 | spin_lock(&hugetlb_lock); | ||
1200 | h->nr_overcommit_huge_pages = input; | ||
1201 | spin_unlock(&hugetlb_lock); | ||
1202 | |||
1203 | return count; | ||
1204 | } | ||
1205 | HSTATE_ATTR(nr_overcommit_hugepages); | ||
1206 | |||
1207 | static ssize_t free_hugepages_show(struct kobject *kobj, | ||
1208 | struct kobj_attribute *attr, char *buf) | ||
1209 | { | ||
1210 | struct hstate *h = kobj_to_hstate(kobj); | ||
1211 | return sprintf(buf, "%lu\n", h->free_huge_pages); | ||
1212 | } | ||
1213 | HSTATE_ATTR_RO(free_hugepages); | ||
1214 | |||
1215 | static ssize_t resv_hugepages_show(struct kobject *kobj, | ||
1216 | struct kobj_attribute *attr, char *buf) | ||
1217 | { | ||
1218 | struct hstate *h = kobj_to_hstate(kobj); | ||
1219 | return sprintf(buf, "%lu\n", h->resv_huge_pages); | ||
1220 | } | ||
1221 | HSTATE_ATTR_RO(resv_hugepages); | ||
1222 | |||
1223 | static ssize_t surplus_hugepages_show(struct kobject *kobj, | ||
1224 | struct kobj_attribute *attr, char *buf) | ||
1225 | { | ||
1226 | struct hstate *h = kobj_to_hstate(kobj); | ||
1227 | return sprintf(buf, "%lu\n", h->surplus_huge_pages); | ||
1228 | } | ||
1229 | HSTATE_ATTR_RO(surplus_hugepages); | ||
1230 | |||
1231 | static struct attribute *hstate_attrs[] = { | ||
1232 | &nr_hugepages_attr.attr, | ||
1233 | &nr_overcommit_hugepages_attr.attr, | ||
1234 | &free_hugepages_attr.attr, | ||
1235 | &resv_hugepages_attr.attr, | ||
1236 | &surplus_hugepages_attr.attr, | ||
1237 | NULL, | ||
1238 | }; | ||
1239 | |||
1240 | static struct attribute_group hstate_attr_group = { | ||
1241 | .attrs = hstate_attrs, | ||
1242 | }; | ||
1243 | |||
1244 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h) | ||
1245 | { | ||
1246 | int retval; | ||
1247 | |||
1248 | hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, | ||
1249 | hugepages_kobj); | ||
1250 | if (!hstate_kobjs[h - hstates]) | ||
1251 | return -ENOMEM; | ||
1252 | |||
1253 | retval = sysfs_create_group(hstate_kobjs[h - hstates], | ||
1254 | &hstate_attr_group); | ||
1255 | if (retval) | ||
1256 | kobject_put(hstate_kobjs[h - hstates]); | ||
1257 | |||
1258 | return retval; | ||
1259 | } | ||
1260 | |||
1261 | static void __init hugetlb_sysfs_init(void) | ||
1262 | { | ||
1263 | struct hstate *h; | ||
1264 | int err; | ||
1265 | |||
1266 | hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); | ||
1267 | if (!hugepages_kobj) | ||
1268 | return; | ||
1269 | |||
1270 | for_each_hstate(h) { | ||
1271 | err = hugetlb_sysfs_add_hstate(h); | ||
1272 | if (err) | ||
1273 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", | ||
1274 | h->name); | ||
1275 | } | ||
1276 | } | ||
1277 | |||
1278 | static void __exit hugetlb_exit(void) | ||
1279 | { | ||
1280 | struct hstate *h; | ||
1281 | |||
1282 | for_each_hstate(h) { | ||
1283 | kobject_put(hstate_kobjs[h - hstates]); | ||
1284 | } | ||
1285 | |||
1286 | kobject_put(hugepages_kobj); | ||
1287 | } | ||
1288 | module_exit(hugetlb_exit); | ||
1289 | |||
1290 | static int __init hugetlb_init(void) | ||
1291 | { | ||
1292 | /* Some platforms decide whether they support huge pages at boot | ||
1293 | * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when | ||
1294 | * there is no such support. | ||
1295 | */ | ||
1296 | if (HPAGE_SHIFT == 0) | ||
1297 | return 0; | ||
1298 | |||
1299 | if (!size_to_hstate(default_hstate_size)) { | ||
1300 | default_hstate_size = HPAGE_SIZE; | ||
1301 | if (!size_to_hstate(default_hstate_size)) | ||
1302 | hugetlb_add_hstate(HUGETLB_PAGE_ORDER); | ||
1303 | } | ||
1304 | default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; | ||
1305 | if (default_hstate_max_huge_pages) | ||
1306 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; | ||
1307 | |||
1308 | hugetlb_init_hstates(); | ||
1309 | |||
1310 | gather_bootmem_prealloc(); | ||
1311 | |||
1312 | report_hugepages(); | ||
1313 | |||
1314 | hugetlb_sysfs_init(); | ||
1315 | |||
1316 | return 0; | ||
1317 | } | ||
1318 | module_init(hugetlb_init); | ||
1319 | |||
1320 | /* Should be called on processing a hugepagesz=... option */ | ||
1321 | void __init hugetlb_add_hstate(unsigned order) | ||
1322 | { | ||
1323 | struct hstate *h; | ||
1324 | unsigned long i; | ||
1325 | |||
1326 | if (size_to_hstate(PAGE_SIZE << order)) { | ||
1327 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); | ||
1328 | return; | ||
1329 | } | ||
1330 | BUG_ON(max_hstate >= HUGE_MAX_HSTATE); | ||
1331 | BUG_ON(order == 0); | ||
1332 | h = &hstates[max_hstate++]; | ||
1333 | h->order = order; | ||
1334 | h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); | ||
1335 | h->nr_huge_pages = 0; | ||
1336 | h->free_huge_pages = 0; | ||
1337 | for (i = 0; i < MAX_NUMNODES; ++i) | ||
1338 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | ||
1339 | h->hugetlb_next_nid = first_node(node_online_map); | ||
1340 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | ||
1341 | huge_page_size(h)/1024); | ||
1342 | |||
1343 | parsed_hstate = h; | ||
1344 | } | ||
1345 | |||
1346 | static int __init hugetlb_nrpages_setup(char *s) | ||
1347 | { | ||
1348 | unsigned long *mhp; | ||
1349 | static unsigned long *last_mhp; | ||
1350 | |||
1351 | /* | ||
1352 | * !max_hstate means we haven't parsed a hugepagesz= parameter yet, | ||
1353 | * so this hugepages= parameter goes to the "default hstate". | ||
1354 | */ | ||
1355 | if (!max_hstate) | ||
1356 | mhp = &default_hstate_max_huge_pages; | ||
1357 | else | ||
1358 | mhp = &parsed_hstate->max_huge_pages; | ||
1359 | |||
1360 | if (mhp == last_mhp) { | ||
1361 | printk(KERN_WARNING "hugepages= specified twice without " | ||
1362 | "interleaving hugepagesz=, ignoring\n"); | ||
1363 | return 1; | ||
1364 | } | ||
1365 | |||
1366 | if (sscanf(s, "%lu", mhp) <= 0) | ||
1367 | *mhp = 0; | ||
1368 | |||
1369 | /* | ||
1370 | * Global state is always initialized later in hugetlb_init. | ||
1371 | * But we need to allocate >= MAX_ORDER hstates here early to still | ||
1372 | * use the bootmem allocator. | ||
1373 | */ | ||
1374 | if (max_hstate && parsed_hstate->order >= MAX_ORDER) | ||
1375 | hugetlb_hstate_alloc_pages(parsed_hstate); | ||
1376 | |||
1377 | last_mhp = mhp; | ||
1378 | |||
1379 | return 1; | ||
1380 | } | ||
1381 | __setup("hugepages=", hugetlb_nrpages_setup); | ||
1382 | |||
1383 | static int __init hugetlb_default_setup(char *s) | ||
1384 | { | ||
1385 | default_hstate_size = memparse(s, &s); | ||
1386 | return 1; | ||
1387 | } | ||
1388 | __setup("default_hugepagesz=", hugetlb_default_setup); | ||
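The three boot options cooperate: hugepagesz= selects the hstate that a following hugepages= count applies to, and default_hugepagesz= picks which size backs the default pool. As a reference for the size strings accepted on the command line, here is a simplified user-space take on memparse-style parsing (the suffix handling below is a rough approximation, not the kernel's memparse()):

#include <stdio.h>
#include <stdlib.h>

/* Simplified size parser: a number with an optional K/M/G suffix. */
static unsigned long parse_size(const char *s)
{
	char *end;
	unsigned long n = strtoul(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': n <<= 30; break;
	case 'M': case 'm': n <<= 20; break;
	case 'K': case 'k': n <<= 10; break;
	}
	return n;
}

int main(void)
{
	/* e.g. booting with default_hugepagesz=1G hugepagesz=1G hugepages=4 */
	printf("%lu\n", parse_size("1G"));	/* 1073741824 */
	return 0;
}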
1389 | |||
1390 | static unsigned int cpuset_mems_nr(unsigned int *array) | ||
1391 | { | ||
1392 | int node; | ||
1393 | unsigned int nr = 0; | ||
1394 | |||
1395 | for_each_node_mask(node, cpuset_current_mems_allowed) | ||
1396 | nr += array[node]; | ||
1397 | |||
1398 | return nr; | ||
1399 | } | ||
1400 | |||
1401 | #ifdef CONFIG_SYSCTL | ||
654 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1402 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
655 | struct file *file, void __user *buffer, | 1403 | struct file *file, void __user *buffer, |
656 | size_t *length, loff_t *ppos) | 1404 | size_t *length, loff_t *ppos) |
657 | { | 1405 | { |
1406 | struct hstate *h = &default_hstate; | ||
1407 | unsigned long tmp; | ||
1408 | |||
1409 | if (!write) | ||
1410 | tmp = h->max_huge_pages; | ||
1411 | |||
1412 | table->data = &tmp; | ||
1413 | table->maxlen = sizeof(unsigned long); | ||
658 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1414 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); |
659 | max_huge_pages = set_max_huge_pages(max_huge_pages); | 1415 | |
1416 | if (write) | ||
1417 | h->max_huge_pages = set_max_huge_pages(h, tmp); | ||
1418 | |||
660 | return 0; | 1419 | return 0; |
661 | } | 1420 | } |
662 | 1421 | ||
@@ -676,45 +1435,141 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
676 | struct file *file, void __user *buffer, | 1435 | struct file *file, void __user *buffer, |
677 | size_t *length, loff_t *ppos) | 1436 | size_t *length, loff_t *ppos) |
678 | { | 1437 | { |
1438 | struct hstate *h = &default_hstate; | ||
1439 | unsigned long tmp; | ||
1440 | |||
1441 | if (!write) | ||
1442 | tmp = h->nr_overcommit_huge_pages; | ||
1443 | |||
1444 | table->data = &tmp; | ||
1445 | table->maxlen = sizeof(unsigned long); | ||
679 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1446 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); |
680 | spin_lock(&hugetlb_lock); | 1447 | |
681 | nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; | 1448 | if (write) { |
682 | spin_unlock(&hugetlb_lock); | 1449 | spin_lock(&hugetlb_lock); |
1450 | h->nr_overcommit_huge_pages = tmp; | ||
1451 | spin_unlock(&hugetlb_lock); | ||
1452 | } | ||
1453 | |||
683 | return 0; | 1454 | return 0; |
684 | } | 1455 | } |
685 | 1456 | ||
686 | #endif /* CONFIG_SYSCTL */ | 1457 | #endif /* CONFIG_SYSCTL */ |
687 | 1458 | ||
688 | int hugetlb_report_meminfo(char *buf) | 1459 | void hugetlb_report_meminfo(struct seq_file *m) |
689 | { | 1460 | { |
690 | return sprintf(buf, | 1461 | struct hstate *h = &default_hstate; |
691 | "HugePages_Total: %5lu\n" | 1462 | seq_printf(m, |
692 | "HugePages_Free: %5lu\n" | 1463 | "HugePages_Total: %5lu\n" |
693 | "HugePages_Rsvd: %5lu\n" | 1464 | "HugePages_Free: %5lu\n" |
694 | "HugePages_Surp: %5lu\n" | 1465 | "HugePages_Rsvd: %5lu\n" |
695 | "Hugepagesize: %5lu kB\n", | 1466 | "HugePages_Surp: %5lu\n" |
696 | nr_huge_pages, | 1467 | "Hugepagesize: %8lu kB\n", |
697 | free_huge_pages, | 1468 | h->nr_huge_pages, |
698 | resv_huge_pages, | 1469 | h->free_huge_pages, |
699 | surplus_huge_pages, | 1470 | h->resv_huge_pages, |
700 | HPAGE_SIZE/1024); | 1471 | h->surplus_huge_pages, |
1472 | 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); | ||
701 | } | 1473 | } |
702 | 1474 | ||
703 | int hugetlb_report_node_meminfo(int nid, char *buf) | 1475 | int hugetlb_report_node_meminfo(int nid, char *buf) |
704 | { | 1476 | { |
1477 | struct hstate *h = &default_hstate; | ||
705 | return sprintf(buf, | 1478 | return sprintf(buf, |
706 | "Node %d HugePages_Total: %5u\n" | 1479 | "Node %d HugePages_Total: %5u\n" |
707 | "Node %d HugePages_Free: %5u\n" | 1480 | "Node %d HugePages_Free: %5u\n" |
708 | "Node %d HugePages_Surp: %5u\n", | 1481 | "Node %d HugePages_Surp: %5u\n", |
709 | nid, nr_huge_pages_node[nid], | 1482 | nid, h->nr_huge_pages_node[nid], |
710 | nid, free_huge_pages_node[nid], | 1483 | nid, h->free_huge_pages_node[nid], |
711 | nid, surplus_huge_pages_node[nid]); | 1484 | nid, h->surplus_huge_pages_node[nid]); |
712 | } | 1485 | } |
713 | 1486 | ||
714 | /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ | 1487 | /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
715 | unsigned long hugetlb_total_pages(void) | 1488 | unsigned long hugetlb_total_pages(void) |
716 | { | 1489 | { |
717 | return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); | 1490 | struct hstate *h = &default_hstate; |
1491 | return h->nr_huge_pages * pages_per_huge_page(h); | ||
1492 | } | ||
1493 | |||
1494 | static int hugetlb_acct_memory(struct hstate *h, long delta) | ||
1495 | { | ||
1496 | int ret = -ENOMEM; | ||
1497 | |||
1498 | spin_lock(&hugetlb_lock); | ||
1499 | /* | ||
1500 | * When cpuset is configured, it breaks the strict hugetlb page | ||
1501 | * reservation as the accounting is done on a global variable. Such | ||
1502 | * reservation is completely rubbish in the presence of cpuset because | ||
1503 | * the reservation is not checked against page availability for the | ||
1504 | * current cpuset. An application can still potentially be OOM-killed by | ||
1505 | * the kernel due to a lack of free hugetlb pages in the cpuset that the | ||
1506 | * task is in. Attempting to enforce strict accounting with cpusets is | ||
1507 | * almost impossible (or too ugly) because cpusets are too fluid: tasks | ||
1508 | * or memory nodes can be dynamically moved between cpusets. | ||
1509 | * | ||
1510 | * The change of semantics for shared hugetlb mapping with cpuset is | ||
1511 | * undesirable. However, in order to preserve some of the semantics, | ||
1512 | * we fall back to checking against the current free page availability as | ||
1513 | * a best attempt, hopefully minimizing the impact of the changed | ||
1514 | * semantics under cpuset. | ||
1515 | */ | ||
1516 | if (delta > 0) { | ||
1517 | if (gather_surplus_pages(h, delta) < 0) | ||
1518 | goto out; | ||
1519 | |||
1520 | if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { | ||
1521 | return_unused_surplus_pages(h, delta); | ||
1522 | goto out; | ||
1523 | } | ||
1524 | } | ||
1525 | |||
1526 | ret = 0; | ||
1527 | if (delta < 0) | ||
1528 | return_unused_surplus_pages(h, (unsigned long) -delta); | ||
1529 | |||
1530 | out: | ||
1531 | spin_unlock(&hugetlb_lock); | ||
1532 | return ret; | ||
1533 | } | ||
1534 | |||
1535 | static void hugetlb_vm_op_open(struct vm_area_struct *vma) | ||
1536 | { | ||
1537 | struct resv_map *reservations = vma_resv_map(vma); | ||
1538 | |||
1539 | /* | ||
1540 | * This new VMA should share its sibling's reservation map if present. | ||
1541 | * The VMA will only ever have a valid reservation map pointer where | ||
1542 | * it is being copied for another still existing VMA. As that VMA | ||
1543 | * has a reference to the reservation map, it cannot disappear until | ||
1544 | * after this open call completes. It is therefore safe to take a | ||
1545 | * new reference here without additional locking. | ||
1546 | */ | ||
1547 | if (reservations) | ||
1548 | kref_get(&reservations->refs); | ||
1549 | } | ||
1550 | |||
1551 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) | ||
1552 | { | ||
1553 | struct hstate *h = hstate_vma(vma); | ||
1554 | struct resv_map *reservations = vma_resv_map(vma); | ||
1555 | unsigned long reserve; | ||
1556 | unsigned long start; | ||
1557 | unsigned long end; | ||
1558 | |||
1559 | if (reservations) { | ||
1560 | start = vma_hugecache_offset(h, vma, vma->vm_start); | ||
1561 | end = vma_hugecache_offset(h, vma, vma->vm_end); | ||
1562 | |||
1563 | reserve = (end - start) - | ||
1564 | region_count(&reservations->regions, start, end); | ||
1565 | |||
1566 | kref_put(&reservations->refs, resv_map_release); | ||
1567 | |||
1568 | if (reserve) { | ||
1569 | hugetlb_acct_memory(h, -reserve); | ||
1570 | hugetlb_put_quota(vma->vm_file->f_mapping, reserve); | ||
1571 | } | ||
1572 | } | ||
718 | } | 1573 | } |
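.open takes an extra reference on the shared reservation map and .close drops it, so the map (and any unconsumed reservation) is only released when the last VMA referencing it goes away. A minimal user-space analog of that reference-counting pattern; this is a simplification, not the kernel's kref API:

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for struct resv_map + struct kref. */
struct resv {
	int refs;
};

static struct resv *resv_alloc(void)
{
	struct resv *r = malloc(sizeof(*r));

	if (r)
		r->refs = 1;
	return r;
}

static void resv_get(struct resv *r)
{
	r->refs++;			/* vm_op_open: a new VMA shares the map */
}

static void resv_put(struct resv *r)
{
	if (--r->refs == 0) {		/* vm_op_close on the last VMA */
		printf("releasing reservation map\n");
		free(r);
	}
}

int main(void)
{
	struct resv *r = resv_alloc();	/* the original mmap() */

	if (!r)
		return 1;
	resv_get(r);			/* e.g. fork() copies the VMA */
	resv_put(r);			/* one VMA goes away */
	resv_put(r);			/* last VMA: map is released */
	return 0;
}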
719 | 1574 | ||
720 | /* | 1575 | /* |
@@ -731,6 +1586,8 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
731 | 1586 | ||
732 | struct vm_operations_struct hugetlb_vm_ops = { | 1587 | struct vm_operations_struct hugetlb_vm_ops = { |
733 | .fault = hugetlb_vm_op_fault, | 1588 | .fault = hugetlb_vm_op_fault, |
1589 | .open = hugetlb_vm_op_open, | ||
1590 | .close = hugetlb_vm_op_close, | ||
734 | }; | 1591 | }; |
735 | 1592 | ||
736 | static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, | 1593 | static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, |
@@ -769,14 +1626,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
769 | struct page *ptepage; | 1626 | struct page *ptepage; |
770 | unsigned long addr; | 1627 | unsigned long addr; |
771 | int cow; | 1628 | int cow; |
1629 | struct hstate *h = hstate_vma(vma); | ||
1630 | unsigned long sz = huge_page_size(h); | ||
772 | 1631 | ||
773 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 1632 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
774 | 1633 | ||
775 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | 1634 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { |
776 | src_pte = huge_pte_offset(src, addr); | 1635 | src_pte = huge_pte_offset(src, addr); |
777 | if (!src_pte) | 1636 | if (!src_pte) |
778 | continue; | 1637 | continue; |
779 | dst_pte = huge_pte_alloc(dst, addr); | 1638 | dst_pte = huge_pte_alloc(dst, addr, sz); |
780 | if (!dst_pte) | 1639 | if (!dst_pte) |
781 | goto nomem; | 1640 | goto nomem; |
782 | 1641 | ||
@@ -804,7 +1663,7 @@ nomem: | |||
804 | } | 1663 | } |
805 | 1664 | ||
806 | void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 1665 | void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, |
807 | unsigned long end) | 1666 | unsigned long end, struct page *ref_page) |
808 | { | 1667 | { |
809 | struct mm_struct *mm = vma->vm_mm; | 1668 | struct mm_struct *mm = vma->vm_mm; |
810 | unsigned long address; | 1669 | unsigned long address; |
@@ -812,6 +1671,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
812 | pte_t pte; | 1671 | pte_t pte; |
813 | struct page *page; | 1672 | struct page *page; |
814 | struct page *tmp; | 1673 | struct page *tmp; |
1674 | struct hstate *h = hstate_vma(vma); | ||
1675 | unsigned long sz = huge_page_size(h); | ||
1676 | |||
815 | /* | 1677 | /* |
816 | * A page gathering list, protected by per file i_mmap_lock. The | 1678 | * A page gathering list, protected by per file i_mmap_lock. The |
817 | * lock is used to avoid list corruption from multiple unmapping | 1679 | * lock is used to avoid list corruption from multiple unmapping |
@@ -820,11 +1682,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
820 | LIST_HEAD(page_list); | 1682 | LIST_HEAD(page_list); |
821 | 1683 | ||
822 | WARN_ON(!is_vm_hugetlb_page(vma)); | 1684 | WARN_ON(!is_vm_hugetlb_page(vma)); |
823 | BUG_ON(start & ~HPAGE_MASK); | 1685 | BUG_ON(start & ~huge_page_mask(h)); |
824 | BUG_ON(end & ~HPAGE_MASK); | 1686 | BUG_ON(end & ~huge_page_mask(h)); |
825 | 1687 | ||
1688 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
826 | spin_lock(&mm->page_table_lock); | 1689 | spin_lock(&mm->page_table_lock); |
827 | for (address = start; address < end; address += HPAGE_SIZE) { | 1690 | for (address = start; address < end; address += sz) { |
828 | ptep = huge_pte_offset(mm, address); | 1691 | ptep = huge_pte_offset(mm, address); |
829 | if (!ptep) | 1692 | if (!ptep) |
830 | continue; | 1693 | continue; |
@@ -832,6 +1695,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
832 | if (huge_pmd_unshare(mm, &address, ptep)) | 1695 | if (huge_pmd_unshare(mm, &address, ptep)) |
833 | continue; | 1696 | continue; |
834 | 1697 | ||
1698 | /* | ||
1699 | * If a reference page is supplied, it is because a specific | ||
1700 | * page is being unmapped, not a range. Ensure the page we | ||
1701 | * are about to unmap is the actual page of interest. | ||
1702 | */ | ||
1703 | if (ref_page) { | ||
1704 | pte = huge_ptep_get(ptep); | ||
1705 | if (huge_pte_none(pte)) | ||
1706 | continue; | ||
1707 | page = pte_page(pte); | ||
1708 | if (page != ref_page) | ||
1709 | continue; | ||
1710 | |||
1711 | /* | ||
1712 | * Mark the VMA as having unmapped its page so that | ||
1713 | * future faults in this VMA will fail rather than | ||
1714 | * looking like data was lost | ||
1715 | */ | ||
1716 | set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); | ||
1717 | } | ||
1718 | |||
835 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 1719 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
836 | if (huge_pte_none(pte)) | 1720 | if (huge_pte_none(pte)) |
837 | continue; | 1721 | continue; |
@@ -843,6 +1727,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
843 | } | 1727 | } |
844 | spin_unlock(&mm->page_table_lock); | 1728 | spin_unlock(&mm->page_table_lock); |
845 | flush_tlb_range(vma, start, end); | 1729 | flush_tlb_range(vma, start, end); |
1730 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
846 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 1731 | list_for_each_entry_safe(page, tmp, &page_list, lru) { |
847 | list_del(&page->lru); | 1732 | list_del(&page->lru); |
848 | put_page(page); | 1733 | put_page(page); |
@@ -850,31 +1735,69 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
850 | } | 1735 | } |
851 | 1736 | ||
852 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 1737 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, |
853 | unsigned long end) | 1738 | unsigned long end, struct page *ref_page) |
854 | { | 1739 | { |
1740 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); | ||
1741 | __unmap_hugepage_range(vma, start, end, ref_page); | ||
1742 | spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); | ||
1743 | } | ||
1744 | |||
1745 | /* | ||
1746 | * This is called when the original mapper is failing to COW a MAP_PRIVATE | ||
1747 | * mapping it owns the reserve page for. The intention is to unmap the page | ||
1748 | * from other VMAs and let the children be SIGKILLed if they are faulting the | ||
1749 | * same region. | ||
1750 | */ | ||
1751 | static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | ||
1752 | struct page *page, unsigned long address) | ||
1753 | { | ||
1754 | struct vm_area_struct *iter_vma; | ||
1755 | struct address_space *mapping; | ||
1756 | struct prio_tree_iter iter; | ||
1757 | pgoff_t pgoff; | ||
1758 | |||
855 | /* | 1759 | /* |
856 | * It is undesirable to test vma->vm_file as it should be non-null | 1760 | * vm_pgoff is in PAGE_SIZE units, hence the different calculation |
857 | * for valid hugetlb area. However, vm_file will be NULL in the error | 1761 | * from page cache lookup which is in HPAGE_SIZE units. |
858 | * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails, | ||
859 | * do_mmap_pgoff() nullifies vma->vm_file before calling this function | ||
860 | * to clean up. Since no pte has actually been setup, it is safe to | ||
861 | * do nothing in this case. | ||
862 | */ | 1762 | */ |
863 | if (vma->vm_file) { | 1763 | address = address & huge_page_mask(hstate_vma(vma)); |
864 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); | 1764 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) |
865 | __unmap_hugepage_range(vma, start, end); | 1765 | + (vma->vm_pgoff >> PAGE_SHIFT); |
866 | spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); | 1766 | mapping = (struct address_space *)page_private(page); |
1767 | |||
1768 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
1769 | /* Do not unmap the current VMA */ | ||
1770 | if (iter_vma == vma) | ||
1771 | continue; | ||
1772 | |||
1773 | /* | ||
1774 | * Unmap the page from other VMAs without their own reserves. | ||
1775 | * They get marked to be SIGKILLed if they fault in these | ||
1776 | * areas. This is because a future no-page fault on this VMA | ||
1777 | * could insert a zeroed page instead of the data existing | ||
1778 | * from the time of fork. This would look like data corruption. | ||
1779 | */ | ||
1780 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | ||
1781 | unmap_hugepage_range(iter_vma, | ||
1782 | address, address + HPAGE_SIZE, | ||
1783 | page); | ||
867 | } | 1784 | } |
1785 | |||
1786 | return 1; | ||
868 | } | 1787 | } |
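unmap_ref_private() walks every other VMA mapping the same file offset through the i_mmap prio tree and zaps the page only from siblings that do not own a reserve; the index used for the tree walk is a linear PAGE_SIZE offset even though hugetlbfs page-cache indexing is in huge-page units. A rough standalone sketch of that kind of offset arithmetic, with made-up constants rather than the kernel's macros:

#include <stdio.h>

#define PAGE_SHIFT	12UL
#define HPAGE_SHIFT	21UL		/* 2 MB huge pages, for illustration */

/* Simplified linear page offset into the file, of the kind used for an
 * i_mmap lookup; vm_pgoff is assumed to already be in PAGE_SIZE units. */
static unsigned long linear_pgoff(unsigned long addr, unsigned long vm_start,
				  unsigned long vm_pgoff)
{
	addr &= ~((1UL << HPAGE_SHIFT) - 1);		/* huge-page align */
	return ((addr - vm_start) >> PAGE_SHIFT) + vm_pgoff;
}

int main(void)
{
	/* Fault at 3 MB into a mapping that starts at file offset 0:
	 * the aligned address is 2 MB, i.e. small-page index 512. */
	printf("pgoff = %lu\n", linear_pgoff(0x300000, 0x0, 0));
	return 0;
}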
869 | 1788 | ||
870 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | 1789 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, |
871 | unsigned long address, pte_t *ptep, pte_t pte) | 1790 | unsigned long address, pte_t *ptep, pte_t pte, |
1791 | struct page *pagecache_page) | ||
872 | { | 1792 | { |
1793 | struct hstate *h = hstate_vma(vma); | ||
873 | struct page *old_page, *new_page; | 1794 | struct page *old_page, *new_page; |
874 | int avoidcopy; | 1795 | int avoidcopy; |
1796 | int outside_reserve = 0; | ||
875 | 1797 | ||
876 | old_page = pte_page(pte); | 1798 | old_page = pte_page(pte); |
877 | 1799 | ||
1800 | retry_avoidcopy: | ||
878 | /* If no-one else is actually using this page, avoid the copy | 1801 | /* If no-one else is actually using this page, avoid the copy |
879 | * and just make the page writable */ | 1802 | * and just make the page writable */ |
880 | avoidcopy = (page_count(old_page) == 1); | 1803 | avoidcopy = (page_count(old_page) == 1); |
@@ -883,11 +1806,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
883 | return 0; | 1806 | return 0; |
884 | } | 1807 | } |
885 | 1808 | ||
1809 | /* | ||
1810 | * If the process that created a MAP_PRIVATE mapping is about to | ||
1811 | * perform a COW due to a shared page count, attempt to satisfy | ||
1812 | * the allocation without using the existing reserves. The pagecache | ||
1813 | * page is used to determine if the reserve at this address was | ||
1814 | * consumed or not. If reserves were used, a partial faulted mapping | ||
1815 | * at the time of fork() could consume its reserves on COW instead | ||
1816 | * of the full address range. | ||
1817 | */ | ||
1818 | if (!(vma->vm_flags & VM_SHARED) && | ||
1819 | is_vma_resv_set(vma, HPAGE_RESV_OWNER) && | ||
1820 | old_page != pagecache_page) | ||
1821 | outside_reserve = 1; | ||
1822 | |||
886 | page_cache_get(old_page); | 1823 | page_cache_get(old_page); |
887 | new_page = alloc_huge_page(vma, address); | 1824 | new_page = alloc_huge_page(vma, address, outside_reserve); |
888 | 1825 | ||
889 | if (IS_ERR(new_page)) { | 1826 | if (IS_ERR(new_page)) { |
890 | page_cache_release(old_page); | 1827 | page_cache_release(old_page); |
1828 | |||
1829 | /* | ||
1830 | * If a process owning a MAP_PRIVATE mapping fails to COW, | ||
1831 | * it is due to references held by a child and an insufficient | ||
1832 | * huge page pool. To guarantee the original mapper's | ||
1833 | * reliability, unmap the page from child processes. The child | ||
1834 | * may get SIGKILLed if it later faults. | ||
1835 | */ | ||
1836 | if (outside_reserve) { | ||
1837 | BUG_ON(huge_pte_none(pte)); | ||
1838 | if (unmap_ref_private(mm, vma, old_page, address)) { | ||
1839 | BUG_ON(page_count(old_page) != 1); | ||
1840 | BUG_ON(huge_pte_none(pte)); | ||
1841 | goto retry_avoidcopy; | ||
1842 | } | ||
1843 | WARN_ON_ONCE(1); | ||
1844 | } | ||
1845 | |||
891 | return -PTR_ERR(new_page); | 1846 | return -PTR_ERR(new_page); |
892 | } | 1847 | } |
893 | 1848 | ||
@@ -896,7 +1851,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
896 | __SetPageUptodate(new_page); | 1851 | __SetPageUptodate(new_page); |
897 | spin_lock(&mm->page_table_lock); | 1852 | spin_lock(&mm->page_table_lock); |
898 | 1853 | ||
899 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); | 1854 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
900 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 1855 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
901 | /* Break COW */ | 1856 | /* Break COW */ |
902 | huge_ptep_clear_flush(vma, address, ptep); | 1857 | huge_ptep_clear_flush(vma, address, ptep); |
@@ -910,19 +1865,44 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
910 | return 0; | 1865 | return 0; |
911 | } | 1866 | } |
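The outside_reserve handling in hugetlb_cow() above retries the copy after evicting the page from child mappings: the reserve owner allocates without touching reserves, and if that fails because children still hold references, they are unmapped (and will be SIGKILLed on a later fault) before the allocation is retried. A loose standalone model of that retry loop, with hypothetical stand-ins for the allocator and the unmap step:

#include <stdio.h>

/* Hypothetical stand-ins: allocation fails while a child still shares
 * the page, and succeeds once the child mapping has been zapped. */
static int child_mappings = 1;

static int alloc_without_reserves(void) { return child_mappings ? -1 : 0; }
static void unmap_from_children(void)   { child_mappings = 0; }

/* Model of the retry_avoidcopy idea: the reserve owner wins; children
 * lose their mapping and fail if they fault the region again. */
static int cow_owner_path(void)
{
	for (;;) {
		if (alloc_without_reserves() == 0)
			return 0;		/* COW succeeded */
		unmap_from_children();		/* then retry the allocation */
	}
}

int main(void)
{
	printf("owner COW result: %d\n", cow_owner_path());	/* prints 0 */
	return 0;
}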
912 | 1867 | ||
1868 | /* Return the pagecache page at a given address within a VMA */ | ||
1869 | static struct page *hugetlbfs_pagecache_page(struct hstate *h, | ||
1870 | struct vm_area_struct *vma, unsigned long address) | ||
1871 | { | ||
1872 | struct address_space *mapping; | ||
1873 | pgoff_t idx; | ||
1874 | |||
1875 | mapping = vma->vm_file->f_mapping; | ||
1876 | idx = vma_hugecache_offset(h, vma, address); | ||
1877 | |||
1878 | return find_lock_page(mapping, idx); | ||
1879 | } | ||
1880 | |||
913 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1881 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, |
914 | unsigned long address, pte_t *ptep, int write_access) | 1882 | unsigned long address, pte_t *ptep, int write_access) |
915 | { | 1883 | { |
1884 | struct hstate *h = hstate_vma(vma); | ||
916 | int ret = VM_FAULT_SIGBUS; | 1885 | int ret = VM_FAULT_SIGBUS; |
917 | unsigned long idx; | 1886 | pgoff_t idx; |
918 | unsigned long size; | 1887 | unsigned long size; |
919 | struct page *page; | 1888 | struct page *page; |
920 | struct address_space *mapping; | 1889 | struct address_space *mapping; |
921 | pte_t new_pte; | 1890 | pte_t new_pte; |
922 | 1891 | ||
1892 | /* | ||
1893 | * Currently, we are forced to kill the process in the event the | ||
1894 | * original mapper has unmapped pages from the child due to a failed | ||
1895 | * COW. Warn that such a situation has occurred as it may not be obvious. | ||
1896 | */ | ||
1897 | if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { | ||
1898 | printk(KERN_WARNING | ||
1899 | "PID %d killed due to inadequate hugepage pool\n", | ||
1900 | current->pid); | ||
1901 | return ret; | ||
1902 | } | ||
1903 | |||
923 | mapping = vma->vm_file->f_mapping; | 1904 | mapping = vma->vm_file->f_mapping; |
924 | idx = ((address - vma->vm_start) >> HPAGE_SHIFT) | 1905 | idx = vma_hugecache_offset(h, vma, address); |
925 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
926 | 1906 | ||
927 | /* | 1907 | /* |
928 | * Use page lock to guard against racing truncation | 1908 | * Use page lock to guard against racing truncation |
@@ -931,15 +1911,15 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
931 | retry: | 1911 | retry: |
932 | page = find_lock_page(mapping, idx); | 1912 | page = find_lock_page(mapping, idx); |
933 | if (!page) { | 1913 | if (!page) { |
934 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; | 1914 | size = i_size_read(mapping->host) >> huge_page_shift(h); |
935 | if (idx >= size) | 1915 | if (idx >= size) |
936 | goto out; | 1916 | goto out; |
937 | page = alloc_huge_page(vma, address); | 1917 | page = alloc_huge_page(vma, address, 0); |
938 | if (IS_ERR(page)) { | 1918 | if (IS_ERR(page)) { |
939 | ret = -PTR_ERR(page); | 1919 | ret = -PTR_ERR(page); |
940 | goto out; | 1920 | goto out; |
941 | } | 1921 | } |
942 | clear_huge_page(page, address); | 1922 | clear_huge_page(page, address, huge_page_size(h)); |
943 | __SetPageUptodate(page); | 1923 | __SetPageUptodate(page); |
944 | 1924 | ||
945 | if (vma->vm_flags & VM_SHARED) { | 1925 | if (vma->vm_flags & VM_SHARED) { |
@@ -955,14 +1935,26 @@ retry: | |||
955 | } | 1935 | } |
956 | 1936 | ||
957 | spin_lock(&inode->i_lock); | 1937 | spin_lock(&inode->i_lock); |
958 | inode->i_blocks += BLOCKS_PER_HUGEPAGE; | 1938 | inode->i_blocks += blocks_per_huge_page(h); |
959 | spin_unlock(&inode->i_lock); | 1939 | spin_unlock(&inode->i_lock); |
960 | } else | 1940 | } else |
961 | lock_page(page); | 1941 | lock_page(page); |
962 | } | 1942 | } |
963 | 1943 | ||
1944 | /* | ||
1945 | * If we are going to COW a private mapping later, we examine the | ||
1946 | * pending reservations for this page now. This will ensure that | ||
1947 | * any allocations necessary to record that reservation occur outside | ||
1948 | * the spinlock. | ||
1949 | */ | ||
1950 | if (write_access && !(vma->vm_flags & VM_SHARED)) | ||
1951 | if (vma_needs_reservation(h, vma, address) < 0) { | ||
1952 | ret = VM_FAULT_OOM; | ||
1953 | goto backout_unlocked; | ||
1954 | } | ||
1955 | |||
964 | spin_lock(&mm->page_table_lock); | 1956 | spin_lock(&mm->page_table_lock); |
965 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; | 1957 | size = i_size_read(mapping->host) >> huge_page_shift(h); |
966 | if (idx >= size) | 1958 | if (idx >= size) |
967 | goto backout; | 1959 | goto backout; |
968 | 1960 | ||
@@ -976,7 +1968,7 @@ retry: | |||
976 | 1968 | ||
977 | if (write_access && !(vma->vm_flags & VM_SHARED)) { | 1969 | if (write_access && !(vma->vm_flags & VM_SHARED)) { |
978 | /* Optimization, do the COW without a second fault */ | 1970 | /* Optimization, do the COW without a second fault */ |
979 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte); | 1971 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); |
980 | } | 1972 | } |
981 | 1973 | ||
982 | spin_unlock(&mm->page_table_lock); | 1974 | spin_unlock(&mm->page_table_lock); |
@@ -986,6 +1978,7 @@ out: | |||
986 | 1978 | ||
987 | backout: | 1979 | backout: |
988 | spin_unlock(&mm->page_table_lock); | 1980 | spin_unlock(&mm->page_table_lock); |
1981 | backout_unlocked: | ||
989 | unlock_page(page); | 1982 | unlock_page(page); |
990 | put_page(page); | 1983 | put_page(page); |
991 | goto out; | 1984 | goto out; |
@@ -997,9 +1990,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
997 | pte_t *ptep; | 1990 | pte_t *ptep; |
998 | pte_t entry; | 1991 | pte_t entry; |
999 | int ret; | 1992 | int ret; |
1993 | struct page *pagecache_page = NULL; | ||
1000 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); | 1994 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); |
1995 | struct hstate *h = hstate_vma(vma); | ||
1001 | 1996 | ||
1002 | ptep = huge_pte_alloc(mm, address); | 1997 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
1003 | if (!ptep) | 1998 | if (!ptep) |
1004 | return VM_FAULT_OOM; | 1999 | return VM_FAULT_OOM; |
1005 | 2000 | ||
@@ -1012,23 +2007,79 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1012 | entry = huge_ptep_get(ptep); | 2007 | entry = huge_ptep_get(ptep); |
1013 | if (huge_pte_none(entry)) { | 2008 | if (huge_pte_none(entry)) { |
1014 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); | 2009 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); |
1015 | mutex_unlock(&hugetlb_instantiation_mutex); | 2010 | goto out_mutex; |
1016 | return ret; | ||
1017 | } | 2011 | } |
1018 | 2012 | ||
1019 | ret = 0; | 2013 | ret = 0; |
1020 | 2014 | ||
2015 | /* | ||
2016 | * If we are going to COW the mapping later, we examine the pending | ||
2017 | * reservations for this page now. This will ensure that any | ||
2018 | * allocations necessary to record that reservation occur outside the | ||
2019 | * spinlock. For private mappings, we also lookup the pagecache | ||
2020 | * page now as it is used to determine if a reservation has been | ||
2021 | * consumed. | ||
2022 | */ | ||
2023 | if (write_access && !pte_write(entry)) { | ||
2024 | if (vma_needs_reservation(h, vma, address) < 0) { | ||
2025 | ret = VM_FAULT_OOM; | ||
2026 | goto out_mutex; | ||
2027 | } | ||
2028 | |||
2029 | if (!(vma->vm_flags & VM_SHARED)) | ||
2030 | pagecache_page = hugetlbfs_pagecache_page(h, | ||
2031 | vma, address); | ||
2032 | } | ||
2033 | |||
1021 | spin_lock(&mm->page_table_lock); | 2034 | spin_lock(&mm->page_table_lock); |
1022 | /* Check for a racing update before calling hugetlb_cow */ | 2035 | /* Check for a racing update before calling hugetlb_cow */ |
1023 | if (likely(pte_same(entry, huge_ptep_get(ptep)))) | 2036 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) |
1024 | if (write_access && !pte_write(entry)) | 2037 | goto out_page_table_lock; |
1025 | ret = hugetlb_cow(mm, vma, address, ptep, entry); | 2038 | |
2039 | |||
2040 | if (write_access) { | ||
2041 | if (!pte_write(entry)) { | ||
2042 | ret = hugetlb_cow(mm, vma, address, ptep, entry, | ||
2043 | pagecache_page); | ||
2044 | goto out_page_table_lock; | ||
2045 | } | ||
2046 | entry = pte_mkdirty(entry); | ||
2047 | } | ||
2048 | entry = pte_mkyoung(entry); | ||
2049 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) | ||
2050 | update_mmu_cache(vma, address, entry); | ||
2051 | |||
2052 | out_page_table_lock: | ||
1026 | spin_unlock(&mm->page_table_lock); | 2053 | spin_unlock(&mm->page_table_lock); |
2054 | |||
2055 | if (pagecache_page) { | ||
2056 | unlock_page(pagecache_page); | ||
2057 | put_page(pagecache_page); | ||
2058 | } | ||
2059 | |||
2060 | out_mutex: | ||
1027 | mutex_unlock(&hugetlb_instantiation_mutex); | 2061 | mutex_unlock(&hugetlb_instantiation_mutex); |
1028 | 2062 | ||
1029 | return ret; | 2063 | return ret; |
1030 | } | 2064 | } |
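hugetlb_fault() above orders its work so that anything that may allocate (the reservation bookkeeping, the page-cache lookup) happens under the instantiation mutex but before mm->page_table_lock is taken, and the error labels unwind in reverse order. A simplified userspace sketch of that ordering using pthread mutexes as stand-ins (not the kernel locking primitives):

#include <stdio.h>
#include <pthread.h>

/* Illustrative stand-ins for the ordering in hugetlb_fault(): work that
 * may allocate runs before the page-table lock, and unwinding releases
 * locks in reverse order via labels. */
static pthread_mutex_t instantiation_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t page_table_lock     = PTHREAD_MUTEX_INITIALIZER;

static int reserve_may_allocate(void) { return 0; }	/* 0 = success */

static int fault_sketch(int write_access)
{
	int ret = 0;

	pthread_mutex_lock(&instantiation_mutex);
	if (write_access && reserve_may_allocate() < 0) {
		ret = -1;				/* would be VM_FAULT_OOM */
		goto out_mutex;
	}

	pthread_mutex_lock(&page_table_lock);
	/* ... pte re-check and COW decision would happen here ... */
	pthread_mutex_unlock(&page_table_lock);

out_mutex:
	pthread_mutex_unlock(&instantiation_mutex);
	return ret;
}

int main(void)
{
	printf("fault: %d\n", fault_sketch(1));
	return 0;
}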
1031 | 2065 | ||
2066 | /* Can be overridden by architectures */ | ||
2067 | __attribute__((weak)) struct page * | ||
2068 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | ||
2069 | pud_t *pud, int write) | ||
2070 | { | ||
2071 | BUG(); | ||
2072 | return NULL; | ||
2073 | } | ||
2074 | |||
2075 | static int huge_zeropage_ok(pte_t *ptep, int write, int shared) | ||
2076 | { | ||
2077 | if (!ptep || write || shared) | ||
2078 | return 0; | ||
2079 | else | ||
2080 | return huge_pte_none(huge_ptep_get(ptep)); | ||
2081 | } | ||
2082 | |||
1032 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2083 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1033 | struct page **pages, struct vm_area_struct **vmas, | 2084 | struct page **pages, struct vm_area_struct **vmas, |
1034 | unsigned long *position, int *length, int i, | 2085 | unsigned long *position, int *length, int i, |
@@ -1037,6 +2088,9 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1037 | unsigned long pfn_offset; | 2088 | unsigned long pfn_offset; |
1038 | unsigned long vaddr = *position; | 2089 | unsigned long vaddr = *position; |
1039 | int remainder = *length; | 2090 | int remainder = *length; |
2091 | struct hstate *h = hstate_vma(vma); | ||
2092 | int zeropage_ok = 0; | ||
2093 | int shared = vma->vm_flags & VM_SHARED; | ||
1040 | 2094 | ||
1041 | spin_lock(&mm->page_table_lock); | 2095 | spin_lock(&mm->page_table_lock); |
1042 | while (vaddr < vma->vm_end && remainder) { | 2096 | while (vaddr < vma->vm_end && remainder) { |
@@ -1048,9 +2102,12 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1048 | * each hugepage. We have to make * sure we get the | 2102 | * each hugepage. We have to make * sure we get the |
1049 | * first, for the page indexing below to work. | 2103 | * first, for the page indexing below to work. |
1050 | */ | 2104 | */ |
1051 | pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); | 2105 | pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); |
2106 | if (huge_zeropage_ok(pte, write, shared)) | ||
2107 | zeropage_ok = 1; | ||
1052 | 2108 | ||
1053 | if (!pte || huge_pte_none(huge_ptep_get(pte)) || | 2109 | if (!pte || |
2110 | (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || | ||
1054 | (write && !pte_write(huge_ptep_get(pte)))) { | 2111 | (write && !pte_write(huge_ptep_get(pte)))) { |
1055 | int ret; | 2112 | int ret; |
1056 | 2113 | ||
@@ -1066,12 +2123,15 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1066 | break; | 2123 | break; |
1067 | } | 2124 | } |
1068 | 2125 | ||
1069 | pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; | 2126 | pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; |
1070 | page = pte_page(huge_ptep_get(pte)); | 2127 | page = pte_page(huge_ptep_get(pte)); |
1071 | same_page: | 2128 | same_page: |
1072 | if (pages) { | 2129 | if (pages) { |
1073 | get_page(page); | 2130 | if (zeropage_ok) |
1074 | pages[i] = page + pfn_offset; | 2131 | pages[i] = ZERO_PAGE(0); |
2132 | else | ||
2133 | pages[i] = page + pfn_offset; | ||
2134 | get_page(pages[i]); | ||
1075 | } | 2135 | } |
1076 | 2136 | ||
1077 | if (vmas) | 2137 | if (vmas) |
@@ -1082,7 +2142,7 @@ same_page: | |||
1082 | --remainder; | 2142 | --remainder; |
1083 | ++i; | 2143 | ++i; |
1084 | if (vaddr < vma->vm_end && remainder && | 2144 | if (vaddr < vma->vm_end && remainder && |
1085 | pfn_offset < HPAGE_SIZE/PAGE_SIZE) { | 2145 | pfn_offset < pages_per_huge_page(h)) { |
1086 | /* | 2146 | /* |
1087 | * We use pfn_offset to avoid touching the pageframes | 2147 | * We use pfn_offset to avoid touching the pageframes |
1088 | * of this compound page. | 2148 | * of this compound page. |
@@ -1104,13 +2164,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
1104 | unsigned long start = address; | 2164 | unsigned long start = address; |
1105 | pte_t *ptep; | 2165 | pte_t *ptep; |
1106 | pte_t pte; | 2166 | pte_t pte; |
2167 | struct hstate *h = hstate_vma(vma); | ||
1107 | 2168 | ||
1108 | BUG_ON(address >= end); | 2169 | BUG_ON(address >= end); |
1109 | flush_cache_range(vma, address, end); | 2170 | flush_cache_range(vma, address, end); |
1110 | 2171 | ||
1111 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); | 2172 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); |
1112 | spin_lock(&mm->page_table_lock); | 2173 | spin_lock(&mm->page_table_lock); |
1113 | for (; address < end; address += HPAGE_SIZE) { | 2174 | for (; address < end; address += huge_page_size(h)) { |
1114 | ptep = huge_pte_offset(mm, address); | 2175 | ptep = huge_pte_offset(mm, address); |
1115 | if (!ptep) | 2176 | if (!ptep) |
1116 | continue; | 2177 | continue; |
@@ -1128,195 +2189,59 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
1128 | flush_tlb_range(vma, start, end); | 2189 | flush_tlb_range(vma, start, end); |
1129 | } | 2190 | } |
1130 | 2191 | ||
1131 | struct file_region { | 2192 | int hugetlb_reserve_pages(struct inode *inode, |
1132 | struct list_head link; | 2193 | long from, long to, |
1133 | long from; | 2194 | struct vm_area_struct *vma) |
1134 | long to; | ||
1135 | }; | ||
1136 | |||
1137 | static long region_add(struct list_head *head, long f, long t) | ||
1138 | { | ||
1139 | struct file_region *rg, *nrg, *trg; | ||
1140 | |||
1141 | /* Locate the region we are either in or before. */ | ||
1142 | list_for_each_entry(rg, head, link) | ||
1143 | if (f <= rg->to) | ||
1144 | break; | ||
1145 | |||
1146 | /* Round our left edge to the current segment if it encloses us. */ | ||
1147 | if (f > rg->from) | ||
1148 | f = rg->from; | ||
1149 | |||
1150 | /* Check for and consume any regions we now overlap with. */ | ||
1151 | nrg = rg; | ||
1152 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
1153 | if (&rg->link == head) | ||
1154 | break; | ||
1155 | if (rg->from > t) | ||
1156 | break; | ||
1157 | |||
1158 | /* If this area reaches higher then extend our area to | ||
1159 | * include it completely. If this is not the first area | ||
1160 | * which we intend to reuse, free it. */ | ||
1161 | if (rg->to > t) | ||
1162 | t = rg->to; | ||
1163 | if (rg != nrg) { | ||
1164 | list_del(&rg->link); | ||
1165 | kfree(rg); | ||
1166 | } | ||
1167 | } | ||
1168 | nrg->from = f; | ||
1169 | nrg->to = t; | ||
1170 | return 0; | ||
1171 | } | ||
1172 | |||
1173 | static long region_chg(struct list_head *head, long f, long t) | ||
1174 | { | ||
1175 | struct file_region *rg, *nrg; | ||
1176 | long chg = 0; | ||
1177 | |||
1178 | /* Locate the region we are before or in. */ | ||
1179 | list_for_each_entry(rg, head, link) | ||
1180 | if (f <= rg->to) | ||
1181 | break; | ||
1182 | |||
1183 | /* If we are below the current region then a new region is required. | ||
1184 | * Subtle, allocate a new region at the position but make it zero | ||
1185 | * size such that we can guarantee to record the reservation. */ | ||
1186 | if (&rg->link == head || t < rg->from) { | ||
1187 | nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); | ||
1188 | if (!nrg) | ||
1189 | return -ENOMEM; | ||
1190 | nrg->from = f; | ||
1191 | nrg->to = f; | ||
1192 | INIT_LIST_HEAD(&nrg->link); | ||
1193 | list_add(&nrg->link, rg->link.prev); | ||
1194 | |||
1195 | return t - f; | ||
1196 | } | ||
1197 | |||
1198 | /* Round our left edge to the current segment if it encloses us. */ | ||
1199 | if (f > rg->from) | ||
1200 | f = rg->from; | ||
1201 | chg = t - f; | ||
1202 | |||
1203 | /* Check for and consume any regions we now overlap with. */ | ||
1204 | list_for_each_entry(rg, rg->link.prev, link) { | ||
1205 | if (&rg->link == head) | ||
1206 | break; | ||
1207 | if (rg->from > t) | ||
1208 | return chg; | ||
1209 | |||
1210 | /* We overlap with this area, if it extends futher than | ||
1211 | * us then we must extend ourselves. Account for its | ||
1212 | * existing reservation. */ | ||
1213 | if (rg->to > t) { | ||
1214 | chg += rg->to - t; | ||
1215 | t = rg->to; | ||
1216 | } | ||
1217 | chg -= rg->to - rg->from; | ||
1218 | } | ||
1219 | return chg; | ||
1220 | } | ||
1221 | |||
1222 | static long region_truncate(struct list_head *head, long end) | ||
1223 | { | 2195 | { |
1224 | struct file_region *rg, *trg; | 2196 | long ret, chg; |
1225 | long chg = 0; | 2197 | struct hstate *h = hstate_inode(inode); |
1226 | 2198 | ||
1227 | /* Locate the region we are either in or before. */ | 2199 | if (vma && vma->vm_flags & VM_NORESERVE) |
1228 | list_for_each_entry(rg, head, link) | ||
1229 | if (end <= rg->to) | ||
1230 | break; | ||
1231 | if (&rg->link == head) | ||
1232 | return 0; | 2200 | return 0; |
1233 | 2201 | ||
1234 | /* If we are in the middle of a region then adjust it. */ | ||
1235 | if (end > rg->from) { | ||
1236 | chg = rg->to - end; | ||
1237 | rg->to = end; | ||
1238 | rg = list_entry(rg->link.next, typeof(*rg), link); | ||
1239 | } | ||
1240 | |||
1241 | /* Drop any remaining regions. */ | ||
1242 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
1243 | if (&rg->link == head) | ||
1244 | break; | ||
1245 | chg += rg->to - rg->from; | ||
1246 | list_del(&rg->link); | ||
1247 | kfree(rg); | ||
1248 | } | ||
1249 | return chg; | ||
1250 | } | ||
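The file_region helpers removed here (the same interval bookkeeping now lives earlier in the file, feeding the resv_map code) track reservations as a sorted list of half-open [from, to) ranges: region_chg() reports how many pages a request would add on top of existing coverage and region_add() commits it. A toy example of just the charge arithmetic for a single pre-existing region, to make the overlap accounting concrete:

#include <stdio.h>

/* Toy version of the charge computation for one existing region
 * [efrom, eto) and one request [f, t): only pages not already
 * reserved are charged. */
static long charge(long efrom, long eto, long f, long t)
{
	long overlap_from = f > efrom ? f : efrom;
	long overlap_to   = t < eto   ? t : eto;
	long overlap      = overlap_to > overlap_from ?
					overlap_to - overlap_from : 0;
	return (t - f) - overlap;
}

int main(void)
{
	/* An existing reservation covers [4, 8); a request for [2, 10)
	 * is charged 8 - 4 = 4 additional huge pages. */
	printf("charge = %ld\n", charge(4, 8, 2, 10));
	return 0;
}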
1251 | |||
1252 | static int hugetlb_acct_memory(long delta) | ||
1253 | { | ||
1254 | int ret = -ENOMEM; | ||
1255 | |||
1256 | spin_lock(&hugetlb_lock); | ||
1257 | /* | 2202 | /* |
1258 | * When cpuset is configured, it breaks the strict hugetlb page | 2203 | * Shared mappings base their reservation on the number of pages that |
1259 | * reservation as the accounting is done on a global variable. Such | 2204 | * are already allocated on behalf of the file. Private mappings need |
1260 | * reservation is completely rubbish in the presence of cpuset because | 2205 | * to reserve the full area even if read-only as mprotect() may be |
1261 | * the reservation is not checked against page availability for the | 2206 | * called to make the mapping read-write. Assume !vma is a shm mapping |
1262 | * current cpuset. Application can still potentially OOM'ed by kernel | ||
1263 | * with lack of free htlb page in cpuset that the task is in. | ||
1264 | * Attempt to enforce strict accounting with cpuset is almost | ||
1265 | * impossible (or too ugly) because cpuset is too fluid that | ||
1266 | * task or memory node can be dynamically moved between cpusets. | ||
1267 | * | ||
1268 | * The change of semantics for shared hugetlb mapping with cpuset is | ||
1269 | * undesirable. However, in order to preserve some of the semantics, | ||
1270 | * we fall back to check against current free page availability as | ||
1271 | * a best attempt and hopefully to minimize the impact of changing | ||
1272 | * semantics that cpuset has. | ||
1273 | */ | 2207 | */ |
1274 | if (delta > 0) { | 2208 | if (!vma || vma->vm_flags & VM_SHARED) |
1275 | if (gather_surplus_pages(delta) < 0) | 2209 | chg = region_chg(&inode->i_mapping->private_list, from, to); |
1276 | goto out; | 2210 | else { |
1277 | 2211 | struct resv_map *resv_map = resv_map_alloc(); | |
1278 | if (delta > cpuset_mems_nr(free_huge_pages_node)) { | 2212 | if (!resv_map) |
1279 | return_unused_surplus_pages(delta); | 2213 | return -ENOMEM; |
1280 | goto out; | ||
1281 | } | ||
1282 | } | ||
1283 | 2214 | ||
1284 | ret = 0; | 2215 | chg = to - from; |
1285 | if (delta < 0) | ||
1286 | return_unused_surplus_pages((unsigned long) -delta); | ||
1287 | 2216 | ||
1288 | out: | 2217 | set_vma_resv_map(vma, resv_map); |
1289 | spin_unlock(&hugetlb_lock); | 2218 | set_vma_resv_flags(vma, HPAGE_RESV_OWNER); |
1290 | return ret; | 2219 | } |
1291 | } | ||
1292 | |||
1293 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | ||
1294 | { | ||
1295 | long ret, chg; | ||
1296 | 2220 | ||
1297 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
1298 | if (chg < 0) | 2221 | if (chg < 0) |
1299 | return chg; | 2222 | return chg; |
1300 | 2223 | ||
1301 | if (hugetlb_get_quota(inode->i_mapping, chg)) | 2224 | if (hugetlb_get_quota(inode->i_mapping, chg)) |
1302 | return -ENOSPC; | 2225 | return -ENOSPC; |
1303 | ret = hugetlb_acct_memory(chg); | 2226 | ret = hugetlb_acct_memory(h, chg); |
1304 | if (ret < 0) { | 2227 | if (ret < 0) { |
1305 | hugetlb_put_quota(inode->i_mapping, chg); | 2228 | hugetlb_put_quota(inode->i_mapping, chg); |
1306 | return ret; | 2229 | return ret; |
1307 | } | 2230 | } |
1308 | region_add(&inode->i_mapping->private_list, from, to); | 2231 | if (!vma || vma->vm_flags & VM_SHARED) |
2232 | region_add(&inode->i_mapping->private_list, from, to); | ||
1309 | return 0; | 2233 | return 0; |
1310 | } | 2234 | } |
1311 | 2235 | ||
1312 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | 2236 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) |
1313 | { | 2237 | { |
2238 | struct hstate *h = hstate_inode(inode); | ||
1314 | long chg = region_truncate(&inode->i_mapping->private_list, offset); | 2239 | long chg = region_truncate(&inode->i_mapping->private_list, offset); |
1315 | 2240 | ||
1316 | spin_lock(&inode->i_lock); | 2241 | spin_lock(&inode->i_lock); |
1317 | inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; | 2242 | inode->i_blocks -= blocks_per_huge_page(h); |
1318 | spin_unlock(&inode->i_lock); | 2243 | spin_unlock(&inode->i_lock); |
1319 | 2244 | ||
1320 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); | 2245 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); |
1321 | hugetlb_acct_memory(-(chg - freed)); | 2246 | hugetlb_acct_memory(h, -(chg - freed)); |
1322 | } | 2247 | } |
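hugetlb_reserve_pages() above now splits by mapping type: shared mappings charge only the pages not already covered by the inode's region list, while private mappings reserve the whole range and mark the VMA as the reserve owner. A small hedged sketch of that split (the 'covered' count stands in for what region_chg() would discover):

#include <stdio.h>

/* Hypothetical sketch of the shared/private split: 'covered' is what
 * the inode's region list would already account for. */
static long reservation_charge(int shared, long from, long to, long covered)
{
	if (shared)
		return (to - from) - covered;	/* region_chg()-style delta */
	return to - from;			/* private: reserve everything */
}

int main(void)
{
	printf("shared : %ld\n", reservation_charge(1, 0, 16, 10));	/* 6  */
	printf("private: %ld\n", reservation_charge(0, 0, 16, 10));	/* 16 */
	return 0;
}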
diff --git a/mm/internal.h b/mm/internal.h
index 0034e947e4bc..e4e728bdf324 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,11 @@ | |||
13 | 13 | ||
14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
15 | 15 | ||
16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | ||
17 | unsigned long floor, unsigned long ceiling); | ||
18 | |||
19 | extern void prep_compound_page(struct page *page, unsigned long order); | ||
20 | |||
16 | static inline void set_page_count(struct page *page, int v) | 21 | static inline void set_page_count(struct page *page, int v) |
17 | { | 22 | { |
18 | atomic_set(&page->_count, v); | 23 | atomic_set(&page->_count, v); |
@@ -34,6 +39,15 @@ static inline void __put_page(struct page *page) | |||
34 | atomic_dec(&page->_count); | 39 | atomic_dec(&page->_count); |
35 | } | 40 | } |
36 | 41 | ||
42 | /* | ||
43 | * in mm/vmscan.c: | ||
44 | */ | ||
45 | extern int isolate_lru_page(struct page *page); | ||
46 | extern void putback_lru_page(struct page *page); | ||
47 | |||
48 | /* | ||
49 | * in mm/page_alloc.c | ||
50 | */ | ||
37 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
38 | 52 | ||
39 | /* | 53 | /* |
@@ -47,6 +61,120 @@ static inline unsigned long page_order(struct page *page) | |||
47 | return page_private(page); | 61 | return page_private(page); |
48 | } | 62 | } |
49 | 63 | ||
64 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, | ||
65 | unsigned long start, unsigned long end); | ||
66 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, | ||
67 | unsigned long start, unsigned long end); | ||
68 | static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | ||
69 | { | ||
70 | munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); | ||
71 | } | ||
72 | |||
73 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
74 | /* | ||
75 | * unevictable_migrate_page() is called only from migrate_page_copy() to | ||
76 | * migrate the unevictable flag to the new page. | ||
77 | * Note that the old page has been isolated from the LRU lists at this | ||
78 | * point so we don't need to worry about LRU statistics. | ||
79 | */ | ||
80 | static inline void unevictable_migrate_page(struct page *new, struct page *old) | ||
81 | { | ||
82 | if (TestClearPageUnevictable(old)) | ||
83 | SetPageUnevictable(new); | ||
84 | } | ||
85 | #else | ||
86 | static inline void unevictable_migrate_page(struct page *new, struct page *old) | ||
87 | { | ||
88 | } | ||
89 | #endif | ||
90 | |||
91 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
92 | /* | ||
93 | * Called only in fault path via page_evictable() for a new page | ||
94 | * to determine if it's being mapped into a LOCKED vma. | ||
95 | * If so, mark page as mlocked. | ||
96 | */ | ||
97 | static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) | ||
98 | { | ||
99 | VM_BUG_ON(PageLRU(page)); | ||
100 | |||
101 | if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) | ||
102 | return 0; | ||
103 | |||
104 | if (!TestSetPageMlocked(page)) { | ||
105 | inc_zone_page_state(page, NR_MLOCK); | ||
106 | count_vm_event(UNEVICTABLE_PGMLOCKED); | ||
107 | } | ||
108 | return 1; | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | * must be called with vma's mmap_sem held for read, and page locked. | ||
113 | */ | ||
114 | extern void mlock_vma_page(struct page *page); | ||
115 | |||
116 | /* | ||
117 | * Clear the page's PageMlocked(). This can be useful in a situation where | ||
118 | * we want to unconditionally remove a page from the pagecache -- e.g., | ||
119 | * on truncation or freeing. | ||
120 | * | ||
121 | * It is legal to call this function for any page, mlocked or not. | ||
122 | * If called for a page that is still mapped by mlocked vmas, all we do | ||
123 | * is revert to lazy LRU behaviour -- semantics are not broken. | ||
124 | */ | ||
125 | extern void __clear_page_mlock(struct page *page); | ||
126 | static inline void clear_page_mlock(struct page *page) | ||
127 | { | ||
128 | if (unlikely(TestClearPageMlocked(page))) | ||
129 | __clear_page_mlock(page); | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * mlock_migrate_page - called only from migrate_page_copy() to | ||
134 | * migrate the Mlocked page flag; update statistics. | ||
135 | */ | ||
136 | static inline void mlock_migrate_page(struct page *newpage, struct page *page) | ||
137 | { | ||
138 | if (TestClearPageMlocked(page)) { | ||
139 | unsigned long flags; | ||
140 | |||
141 | local_irq_save(flags); | ||
142 | __dec_zone_page_state(page, NR_MLOCK); | ||
143 | SetPageMlocked(newpage); | ||
144 | __inc_zone_page_state(newpage, NR_MLOCK); | ||
145 | local_irq_restore(flags); | ||
146 | } | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * free_page_mlock() -- clean up attempts to free an mlocked page. | ||
151 | * Page should not be on lru, so no need to fix that up. | ||
152 | * free_pages_check() will verify... | ||
153 | */ | ||
154 | static inline void free_page_mlock(struct page *page) | ||
155 | { | ||
156 | if (unlikely(TestClearPageMlocked(page))) { | ||
157 | unsigned long flags; | ||
158 | |||
159 | local_irq_save(flags); | ||
160 | __dec_zone_page_state(page, NR_MLOCK); | ||
161 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
162 | local_irq_restore(flags); | ||
163 | } | ||
164 | } | ||
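The mlock helpers added above all follow one pattern: atomically test-and-clear (or test-and-set) the per-page Mlocked flag, and only the caller that actually flipped the bit touches the NR_MLOCK counter, so flag and statistic cannot drift apart. A userspace analogue of that pattern using GCC atomic builtins (illustrative, not the kernel page-flag API):

#include <stdio.h>

/* Userspace analogue of the TestClearPageMlocked() + counter pattern:
 * only the caller that actually clears the bit adjusts the statistic. */
static unsigned long page_flags = 1UL;	/* bit 0 plays the Mlocked bit */
static long nr_mlock = 1;

static void free_page_mlock_sketch(void)
{
	/* Atomically clear bit 0 and learn whether it was set before. */
	if (__atomic_fetch_and(&page_flags, ~1UL, __ATOMIC_SEQ_CST) & 1UL)
		__atomic_fetch_sub(&nr_mlock, 1, __ATOMIC_SEQ_CST);
}

int main(void)
{
	free_page_mlock_sketch();
	free_page_mlock_sketch();		/* second call is a no-op */
	printf("NR_MLOCK = %ld\n", nr_mlock);	/* prints 0, not -1 */
	return 0;
}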
165 | |||
166 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
167 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | ||
168 | { | ||
169 | return 0; | ||
170 | } | ||
171 | static inline void clear_page_mlock(struct page *page) { } | ||
172 | static inline void mlock_vma_page(struct page *page) { } | ||
173 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } | ||
174 | static inline void free_page_mlock(struct page *page) { } | ||
175 | |||
176 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
177 | |||
50 | /* | 178 | /* |
51 | * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, | 179 | * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, |
52 | * so all functions starting at paging_init should be marked __init | 180 | * so all functions starting at paging_init should be marked __init |
@@ -59,4 +187,68 @@ static inline unsigned long page_order(struct page *page) | |||
59 | #define __paginginit __init | 187 | #define __paginginit __init |
60 | #endif | 188 | #endif |
61 | 189 | ||
190 | /* Memory initialisation debug and verification */ | ||
191 | enum mminit_level { | ||
192 | MMINIT_WARNING, | ||
193 | MMINIT_VERIFY, | ||
194 | MMINIT_TRACE | ||
195 | }; | ||
196 | |||
197 | #ifdef CONFIG_DEBUG_MEMORY_INIT | ||
198 | |||
199 | extern int mminit_loglevel; | ||
200 | |||
201 | #define mminit_dprintk(level, prefix, fmt, arg...) \ | ||
202 | do { \ | ||
203 | if (level < mminit_loglevel) { \ | ||
204 | printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ | ||
205 | printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ | ||
206 | } \ | ||
207 | } while (0) | ||
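mminit_dprintk() above emits a message only when its level is strictly below mminit_loglevel, choosing KERN_WARNING or KERN_DEBUG by severity. A hedged userspace rework of the same filtering, useful for seeing which calls would print at a given loglevel (names and output format are illustrative):

#include <stdio.h>

enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE };

static int mminit_loglevel = MMINIT_VERIFY;	/* levels below this print */

/* Userspace rework of the filter: only levels strictly below the
 * configured loglevel are emitted; warnings get a louder prefix. */
#define mminit_dprintk(level, prefix, fmt, ...)				\
do {									\
	if ((level) < mminit_loglevel)					\
		fprintf(stderr, "%smminit::" prefix " " fmt,		\
			(level) <= MMINIT_WARNING ? "WARN " : "dbg ",	\
			##__VA_ARGS__);					\
} while (0)

int main(void)
{
	mminit_dprintk(MMINIT_WARNING, "pageflags", "bad layout on node %d\n", 0);
	mminit_dprintk(MMINIT_TRACE, "zonelist", "not printed at this level\n");
	return 0;
}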
208 | |||
209 | extern void mminit_verify_pageflags_layout(void); | ||
210 | extern void mminit_verify_page_links(struct page *page, | ||
211 | enum zone_type zone, unsigned long nid, unsigned long pfn); | ||
212 | extern void mminit_verify_zonelist(void); | ||
213 | |||
214 | #else | ||
215 | |||
216 | static inline void mminit_dprintk(enum mminit_level level, | ||
217 | const char *prefix, const char *fmt, ...) | ||
218 | { | ||
219 | } | ||
220 | |||
221 | static inline void mminit_verify_pageflags_layout(void) | ||
222 | { | ||
223 | } | ||
224 | |||
225 | static inline void mminit_verify_page_links(struct page *page, | ||
226 | enum zone_type zone, unsigned long nid, unsigned long pfn) | ||
227 | { | ||
228 | } | ||
229 | |||
230 | static inline void mminit_verify_zonelist(void) | ||
231 | { | ||
232 | } | ||
233 | #endif /* CONFIG_DEBUG_MEMORY_INIT */ | ||
234 | |||
235 | /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ | ||
236 | #if defined(CONFIG_SPARSEMEM) | ||
237 | extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, | ||
238 | unsigned long *end_pfn); | ||
239 | #else | ||
240 | static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | ||
241 | unsigned long *end_pfn) | ||
242 | { | ||
243 | } | ||
244 | #endif /* CONFIG_SPARSEMEM */ | ||
245 | |||
246 | #define GUP_FLAGS_WRITE 0x1 | ||
247 | #define GUP_FLAGS_FORCE 0x2 | ||
248 | #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 | ||
249 | |||
250 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
251 | unsigned long start, int len, int flags, | ||
252 | struct page **pages, struct vm_area_struct **vmas); | ||
253 | |||
62 | #endif | 254 | #endif |
diff --git a/mm/madvise.c b/mm/madvise.c
index 23a0ec3e0ea0..f9349c18a1b5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -132,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
132 | * Application no longer needs these pages. If the pages are dirty, | 132 | * Application no longer needs these pages. If the pages are dirty, |
133 | * it's OK to just throw them away. The app will be more careful about | 133 | * it's OK to just throw them away. The app will be more careful about |
134 | * data it wants to keep. Be sure to free swap resources too. The | 134 | * data it wants to keep. Be sure to free swap resources too. The |
135 | * zap_page_range call sets things up for refill_inactive to actually free | 135 | * zap_page_range call sets things up for shrink_active_list to actually free |
136 | * these pages later if no one else has touched them in the meantime, | 136 | * these pages later if no one else has touched them in the meantime, |
137 | * although we could add these pages to a global reuse list for | 137 | * although we could add these pages to a global reuse list for |
138 | * refill_inactive to pick up before reclaiming other pages. | 138 | * shrink_active_list to pick up before reclaiming other pages. |
139 | * | 139 | * |
140 | * NB: This interface discards data rather than pushes it out to swap, | 140 | * NB: This interface discards data rather than pushes it out to swap, |
141 | * as some implementations do. This has performance implications for | 141 | * as some implementations do. This has performance implications for |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e46451e1d9b7..866dcc7eeb0c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -32,12 +32,13 @@ | |||
32 | #include <linux/fs.h> | 32 | #include <linux/fs.h> |
33 | #include <linux/seq_file.h> | 33 | #include <linux/seq_file.h> |
34 | #include <linux/vmalloc.h> | 34 | #include <linux/vmalloc.h> |
35 | #include <linux/mm_inline.h> | ||
36 | #include <linux/page_cgroup.h> | ||
35 | 37 | ||
36 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
37 | 39 | ||
38 | struct cgroup_subsys mem_cgroup_subsys; | 40 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
39 | static const int MEM_CGROUP_RECLAIM_RETRIES = 5; | 41 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
40 | static struct kmem_cache *page_cgroup_cache; | ||
41 | 42 | ||
42 | /* | 43 | /* |
43 | * Statistics for memory cgroup. | 44 | * Statistics for memory cgroup. |
@@ -65,11 +66,10 @@ struct mem_cgroup_stat { | |||
65 | /* | 66 | /* |
66 | * For accounting under irq disable, no need for increment preempt count. | 67 | * For accounting under irq disable, no need for increment preempt count. |
67 | */ | 68 | */ |
68 | static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, | 69 | static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, |
69 | enum mem_cgroup_stat_index idx, int val) | 70 | enum mem_cgroup_stat_index idx, int val) |
70 | { | 71 | { |
71 | int cpu = smp_processor_id(); | 72 | stat->count[idx] += val; |
72 | stat->cpustat[cpu].count[idx] += val; | ||
73 | } | 73 | } |
74 | 74 | ||
75 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | 75 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, |
@@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | |||
85 | /* | 85 | /* |
86 | * per-zone information in memory controller. | 86 | * per-zone information in memory controller. |
87 | */ | 87 | */ |
88 | |||
89 | enum mem_cgroup_zstat_index { | ||
90 | MEM_CGROUP_ZSTAT_ACTIVE, | ||
91 | MEM_CGROUP_ZSTAT_INACTIVE, | ||
92 | |||
93 | NR_MEM_CGROUP_ZSTAT, | ||
94 | }; | ||
95 | |||
96 | struct mem_cgroup_per_zone { | 88 | struct mem_cgroup_per_zone { |
97 | /* | 89 | /* |
98 | * spin_lock to protect the per cgroup LRU | 90 | * spin_lock to protect the per cgroup LRU |
99 | */ | 91 | */ |
100 | spinlock_t lru_lock; | 92 | spinlock_t lru_lock; |
101 | struct list_head active_list; | 93 | struct list_head lists[NR_LRU_LISTS]; |
102 | struct list_head inactive_list; | 94 | unsigned long count[NR_LRU_LISTS]; |
103 | unsigned long count[NR_MEM_CGROUP_ZSTAT]; | ||
104 | }; | 95 | }; |
105 | /* Macro for accessing counter */ | 96 | /* Macro for accessing counter */ |
106 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | 97 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) |
@@ -144,69 +135,52 @@ struct mem_cgroup { | |||
144 | }; | 135 | }; |
145 | static struct mem_cgroup init_mem_cgroup; | 136 | static struct mem_cgroup init_mem_cgroup; |
146 | 137 | ||
147 | /* | ||
148 | * We use the lower bit of the page->page_cgroup pointer as a bit spin | ||
149 | * lock. We need to ensure that page->page_cgroup is at least two | ||
150 | * byte aligned (based on comments from Nick Piggin). But since | ||
151 | * bit_spin_lock doesn't actually set that lock bit in a non-debug | ||
152 | * uniprocessor kernel, we should avoid setting it here too. | ||
153 | */ | ||
154 | #define PAGE_CGROUP_LOCK_BIT 0x0 | ||
155 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) | ||
156 | #define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) | ||
157 | #else | ||
158 | #define PAGE_CGROUP_LOCK 0x0 | ||
159 | #endif | ||
160 | |||
161 | /* | ||
162 | * A page_cgroup page is associated with every page descriptor. The | ||
163 | * page_cgroup helps us identify information about the cgroup | ||
164 | */ | ||
165 | struct page_cgroup { | ||
166 | struct list_head lru; /* per cgroup LRU list */ | ||
167 | struct page *page; | ||
168 | struct mem_cgroup *mem_cgroup; | ||
169 | int ref_cnt; /* cached, mapped, migrating */ | ||
170 | int flags; | ||
171 | }; | ||
172 | #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ | ||
173 | #define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ | ||
174 | |||
175 | static int page_cgroup_nid(struct page_cgroup *pc) | ||
176 | { | ||
177 | return page_to_nid(pc->page); | ||
178 | } | ||
179 | |||
180 | static enum zone_type page_cgroup_zid(struct page_cgroup *pc) | ||
181 | { | ||
182 | return page_zonenum(pc->page); | ||
183 | } | ||
184 | |||
185 | enum charge_type { | 138 | enum charge_type { |
186 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 139 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
187 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 140 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
141 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | ||
142 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | ||
143 | NR_CHARGE_TYPE, | ||
144 | }; | ||
145 | |||
146 | /* only for here (for easy reading.) */ | ||
147 | #define PCGF_CACHE (1UL << PCG_CACHE) | ||
148 | #define PCGF_USED (1UL << PCG_USED) | ||
149 | #define PCGF_ACTIVE (1UL << PCG_ACTIVE) | ||
150 | #define PCGF_LOCK (1UL << PCG_LOCK) | ||
151 | #define PCGF_FILE (1UL << PCG_FILE) | ||
152 | static const unsigned long | ||
153 | pcg_default_flags[NR_CHARGE_TYPE] = { | ||
154 | PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ | ||
155 | PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ | ||
156 | PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | ||
157 | 0, /* FORCE */ | ||
188 | }; | 158 | }; |
189 | 159 | ||
190 | /* | 160 | /* |
191 | * Always modified under lru lock. Then, not necessary to preempt_disable() | 161 | * Always modified under lru lock. Then, not necessary to preempt_disable() |
192 | */ | 162 | */ |
193 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, | 163 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
194 | bool charge) | 164 | struct page_cgroup *pc, |
165 | bool charge) | ||
195 | { | 166 | { |
196 | int val = (charge)? 1 : -1; | 167 | int val = (charge)? 1 : -1; |
197 | struct mem_cgroup_stat *stat = &mem->stat; | 168 | struct mem_cgroup_stat *stat = &mem->stat; |
169 | struct mem_cgroup_stat_cpu *cpustat; | ||
198 | 170 | ||
199 | VM_BUG_ON(!irqs_disabled()); | 171 | VM_BUG_ON(!irqs_disabled()); |
200 | if (flags & PAGE_CGROUP_FLAG_CACHE) | 172 | |
201 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); | 173 | cpustat = &stat->cpustat[smp_processor_id()]; |
174 | if (PageCgroupCache(pc)) | ||
175 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | ||
202 | else | 176 | else |
203 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); | 177 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); |
204 | 178 | ||
205 | if (charge) | 179 | if (charge) |
206 | __mem_cgroup_stat_add_safe(stat, | 180 | __mem_cgroup_stat_add_safe(cpustat, |
207 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); | 181 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); |
208 | else | 182 | else |
209 | __mem_cgroup_stat_add_safe(stat, | 183 | __mem_cgroup_stat_add_safe(cpustat, |
210 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 184 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
211 | } | 185 | } |
212 | 186 | ||
@@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc) | |||
227 | } | 201 | } |
228 | 202 | ||
229 | static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, | 203 | static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, |
230 | enum mem_cgroup_zstat_index idx) | 204 | enum lru_list idx) |
231 | { | 205 | { |
232 | int nid, zid; | 206 | int nid, zid; |
233 | struct mem_cgroup_per_zone *mz; | 207 | struct mem_cgroup_per_zone *mz; |
@@ -250,89 +224,89 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | |||
250 | 224 | ||
251 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | 225 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
252 | { | 226 | { |
227 | /* | ||
228 | * mm_update_next_owner() may clear mm->owner to NULL | ||
229 | * if it races with swapoff, page migration, etc. | ||
230 | * So this can be called with p == NULL. | ||
231 | */ | ||
232 | if (unlikely(!p)) | ||
233 | return NULL; | ||
234 | |||
253 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | 235 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), |
254 | struct mem_cgroup, css); | 236 | struct mem_cgroup, css); |
255 | } | 237 | } |
256 | 238 | ||
257 | static inline int page_cgroup_locked(struct page *page) | ||
258 | { | ||
259 | return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
260 | } | ||
261 | |||
262 | static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) | ||
263 | { | ||
264 | VM_BUG_ON(!page_cgroup_locked(page)); | ||
265 | page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); | ||
266 | } | ||
267 | |||
268 | struct page_cgroup *page_get_page_cgroup(struct page *page) | ||
269 | { | ||
270 | return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); | ||
271 | } | ||
272 | |||
273 | static void lock_page_cgroup(struct page *page) | ||
274 | { | ||
275 | bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
276 | } | ||
277 | |||
278 | static int try_lock_page_cgroup(struct page *page) | ||
279 | { | ||
280 | return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
281 | } | ||
282 | |||
283 | static void unlock_page_cgroup(struct page *page) | ||
284 | { | ||
285 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
286 | } | ||
287 | |||
288 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, | 239 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, |
289 | struct page_cgroup *pc) | 240 | struct page_cgroup *pc) |
290 | { | 241 | { |
291 | int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | 242 | int lru = LRU_BASE; |
243 | |||
244 | if (PageCgroupUnevictable(pc)) | ||
245 | lru = LRU_UNEVICTABLE; | ||
246 | else { | ||
247 | if (PageCgroupActive(pc)) | ||
248 | lru += LRU_ACTIVE; | ||
249 | if (PageCgroupFile(pc)) | ||
250 | lru += LRU_FILE; | ||
251 | } | ||
292 | 252 | ||
293 | if (from) | 253 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
294 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; | ||
295 | else | ||
296 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; | ||
297 | 254 | ||
298 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); | 255 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); |
299 | list_del_init(&pc->lru); | 256 | list_del(&pc->lru); |
300 | } | 257 | } |
301 | 258 | ||
302 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, | 259 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, |
303 | struct page_cgroup *pc) | 260 | struct page_cgroup *pc) |
304 | { | 261 | { |
305 | int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | 262 | int lru = LRU_BASE; |
306 | 263 | ||
307 | if (!to) { | 264 | if (PageCgroupUnevictable(pc)) |
308 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; | 265 | lru = LRU_UNEVICTABLE; |
309 | list_add(&pc->lru, &mz->inactive_list); | 266 | else { |
310 | } else { | 267 | if (PageCgroupActive(pc)) |
311 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; | 268 | lru += LRU_ACTIVE; |
312 | list_add(&pc->lru, &mz->active_list); | 269 | if (PageCgroupFile(pc)) |
270 | lru += LRU_FILE; | ||
313 | } | 271 | } |
314 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); | 272 | |
273 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | ||
274 | list_add(&pc->lru, &mz->lists[lru]); | ||
275 | |||
276 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); | ||
315 | } | 277 | } |
316 | 278 | ||
317 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) | 279 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) |
318 | { | 280 | { |
319 | int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | ||
320 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | 281 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); |
282 | int active = PageCgroupActive(pc); | ||
283 | int file = PageCgroupFile(pc); | ||
284 | int unevictable = PageCgroupUnevictable(pc); | ||
285 | enum lru_list from = unevictable ? LRU_UNEVICTABLE : | ||
286 | (LRU_FILE * !!file + !!active); | ||
321 | 287 | ||
322 | if (from) | 288 | if (lru == from) |
323 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; | 289 | return; |
324 | else | ||
325 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; | ||
326 | 290 | ||
327 | if (active) { | 291 | MEM_CGROUP_ZSTAT(mz, from) -= 1; |
328 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; | 292 | /* |
329 | pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; | 293 | * However, while this is done under mz->lru_lock, other flags, which |
330 | list_move(&pc->lru, &mz->active_list); | 294 | * are not related to the LRU, may be modified out of the lock. |
295 | * We therefore have to use atomic set/clear flag operations. | ||
296 | */ | ||
297 | if (is_unevictable_lru(lru)) { | ||
298 | ClearPageCgroupActive(pc); | ||
299 | SetPageCgroupUnevictable(pc); | ||
331 | } else { | 300 | } else { |
332 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; | 301 | if (is_active_lru(lru)) |
333 | pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; | 302 | SetPageCgroupActive(pc); |
334 | list_move(&pc->lru, &mz->inactive_list); | 303 | else |
304 | ClearPageCgroupActive(pc); | ||
305 | ClearPageCgroupUnevictable(pc); | ||
335 | } | 306 | } |
307 | |||
308 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | ||
309 | list_move(&pc->lru, &mz->lists[lru]); | ||
336 | } | 310 | } |
337 | 311 | ||
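The flag-to-LRU mapping above packs the four anon/file x inactive/active lists plus the unevictable list into one array index. A minimal standalone sketch of that arithmetic, assuming the enum layout used by include/linux/mmzone.h in this series (LRU_ACTIVE == 1, LRU_FILE == 2, LRU_UNEVICTABLE == 4); not part of the patch:

#include <stdio.h>

enum lru_list {
	LRU_INACTIVE_ANON,	/* 0 = LRU_BASE */
	LRU_ACTIVE_ANON,	/* 1 = LRU_BASE + LRU_ACTIVE */
	LRU_INACTIVE_FILE,	/* 2 = LRU_BASE + LRU_FILE */
	LRU_ACTIVE_FILE,	/* 3 = LRU_BASE + LRU_FILE + LRU_ACTIVE */
	LRU_UNEVICTABLE,	/* 4 */
	NR_LRU_LISTS
};

/* Mirror of the "LRU_FILE * !!file + !!active" computation used above. */
static enum lru_list lru_index(int active, int file, int unevictable)
{
	if (unevictable)
		return LRU_UNEVICTABLE;
	return (enum lru_list)(2 * !!file + !!active);
}

int main(void)
{
	printf("inactive anon -> %d\n", lru_index(0, 0, 0)); /* 0 */
	printf("active file   -> %d\n", lru_index(1, 1, 0)); /* 3 */
	printf("unevictable   -> %d\n", lru_index(1, 0, 1)); /* 4 */
	return 0;
}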
338 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 312 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
@@ -348,12 +322,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
348 | /* | 322 | /* |
349 | * This routine assumes that the appropriate zone's lru lock is already held | 323 | * This routine assumes that the appropriate zone's lru lock is already held |
350 | */ | 324 | */ |
351 | void mem_cgroup_move_lists(struct page *page, bool active) | 325 | void mem_cgroup_move_lists(struct page *page, enum lru_list lru) |
352 | { | 326 | { |
353 | struct page_cgroup *pc; | 327 | struct page_cgroup *pc; |
354 | struct mem_cgroup_per_zone *mz; | 328 | struct mem_cgroup_per_zone *mz; |
355 | unsigned long flags; | 329 | unsigned long flags; |
356 | 330 | ||
331 | if (mem_cgroup_subsys.disabled) | ||
332 | return; | ||
333 | |||
357 | /* | 334 | /* |
358 | * We cannot lock_page_cgroup while holding zone's lru_lock, | 335 | * We cannot lock_page_cgroup while holding zone's lru_lock, |
359 | * because other holders of lock_page_cgroup can be interrupted | 336 | * because other holders of lock_page_cgroup can be interrupted |
@@ -361,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active) | |||
361 | * safely get to page_cgroup without it, so just try_lock it: | 338 | * safely get to page_cgroup without it, so just try_lock it: |
362 | * mem_cgroup_isolate_pages allows for page left on wrong list. | 339 | * mem_cgroup_isolate_pages allows for page left on wrong list. |
363 | */ | 340 | */ |
364 | if (!try_lock_page_cgroup(page)) | 341 | pc = lookup_page_cgroup(page); |
342 | if (!trylock_page_cgroup(pc)) | ||
365 | return; | 343 | return; |
366 | 344 | if (pc && PageCgroupUsed(pc)) { | |
367 | pc = page_get_page_cgroup(page); | ||
368 | if (pc) { | ||
369 | mz = page_cgroup_zoneinfo(pc); | 345 | mz = page_cgroup_zoneinfo(pc); |
370 | spin_lock_irqsave(&mz->lru_lock, flags); | 346 | spin_lock_irqsave(&mz->lru_lock, flags); |
371 | __mem_cgroup_move_lists(pc, active); | 347 | __mem_cgroup_move_lists(pc, lru); |
372 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 348 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
373 | } | 349 | } |
374 | unlock_page_cgroup(page); | 350 | unlock_page_cgroup(pc); |
375 | } | 351 | } |
376 | 352 | ||
377 | /* | 353 | /* |
@@ -392,21 +368,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) | |||
392 | } | 368 | } |
393 | 369 | ||
394 | /* | 370 | /* |
395 | * This function is called from vmscan.c. In page reclaiming loop. balance | ||
396 | * between active and inactive list is calculated. For memory controller | ||
397 | * page reclaiming, we should use using mem_cgroup's imbalance rather than | ||
398 | * zone's global lru imbalance. | ||
399 | */ | ||
400 | long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) | ||
401 | { | ||
402 | unsigned long active, inactive; | ||
403 | /* active and inactive are the number of pages. 'long' is ok.*/ | ||
404 | active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE); | ||
405 | inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE); | ||
406 | return (long) (active / (inactive + 1)); | ||
407 | } | ||
408 | |||
409 | /* | ||
410 | * prev_priority control...this will be used in memory reclaim path. | 371 | * prev_priority control...this will be used in memory reclaim path. |
411 | */ | 372 | */ |
412 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) | 373 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) |
@@ -433,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) | |||
433 | * (see include/linux/mmzone.h) | 394 | * (see include/linux/mmzone.h) |
434 | */ | 395 | */ |
435 | 396 | ||
436 | long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, | 397 | long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, |
437 | struct zone *zone, int priority) | 398 | int priority, enum lru_list lru) |
438 | { | 399 | { |
439 | long nr_active; | 400 | long nr_pages; |
440 | int nid = zone->zone_pgdat->node_id; | 401 | int nid = zone->zone_pgdat->node_id; |
441 | int zid = zone_idx(zone); | 402 | int zid = zone_idx(zone); |
442 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | 403 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); |
443 | 404 | ||
444 | nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); | 405 | nr_pages = MEM_CGROUP_ZSTAT(mz, lru); |
445 | return (nr_active >> priority); | ||
446 | } | ||
447 | 406 | ||
448 | long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, | 407 | return (nr_pages >> priority); |
449 | struct zone *zone, int priority) | ||
450 | { | ||
451 | long nr_inactive; | ||
452 | int nid = zone->zone_pgdat->node_id; | ||
453 | int zid = zone_idx(zone); | ||
454 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
455 | |||
456 | nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); | ||
457 | return (nr_inactive >> priority); | ||
458 | } | 408 | } |
459 | 409 | ||
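The unified mem_cgroup_calc_reclaim() simply scales the per-LRU page count by the reclaim priority: with, say, 4096 pages on the target list, priority 12 yields a scan target of 4096 >> 12 = 1 page, priority 2 yields 1024, and priority 0 (the last-resort pass) makes the whole list eligible. (The 4096 figure is purely illustrative, not taken from the patch.)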
460 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | 410 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, |
@@ -462,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
462 | unsigned long *scanned, int order, | 412 | unsigned long *scanned, int order, |
463 | int mode, struct zone *z, | 413 | int mode, struct zone *z, |
464 | struct mem_cgroup *mem_cont, | 414 | struct mem_cgroup *mem_cont, |
465 | int active) | 415 | int active, int file) |
466 | { | 416 | { |
467 | unsigned long nr_taken = 0; | 417 | unsigned long nr_taken = 0; |
468 | struct page *page; | 418 | struct page *page; |
@@ -473,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
473 | int nid = z->zone_pgdat->node_id; | 423 | int nid = z->zone_pgdat->node_id; |
474 | int zid = zone_idx(z); | 424 | int zid = zone_idx(z); |
475 | struct mem_cgroup_per_zone *mz; | 425 | struct mem_cgroup_per_zone *mz; |
426 | int lru = LRU_FILE * !!file + !!active; | ||
476 | 427 | ||
477 | BUG_ON(!mem_cont); | 428 | BUG_ON(!mem_cont); |
478 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 429 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); |
479 | if (active) | 430 | src = &mz->lists[lru]; |
480 | src = &mz->active_list; | ||
481 | else | ||
482 | src = &mz->inactive_list; | ||
483 | |||
484 | 431 | ||
485 | spin_lock(&mz->lru_lock); | 432 | spin_lock(&mz->lru_lock); |
486 | scan = 0; | 433 | scan = 0; |
487 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | 434 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { |
488 | if (scan >= nr_to_scan) | 435 | if (scan >= nr_to_scan) |
489 | break; | 436 | break; |
437 | if (unlikely(!PageCgroupUsed(pc))) | ||
438 | continue; | ||
490 | page = pc->page; | 439 | page = pc->page; |
491 | 440 | ||
492 | if (unlikely(!PageLRU(page))) | 441 | if (unlikely(!PageLRU(page))) |
493 | continue; | 442 | continue; |
494 | 443 | ||
495 | if (PageActive(page) && !active) { | 444 | /* |
496 | __mem_cgroup_move_lists(pc, true); | 445 | * TODO: play better with lumpy reclaim, grabbing anything. |
497 | continue; | 446 | */ |
498 | } | 447 | if (PageUnevictable(page) || |
499 | if (!PageActive(page) && active) { | 448 | (PageActive(page) && !active) || |
500 | __mem_cgroup_move_lists(pc, false); | 449 | (!PageActive(page) && active)) { |
450 | __mem_cgroup_move_lists(pc, page_lru(page)); | ||
501 | continue; | 451 | continue; |
502 | } | 452 | } |
503 | 453 | ||
504 | scan++; | 454 | scan++; |
505 | list_move(&pc->lru, &pc_list); | 455 | list_move(&pc->lru, &pc_list); |
506 | 456 | ||
507 | if (__isolate_lru_page(page, mode) == 0) { | 457 | if (__isolate_lru_page(page, mode, file) == 0) { |
508 | list_move(&page->lru, dst); | 458 | list_move(&page->lru, dst); |
509 | nr_taken++; | 459 | nr_taken++; |
510 | } | 460 | } |
@@ -524,63 +474,45 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
524 | * < 0 if the cgroup is over its limit | 474 | * < 0 if the cgroup is over its limit |
525 | */ | 475 | */ |
526 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | 476 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, |
527 | gfp_t gfp_mask, enum charge_type ctype) | 477 | gfp_t gfp_mask, enum charge_type ctype, |
478 | struct mem_cgroup *memcg) | ||
528 | { | 479 | { |
529 | struct mem_cgroup *mem; | 480 | struct mem_cgroup *mem; |
530 | struct page_cgroup *pc; | 481 | struct page_cgroup *pc; |
531 | unsigned long flags; | ||
532 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 482 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
533 | struct mem_cgroup_per_zone *mz; | 483 | struct mem_cgroup_per_zone *mz; |
484 | unsigned long flags; | ||
534 | 485 | ||
535 | if (mem_cgroup_subsys.disabled) | 486 | pc = lookup_page_cgroup(page); |
487 | /* can happen at boot */ | ||
488 | if (unlikely(!pc)) | ||
536 | return 0; | 489 | return 0; |
537 | 490 | prefetchw(pc); | |
538 | /* | ||
539 | * Should page_cgroup's go to their own slab? | ||
540 | * One could optimize the performance of the charging routine | ||
541 | * by saving a bit in the page_flags and using it as a lock | ||
542 | * to see if the cgroup page already has a page_cgroup associated | ||
543 | * with it | ||
544 | */ | ||
545 | retry: | ||
546 | lock_page_cgroup(page); | ||
547 | pc = page_get_page_cgroup(page); | ||
548 | /* | ||
549 | * The page_cgroup exists and | ||
550 | * the page has already been accounted. | ||
551 | */ | ||
552 | if (pc) { | ||
553 | VM_BUG_ON(pc->page != page); | ||
554 | VM_BUG_ON(pc->ref_cnt <= 0); | ||
555 | |||
556 | pc->ref_cnt++; | ||
557 | unlock_page_cgroup(page); | ||
558 | goto done; | ||
559 | } | ||
560 | unlock_page_cgroup(page); | ||
561 | |||
562 | pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask); | ||
563 | if (pc == NULL) | ||
564 | goto err; | ||
565 | |||
566 | /* | 491 | /* |
567 | * We always charge the cgroup the mm_struct belongs to. | 492 | * We always charge the cgroup the mm_struct belongs to. |
568 | * The mm_struct's mem_cgroup changes on task migration if the | 493 | * The mm_struct's mem_cgroup changes on task migration if the |
569 | * thread group leader migrates. It's possible that mm is not | 494 | * thread group leader migrates. It's possible that mm is not |
570 | * set, if so charge the init_mm (happens for pagecache usage). | 495 | * set, if so charge the init_mm (happens for pagecache usage). |
571 | */ | 496 | */ |
572 | if (!mm) | ||
573 | mm = &init_mm; | ||
574 | 497 | ||
575 | rcu_read_lock(); | 498 | if (likely(!memcg)) { |
576 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 499 | rcu_read_lock(); |
577 | /* | 500 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
578 | * For every charge from the cgroup, increment reference count | 501 | if (unlikely(!mem)) { |
579 | */ | 502 | rcu_read_unlock(); |
580 | css_get(&mem->css); | 503 | return 0; |
581 | rcu_read_unlock(); | 504 | } |
505 | /* | ||
506 | * For every charge from the cgroup, increment reference count | ||
507 | */ | ||
508 | css_get(&mem->css); | ||
509 | rcu_read_unlock(); | ||
510 | } else { | ||
511 | mem = memcg; | ||
512 | css_get(&memcg->css); | ||
513 | } | ||
582 | 514 | ||
583 | while (res_counter_charge(&mem->res, PAGE_SIZE)) { | 515 | while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { |
584 | if (!(gfp_mask & __GFP_WAIT)) | 516 | if (!(gfp_mask & __GFP_WAIT)) |
585 | goto out; | 517 | goto out; |
586 | 518 | ||
@@ -603,63 +535,104 @@ retry: | |||
603 | } | 535 | } |
604 | } | 536 | } |
605 | 537 | ||
606 | pc->ref_cnt = 1; | 538 | |
607 | pc->mem_cgroup = mem; | 539 | lock_page_cgroup(pc); |
608 | pc->page = page; | 540 | if (unlikely(PageCgroupUsed(pc))) { |
609 | pc->flags = PAGE_CGROUP_FLAG_ACTIVE; | 541 | unlock_page_cgroup(pc); |
610 | if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) | ||
611 | pc->flags = PAGE_CGROUP_FLAG_CACHE; | ||
612 | |||
613 | lock_page_cgroup(page); | ||
614 | if (page_get_page_cgroup(page)) { | ||
615 | unlock_page_cgroup(page); | ||
616 | /* | ||
617 | * Another charge has been added to this page already. | ||
618 | * We take lock_page_cgroup(page) again and read | ||
619 | * page->cgroup, increment refcnt.... just retry is OK. | ||
620 | */ | ||
621 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 542 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
622 | css_put(&mem->css); | 543 | css_put(&mem->css); |
623 | kmem_cache_free(page_cgroup_cache, pc); | 544 | |
624 | goto retry; | 545 | goto done; |
625 | } | 546 | } |
626 | page_assign_page_cgroup(page, pc); | 547 | pc->mem_cgroup = mem; |
548 | /* | ||
549 | * If a page is accounted as a page cache, insert to inactive list. | ||
550 | * If anon, insert to active list. | ||
551 | */ | ||
552 | pc->flags = pcg_default_flags[ctype]; | ||
627 | 553 | ||
628 | mz = page_cgroup_zoneinfo(pc); | 554 | mz = page_cgroup_zoneinfo(pc); |
555 | |||
629 | spin_lock_irqsave(&mz->lru_lock, flags); | 556 | spin_lock_irqsave(&mz->lru_lock, flags); |
630 | __mem_cgroup_add_list(mz, pc); | 557 | __mem_cgroup_add_list(mz, pc); |
631 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 558 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
559 | unlock_page_cgroup(pc); | ||
632 | 560 | ||
633 | unlock_page_cgroup(page); | ||
634 | done: | 561 | done: |
635 | return 0; | 562 | return 0; |
636 | out: | 563 | out: |
637 | css_put(&mem->css); | 564 | css_put(&mem->css); |
638 | kmem_cache_free(page_cgroup_cache, pc); | ||
639 | err: | ||
640 | return -ENOMEM; | 565 | return -ENOMEM; |
641 | } | 566 | } |
642 | 567 | ||
643 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) | 568 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) |
644 | { | 569 | { |
570 | if (mem_cgroup_subsys.disabled) | ||
571 | return 0; | ||
572 | if (PageCompound(page)) | ||
573 | return 0; | ||
574 | /* | ||
575 | * If already mapped, we don't have to account. | ||
576 | * If page cache, page->mapping has address_space. | ||
577 | * But page->mapping may still hold a stale anon_vma pointer; | ||
578 | * detect that case with the PageAnon() check, since a newly mapped | ||
579 | * anon page's page->mapping is NULL. | ||
580 | */ | ||
581 | if (page_mapped(page) || (page->mapping && !PageAnon(page))) | ||
582 | return 0; | ||
583 | if (unlikely(!mm)) | ||
584 | mm = &init_mm; | ||
645 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 585 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
646 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 586 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); |
647 | } | 587 | } |
648 | 588 | ||
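Taken together, the checks added to mem_cgroup_charge() skip accounting for compound pages, for pages that are already mapped (and therefore already charged), and for non-anon page-cache pages, which are charged on the cache path instead. A rough sketch of that filter as a standalone predicate; the helper name and the boolean parameters are illustrative stand-ins for the PageCompound()/page_mapped()/PageAnon() tests, not kernel API:

#include <stdbool.h>

/* Illustrative model only; the real code tests a struct page. */
static bool needs_anon_charge(bool compound, bool mapped,
			      bool has_mapping, bool anon)
{
	if (compound)
		return false;		/* compound pages are not charged here */
	if (mapped)
		return false;		/* already accounted when first mapped */
	if (has_mapping && !anon)
		return false;		/* page cache: charged via the cache path */
	return true;
}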
649 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 589 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
650 | gfp_t gfp_mask) | 590 | gfp_t gfp_mask) |
651 | { | 591 | { |
652 | if (!mm) | 592 | if (mem_cgroup_subsys.disabled) |
593 | return 0; | ||
594 | if (PageCompound(page)) | ||
595 | return 0; | ||
596 | /* | ||
597 | * Corner case handling. This is usually called from add_to_page_cache(), | ||
598 | * but some filesystems (shmem) precharge the page before calling it | ||
599 | * and then call add_to_page_cache() with GFP_NOWAIT. | ||
600 | * | ||
601 | * In the GFP_NOWAIT case the page may already have been charged before | ||
602 | * add_to_page_cache() (see shmem.c); check for that here to avoid | ||
603 | * charging twice. (It works, but at a slightly higher cost.) | ||
604 | */ | ||
605 | if (!(gfp_mask & __GFP_WAIT)) { | ||
606 | struct page_cgroup *pc; | ||
607 | |||
608 | |||
609 | pc = lookup_page_cgroup(page); | ||
610 | if (!pc) | ||
611 | return 0; | ||
612 | lock_page_cgroup(pc); | ||
613 | if (PageCgroupUsed(pc)) { | ||
614 | unlock_page_cgroup(pc); | ||
615 | return 0; | ||
616 | } | ||
617 | unlock_page_cgroup(pc); | ||
618 | } | ||
619 | |||
620 | if (unlikely(!mm)) | ||
653 | mm = &init_mm; | 621 | mm = &init_mm; |
654 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 622 | |
655 | MEM_CGROUP_CHARGE_TYPE_CACHE); | 623 | if (page_is_file_cache(page)) |
624 | return mem_cgroup_charge_common(page, mm, gfp_mask, | ||
625 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); | ||
626 | else | ||
627 | return mem_cgroup_charge_common(page, mm, gfp_mask, | ||
628 | MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); | ||
656 | } | 629 | } |
657 | 630 | ||
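The cache-charge path above therefore makes two decisions: bail out when a GFP_NOWAIT caller has already precharged the page, and otherwise pick the CACHE or SHMEM charge type depending on page_is_file_cache(). A standalone model of that decision; the names here are illustrative, not kernel identifiers:

#include <stdbool.h>

enum charge_kind { CHARGE_CACHE, CHARGE_SHMEM, CHARGE_NONE };

static enum charge_kind cache_charge_decision(bool can_wait,
					      bool already_used,
					      bool file_cache)
{
	/* A GFP_NOWAIT caller (e.g. shmem) may have precharged the page. */
	if (!can_wait && already_used)
		return CHARGE_NONE;
	return file_cache ? CHARGE_CACHE : CHARGE_SHMEM;
}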
658 | /* | 631 | /* |
659 | * Uncharging is always a welcome operation, we never complain, simply | 632 | * uncharge if !page_mapped(page) |
660 | * uncharge. | ||
661 | */ | 633 | */ |
662 | void mem_cgroup_uncharge_page(struct page *page) | 634 | static void |
635 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | ||
663 | { | 636 | { |
664 | struct page_cgroup *pc; | 637 | struct page_cgroup *pc; |
665 | struct mem_cgroup *mem; | 638 | struct mem_cgroup *mem; |
@@ -672,106 +645,172 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
672 | /* | 645 | /* |
673 | * Check if our page_cgroup is valid | 646 | * Check if our page_cgroup is valid |
674 | */ | 647 | */ |
675 | lock_page_cgroup(page); | 648 | pc = lookup_page_cgroup(page); |
676 | pc = page_get_page_cgroup(page); | 649 | if (unlikely(!pc || !PageCgroupUsed(pc))) |
677 | if (!pc) | 650 | return; |
678 | goto unlock; | ||
679 | 651 | ||
680 | VM_BUG_ON(pc->page != page); | 652 | lock_page_cgroup(pc); |
681 | VM_BUG_ON(pc->ref_cnt <= 0); | 653 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) |
654 | || !PageCgroupUsed(pc)) { | ||
655 | /* This happens when racing with zap_pte_range() or do_swap_page(). */ | ||
656 | unlock_page_cgroup(pc); | ||
657 | return; | ||
658 | } | ||
659 | ClearPageCgroupUsed(pc); | ||
660 | mem = pc->mem_cgroup; | ||
682 | 661 | ||
683 | if (--(pc->ref_cnt) == 0) { | 662 | mz = page_cgroup_zoneinfo(pc); |
684 | mz = page_cgroup_zoneinfo(pc); | 663 | spin_lock_irqsave(&mz->lru_lock, flags); |
685 | spin_lock_irqsave(&mz->lru_lock, flags); | 664 | __mem_cgroup_remove_list(mz, pc); |
686 | __mem_cgroup_remove_list(mz, pc); | 665 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
687 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 666 | unlock_page_cgroup(pc); |
688 | 667 | ||
689 | page_assign_page_cgroup(page, NULL); | 668 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
690 | unlock_page_cgroup(page); | 669 | css_put(&mem->css); |
691 | 670 | ||
692 | mem = pc->mem_cgroup; | 671 | return; |
693 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 672 | } |
694 | css_put(&mem->css); | ||
695 | 673 | ||
696 | kmem_cache_free(page_cgroup_cache, pc); | 674 | void mem_cgroup_uncharge_page(struct page *page) |
675 | { | ||
676 | /* early check. */ | ||
677 | if (page_mapped(page)) | ||
697 | return; | 678 | return; |
698 | } | 679 | if (page->mapping && !PageAnon(page)) |
680 | return; | ||
681 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | ||
682 | } | ||
699 | 683 | ||
700 | unlock: | 684 | void mem_cgroup_uncharge_cache_page(struct page *page) |
701 | unlock_page_cgroup(page); | 685 | { |
686 | VM_BUG_ON(page_mapped(page)); | ||
687 | VM_BUG_ON(page->mapping); | ||
688 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
702 | } | 689 | } |
703 | 690 | ||
704 | /* | 691 | /* |
705 | * Returns non-zero if a page (under migration) has valid page_cgroup member. | 692 | * Before starting migration, account against new page. |
706 | * Refcnt of page_cgroup is incremented. | ||
707 | */ | 693 | */ |
708 | int mem_cgroup_prepare_migration(struct page *page) | 694 | int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) |
709 | { | 695 | { |
710 | struct page_cgroup *pc; | 696 | struct page_cgroup *pc; |
697 | struct mem_cgroup *mem = NULL; | ||
698 | enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
699 | int ret = 0; | ||
711 | 700 | ||
712 | if (mem_cgroup_subsys.disabled) | 701 | if (mem_cgroup_subsys.disabled) |
713 | return 0; | 702 | return 0; |
714 | 703 | ||
715 | lock_page_cgroup(page); | 704 | pc = lookup_page_cgroup(page); |
716 | pc = page_get_page_cgroup(page); | 705 | lock_page_cgroup(pc); |
717 | if (pc) | 706 | if (PageCgroupUsed(pc)) { |
718 | pc->ref_cnt++; | 707 | mem = pc->mem_cgroup; |
719 | unlock_page_cgroup(page); | 708 | css_get(&mem->css); |
720 | return pc != NULL; | 709 | if (PageCgroupCache(pc)) { |
710 | if (page_is_file_cache(page)) | ||
711 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
712 | else | ||
713 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
714 | } | ||
715 | } | ||
716 | unlock_page_cgroup(pc); | ||
717 | if (mem) { | ||
718 | ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, | ||
719 | ctype, mem); | ||
720 | css_put(&mem->css); | ||
721 | } | ||
722 | return ret; | ||
721 | } | 723 | } |
722 | 724 | ||
723 | void mem_cgroup_end_migration(struct page *page) | 725 | /* remove redundant charge if migration failed*/ |
726 | void mem_cgroup_end_migration(struct page *newpage) | ||
724 | { | 727 | { |
725 | mem_cgroup_uncharge_page(page); | 728 | /* |
729 | * On success, page->mapping is not NULL. | ||
730 | * Special rollback care is necessary when | ||
731 | * 1. migration fails (newpage->mapping is cleared in this case), or | ||
732 | * 2. the newpage was moved but not remapped again because the task | ||
733 | * exited and the newpage is obsolete. In this case, the new page | ||
734 | * may be a swapcache. So, we just call mem_cgroup_uncharge_page() | ||
735 | * unconditionally to avoid a mess. The page_cgroup will be removed | ||
736 | * if it is unnecessary. File cache pages are still on the radix | ||
737 | * tree; don't worry about them. | ||
738 | */ | ||
739 | if (!newpage->mapping) | ||
740 | __mem_cgroup_uncharge_common(newpage, | ||
741 | MEM_CGROUP_CHARGE_TYPE_FORCE); | ||
742 | else if (PageAnon(newpage)) | ||
743 | mem_cgroup_uncharge_page(newpage); | ||
726 | } | 744 | } |
727 | 745 | ||
728 | /* | 746 | /* |
729 | * We know both *page* and *newpage* are now not-on-LRU and PG_locked. | 747 | * Try to shrink memory usage under the specified resource controller. |
730 | * And no race with uncharge() routines because page_cgroup for *page* | 748 | * This is typically used to reclaim pages from shmem, reducing the side |
731 | * has extra one reference by mem_cgroup_prepare_migration. | 749 | * effects of shmem page allocation on the mem_cgroups that use it. |
732 | */ | 750 | */ |
733 | void mem_cgroup_page_migration(struct page *page, struct page *newpage) | 751 | int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) |
734 | { | 752 | { |
735 | struct page_cgroup *pc; | 753 | struct mem_cgroup *mem; |
736 | struct mem_cgroup_per_zone *mz; | 754 | int progress = 0; |
737 | unsigned long flags; | 755 | int retry = MEM_CGROUP_RECLAIM_RETRIES; |
738 | 756 | ||
739 | lock_page_cgroup(page); | 757 | if (mem_cgroup_subsys.disabled) |
740 | pc = page_get_page_cgroup(page); | 758 | return 0; |
741 | if (!pc) { | 759 | if (!mm) |
742 | unlock_page_cgroup(page); | 760 | return 0; |
743 | return; | 761 | |
762 | rcu_read_lock(); | ||
763 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | ||
764 | if (unlikely(!mem)) { | ||
765 | rcu_read_unlock(); | ||
766 | return 0; | ||
744 | } | 767 | } |
768 | css_get(&mem->css); | ||
769 | rcu_read_unlock(); | ||
745 | 770 | ||
746 | mz = page_cgroup_zoneinfo(pc); | 771 | do { |
747 | spin_lock_irqsave(&mz->lru_lock, flags); | 772 | progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); |
748 | __mem_cgroup_remove_list(mz, pc); | 773 | progress += res_counter_check_under_limit(&mem->res); |
749 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 774 | } while (!progress && --retry); |
750 | 775 | ||
751 | page_assign_page_cgroup(page, NULL); | 776 | css_put(&mem->css); |
752 | unlock_page_cgroup(page); | 777 | if (!retry) |
778 | return -ENOMEM; | ||
779 | return 0; | ||
780 | } | ||
753 | 781 | ||
754 | pc->page = newpage; | 782 | int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) |
755 | lock_page_cgroup(newpage); | 783 | { |
756 | page_assign_page_cgroup(newpage, pc); | ||
757 | 784 | ||
758 | mz = page_cgroup_zoneinfo(pc); | 785 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; |
759 | spin_lock_irqsave(&mz->lru_lock, flags); | 786 | int progress; |
760 | __mem_cgroup_add_list(mz, pc); | 787 | int ret = 0; |
761 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
762 | 788 | ||
763 | unlock_page_cgroup(newpage); | 789 | while (res_counter_set_limit(&memcg->res, val)) { |
790 | if (signal_pending(current)) { | ||
791 | ret = -EINTR; | ||
792 | break; | ||
793 | } | ||
794 | if (!retry_count) { | ||
795 | ret = -EBUSY; | ||
796 | break; | ||
797 | } | ||
798 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); | ||
799 | if (!progress) | ||
800 | retry_count--; | ||
801 | } | ||
802 | return ret; | ||
764 | } | 803 | } |
765 | 804 | ||
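mem_cgroup_resize_limit() above loops: it keeps retrying res_counter_set_limit(), reclaiming from the group whenever the new limit is still below current usage, and gives up after MEM_CGROUP_RECLAIM_RETRIES unproductive reclaim passes or on a pending signal. A standalone sketch of that retry shape; the callback parameters are stand-ins, not kernel functions:

#include <stdbool.h>

#define RECLAIM_RETRIES 5	/* stands in for MEM_CGROUP_RECLAIM_RETRIES */

/* try_set_limit() fails while usage still exceeds the requested limit;
 * reclaim_some() returns true if it managed to free anything. */
static int resize_limit(bool (*try_set_limit)(unsigned long long),
			bool (*reclaim_some)(void),
			bool (*interrupted)(void),
			unsigned long long new_limit)
{
	int retries = RECLAIM_RETRIES;

	while (!try_set_limit(new_limit)) {
		if (interrupted())
			return -1;	/* -EINTR in the kernel */
		if (!retries)
			return -2;	/* -EBUSY */
		if (!reclaim_some())
			retries--;	/* only unproductive passes count */
	}
	return 0;
}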
805 | |||
766 | /* | 806 | /* |
767 | * This routine traverse page_cgroup in given list and drop them all. | 807 | * This routine traverse page_cgroup in given list and drop them all. |
768 | * This routine ignores page_cgroup->ref_cnt. | ||
769 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 808 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
770 | */ | 809 | */ |
771 | #define FORCE_UNCHARGE_BATCH (128) | 810 | #define FORCE_UNCHARGE_BATCH (128) |
772 | static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | 811 | static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, |
773 | struct mem_cgroup_per_zone *mz, | 812 | struct mem_cgroup_per_zone *mz, |
774 | int active) | 813 | enum lru_list lru) |
775 | { | 814 | { |
776 | struct page_cgroup *pc; | 815 | struct page_cgroup *pc; |
777 | struct page *page; | 816 | struct page *page; |
@@ -779,22 +818,31 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
779 | unsigned long flags; | 818 | unsigned long flags; |
780 | struct list_head *list; | 819 | struct list_head *list; |
781 | 820 | ||
782 | if (active) | 821 | list = &mz->lists[lru]; |
783 | list = &mz->active_list; | ||
784 | else | ||
785 | list = &mz->inactive_list; | ||
786 | 822 | ||
787 | spin_lock_irqsave(&mz->lru_lock, flags); | 823 | spin_lock_irqsave(&mz->lru_lock, flags); |
788 | while (!list_empty(list)) { | 824 | while (!list_empty(list)) { |
789 | pc = list_entry(list->prev, struct page_cgroup, lru); | 825 | pc = list_entry(list->prev, struct page_cgroup, lru); |
790 | page = pc->page; | 826 | page = pc->page; |
827 | if (!PageCgroupUsed(pc)) | ||
828 | break; | ||
791 | get_page(page); | 829 | get_page(page); |
792 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 830 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
793 | mem_cgroup_uncharge_page(page); | 831 | /* |
794 | put_page(page); | 832 | * Check if this page is on LRU. !LRU page can be found |
795 | if (--count <= 0) { | 833 | * if it's under page migration. |
796 | count = FORCE_UNCHARGE_BATCH; | 834 | */ |
797 | cond_resched(); | 835 | if (PageLRU(page)) { |
836 | __mem_cgroup_uncharge_common(page, | ||
837 | MEM_CGROUP_CHARGE_TYPE_FORCE); | ||
838 | put_page(page); | ||
839 | if (--count <= 0) { | ||
840 | count = FORCE_UNCHARGE_BATCH; | ||
841 | cond_resched(); | ||
842 | } | ||
843 | } else { | ||
844 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
845 | break; | ||
798 | } | 846 | } |
799 | spin_lock_irqsave(&mz->lru_lock, flags); | 847 | spin_lock_irqsave(&mz->lru_lock, flags); |
800 | } | 848 | } |
@@ -810,9 +858,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) | |||
810 | int ret = -EBUSY; | 858 | int ret = -EBUSY; |
811 | int node, zid; | 859 | int node, zid; |
812 | 860 | ||
813 | if (mem_cgroup_subsys.disabled) | ||
814 | return 0; | ||
815 | |||
816 | css_get(&mem->css); | 861 | css_get(&mem->css); |
817 | /* | 862 | /* |
818 | * page reclaim code (kswapd etc..) will move pages between | 863 | * page reclaim code (kswapd etc..) will move pages between |
@@ -822,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) | |||
822 | while (mem->res.usage > 0) { | 867 | while (mem->res.usage > 0) { |
823 | if (atomic_read(&mem->css.cgroup->count) > 0) | 868 | if (atomic_read(&mem->css.cgroup->count) > 0) |
824 | goto out; | 869 | goto out; |
870 | /* This is to make sure that all *used* pages end up on an LRU. */ | ||
871 | lru_add_drain_all(); | ||
825 | for_each_node_state(node, N_POSSIBLE) | 872 | for_each_node_state(node, N_POSSIBLE) |
826 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 873 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
827 | struct mem_cgroup_per_zone *mz; | 874 | struct mem_cgroup_per_zone *mz; |
875 | enum lru_list l; | ||
828 | mz = mem_cgroup_zoneinfo(mem, node, zid); | 876 | mz = mem_cgroup_zoneinfo(mem, node, zid); |
829 | /* drop all page_cgroup in active_list */ | 877 | for_each_lru(l) |
830 | mem_cgroup_force_empty_list(mem, mz, 1); | 878 | mem_cgroup_force_empty_list(mem, mz, l); |
831 | /* drop all page_cgroup in inactive_list */ | ||
832 | mem_cgroup_force_empty_list(mem, mz, 0); | ||
833 | } | 879 | } |
880 | cond_resched(); | ||
834 | } | 881 | } |
835 | ret = 0; | 882 | ret = 0; |
836 | out: | 883 | out: |
@@ -838,32 +885,34 @@ out: | |||
838 | return ret; | 885 | return ret; |
839 | } | 886 | } |
840 | 887 | ||
841 | static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) | ||
842 | { | ||
843 | *tmp = memparse(buf, &buf); | ||
844 | if (*buf != '\0') | ||
845 | return -EINVAL; | ||
846 | |||
847 | /* | ||
848 | * Round up the value to the closest page size | ||
849 | */ | ||
850 | *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT; | ||
851 | return 0; | ||
852 | } | ||
853 | |||
854 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 888 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
855 | { | 889 | { |
856 | return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, | 890 | return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, |
857 | cft->private); | 891 | cft->private); |
858 | } | 892 | } |
859 | 893 | /* | |
860 | static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | 894 | * The user of this function is... |
861 | struct file *file, const char __user *userbuf, | 895 | * RES_LIMIT. |
862 | size_t nbytes, loff_t *ppos) | 896 | */ |
897 | static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | ||
898 | const char *buffer) | ||
863 | { | 899 | { |
864 | return res_counter_write(&mem_cgroup_from_cont(cont)->res, | 900 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
865 | cft->private, userbuf, nbytes, ppos, | 901 | unsigned long long val; |
866 | mem_cgroup_write_strategy); | 902 | int ret; |
903 | |||
904 | switch (cft->private) { | ||
905 | case RES_LIMIT: | ||
906 | /* This function does all the necessary parsing; reuse it. */ | ||
907 | ret = res_counter_memparse_write_strategy(buffer, &val); | ||
908 | if (!ret) | ||
909 | ret = mem_cgroup_resize_limit(memcg, val); | ||
910 | break; | ||
911 | default: | ||
912 | ret = -EINVAL; /* should be BUG() ? */ | ||
913 | break; | ||
914 | } | ||
915 | return ret; | ||
867 | } | 916 | } |
868 | 917 | ||
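With the switch to the write_string interface, the limit arrives as text and is parsed by res_counter_memparse_write_strategy(), which accepts the usual K/M/G suffixes, so from userspace something like echo 64M > memory.limit_in_bytes works against the memory controller's cgroup file. A rough standalone approximation of that suffix handling (not the kernel's memparse() itself):

#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *s)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': v <<= 10; /* fall through */
	case 'M': case 'm': v <<= 10; /* fall through */
	case 'K': case 'k': v <<= 10; break;
	default: break;
	}
	return v;
}

int main(void)
{
	printf("%llu\n", parse_size("64M"));	/* prints 67108864 */
	return 0;
}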
869 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 918 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
@@ -913,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
913 | } | 962 | } |
914 | /* showing # of active pages */ | 963 | /* showing # of active pages */ |
915 | { | 964 | { |
916 | unsigned long active, inactive; | 965 | unsigned long active_anon, inactive_anon; |
917 | 966 | unsigned long active_file, inactive_file; | |
918 | inactive = mem_cgroup_get_all_zonestat(mem_cont, | 967 | unsigned long unevictable; |
919 | MEM_CGROUP_ZSTAT_INACTIVE); | 968 | |
920 | active = mem_cgroup_get_all_zonestat(mem_cont, | 969 | inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, |
921 | MEM_CGROUP_ZSTAT_ACTIVE); | 970 | LRU_INACTIVE_ANON); |
922 | cb->fill(cb, "active", (active) * PAGE_SIZE); | 971 | active_anon = mem_cgroup_get_all_zonestat(mem_cont, |
923 | cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); | 972 | LRU_ACTIVE_ANON); |
973 | inactive_file = mem_cgroup_get_all_zonestat(mem_cont, | ||
974 | LRU_INACTIVE_FILE); | ||
975 | active_file = mem_cgroup_get_all_zonestat(mem_cont, | ||
976 | LRU_ACTIVE_FILE); | ||
977 | unevictable = mem_cgroup_get_all_zonestat(mem_cont, | ||
978 | LRU_UNEVICTABLE); | ||
979 | |||
980 | cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); | ||
981 | cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); | ||
982 | cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); | ||
983 | cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); | ||
984 | cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); | ||
985 | |||
924 | } | 986 | } |
925 | return 0; | 987 | return 0; |
926 | } | 988 | } |
@@ -940,7 +1002,7 @@ static struct cftype mem_cgroup_files[] = { | |||
940 | { | 1002 | { |
941 | .name = "limit_in_bytes", | 1003 | .name = "limit_in_bytes", |
942 | .private = RES_LIMIT, | 1004 | .private = RES_LIMIT, |
943 | .write = mem_cgroup_write, | 1005 | .write_string = mem_cgroup_write, |
944 | .read_u64 = mem_cgroup_read, | 1006 | .read_u64 = mem_cgroup_read, |
945 | }, | 1007 | }, |
946 | { | 1008 | { |
@@ -963,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
963 | { | 1025 | { |
964 | struct mem_cgroup_per_node *pn; | 1026 | struct mem_cgroup_per_node *pn; |
965 | struct mem_cgroup_per_zone *mz; | 1027 | struct mem_cgroup_per_zone *mz; |
1028 | enum lru_list l; | ||
966 | int zone, tmp = node; | 1029 | int zone, tmp = node; |
967 | /* | 1030 | /* |
968 | * This routine is called against possible nodes. | 1031 | * This routine is called against possible nodes. |
@@ -983,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
983 | 1046 | ||
984 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 1047 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
985 | mz = &pn->zoneinfo[zone]; | 1048 | mz = &pn->zoneinfo[zone]; |
986 | INIT_LIST_HEAD(&mz->active_list); | ||
987 | INIT_LIST_HEAD(&mz->inactive_list); | ||
988 | spin_lock_init(&mz->lru_lock); | 1049 | spin_lock_init(&mz->lru_lock); |
1050 | for_each_lru(l) | ||
1051 | INIT_LIST_HEAD(&mz->lists[l]); | ||
989 | } | 1052 | } |
990 | return 0; | 1053 | return 0; |
991 | } | 1054 | } |
@@ -1026,7 +1089,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1026 | 1089 | ||
1027 | if (unlikely((cont->parent) == NULL)) { | 1090 | if (unlikely((cont->parent) == NULL)) { |
1028 | mem = &init_mem_cgroup; | 1091 | mem = &init_mem_cgroup; |
1029 | page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC); | ||
1030 | } else { | 1092 | } else { |
1031 | mem = mem_cgroup_alloc(); | 1093 | mem = mem_cgroup_alloc(); |
1032 | if (!mem) | 1094 | if (!mem) |
@@ -1070,8 +1132,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, | |||
1070 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | 1132 | static int mem_cgroup_populate(struct cgroup_subsys *ss, |
1071 | struct cgroup *cont) | 1133 | struct cgroup *cont) |
1072 | { | 1134 | { |
1073 | if (mem_cgroup_subsys.disabled) | ||
1074 | return 0; | ||
1075 | return cgroup_add_files(cont, ss, mem_cgroup_files, | 1135 | return cgroup_add_files(cont, ss, mem_cgroup_files, |
1076 | ARRAY_SIZE(mem_cgroup_files)); | 1136 | ARRAY_SIZE(mem_cgroup_files)); |
1077 | } | 1137 | } |
@@ -1084,9 +1144,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
1084 | struct mm_struct *mm; | 1144 | struct mm_struct *mm; |
1085 | struct mem_cgroup *mem, *old_mem; | 1145 | struct mem_cgroup *mem, *old_mem; |
1086 | 1146 | ||
1087 | if (mem_cgroup_subsys.disabled) | ||
1088 | return; | ||
1089 | |||
1090 | mm = get_task_mm(p); | 1147 | mm = get_task_mm(p); |
1091 | if (mm == NULL) | 1148 | if (mm == NULL) |
1092 | return; | 1149 | return; |
@@ -1094,9 +1151,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
1094 | mem = mem_cgroup_from_cont(cont); | 1151 | mem = mem_cgroup_from_cont(cont); |
1095 | old_mem = mem_cgroup_from_cont(old_cont); | 1152 | old_mem = mem_cgroup_from_cont(old_cont); |
1096 | 1153 | ||
1097 | if (mem == old_mem) | ||
1098 | goto out; | ||
1099 | |||
1100 | /* | 1154 | /* |
1101 | * Only thread group leaders are allowed to migrate, the mm_struct is | 1155 | * Only thread group leaders are allowed to migrate, the mm_struct is |
1102 | * in effect owned by the leader | 1156 | * in effect owned by the leader |
diff --git a/mm/memory.c b/mm/memory.c index 2302d228fe04..164951c47305 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/init.h> | 51 | #include <linux/init.h> |
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | #include <linux/memcontrol.h> | 53 | #include <linux/memcontrol.h> |
54 | #include <linux/mmu_notifier.h> | ||
54 | 55 | ||
55 | #include <asm/pgalloc.h> | 56 | #include <asm/pgalloc.h> |
56 | #include <asm/uaccess.h> | 57 | #include <asm/uaccess.h> |
@@ -61,6 +62,8 @@ | |||
61 | #include <linux/swapops.h> | 62 | #include <linux/swapops.h> |
62 | #include <linux/elf.h> | 63 | #include <linux/elf.h> |
63 | 64 | ||
65 | #include "internal.h" | ||
66 | |||
64 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 67 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
65 | /* use the per-pgdat data instead for discontigmem - mbligh */ | 68 | /* use the per-pgdat data instead for discontigmem - mbligh */ |
66 | unsigned long max_mapnr; | 69 | unsigned long max_mapnr; |
@@ -211,7 +214,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | |||
211 | * | 214 | * |
212 | * Must be called with pagetable lock held. | 215 | * Must be called with pagetable lock held. |
213 | */ | 216 | */ |
214 | void free_pgd_range(struct mmu_gather **tlb, | 217 | void free_pgd_range(struct mmu_gather *tlb, |
215 | unsigned long addr, unsigned long end, | 218 | unsigned long addr, unsigned long end, |
216 | unsigned long floor, unsigned long ceiling) | 219 | unsigned long floor, unsigned long ceiling) |
217 | { | 220 | { |
@@ -262,16 +265,16 @@ void free_pgd_range(struct mmu_gather **tlb, | |||
262 | return; | 265 | return; |
263 | 266 | ||
264 | start = addr; | 267 | start = addr; |
265 | pgd = pgd_offset((*tlb)->mm, addr); | 268 | pgd = pgd_offset(tlb->mm, addr); |
266 | do { | 269 | do { |
267 | next = pgd_addr_end(addr, end); | 270 | next = pgd_addr_end(addr, end); |
268 | if (pgd_none_or_clear_bad(pgd)) | 271 | if (pgd_none_or_clear_bad(pgd)) |
269 | continue; | 272 | continue; |
270 | free_pud_range(*tlb, pgd, addr, next, floor, ceiling); | 273 | free_pud_range(tlb, pgd, addr, next, floor, ceiling); |
271 | } while (pgd++, addr = next, addr != end); | 274 | } while (pgd++, addr = next, addr != end); |
272 | } | 275 | } |
273 | 276 | ||
274 | void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | 277 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, |
275 | unsigned long floor, unsigned long ceiling) | 278 | unsigned long floor, unsigned long ceiling) |
276 | { | 279 | { |
277 | while (vma) { | 280 | while (vma) { |
@@ -372,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) | |||
372 | * | 375 | * |
373 | * The calling function must still handle the error. | 376 | * The calling function must still handle the error. |
374 | */ | 377 | */ |
375 | void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) | 378 | static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, |
379 | unsigned long vaddr) | ||
376 | { | 380 | { |
377 | printk(KERN_ERR "Bad pte = %08llx, process = %s, " | 381 | printk(KERN_ERR "Bad pte = %08llx, process = %s, " |
378 | "vm_flags = %lx, vaddr = %lx\n", | 382 | "vm_flags = %lx, vaddr = %lx\n", |
@@ -649,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
649 | unsigned long next; | 653 | unsigned long next; |
650 | unsigned long addr = vma->vm_start; | 654 | unsigned long addr = vma->vm_start; |
651 | unsigned long end = vma->vm_end; | 655 | unsigned long end = vma->vm_end; |
656 | int ret; | ||
652 | 657 | ||
653 | /* | 658 | /* |
654 | * Don't copy ptes where a page fault will fill them correctly. | 659 | * Don't copy ptes where a page fault will fill them correctly. |
@@ -664,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
664 | if (is_vm_hugetlb_page(vma)) | 669 | if (is_vm_hugetlb_page(vma)) |
665 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 670 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
666 | 671 | ||
672 | /* | ||
673 | * We need to invalidate the secondary MMU mappings only when | ||
674 | * there could be a permission downgrade on the ptes of the | ||
675 | * parent mm. And a permission downgrade will only happen if | ||
676 | * is_cow_mapping() returns true. | ||
677 | */ | ||
678 | if (is_cow_mapping(vma->vm_flags)) | ||
679 | mmu_notifier_invalidate_range_start(src_mm, addr, end); | ||
680 | |||
681 | ret = 0; | ||
667 | dst_pgd = pgd_offset(dst_mm, addr); | 682 | dst_pgd = pgd_offset(dst_mm, addr); |
668 | src_pgd = pgd_offset(src_mm, addr); | 683 | src_pgd = pgd_offset(src_mm, addr); |
669 | do { | 684 | do { |
670 | next = pgd_addr_end(addr, end); | 685 | next = pgd_addr_end(addr, end); |
671 | if (pgd_none_or_clear_bad(src_pgd)) | 686 | if (pgd_none_or_clear_bad(src_pgd)) |
672 | continue; | 687 | continue; |
673 | if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, | 688 | if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, |
674 | vma, addr, next)) | 689 | vma, addr, next))) { |
675 | return -ENOMEM; | 690 | ret = -ENOMEM; |
691 | break; | ||
692 | } | ||
676 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); | 693 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); |
677 | return 0; | 694 | |
695 | if (is_cow_mapping(vma->vm_flags)) | ||
696 | mmu_notifier_invalidate_range_end(src_mm, | ||
697 | vma->vm_start, end); | ||
698 | return ret; | ||
678 | } | 699 | } |
679 | 700 | ||
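The notifier round-trip here is deliberately limited to COW mappings: only there does the pte-copy step write-protect the parent's PTEs, which is the permission downgrade a secondary MMU must be told about. For reference, is_cow_mapping() (defined elsewhere in mm) boils down to a flag test along these lines; the flag values below are assumed to match include/linux/mm.h of this era:

/* Sketch only; in the kernel the VM_* flags come from <linux/mm.h>. */
#define VM_SHARED	0x00000008UL
#define VM_MAYWRITE	0x00000020UL

static inline int is_cow_mapping(unsigned long flags)
{
	/* private (not VM_SHARED) and potentially writable */
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}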
680 | static unsigned long zap_pte_range(struct mmu_gather *tlb, | 701 | static unsigned long zap_pte_range(struct mmu_gather *tlb, |
@@ -878,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
878 | unsigned long start = start_addr; | 899 | unsigned long start = start_addr; |
879 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; | 900 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; |
880 | int fullmm = (*tlbp)->fullmm; | 901 | int fullmm = (*tlbp)->fullmm; |
902 | struct mm_struct *mm = vma->vm_mm; | ||
881 | 903 | ||
904 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); | ||
882 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { | 905 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { |
883 | unsigned long end; | 906 | unsigned long end; |
884 | 907 | ||
@@ -899,9 +922,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
899 | } | 922 | } |
900 | 923 | ||
901 | if (unlikely(is_vm_hugetlb_page(vma))) { | 924 | if (unlikely(is_vm_hugetlb_page(vma))) { |
902 | unmap_hugepage_range(vma, start, end); | 925 | /* |
903 | zap_work -= (end - start) / | 926 | * It is undesirable to test vma->vm_file as it |
904 | (HPAGE_SIZE / PAGE_SIZE); | 927 | * should be non-NULL for a valid hugetlb area. |
928 | * However, vm_file will be NULL in the error | ||
929 | * cleanup path of do_mmap_pgoff. When | ||
930 | * hugetlbfs ->mmap method fails, | ||
931 | * do_mmap_pgoff() nullifies vma->vm_file | ||
932 | * before calling this function to clean up. | ||
933 | * Since no pte has actually been setup, it is | ||
934 | * safe to do nothing in this case. | ||
935 | */ | ||
936 | if (vma->vm_file) { | ||
937 | unmap_hugepage_range(vma, start, end, NULL); | ||
938 | zap_work -= (end - start) / | ||
939 | pages_per_huge_page(hstate_vma(vma)); | ||
940 | } | ||
941 | |||
905 | start = end; | 942 | start = end; |
906 | } else | 943 | } else |
907 | start = unmap_page_range(*tlbp, vma, | 944 | start = unmap_page_range(*tlbp, vma, |
@@ -929,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
929 | } | 966 | } |
930 | } | 967 | } |
931 | out: | 968 | out: |
969 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); | ||
932 | return start; /* which is now the end (or restart) address */ | 970 | return start; /* which is now the end (or restart) address */ |
933 | } | 971 | } |
934 | 972 | ||
@@ -956,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | |||
956 | return end; | 994 | return end; |
957 | } | 995 | } |
958 | 996 | ||
997 | /** | ||
998 | * zap_vma_ptes - remove ptes mapping the vma | ||
999 | * @vma: vm_area_struct holding ptes to be zapped | ||
1000 | * @address: starting address of pages to zap | ||
1001 | * @size: number of bytes to zap | ||
1002 | * | ||
1003 | * This function only unmaps ptes assigned to VM_PFNMAP vmas. | ||
1004 | * | ||
1005 | * The entire address range must be fully contained within the vma. | ||
1006 | * | ||
1007 | * Returns 0 if successful. | ||
1008 | */ | ||
1009 | int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, | ||
1010 | unsigned long size) | ||
1011 | { | ||
1012 | if (address < vma->vm_start || address + size > vma->vm_end || | ||
1013 | !(vma->vm_flags & VM_PFNMAP)) | ||
1014 | return -1; | ||
1015 | zap_page_range(vma, address, size, NULL); | ||
1016 | return 0; | ||
1017 | } | ||
1018 | EXPORT_SYMBOL_GPL(zap_vma_ptes); | ||
1019 | |||
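The kernel-doc above spells out the contract: the range must lie entirely inside a VM_PFNMAP vma, and 0 is returned on success. A hedged usage sketch of how a driver that earlier set up the mapping with remap_pfn_range() might drop its PTEs; my_drop_window() is illustrative only, not an existing kernel function:

#include <linux/mm.h>

/* Illustrative only: tear down every PTE of a PFN-mapped vma we own. */
static int my_drop_window(struct vm_area_struct *vma)
{
	/* Fails (nonzero) if the vma is not VM_PFNMAP or the range is
	 * not fully contained in it. */
	return zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}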
959 | /* | 1020 | /* |
960 | * Do a quick page-table lookup for a single page. | 1021 | * Do a quick page-table lookup for a single page. |
961 | */ | 1022 | */ |
@@ -982,19 +1043,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
982 | goto no_page_table; | 1043 | goto no_page_table; |
983 | 1044 | ||
984 | pud = pud_offset(pgd, address); | 1045 | pud = pud_offset(pgd, address); |
985 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) | 1046 | if (pud_none(*pud)) |
986 | goto no_page_table; | 1047 | goto no_page_table; |
987 | 1048 | if (pud_huge(*pud)) { | |
1049 | BUG_ON(flags & FOLL_GET); | ||
1050 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | ||
1051 | goto out; | ||
1052 | } | ||
1053 | if (unlikely(pud_bad(*pud))) | ||
1054 | goto no_page_table; | ||
1055 | |||
988 | pmd = pmd_offset(pud, address); | 1056 | pmd = pmd_offset(pud, address); |
989 | if (pmd_none(*pmd)) | 1057 | if (pmd_none(*pmd)) |
990 | goto no_page_table; | 1058 | goto no_page_table; |
991 | |||
992 | if (pmd_huge(*pmd)) { | 1059 | if (pmd_huge(*pmd)) { |
993 | BUG_ON(flags & FOLL_GET); | 1060 | BUG_ON(flags & FOLL_GET); |
994 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 1061 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
995 | goto out; | 1062 | goto out; |
996 | } | 1063 | } |
997 | |||
998 | if (unlikely(pmd_bad(*pmd))) | 1064 | if (unlikely(pmd_bad(*pmd))) |
999 | goto no_page_table; | 1065 | goto no_page_table; |
1000 | 1066 | ||
@@ -1058,19 +1124,22 @@ static inline int use_zero_page(struct vm_area_struct *vma) | |||
1058 | if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) | 1124 | if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) |
1059 | return 0; | 1125 | return 0; |
1060 | /* | 1126 | /* |
1061 | * And if we have a fault or a nopfn routine, it's not an | 1127 | * And if we have a fault routine, it's not an anonymous region. |
1062 | * anonymous region. | ||
1063 | */ | 1128 | */ |
1064 | return !vma->vm_ops || | 1129 | return !vma->vm_ops || !vma->vm_ops->fault; |
1065 | (!vma->vm_ops->fault && !vma->vm_ops->nopfn); | ||
1066 | } | 1130 | } |
1067 | 1131 | ||
1068 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1132 | |
1069 | unsigned long start, int len, int write, int force, | 1133 | |
1134 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
1135 | unsigned long start, int len, int flags, | ||
1070 | struct page **pages, struct vm_area_struct **vmas) | 1136 | struct page **pages, struct vm_area_struct **vmas) |
1071 | { | 1137 | { |
1072 | int i; | 1138 | int i; |
1073 | unsigned int vm_flags; | 1139 | unsigned int vm_flags = 0; |
1140 | int write = !!(flags & GUP_FLAGS_WRITE); | ||
1141 | int force = !!(flags & GUP_FLAGS_FORCE); | ||
1142 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); | ||
1074 | 1143 | ||
1075 | if (len <= 0) | 1144 | if (len <= 0) |
1076 | return 0; | 1145 | return 0; |
@@ -1094,7 +1163,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1094 | pud_t *pud; | 1163 | pud_t *pud; |
1095 | pmd_t *pmd; | 1164 | pmd_t *pmd; |
1096 | pte_t *pte; | 1165 | pte_t *pte; |
1097 | if (write) /* user gate pages are read-only */ | 1166 | |
1167 | /* user gate pages are read-only */ | ||
1168 | if (!ignore && write) | ||
1098 | return i ? : -EFAULT; | 1169 | return i ? : -EFAULT; |
1099 | if (pg > TASK_SIZE) | 1170 | if (pg > TASK_SIZE) |
1100 | pgd = pgd_offset_k(pg); | 1171 | pgd = pgd_offset_k(pg); |
@@ -1126,8 +1197,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1126 | continue; | 1197 | continue; |
1127 | } | 1198 | } |
1128 | 1199 | ||
1129 | if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) | 1200 | if (!vma || |
1130 | || !(vm_flags & vma->vm_flags)) | 1201 | (vma->vm_flags & (VM_IO | VM_PFNMAP)) || |
1202 | (!ignore && !(vm_flags & vma->vm_flags))) | ||
1131 | return i ? : -EFAULT; | 1203 | return i ? : -EFAULT; |
1132 | 1204 | ||
1133 | if (is_vm_hugetlb_page(vma)) { | 1205 | if (is_vm_hugetlb_page(vma)) { |
@@ -1202,6 +1274,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1202 | } while (len); | 1274 | } while (len); |
1203 | return i; | 1275 | return i; |
1204 | } | 1276 | } |
1277 | |||
1278 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
1279 | unsigned long start, int len, int write, int force, | ||
1280 | struct page **pages, struct vm_area_struct **vmas) | ||
1281 | { | ||
1282 | int flags = 0; | ||
1283 | |||
1284 | if (write) | ||
1285 | flags |= GUP_FLAGS_WRITE; | ||
1286 | if (force) | ||
1287 | flags |= GUP_FLAGS_FORCE; | ||
1288 | |||
1289 | return __get_user_pages(tsk, mm, | ||
1290 | start, len, flags, | ||
1291 | pages, vmas); | ||
1292 | } | ||
1293 | |||
1205 | EXPORT_SYMBOL(get_user_pages); | 1294 | EXPORT_SYMBOL(get_user_pages); |
1206 | 1295 | ||
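get_user_pages() itself keeps its old prototype and simply translates the write/force arguments into the new GUP_FLAGS_* bits before handing off to __get_user_pages(). A hedged sketch of a typical caller pinning a single page through the public wrapper; pin_one_user_page() is illustrative, and the caller would still have to release the page with put_page() when done:

#include <linux/mm.h>
#include <linux/sched.h>

/* Illustrative only: pin one page of the current task at addr. */
static struct page *pin_one_user_page(unsigned long addr, int write)
{
	struct page *page;
	int ret;

	down_read(&current->mm->mmap_sem);
	ret = get_user_pages(current, current->mm, addr, 1,
			     write, 0 /* force */, &page, NULL);
	up_read(&current->mm->mmap_sem);

	return ret == 1 ? page : NULL;
}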
1207 | pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, | 1296 | pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, |
@@ -1232,18 +1321,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
1232 | pte_t *pte; | 1321 | pte_t *pte; |
1233 | spinlock_t *ptl; | 1322 | spinlock_t *ptl; |
1234 | 1323 | ||
1235 | retval = mem_cgroup_charge(page, mm, GFP_KERNEL); | ||
1236 | if (retval) | ||
1237 | goto out; | ||
1238 | |||
1239 | retval = -EINVAL; | 1324 | retval = -EINVAL; |
1240 | if (PageAnon(page)) | 1325 | if (PageAnon(page)) |
1241 | goto out_uncharge; | 1326 | goto out; |
1242 | retval = -ENOMEM; | 1327 | retval = -ENOMEM; |
1243 | flush_dcache_page(page); | 1328 | flush_dcache_page(page); |
1244 | pte = get_locked_pte(mm, addr, &ptl); | 1329 | pte = get_locked_pte(mm, addr, &ptl); |
1245 | if (!pte) | 1330 | if (!pte) |
1246 | goto out_uncharge; | 1331 | goto out; |
1247 | retval = -EBUSY; | 1332 | retval = -EBUSY; |
1248 | if (!pte_none(*pte)) | 1333 | if (!pte_none(*pte)) |
1249 | goto out_unlock; | 1334 | goto out_unlock; |
@@ -1259,8 +1344,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
1259 | return retval; | 1344 | return retval; |
1260 | out_unlock: | 1345 | out_unlock: |
1261 | pte_unmap_unlock(pte, ptl); | 1346 | pte_unmap_unlock(pte, ptl); |
1262 | out_uncharge: | ||
1263 | mem_cgroup_uncharge_page(page); | ||
1264 | out: | 1347 | out: |
1265 | return retval; | 1348 | return retval; |
1266 | } | 1349 | } |
@@ -1338,6 +1421,11 @@ out: | |||
1338 | * | 1421 | * |
1339 | * This function should only be called from a vm_ops->fault handler, and | 1422 | * This function should only be called from a vm_ops->fault handler, and |
1340 | * in that case the handler should return NULL. | 1423 | * in that case the handler should return NULL. |
1424 | * | ||
1425 | * vma cannot be a COW mapping. | ||
1426 | * | ||
1427 | * As this is called only for pages that do not currently exist, we | ||
1428 | * do not need to flush old virtual caches or the TLB. | ||
1341 | */ | 1429 | */ |
1342 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | 1430 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
1343 | unsigned long pfn) | 1431 | unsigned long pfn) |
@@ -1548,6 +1636,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
1548 | unsigned long next; | 1636 | unsigned long next; |
1549 | int err; | 1637 | int err; |
1550 | 1638 | ||
1639 | BUG_ON(pud_huge(*pud)); | ||
1640 | |||
1551 | pmd = pmd_alloc(mm, pud, addr); | 1641 | pmd = pmd_alloc(mm, pud, addr); |
1552 | if (!pmd) | 1642 | if (!pmd) |
1553 | return -ENOMEM; | 1643 | return -ENOMEM; |
@@ -1589,10 +1679,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | |||
1589 | { | 1679 | { |
1590 | pgd_t *pgd; | 1680 | pgd_t *pgd; |
1591 | unsigned long next; | 1681 | unsigned long next; |
1592 | unsigned long end = addr + size; | 1682 | unsigned long start = addr, end = addr + size; |
1593 | int err; | 1683 | int err; |
1594 | 1684 | ||
1595 | BUG_ON(addr >= end); | 1685 | BUG_ON(addr >= end); |
1686 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
1596 | pgd = pgd_offset(mm, addr); | 1687 | pgd = pgd_offset(mm, addr); |
1597 | do { | 1688 | do { |
1598 | next = pgd_addr_end(addr, end); | 1689 | next = pgd_addr_end(addr, end); |
@@ -1600,6 +1691,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | |||
1600 | if (err) | 1691 | if (err) |
1601 | break; | 1692 | break; |
1602 | } while (pgd++, addr = next, addr != end); | 1693 | } while (pgd++, addr = next, addr != end); |
1694 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
1603 | return err; | 1695 | return err; |
1604 | } | 1696 | } |
1605 | EXPORT_SYMBOL_GPL(apply_to_page_range); | 1697 | EXPORT_SYMBOL_GPL(apply_to_page_range); |
@@ -1716,7 +1808,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1716 | * not dirty accountable. | 1808 | * not dirty accountable. |
1717 | */ | 1809 | */ |
1718 | if (PageAnon(old_page)) { | 1810 | if (PageAnon(old_page)) { |
1719 | if (!TestSetPageLocked(old_page)) { | 1811 | if (trylock_page(old_page)) { |
1720 | reuse = can_share_swap_page(old_page); | 1812 | reuse = can_share_swap_page(old_page); |
1721 | unlock_page(old_page); | 1813 | unlock_page(old_page); |
1722 | } | 1814 | } |
@@ -1785,6 +1877,15 @@ gotten: | |||
1785 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 1877 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
1786 | if (!new_page) | 1878 | if (!new_page) |
1787 | goto oom; | 1879 | goto oom; |
1880 | /* | ||
1881 | * Don't let another task, with possibly unlocked vma, | ||
1882 | * keep the mlocked page. | ||
1883 | */ | ||
1884 | if (vma->vm_flags & VM_LOCKED) { | ||
1885 | lock_page(old_page); /* for LRU manipulation */ | ||
1886 | clear_page_mlock(old_page); | ||
1887 | unlock_page(old_page); | ||
1888 | } | ||
1788 | cow_user_page(new_page, old_page, address, vma); | 1889 | cow_user_page(new_page, old_page, address, vma); |
1789 | __SetPageUptodate(new_page); | 1890 | __SetPageUptodate(new_page); |
1790 | 1891 | ||
@@ -1812,12 +1913,14 @@ gotten: | |||
1812 | * seen in the presence of one thread doing SMC and another | 1913 | * seen in the presence of one thread doing SMC and another |
1813 | * thread doing COW. | 1914 | * thread doing COW. |
1814 | */ | 1915 | */ |
1815 | ptep_clear_flush(vma, address, page_table); | 1916 | ptep_clear_flush_notify(vma, address, page_table); |
1816 | set_pte_at(mm, address, page_table, entry); | 1917 | SetPageSwapBacked(new_page); |
1817 | update_mmu_cache(vma, address, entry); | 1918 | lru_cache_add_active_or_unevictable(new_page, vma); |
1818 | lru_cache_add_active(new_page); | ||
1819 | page_add_new_anon_rmap(new_page, vma, address); | 1919 | page_add_new_anon_rmap(new_page, vma, address); |
1820 | 1920 | ||
1921 | //TODO: is this safe? do_anonymous_page() does it this way. | ||
1922 | set_pte_at(mm, address, page_table, entry); | ||
1923 | update_mmu_cache(vma, address, entry); | ||
1821 | if (old_page) { | 1924 | if (old_page) { |
1822 | /* | 1925 | /* |
1823 | * Only after switching the pte to the new page may | 1926 | * Only after switching the pte to the new page may |
@@ -2215,16 +2318,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2215 | count_vm_event(PGMAJFAULT); | 2318 | count_vm_event(PGMAJFAULT); |
2216 | } | 2319 | } |
2217 | 2320 | ||
2321 | mark_page_accessed(page); | ||
2322 | |||
2323 | lock_page(page); | ||
2324 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | ||
2325 | |||
2218 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | 2326 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { |
2219 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | ||
2220 | ret = VM_FAULT_OOM; | 2327 | ret = VM_FAULT_OOM; |
2328 | unlock_page(page); | ||
2221 | goto out; | 2329 | goto out; |
2222 | } | 2330 | } |
2223 | 2331 | ||
2224 | mark_page_accessed(page); | ||
2225 | lock_page(page); | ||
2226 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | ||
2227 | |||
2228 | /* | 2332 | /* |
2229 | * Back out if somebody else already faulted in this pte. | 2333 | * Back out if somebody else already faulted in this pte. |
2230 | */ | 2334 | */ |
@@ -2251,7 +2355,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2251 | page_add_anon_rmap(page, vma, address); | 2355 | page_add_anon_rmap(page, vma, address); |
2252 | 2356 | ||
2253 | swap_free(entry); | 2357 | swap_free(entry); |
2254 | if (vm_swap_full()) | 2358 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) |
2255 | remove_exclusive_swap_page(page); | 2359 | remove_exclusive_swap_page(page); |
2256 | unlock_page(page); | 2360 | unlock_page(page); |
2257 | 2361 | ||
@@ -2309,7 +2413,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2309 | if (!pte_none(*page_table)) | 2413 | if (!pte_none(*page_table)) |
2310 | goto release; | 2414 | goto release; |
2311 | inc_mm_counter(mm, anon_rss); | 2415 | inc_mm_counter(mm, anon_rss); |
2312 | lru_cache_add_active(page); | 2416 | SetPageSwapBacked(page); |
2417 | lru_cache_add_active_or_unevictable(page, vma); | ||
2313 | page_add_new_anon_rmap(page, vma, address); | 2418 | page_add_new_anon_rmap(page, vma, address); |
2314 | set_pte_at(mm, address, page_table, entry); | 2419 | set_pte_at(mm, address, page_table, entry); |
2315 | 2420 | ||
@@ -2350,6 +2455,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2350 | struct page *page; | 2455 | struct page *page; |
2351 | pte_t entry; | 2456 | pte_t entry; |
2352 | int anon = 0; | 2457 | int anon = 0; |
2458 | int charged = 0; | ||
2353 | struct page *dirty_page = NULL; | 2459 | struct page *dirty_page = NULL; |
2354 | struct vm_fault vmf; | 2460 | struct vm_fault vmf; |
2355 | int ret; | 2461 | int ret; |
@@ -2390,6 +2496,18 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2390 | ret = VM_FAULT_OOM; | 2496 | ret = VM_FAULT_OOM; |
2391 | goto out; | 2497 | goto out; |
2392 | } | 2498 | } |
2499 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | ||
2500 | ret = VM_FAULT_OOM; | ||
2501 | page_cache_release(page); | ||
2502 | goto out; | ||
2503 | } | ||
2504 | charged = 1; | ||
2505 | /* | ||
2506 | * Don't let another task, with possibly unlocked vma, | ||
2507 | * keep the mlocked page. | ||
2508 | */ | ||
2509 | if (vma->vm_flags & VM_LOCKED) | ||
2510 | clear_page_mlock(vmf.page); | ||
2393 | copy_user_highpage(page, vmf.page, address, vma); | 2511 | copy_user_highpage(page, vmf.page, address, vma); |
2394 | __SetPageUptodate(page); | 2512 | __SetPageUptodate(page); |
2395 | } else { | 2513 | } else { |
@@ -2424,11 +2542,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2424 | 2542 | ||
2425 | } | 2543 | } |
2426 | 2544 | ||
2427 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | ||
2428 | ret = VM_FAULT_OOM; | ||
2429 | goto out; | ||
2430 | } | ||
2431 | |||
2432 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2545 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
2433 | 2546 | ||
2434 | /* | 2547 | /* |
@@ -2447,11 +2560,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2447 | entry = mk_pte(page, vma->vm_page_prot); | 2560 | entry = mk_pte(page, vma->vm_page_prot); |
2448 | if (flags & FAULT_FLAG_WRITE) | 2561 | if (flags & FAULT_FLAG_WRITE) |
2449 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2562 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2450 | set_pte_at(mm, address, page_table, entry); | ||
2451 | if (anon) { | 2563 | if (anon) { |
2452 | inc_mm_counter(mm, anon_rss); | 2564 | inc_mm_counter(mm, anon_rss); |
2453 | lru_cache_add_active(page); | 2565 | SetPageSwapBacked(page); |
2454 | page_add_new_anon_rmap(page, vma, address); | 2566 | lru_cache_add_active_or_unevictable(page, vma); |
2567 | page_add_new_anon_rmap(page, vma, address); | ||
2455 | } else { | 2568 | } else { |
2456 | inc_mm_counter(mm, file_rss); | 2569 | inc_mm_counter(mm, file_rss); |
2457 | page_add_file_rmap(page); | 2570 | page_add_file_rmap(page); |
@@ -2460,11 +2573,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2460 | get_page(dirty_page); | 2573 | get_page(dirty_page); |
2461 | } | 2574 | } |
2462 | } | 2575 | } |
2576 | //TODO: is this safe? do_anonymous_page() does it this way. | ||
2577 | set_pte_at(mm, address, page_table, entry); | ||
2463 | 2578 | ||
2464 | /* no need to invalidate: a not-present page won't be cached */ | 2579 | /* no need to invalidate: a not-present page won't be cached */ |
2465 | update_mmu_cache(vma, address, entry); | 2580 | update_mmu_cache(vma, address, entry); |
2466 | } else { | 2581 | } else { |
2467 | mem_cgroup_uncharge_page(page); | 2582 | if (charged) |
2583 | mem_cgroup_uncharge_page(page); | ||
2468 | if (anon) | 2584 | if (anon) |
2469 | page_cache_release(page); | 2585 | page_cache_release(page); |
2470 | else | 2586 | else |
@@ -2501,59 +2617,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2501 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 2617 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
2502 | } | 2618 | } |
2503 | 2619 | ||
2504 | |||
2505 | /* | ||
2506 | * do_no_pfn() tries to create a new page mapping for a page without | ||
2507 | * a struct_page backing it | ||
2508 | * | ||
2509 | * As this is called only for pages that do not currently exist, we | ||
2510 | * do not need to flush old virtual caches or the TLB. | ||
2511 | * | ||
2512 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
2513 | * but allow concurrent faults), and pte mapped but not yet locked. | ||
2514 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
2515 | * | ||
2516 | * It is expected that the ->nopfn handler always returns the same pfn | ||
2517 | * for a given virtual mapping. | ||
2518 | * | ||
2519 | * Mark this `noinline' to prevent it from bloating the main pagefault code. | ||
2520 | */ | ||
2521 | static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, | ||
2522 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
2523 | int write_access) | ||
2524 | { | ||
2525 | spinlock_t *ptl; | ||
2526 | pte_t entry; | ||
2527 | unsigned long pfn; | ||
2528 | |||
2529 | pte_unmap(page_table); | ||
2530 | BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); | ||
2531 | BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); | ||
2532 | |||
2533 | pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); | ||
2534 | |||
2535 | BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); | ||
2536 | |||
2537 | if (unlikely(pfn == NOPFN_OOM)) | ||
2538 | return VM_FAULT_OOM; | ||
2539 | else if (unlikely(pfn == NOPFN_SIGBUS)) | ||
2540 | return VM_FAULT_SIGBUS; | ||
2541 | else if (unlikely(pfn == NOPFN_REFAULT)) | ||
2542 | return 0; | ||
2543 | |||
2544 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
2545 | |||
2546 | /* Only go through if we didn't race with anybody else... */ | ||
2547 | if (pte_none(*page_table)) { | ||
2548 | entry = pfn_pte(pfn, vma->vm_page_prot); | ||
2549 | if (write_access) | ||
2550 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
2551 | set_pte_at(mm, address, page_table, entry); | ||
2552 | } | ||
2553 | pte_unmap_unlock(page_table, ptl); | ||
2554 | return 0; | ||
2555 | } | ||
2556 | |||
2557 | /* | 2620 | /* |
2558 | * Fault of a previously existing named mapping. Repopulate the pte | 2621 | * Fault of a previously existing named mapping. Repopulate the pte |
2559 | * from the encoded file_pte if possible. This enables swappable | 2622 | * from the encoded file_pte if possible. This enables swappable |
@@ -2614,9 +2677,6 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
2614 | if (likely(vma->vm_ops->fault)) | 2677 | if (likely(vma->vm_ops->fault)) |
2615 | return do_linear_fault(mm, vma, address, | 2678 | return do_linear_fault(mm, vma, address, |
2616 | pte, pmd, write_access, entry); | 2679 | pte, pmd, write_access, entry); |
2617 | if (unlikely(vma->vm_ops->nopfn)) | ||
2618 | return do_no_pfn(mm, vma, address, pte, | ||
2619 | pmd, write_access); | ||
2620 | } | 2680 | } |
2621 | return do_anonymous_page(mm, vma, address, | 2681 | return do_anonymous_page(mm, vma, address, |
2622 | pte, pmd, write_access); | 2682 | pte, pmd, write_access); |
@@ -2748,7 +2808,7 @@ int make_pages_present(unsigned long addr, unsigned long end) | |||
2748 | 2808 | ||
2749 | vma = find_vma(current->mm, addr); | 2809 | vma = find_vma(current->mm, addr); |
2750 | if (!vma) | 2810 | if (!vma) |
2751 | return -1; | 2811 | return -ENOMEM; |
2752 | write = (vma->vm_flags & VM_WRITE) != 0; | 2812 | write = (vma->vm_flags & VM_WRITE) != 0; |
2753 | BUG_ON(addr >= end); | 2813 | BUG_ON(addr >= end); |
2754 | BUG_ON(end > vma->vm_end); | 2814 | BUG_ON(end > vma->vm_end); |
@@ -2757,7 +2817,7 @@ int make_pages_present(unsigned long addr, unsigned long end) | |||
2757 | len, write, 0, NULL, NULL); | 2817 | len, write, 0, NULL, NULL); |
2758 | if (ret < 0) | 2818 | if (ret < 0) |
2759 | return ret; | 2819 | return ret; |
2760 | return ret == len ? 0 : -1; | 2820 | return ret == len ? 0 : -EFAULT; |
2761 | } | 2821 | } |
2762 | 2822 | ||
2763 | #if !defined(__HAVE_ARCH_GATE_AREA) | 2823 | #if !defined(__HAVE_ARCH_GATE_AREA) |
@@ -2804,6 +2864,86 @@ int in_gate_area_no_task(unsigned long addr) | |||
2804 | 2864 | ||
2805 | #endif /* __HAVE_ARCH_GATE_AREA */ | 2865 | #endif /* __HAVE_ARCH_GATE_AREA */ |
2806 | 2866 | ||
2867 | #ifdef CONFIG_HAVE_IOREMAP_PROT | ||
2868 | static resource_size_t follow_phys(struct vm_area_struct *vma, | ||
2869 | unsigned long address, unsigned int flags, | ||
2870 | unsigned long *prot) | ||
2871 | { | ||
2872 | pgd_t *pgd; | ||
2873 | pud_t *pud; | ||
2874 | pmd_t *pmd; | ||
2875 | pte_t *ptep, pte; | ||
2876 | spinlock_t *ptl; | ||
2877 | resource_size_t phys_addr = 0; | ||
2878 | struct mm_struct *mm = vma->vm_mm; | ||
2879 | |||
2880 | VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP))); | ||
2881 | |||
2882 | pgd = pgd_offset(mm, address); | ||
2883 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | ||
2884 | goto no_page_table; | ||
2885 | |||
2886 | pud = pud_offset(pgd, address); | ||
2887 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) | ||
2888 | goto no_page_table; | ||
2889 | |||
2890 | pmd = pmd_offset(pud, address); | ||
2891 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | ||
2892 | goto no_page_table; | ||
2893 | |||
2894 | /* We cannot handle huge page PFN maps. Luckily they don't exist. */ | ||
2895 | if (pmd_huge(*pmd)) | ||
2896 | goto no_page_table; | ||
2897 | |||
2898 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
2899 | if (!ptep) | ||
2900 | goto out; | ||
2901 | |||
2902 | pte = *ptep; | ||
2903 | if (!pte_present(pte)) | ||
2904 | goto unlock; | ||
2905 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | ||
2906 | goto unlock; | ||
2907 | phys_addr = pte_pfn(pte); | ||
2908 | phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ | ||
2909 | |||
2910 | *prot = pgprot_val(pte_pgprot(pte)); | ||
2911 | |||
2912 | unlock: | ||
2913 | pte_unmap_unlock(ptep, ptl); | ||
2914 | out: | ||
2915 | return phys_addr; | ||
2916 | no_page_table: | ||
2917 | return 0; | ||
2918 | } | ||
2919 | |||
2920 | int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, | ||
2921 | void *buf, int len, int write) | ||
2922 | { | ||
2923 | resource_size_t phys_addr; | ||
2924 | unsigned long prot = 0; | ||
2925 | void *maddr; | ||
2926 | int offset = addr & (PAGE_SIZE-1); | ||
2927 | |||
2928 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
2929 | return -EINVAL; | ||
2930 | |||
2931 | phys_addr = follow_phys(vma, addr, write, &prot); | ||
2932 | |||
2933 | if (!phys_addr) | ||
2934 | return -EINVAL; | ||
2935 | |||
2936 | maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); | ||
2937 | if (write) | ||
2938 | memcpy_toio(maddr + offset, buf, len); | ||
2939 | else | ||
2940 | memcpy_fromio(buf, maddr + offset, len); | ||
2941 | iounmap(maddr); | ||
2942 | |||
2943 | return len; | ||
2944 | } | ||
2945 | #endif | ||
2946 | |||
2807 | /* | 2947 | /* |
2808 | * Access another process' address space. | 2948 | * Access another process' address space. |
2809 | * Source/target buffer must be kernel space, | 2949 | * Source/target buffer must be kernel space, |
@@ -2813,7 +2953,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
2813 | { | 2953 | { |
2814 | struct mm_struct *mm; | 2954 | struct mm_struct *mm; |
2815 | struct vm_area_struct *vma; | 2955 | struct vm_area_struct *vma; |
2816 | struct page *page; | ||
2817 | void *old_buf = buf; | 2956 | void *old_buf = buf; |
2818 | 2957 | ||
2819 | mm = get_task_mm(tsk); | 2958 | mm = get_task_mm(tsk); |
@@ -2825,28 +2964,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
2825 | while (len) { | 2964 | while (len) { |
2826 | int bytes, ret, offset; | 2965 | int bytes, ret, offset; |
2827 | void *maddr; | 2966 | void *maddr; |
2967 | struct page *page = NULL; | ||
2828 | 2968 | ||
2829 | ret = get_user_pages(tsk, mm, addr, 1, | 2969 | ret = get_user_pages(tsk, mm, addr, 1, |
2830 | write, 1, &page, &vma); | 2970 | write, 1, &page, &vma); |
2831 | if (ret <= 0) | 2971 | if (ret <= 0) { |
2832 | break; | 2972 | /* |
2833 | 2973 | * Check if this is a VM_IO | VM_PFNMAP VMA, which | |
2834 | bytes = len; | 2974 | * we can access using slightly different code. |
2835 | offset = addr & (PAGE_SIZE-1); | 2975 | */ |
2836 | if (bytes > PAGE_SIZE-offset) | 2976 | #ifdef CONFIG_HAVE_IOREMAP_PROT |
2837 | bytes = PAGE_SIZE-offset; | 2977 | vma = find_vma(mm, addr); |
2838 | 2978 | if (!vma) | |
2839 | maddr = kmap(page); | 2979 | break; |
2840 | if (write) { | 2980 | if (vma->vm_ops && vma->vm_ops->access) |
2841 | copy_to_user_page(vma, page, addr, | 2981 | ret = vma->vm_ops->access(vma, addr, buf, |
2842 | maddr + offset, buf, bytes); | 2982 | len, write); |
2843 | set_page_dirty_lock(page); | 2983 | if (ret <= 0) |
2984 | #endif | ||
2985 | break; | ||
2986 | bytes = ret; | ||
2844 | } else { | 2987 | } else { |
2845 | copy_from_user_page(vma, page, addr, | 2988 | bytes = len; |
2846 | buf, maddr + offset, bytes); | 2989 | offset = addr & (PAGE_SIZE-1); |
2990 | if (bytes > PAGE_SIZE-offset) | ||
2991 | bytes = PAGE_SIZE-offset; | ||
2992 | |||
2993 | maddr = kmap(page); | ||
2994 | if (write) { | ||
2995 | copy_to_user_page(vma, page, addr, | ||
2996 | maddr + offset, buf, bytes); | ||
2997 | set_page_dirty_lock(page); | ||
2998 | } else { | ||
2999 | copy_from_user_page(vma, page, addr, | ||
3000 | buf, maddr + offset, bytes); | ||
3001 | } | ||
3002 | kunmap(page); | ||
3003 | page_cache_release(page); | ||
2847 | } | 3004 | } |
2848 | kunmap(page); | ||
2849 | page_cache_release(page); | ||
2850 | len -= bytes; | 3005 | len -= bytes; |
2851 | buf += bytes; | 3006 | buf += bytes; |
2852 | addr += bytes; | 3007 | addr += bytes; |
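The memory.c changes above add a vm_ops->access hook and a generic_access_phys() helper so that access_process_vm() can reach VM_IO/VM_PFNMAP mappings that get_user_pages() refuses. Below is a minimal sketch of how a driver might wire this up; only the ->access member and generic_access_phys() come from the hunks above (and only when CONFIG_HAVE_IOREMAP_PROT is set), while foo_mmap(), foo_vm_ops and FOO_REG_BASE are hypothetical names invented for illustration.

/*
 * Hypothetical driver mmap: map a device register window with
 * remap_pfn_range() and let generic_access_phys() service ptrace /
 * /proc/<pid>/mem accesses to it via the new vm_ops->access hook.
 */
#include <linux/fs.h>
#include <linux/mm.h>

#define FOO_REG_BASE	0xfed00000UL		/* made-up physical base */

static struct vm_operations_struct foo_vm_ops = {
	.access	= generic_access_phys,		/* CONFIG_HAVE_IOREMAP_PROT only */
};

static int foo_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	if (remap_pfn_range(vma, vma->vm_start, FOO_REG_BASE >> PAGE_SHIFT,
			    size, vma->vm_page_prot))
		return -EAGAIN;

	vma->vm_ops = &foo_vm_ops;		/* debuggers can now peek/poke */
	return 0;
}

Before this series, such a mapping could not be read through access_process_vm() at all, so a debugger attached to the process saw nothing useful in that window.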
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 833f854eabe5..6837a1014372 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/delay.h> | 26 | #include <linux/delay.h> |
27 | #include <linux/migrate.h> | 27 | #include <linux/migrate.h> |
28 | #include <linux/page-isolation.h> | 28 | #include <linux/page-isolation.h> |
29 | #include <linux/pfn.h> | ||
29 | 30 | ||
30 | #include <asm/tlbflush.h> | 31 | #include <asm/tlbflush.h> |
31 | 32 | ||
@@ -62,9 +63,9 @@ static void release_memory_resource(struct resource *res) | |||
62 | 63 | ||
63 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE | 64 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE |
64 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 65 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
65 | static void get_page_bootmem(unsigned long info, struct page *page, int magic) | 66 | static void get_page_bootmem(unsigned long info, struct page *page, int type) |
66 | { | 67 | { |
67 | atomic_set(&page->_mapcount, magic); | 68 | atomic_set(&page->_mapcount, type); |
68 | SetPagePrivate(page); | 69 | SetPagePrivate(page); |
69 | set_page_private(page, info); | 70 | set_page_private(page, info); |
70 | atomic_inc(&page->_count); | 71 | atomic_inc(&page->_count); |
@@ -72,10 +73,10 @@ static void get_page_bootmem(unsigned long info, struct page *page, int magic) | |||
72 | 73 | ||
73 | void put_page_bootmem(struct page *page) | 74 | void put_page_bootmem(struct page *page) |
74 | { | 75 | { |
75 | int magic; | 76 | int type; |
76 | 77 | ||
77 | magic = atomic_read(&page->_mapcount); | 78 | type = atomic_read(&page->_mapcount); |
78 | BUG_ON(magic >= -1); | 79 | BUG_ON(type >= -1); |
79 | 80 | ||
80 | if (atomic_dec_return(&page->_count) == 1) { | 81 | if (atomic_dec_return(&page->_count) == 1) { |
81 | ClearPagePrivate(page); | 82 | ClearPagePrivate(page); |
@@ -86,7 +87,7 @@ void put_page_bootmem(struct page *page) | |||
86 | 87 | ||
87 | } | 88 | } |
88 | 89 | ||
89 | void register_page_bootmem_info_section(unsigned long start_pfn) | 90 | static void register_page_bootmem_info_section(unsigned long start_pfn) |
90 | { | 91 | { |
91 | unsigned long *usemap, mapsize, section_nr, i; | 92 | unsigned long *usemap, mapsize, section_nr, i; |
92 | struct mem_section *ms; | 93 | struct mem_section *ms; |
@@ -119,7 +120,7 @@ void register_page_bootmem_info_section(unsigned long start_pfn) | |||
119 | mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; | 120 | mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; |
120 | 121 | ||
121 | for (i = 0; i < mapsize; i++, page++) | 122 | for (i = 0; i < mapsize; i++, page++) |
122 | get_page_bootmem(section_nr, page, MIX_INFO); | 123 | get_page_bootmem(section_nr, page, MIX_SECTION_INFO); |
123 | 124 | ||
124 | } | 125 | } |
125 | 126 | ||
@@ -323,11 +324,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
323 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); | 324 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); |
324 | BUG_ON(nr_pages % PAGES_PER_SECTION); | 325 | BUG_ON(nr_pages % PAGES_PER_SECTION); |
325 | 326 | ||
326 | release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); | ||
327 | |||
328 | sections_to_remove = nr_pages / PAGES_PER_SECTION; | 327 | sections_to_remove = nr_pages / PAGES_PER_SECTION; |
329 | for (i = 0; i < sections_to_remove; i++) { | 328 | for (i = 0; i < sections_to_remove; i++) { |
330 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; | 329 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; |
330 | release_mem_region(pfn << PAGE_SHIFT, | ||
331 | PAGES_PER_SECTION << PAGE_SHIFT); | ||
331 | ret = __remove_section(zone, __pfn_to_section(pfn)); | 332 | ret = __remove_section(zone, __pfn_to_section(pfn)); |
332 | if (ret) | 333 | if (ret) |
333 | break; | 334 | break; |
@@ -429,7 +430,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
429 | 430 | ||
430 | if (need_zonelists_rebuild) | 431 | if (need_zonelists_rebuild) |
431 | build_all_zonelists(); | 432 | build_all_zonelists(); |
432 | vm_total_pages = nr_free_pagecache_pages(); | 433 | else |
434 | vm_total_pages = nr_free_pagecache_pages(); | ||
435 | |||
433 | writeback_set_ratelimit(); | 436 | writeback_set_ratelimit(); |
434 | 437 | ||
435 | if (onlined_pages) | 438 | if (onlined_pages) |
@@ -455,7 +458,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start) | |||
455 | /* we can use NODE_DATA(nid) from here */ | 458 | /* we can use NODE_DATA(nid) from here */ |
456 | 459 | ||
457 | /* init node's zones as empty zones, we don't have any present pages.*/ | 460 | /* init node's zones as empty zones, we don't have any present pages.*/ |
458 | free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); | 461 | free_area_init_node(nid, zones_size, start_pfn, zholes_size); |
459 | 462 | ||
460 | return pgdat; | 463 | return pgdat; |
461 | } | 464 | } |
@@ -521,6 +524,66 @@ EXPORT_SYMBOL_GPL(add_memory); | |||
521 | 524 | ||
522 | #ifdef CONFIG_MEMORY_HOTREMOVE | 525 | #ifdef CONFIG_MEMORY_HOTREMOVE |
523 | /* | 526 | /* |
527 | * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy | ||
528 | * set and the size of the free page is given by page_order(). Using this, | ||
529 | * the function determines if the pageblock contains only free pages. | ||
530 | * Due to buddy constraints, a free page at least the size of a pageblock will | ||
531 | * be located at the start of the pageblock. | ||
532 | */ | ||
533 | static inline int pageblock_free(struct page *page) | ||
534 | { | ||
535 | return PageBuddy(page) && page_order(page) >= pageblock_order; | ||
536 | } | ||
537 | |||
538 | /* Return the start of the next active pageblock after a given page */ | ||
539 | static struct page *next_active_pageblock(struct page *page) | ||
540 | { | ||
541 | int pageblocks_stride; | ||
542 | |||
543 | /* Ensure the starting page is pageblock-aligned */ | ||
544 | BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); | ||
545 | |||
546 | /* Move forward by at least 1 * pageblock_nr_pages */ | ||
547 | pageblocks_stride = 1; | ||
548 | |||
549 | /* If the entire pageblock is free, move to the end of free page */ | ||
550 | if (pageblock_free(page)) | ||
551 | pageblocks_stride += page_order(page) - pageblock_order; | ||
552 | |||
553 | return page + (pageblocks_stride * pageblock_nr_pages); | ||
554 | } | ||
555 | |||
556 | /* Checks if this range of memory is likely to be hot-removable. */ | ||
557 | int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) | ||
558 | { | ||
559 | int type; | ||
560 | struct page *page = pfn_to_page(start_pfn); | ||
561 | struct page *end_page = page + nr_pages; | ||
562 | |||
563 | /* Check the starting page of each pageblock within the range */ | ||
564 | for (; page < end_page; page = next_active_pageblock(page)) { | ||
565 | type = get_pageblock_migratetype(page); | ||
566 | |||
567 | /* | ||
568 | * A pageblock containing MOVABLE or free pages is considered | ||
569 | * removable | ||
570 | */ | ||
571 | if (type != MIGRATE_MOVABLE && !pageblock_free(page)) | ||
572 | return 0; | ||
573 | |||
574 | /* | ||
575 | * A pageblock starting with a PageReserved page is not | ||
576 | * considered removable. | ||
577 | */ | ||
578 | if (PageReserved(page)) | ||
579 | return 0; | ||
580 | } | ||
581 | |||
582 | /* All pageblocks in the memory block are likely to be hot-removable */ | ||
583 | return 1; | ||
584 | } | ||
585 | |||
586 | /* | ||
524 | * Confirm all pages in a range [start, end) belong to the same zone. | 587 | * Confirm all pages in a range [start, end) belong to the same zone. |
525 | */ | 588 | */ |
526 | static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) | 589 | static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) |
@@ -595,8 +658,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
595 | * We can skip free pages. And we can only deal with pages on | 658 | * We can skip free pages. And we can only deal with pages on |
596 | * LRU. | 659 | * LRU. |
597 | */ | 660 | */ |
598 | ret = isolate_lru_page(page, &source); | 661 | ret = isolate_lru_page(page); |
599 | if (!ret) { /* Success */ | 662 | if (!ret) { /* Success */ |
663 | list_add_tail(&page->lru, &source); | ||
600 | move_pages--; | 664 | move_pages--; |
601 | } else { | 665 | } else { |
602 | /* Because we don't have big zone->lock, we should | 666 | /* Because we don't have big zone->lock, we should |
@@ -787,10 +851,19 @@ failed_removal: | |||
787 | 851 | ||
788 | return ret; | 852 | return ret; |
789 | } | 853 | } |
854 | |||
855 | int remove_memory(u64 start, u64 size) | ||
856 | { | ||
857 | unsigned long start_pfn, end_pfn; | ||
858 | |||
859 | start_pfn = PFN_DOWN(start); | ||
860 | end_pfn = start_pfn + PFN_DOWN(size); | ||
861 | return offline_pages(start_pfn, end_pfn, 120 * HZ); | ||
862 | } | ||
790 | #else | 863 | #else |
791 | int remove_memory(u64 start, u64 size) | 864 | int remove_memory(u64 start, u64 size) |
792 | { | 865 | { |
793 | return -EINVAL; | 866 | return -EINVAL; |
794 | } | 867 | } |
795 | EXPORT_SYMBOL_GPL(remove_memory); | ||
796 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 868 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
869 | EXPORT_SYMBOL_GPL(remove_memory); | ||
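is_mem_section_removable() above only gives a heuristic answer (every pageblock free or MIGRATE_MOVABLE, nothing reserved); the actual offlining is still done by offline_pages() via the new remove_memory() wrapper. A companion driver-core patch in the same series is expected to export this check as a per-memory-block "removable" attribute; the sketch below is a hedged userspace probe that assumes that sysfs layout.

/*
 * Hypothetical probe: read the per-block "removable" flag. The sysfs
 * path is an assumption about the companion driver-core change, not
 * something defined by the mm/ hunks shown here.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/memory/memory0/removable";
	FILE *f = fopen(path, "r");
	int removable = 0;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d", &removable) != 1)
		removable = 0;
	fclose(f);

	printf("%s: %s\n", path,
	       removable ? "likely hot-removable" : "not hot-removable");
	return 0;
}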
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c94e58b192c3..36f42573a335 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,6 +93,8 @@ | |||
93 | #include <asm/tlbflush.h> | 93 | #include <asm/tlbflush.h> |
94 | #include <asm/uaccess.h> | 94 | #include <asm/uaccess.h> |
95 | 95 | ||
96 | #include "internal.h" | ||
97 | |||
96 | /* Internal flags */ | 98 | /* Internal flags */ |
97 | #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ | 99 | #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ |
98 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ | 100 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ |
@@ -762,8 +764,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
762 | /* | 764 | /* |
763 | * Avoid migrating a page that is shared with others. | 765 | * Avoid migrating a page that is shared with others. |
764 | */ | 766 | */ |
765 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) | 767 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { |
766 | isolate_lru_page(page, pagelist); | 768 | if (!isolate_lru_page(page)) { |
769 | list_add_tail(&page->lru, pagelist); | ||
770 | } | ||
771 | } | ||
767 | } | 772 | } |
768 | 773 | ||
769 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) | 774 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) |
@@ -803,7 +808,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
803 | int do_migrate_pages(struct mm_struct *mm, | 808 | int do_migrate_pages(struct mm_struct *mm, |
804 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | 809 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) |
805 | { | 810 | { |
806 | LIST_HEAD(pagelist); | ||
807 | int busy = 0; | 811 | int busy = 0; |
808 | int err = 0; | 812 | int err = 0; |
809 | nodemask_t tmp; | 813 | nodemask_t tmp; |
@@ -1481,7 +1485,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | |||
1481 | 1485 | ||
1482 | if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { | 1486 | if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { |
1483 | zl = node_zonelist(interleave_nid(*mpol, vma, addr, | 1487 | zl = node_zonelist(interleave_nid(*mpol, vma, addr, |
1484 | HPAGE_SHIFT), gfp_flags); | 1488 | huge_page_shift(hstate_vma(vma))), gfp_flags); |
1485 | } else { | 1489 | } else { |
1486 | zl = policy_zonelist(gfp_flags, *mpol); | 1490 | zl = policy_zonelist(gfp_flags, *mpol); |
1487 | if ((*mpol)->mode == MPOL_BIND) | 1491 | if ((*mpol)->mode == MPOL_BIND) |
@@ -2198,7 +2202,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty) | |||
2198 | if (PageSwapCache(page)) | 2202 | if (PageSwapCache(page)) |
2199 | md->swapcache++; | 2203 | md->swapcache++; |
2200 | 2204 | ||
2201 | if (PageActive(page)) | 2205 | if (PageActive(page) || PageUnevictable(page)) |
2202 | md->active++; | 2206 | md->active++; |
2203 | 2207 | ||
2204 | if (PageWriteback(page)) | 2208 | if (PageWriteback(page)) |
@@ -2220,9 +2224,12 @@ static void check_huge_range(struct vm_area_struct *vma, | |||
2220 | { | 2224 | { |
2221 | unsigned long addr; | 2225 | unsigned long addr; |
2222 | struct page *page; | 2226 | struct page *page; |
2227 | struct hstate *h = hstate_vma(vma); | ||
2228 | unsigned long sz = huge_page_size(h); | ||
2223 | 2229 | ||
2224 | for (addr = start; addr < end; addr += HPAGE_SIZE) { | 2230 | for (addr = start; addr < end; addr += sz) { |
2225 | pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); | 2231 | pte_t *ptep = huge_pte_offset(vma->vm_mm, |
2232 | addr & huge_page_mask(h)); | ||
2226 | pte_t pte; | 2233 | pte_t pte; |
2227 | 2234 | ||
2228 | if (!ptep) | 2235 | if (!ptep) |
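The mempolicy.c hunks drop the compile-time HPAGE_SHIFT/HPAGE_SIZE constants in favour of per-vma hstate accessors, which is what lets interleaving and /proc/<pid>/numa_maps cope with more than one hugepage size. The sketch below is a deliberately reduced userspace model of what those accessors compute from the hstate order; the real definitions live in include/linux/hugetlb.h and carry more state than this two-field stand-in.

/*
 * Reduced model of the hstate size/shift/mask accessors, assuming a
 * 4K base page. "hstate_model" is an illustrative stand-in, not the
 * kernel's struct hstate.
 */
#include <stdio.h>

#define MODEL_PAGE_SHIFT 12
#define MODEL_PAGE_SIZE  (1UL << MODEL_PAGE_SHIFT)

struct hstate_model {
	unsigned int order;			/* hugepage = base page << order */
};

static unsigned long huge_page_size(const struct hstate_model *h)
{
	return MODEL_PAGE_SIZE << h->order;
}

static unsigned int huge_page_shift(const struct hstate_model *h)
{
	return MODEL_PAGE_SHIFT + h->order;
}

static unsigned long huge_page_mask(const struct hstate_model *h)
{
	return ~(huge_page_size(h) - 1);
}

int main(void)
{
	struct hstate_model h2m = { .order = 9 };	/* 2 MB hugepages */
	struct hstate_model h1g = { .order = 18 };	/* 1 GB hugepages */

	printf("2M: size %lu shift %u mask %#lx\n",
	       huge_page_size(&h2m), huge_page_shift(&h2m), huge_page_mask(&h2m));
	printf("1G: size %lu shift %u mask %#lx\n",
	       huge_page_size(&h1g), huge_page_shift(&h1g), huge_page_mask(&h1g));
	return 0;
}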
diff --git a/mm/migrate.c b/mm/migrate.c
index 55bd355d170d..6602941bfab0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -30,42 +30,13 @@ | |||
30 | #include <linux/vmalloc.h> | 30 | #include <linux/vmalloc.h> |
31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
32 | #include <linux/memcontrol.h> | 32 | #include <linux/memcontrol.h> |
33 | #include <linux/syscalls.h> | ||
33 | 34 | ||
34 | #include "internal.h" | 35 | #include "internal.h" |
35 | 36 | ||
36 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 37 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
37 | 38 | ||
38 | /* | 39 | /* |
39 | * Isolate one page from the LRU lists. If successful put it onto | ||
40 | * the indicated list with elevated page count. | ||
41 | * | ||
42 | * Result: | ||
43 | * -EBUSY: page not on LRU list | ||
44 | * 0: page removed from LRU list and added to the specified list. | ||
45 | */ | ||
46 | int isolate_lru_page(struct page *page, struct list_head *pagelist) | ||
47 | { | ||
48 | int ret = -EBUSY; | ||
49 | |||
50 | if (PageLRU(page)) { | ||
51 | struct zone *zone = page_zone(page); | ||
52 | |||
53 | spin_lock_irq(&zone->lru_lock); | ||
54 | if (PageLRU(page) && get_page_unless_zero(page)) { | ||
55 | ret = 0; | ||
56 | ClearPageLRU(page); | ||
57 | if (PageActive(page)) | ||
58 | del_page_from_active_list(zone, page); | ||
59 | else | ||
60 | del_page_from_inactive_list(zone, page); | ||
61 | list_add_tail(&page->lru, pagelist); | ||
62 | } | ||
63 | spin_unlock_irq(&zone->lru_lock); | ||
64 | } | ||
65 | return ret; | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * migrate_prep() needs to be called before we start compiling a list of pages | 40 | * migrate_prep() needs to be called before we start compiling a list of pages |
70 | * to be migrated using isolate_lru_page(). | 41 | * to be migrated using isolate_lru_page(). |
71 | */ | 42 | */ |
@@ -82,23 +53,9 @@ int migrate_prep(void) | |||
82 | return 0; | 53 | return 0; |
83 | } | 54 | } |
84 | 55 | ||
85 | static inline void move_to_lru(struct page *page) | ||
86 | { | ||
87 | if (PageActive(page)) { | ||
88 | /* | ||
89 | * lru_cache_add_active checks that | ||
90 | * the PG_active bit is off. | ||
91 | */ | ||
92 | ClearPageActive(page); | ||
93 | lru_cache_add_active(page); | ||
94 | } else { | ||
95 | lru_cache_add(page); | ||
96 | } | ||
97 | put_page(page); | ||
98 | } | ||
99 | |||
100 | /* | 56 | /* |
101 | * Add isolated pages on the list back to the LRU. | 57 | * Add isolated pages on the list back to the LRU under page lock |
58 | * to avoid leaking evictable pages back onto the unevictable list. | ||
102 | * | 59 | * |
103 | * returns the number of pages put back. | 60 | * returns the number of pages put back. |
104 | */ | 61 | */ |
@@ -110,7 +67,7 @@ int putback_lru_pages(struct list_head *l) | |||
110 | 67 | ||
111 | list_for_each_entry_safe(page, page2, l, lru) { | 68 | list_for_each_entry_safe(page, page2, l, lru) { |
112 | list_del(&page->lru); | 69 | list_del(&page->lru); |
113 | move_to_lru(page); | 70 | putback_lru_page(page); |
114 | count++; | 71 | count++; |
115 | } | 72 | } |
116 | return count; | 73 | return count; |
@@ -284,7 +241,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | |||
284 | 241 | ||
285 | page = migration_entry_to_page(entry); | 242 | page = migration_entry_to_page(entry); |
286 | 243 | ||
287 | get_page(page); | 244 | /* |
245 | * Once radix-tree replacement of page migration started, page_count | ||
246 | * *must* be zero. And, we don't want to call wait_on_page_locked() | ||
247 | * against a page without get_page(). | ||
248 | * So, we use get_page_unless_zero() here. Even if it fails, the page | ||
249 | * fault will occur again. | ||
250 | */ | ||
251 | if (!get_page_unless_zero(page)) | ||
252 | goto out; | ||
288 | pte_unmap_unlock(ptep, ptl); | 253 | pte_unmap_unlock(ptep, ptl); |
289 | wait_on_page_locked(page); | 254 | wait_on_page_locked(page); |
290 | put_page(page); | 255 | put_page(page); |
@@ -304,6 +269,7 @@ out: | |||
304 | static int migrate_page_move_mapping(struct address_space *mapping, | 269 | static int migrate_page_move_mapping(struct address_space *mapping, |
305 | struct page *newpage, struct page *page) | 270 | struct page *newpage, struct page *page) |
306 | { | 271 | { |
272 | int expected_count; | ||
307 | void **pslot; | 273 | void **pslot; |
308 | 274 | ||
309 | if (!mapping) { | 275 | if (!mapping) { |
@@ -313,14 +279,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
313 | return 0; | 279 | return 0; |
314 | } | 280 | } |
315 | 281 | ||
316 | write_lock_irq(&mapping->tree_lock); | 282 | spin_lock_irq(&mapping->tree_lock); |
317 | 283 | ||
318 | pslot = radix_tree_lookup_slot(&mapping->page_tree, | 284 | pslot = radix_tree_lookup_slot(&mapping->page_tree, |
319 | page_index(page)); | 285 | page_index(page)); |
320 | 286 | ||
321 | if (page_count(page) != 2 + !!PagePrivate(page) || | 287 | expected_count = 2 + !!PagePrivate(page); |
288 | if (page_count(page) != expected_count || | ||
322 | (struct page *)radix_tree_deref_slot(pslot) != page) { | 289 | (struct page *)radix_tree_deref_slot(pslot) != page) { |
323 | write_unlock_irq(&mapping->tree_lock); | 290 | spin_unlock_irq(&mapping->tree_lock); |
291 | return -EAGAIN; | ||
292 | } | ||
293 | |||
294 | if (!page_freeze_refs(page, expected_count)) { | ||
295 | spin_unlock_irq(&mapping->tree_lock); | ||
324 | return -EAGAIN; | 296 | return -EAGAIN; |
325 | } | 297 | } |
326 | 298 | ||
@@ -337,6 +309,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
337 | 309 | ||
338 | radix_tree_replace_slot(pslot, newpage); | 310 | radix_tree_replace_slot(pslot, newpage); |
339 | 311 | ||
312 | page_unfreeze_refs(page, expected_count); | ||
340 | /* | 313 | /* |
341 | * Drop cache reference from old page. | 314 | * Drop cache reference from old page. |
342 | * We know this isn't the last reference. | 315 | * We know this isn't the last reference. |
@@ -356,7 +329,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
356 | __dec_zone_page_state(page, NR_FILE_PAGES); | 329 | __dec_zone_page_state(page, NR_FILE_PAGES); |
357 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | 330 | __inc_zone_page_state(newpage, NR_FILE_PAGES); |
358 | 331 | ||
359 | write_unlock_irq(&mapping->tree_lock); | 332 | spin_unlock_irq(&mapping->tree_lock); |
360 | 333 | ||
361 | return 0; | 334 | return 0; |
362 | } | 335 | } |
@@ -366,6 +339,8 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
366 | */ | 339 | */ |
367 | static void migrate_page_copy(struct page *newpage, struct page *page) | 340 | static void migrate_page_copy(struct page *newpage, struct page *page) |
368 | { | 341 | { |
342 | int anon; | ||
343 | |||
369 | copy_highpage(newpage, page); | 344 | copy_highpage(newpage, page); |
370 | 345 | ||
371 | if (PageError(page)) | 346 | if (PageError(page)) |
@@ -374,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
374 | SetPageReferenced(newpage); | 349 | SetPageReferenced(newpage); |
375 | if (PageUptodate(page)) | 350 | if (PageUptodate(page)) |
376 | SetPageUptodate(newpage); | 351 | SetPageUptodate(newpage); |
377 | if (PageActive(page)) | 352 | if (TestClearPageActive(page)) { |
353 | VM_BUG_ON(PageUnevictable(page)); | ||
378 | SetPageActive(newpage); | 354 | SetPageActive(newpage); |
355 | } else | ||
356 | unevictable_migrate_page(newpage, page); | ||
379 | if (PageChecked(page)) | 357 | if (PageChecked(page)) |
380 | SetPageChecked(newpage); | 358 | SetPageChecked(newpage); |
381 | if (PageMappedToDisk(page)) | 359 | if (PageMappedToDisk(page)) |
@@ -393,14 +371,20 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
393 | __set_page_dirty_nobuffers(newpage); | 371 | __set_page_dirty_nobuffers(newpage); |
394 | } | 372 | } |
395 | 373 | ||
374 | mlock_migrate_page(newpage, page); | ||
375 | |||
396 | #ifdef CONFIG_SWAP | 376 | #ifdef CONFIG_SWAP |
397 | ClearPageSwapCache(page); | 377 | ClearPageSwapCache(page); |
398 | #endif | 378 | #endif |
399 | ClearPageActive(page); | ||
400 | ClearPagePrivate(page); | 379 | ClearPagePrivate(page); |
401 | set_page_private(page, 0); | 380 | set_page_private(page, 0); |
381 | /* page->mapping contains a flag for PageAnon() */ | ||
382 | anon = PageAnon(page); | ||
402 | page->mapping = NULL; | 383 | page->mapping = NULL; |
403 | 384 | ||
385 | if (!anon) /* This page was removed from radix-tree. */ | ||
386 | mem_cgroup_uncharge_cache_page(page); | ||
387 | |||
404 | /* | 388 | /* |
405 | * If any waiters have accumulated on the new page then | 389 | * If any waiters have accumulated on the new page then |
406 | * wake them up. | 390 | * wake them up. |
@@ -575,6 +559,10 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
575 | * | 559 | * |
576 | * The new page will have replaced the old page if this function | 560 | * The new page will have replaced the old page if this function |
577 | * is successful. | 561 | * is successful. |
562 | * | ||
563 | * Return value: | ||
564 | * < 0 - error code | ||
565 | * == 0 - success | ||
578 | */ | 566 | */ |
579 | static int move_to_new_page(struct page *newpage, struct page *page) | 567 | static int move_to_new_page(struct page *newpage, struct page *page) |
580 | { | 568 | { |
@@ -586,12 +574,14 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
586 | * establishing additional references. We are the only one | 574 | * establishing additional references. We are the only one |
587 | * holding a reference to the new page at this point. | 575 | * holding a reference to the new page at this point. |
588 | */ | 576 | */ |
589 | if (TestSetPageLocked(newpage)) | 577 | if (!trylock_page(newpage)) |
590 | BUG(); | 578 | BUG(); |
591 | 579 | ||
592 | /* Prepare mapping for the new page.*/ | 580 | /* Prepare mapping for the new page.*/ |
593 | newpage->index = page->index; | 581 | newpage->index = page->index; |
594 | newpage->mapping = page->mapping; | 582 | newpage->mapping = page->mapping; |
583 | if (PageSwapBacked(page)) | ||
584 | SetPageSwapBacked(newpage); | ||
595 | 585 | ||
596 | mapping = page_mapping(page); | 586 | mapping = page_mapping(page); |
597 | if (!mapping) | 587 | if (!mapping) |
@@ -610,7 +600,6 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
610 | rc = fallback_migrate_page(mapping, newpage, page); | 600 | rc = fallback_migrate_page(mapping, newpage, page); |
611 | 601 | ||
612 | if (!rc) { | 602 | if (!rc) { |
613 | mem_cgroup_page_migration(page, newpage); | ||
614 | remove_migration_ptes(page, newpage); | 603 | remove_migration_ptes(page, newpage); |
615 | } else | 604 | } else |
616 | newpage->mapping = NULL; | 605 | newpage->mapping = NULL; |
@@ -636,12 +625,21 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
636 | if (!newpage) | 625 | if (!newpage) |
637 | return -ENOMEM; | 626 | return -ENOMEM; |
638 | 627 | ||
639 | if (page_count(page) == 1) | 628 | if (page_count(page) == 1) { |
640 | /* page was freed from under us. So we are done. */ | 629 | /* page was freed from under us. So we are done. */ |
641 | goto move_newpage; | 630 | goto move_newpage; |
631 | } | ||
632 | |||
633 | charge = mem_cgroup_prepare_migration(page, newpage); | ||
634 | if (charge == -ENOMEM) { | ||
635 | rc = -ENOMEM; | ||
636 | goto move_newpage; | ||
637 | } | ||
638 | /* prepare cgroup just returns 0 or -ENOMEM */ | ||
639 | BUG_ON(charge); | ||
642 | 640 | ||
643 | rc = -EAGAIN; | 641 | rc = -EAGAIN; |
644 | if (TestSetPageLocked(page)) { | 642 | if (!trylock_page(page)) { |
645 | if (!force) | 643 | if (!force) |
646 | goto move_newpage; | 644 | goto move_newpage; |
647 | lock_page(page); | 645 | lock_page(page); |
@@ -691,25 +689,19 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
691 | goto rcu_unlock; | 689 | goto rcu_unlock; |
692 | } | 690 | } |
693 | 691 | ||
694 | charge = mem_cgroup_prepare_migration(page); | ||
695 | /* Establish migration ptes or remove ptes */ | 692 | /* Establish migration ptes or remove ptes */ |
696 | try_to_unmap(page, 1); | 693 | try_to_unmap(page, 1); |
697 | 694 | ||
698 | if (!page_mapped(page)) | 695 | if (!page_mapped(page)) |
699 | rc = move_to_new_page(newpage, page); | 696 | rc = move_to_new_page(newpage, page); |
700 | 697 | ||
701 | if (rc) { | 698 | if (rc) |
702 | remove_migration_ptes(page, page); | 699 | remove_migration_ptes(page, page); |
703 | if (charge) | ||
704 | mem_cgroup_end_migration(page); | ||
705 | } else if (charge) | ||
706 | mem_cgroup_end_migration(newpage); | ||
707 | rcu_unlock: | 700 | rcu_unlock: |
708 | if (rcu_locked) | 701 | if (rcu_locked) |
709 | rcu_read_unlock(); | 702 | rcu_read_unlock(); |
710 | 703 | ||
711 | unlock: | 704 | unlock: |
712 | |||
713 | unlock_page(page); | 705 | unlock_page(page); |
714 | 706 | ||
715 | if (rc != -EAGAIN) { | 707 | if (rc != -EAGAIN) { |
@@ -720,15 +712,19 @@ unlock: | |||
720 | * restored. | 712 | * restored. |
721 | */ | 713 | */ |
722 | list_del(&page->lru); | 714 | list_del(&page->lru); |
723 | move_to_lru(page); | 715 | putback_lru_page(page); |
724 | } | 716 | } |
725 | 717 | ||
726 | move_newpage: | 718 | move_newpage: |
719 | if (!charge) | ||
720 | mem_cgroup_end_migration(newpage); | ||
721 | |||
727 | /* | 722 | /* |
728 | * Move the new page to the LRU. If migration was not successful | 723 | * Move the new page to the LRU. If migration was not successful |
729 | * then this will free the page. | 724 | * then this will free the page. |
730 | */ | 725 | */ |
731 | move_to_lru(newpage); | 726 | putback_lru_page(newpage); |
727 | |||
732 | if (result) { | 728 | if (result) { |
733 | if (rc) | 729 | if (rc) |
734 | *result = rc; | 730 | *result = rc; |
@@ -835,9 +831,11 @@ static struct page *new_page_node(struct page *p, unsigned long private, | |||
835 | * Move a set of pages as indicated in the pm array. The addr | 831 | * Move a set of pages as indicated in the pm array. The addr |
836 | * field must be set to the virtual address of the page to be moved | 832 | * field must be set to the virtual address of the page to be moved |
837 | * and the node number must contain a valid target node. | 833 | * and the node number must contain a valid target node. |
834 | * The pm array ends with node = MAX_NUMNODES. | ||
838 | */ | 835 | */ |
839 | static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, | 836 | static int do_move_page_to_node_array(struct mm_struct *mm, |
840 | int migrate_all) | 837 | struct page_to_node *pm, |
838 | int migrate_all) | ||
841 | { | 839 | { |
842 | int err; | 840 | int err; |
843 | struct page_to_node *pp; | 841 | struct page_to_node *pp; |
@@ -891,7 +889,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, | |||
891 | !migrate_all) | 889 | !migrate_all) |
892 | goto put_and_set; | 890 | goto put_and_set; |
893 | 891 | ||
894 | err = isolate_lru_page(page, &pagelist); | 892 | err = isolate_lru_page(page); |
893 | if (!err) | ||
894 | list_add_tail(&page->lru, &pagelist); | ||
895 | put_and_set: | 895 | put_and_set: |
896 | /* | 896 | /* |
897 | * Either remove the duplicate refcount from | 897 | * Either remove the duplicate refcount from |
@@ -903,36 +903,118 @@ set_status: | |||
903 | pp->status = err; | 903 | pp->status = err; |
904 | } | 904 | } |
905 | 905 | ||
906 | err = 0; | ||
906 | if (!list_empty(&pagelist)) | 907 | if (!list_empty(&pagelist)) |
907 | err = migrate_pages(&pagelist, new_page_node, | 908 | err = migrate_pages(&pagelist, new_page_node, |
908 | (unsigned long)pm); | 909 | (unsigned long)pm); |
909 | else | ||
910 | err = -ENOENT; | ||
911 | 910 | ||
912 | up_read(&mm->mmap_sem); | 911 | up_read(&mm->mmap_sem); |
913 | return err; | 912 | return err; |
914 | } | 913 | } |
915 | 914 | ||
916 | /* | 915 | /* |
917 | * Determine the nodes of a list of pages. The addr in the pm array | 916 | * Migrate an array of page addresses onto an array of nodes and fill |
918 | * must have been set to the virtual address of which we want to determine | 917 | * the corresponding array of status. |
919 | * the node number. | ||
920 | */ | 918 | */ |
921 | static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) | 919 | static int do_pages_move(struct mm_struct *mm, struct task_struct *task, |
920 | unsigned long nr_pages, | ||
921 | const void __user * __user *pages, | ||
922 | const int __user *nodes, | ||
923 | int __user *status, int flags) | ||
922 | { | 924 | { |
925 | struct page_to_node *pm = NULL; | ||
926 | nodemask_t task_nodes; | ||
927 | int err = 0; | ||
928 | int i; | ||
929 | |||
930 | task_nodes = cpuset_mems_allowed(task); | ||
931 | |||
932 | /* Limit nr_pages so that the multiplication may not overflow */ | ||
933 | if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { | ||
934 | err = -E2BIG; | ||
935 | goto out; | ||
936 | } | ||
937 | |||
938 | pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); | ||
939 | if (!pm) { | ||
940 | err = -ENOMEM; | ||
941 | goto out; | ||
942 | } | ||
943 | |||
944 | /* | ||
945 | * Get parameters from user space and initialize the pm | ||
946 | * array. Return various errors if the user did something wrong. | ||
947 | */ | ||
948 | for (i = 0; i < nr_pages; i++) { | ||
949 | const void __user *p; | ||
950 | |||
951 | err = -EFAULT; | ||
952 | if (get_user(p, pages + i)) | ||
953 | goto out_pm; | ||
954 | |||
955 | pm[i].addr = (unsigned long)p; | ||
956 | if (nodes) { | ||
957 | int node; | ||
958 | |||
959 | if (get_user(node, nodes + i)) | ||
960 | goto out_pm; | ||
961 | |||
962 | err = -ENODEV; | ||
963 | if (!node_state(node, N_HIGH_MEMORY)) | ||
964 | goto out_pm; | ||
965 | |||
966 | err = -EACCES; | ||
967 | if (!node_isset(node, task_nodes)) | ||
968 | goto out_pm; | ||
969 | |||
970 | pm[i].node = node; | ||
971 | } else | ||
972 | pm[i].node = 0; /* anything to not match MAX_NUMNODES */ | ||
973 | } | ||
974 | /* End marker */ | ||
975 | pm[nr_pages].node = MAX_NUMNODES; | ||
976 | |||
977 | err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL); | ||
978 | if (err >= 0) | ||
979 | /* Return status information */ | ||
980 | for (i = 0; i < nr_pages; i++) | ||
981 | if (put_user(pm[i].status, status + i)) | ||
982 | err = -EFAULT; | ||
983 | |||
984 | out_pm: | ||
985 | vfree(pm); | ||
986 | out: | ||
987 | return err; | ||
988 | } | ||
989 | |||
990 | /* | ||
991 | * Determine the nodes of an array of pages and store it in an array of status. | ||
992 | */ | ||
993 | static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, | ||
994 | const void __user * __user *pages, | ||
995 | int __user *status) | ||
996 | { | ||
997 | unsigned long i; | ||
998 | int err; | ||
999 | |||
923 | down_read(&mm->mmap_sem); | 1000 | down_read(&mm->mmap_sem); |
924 | 1001 | ||
925 | for ( ; pm->node != MAX_NUMNODES; pm++) { | 1002 | for (i = 0; i < nr_pages; i++) { |
1003 | const void __user *p; | ||
1004 | unsigned long addr; | ||
926 | struct vm_area_struct *vma; | 1005 | struct vm_area_struct *vma; |
927 | struct page *page; | 1006 | struct page *page; |
928 | int err; | ||
929 | 1007 | ||
930 | err = -EFAULT; | 1008 | err = -EFAULT; |
931 | vma = find_vma(mm, pm->addr); | 1009 | if (get_user(p, pages+i)) |
1010 | goto out; | ||
1011 | addr = (unsigned long) p; | ||
1012 | |||
1013 | vma = find_vma(mm, addr); | ||
932 | if (!vma) | 1014 | if (!vma) |
933 | goto set_status; | 1015 | goto set_status; |
934 | 1016 | ||
935 | page = follow_page(vma, pm->addr, 0); | 1017 | page = follow_page(vma, addr, 0); |
936 | 1018 | ||
937 | err = PTR_ERR(page); | 1019 | err = PTR_ERR(page); |
938 | if (IS_ERR(page)) | 1020 | if (IS_ERR(page)) |
@@ -945,11 +1027,13 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) | |||
945 | 1027 | ||
946 | err = page_to_nid(page); | 1028 | err = page_to_nid(page); |
947 | set_status: | 1029 | set_status: |
948 | pm->status = err; | 1030 | put_user(err, status+i); |
949 | } | 1031 | } |
1032 | err = 0; | ||
950 | 1033 | ||
1034 | out: | ||
951 | up_read(&mm->mmap_sem); | 1035 | up_read(&mm->mmap_sem); |
952 | return 0; | 1036 | return err; |
953 | } | 1037 | } |
954 | 1038 | ||
955 | /* | 1039 | /* |
@@ -961,12 +1045,9 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | |||
961 | const int __user *nodes, | 1045 | const int __user *nodes, |
962 | int __user *status, int flags) | 1046 | int __user *status, int flags) |
963 | { | 1047 | { |
964 | int err = 0; | ||
965 | int i; | ||
966 | struct task_struct *task; | 1048 | struct task_struct *task; |
967 | nodemask_t task_nodes; | ||
968 | struct mm_struct *mm; | 1049 | struct mm_struct *mm; |
969 | struct page_to_node *pm = NULL; | 1050 | int err; |
970 | 1051 | ||
971 | /* Check flags */ | 1052 | /* Check flags */ |
972 | if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) | 1053 | if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) |
@@ -998,79 +1079,24 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | |||
998 | (current->uid != task->suid) && (current->uid != task->uid) && | 1079 | (current->uid != task->suid) && (current->uid != task->uid) && |
999 | !capable(CAP_SYS_NICE)) { | 1080 | !capable(CAP_SYS_NICE)) { |
1000 | err = -EPERM; | 1081 | err = -EPERM; |
1001 | goto out2; | 1082 | goto out; |
1002 | } | 1083 | } |
1003 | 1084 | ||
1004 | err = security_task_movememory(task); | 1085 | err = security_task_movememory(task); |
1005 | if (err) | 1086 | if (err) |
1006 | goto out2; | 1087 | goto out; |
1007 | |||
1008 | |||
1009 | task_nodes = cpuset_mems_allowed(task); | ||
1010 | |||
1011 | /* Limit nr_pages so that the multiplication may not overflow */ | ||
1012 | if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { | ||
1013 | err = -E2BIG; | ||
1014 | goto out2; | ||
1015 | } | ||
1016 | |||
1017 | pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); | ||
1018 | if (!pm) { | ||
1019 | err = -ENOMEM; | ||
1020 | goto out2; | ||
1021 | } | ||
1022 | |||
1023 | /* | ||
1024 | * Get parameters from user space and initialize the pm | ||
1025 | * array. Return various errors if the user did something wrong. | ||
1026 | */ | ||
1027 | for (i = 0; i < nr_pages; i++) { | ||
1028 | const void __user *p; | ||
1029 | |||
1030 | err = -EFAULT; | ||
1031 | if (get_user(p, pages + i)) | ||
1032 | goto out; | ||
1033 | |||
1034 | pm[i].addr = (unsigned long)p; | ||
1035 | if (nodes) { | ||
1036 | int node; | ||
1037 | |||
1038 | if (get_user(node, nodes + i)) | ||
1039 | goto out; | ||
1040 | |||
1041 | err = -ENODEV; | ||
1042 | if (!node_state(node, N_HIGH_MEMORY)) | ||
1043 | goto out; | ||
1044 | |||
1045 | err = -EACCES; | ||
1046 | if (!node_isset(node, task_nodes)) | ||
1047 | goto out; | ||
1048 | 1088 | ||
1049 | pm[i].node = node; | 1089 | if (nodes) { |
1050 | } else | 1090 | err = do_pages_move(mm, task, nr_pages, pages, nodes, status, |
1051 | pm[i].node = 0; /* anything to not match MAX_NUMNODES */ | 1091 | flags); |
1092 | } else { | ||
1093 | err = do_pages_stat(mm, nr_pages, pages, status); | ||
1052 | } | 1094 | } |
1053 | /* End marker */ | ||
1054 | pm[nr_pages].node = MAX_NUMNODES; | ||
1055 | |||
1056 | if (nodes) | ||
1057 | err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); | ||
1058 | else | ||
1059 | err = do_pages_stat(mm, pm); | ||
1060 | |||
1061 | if (err >= 0) | ||
1062 | /* Return status information */ | ||
1063 | for (i = 0; i < nr_pages; i++) | ||
1064 | if (put_user(pm[i].status, status + i)) | ||
1065 | err = -EFAULT; | ||
1066 | 1095 | ||
1067 | out: | 1096 | out: |
1068 | vfree(pm); | ||
1069 | out2: | ||
1070 | mmput(mm); | 1097 | mmput(mm); |
1071 | return err; | 1098 | return err; |
1072 | } | 1099 | } |
1073 | #endif | ||
1074 | 1100 | ||
1075 | /* | 1101 | /* |
1076 | * Call migration functions in the vma_ops that may prepare | 1102 | * Call migration functions in the vma_ops that may prepare |
@@ -1092,3 +1118,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, | |||
1092 | } | 1118 | } |
1093 | return err; | 1119 | return err; |
1094 | } | 1120 | } |
1121 | #endif | ||
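The syscall body above is split so that a NULL nodes array means "report where each page lives" (the do_pages_stat() path) while a real array requests migration (the do_pages_move() path). A minimal userspace sketch of both modes, using the move_pages() wrapper from numaif.h (build with -lnuma; node 0 as the migration target is an arbitrary choice for the example):

/*
 * Query, then migrate, one anonymous page with move_pages(2).
 * pid 0 means "the calling process".
 */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	void *pages[1];
	int nodes[1] = { 0 };			/* target node 0 */
	int status[1] = { -1 };
	void *buf;

	if (posix_memalign(&buf, psz, psz))
		return 1;
	memset(buf, 0, psz);			/* fault the page in first */
	pages[0] = buf;

	/* nodes == NULL: status-only query (do_pages_stat() in the kernel) */
	if (move_pages(0, 1, pages, NULL, status, 0) == 0)
		printf("page currently on node %d\n", status[0]);

	/* nodes != NULL: migrate (do_pages_move() in the kernel) */
	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == 0)
		printf("after move: status %d\n", status[0]);

	free(buf);
	return 0;
}

Moving another process's pages still goes through the uid/CAP_SYS_NICE and security_task_movememory() checks retained in sys_move_pages() above.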
diff --git a/mm/mlock.c b/mm/mlock.c
index 7b2656055d6a..008ea70b7afa 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,10 +8,18 @@ | |||
8 | #include <linux/capability.h> | 8 | #include <linux/capability.h> |
9 | #include <linux/mman.h> | 9 | #include <linux/mman.h> |
10 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
11 | #include <linux/swap.h> | ||
12 | #include <linux/swapops.h> | ||
13 | #include <linux/pagemap.h> | ||
11 | #include <linux/mempolicy.h> | 14 | #include <linux/mempolicy.h> |
12 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
13 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
14 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/rmap.h> | ||
19 | #include <linux/mmzone.h> | ||
20 | #include <linux/hugetlb.h> | ||
21 | |||
22 | #include "internal.h" | ||
15 | 23 | ||
16 | int can_do_mlock(void) | 24 | int can_do_mlock(void) |
17 | { | 25 | { |
@@ -23,17 +31,381 @@ int can_do_mlock(void) | |||
23 | } | 31 | } |
24 | EXPORT_SYMBOL(can_do_mlock); | 32 | EXPORT_SYMBOL(can_do_mlock); |
25 | 33 | ||
34 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
35 | /* | ||
36 | * Mlocked pages are marked with PageMlocked() flag for efficient testing | ||
37 | * in vmscan and, possibly, the fault path; and to support semi-accurate | ||
38 | * statistics. | ||
39 | * | ||
40 | * An mlocked page [PageMlocked(page)] is unevictable. As such, it will | ||
41 | * be placed on the LRU "unevictable" list, rather than the [in]active lists. | ||
42 | * The unevictable list is an LRU sibling list to the [in]active lists. | ||
43 | * PageUnevictable is set to indicate the unevictable state. | ||
44 | * | ||
45 | * When lazy mlocking via vmscan, it is important to ensure that the | ||
46 | * vma's VM_LOCKED status is not concurrently being modified, otherwise we | ||
47 | * may have mlocked a page that is being munlocked. So lazy mlock must take | ||
48 | * the mmap_sem for read, and verify that the vma really is locked | ||
49 | * (see mm/rmap.c). | ||
50 | */ | ||
51 | |||
52 | /* | ||
53 | * LRU accounting for clear_page_mlock() | ||
54 | */ | ||
55 | void __clear_page_mlock(struct page *page) | ||
56 | { | ||
57 | VM_BUG_ON(!PageLocked(page)); | ||
58 | |||
59 | if (!page->mapping) { /* truncated ? */ | ||
60 | return; | ||
61 | } | ||
62 | |||
63 | dec_zone_page_state(page, NR_MLOCK); | ||
64 | count_vm_event(UNEVICTABLE_PGCLEARED); | ||
65 | if (!isolate_lru_page(page)) { | ||
66 | putback_lru_page(page); | ||
67 | } else { | ||
68 | /* | ||
69 | * Page not on the LRU yet. Flush all pagevecs and retry. | ||
70 | */ | ||
71 | lru_add_drain_all(); | ||
72 | if (!isolate_lru_page(page)) | ||
73 | putback_lru_page(page); | ||
74 | else if (PageUnevictable(page)) | ||
75 | count_vm_event(UNEVICTABLE_PGSTRANDED); | ||
76 | |||
77 | } | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Mark page as mlocked if not already. | ||
82 | * If page on LRU, isolate and putback to move to unevictable list. | ||
83 | */ | ||
84 | void mlock_vma_page(struct page *page) | ||
85 | { | ||
86 | BUG_ON(!PageLocked(page)); | ||
87 | |||
88 | if (!TestSetPageMlocked(page)) { | ||
89 | inc_zone_page_state(page, NR_MLOCK); | ||
90 | count_vm_event(UNEVICTABLE_PGMLOCKED); | ||
91 | if (!isolate_lru_page(page)) | ||
92 | putback_lru_page(page); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | /* | ||
97 | * called from munlock()/munmap() path with page supposedly on the LRU. | ||
98 | * | ||
99 | * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked | ||
100 | * [in try_to_munlock()] and then attempt to isolate the page. We must | ||
101 | * isolate the page to keep others from messing with its unevictable | ||
102 | * and mlocked state while trying to munlock. However, we pre-clear the | ||
103 | * mlocked state anyway as we might lose the isolation race and we might | ||
104 | * not get another chance to clear PageMlocked. If we successfully | ||
105 | * isolate the page and try_to_munlock() detects other VM_LOCKED vmas | ||
106 | * mapping the page, it will restore the PageMlocked state, unless the page | ||
107 | * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), | ||
108 | * perhaps redundantly. | ||
109 | * If we lose the isolation race, and the page is mapped by other VM_LOCKED | ||
110 | * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() | ||
111 | * either of which will restore the PageMlocked state by calling | ||
112 | * mlock_vma_page() above, if it can grab the vma's mmap sem. | ||
113 | */ | ||
114 | static void munlock_vma_page(struct page *page) | ||
115 | { | ||
116 | BUG_ON(!PageLocked(page)); | ||
117 | |||
118 | if (TestClearPageMlocked(page)) { | ||
119 | dec_zone_page_state(page, NR_MLOCK); | ||
120 | if (!isolate_lru_page(page)) { | ||
121 | int ret = try_to_munlock(page); | ||
122 | /* | ||
123 | * did try_to_munlock() succeed or punt? | ||
124 | */ | ||
125 | if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) | ||
126 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | ||
127 | |||
128 | putback_lru_page(page); | ||
129 | } else { | ||
130 | /* | ||
131 | * We lost the race. Let try_to_unmap() deal | ||
132 | * with it. At least we get the page state and | ||
133 | * mlock stats right. However, page is still on | ||
134 | * the unevictable list. We'll fix that up when | ||
135 | * the page is eventually freed or we scan the | ||
136 | * unevictable list. | ||
137 | */ | ||
138 | if (PageUnevictable(page)) | ||
139 | count_vm_event(UNEVICTABLE_PGSTRANDED); | ||
140 | else | ||
141 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | ||
142 | } | ||
143 | } | ||
144 | } | ||
145 | |||
146 | /** | ||
147 | * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. | ||
148 | * @vma: target vma | ||
149 | * @start: start address | ||
150 | * @end: end address | ||
151 | * @mlock: 0 indicates munlock, otherwise mlock. | ||
152 | * | ||
153 | * If @mlock == 0, unlock an mlocked range; | ||
154 | * else mlock the range of pages. This takes care of making the pages present, | ||
155 | * too. | ||
156 | * | ||
157 | * return 0 on success, negative error code on error. | ||
158 | * | ||
159 | * vma->vm_mm->mmap_sem must be held for at least read. | ||
160 | */ | ||
161 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | ||
162 | unsigned long start, unsigned long end, | ||
163 | int mlock) | ||
164 | { | ||
165 | struct mm_struct *mm = vma->vm_mm; | ||
166 | unsigned long addr = start; | ||
167 | struct page *pages[16]; /* 16 gives a reasonable batch */ | ||
168 | int nr_pages = (end - start) / PAGE_SIZE; | ||
169 | int ret; | ||
170 | int gup_flags = 0; | ||
171 | |||
172 | VM_BUG_ON(start & ~PAGE_MASK); | ||
173 | VM_BUG_ON(end & ~PAGE_MASK); | ||
174 | VM_BUG_ON(start < vma->vm_start); | ||
175 | VM_BUG_ON(end > vma->vm_end); | ||
176 | VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && | ||
177 | (atomic_read(&mm->mm_users) != 0)); | ||
178 | |||
179 | /* | ||
180 | * mlock: don't populate pages that have PROT_NONE permission. | ||
181 | * munlock: always munlock the pages, even if they | ||
182 | * have PROT_NONE permission. | ||
183 | */ | ||
184 | if (!mlock) | ||
185 | gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; | ||
186 | |||
187 | if (vma->vm_flags & VM_WRITE) | ||
188 | gup_flags |= GUP_FLAGS_WRITE; | ||
189 | |||
190 | lru_add_drain_all(); /* push cached pages to LRU */ | ||
191 | |||
192 | while (nr_pages > 0) { | ||
193 | int i; | ||
194 | |||
195 | cond_resched(); | ||
196 | |||
197 | /* | ||
198 | * get_user_pages makes pages present if we are | ||
199 | * setting mlock. And this extra reference count will | ||
200 | * disable migration of this page. However, page may | ||
201 | * still be truncated out from under us. | ||
202 | */ | ||
203 | ret = __get_user_pages(current, mm, addr, | ||
204 | min_t(int, nr_pages, ARRAY_SIZE(pages)), | ||
205 | gup_flags, pages, NULL); | ||
206 | /* | ||
207 | * This can happen for, e.g., VM_NONLINEAR regions before | ||
208 | * a page has been allocated and mapped at a given offset, | ||
209 | * or for addresses that map beyond the end of a file. | ||
210 | * We'll mlock the pages if/when they get faulted in. | ||
211 | */ | ||
212 | if (ret < 0) | ||
213 | break; | ||
214 | if (ret == 0) { | ||
215 | /* | ||
216 | * We know the vma is there, so the only time | ||
217 | * we cannot get a single page should be an | ||
218 | * error (ret < 0) case. | ||
219 | */ | ||
220 | WARN_ON(1); | ||
221 | break; | ||
222 | } | ||
223 | |||
224 | lru_add_drain(); /* push cached pages to LRU */ | ||
225 | |||
226 | for (i = 0; i < ret; i++) { | ||
227 | struct page *page = pages[i]; | ||
228 | |||
229 | lock_page(page); | ||
230 | /* | ||
231 | * Because we lock page here and migration is blocked | ||
232 | * by the elevated reference, we need only check for | ||
233 | * page truncation (file-cache only). | ||
234 | */ | ||
235 | if (page->mapping) { | ||
236 | if (mlock) | ||
237 | mlock_vma_page(page); | ||
238 | else | ||
239 | munlock_vma_page(page); | ||
240 | } | ||
241 | unlock_page(page); | ||
242 | put_page(page); /* ref from get_user_pages() */ | ||
243 | |||
244 | /* | ||
245 | * here we assume that get_user_pages() has given us | ||
246 | * a list of virtually contiguous pages. | ||
247 | */ | ||
248 | addr += PAGE_SIZE; /* for next get_user_pages() */ | ||
249 | nr_pages--; | ||
250 | } | ||
251 | ret = 0; | ||
252 | } | ||
253 | |||
254 | lru_add_drain_all(); /* to update stats */ | ||
255 | |||
256 | return ret; /* count entire vma as locked_vm */ | ||
257 | } | ||
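__mlock_vma_pages_range() relies on get_user_pages() to fault the range in before marking each page mlocked. A hedged userspace way to watch that "make present" effect is to compare mincore() residency before and after mlock(); the sizes below are illustrative and the mlock() call needs CAP_IPC_LOCK or a large enough RLIMIT_MEMLOCK.

    /* Sketch: observe residency before/after mlock() with mincore(). */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static size_t resident_pages(void *addr, size_t len, long pagesz)
    {
        size_t npages = len / pagesz, i, n = 0;
        unsigned char *vec = malloc(npages);

        if (!vec)
            return 0;
        if (mincore(addr, len, vec) == 0)
            for (i = 0; i < npages; i++)
                n += vec[i] & 1;        /* bit 0 == page resident */
        free(vec);
        return n;
    }

    int main(void)
    {
        long pagesz = sysconf(_SC_PAGESIZE);
        size_t len = 64 * pagesz;       /* illustrative size */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
            return 1;
        printf("resident before mlock: %zu\n", resident_pages(p, len, pagesz));
        if (mlock(p, len))              /* kernel faults the whole range in */
            perror("mlock");
        printf("resident after  mlock: %zu\n", resident_pages(p, len, pagesz));
        munlock(p, len);
        munmap(p, len);
        return 0;
    }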
258 | |||
259 | /* | ||
260 | * convert get_user_pages() return value to posix mlock() error | ||
261 | */ | ||
262 | static int __mlock_posix_error_return(long retval) | ||
263 | { | ||
264 | if (retval == -EFAULT) | ||
265 | retval = -ENOMEM; | ||
266 | else if (retval == -ENOMEM) | ||
267 | retval = -EAGAIN; | ||
268 | return retval; | ||
269 | } | ||
270 | |||
271 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
272 | |||
273 | /* | ||
274 | * Just make pages present if VM_LOCKED. No-op if unlocking. | ||
275 | */ | ||
276 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | ||
277 | unsigned long start, unsigned long end, | ||
278 | int mlock) | ||
279 | { | ||
280 | if (mlock && (vma->vm_flags & VM_LOCKED)) | ||
281 | return make_pages_present(start, end); | ||
282 | return 0; | ||
283 | } | ||
284 | |||
285 | static inline int __mlock_posix_error_return(long retval) | ||
286 | { | ||
287 | return 0; | ||
288 | } | ||
289 | |||
290 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
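__mlock_posix_error_return() maps get_user_pages() failures onto the errno values POSIX allows for mlock(). A rough userspace check of that mapping, run as an unprivileged user, is sketched below; the 64 KiB limit and 1 MiB request are made-up numbers and the exact errno can differ between kernels and privilege levels.

    /* Sketch: drive mlock() into its error path by shrinking RLIMIT_MEMLOCK. */
    #define _GNU_SOURCE
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/resource.h>

    int main(void)
    {
        struct rlimit rl = { 64 * 1024, 64 * 1024 };    /* illustrative limit */
        size_t len = 1024 * 1024;                       /* illustrative request */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
            return 1;
        if (setrlimit(RLIMIT_MEMLOCK, &rl))
            perror("setrlimit");
        if (mlock(p, len))
            /* typically ENOMEM (over the limit) or EAGAIN (could not lock) */
            printf("mlock failed: %s\n", strerror(errno));
        munmap(p, len);
        return 0;
    }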
291 | |||
292 | /** | ||
293 | * mlock_vma_pages_range() - mlock pages in specified vma range. | ||
294 | * @vma - the vma containing the specified address range | ||
295 | * @start - starting address in @vma to mlock | ||
296 | * @end - end address [+1] in @vma to mlock | ||
297 | * | ||
298 | * For mmap()/mremap()/expansion of mlocked vma. | ||
299 | * | ||
300 | * return 0 on success for "normal" vmas. | ||
301 | * | ||
302 | * return number of pages [> 0] to be removed from locked_vm on success | ||
303 | * of "special" vmas. | ||
304 | * | ||
305 | * return negative error if vma spanning @start-@end disappears while | ||
306 | * mmap semaphore is dropped. Unlikely? | ||
307 | */ | ||
308 | long mlock_vma_pages_range(struct vm_area_struct *vma, | ||
309 | unsigned long start, unsigned long end) | ||
310 | { | ||
311 | struct mm_struct *mm = vma->vm_mm; | ||
312 | int nr_pages = (end - start) / PAGE_SIZE; | ||
313 | BUG_ON(!(vma->vm_flags & VM_LOCKED)); | ||
314 | |||
315 | /* | ||
316 | * filter unlockable vmas | ||
317 | */ | ||
318 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | ||
319 | goto no_mlock; | ||
320 | |||
321 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | ||
322 | is_vm_hugetlb_page(vma) || | ||
323 | vma == get_gate_vma(current))) { | ||
324 | long error; | ||
325 | downgrade_write(&mm->mmap_sem); | ||
326 | |||
327 | error = __mlock_vma_pages_range(vma, start, end, 1); | ||
328 | |||
329 | up_read(&mm->mmap_sem); | ||
330 | /* vma can change or disappear */ | ||
331 | down_write(&mm->mmap_sem); | ||
332 | vma = find_vma(mm, start); | ||
333 | /* non-NULL vma must contain @start, but need to check @end */ | ||
334 | if (!vma || end > vma->vm_end) | ||
335 | return -ENOMEM; | ||
336 | |||
337 | return 0; /* hide other errors from mmap(), et al */ | ||
338 | } | ||
339 | |||
340 | /* | ||
341 | * User mapped kernel pages or huge pages: | ||
342 | * make these pages present to populate the ptes, but | ||
343 | * fall thru' to reset VM_LOCKED--no need to unlock, and | ||
344 | * return nr_pages so these don't get counted against task's | ||
345 | * locked limit. Huge pages are already counted against | ||
346 | * locked vm limit. | ||
347 | */ | ||
348 | make_pages_present(start, end); | ||
349 | |||
350 | no_mlock: | ||
351 | vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ | ||
352 | return nr_pages; /* error or pages NOT mlocked */ | ||
353 | } | ||
354 | |||
355 | |||
356 | /* | ||
357 | * munlock_vma_pages_range() - munlock all pages in the vma range. | ||
358 | * @vma - vma containing range to be munlock()ed. | ||
359 | * @start - start address in @vma of the range | ||
360 | * @end - end of range in @vma. | ||
361 | * | ||
362 | * For mremap(), munmap() and exit(). | ||
363 | * | ||
364 | * Called with @vma VM_LOCKED. | ||
365 | * | ||
366 | * Returns with VM_LOCKED cleared. Callers must be prepared to | ||
367 | * deal with this. | ||
368 | * | ||
369 | * We don't save and restore VM_LOCKED here because pages are | ||
370 | * still on lru. In unmap path, pages might be scanned by reclaim | ||
371 | * and re-mlocked by try_to_{munlock|unmap} before we unmap and | ||
372 | * free them. This will result in freeing mlocked pages. | ||
373 | */ | ||
374 | void munlock_vma_pages_range(struct vm_area_struct *vma, | ||
375 | unsigned long start, unsigned long end) | ||
376 | { | ||
377 | vma->vm_flags &= ~VM_LOCKED; | ||
378 | __mlock_vma_pages_range(vma, start, end, 0); | ||
379 | } | ||
380 | |||
381 | /* | ||
382 | * mlock_fixup - handle mlock[all]/munlock[all] requests. | ||
383 | * | ||
384 | * Filters out "special" vmas -- VM_LOCKED never gets set for these, and | ||
385 | * munlock is a no-op. However, for some special vmas, we go ahead and | ||
386 | * populate the ptes via make_pages_present(). | ||
387 | * | ||
388 | * For vmas that pass the filters, merge/split as appropriate. | ||
389 | */ | ||
26 | static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | 390 | static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, |
27 | unsigned long start, unsigned long end, unsigned int newflags) | 391 | unsigned long start, unsigned long end, unsigned int newflags) |
28 | { | 392 | { |
29 | struct mm_struct * mm = vma->vm_mm; | 393 | struct mm_struct *mm = vma->vm_mm; |
30 | pgoff_t pgoff; | 394 | pgoff_t pgoff; |
31 | int pages; | 395 | int nr_pages; |
32 | int ret = 0; | 396 | int ret = 0; |
33 | 397 | int lock = newflags & VM_LOCKED; | |
34 | if (newflags == vma->vm_flags) { | 398 | |
35 | *prev = vma; | 399 | if (newflags == vma->vm_flags || |
36 | goto out; | 400 | (vma->vm_flags & (VM_IO | VM_PFNMAP))) |
401 | goto out; /* don't set VM_LOCKED, don't count */ | ||
402 | |||
403 | if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | ||
404 | is_vm_hugetlb_page(vma) || | ||
405 | vma == get_gate_vma(current)) { | ||
406 | if (lock) | ||
407 | make_pages_present(start, end); | ||
408 | goto out; /* don't set VM_LOCKED, don't count */ | ||
37 | } | 409 | } |
38 | 410 | ||
39 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | 411 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
@@ -44,8 +416,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
44 | goto success; | 416 | goto success; |
45 | } | 417 | } |
46 | 418 | ||
47 | *prev = vma; | ||
48 | |||
49 | if (start != vma->vm_start) { | 419 | if (start != vma->vm_start) { |
50 | ret = split_vma(mm, vma, start, 1); | 420 | ret = split_vma(mm, vma, start, 1); |
51 | if (ret) | 421 | if (ret) |
@@ -60,26 +430,61 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
60 | 430 | ||
61 | success: | 431 | success: |
62 | /* | 432 | /* |
433 | * Keep track of amount of locked VM. | ||
434 | */ | ||
435 | nr_pages = (end - start) >> PAGE_SHIFT; | ||
436 | if (!lock) | ||
437 | nr_pages = -nr_pages; | ||
438 | mm->locked_vm += nr_pages; | ||
439 | |||
440 | /* | ||
63 | * vm_flags is protected by the mmap_sem held in write mode. | 441 | * vm_flags is protected by the mmap_sem held in write mode. |
64 | * It's okay if try_to_unmap_one unmaps a page just after we | 442 | * It's okay if try_to_unmap_one unmaps a page just after we |
65 | * set VM_LOCKED, make_pages_present below will bring it back. | 443 | * set VM_LOCKED, __mlock_vma_pages_range will bring it back. |
66 | */ | 444 | */ |
67 | vma->vm_flags = newflags; | 445 | vma->vm_flags = newflags; |
68 | 446 | ||
69 | /* | 447 | if (lock) { |
70 | * Keep track of amount of locked VM. | 448 | /* |
71 | */ | 449 | * mmap_sem is currently held for write. Downgrade the write |
72 | pages = (end - start) >> PAGE_SHIFT; | 450 | * lock to a read lock so that other faults, mmap scans, etc., | |
73 | if (newflags & VM_LOCKED) { | 451 | * can proceed while we fault in all pages. | |
74 | pages = -pages; | 452 | */ |
75 | if (!(newflags & VM_IO)) | 453 | downgrade_write(&mm->mmap_sem); |
76 | ret = make_pages_present(start, end); | 454 | |
455 | ret = __mlock_vma_pages_range(vma, start, end, 1); | ||
456 | |||
457 | /* | ||
458 | * Need to reacquire mmap sem in write mode, as our callers | ||
459 | * expect this. We have no support for atomically upgrading | ||
460 | * a sem to write, so we need to check for ranges while sem | ||
461 | * is unlocked. | ||
462 | */ | ||
463 | up_read(&mm->mmap_sem); | ||
464 | /* vma can change or disappear */ | ||
465 | down_write(&mm->mmap_sem); | ||
466 | *prev = find_vma(mm, start); | ||
467 | /* non-NULL *prev must contain @start, but need to check @end */ | ||
468 | if (!(*prev) || end > (*prev)->vm_end) | ||
469 | ret = -ENOMEM; | ||
470 | else if (ret > 0) { | ||
471 | mm->locked_vm -= ret; | ||
472 | ret = 0; | ||
473 | } else | ||
474 | ret = __mlock_posix_error_return(ret); /* translate if needed */ | ||
475 | } else { | ||
476 | /* | ||
477 | * TODO: for unlocking, pages will already be resident, so | ||
478 | * we don't need to wait for allocations/reclaim/pagein, ... | ||
479 | * However, unlocking a very large region can still take a | ||
480 | * while. Should we downgrade the semaphore for both lock | ||
481 | * AND unlock ? | ||
482 | */ | ||
483 | __mlock_vma_pages_range(vma, start, end, 0); | ||
77 | } | 484 | } |
78 | 485 | ||
79 | mm->locked_vm -= pages; | ||
80 | out: | 486 | out: |
81 | if (ret == -ENOMEM) | 487 | *prev = vma; |
82 | ret = -EAGAIN; | ||
83 | return ret; | 488 | return ret; |
84 | } | 489 | } |
85 | 490 | ||
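mlock(2) and mlockall(2) requests are eventually applied vma by vma through mlock_fixup(), which now both populates the range and adjusts mm->locked_vm. Below is a hedged userspace sketch of the visible effect; whether it succeeds depends on RLIMIT_MEMLOCK and privileges, and the /proc check is only illustrative.

    /* Sketch: lock current and future mappings; VmLck in /proc/self/status
     * should reflect the mm->locked_vm accounting done in mlock_fixup(). */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
            perror("mlockall");
            return 1;
        }
        /* A new allocation is locked as it is faulted in (MCL_FUTURE). */
        char *p = malloc(1 << 20);
        if (p)
            memset(p, 0, 1 << 20);
        system("grep VmLck /proc/self/status");   /* illustrative check */
        free(p);
        munlockall();
        return 0;
    }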
diff --git a/mm/mm_init.c b/mm/mm_init.c new file mode 100644 index 000000000000..4e0e26591dfa --- /dev/null +++ b/mm/mm_init.c | |||
@@ -0,0 +1,152 @@ | |||
1 | /* | ||
2 | * mm_init.c - Memory initialisation verification and debugging | ||
3 | * | ||
4 | * Copyright 2008 IBM Corporation, 2008 | ||
5 | * Author Mel Gorman <mel@csn.ul.ie> | ||
6 | * | ||
7 | */ | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/init.h> | ||
10 | #include <linux/kobject.h> | ||
11 | #include <linux/module.h> | ||
12 | #include "internal.h" | ||
13 | |||
14 | #ifdef CONFIG_DEBUG_MEMORY_INIT | ||
15 | int mminit_loglevel; | ||
16 | |||
17 | #ifndef SECTIONS_SHIFT | ||
18 | #define SECTIONS_SHIFT 0 | ||
19 | #endif | ||
20 | |||
21 | /* The zonelists are simply reported, validation is manual. */ | ||
22 | void mminit_verify_zonelist(void) | ||
23 | { | ||
24 | int nid; | ||
25 | |||
26 | if (mminit_loglevel < MMINIT_VERIFY) | ||
27 | return; | ||
28 | |||
29 | for_each_online_node(nid) { | ||
30 | pg_data_t *pgdat = NODE_DATA(nid); | ||
31 | struct zone *zone; | ||
32 | struct zoneref *z; | ||
33 | struct zonelist *zonelist; | ||
34 | int i, listid, zoneid; | ||
35 | |||
36 | BUG_ON(MAX_ZONELISTS > 2); | ||
37 | for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) { | ||
38 | |||
39 | /* Identify the zone and nodelist */ | ||
40 | zoneid = i % MAX_NR_ZONES; | ||
41 | listid = i / MAX_NR_ZONES; | ||
42 | zonelist = &pgdat->node_zonelists[listid]; | ||
43 | zone = &pgdat->node_zones[zoneid]; | ||
44 | if (!populated_zone(zone)) | ||
45 | continue; | ||
46 | |||
47 | /* Print information about the zonelist */ | ||
48 | printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ", | ||
49 | listid > 0 ? "thisnode" : "general", nid, | ||
50 | zone->name); | ||
51 | |||
52 | /* Iterate the zonelist */ | ||
53 | for_each_zone_zonelist(zone, z, zonelist, zoneid) { | ||
54 | #ifdef CONFIG_NUMA | ||
55 | printk(KERN_CONT "%d:%s ", | ||
56 | zone->node, zone->name); | ||
57 | #else | ||
58 | printk(KERN_CONT "0:%s ", zone->name); | ||
59 | #endif /* CONFIG_NUMA */ | ||
60 | } | ||
61 | printk(KERN_CONT "\n"); | ||
62 | } | ||
63 | } | ||
64 | } | ||
65 | |||
66 | void __init mminit_verify_pageflags_layout(void) | ||
67 | { | ||
68 | int shift, width; | ||
69 | unsigned long or_mask, add_mask; | ||
70 | |||
71 | shift = 8 * sizeof(unsigned long); | ||
72 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; | ||
73 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", | ||
74 | "Section %d Node %d Zone %d Flags %d\n", | ||
75 | SECTIONS_WIDTH, | ||
76 | NODES_WIDTH, | ||
77 | ZONES_WIDTH, | ||
78 | NR_PAGEFLAGS); | ||
79 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", | ||
80 | "Section %d Node %d Zone %d\n", | ||
81 | SECTIONS_SHIFT, | ||
82 | NODES_SHIFT, | ||
83 | ZONES_SHIFT); | ||
84 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", | ||
85 | "Section %lu Node %lu Zone %lu\n", | ||
86 | (unsigned long)SECTIONS_PGSHIFT, | ||
87 | (unsigned long)NODES_PGSHIFT, | ||
88 | (unsigned long)ZONES_PGSHIFT); | ||
89 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", | ||
90 | "Zone ID: %lu -> %lu\n", | ||
91 | (unsigned long)ZONEID_PGOFF, | ||
92 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); | ||
93 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", | ||
94 | "location: %d -> %d unused %d -> %d flags %d -> %d\n", | ||
95 | shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); | ||
96 | #ifdef NODE_NOT_IN_PAGE_FLAGS | ||
97 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", | ||
98 | "Node not in page flags"); | ||
99 | #endif | ||
100 | |||
101 | if (SECTIONS_WIDTH) { | ||
102 | shift -= SECTIONS_WIDTH; | ||
103 | BUG_ON(shift != SECTIONS_PGSHIFT); | ||
104 | } | ||
105 | if (NODES_WIDTH) { | ||
106 | shift -= NODES_WIDTH; | ||
107 | BUG_ON(shift != NODES_PGSHIFT); | ||
108 | } | ||
109 | if (ZONES_WIDTH) { | ||
110 | shift -= ZONES_WIDTH; | ||
111 | BUG_ON(shift != ZONES_PGSHIFT); | ||
112 | } | ||
113 | |||
114 | /* Check for bitmask overlaps */ | ||
115 | or_mask = (ZONES_MASK << ZONES_PGSHIFT) | | ||
116 | (NODES_MASK << NODES_PGSHIFT) | | ||
117 | (SECTIONS_MASK << SECTIONS_PGSHIFT); | ||
118 | add_mask = (ZONES_MASK << ZONES_PGSHIFT) + | ||
119 | (NODES_MASK << NODES_PGSHIFT) + | ||
120 | (SECTIONS_MASK << SECTIONS_PGSHIFT); | ||
121 | BUG_ON(or_mask != add_mask); | ||
122 | } | ||
123 | |||
124 | void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, | ||
125 | unsigned long nid, unsigned long pfn) | ||
126 | { | ||
127 | BUG_ON(page_to_nid(page) != nid); | ||
128 | BUG_ON(page_zonenum(page) != zone); | ||
129 | BUG_ON(page_to_pfn(page) != pfn); | ||
130 | } | ||
131 | |||
132 | static __init int set_mminit_loglevel(char *str) | ||
133 | { | ||
134 | get_option(&str, &mminit_loglevel); | ||
135 | return 0; | ||
136 | } | ||
137 | early_param("mminit_loglevel", set_mminit_loglevel); | ||
138 | #endif /* CONFIG_DEBUG_MEMORY_INIT */ | ||
139 | |||
140 | struct kobject *mm_kobj; | ||
141 | EXPORT_SYMBOL_GPL(mm_kobj); | ||
142 | |||
143 | static int __init mm_sysfs_init(void) | ||
144 | { | ||
145 | mm_kobj = kobject_create_and_add("mm", kernel_kobj); | ||
146 | if (!mm_kobj) | ||
147 | return -ENOMEM; | ||
148 | |||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | __initcall(mm_sysfs_init); | ||
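The verification routines above only print when mminit_loglevel (settable with the mminit_loglevel= early parameter) is high enough. The mminit_dprintk() helper they call is declared in mm/internal.h rather than in this hunk, so the following is only a hedged guess at its shape, not the real definition.

    /* Hypothetical sketch of a loglevel-gated debug printk; the real
     * mminit_dprintk() lives in mm/internal.h and may differ. */
    #define sketch_mminit_dprintk(level, prefix, fmt, arg...)            \
    do {                                                                 \
        if (mminit_loglevel >= (level))                                  \
            printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg);         \
    } while (0)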
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -26,12 +26,15 @@ | |||
26 | #include <linux/mount.h> | 26 | #include <linux/mount.h> |
27 | #include <linux/mempolicy.h> | 27 | #include <linux/mempolicy.h> |
28 | #include <linux/rmap.h> | 28 | #include <linux/rmap.h> |
29 | #include <linux/mmu_notifier.h> | ||
29 | 30 | ||
30 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
31 | #include <asm/cacheflush.h> | 32 | #include <asm/cacheflush.h> |
32 | #include <asm/tlb.h> | 33 | #include <asm/tlb.h> |
33 | #include <asm/mmu_context.h> | 34 | #include <asm/mmu_context.h> |
34 | 35 | ||
36 | #include "internal.h" | ||
37 | |||
35 | #ifndef arch_mmap_check | 38 | #ifndef arch_mmap_check |
36 | #define arch_mmap_check(addr, len, flags) (0) | 39 | #define arch_mmap_check(addr, len, flags) (0) |
37 | #endif | 40 | #endif |
@@ -367,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, | |||
367 | if (vma_tmp->vm_end > addr) { | 370 | if (vma_tmp->vm_end > addr) { |
368 | vma = vma_tmp; | 371 | vma = vma_tmp; |
369 | if (vma_tmp->vm_start <= addr) | 372 | if (vma_tmp->vm_start <= addr) |
370 | return vma; | 373 | break; |
371 | __rb_link = &__rb_parent->rb_left; | 374 | __rb_link = &__rb_parent->rb_left; |
372 | } else { | 375 | } else { |
373 | rb_prev = __rb_parent; | 376 | rb_prev = __rb_parent; |
@@ -407,7 +410,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | |||
407 | rb_insert_color(&vma->vm_rb, &mm->mm_rb); | 410 | rb_insert_color(&vma->vm_rb, &mm->mm_rb); |
408 | } | 411 | } |
409 | 412 | ||
410 | static inline void __vma_link_file(struct vm_area_struct *vma) | 413 | static void __vma_link_file(struct vm_area_struct *vma) |
411 | { | 414 | { |
412 | struct file * file; | 415 | struct file * file; |
413 | 416 | ||
@@ -659,8 +662,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
659 | * If the vma has a ->close operation then the driver probably needs to release | 662 | * If the vma has a ->close operation then the driver probably needs to release |
660 | * per-vma resources, so we don't attempt to merge those. | 663 | * per-vma resources, so we don't attempt to merge those. |
661 | */ | 664 | */ |
662 | #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) | ||
663 | |||
664 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 665 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
665 | struct file *file, unsigned long vm_flags) | 666 | struct file *file, unsigned long vm_flags) |
666 | { | 667 | { |
@@ -969,6 +970,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
969 | return -EPERM; | 970 | return -EPERM; |
970 | vm_flags |= VM_LOCKED; | 971 | vm_flags |= VM_LOCKED; |
971 | } | 972 | } |
973 | |||
972 | /* mlock MCL_FUTURE? */ | 974 | /* mlock MCL_FUTURE? */ |
973 | if (vm_flags & VM_LOCKED) { | 975 | if (vm_flags & VM_LOCKED) { |
974 | unsigned long locked, lock_limit; | 976 | unsigned long locked, lock_limit; |
@@ -1027,6 +1029,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
1027 | } else { | 1029 | } else { |
1028 | switch (flags & MAP_TYPE) { | 1030 | switch (flags & MAP_TYPE) { |
1029 | case MAP_SHARED: | 1031 | case MAP_SHARED: |
1032 | /* | ||
1033 | * Ignore pgoff. | ||
1034 | */ | ||
1035 | pgoff = 0; | ||
1030 | vm_flags |= VM_SHARED | VM_MAYSHARE; | 1036 | vm_flags |= VM_SHARED | VM_MAYSHARE; |
1031 | break; | 1037 | break; |
1032 | case MAP_PRIVATE: | 1038 | case MAP_PRIVATE: |
@@ -1108,6 +1114,9 @@ munmap_back: | |||
1108 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) | 1114 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) |
1109 | return -ENOMEM; | 1115 | return -ENOMEM; |
1110 | 1116 | ||
1117 | if (flags & MAP_NORESERVE) | ||
1118 | vm_flags |= VM_NORESERVE; | ||
1119 | |||
1111 | if (accountable && (!(flags & MAP_NORESERVE) || | 1120 | if (accountable && (!(flags & MAP_NORESERVE) || |
1112 | sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { | 1121 | sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { |
1113 | if (vm_flags & VM_SHARED) { | 1122 | if (vm_flags & VM_SHARED) { |
@@ -1129,10 +1138,12 @@ munmap_back: | |||
1129 | * The VM_SHARED test is necessary because shmem_zero_setup | 1138 | * The VM_SHARED test is necessary because shmem_zero_setup |
1130 | * will create the file object for a shared anonymous map below. | 1139 | * will create the file object for a shared anonymous map below. |
1131 | */ | 1140 | */ |
1132 | if (!file && !(vm_flags & VM_SHARED) && | 1141 | if (!file && !(vm_flags & VM_SHARED)) { |
1133 | vma_merge(mm, prev, addr, addr + len, vm_flags, | 1142 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, |
1134 | NULL, NULL, pgoff, NULL)) | 1143 | NULL, NULL, pgoff, NULL); |
1135 | goto out; | 1144 | if (vma) |
1145 | goto out; | ||
1146 | } | ||
1136 | 1147 | ||
1137 | /* | 1148 | /* |
1138 | * Determine the object being mapped and call the appropriate | 1149 | * Determine the object being mapped and call the appropriate |
@@ -1214,10 +1225,14 @@ out: | |||
1214 | mm->total_vm += len >> PAGE_SHIFT; | 1225 | mm->total_vm += len >> PAGE_SHIFT; |
1215 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1226 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1216 | if (vm_flags & VM_LOCKED) { | 1227 | if (vm_flags & VM_LOCKED) { |
1217 | mm->locked_vm += len >> PAGE_SHIFT; | 1228 | /* |
1218 | make_pages_present(addr, addr + len); | 1229 | * makes pages present; downgrades, drops, reacquires mmap_sem |
1219 | } | 1230 | */ |
1220 | if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1231 | long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); |
1232 | if (nr_pages < 0) | ||
1233 | return nr_pages; /* vma gone! */ | ||
1234 | mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; | ||
1235 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | ||
1221 | make_pages_present(addr, addr + len); | 1236 | make_pages_present(addr, addr + len); |
1222 | return addr; | 1237 | return addr; |
1223 | 1238 | ||
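With the hunk above, a VM_LOCKED mapping is populated via mlock_vma_pages_range() at mmap() time instead of a bare make_pages_present(). A hedged userspace view of the same path uses MAP_LOCKED; the size is illustrative and the call is still subject to RLIMIT_MEMLOCK.

    /* Sketch: MAP_LOCKED asks the kernel to mlock and populate the mapping
     * during mmap(), much like the VM_LOCKED branch above. */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 16 * 4096;   /* illustrative size, assumes 4 KiB pages */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);

        if (p == MAP_FAILED) {
            perror("mmap(MAP_LOCKED)");
            return 1;
        }
        /* Pages should already be resident and counted in locked_vm here. */
        munmap(p, len);
        return 0;
    }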
@@ -1576,7 +1591,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un | |||
1576 | * vma is the last one with address > vma->vm_end. Have to extend vma. | 1591 | * vma is the last one with address > vma->vm_end. Have to extend vma. |
1577 | */ | 1592 | */ |
1578 | #ifndef CONFIG_IA64 | 1593 | #ifndef CONFIG_IA64 |
1579 | static inline | 1594 | static |
1580 | #endif | 1595 | #endif |
1581 | int expand_upwards(struct vm_area_struct *vma, unsigned long address) | 1596 | int expand_upwards(struct vm_area_struct *vma, unsigned long address) |
1582 | { | 1597 | { |
@@ -1626,7 +1641,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
1626 | /* | 1641 | /* |
1627 | * vma is the first one with address < vma->vm_start. Have to extend vma. | 1642 | * vma is the first one with address < vma->vm_start. Have to extend vma. |
1628 | */ | 1643 | */ |
1629 | static inline int expand_downwards(struct vm_area_struct *vma, | 1644 | static int expand_downwards(struct vm_area_struct *vma, |
1630 | unsigned long address) | 1645 | unsigned long address) |
1631 | { | 1646 | { |
1632 | int error; | 1647 | int error; |
@@ -1688,10 +1703,12 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
1688 | vma = find_vma_prev(mm, addr, &prev); | 1703 | vma = find_vma_prev(mm, addr, &prev); |
1689 | if (vma && (vma->vm_start <= addr)) | 1704 | if (vma && (vma->vm_start <= addr)) |
1690 | return vma; | 1705 | return vma; |
1691 | if (!prev || expand_stack(prev, addr)) | 1706 | if (expand_stack(prev, addr)) |
1692 | return NULL; | 1707 | return NULL; |
1693 | if (prev->vm_flags & VM_LOCKED) | 1708 | if (prev->vm_flags & VM_LOCKED) { |
1694 | make_pages_present(addr, prev->vm_end); | 1709 | if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) |
1710 | return NULL; /* vma gone! */ | ||
1711 | } | ||
1695 | return prev; | 1712 | return prev; |
1696 | } | 1713 | } |
1697 | #else | 1714 | #else |
@@ -1717,8 +1734,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
1717 | start = vma->vm_start; | 1734 | start = vma->vm_start; |
1718 | if (expand_stack(vma, addr)) | 1735 | if (expand_stack(vma, addr)) |
1719 | return NULL; | 1736 | return NULL; |
1720 | if (vma->vm_flags & VM_LOCKED) | 1737 | if (vma->vm_flags & VM_LOCKED) { |
1721 | make_pages_present(addr, start); | 1738 | if (mlock_vma_pages_range(vma, addr, start) < 0) |
1739 | return NULL; /* vma gone! */ | ||
1740 | } | ||
1722 | return vma; | 1741 | return vma; |
1723 | } | 1742 | } |
1724 | #endif | 1743 | #endif |
@@ -1737,8 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | |||
1737 | long nrpages = vma_pages(vma); | 1756 | long nrpages = vma_pages(vma); |
1738 | 1757 | ||
1739 | mm->total_vm -= nrpages; | 1758 | mm->total_vm -= nrpages; |
1740 | if (vma->vm_flags & VM_LOCKED) | ||
1741 | mm->locked_vm -= nrpages; | ||
1742 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | 1759 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); |
1743 | vma = remove_vma(vma); | 1760 | vma = remove_vma(vma); |
1744 | } while (vma); | 1761 | } while (vma); |
@@ -1763,7 +1780,7 @@ static void unmap_region(struct mm_struct *mm, | |||
1763 | update_hiwater_rss(mm); | 1780 | update_hiwater_rss(mm); |
1764 | unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); | 1781 | unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); |
1765 | vm_unacct_memory(nr_accounted); | 1782 | vm_unacct_memory(nr_accounted); |
1766 | free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, | 1783 | free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, |
1767 | next? next->vm_start: 0); | 1784 | next? next->vm_start: 0); |
1768 | tlb_finish_mmu(tlb, start, end); | 1785 | tlb_finish_mmu(tlb, start, end); |
1769 | } | 1786 | } |
@@ -1807,7 +1824,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1807 | struct mempolicy *pol; | 1824 | struct mempolicy *pol; |
1808 | struct vm_area_struct *new; | 1825 | struct vm_area_struct *new; |
1809 | 1826 | ||
1810 | if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) | 1827 | if (is_vm_hugetlb_page(vma) && (addr & |
1828 | ~(huge_page_mask(hstate_vma(vma))))) | ||
1811 | return -EINVAL; | 1829 | return -EINVAL; |
1812 | 1830 | ||
1813 | if (mm->map_count >= sysctl_max_map_count) | 1831 | if (mm->map_count >= sysctl_max_map_count) |
@@ -1903,6 +1921,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1903 | vma = prev? prev->vm_next: mm->mmap; | 1921 | vma = prev? prev->vm_next: mm->mmap; |
1904 | 1922 | ||
1905 | /* | 1923 | /* |
1924 | * unlock any mlock()ed ranges before detaching vmas | ||
1925 | */ | ||
1926 | if (mm->locked_vm) { | ||
1927 | struct vm_area_struct *tmp = vma; | ||
1928 | while (tmp && tmp->vm_start < end) { | ||
1929 | if (tmp->vm_flags & VM_LOCKED) { | ||
1930 | mm->locked_vm -= vma_pages(tmp); | ||
1931 | munlock_vma_pages_all(tmp); | ||
1932 | } | ||
1933 | tmp = tmp->vm_next; | ||
1934 | } | ||
1935 | } | ||
1936 | |||
1937 | /* | ||
1906 | * Remove the vma's, and unmap the actual pages | 1938 | * Remove the vma's, and unmap the actual pages |
1907 | */ | 1939 | */ |
1908 | detach_vmas_to_be_unmapped(mm, vma, prev, end); | 1940 | detach_vmas_to_be_unmapped(mm, vma, prev, end); |
@@ -2014,8 +2046,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2014 | return -ENOMEM; | 2046 | return -ENOMEM; |
2015 | 2047 | ||
2016 | /* Can we just expand an old private anonymous mapping? */ | 2048 | /* Can we just expand an old private anonymous mapping? */ |
2017 | if (vma_merge(mm, prev, addr, addr + len, flags, | 2049 | vma = vma_merge(mm, prev, addr, addr + len, flags, |
2018 | NULL, NULL, pgoff, NULL)) | 2050 | NULL, NULL, pgoff, NULL); |
2051 | if (vma) | ||
2019 | goto out; | 2052 | goto out; |
2020 | 2053 | ||
2021 | /* | 2054 | /* |
@@ -2037,8 +2070,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2037 | out: | 2070 | out: |
2038 | mm->total_vm += len >> PAGE_SHIFT; | 2071 | mm->total_vm += len >> PAGE_SHIFT; |
2039 | if (flags & VM_LOCKED) { | 2072 | if (flags & VM_LOCKED) { |
2040 | mm->locked_vm += len >> PAGE_SHIFT; | 2073 | if (!mlock_vma_pages_range(vma, addr, addr + len)) |
2041 | make_pages_present(addr, addr + len); | 2074 | mm->locked_vm += (len >> PAGE_SHIFT); |
2042 | } | 2075 | } |
2043 | return addr; | 2076 | return addr; |
2044 | } | 2077 | } |
@@ -2049,13 +2082,23 @@ EXPORT_SYMBOL(do_brk); | |||
2049 | void exit_mmap(struct mm_struct *mm) | 2082 | void exit_mmap(struct mm_struct *mm) |
2050 | { | 2083 | { |
2051 | struct mmu_gather *tlb; | 2084 | struct mmu_gather *tlb; |
2052 | struct vm_area_struct *vma = mm->mmap; | 2085 | struct vm_area_struct *vma; |
2053 | unsigned long nr_accounted = 0; | 2086 | unsigned long nr_accounted = 0; |
2054 | unsigned long end; | 2087 | unsigned long end; |
2055 | 2088 | ||
2056 | /* mm's last user has gone, and it's about to be pulled down */ | 2089 | /* mm's last user has gone, and it's about to be pulled down */ |
2057 | arch_exit_mmap(mm); | 2090 | arch_exit_mmap(mm); |
2058 | 2091 | mmu_notifier_release(mm); | |
2092 | |||
2093 | if (mm->locked_vm) { | ||
2094 | vma = mm->mmap; | ||
2095 | while (vma) { | ||
2096 | if (vma->vm_flags & VM_LOCKED) | ||
2097 | munlock_vma_pages_all(vma); | ||
2098 | vma = vma->vm_next; | ||
2099 | } | ||
2100 | } | ||
2101 | vma = mm->mmap; | ||
2059 | lru_add_drain(); | 2102 | lru_add_drain(); |
2060 | flush_cache_mm(mm); | 2103 | flush_cache_mm(mm); |
2061 | tlb = tlb_gather_mmu(mm, 1); | 2104 | tlb = tlb_gather_mmu(mm, 1); |
@@ -2063,7 +2106,7 @@ void exit_mmap(struct mm_struct *mm) | |||
2063 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 2106 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
2064 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); | 2107 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); |
2065 | vm_unacct_memory(nr_accounted); | 2108 | vm_unacct_memory(nr_accounted); |
2066 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); | 2109 | free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); |
2067 | tlb_finish_mmu(tlb, 0, end); | 2110 | tlb_finish_mmu(tlb, 0, end); |
2068 | 2111 | ||
2069 | /* | 2112 | /* |
@@ -2262,3 +2305,167 @@ int install_special_mapping(struct mm_struct *mm, | |||
2262 | 2305 | ||
2263 | return 0; | 2306 | return 0; |
2264 | } | 2307 | } |
2308 | |||
2309 | static DEFINE_MUTEX(mm_all_locks_mutex); | ||
2310 | |||
2311 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | ||
2312 | { | ||
2313 | if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { | ||
2314 | /* | ||
2315 | * The LSB of head.next can't change from under us | ||
2316 | * because we hold the mm_all_locks_mutex. | ||
2317 | */ | ||
2318 | spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); | ||
2319 | /* | ||
2320 | * We can safely modify head.next after taking the | ||
2321 | * anon_vma->lock. If some other vma in this mm shares | ||
2322 | * the same anon_vma we won't take it again. | ||
2323 | * | ||
2324 | * No need of atomic instructions here, head.next | ||
2325 | * can't change from under us thanks to the | ||
2326 | * anon_vma->lock. | ||
2327 | */ | ||
2328 | if (__test_and_set_bit(0, (unsigned long *) | ||
2329 | &anon_vma->head.next)) | ||
2330 | BUG(); | ||
2331 | } | ||
2332 | } | ||
2333 | |||
2334 | static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | ||
2335 | { | ||
2336 | if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { | ||
2337 | /* | ||
2338 | * AS_MM_ALL_LOCKS can't change from under us because | ||
2339 | * we hold the mm_all_locks_mutex. | ||
2340 | * | ||
2341 | * Operations on ->flags have to be atomic because | ||
2342 | * even if AS_MM_ALL_LOCKS is stable thanks to the | ||
2343 | * mm_all_locks_mutex, there may be other cpus | ||
2344 | * changing other bitflags in parallel to us. | ||
2345 | */ | ||
2346 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) | ||
2347 | BUG(); | ||
2348 | spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); | ||
2349 | } | ||
2350 | } | ||
2351 | |||
2352 | /* | ||
2353 | * This operation locks against the VM for all pte/vma/mm related | ||
2354 | * operations that could ever happen on a certain mm. This includes | ||
2355 | * vmtruncate, try_to_unmap, and all page faults. | ||
2356 | * | ||
2357 | * The caller must take the mmap_sem in write mode before calling | ||
2358 | * mm_take_all_locks(). The caller isn't allowed to release the | ||
2359 | * mmap_sem until mm_drop_all_locks() returns. | ||
2360 | * | ||
2361 | * mmap_sem in write mode is required in order to block all operations | ||
2362 | * that could modify pagetables and free pages without need of | ||
2363 | * altering the vma layout (for example populate_range() with | ||
2364 | * nonlinear vmas). It's also needed in write mode to prevent new | ||
2365 | * anon_vmas from being associated with existing vmas. | ||
2366 | * | ||
2367 | * A single task can't take more than one mm_take_all_locks() in a row | ||
2368 | * or it would deadlock. | ||
2369 | * | ||
2370 | * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in | ||
2371 | * mapping->flags avoid taking the same lock twice, if more than one | ||
2372 | * vma in this mm is backed by the same anon_vma or address_space. | ||
2373 | * | ||
2374 | * We can take all the locks in random order because the VM code | ||
2375 | * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never | ||
2376 | * takes more than one of them in a row. Secondly we're protected | ||
2377 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. | ||
2378 | * | ||
2379 | * mm_take_all_locks() and mm_drop_all_locks are expensive operations | ||
2380 | * that may have to take thousands of locks. | ||
2381 | * | ||
2382 | * mm_take_all_locks() can fail if it's interrupted by signals. | ||
2383 | */ | ||
2384 | int mm_take_all_locks(struct mm_struct *mm) | ||
2385 | { | ||
2386 | struct vm_area_struct *vma; | ||
2387 | int ret = -EINTR; | ||
2388 | |||
2389 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | ||
2390 | |||
2391 | mutex_lock(&mm_all_locks_mutex); | ||
2392 | |||
2393 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
2394 | if (signal_pending(current)) | ||
2395 | goto out_unlock; | ||
2396 | if (vma->vm_file && vma->vm_file->f_mapping) | ||
2397 | vm_lock_mapping(mm, vma->vm_file->f_mapping); | ||
2398 | } | ||
2399 | |||
2400 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
2401 | if (signal_pending(current)) | ||
2402 | goto out_unlock; | ||
2403 | if (vma->anon_vma) | ||
2404 | vm_lock_anon_vma(mm, vma->anon_vma); | ||
2405 | } | ||
2406 | |||
2407 | ret = 0; | ||
2408 | |||
2409 | out_unlock: | ||
2410 | if (ret) | ||
2411 | mm_drop_all_locks(mm); | ||
2412 | |||
2413 | return ret; | ||
2414 | } | ||
2415 | |||
2416 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | ||
2417 | { | ||
2418 | if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { | ||
2419 | /* | ||
2420 | * The LSB of head.next can't change to 0 from under | ||
2421 | * us because we hold the mm_all_locks_mutex. | ||
2422 | * | ||
2423 | * We must however clear the bitflag before unlocking | ||
2424 | * the vma so the users using the anon_vma->head will | ||
2425 | * never see our bitflag. | ||
2426 | * | ||
2427 | * No need of atomic instructions here, head.next | ||
2428 | * can't change from under us until we release the | ||
2429 | * anon_vma->lock. | ||
2430 | */ | ||
2431 | if (!__test_and_clear_bit(0, (unsigned long *) | ||
2432 | &anon_vma->head.next)) | ||
2433 | BUG(); | ||
2434 | spin_unlock(&anon_vma->lock); | ||
2435 | } | ||
2436 | } | ||
2437 | |||
2438 | static void vm_unlock_mapping(struct address_space *mapping) | ||
2439 | { | ||
2440 | if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { | ||
2441 | /* | ||
2442 | * AS_MM_ALL_LOCKS can't change to 0 from under us | ||
2443 | * because we hold the mm_all_locks_mutex. | ||
2444 | */ | ||
2445 | spin_unlock(&mapping->i_mmap_lock); | ||
2446 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, | ||
2447 | &mapping->flags)) | ||
2448 | BUG(); | ||
2449 | } | ||
2450 | } | ||
2451 | |||
2452 | /* | ||
2453 | * The mmap_sem cannot be released by the caller until | ||
2454 | * mm_drop_all_locks() returns. | ||
2455 | */ | ||
2456 | void mm_drop_all_locks(struct mm_struct *mm) | ||
2457 | { | ||
2458 | struct vm_area_struct *vma; | ||
2459 | |||
2460 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | ||
2461 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); | ||
2462 | |||
2463 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
2464 | if (vma->anon_vma) | ||
2465 | vm_unlock_anon_vma(vma->anon_vma); | ||
2466 | if (vma->vm_file && vma->vm_file->f_mapping) | ||
2467 | vm_unlock_mapping(vma->vm_file->f_mapping); | ||
2468 | } | ||
2469 | |||
2470 | mutex_unlock(&mm_all_locks_mutex); | ||
2471 | } | ||
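vm_lock_anon_vma() and its unlock counterpart reuse bit 0 of anon_vma->head.next as an "already locked by mm_take_all_locks()" marker, which only works because the pointer is at least two-byte aligned. Below is a standalone userspace illustration of that pointer-tagging trick; nothing in it is kernel API.

    /* Sketch: stashing a one-bit flag in the LSB of an aligned pointer,
     * the same trick the comments above describe for anon_vma->head.next. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node { struct node *next; int value; };

    static int test_and_set_lsb(struct node **slot)
    {
        uintptr_t v = (uintptr_t)*slot;
        int was_set = v & 1;

        *slot = (struct node *)(v | 1);
        return was_set;
    }

    static struct node *strip_lsb(struct node *p)
    {
        return (struct node *)((uintptr_t)p & ~(uintptr_t)1);
    }

    int main(void)
    {
        struct node *a = malloc(sizeof(*a));
        struct node head = { .next = a, .value = 0 };

        assert(((uintptr_t)a & 1) == 0);        /* malloc returns aligned memory */
        assert(!test_and_set_lsb(&head.next));  /* first "lock" sets the bit */
        assert(test_and_set_lsb(&head.next));   /* second caller sees it set */
        printf("real next pointer: %p\n", (void *)strip_lsb(head.next));
        free(a);
        return 0;
    }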
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c new file mode 100644 index 000000000000..5f4ef0250bee --- /dev/null +++ b/mm/mmu_notifier.c | |||
@@ -0,0 +1,277 @@ | |||
1 | /* | ||
2 | * linux/mm/mmu_notifier.c | ||
3 | * | ||
4 | * Copyright (C) 2008 Qumranet, Inc. | ||
5 | * Copyright (C) 2008 SGI | ||
6 | * Christoph Lameter <clameter@sgi.com> | ||
7 | * | ||
8 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
9 | * the COPYING file in the top-level directory. | ||
10 | */ | ||
11 | |||
12 | #include <linux/rculist.h> | ||
13 | #include <linux/mmu_notifier.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/mm.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <linux/rcupdate.h> | ||
18 | #include <linux/sched.h> | ||
19 | |||
20 | /* | ||
21 | * This function can't run concurrently against mmu_notifier_register | ||
22 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap | ||
23 | * runs with mm_users == 0. Other tasks may still invoke mmu notifiers | ||
24 | * in parallel despite there being no task using this mm any more, | ||
25 | * through the vmas outside of the exit_mmap context, such as with | ||
26 | * vmtruncate. This serializes against mmu_notifier_unregister with | ||
27 | * the mmu_notifier_mm->lock in addition to RCU and it serializes | ||
28 | * against the other mmu notifiers with RCU. struct mmu_notifier_mm | ||
29 | * can't go away from under us as exit_mmap holds an mm_count pin | ||
30 | * itself. | ||
31 | */ | ||
32 | void __mmu_notifier_release(struct mm_struct *mm) | ||
33 | { | ||
34 | struct mmu_notifier *mn; | ||
35 | |||
36 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
37 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | ||
38 | mn = hlist_entry(mm->mmu_notifier_mm->list.first, | ||
39 | struct mmu_notifier, | ||
40 | hlist); | ||
41 | /* | ||
42 | * We arrived before mmu_notifier_unregister so | ||
43 | * mmu_notifier_unregister will do nothing other than | ||
44 | * to wait for ->release to finish and for | ||
45 | * mmu_notifier_unregister to return. | ||
46 | */ | ||
47 | hlist_del_init_rcu(&mn->hlist); | ||
48 | /* | ||
49 | * RCU here will block mmu_notifier_unregister until | ||
50 | * ->release returns. | ||
51 | */ | ||
52 | rcu_read_lock(); | ||
53 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
54 | /* | ||
55 | * if ->release runs before mmu_notifier_unregister it | ||
56 | * must be handled as it's the only way for the driver | ||
57 | * to flush all existing sptes and stop the driver | ||
58 | * from establishing any more sptes before all the | ||
59 | * pages in the mm are freed. | ||
60 | */ | ||
61 | if (mn->ops->release) | ||
62 | mn->ops->release(mn, mm); | ||
63 | rcu_read_unlock(); | ||
64 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
65 | } | ||
66 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
67 | |||
68 | /* | ||
69 | * synchronize_rcu here prevents mmu_notifier_release from | ||
70 | * returning to exit_mmap (which would proceed to free all pages | ||
71 | * in the mm) until the ->release method returns, if it was | ||
72 | * invoked by mmu_notifier_unregister. | ||
73 | * | ||
74 | * The mmu_notifier_mm can't go away from under us because one | ||
75 | * mm_count is held by exit_mmap. | ||
76 | */ | ||
77 | synchronize_rcu(); | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * If no young bitflag is supported by the hardware, ->clear_flush_young can | ||
82 | * unmap the address and return 1 or 0 depending if the mapping previously | ||
83 | * existed or not. | ||
84 | */ | ||
85 | int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | ||
86 | unsigned long address) | ||
87 | { | ||
88 | struct mmu_notifier *mn; | ||
89 | struct hlist_node *n; | ||
90 | int young = 0; | ||
91 | |||
92 | rcu_read_lock(); | ||
93 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
94 | if (mn->ops->clear_flush_young) | ||
95 | young |= mn->ops->clear_flush_young(mn, mm, address); | ||
96 | } | ||
97 | rcu_read_unlock(); | ||
98 | |||
99 | return young; | ||
100 | } | ||
101 | |||
102 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, | ||
103 | unsigned long address) | ||
104 | { | ||
105 | struct mmu_notifier *mn; | ||
106 | struct hlist_node *n; | ||
107 | |||
108 | rcu_read_lock(); | ||
109 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
110 | if (mn->ops->invalidate_page) | ||
111 | mn->ops->invalidate_page(mn, mm, address); | ||
112 | } | ||
113 | rcu_read_unlock(); | ||
114 | } | ||
115 | |||
116 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | ||
117 | unsigned long start, unsigned long end) | ||
118 | { | ||
119 | struct mmu_notifier *mn; | ||
120 | struct hlist_node *n; | ||
121 | |||
122 | rcu_read_lock(); | ||
123 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
124 | if (mn->ops->invalidate_range_start) | ||
125 | mn->ops->invalidate_range_start(mn, mm, start, end); | ||
126 | } | ||
127 | rcu_read_unlock(); | ||
128 | } | ||
129 | |||
130 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | ||
131 | unsigned long start, unsigned long end) | ||
132 | { | ||
133 | struct mmu_notifier *mn; | ||
134 | struct hlist_node *n; | ||
135 | |||
136 | rcu_read_lock(); | ||
137 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
138 | if (mn->ops->invalidate_range_end) | ||
139 | mn->ops->invalidate_range_end(mn, mm, start, end); | ||
140 | } | ||
141 | rcu_read_unlock(); | ||
142 | } | ||
143 | |||
144 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | ||
145 | struct mm_struct *mm, | ||
146 | int take_mmap_sem) | ||
147 | { | ||
148 | struct mmu_notifier_mm *mmu_notifier_mm; | ||
149 | int ret; | ||
150 | |||
151 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | ||
152 | |||
153 | ret = -ENOMEM; | ||
154 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); | ||
155 | if (unlikely(!mmu_notifier_mm)) | ||
156 | goto out; | ||
157 | |||
158 | if (take_mmap_sem) | ||
159 | down_write(&mm->mmap_sem); | ||
160 | ret = mm_take_all_locks(mm); | ||
161 | if (unlikely(ret)) | ||
162 | goto out_cleanup; | ||
163 | |||
164 | if (!mm_has_notifiers(mm)) { | ||
165 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); | ||
166 | spin_lock_init(&mmu_notifier_mm->lock); | ||
167 | mm->mmu_notifier_mm = mmu_notifier_mm; | ||
168 | mmu_notifier_mm = NULL; | ||
169 | } | ||
170 | atomic_inc(&mm->mm_count); | ||
171 | |||
172 | /* | ||
173 | * Serialize the update against mmu_notifier_unregister. A | ||
174 | * side note: mmu_notifier_release can't run concurrently with | ||
175 | * us because we hold the mm_users pin (either implicitly as | ||
176 | * current->mm or explicitly with get_task_mm() or similar). | ||
177 | * We can't race against any other mmu notifier method either | ||
178 | * thanks to mm_take_all_locks(). | ||
179 | */ | ||
180 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
181 | hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); | ||
182 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
183 | |||
184 | mm_drop_all_locks(mm); | ||
185 | out_cleanup: | ||
186 | if (take_mmap_sem) | ||
187 | up_write(&mm->mmap_sem); | ||
188 | /* kfree() does nothing if mmu_notifier_mm is NULL */ | ||
189 | kfree(mmu_notifier_mm); | ||
190 | out: | ||
191 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | ||
192 | return ret; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * Must not hold mmap_sem nor any other VM related lock when calling | ||
197 | * this registration function. Must also ensure mm_users can't go down | ||
198 | * to zero while this runs to avoid races with mmu_notifier_release, | ||
199 | * so mm has to be current->mm or the mm should be pinned safely such | ||
200 | * as with get_task_mm(). If the mm is not current->mm, the mm_users | ||
201 | * pin should be released by calling mmput after mmu_notifier_register | ||
202 | * returns. mmu_notifier_unregister must always be called to | ||
203 | * unregister the notifier. mm_count is automatically pinned to allow | ||
204 | * mmu_notifier_unregister to safely run at any time later, before or | ||
205 | * after exit_mmap. ->release will always be called before exit_mmap | ||
206 | * frees the pages. | ||
207 | */ | ||
208 | int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) | ||
209 | { | ||
210 | return do_mmu_notifier_register(mn, mm, 1); | ||
211 | } | ||
212 | EXPORT_SYMBOL_GPL(mmu_notifier_register); | ||
213 | |||
214 | /* | ||
215 | * Same as mmu_notifier_register but here the caller must hold the | ||
216 | * mmap_sem in write mode. | ||
217 | */ | ||
218 | int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) | ||
219 | { | ||
220 | return do_mmu_notifier_register(mn, mm, 0); | ||
221 | } | ||
222 | EXPORT_SYMBOL_GPL(__mmu_notifier_register); | ||
223 | |||
224 | /* this is called after the last mmu_notifier_unregister() returned */ | ||
225 | void __mmu_notifier_mm_destroy(struct mm_struct *mm) | ||
226 | { | ||
227 | BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list)); | ||
228 | kfree(mm->mmu_notifier_mm); | ||
229 | mm->mmu_notifier_mm = LIST_POISON1; /* debug */ | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * This releases the mm_count pin automatically and frees the mm | ||
234 | * structure if it was the last user of it. It serializes against | ||
235 | * running mmu notifiers with RCU and against mmu_notifier_unregister | ||
236 | * with the unregister lock + RCU. All sptes must be dropped before | ||
237 | * calling mmu_notifier_unregister. ->release or any other notifier | ||
238 | * method may be invoked concurrently with mmu_notifier_unregister, | ||
239 | * and only after mmu_notifier_unregister has returned are we guaranteed | ||
240 | * that ->release or any other method can't run anymore. | ||
241 | */ | ||
242 | void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | ||
243 | { | ||
244 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | ||
245 | |||
246 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
247 | if (!hlist_unhashed(&mn->hlist)) { | ||
248 | hlist_del_rcu(&mn->hlist); | ||
249 | |||
250 | /* | ||
251 | * RCU here will force exit_mmap to wait for ->release to finish | ||
252 | * before freeing the pages. | ||
253 | */ | ||
254 | rcu_read_lock(); | ||
255 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
256 | /* | ||
257 | * exit_mmap will block in mmu_notifier_release to | ||
258 | * guarantee ->release is called before freeing the | ||
259 | * pages. | ||
260 | */ | ||
261 | if (mn->ops->release) | ||
262 | mn->ops->release(mn, mm); | ||
263 | rcu_read_unlock(); | ||
264 | } else | ||
265 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
266 | |||
267 | /* | ||
268 | * Wait for any running method to finish, including | ||
269 | * ->release if it was run by mmu_notifier_release instead of us. | ||
270 | */ | ||
271 | synchronize_rcu(); | ||
272 | |||
273 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | ||
274 | |||
275 | mmdrop(mm); | ||
276 | } | ||
277 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); | ||
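The dispatch helpers above define the callback surface an mmu-notifier user implements. A hedged, in-kernel style sketch of a registration follows; only ops fields actually invoked in this file are used, the "demo" names are invented, and registering against current->mm is just one possible choice.

    /* Hypothetical notifier user: the ops names match the callbacks invoked
     * in this file; error handling and teardown are abbreviated. */
    #include <linux/mmu_notifier.h>
    #include <linux/sched.h>

    static void demo_release(struct mmu_notifier *mn, struct mm_struct *mm)
    {
        /* last chance to drop any secondary mappings ("sptes") for this mm */
    }

    static void demo_invalidate_range_start(struct mmu_notifier *mn,
                                            struct mm_struct *mm,
                                            unsigned long start, unsigned long end)
    {
        /* stop using [start, end) in the secondary MMU */
    }

    static void demo_invalidate_range_end(struct mmu_notifier *mn,
                                          struct mm_struct *mm,
                                          unsigned long start, unsigned long end)
    {
        /* primary ptes are stable again; secondary mappings may be rebuilt */
    }

    static const struct mmu_notifier_ops demo_ops = {
        .release                = demo_release,
        .invalidate_range_start = demo_invalidate_range_start,
        .invalidate_range_end   = demo_invalidate_range_end,
    };

    static struct mmu_notifier demo_mn = { .ops = &demo_ops };

    static int demo_attach_current_mm(void)
    {
        /* caller must not hold mmap_sem; mm_users is pinned via current->mm */
        return mmu_notifier_register(&demo_mn, current->mm);
    }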
diff --git a/mm/mmzone.c b/mm/mmzone.c index 486ed595ee6f..16ce8b955dcf 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, | |||
69 | (z->zone && !zref_in_nodemask(z, nodes))) | 69 | (z->zone && !zref_in_nodemask(z, nodes))) |
70 | z++; | 70 | z++; |
71 | 71 | ||
72 | *zone = zonelist_zone(z++); | 72 | *zone = zonelist_zone(z); |
73 | return z; | 73 | return z; |
74 | } | 74 | } |
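The mmzone.c one-liner matters because for_each_zone_zonelist() advances the cursor itself; returning z++ therefore hands back a cursor already past the zone just stored in *zone, and every other zone gets skipped. A small userspace analogue of that off-by-one, with plain ints standing in for zonerefs:

    #include <stdio.h>

    /* buggy_next mimics "*zone = zonelist_zone(z++)": it reports *z but hands
     * back a cursor that already moved past it. fixed_next reports and stays. */
    static int *buggy_next(int *z, int *out) { *out = *z++; return z; }
    static int *fixed_next(int *z, int *out) { *out = *z;   return z; }

    int main(void)
    {
        /* -1 acts like a NULL zone; the extra terminator keeps the buggy
         * variant from reading past the end while it skips entries. */
        int zones[] = { 10, 20, 30, -1, -1 };
        int *z, v;

        for (z = buggy_next(zones, &v); v != -1; z = buggy_next(z + 1, &v))
            printf("buggy iterator sees %d\n", v);   /* 10, 30 - zone 20 lost */
        for (z = fixed_next(zones, &v); v != -1; z = fixed_next(z + 1, &v))
            printf("fixed iterator sees %d\n", v);   /* 10, 20, 30 */
        return 0;
    }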
diff --git a/mm/mprotect.c b/mm/mprotect.c index 360d9cc8b38c..fded06f923f4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/syscalls.h> | 21 | #include <linux/syscalls.h> |
22 | #include <linux/swap.h> | 22 | #include <linux/swap.h> |
23 | #include <linux/swapops.h> | 23 | #include <linux/swapops.h> |
24 | #include <linux/mmu_notifier.h> | ||
24 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
25 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
26 | #include <asm/cacheflush.h> | 27 | #include <asm/cacheflush.h> |
@@ -153,12 +154,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
153 | * If we make a private mapping writable we increase our commit; | 154 | * If we make a private mapping writable we increase our commit; |
154 | * but (without finer accounting) cannot reduce our commit if we | 155 | * but (without finer accounting) cannot reduce our commit if we |
155 | * make it unwritable again. | 156 | * make it unwritable again. |
156 | * | ||
157 | * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting | ||
158 | * a MAP_NORESERVE private mapping to writable will now reserve. | ||
159 | */ | 157 | */ |
160 | if (newflags & VM_WRITE) { | 158 | if (newflags & VM_WRITE) { |
161 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { | 159 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE| |
160 | VM_SHARED|VM_NORESERVE))) { | ||
162 | charged = nrpages; | 161 | charged = nrpages; |
163 | if (security_vm_enough_memory(charged)) | 162 | if (security_vm_enough_memory(charged)) |
164 | return -ENOMEM; | 163 | return -ENOMEM; |
@@ -205,10 +204,12 @@ success: | |||
205 | dirty_accountable = 1; | 204 | dirty_accountable = 1; |
206 | } | 205 | } |
207 | 206 | ||
207 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
208 | if (is_vm_hugetlb_page(vma)) | 208 | if (is_vm_hugetlb_page(vma)) |
209 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); | 209 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); |
210 | else | 210 | else |
211 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); | 211 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); |
212 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
212 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 213 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
213 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 214 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
214 | return 0; | 215 | return 0; |
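The two added calls follow the general bracketing rule this series applies to every primary-MMU PTE change: invalidate secondary mappings before touching the page tables, and signal completion afterwards so the secondary MMU can re-fault. A condensed sketch of the pattern in isolation (sketch_change_range() is a hypothetical helper, not code from this patch):

    #include <linux/mm.h>
    #include <linux/mmu_notifier.h>

    static void sketch_change_range(struct vm_area_struct *vma,
                                    unsigned long start, unsigned long end)
    {
            struct mm_struct *mm = vma->vm_mm;

            /* secondary MMUs drop their mappings of [start, end) here ... */
            mmu_notifier_invalidate_range_start(mm, start, end);

            /* ... the primary page tables are modified and TLBs flushed ... */

            /* ... and secondary MMUs may re-establish mappings by faulting */
            mmu_notifier_invalidate_range_end(mm, start, end);
    }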
diff --git a/mm/mremap.c b/mm/mremap.c index 08e3c7f2bd15..58a2908f42f5 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -18,11 +18,14 @@ | |||
18 | #include <linux/highmem.h> | 18 | #include <linux/highmem.h> |
19 | #include <linux/security.h> | 19 | #include <linux/security.h> |
20 | #include <linux/syscalls.h> | 20 | #include <linux/syscalls.h> |
21 | #include <linux/mmu_notifier.h> | ||
21 | 22 | ||
22 | #include <asm/uaccess.h> | 23 | #include <asm/uaccess.h> |
23 | #include <asm/cacheflush.h> | 24 | #include <asm/cacheflush.h> |
24 | #include <asm/tlbflush.h> | 25 | #include <asm/tlbflush.h> |
25 | 26 | ||
27 | #include "internal.h" | ||
28 | |||
26 | static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) | 29 | static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) |
27 | { | 30 | { |
28 | pgd_t *pgd; | 31 | pgd_t *pgd; |
@@ -74,7 +77,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
74 | struct mm_struct *mm = vma->vm_mm; | 77 | struct mm_struct *mm = vma->vm_mm; |
75 | pte_t *old_pte, *new_pte, pte; | 78 | pte_t *old_pte, *new_pte, pte; |
76 | spinlock_t *old_ptl, *new_ptl; | 79 | spinlock_t *old_ptl, *new_ptl; |
80 | unsigned long old_start; | ||
77 | 81 | ||
82 | old_start = old_addr; | ||
83 | mmu_notifier_invalidate_range_start(vma->vm_mm, | ||
84 | old_start, old_end); | ||
78 | if (vma->vm_file) { | 85 | if (vma->vm_file) { |
79 | /* | 86 | /* |
80 | * Subtle point from Rajesh Venkatasubramanian: before | 87 | * Subtle point from Rajesh Venkatasubramanian: before |
@@ -116,6 +123,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
116 | pte_unmap_unlock(old_pte - 1, old_ptl); | 123 | pte_unmap_unlock(old_pte - 1, old_ptl); |
117 | if (mapping) | 124 | if (mapping) |
118 | spin_unlock(&mapping->i_mmap_lock); | 125 | spin_unlock(&mapping->i_mmap_lock); |
126 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); | ||
119 | } | 127 | } |
120 | 128 | ||
121 | #define LATENCY_LIMIT (64 * PAGE_SIZE) | 129 | #define LATENCY_LIMIT (64 * PAGE_SIZE) |
@@ -232,8 +240,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
232 | if (vm_flags & VM_LOCKED) { | 240 | if (vm_flags & VM_LOCKED) { |
233 | mm->locked_vm += new_len >> PAGE_SHIFT; | 241 | mm->locked_vm += new_len >> PAGE_SHIFT; |
234 | if (new_len > old_len) | 242 | if (new_len > old_len) |
235 | make_pages_present(new_addr + old_len, | 243 | mlock_vma_pages_range(new_vma, new_addr + old_len, |
236 | new_addr + new_len); | 244 | new_addr + new_len); |
237 | } | 245 | } |
238 | 246 | ||
239 | return new_addr; | 247 | return new_addr; |
@@ -373,7 +381,7 @@ unsigned long do_mremap(unsigned long addr, | |||
373 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 381 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
374 | if (vma->vm_flags & VM_LOCKED) { | 382 | if (vma->vm_flags & VM_LOCKED) { |
375 | mm->locked_vm += pages; | 383 | mm->locked_vm += pages; |
376 | make_pages_present(addr + old_len, | 384 | mlock_vma_pages_range(vma, addr + old_len, |
377 | addr + new_len); | 385 | addr + new_len); |
378 | } | 386 | } |
379 | ret = addr; | 387 | ret = addr; |
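For the two VM_LOCKED hunks, the user-visible contract is unchanged: pages that mremap() adds to a locked mapping must end up resident, only now they are faulted in through mlock_vma_pages_range() so they can also land on the unevictable LRU rather than staying on the normal lists. A small userspace program that exercises exactly this path (illustrative, not part of the patch):

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <stdio.h>

    int main(void)
    {
            size_t old_len = 1 << 20, new_len = 2 << 20;
            void *p;

            p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED || mlock(p, old_len) != 0) {
                    perror("setup");
                    return 1;
            }

            /* Growing the locked vma makes the kernel mlock the new tail too. */
            p = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
            if (p == MAP_FAILED) {
                    perror("mremap");
                    return 1;
            }

            printf("locked mapping now %zu bytes at %p; compare VmLck in /proc/self/status\n",
                   new_len, p);
            return 0;
    }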
diff --git a/mm/nommu.c b/mm/nommu.c index 4462b6a3fcb9..2696b24f2bb3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -22,7 +22,7 @@ | |||
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/vmalloc.h> | 24 | #include <linux/vmalloc.h> |
25 | #include <linux/ptrace.h> | 25 | #include <linux/tracehook.h> |
26 | #include <linux/blkdev.h> | 26 | #include <linux/blkdev.h> |
27 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
28 | #include <linux/mount.h> | 28 | #include <linux/mount.h> |
@@ -34,6 +34,8 @@ | |||
34 | #include <asm/tlb.h> | 34 | #include <asm/tlb.h> |
35 | #include <asm/tlbflush.h> | 35 | #include <asm/tlbflush.h> |
36 | 36 | ||
37 | #include "internal.h" | ||
38 | |||
37 | void *high_memory; | 39 | void *high_memory; |
38 | struct page *mem_map; | 40 | struct page *mem_map; |
39 | unsigned long max_mapnr; | 41 | unsigned long max_mapnr; |
@@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp) | |||
128 | return PAGE_SIZE << compound_order(page); | 130 | return PAGE_SIZE << compound_order(page); |
129 | } | 131 | } |
130 | 132 | ||
131 | /* | 133 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
132 | * get a list of pages in an address range belonging to the specified process | 134 | unsigned long start, int len, int flags, |
133 | * and indicate the VMA that covers each page | 135 | struct page **pages, struct vm_area_struct **vmas) |
134 | * - this is potentially dodgy as we may end incrementing the page count of a | ||
135 | * slab page or a secondary page from a compound page | ||
136 | * - don't permit access to VMAs that don't support it, such as I/O mappings | ||
137 | */ | ||
138 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
139 | unsigned long start, int len, int write, int force, | ||
140 | struct page **pages, struct vm_area_struct **vmas) | ||
141 | { | 136 | { |
142 | struct vm_area_struct *vma; | 137 | struct vm_area_struct *vma; |
143 | unsigned long vm_flags; | 138 | unsigned long vm_flags; |
144 | int i; | 139 | int i; |
140 | int write = !!(flags & GUP_FLAGS_WRITE); | ||
141 | int force = !!(flags & GUP_FLAGS_FORCE); | ||
142 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); | ||
145 | 143 | ||
146 | /* calculate required read or write permissions. | 144 | /* calculate required read or write permissions. |
147 | * - if 'force' is set, we only require the "MAY" flags. | 145 | * - if 'force' is set, we only require the "MAY" flags. |
@@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
156 | 154 | ||
157 | /* protect what we can, including chardevs */ | 155 | /* protect what we can, including chardevs */ |
158 | if (vma->vm_flags & (VM_IO | VM_PFNMAP) || | 156 | if (vma->vm_flags & (VM_IO | VM_PFNMAP) || |
159 | !(vm_flags & vma->vm_flags)) | 157 | (!ignore && !(vm_flags & vma->vm_flags))) |
160 | goto finish_or_fault; | 158 | goto finish_or_fault; |
161 | 159 | ||
162 | if (pages) { | 160 | if (pages) { |
@@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
174 | finish_or_fault: | 172 | finish_or_fault: |
175 | return i ? : -EFAULT; | 173 | return i ? : -EFAULT; |
176 | } | 174 | } |
175 | |||
176 | |||
177 | /* | ||
178 | * get a list of pages in an address range belonging to the specified process | ||
179 | * and indicate the VMA that covers each page | ||
180 | * - this is potentially dodgy as we may end up incrementing the page count of a | ||
181 | * slab page or a secondary page from a compound page | ||
182 | * - don't permit access to VMAs that don't support it, such as I/O mappings | ||
183 | */ | ||
184 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
185 | unsigned long start, int len, int write, int force, | ||
186 | struct page **pages, struct vm_area_struct **vmas) | ||
187 | { | ||
188 | int flags = 0; | ||
189 | |||
190 | if (write) | ||
191 | flags |= GUP_FLAGS_WRITE; | ||
192 | if (force) | ||
193 | flags |= GUP_FLAGS_FORCE; | ||
194 | |||
195 | return __get_user_pages(tsk, mm, | ||
196 | start, len, flags, | ||
197 | pages, vmas); | ||
198 | } | ||
177 | EXPORT_SYMBOL(get_user_pages); | 199 | EXPORT_SYMBOL(get_user_pages); |
178 | 200 | ||
179 | DEFINE_RWLOCK(vmlist_lock); | 201 | DEFINE_RWLOCK(vmlist_lock); |
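The old write/force booleans now simply select the GUP_FLAGS_WRITE/GUP_FLAGS_FORCE bits provided by mm/internal.h, so existing callers keep the familiar signature. A sketch of a typical in-kernel caller of the unchanged wrapper (assumed driver context; uaddr and the error policy are illustrative, not from this patch):

    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/sched.h>
    #include <linux/errno.h>

    /* Pin one user page of the current task for writing, use it, release it. */
    static int sketch_pin_one_page(unsigned long uaddr)
    {
            struct page *page;
            int ret;

            down_read(&current->mm->mmap_sem);
            ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
                                 1 /* nr_pages */, 1 /* write */, 0 /* force */,
                                 &page, NULL);
            up_read(&current->mm->mmap_sem);

            if (ret != 1)
                    return ret < 0 ? ret : -EFAULT;

            /* ... access the page ... */

            page_cache_release(page);       /* drop the reference GUP took */
            return 0;
    }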
@@ -266,6 +288,27 @@ void *vmalloc_node(unsigned long size, int node) | |||
266 | } | 288 | } |
267 | EXPORT_SYMBOL(vmalloc_node); | 289 | EXPORT_SYMBOL(vmalloc_node); |
268 | 290 | ||
291 | #ifndef PAGE_KERNEL_EXEC | ||
292 | # define PAGE_KERNEL_EXEC PAGE_KERNEL | ||
293 | #endif | ||
294 | |||
295 | /** | ||
296 | * vmalloc_exec - allocate virtually contiguous, executable memory | ||
297 | * @size: allocation size | ||
298 | * | ||
299 | * Kernel-internal function to allocate enough pages to cover @size from | ||
300 | * the page level allocator and map them into contiguous and | ||
301 | * executable kernel virtual space. | ||
302 | * | ||
303 | * For tight control over page level allocator and protection flags | ||
304 | * use __vmalloc() instead. | ||
305 | */ | ||
306 | |||
307 | void *vmalloc_exec(unsigned long size) | ||
308 | { | ||
309 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); | ||
310 | } | ||
311 | |||
269 | /** | 312 | /** |
270 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) | 313 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) |
271 | * @size: allocation size | 314 | * @size: allocation size |
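On MMU kernels vmalloc_exec() already existed; this hunk gives nommu the same entry point, falling back to PAGE_KERNEL when the architecture defines no PAGE_KERNEL_EXEC. A hypothetical consumer, sketched after the pattern several architectures use for loading module text (an assumption -- real module_alloc() implementations are per-arch and vary):

    #include <linux/vmalloc.h>
    #include <linux/moduleloader.h>

    void *module_alloc(unsigned long size)
    {
            if (size == 0)
                    return NULL;
            /* module text must be executable, so not plain vmalloc() */
            return vmalloc_exec(size);
    }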
@@ -745,7 +788,7 @@ static unsigned long determine_vm_flags(struct file *file, | |||
745 | * it's being traced - otherwise breakpoints set in it may interfere | 788 | * it's being traced - otherwise breakpoints set in it may interfere |
746 | * with another untraced process | 789 | * with another untraced process |
747 | */ | 790 | */ |
748 | if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) | 791 | if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) |
749 | vm_flags &= ~VM_MAYSHARE; | 792 | vm_flags &= ~VM_MAYSHARE; |
750 | 793 | ||
751 | return vm_flags; | 794 | return vm_flags; |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8a5467ee6265..64e5b4bcd964 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/module.h> | 26 | #include <linux/module.h> |
27 | #include <linux/notifier.h> | 27 | #include <linux/notifier.h> |
28 | #include <linux/memcontrol.h> | 28 | #include <linux/memcontrol.h> |
29 | #include <linux/security.h> | ||
29 | 30 | ||
30 | int sysctl_panic_on_oom; | 31 | int sysctl_panic_on_oom; |
31 | int sysctl_oom_kill_allocating_task; | 32 | int sysctl_oom_kill_allocating_task; |
@@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
128 | * Superuser processes are usually more important, so we make it | 129 | * Superuser processes are usually more important, so we make it |
129 | * less likely that we kill those. | 130 | * less likely that we kill those. |
130 | */ | 131 | */ |
131 | if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) | 132 | if (has_capability(p, CAP_SYS_ADMIN) || |
133 | has_capability(p, CAP_SYS_RESOURCE)) | ||
132 | points /= 4; | 134 | points /= 4; |
133 | 135 | ||
134 | /* | 136 | /* |
@@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
137 | * tend to only have this flag set on applications they think | 139 | * tend to only have this flag set on applications they think |
138 | * of as important. | 140 | * of as important. |
139 | */ | 141 | */ |
140 | if (__capable(p, CAP_SYS_RAWIO)) | 142 | if (has_capability(p, CAP_SYS_RAWIO)) |
141 | points /= 4; | 143 | points /= 4; |
142 | 144 | ||
143 | /* | 145 | /* |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 94c6d8988ab3..2970e35fd03f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * Contains functions related to writing back dirty pages at the | 7 | * Contains functions related to writing back dirty pages at the |
8 | * address_space level. | 8 | * address_space level. |
9 | * | 9 | * |
10 | * 10Apr2002 akpm@zip.com.au | 10 | * 10Apr2002 Andrew Morton |
11 | * Initial version | 11 | * Initial version |
12 | */ | 12 | */ |
13 | 13 | ||
@@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
329 | struct zone *z = | 329 | struct zone *z = |
330 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | 330 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; |
331 | 331 | ||
332 | x += zone_page_state(z, NR_FREE_PAGES) | 332 | x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); |
333 | + zone_page_state(z, NR_INACTIVE) | ||
334 | + zone_page_state(z, NR_ACTIVE); | ||
335 | } | 333 | } |
336 | /* | 334 | /* |
337 | * Make sure that the number of highmem pages is never larger | 335 | * Make sure that the number of highmem pages is never larger |
@@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void) | |||
355 | { | 353 | { |
356 | unsigned long x; | 354 | unsigned long x; |
357 | 355 | ||
358 | x = global_page_state(NR_FREE_PAGES) | 356 | x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); |
359 | + global_page_state(NR_INACTIVE) | ||
360 | + global_page_state(NR_ACTIVE); | ||
361 | 357 | ||
362 | if (!vm_highmem_is_dirtyable) | 358 | if (!vm_highmem_is_dirtyable) |
363 | x -= highmem_dirtyable_memory(x); | 359 | x -= highmem_dirtyable_memory(x); |
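With this change "dirtyable" memory means free pages plus whatever sits on the file and anon LRUs, rather than the old single active/inactive pair. The writeback thresholds are then taken as percentages of that figure; a condensed sketch of the consumer, as if placed next to get_dirty_limits() in this file and ignoring its per-task and per-bdi adjustments (an approximation, not the patch's code):

    static void sketch_dirty_thresholds(unsigned long *pbackground,
                                        unsigned long *pdirty)
    {
            unsigned long available_memory = determine_dirtyable_memory();

            /* vm.dirty_background_ratio and vm.dirty_ratio sysctls */
            *pbackground = (dirty_background_ratio * available_memory) / 100;
            *pdirty = (vm_dirty_ratio * available_memory) / 100;
    }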
@@ -876,6 +872,7 @@ int write_cache_pages(struct address_space *mapping, | |||
876 | pgoff_t end; /* Inclusive */ | 872 | pgoff_t end; /* Inclusive */ |
877 | int scanned = 0; | 873 | int scanned = 0; |
878 | int range_whole = 0; | 874 | int range_whole = 0; |
875 | long nr_to_write = wbc->nr_to_write; | ||
879 | 876 | ||
880 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | 877 | if (wbc->nonblocking && bdi_write_congested(bdi)) { |
881 | wbc->encountered_congestion = 1; | 878 | wbc->encountered_congestion = 1; |
@@ -939,7 +936,7 @@ retry: | |||
939 | unlock_page(page); | 936 | unlock_page(page); |
940 | ret = 0; | 937 | ret = 0; |
941 | } | 938 | } |
942 | if (ret || (--(wbc->nr_to_write) <= 0)) | 939 | if (ret || (--nr_to_write <= 0)) |
943 | done = 1; | 940 | done = 1; |
944 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | 941 | if (wbc->nonblocking && bdi_write_congested(bdi)) { |
945 | wbc->encountered_congestion = 1; | 942 | wbc->encountered_congestion = 1; |
@@ -958,11 +955,12 @@ retry: | |||
958 | index = 0; | 955 | index = 0; |
959 | goto retry; | 956 | goto retry; |
960 | } | 957 | } |
961 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | 958 | if (!wbc->no_nrwrite_index_update) { |
962 | mapping->writeback_index = index; | 959 | if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) |
960 | mapping->writeback_index = index; | ||
961 | wbc->nr_to_write = nr_to_write; | ||
962 | } | ||
963 | 963 | ||
964 | if (wbc->range_cont) | ||
965 | wbc->range_start = index << PAGE_CACHE_SHIFT; | ||
966 | return ret; | 964 | return ret; |
967 | } | 965 | } |
968 | EXPORT_SYMBOL(write_cache_pages); | 966 | EXPORT_SYMBOL(write_cache_pages); |
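The new wbc->no_nrwrite_index_update flag is for ->writepages implementations that use write_cache_pages() as a helper but do their own bookkeeping of nr_to_write and writeback_index (ext4's delalloc writeback was the motivating user). A hedged sketch of such a caller -- sketch_writepage()/sketch_writepages() are illustrative stand-ins, not code from this patch:

    #include <linux/fs.h>
    #include <linux/pagemap.h>
    #include <linux/writeback.h>

    /* per-page callback handed to write_cache_pages(); the page arrives locked */
    static int sketch_writepage(struct page *page, struct writeback_control *wbc,
                                void *data)
    {
            struct address_space *mapping = data;

            return mapping->a_ops->writepage(page, wbc);
    }

    static int sketch_writepages(struct address_space *mapping,
                                 struct writeback_control *wbc)
    {
            int ret;

            wbc->no_nrwrite_index_update = 1;   /* we account progress ourselves */
            ret = write_cache_pages(mapping, wbc, sketch_writepage, mapping);

            /*
             * With the flag set, write_cache_pages() left wbc->nr_to_write and
             * mapping->writeback_index untouched; a real implementation updates
             * both here from its own bookkeeping.
             */
            return ret;
    }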
@@ -1088,7 +1086,7 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
1088 | if (!mapping) | 1086 | if (!mapping) |
1089 | return 1; | 1087 | return 1; |
1090 | 1088 | ||
1091 | write_lock_irq(&mapping->tree_lock); | 1089 | spin_lock_irq(&mapping->tree_lock); |
1092 | mapping2 = page_mapping(page); | 1090 | mapping2 = page_mapping(page); |
1093 | if (mapping2) { /* Race with truncate? */ | 1091 | if (mapping2) { /* Race with truncate? */ |
1094 | BUG_ON(mapping2 != mapping); | 1092 | BUG_ON(mapping2 != mapping); |
@@ -1102,7 +1100,7 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
1102 | radix_tree_tag_set(&mapping->page_tree, | 1100 | radix_tree_tag_set(&mapping->page_tree, |
1103 | page_index(page), PAGECACHE_TAG_DIRTY); | 1101 | page_index(page), PAGECACHE_TAG_DIRTY); |
1104 | } | 1102 | } |
1105 | write_unlock_irq(&mapping->tree_lock); | 1103 | spin_unlock_irq(&mapping->tree_lock); |
1106 | if (mapping->host) { | 1104 | if (mapping->host) { |
1107 | /* !PageAnon && !swapper_space */ | 1105 | /* !PageAnon && !swapper_space */ |
1108 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | 1106 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
@@ -1258,7 +1256,7 @@ int test_clear_page_writeback(struct page *page) | |||
1258 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1256 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1259 | unsigned long flags; | 1257 | unsigned long flags; |
1260 | 1258 | ||
1261 | write_lock_irqsave(&mapping->tree_lock, flags); | 1259 | spin_lock_irqsave(&mapping->tree_lock, flags); |
1262 | ret = TestClearPageWriteback(page); | 1260 | ret = TestClearPageWriteback(page); |
1263 | if (ret) { | 1261 | if (ret) { |
1264 | radix_tree_tag_clear(&mapping->page_tree, | 1262 | radix_tree_tag_clear(&mapping->page_tree, |
@@ -1269,7 +1267,7 @@ int test_clear_page_writeback(struct page *page) | |||
1269 | __bdi_writeout_inc(bdi); | 1267 | __bdi_writeout_inc(bdi); |
1270 | } | 1268 | } |
1271 | } | 1269 | } |
1272 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 1270 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
1273 | } else { | 1271 | } else { |
1274 | ret = TestClearPageWriteback(page); | 1272 | ret = TestClearPageWriteback(page); |
1275 | } | 1273 | } |
@@ -1287,7 +1285,7 @@ int test_set_page_writeback(struct page *page) | |||
1287 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1285 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1288 | unsigned long flags; | 1286 | unsigned long flags; |
1289 | 1287 | ||
1290 | write_lock_irqsave(&mapping->tree_lock, flags); | 1288 | spin_lock_irqsave(&mapping->tree_lock, flags); |
1291 | ret = TestSetPageWriteback(page); | 1289 | ret = TestSetPageWriteback(page); |
1292 | if (!ret) { | 1290 | if (!ret) { |
1293 | radix_tree_tag_set(&mapping->page_tree, | 1291 | radix_tree_tag_set(&mapping->page_tree, |
@@ -1300,7 +1298,7 @@ int test_set_page_writeback(struct page *page) | |||
1300 | radix_tree_tag_clear(&mapping->page_tree, | 1298 | radix_tree_tag_clear(&mapping->page_tree, |
1301 | page_index(page), | 1299 | page_index(page), |
1302 | PAGECACHE_TAG_DIRTY); | 1300 | PAGECACHE_TAG_DIRTY); |
1303 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 1301 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
1304 | } else { | 1302 | } else { |
1305 | ret = TestSetPageWriteback(page); | 1303 | ret = TestSetPageWriteback(page); |
1306 | } | 1304 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 79ac4afc908c..d0a240fbb8bf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -44,7 +44,7 @@ | |||
44 | #include <linux/backing-dev.h> | 44 | #include <linux/backing-dev.h> |
45 | #include <linux/fault-inject.h> | 45 | #include <linux/fault-inject.h> |
46 | #include <linux/page-isolation.h> | 46 | #include <linux/page-isolation.h> |
47 | #include <linux/memcontrol.h> | 47 | #include <linux/page_cgroup.h> |
48 | #include <linux/debugobjects.h> | 48 | #include <linux/debugobjects.h> |
49 | 49 | ||
50 | #include <asm/tlbflush.h> | 50 | #include <asm/tlbflush.h> |
@@ -153,9 +153,9 @@ static unsigned long __meminitdata dma_reserve; | |||
153 | static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; | 153 | static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; |
154 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; | 154 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; |
155 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 155 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
156 | unsigned long __initdata required_kernelcore; | 156 | static unsigned long __initdata required_kernelcore; |
157 | static unsigned long __initdata required_movablecore; | 157 | static unsigned long __initdata required_movablecore; |
158 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 158 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
159 | 159 | ||
160 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 160 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
161 | int movable_zone; | 161 | int movable_zone; |
@@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page) | |||
223 | 223 | ||
224 | static void bad_page(struct page *page) | 224 | static void bad_page(struct page *page) |
225 | { | 225 | { |
226 | void *pc = page_get_page_cgroup(page); | ||
227 | |||
228 | printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG | 226 | printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG |
229 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | 227 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", |
230 | current->comm, page, (int)(2*sizeof(unsigned long)), | 228 | current->comm, page, (int)(2*sizeof(unsigned long)), |
231 | (unsigned long)page->flags, page->mapping, | 229 | (unsigned long)page->flags, page->mapping, |
232 | page_mapcount(page), page_count(page)); | 230 | page_mapcount(page), page_count(page)); |
233 | if (pc) { | 231 | |
234 | printk(KERN_EMERG "cgroup:%p\n", pc); | ||
235 | page_reset_bad_cgroup(page); | ||
236 | } | ||
237 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" | 232 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" |
238 | KERN_EMERG "Backtrace:\n"); | 233 | KERN_EMERG "Backtrace:\n"); |
239 | dump_stack(); | 234 | dump_stack(); |
@@ -264,17 +259,18 @@ static void free_compound_page(struct page *page) | |||
264 | __free_pages_ok(page, compound_order(page)); | 259 | __free_pages_ok(page, compound_order(page)); |
265 | } | 260 | } |
266 | 261 | ||
267 | static void prep_compound_page(struct page *page, unsigned long order) | 262 | void prep_compound_page(struct page *page, unsigned long order) |
268 | { | 263 | { |
269 | int i; | 264 | int i; |
270 | int nr_pages = 1 << order; | 265 | int nr_pages = 1 << order; |
266 | struct page *p = page + 1; | ||
271 | 267 | ||
272 | set_compound_page_dtor(page, free_compound_page); | 268 | set_compound_page_dtor(page, free_compound_page); |
273 | set_compound_order(page, order); | 269 | set_compound_order(page, order); |
274 | __SetPageHead(page); | 270 | __SetPageHead(page); |
275 | for (i = 1; i < nr_pages; i++) { | 271 | for (i = 1; i < nr_pages; i++, p++) { |
276 | struct page *p = page + i; | 272 | if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) |
277 | 273 | p = pfn_to_page(page_to_pfn(page) + i); | |
278 | __SetPageTail(p); | 274 | __SetPageTail(p); |
279 | p->first_page = page; | 275 | p->first_page = page; |
280 | } | 276 | } |
@@ -284,6 +280,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
284 | { | 280 | { |
285 | int i; | 281 | int i; |
286 | int nr_pages = 1 << order; | 282 | int nr_pages = 1 << order; |
283 | struct page *p = page + 1; | ||
287 | 284 | ||
288 | if (unlikely(compound_order(page) != order)) | 285 | if (unlikely(compound_order(page) != order)) |
289 | bad_page(page); | 286 | bad_page(page); |
@@ -291,8 +288,9 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
291 | if (unlikely(!PageHead(page))) | 288 | if (unlikely(!PageHead(page))) |
292 | bad_page(page); | 289 | bad_page(page); |
293 | __ClearPageHead(page); | 290 | __ClearPageHead(page); |
294 | for (i = 1; i < nr_pages; i++) { | 291 | for (i = 1; i < nr_pages; i++, p++) { |
295 | struct page *p = page + i; | 292 | if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) |
293 | p = pfn_to_page(page_to_pfn(page) + i); | ||
296 | 294 | ||
297 | if (unlikely(!PageTail(p) | | 295 | if (unlikely(!PageTail(p) | |
298 | (p->first_page != page))) | 296 | (p->first_page != page))) |
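Both loops above now re-derive the tail pointer from the pfn whenever the index crosses a MAX_ORDER block, because for gigantic pages (order above MAX_ORDER) mem_map is not guaranteed contiguous beyond that boundary. The stepping rule, pulled out as a hypothetical helper for clarity (not code from this patch):

    #include <linux/mm.h>
    #include <linux/mmzone.h>

    /* Visit every tail page of a (possibly gigantic) compound page. */
    static void sketch_for_each_tail(struct page *head, unsigned long order,
                                     void (*fn)(struct page *tail))
    {
            unsigned long i, nr_pages = 1UL << order;
            struct page *p = head + 1;

            for (i = 1; i < nr_pages; i++, p++) {
                    /* "head + i" is only safe within one MAX_ORDER block */
                    if ((i & (MAX_ORDER_NR_PAGES - 1)) == 0)
                            p = pfn_to_page(page_to_pfn(head) + i);
                    fn(p);
            }
    }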
@@ -432,8 +430,9 @@ static inline void __free_one_page(struct page *page, | |||
432 | 430 | ||
433 | buddy = __page_find_buddy(page, page_idx, order); | 431 | buddy = __page_find_buddy(page, page_idx, order); |
434 | if (!page_is_buddy(page, buddy, order)) | 432 | if (!page_is_buddy(page, buddy, order)) |
435 | break; /* Move the buddy up one level. */ | 433 | break; |
436 | 434 | ||
435 | /* Our buddy is free, merge with it and move up one order. */ | ||
437 | list_del(&buddy->lru); | 436 | list_del(&buddy->lru); |
438 | zone->free_area[order].nr_free--; | 437 | zone->free_area[order].nr_free--; |
439 | rmv_page_order(buddy); | 438 | rmv_page_order(buddy); |
@@ -450,14 +449,16 @@ static inline void __free_one_page(struct page *page, | |||
450 | 449 | ||
451 | static inline int free_pages_check(struct page *page) | 450 | static inline int free_pages_check(struct page *page) |
452 | { | 451 | { |
452 | free_page_mlock(page); | ||
453 | if (unlikely(page_mapcount(page) | | 453 | if (unlikely(page_mapcount(page) | |
454 | (page->mapping != NULL) | | 454 | (page->mapping != NULL) | |
455 | (page_get_page_cgroup(page) != NULL) | | ||
456 | (page_count(page) != 0) | | 455 | (page_count(page) != 0) | |
457 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) | 456 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) |
458 | bad_page(page); | 457 | bad_page(page); |
459 | if (PageDirty(page)) | 458 | if (PageDirty(page)) |
460 | __ClearPageDirty(page); | 459 | __ClearPageDirty(page); |
460 | if (PageSwapBacked(page)) | ||
461 | __ClearPageSwapBacked(page); | ||
461 | /* | 462 | /* |
462 | * For now, we report if PG_reserved was found set, but do not | 463 | * For now, we report if PG_reserved was found set, but do not |
463 | * clear it, and do not free the page. But we shall soon need | 464 | * clear it, and do not free the page. But we shall soon need |
@@ -532,7 +533,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
532 | /* | 533 | /* |
533 | * permit the bootmem allocator to evade page validation on high-order frees | 534 | * permit the bootmem allocator to evade page validation on high-order frees |
534 | */ | 535 | */ |
535 | void __free_pages_bootmem(struct page *page, unsigned int order) | 536 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) |
536 | { | 537 | { |
537 | if (order == 0) { | 538 | if (order == 0) { |
538 | __ClearPageReserved(page); | 539 | __ClearPageReserved(page); |
@@ -596,7 +597,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
596 | { | 597 | { |
597 | if (unlikely(page_mapcount(page) | | 598 | if (unlikely(page_mapcount(page) | |
598 | (page->mapping != NULL) | | 599 | (page->mapping != NULL) | |
599 | (page_get_page_cgroup(page) != NULL) | | ||
600 | (page_count(page) != 0) | | 600 | (page_count(page) != 0) | |
601 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) | 601 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) |
602 | bad_page(page); | 602 | bad_page(page); |
@@ -610,7 +610,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
610 | 610 | ||
611 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | | 611 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | |
612 | 1 << PG_referenced | 1 << PG_arch_1 | | 612 | 1 << PG_referenced | 1 << PG_arch_1 | |
613 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); | 613 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk |
614 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
615 | | 1 << PG_mlocked | ||
616 | #endif | ||
617 | ); | ||
614 | set_page_private(page, 0); | 618 | set_page_private(page, 0); |
615 | set_page_refcounted(page); | 619 | set_page_refcounted(page); |
616 | 620 | ||
@@ -673,9 +677,9 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { | |||
673 | * Note that start_page and end_pages are not aligned on a pageblock | 677 | * Note that start_page and end_pages are not aligned on a pageblock |
674 | * boundary. If alignment is required, use move_freepages_block() | 678 | * boundary. If alignment is required, use move_freepages_block() |
675 | */ | 679 | */ |
676 | int move_freepages(struct zone *zone, | 680 | static int move_freepages(struct zone *zone, |
677 | struct page *start_page, struct page *end_page, | 681 | struct page *start_page, struct page *end_page, |
678 | int migratetype) | 682 | int migratetype) |
679 | { | 683 | { |
680 | struct page *page; | 684 | struct page *page; |
681 | unsigned long order; | 685 | unsigned long order; |
@@ -693,6 +697,9 @@ int move_freepages(struct zone *zone, | |||
693 | #endif | 697 | #endif |
694 | 698 | ||
695 | for (page = start_page; page <= end_page;) { | 699 | for (page = start_page; page <= end_page;) { |
700 | /* Make sure we are not inadvertently changing nodes */ | ||
701 | VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); | ||
702 | |||
696 | if (!pfn_valid_within(page_to_pfn(page))) { | 703 | if (!pfn_valid_within(page_to_pfn(page))) { |
697 | page++; | 704 | page++; |
698 | continue; | 705 | continue; |
@@ -714,7 +721,8 @@ int move_freepages(struct zone *zone, | |||
714 | return pages_moved; | 721 | return pages_moved; |
715 | } | 722 | } |
716 | 723 | ||
717 | int move_freepages_block(struct zone *zone, struct page *page, int migratetype) | 724 | static int move_freepages_block(struct zone *zone, struct page *page, |
725 | int migratetype) | ||
718 | { | 726 | { |
719 | unsigned long start_pfn, end_pfn; | 727 | unsigned long start_pfn, end_pfn; |
720 | struct page *start_page, *end_page; | 728 | struct page *start_page, *end_page; |
@@ -1429,7 +1437,7 @@ try_next_zone: | |||
1429 | /* | 1437 | /* |
1430 | * This is the 'heart' of the zoned buddy allocator. | 1438 | * This is the 'heart' of the zoned buddy allocator. |
1431 | */ | 1439 | */ |
1432 | static struct page * | 1440 | struct page * |
1433 | __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, | 1441 | __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, |
1434 | struct zonelist *zonelist, nodemask_t *nodemask) | 1442 | struct zonelist *zonelist, nodemask_t *nodemask) |
1435 | { | 1443 | { |
@@ -1632,22 +1640,7 @@ nopage: | |||
1632 | got_pg: | 1640 | got_pg: |
1633 | return page; | 1641 | return page; |
1634 | } | 1642 | } |
1635 | 1643 | EXPORT_SYMBOL(__alloc_pages_internal); | |
1636 | struct page * | ||
1637 | __alloc_pages(gfp_t gfp_mask, unsigned int order, | ||
1638 | struct zonelist *zonelist) | ||
1639 | { | ||
1640 | return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); | ||
1641 | } | ||
1642 | |||
1643 | struct page * | ||
1644 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | ||
1645 | struct zonelist *zonelist, nodemask_t *nodemask) | ||
1646 | { | ||
1647 | return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); | ||
1648 | } | ||
1649 | |||
1650 | EXPORT_SYMBOL(__alloc_pages); | ||
1651 | 1644 | ||
1652 | /* | 1645 | /* |
1653 | * Common helper functions. | 1646 | * Common helper functions. |
@@ -1711,6 +1704,59 @@ void free_pages(unsigned long addr, unsigned int order) | |||
1711 | 1704 | ||
1712 | EXPORT_SYMBOL(free_pages); | 1705 | EXPORT_SYMBOL(free_pages); |
1713 | 1706 | ||
1707 | /** | ||
1708 | * alloc_pages_exact - allocate an exact number of physically-contiguous pages. | ||
1709 | * @size: the number of bytes to allocate | ||
1710 | * @gfp_mask: GFP flags for the allocation | ||
1711 | * | ||
1712 | * This function is similar to alloc_pages(), except that it allocates the | ||
1713 | * minimum number of pages to satisfy the request. alloc_pages() can only | ||
1714 | * allocate memory in power-of-two pages. | ||
1715 | * | ||
1716 | * This function is also limited by MAX_ORDER. | ||
1717 | * | ||
1718 | * Memory allocated by this function must be released by free_pages_exact(). | ||
1719 | */ | ||
1720 | void *alloc_pages_exact(size_t size, gfp_t gfp_mask) | ||
1721 | { | ||
1722 | unsigned int order = get_order(size); | ||
1723 | unsigned long addr; | ||
1724 | |||
1725 | addr = __get_free_pages(gfp_mask, order); | ||
1726 | if (addr) { | ||
1727 | unsigned long alloc_end = addr + (PAGE_SIZE << order); | ||
1728 | unsigned long used = addr + PAGE_ALIGN(size); | ||
1729 | |||
1730 | split_page(virt_to_page(addr), order); | ||
1731 | while (used < alloc_end) { | ||
1732 | free_page(used); | ||
1733 | used += PAGE_SIZE; | ||
1734 | } | ||
1735 | } | ||
1736 | |||
1737 | return (void *)addr; | ||
1738 | } | ||
1739 | EXPORT_SYMBOL(alloc_pages_exact); | ||
1740 | |||
1741 | /** | ||
1742 | * free_pages_exact - release memory allocated via alloc_pages_exact() | ||
1743 | * @virt: the value returned by alloc_pages_exact. | ||
1744 | * @size: size of allocation, same value as passed to alloc_pages_exact(). | ||
1745 | * | ||
1746 | * Release the memory allocated by a previous call to alloc_pages_exact. | ||
1747 | */ | ||
1748 | void free_pages_exact(void *virt, size_t size) | ||
1749 | { | ||
1750 | unsigned long addr = (unsigned long)virt; | ||
1751 | unsigned long end = addr + PAGE_ALIGN(size); | ||
1752 | |||
1753 | while (addr < end) { | ||
1754 | free_page(addr); | ||
1755 | addr += PAGE_SIZE; | ||
1756 | } | ||
1757 | } | ||
1758 | EXPORT_SYMBOL(free_pages_exact); | ||
1759 | |||
1714 | static unsigned int nr_free_zone_pages(int offset) | 1760 | static unsigned int nr_free_zone_pages(int offset) |
1715 | { | 1761 | { |
1716 | struct zoneref *z; | 1762 | struct zoneref *z; |
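Usage is symmetrical with the normal page allocator but without the power-of-two rounding; for instance a caller that needs exactly five physically contiguous pages (illustrative snippet, not from the patch):

    #include <linux/gfp.h>
    #include <linux/mm.h>

    static void *sketch_alloc_five_pages(void)
    {
            /*
             * __get_free_pages() would round the request up to an order-3 block
             * (8 pages); alloc_pages_exact() splits that block and immediately
             * frees the unused tail, so only 5 pages stay allocated.
             */
            return alloc_pages_exact(5 * PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
    }

    static void sketch_free_five_pages(void *buf)
    {
            free_pages_exact(buf, 5 * PAGE_SIZE);   /* must match the allocation size */
    }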
@@ -1816,10 +1862,21 @@ void show_free_areas(void) | |||
1816 | } | 1862 | } |
1817 | } | 1863 | } |
1818 | 1864 | ||
1819 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" | 1865 | printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" |
1866 | " inactive_file:%lu" | ||
1867 | //TODO: check/adjust line lengths | ||
1868 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1869 | " unevictable:%lu" | ||
1870 | #endif | ||
1871 | " dirty:%lu writeback:%lu unstable:%lu\n" | ||
1820 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", | 1872 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", |
1821 | global_page_state(NR_ACTIVE), | 1873 | global_page_state(NR_ACTIVE_ANON), |
1822 | global_page_state(NR_INACTIVE), | 1874 | global_page_state(NR_ACTIVE_FILE), |
1875 | global_page_state(NR_INACTIVE_ANON), | ||
1876 | global_page_state(NR_INACTIVE_FILE), | ||
1877 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1878 | global_page_state(NR_UNEVICTABLE), | ||
1879 | #endif | ||
1823 | global_page_state(NR_FILE_DIRTY), | 1880 | global_page_state(NR_FILE_DIRTY), |
1824 | global_page_state(NR_WRITEBACK), | 1881 | global_page_state(NR_WRITEBACK), |
1825 | global_page_state(NR_UNSTABLE_NFS), | 1882 | global_page_state(NR_UNSTABLE_NFS), |
@@ -1842,8 +1899,13 @@ void show_free_areas(void) | |||
1842 | " min:%lukB" | 1899 | " min:%lukB" |
1843 | " low:%lukB" | 1900 | " low:%lukB" |
1844 | " high:%lukB" | 1901 | " high:%lukB" |
1845 | " active:%lukB" | 1902 | " active_anon:%lukB" |
1846 | " inactive:%lukB" | 1903 | " inactive_anon:%lukB" |
1904 | " active_file:%lukB" | ||
1905 | " inactive_file:%lukB" | ||
1906 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1907 | " unevictable:%lukB" | ||
1908 | #endif | ||
1847 | " present:%lukB" | 1909 | " present:%lukB" |
1848 | " pages_scanned:%lu" | 1910 | " pages_scanned:%lu" |
1849 | " all_unreclaimable? %s" | 1911 | " all_unreclaimable? %s" |
@@ -1853,8 +1915,13 @@ void show_free_areas(void) | |||
1853 | K(zone->pages_min), | 1915 | K(zone->pages_min), |
1854 | K(zone->pages_low), | 1916 | K(zone->pages_low), |
1855 | K(zone->pages_high), | 1917 | K(zone->pages_high), |
1856 | K(zone_page_state(zone, NR_ACTIVE)), | 1918 | K(zone_page_state(zone, NR_ACTIVE_ANON)), |
1857 | K(zone_page_state(zone, NR_INACTIVE)), | 1919 | K(zone_page_state(zone, NR_INACTIVE_ANON)), |
1920 | K(zone_page_state(zone, NR_ACTIVE_FILE)), | ||
1921 | K(zone_page_state(zone, NR_INACTIVE_FILE)), | ||
1922 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1923 | K(zone_page_state(zone, NR_UNEVICTABLE)), | ||
1924 | #endif | ||
1858 | K(zone->present_pages), | 1925 | K(zone->present_pages), |
1859 | zone->pages_scanned, | 1926 | zone->pages_scanned, |
1860 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 1927 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") |
@@ -2332,7 +2399,7 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
2332 | 2399 | ||
2333 | #endif /* CONFIG_NUMA */ | 2400 | #endif /* CONFIG_NUMA */ |
2334 | 2401 | ||
2335 | /* return values int ....just for stop_machine_run() */ | 2402 | /* return values int ....just for stop_machine() */ |
2336 | static int __build_all_zonelists(void *dummy) | 2403 | static int __build_all_zonelists(void *dummy) |
2337 | { | 2404 | { |
2338 | int nid; | 2405 | int nid; |
@@ -2352,11 +2419,12 @@ void build_all_zonelists(void) | |||
2352 | 2419 | ||
2353 | if (system_state == SYSTEM_BOOTING) { | 2420 | if (system_state == SYSTEM_BOOTING) { |
2354 | __build_all_zonelists(NULL); | 2421 | __build_all_zonelists(NULL); |
2422 | mminit_verify_zonelist(); | ||
2355 | cpuset_init_current_mems_allowed(); | 2423 | cpuset_init_current_mems_allowed(); |
2356 | } else { | 2424 | } else { |
2357 | /* we have to stop all cpus to guarantee there is no user | 2425 | /* we have to stop all cpus to guarantee there is no user |
2358 | of zonelist */ | 2426 | of zonelist */ |
2359 | stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); | 2427 | stop_machine(__build_all_zonelists, NULL, NULL); |
2360 | /* cpuset refresh routine should be here */ | 2428 | /* cpuset refresh routine should be here */ |
2361 | } | 2429 | } |
2362 | vm_total_pages = nr_free_pagecache_pages(); | 2430 | vm_total_pages = nr_free_pagecache_pages(); |
@@ -2475,6 +2543,10 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
2475 | continue; | 2543 | continue; |
2476 | page = pfn_to_page(pfn); | 2544 | page = pfn_to_page(pfn); |
2477 | 2545 | ||
2546 | /* Watch out for overlapping nodes */ | ||
2547 | if (page_to_nid(page) != zone_to_nid(zone)) | ||
2548 | continue; | ||
2549 | |||
2478 | /* Blocks with reserved pages will never free, skip them. */ | 2550 | /* Blocks with reserved pages will never free, skip them. */ |
2479 | if (PageReserved(page)) | 2551 | if (PageReserved(page)) |
2480 | continue; | 2552 | continue; |
@@ -2534,6 +2606,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
2534 | } | 2606 | } |
2535 | page = pfn_to_page(pfn); | 2607 | page = pfn_to_page(pfn); |
2536 | set_page_links(page, zone, nid, pfn); | 2608 | set_page_links(page, zone, nid, pfn); |
2609 | mminit_verify_page_links(page, zone, nid, pfn); | ||
2537 | init_page_count(page); | 2610 | init_page_count(page); |
2538 | reset_page_mapcount(page); | 2611 | reset_page_mapcount(page); |
2539 | SetPageReserved(page); | 2612 | SetPageReserved(page); |
@@ -2611,7 +2684,7 @@ static int zone_batchsize(struct zone *zone) | |||
2611 | return batch; | 2684 | return batch; |
2612 | } | 2685 | } |
2613 | 2686 | ||
2614 | inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | 2687 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) |
2615 | { | 2688 | { |
2616 | struct per_cpu_pages *pcp; | 2689 | struct per_cpu_pages *pcp; |
2617 | 2690 | ||
@@ -2836,6 +2909,12 @@ __meminit int init_currently_empty_zone(struct zone *zone, | |||
2836 | 2909 | ||
2837 | zone->zone_start_pfn = zone_start_pfn; | 2910 | zone->zone_start_pfn = zone_start_pfn; |
2838 | 2911 | ||
2912 | mminit_dprintk(MMINIT_TRACE, "memmap_init", | ||
2913 | "Initialising map node %d zone %lu pfns %lu -> %lu\n", | ||
2914 | pgdat->node_id, | ||
2915 | (unsigned long)zone_idx(zone), | ||
2916 | zone_start_pfn, (zone_start_pfn + size)); | ||
2917 | |||
2839 | zone_init_free_lists(zone); | 2918 | zone_init_free_lists(zone); |
2840 | 2919 | ||
2841 | return 0; | 2920 | return 0; |
@@ -2975,7 +3054,8 @@ void __init sparse_memory_present_with_active_regions(int nid) | |||
2975 | void __init push_node_boundaries(unsigned int nid, | 3054 | void __init push_node_boundaries(unsigned int nid, |
2976 | unsigned long start_pfn, unsigned long end_pfn) | 3055 | unsigned long start_pfn, unsigned long end_pfn) |
2977 | { | 3056 | { |
2978 | printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", | 3057 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", |
3058 | "Entering push_node_boundaries(%u, %lu, %lu)\n", | ||
2979 | nid, start_pfn, end_pfn); | 3059 | nid, start_pfn, end_pfn); |
2980 | 3060 | ||
2981 | /* Initialise the boundary for this node if necessary */ | 3061 | /* Initialise the boundary for this node if necessary */ |
@@ -2993,7 +3073,8 @@ void __init push_node_boundaries(unsigned int nid, | |||
2993 | static void __meminit account_node_boundary(unsigned int nid, | 3073 | static void __meminit account_node_boundary(unsigned int nid, |
2994 | unsigned long *start_pfn, unsigned long *end_pfn) | 3074 | unsigned long *start_pfn, unsigned long *end_pfn) |
2995 | { | 3075 | { |
2996 | printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", | 3076 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", |
3077 | "Entering account_node_boundary(%u, %lu, %lu)\n", | ||
2997 | nid, *start_pfn, *end_pfn); | 3078 | nid, *start_pfn, *end_pfn); |
2998 | 3079 | ||
2999 | /* Return if boundary information has not been provided */ | 3080 | /* Return if boundary information has not been provided */ |
@@ -3050,7 +3131,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, | |||
3050 | * assumption is made that zones within a node are ordered in monotonic | 3131 | * assumption is made that zones within a node are ordered in monotonic |
3051 | * increasing memory addresses so that the "highest" populated zone is used | 3132 | * increasing memory addresses so that the "highest" populated zone is used |
3052 | */ | 3133 | */ |
3053 | void __init find_usable_zone_for_movable(void) | 3134 | static void __init find_usable_zone_for_movable(void) |
3054 | { | 3135 | { |
3055 | int zone_index; | 3136 | int zone_index; |
3056 | for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { | 3137 | for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { |
@@ -3076,7 +3157,7 @@ void __init find_usable_zone_for_movable(void) | |||
3076 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that | 3157 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that |
3077 | * zones within a node are in order of monotonic increases memory addresses | 3158 | * zones within a node are in order of monotonic increases memory addresses |
3078 | */ | 3159 | */ |
3079 | void __meminit adjust_zone_range_for_zone_movable(int nid, | 3160 | static void __meminit adjust_zone_range_for_zone_movable(int nid, |
3080 | unsigned long zone_type, | 3161 | unsigned long zone_type, |
3081 | unsigned long node_start_pfn, | 3162 | unsigned long node_start_pfn, |
3082 | unsigned long node_end_pfn, | 3163 | unsigned long node_end_pfn, |
@@ -3137,7 +3218,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
3137 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 3218 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
3138 | * then all holes in the requested range will be accounted for. | 3219 | * then all holes in the requested range will be accounted for. |
3139 | */ | 3220 | */ |
3140 | unsigned long __meminit __absent_pages_in_range(int nid, | 3221 | static unsigned long __meminit __absent_pages_in_range(int nid, |
3141 | unsigned long range_start_pfn, | 3222 | unsigned long range_start_pfn, |
3142 | unsigned long range_end_pfn) | 3223 | unsigned long range_end_pfn) |
3143 | { | 3224 | { |
@@ -3350,10 +3431,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
3350 | pgdat->nr_zones = 0; | 3431 | pgdat->nr_zones = 0; |
3351 | init_waitqueue_head(&pgdat->kswapd_wait); | 3432 | init_waitqueue_head(&pgdat->kswapd_wait); |
3352 | pgdat->kswapd_max_order = 0; | 3433 | pgdat->kswapd_max_order = 0; |
3434 | pgdat_page_cgroup_init(pgdat); | ||
3353 | 3435 | ||
3354 | for (j = 0; j < MAX_NR_ZONES; j++) { | 3436 | for (j = 0; j < MAX_NR_ZONES; j++) { |
3355 | struct zone *zone = pgdat->node_zones + j; | 3437 | struct zone *zone = pgdat->node_zones + j; |
3356 | unsigned long size, realsize, memmap_pages; | 3438 | unsigned long size, realsize, memmap_pages; |
3439 | enum lru_list l; | ||
3357 | 3440 | ||
3358 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 3441 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
3359 | realsize = size - zone_absent_pages_in_node(nid, j, | 3442 | realsize = size - zone_absent_pages_in_node(nid, j, |
@@ -3404,10 +3487,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
3404 | zone->prev_priority = DEF_PRIORITY; | 3487 | zone->prev_priority = DEF_PRIORITY; |
3405 | 3488 | ||
3406 | zone_pcp_init(zone); | 3489 | zone_pcp_init(zone); |
3407 | INIT_LIST_HEAD(&zone->active_list); | 3490 | for_each_lru(l) { |
3408 | INIT_LIST_HEAD(&zone->inactive_list); | 3491 | INIT_LIST_HEAD(&zone->lru[l].list); |
3409 | zone->nr_scan_active = 0; | 3492 | zone->lru[l].nr_scan = 0; |
3410 | zone->nr_scan_inactive = 0; | 3493 | } |
3494 | zone->recent_rotated[0] = 0; | ||
3495 | zone->recent_rotated[1] = 0; | ||
3496 | zone->recent_scanned[0] = 0; | ||
3497 | zone->recent_scanned[1] = 0; | ||
3411 | zap_zone_vm_stats(zone); | 3498 | zap_zone_vm_stats(zone); |
3412 | zone->flags = 0; | 3499 | zone->flags = 0; |
3413 | if (!size) | 3500 | if (!size) |
@@ -3464,10 +3551,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
3464 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 3551 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
3465 | } | 3552 | } |
3466 | 3553 | ||
3467 | void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, | 3554 | void __paginginit free_area_init_node(int nid, unsigned long *zones_size, |
3468 | unsigned long *zones_size, unsigned long node_start_pfn, | 3555 | unsigned long node_start_pfn, unsigned long *zholes_size) |
3469 | unsigned long *zholes_size) | ||
3470 | { | 3556 | { |
3557 | pg_data_t *pgdat = NODE_DATA(nid); | ||
3558 | |||
3471 | pgdat->node_id = nid; | 3559 | pgdat->node_id = nid; |
3472 | pgdat->node_start_pfn = node_start_pfn; | 3560 | pgdat->node_start_pfn = node_start_pfn; |
3473 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 3561 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
@@ -3520,10 +3608,13 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, | |||
3520 | { | 3608 | { |
3521 | int i; | 3609 | int i; |
3522 | 3610 | ||
3523 | printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " | 3611 | mminit_dprintk(MMINIT_TRACE, "memory_register", |
3524 | "%d entries of %d used\n", | 3612 | "Entering add_active_range(%d, %#lx, %#lx) " |
3525 | nid, start_pfn, end_pfn, | 3613 | "%d entries of %d used\n", |
3526 | nr_nodemap_entries, MAX_ACTIVE_REGIONS); | 3614 | nid, start_pfn, end_pfn, |
3615 | nr_nodemap_entries, MAX_ACTIVE_REGIONS); | ||
3616 | |||
3617 | mminit_validate_memmodel_limits(&start_pfn, &end_pfn); | ||
3527 | 3618 | ||
3528 | /* Merge with existing active regions if possible */ | 3619 | /* Merge with existing active regions if possible */ |
3529 | for (i = 0; i < nr_nodemap_entries; i++) { | 3620 | for (i = 0; i < nr_nodemap_entries; i++) { |
@@ -3669,7 +3760,7 @@ static void __init sort_node_map(void) | |||
3669 | } | 3760 | } |
3670 | 3761 | ||
3671 | /* Find the lowest pfn for a node */ | 3762 | /* Find the lowest pfn for a node */ |
3672 | unsigned long __init find_min_pfn_for_node(int nid) | 3763 | static unsigned long __init find_min_pfn_for_node(int nid) |
3673 | { | 3764 | { |
3674 | int i; | 3765 | int i; |
3675 | unsigned long min_pfn = ULONG_MAX; | 3766 | unsigned long min_pfn = ULONG_MAX; |
@@ -3698,23 +3789,6 @@ unsigned long __init find_min_pfn_with_active_regions(void) | |||
3698 | return find_min_pfn_for_node(MAX_NUMNODES); | 3789 | return find_min_pfn_for_node(MAX_NUMNODES); |
3699 | } | 3790 | } |
3700 | 3791 | ||
3701 | /** | ||
3702 | * find_max_pfn_with_active_regions - Find the maximum PFN registered | ||
3703 | * | ||
3704 | * It returns the maximum PFN based on information provided via | ||
3705 | * add_active_range(). | ||
3706 | */ | ||
3707 | unsigned long __init find_max_pfn_with_active_regions(void) | ||
3708 | { | ||
3709 | int i; | ||
3710 | unsigned long max_pfn = 0; | ||
3711 | |||
3712 | for (i = 0; i < nr_nodemap_entries; i++) | ||
3713 | max_pfn = max(max_pfn, early_node_map[i].end_pfn); | ||
3714 | |||
3715 | return max_pfn; | ||
3716 | } | ||
3717 | |||
3718 | /* | 3792 | /* |
3719 | * early_calculate_totalpages() | 3793 | * early_calculate_totalpages() |
3720 | * Sum pages in active regions for movable zone. | 3794 | * Sum pages in active regions for movable zone. |
@@ -3741,7 +3815,7 @@ static unsigned long __init early_calculate_totalpages(void) | |||
3741 | * memory. When they don't, some nodes will have more kernelcore than | 3815 | * memory. When they don't, some nodes will have more kernelcore than |
3742 | * others | 3816 | * others |
3743 | */ | 3817 | */ |
3744 | void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | 3818 | static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) |
3745 | { | 3819 | { |
3746 | int i, nid; | 3820 | int i, nid; |
3747 | unsigned long usable_startpfn; | 3821 | unsigned long usable_startpfn; |
@@ -3904,7 +3978,7 @@ static void check_for_regular_memory(pg_data_t *pgdat) | |||
3904 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) | 3978 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) |
3905 | { | 3979 | { |
3906 | unsigned long nid; | 3980 | unsigned long nid; |
3907 | enum zone_type i; | 3981 | int i; |
3908 | 3982 | ||
3909 | /* Sort early_node_map as initialisation assumes it is sorted */ | 3983 | /* Sort early_node_map as initialisation assumes it is sorted */ |
3910 | sort_node_map(); | 3984 | sort_node_map(); |
@@ -3957,10 +4031,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
3957 | early_node_map[i].end_pfn); | 4031 | early_node_map[i].end_pfn); |
3958 | 4032 | ||
3959 | /* Initialise every node */ | 4033 | /* Initialise every node */ |
4034 | mminit_verify_pageflags_layout(); | ||
3960 | setup_nr_node_ids(); | 4035 | setup_nr_node_ids(); |
3961 | for_each_online_node(nid) { | 4036 | for_each_online_node(nid) { |
3962 | pg_data_t *pgdat = NODE_DATA(nid); | 4037 | pg_data_t *pgdat = NODE_DATA(nid); |
3963 | free_area_init_node(nid, pgdat, NULL, | 4038 | free_area_init_node(nid, NULL, |
3964 | find_min_pfn_for_node(nid), NULL); | 4039 | find_min_pfn_for_node(nid), NULL); |
3965 | 4040 | ||
3966 | /* Any memory on that node */ | 4041 | /* Any memory on that node */ |
@@ -4025,15 +4100,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) | |||
4025 | } | 4100 | } |
4026 | 4101 | ||
4027 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4102 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
4028 | static bootmem_data_t contig_bootmem_data; | 4103 | struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; |
4029 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; | ||
4030 | |||
4031 | EXPORT_SYMBOL(contig_page_data); | 4104 | EXPORT_SYMBOL(contig_page_data); |
4032 | #endif | 4105 | #endif |
4033 | 4106 | ||
4034 | void __init free_area_init(unsigned long *zones_size) | 4107 | void __init free_area_init(unsigned long *zones_size) |
4035 | { | 4108 | { |
4036 | free_area_init_node(0, NODE_DATA(0), zones_size, | 4109 | free_area_init_node(0, zones_size, |
4037 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 4110 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
4038 | } | 4111 | } |
4039 | 4112 | ||
@@ -4163,7 +4236,7 @@ void setup_per_zone_pages_min(void) | |||
4163 | for_each_zone(zone) { | 4236 | for_each_zone(zone) { |
4164 | u64 tmp; | 4237 | u64 tmp; |
4165 | 4238 | ||
4166 | spin_lock_irqsave(&zone->lru_lock, flags); | 4239 | spin_lock_irqsave(&zone->lock, flags); |
4167 | tmp = (u64)pages_min * zone->present_pages; | 4240 | tmp = (u64)pages_min * zone->present_pages; |
4168 | do_div(tmp, lowmem_pages); | 4241 | do_div(tmp, lowmem_pages); |
4169 | if (is_highmem(zone)) { | 4242 | if (is_highmem(zone)) { |
@@ -4195,13 +4268,53 @@ void setup_per_zone_pages_min(void) | |||
4195 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4268 | zone->pages_low = zone->pages_min + (tmp >> 2); |
4196 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4269 | zone->pages_high = zone->pages_min + (tmp >> 1); |
4197 | setup_zone_migrate_reserve(zone); | 4270 | setup_zone_migrate_reserve(zone); |
4198 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 4271 | spin_unlock_irqrestore(&zone->lock, flags); |
4199 | } | 4272 | } |
4200 | 4273 | ||
4201 | /* update totalreserve_pages */ | 4274 | /* update totalreserve_pages */ |
4202 | calculate_totalreserve_pages(); | 4275 | calculate_totalreserve_pages(); |
4203 | } | 4276 | } |
4204 | 4277 | ||
4278 | /** | ||
4279 | * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. | ||
4280 | * | ||
4281 | * The inactive anon list should be small enough that the VM never has to | ||
4282 | * do too much work, but large enough that each inactive page has a chance | ||
4283 | * to be referenced again before it is swapped out. | ||
4284 | * | ||
4285 | * The inactive_anon ratio is the target ratio of ACTIVE_ANON to | ||
4286 | * INACTIVE_ANON pages on this zone's LRU, maintained by the | ||
4287 | * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of | ||
4288 | * the anonymous pages are kept on the inactive list. | ||
4289 | * | ||
4290 | * total target max | ||
4291 | * memory ratio inactive anon | ||
4292 | * ------------------------------------- | ||
4293 | * 10MB 1 5MB | ||
4294 | * 100MB 1 50MB | ||
4295 | * 1GB 3 250MB | ||
4296 | * 10GB 10 0.9GB | ||
4297 | * 100GB 31 3GB | ||
4298 | * 1TB 101 10GB | ||
4299 | * 10TB 320 32GB | ||
4300 | */ | ||
4301 | void setup_per_zone_inactive_ratio(void) | ||
4302 | { | ||
4303 | struct zone *zone; | ||
4304 | |||
4305 | for_each_zone(zone) { | ||
4306 | unsigned int gb, ratio; | ||
4307 | |||
4308 | /* Zone size in gigabytes */ | ||
4309 | gb = zone->present_pages >> (30 - PAGE_SHIFT); | ||
4310 | ratio = int_sqrt(10 * gb); | ||
4311 | if (!ratio) | ||
4312 | ratio = 1; | ||
4313 | |||
4314 | zone->inactive_ratio = ratio; | ||
4315 | } | ||
4316 | } | ||
4317 | |||
4205 | /* | 4318 | /* |
4206 | * Initialise min_free_kbytes. | 4319 | * Initialise min_free_kbytes. |
4207 | * | 4320 | * |
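The ratio column in the comment above is simply int_sqrt(10 * gb) clamped to at least 1, exactly as the added function computes; a few lines of standalone userspace C reproduce the table as a sanity check (illustration only, not kernel code):

    #include <stdio.h>

    /* integer square root rounding down, mirroring the kernel's int_sqrt() */
    static unsigned long isqrt(unsigned long x)
    {
            unsigned long r = 0;

            while ((r + 1) * (r + 1) <= x)
                    r++;
            return r;
    }

    int main(void)
    {
            unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };
            unsigned int i;

            for (i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
                    unsigned long ratio = isqrt(10 * sizes_gb[i]);

                    if (!ratio)
                            ratio = 1;
                    printf("%7lu GB -> inactive_ratio %lu\n", sizes_gb[i], ratio);
            }
            return 0;
    }

This prints 3, 10, 31, 101 and 320 for the 1GB through 10TB rows; the 10MB and 100MB rows come from the clamp, since their zone size rounds down to 0 GB and the ratio therefore floors at 1, keeping roughly half of a small zone's anon pages inactive.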
@@ -4239,6 +4352,7 @@ static int __init init_per_zone_pages_min(void) | |||
4239 | min_free_kbytes = 65536; | 4352 | min_free_kbytes = 65536; |
4240 | setup_per_zone_pages_min(); | 4353 | setup_per_zone_pages_min(); |
4241 | setup_per_zone_lowmem_reserve(); | 4354 | setup_per_zone_lowmem_reserve(); |
4355 | setup_per_zone_inactive_ratio(); | ||
4242 | return 0; | 4356 | return 0; |
4243 | } | 4357 | } |
4244 | module_init(init_per_zone_pages_min) | 4358 | module_init(init_per_zone_pages_min) |
@@ -4400,7 +4514,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4400 | do { | 4514 | do { |
4401 | size = bucketsize << log2qty; | 4515 | size = bucketsize << log2qty; |
4402 | if (flags & HASH_EARLY) | 4516 | if (flags & HASH_EARLY) |
4403 | table = alloc_bootmem(size); | 4517 | table = alloc_bootmem_nopanic(size); |
4404 | else if (hashdist) | 4518 | else if (hashdist) |
4405 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 4519 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
4406 | else { | 4520 | else { |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c new file mode 100644 index 000000000000..f59d797dc5a9 --- /dev/null +++ b/mm/page_cgroup.c | |||
@@ -0,0 +1,256 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/mmzone.h> | ||
3 | #include <linux/bootmem.h> | ||
4 | #include <linux/bit_spinlock.h> | ||
5 | #include <linux/page_cgroup.h> | ||
6 | #include <linux/hash.h> | ||
7 | #include <linux/slab.h> | ||
8 | #include <linux/memory.h> | ||
9 | #include <linux/vmalloc.h> | ||
10 | #include <linux/cgroup.h> | ||
11 | |||
12 | static void __meminit | ||
13 | __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) | ||
14 | { | ||
15 | pc->flags = 0; | ||
16 | pc->mem_cgroup = NULL; | ||
17 | pc->page = pfn_to_page(pfn); | ||
18 | } | ||
19 | static unsigned long total_usage; | ||
20 | |||
21 | #if !defined(CONFIG_SPARSEMEM) | ||
22 | |||
23 | |||
24 | void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) | ||
25 | { | ||
26 | pgdat->node_page_cgroup = NULL; | ||
27 | } | ||
28 | |||
29 | struct page_cgroup *lookup_page_cgroup(struct page *page) | ||
30 | { | ||
31 | unsigned long pfn = page_to_pfn(page); | ||
32 | unsigned long offset; | ||
33 | struct page_cgroup *base; | ||
34 | |||
35 | base = NODE_DATA(page_to_nid(page))->node_page_cgroup; | ||
36 | if (unlikely(!base)) | ||
37 | return NULL; | ||
38 | |||
39 | offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; | ||
40 | return base + offset; | ||
41 | } | ||
42 | |||
43 | static int __init alloc_node_page_cgroup(int nid) | ||
44 | { | ||
45 | struct page_cgroup *base, *pc; | ||
46 | unsigned long table_size; | ||
47 | unsigned long start_pfn, nr_pages, index; | ||
48 | |||
49 | start_pfn = NODE_DATA(nid)->node_start_pfn; | ||
50 | nr_pages = NODE_DATA(nid)->node_spanned_pages; | ||
51 | |||
52 | table_size = sizeof(struct page_cgroup) * nr_pages; | ||
53 | |||
54 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), | ||
55 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
56 | if (!base) | ||
57 | return -ENOMEM; | ||
58 | for (index = 0; index < nr_pages; index++) { | ||
59 | pc = base + index; | ||
60 | __init_page_cgroup(pc, start_pfn + index); | ||
61 | } | ||
62 | NODE_DATA(nid)->node_page_cgroup = base; | ||
63 | total_usage += table_size; | ||
64 | return 0; | ||
65 | } | ||
66 | |||
67 | void __init page_cgroup_init(void) | ||
68 | { | ||
69 | |||
70 | int nid, fail; | ||
71 | |||
72 | if (mem_cgroup_subsys.disabled) | ||
73 | return; | ||
74 | |||
75 | for_each_online_node(nid) { | ||
76 | fail = alloc_node_page_cgroup(nid); | ||
77 | if (fail) | ||
78 | goto fail; | ||
79 | } | ||
80 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | ||
81 | printk(KERN_INFO "please try cgroup_disable=memory option if you" | ||
82 | " don't want memory cgroups\n"); | ||
83 | return; | ||
84 | fail: | ||
85 | printk(KERN_CRIT "allocation of page_cgroup failed.\n"); | ||
86 | printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); | ||
87 | panic("Out of memory"); | ||
88 | } | ||
89 | |||
90 | #else /* CONFIG_FLAT_NODE_MEM_MAP */ | ||
91 | |||
92 | struct page_cgroup *lookup_page_cgroup(struct page *page) | ||
93 | { | ||
94 | unsigned long pfn = page_to_pfn(page); | ||
95 | struct mem_section *section = __pfn_to_section(pfn); | ||
96 | |||
97 | return section->page_cgroup + pfn; | ||
98 | } | ||
99 | |||
100 | int __meminit init_section_page_cgroup(unsigned long pfn) | ||
101 | { | ||
102 | struct mem_section *section; | ||
103 | struct page_cgroup *base, *pc; | ||
104 | unsigned long table_size; | ||
105 | int nid, index; | ||
106 | |||
107 | section = __pfn_to_section(pfn); | ||
108 | |||
109 | if (section->page_cgroup) | ||
110 | return 0; | ||
111 | |||
112 | nid = page_to_nid(pfn_to_page(pfn)); | ||
113 | |||
114 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | ||
115 | if (slab_is_available()) { | ||
116 | base = kmalloc_node(table_size, GFP_KERNEL, nid); | ||
117 | if (!base) | ||
118 | base = vmalloc_node(table_size, nid); | ||
119 | } else { | ||
120 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size, | ||
121 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
122 | } | ||
123 | |||
124 | if (!base) { | ||
125 | printk(KERN_ERR "page cgroup allocation failure\n"); | ||
126 | return -ENOMEM; | ||
127 | } | ||
128 | |||
129 | for (index = 0; index < PAGES_PER_SECTION; index++) { | ||
130 | pc = base + index; | ||
131 | __init_page_cgroup(pc, pfn + index); | ||
132 | } | ||
133 | |||
134 | section = __pfn_to_section(pfn); | ||
135 | section->page_cgroup = base - pfn; | ||
136 | total_usage += table_size; | ||
137 | return 0; | ||
138 | } | ||
139 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
140 | void __free_page_cgroup(unsigned long pfn) | ||
141 | { | ||
142 | struct mem_section *ms; | ||
143 | struct page_cgroup *base; | ||
144 | |||
145 | ms = __pfn_to_section(pfn); | ||
146 | if (!ms || !ms->page_cgroup) | ||
147 | return; | ||
148 | base = ms->page_cgroup + pfn; | ||
149 | if (is_vmalloc_addr(base)) { | ||
150 | vfree(base); | ||
151 | ms->page_cgroup = NULL; | ||
152 | } else { | ||
153 | struct page *page = virt_to_page(base); | ||
154 | if (!PageReserved(page)) { /* Is bootmem ? */ | ||
155 | kfree(base); | ||
156 | ms->page_cgroup = NULL; | ||
157 | } | ||
158 | } | ||
159 | } | ||
160 | |||
161 | int online_page_cgroup(unsigned long start_pfn, | ||
162 | unsigned long nr_pages, | ||
163 | int nid) | ||
164 | { | ||
165 | unsigned long start, end, pfn; | ||
166 | int fail = 0; | ||
167 | |||
168 | start = start_pfn & (PAGES_PER_SECTION - 1); | ||
169 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); | ||
170 | |||
171 | for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { | ||
172 | if (!pfn_present(pfn)) | ||
173 | continue; | ||
174 | fail = init_section_page_cgroup(pfn); | ||
175 | } | ||
176 | if (!fail) | ||
177 | return 0; | ||
178 | |||
179 | /* rollback */ | ||
180 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
181 | __free_page_cgroup(pfn); | ||
182 | |||
183 | return -ENOMEM; | ||
184 | } | ||
185 | |||
186 | int offline_page_cgroup(unsigned long start_pfn, | ||
187 | unsigned long nr_pages, int nid) | ||
188 | { | ||
189 | unsigned long start, end, pfn; | ||
190 | |||
191 | start = start_pfn & (PAGES_PER_SECTION - 1); | ||
192 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); | ||
193 | |||
194 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
195 | __free_page_cgroup(pfn); | ||
196 | return 0; | ||
197 | |||
198 | } | ||
199 | |||
200 | static int page_cgroup_callback(struct notifier_block *self, | ||
201 | unsigned long action, void *arg) | ||
202 | { | ||
203 | struct memory_notify *mn = arg; | ||
204 | int ret = 0; | ||
205 | switch (action) { | ||
206 | case MEM_GOING_ONLINE: | ||
207 | ret = online_page_cgroup(mn->start_pfn, | ||
208 | mn->nr_pages, mn->status_change_nid); | ||
209 | break; | ||
210 | case MEM_CANCEL_ONLINE: | ||
211 | case MEM_OFFLINE: | ||
212 | offline_page_cgroup(mn->start_pfn, | ||
213 | mn->nr_pages, mn->status_change_nid); | ||
214 | break; | ||
215 | case MEM_GOING_OFFLINE: | ||
216 | break; | ||
217 | case MEM_ONLINE: | ||
218 | case MEM_CANCEL_OFFLINE: | ||
219 | break; | ||
220 | } | ||
221 | ret = notifier_from_errno(ret); | ||
222 | return ret; | ||
223 | } | ||
224 | |||
225 | #endif | ||
226 | |||
227 | void __init page_cgroup_init(void) | ||
228 | { | ||
229 | unsigned long pfn; | ||
230 | int fail = 0; | ||
231 | |||
232 | if (mem_cgroup_subsys.disabled) | ||
233 | return; | ||
234 | |||
235 | for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { | ||
236 | if (!pfn_present(pfn)) | ||
237 | continue; | ||
238 | fail = init_section_page_cgroup(pfn); | ||
239 | } | ||
240 | if (fail) { | ||
241 | printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); | ||
242 | panic("Out of memory"); | ||
243 | } else { | ||
244 | hotplug_memory_notifier(page_cgroup_callback, 0); | ||
245 | } | ||
246 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | ||
247 | printk(KERN_INFO "please try cgroup_disable=memory option if you don't" | ||
248 | " want memory cgroups\n"); | ||
249 | } | ||
250 | |||
251 | void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) | ||
252 | { | ||
253 | return; | ||
254 | } | ||
255 | |||
256 | #endif | ||
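Editorial note on mm/page_cgroup.c above: both lookup_page_cgroup() variants resolve a pfn with plain pointer arithmetic. The flat case adds (pfn - node_start_pfn) to the node's array; the SPARSEMEM case stores the array pointer pre-biased in init_section_page_cgroup() ("base - pfn"), so the lookup degenerates to section->page_cgroup + pfn. A toy user-space sketch of that biased-pointer trick, with invented struct names, relying on the same out-of-range pointer arithmetic the kernel itself uses:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for the kernel structures; illustration only. */
struct page_cgroup { unsigned long flags; };

struct toy_section {
	struct page_cgroup *page_cgroup;	/* stored biased by -start_pfn */
};

int main(void)
{
	unsigned long start_pfn = 0x10000;	/* first pfn covered by this "section" */
	unsigned long nr_pages  = 4096;

	struct page_cgroup *base = calloc(nr_pages, sizeof(*base));
	struct toy_section sec;

	if (!base)
		return 1;

	/* same trick as init_section_page_cgroup(): bias the stored pointer so
	 * the lookup needs no subtraction */
	sec.page_cgroup = base - start_pfn;

	unsigned long pfn = start_pfn + 42;
	struct page_cgroup *pc = sec.page_cgroup + pfn;	/* lookup_page_cgroup() */

	assert(pc == base + 42);
	printf("pfn %#lx -> slot %ld\n", pfn, (long)(pc - base));
	free(base);
	return 0;
}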
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 3444b58033c8..b70a7fec1ff6 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -2,7 +2,6 @@ | |||
2 | * linux/mm/page_isolation.c | 2 | * linux/mm/page_isolation.c |
3 | */ | 3 | */ |
4 | 4 | ||
5 | #include <stddef.h> | ||
6 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
7 | #include <linux/page-isolation.h> | 6 | #include <linux/page-isolation.h> |
8 | #include <linux/pageblock-flags.h> | 7 | #include <linux/pageblock-flags.h> |
@@ -115,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | |||
115 | 114 | ||
116 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | 115 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) |
117 | { | 116 | { |
118 | unsigned long pfn; | 117 | unsigned long pfn, flags; |
119 | struct page *page; | 118 | struct page *page; |
119 | struct zone *zone; | ||
120 | int ret; | ||
120 | 121 | ||
121 | pfn = start_pfn; | 122 | pfn = start_pfn; |
122 | /* | 123 | /* |
@@ -132,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
132 | if (pfn < end_pfn) | 133 | if (pfn < end_pfn) |
133 | return -EBUSY; | 134 | return -EBUSY; |
134 | /* Check all pages are free or Marked as ISOLATED */ | 135 | /* Check all pages are free or Marked as ISOLATED */ |
135 | if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) | 136 | zone = page_zone(pfn_to_page(pfn)); |
136 | return 0; | 137 | spin_lock_irqsave(&zone->lock, flags); |
137 | return -EBUSY; | 138 | ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); |
139 | spin_unlock_irqrestore(&zone->lock, flags); | ||
140 | return ret ? 0 : -EBUSY; | ||
138 | } | 141 | } |
diff --git a/mm/pdflush.c b/mm/pdflush.c index 9d834aa4b979..a0a14c4d5072 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Copyright (C) 2002, Linus Torvalds. | 4 | * Copyright (C) 2002, Linus Torvalds. |
5 | * | 5 | * |
6 | * 09Apr2002 akpm@zip.com.au | 6 | * 09Apr2002 Andrew Morton |
7 | * Initial version | 7 | * Initial version |
8 | * 29Feb2004 kaos@sgi.com | 8 | * 29Feb2004 kaos@sgi.com |
9 | * Move worker thread creation to kthread to avoid chewing | 9 | * Move worker thread creation to kthread to avoid chewing |
@@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work) | |||
130 | * Thread creation: For how long have there been zero | 130 | * Thread creation: For how long have there been zero |
131 | * available threads? | 131 | * available threads? |
132 | */ | 132 | */ |
133 | if (jiffies - last_empty_jifs > 1 * HZ) { | 133 | if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { |
134 | /* unlocked list_empty() test is OK here */ | 134 | /* unlocked list_empty() test is OK here */ |
135 | if (list_empty(&pdflush_list)) { | 135 | if (list_empty(&pdflush_list)) { |
136 | /* unlocked test is OK here */ | 136 | /* unlocked test is OK here */ |
@@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work) | |||
151 | if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) | 151 | if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) |
152 | continue; | 152 | continue; |
153 | pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); | 153 | pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); |
154 | if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { | 154 | if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { |
155 | /* Limit exit rate */ | 155 | /* Limit exit rate */ |
156 | pdf->when_i_went_to_sleep = jiffies; | 156 | pdf->when_i_went_to_sleep = jiffies; |
157 | break; /* exeunt */ | 157 | break; /* exeunt */ |
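Editorial note on the pdflush hunks above: they swap open-coded "jiffies - t > 1 * HZ" tests for time_after(jiffies, t + 1 * HZ). time_after() compares jiffies values via a signed subtraction, so it stays sane across counter wraparound and, unlike the raw unsigned test, does not fire spuriously when the stored timestamp happens to sit slightly ahead of the sampled jiffies. A user-space sketch of the idiom with 32-bit counters (the macro is a 32-bit rendition of the kernel helper; the numbers are arbitrary):

#include <stdio.h>
#include <stdint.h>

/* user-space rendition of the kernel's time_after(a, b) for 32-bit jiffies:
 * true if time a is after time b, using a signed (wrap-tolerant) comparison */
#define time_after(a, b)  ((int32_t)((b) - (a)) < 0)

int main(void)
{
	const uint32_t HZ = 1000;
	uint32_t now, stamp;

	/* just over 1 second elapsed across a counter wrap: both forms agree */
	stamp = UINT32_MAX - HZ / 2;
	now   = HZ / 2;
	printf("wrap, ~1s elapsed: naive=%d time_after=%d\n",
	       now - stamp > 1 * HZ, time_after(now, stamp + 1 * HZ));

	/* stamp refreshed just after 'now' was sampled: naive test fires spuriously */
	now   = 100000;
	stamp = now + 10;
	printf("stamp in future  : naive=%d time_after=%d\n",
	       now - stamp > 1 * HZ, time_after(now, stamp + 1 * HZ));
	return 0;
}

The first output line shows both forms agreeing on an interval that crosses the wrap; the second shows the raw subtraction reporting a bogus timeout while time_after() does not.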
diff --git a/mm/quicklist.c b/mm/quicklist.c index 3f703f7cb398..8dbb6805ef35 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c | |||
@@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; | |||
26 | static unsigned long max_pages(unsigned long min_pages) | 26 | static unsigned long max_pages(unsigned long min_pages) |
27 | { | 27 | { |
28 | unsigned long node_free_pages, max; | 28 | unsigned long node_free_pages, max; |
29 | struct zone *zones = NODE_DATA(numa_node_id())->node_zones; | 29 | int node = numa_node_id(); |
30 | struct zone *zones = NODE_DATA(node)->node_zones; | ||
31 | int num_cpus_on_node; | ||
32 | node_to_cpumask_ptr(cpumask_on_node, node); | ||
30 | 33 | ||
31 | node_free_pages = | 34 | node_free_pages = |
32 | #ifdef CONFIG_ZONE_DMA | 35 | #ifdef CONFIG_ZONE_DMA |
@@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages) | |||
38 | zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); | 41 | zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); |
39 | 42 | ||
40 | max = node_free_pages / FRACTION_OF_NODE_MEM; | 43 | max = node_free_pages / FRACTION_OF_NODE_MEM; |
44 | |||
45 | num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); | ||
46 | max /= num_cpus_on_node; | ||
47 | |||
41 | return max(max, min_pages); | 48 | return max(max, min_pages); |
42 | } | 49 | } |
43 | 50 | ||
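Editorial note on the quicklist hunk above: the per-node free-page allowance is now split across the CPUs of the node instead of being granted to each CPU in full, which over-cached badly on many-core nodes. A rough sketch of the arithmetic (the divisor keeps the FRACTION_OF_NODE_MEM name but its value and the sample numbers here are illustrative, not taken from a real system):

#include <stdio.h>

#define FRACTION_OF_NODE_MEM 16	/* illustrative value for this sketch */

int main(void)
{
	unsigned long node_free_pages = 1UL << 20;	/* ~4GB of free 4K pages */
	unsigned long min_pages = 25;			/* floor passed in by the caller */

	for (int cpus = 1; cpus <= 16; cpus <<= 1) {
		unsigned long cap = node_free_pages / FRACTION_OF_NODE_MEM / cpus;
		if (cap < min_pages)
			cap = min_pages;	/* max(max, min_pages) in the kernel */
		printf("%2d cpus on node -> per-cpu quicklist cap %lu pages\n",
		       cpus, cap);
	}
	return 0;
}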
diff --git a/mm/readahead.c b/mm/readahead.c index d8723a5f6496..bec83c15a78f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Copyright (C) 2002, Linus Torvalds | 4 | * Copyright (C) 2002, Linus Torvalds |
5 | * | 5 | * |
6 | * 09Apr2002 akpm@zip.com.au | 6 | * 09Apr2002 Andrew Morton |
7 | * Initial version. | 7 | * Initial version. |
8 | */ | 8 | */ |
9 | 9 | ||
@@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
229 | */ | 229 | */ |
230 | unsigned long max_sane_readahead(unsigned long nr) | 230 | unsigned long max_sane_readahead(unsigned long nr) |
231 | { | 231 | { |
232 | return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) | 232 | return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) |
233 | + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); | 233 | + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); |
234 | } | 234 | } |
235 | 235 | ||
@@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping, | |||
382 | if (hit_readahead_marker) { | 382 | if (hit_readahead_marker) { |
383 | pgoff_t start; | 383 | pgoff_t start; |
384 | 384 | ||
385 | read_lock_irq(&mapping->tree_lock); | 385 | rcu_read_lock(); |
386 | start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); | 386 | start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); |
387 | read_unlock_irq(&mapping->tree_lock); | 387 | rcu_read_unlock(); |
388 | 388 | ||
389 | if (!start || start - offset > max) | 389 | if (!start || start - offset > max) |
390 | return 0; | 390 | return 0; |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -49,12 +49,51 @@ | |||
49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
50 | #include <linux/kallsyms.h> | 50 | #include <linux/kallsyms.h> |
51 | #include <linux/memcontrol.h> | 51 | #include <linux/memcontrol.h> |
52 | #include <linux/mmu_notifier.h> | ||
52 | 53 | ||
53 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
54 | 55 | ||
55 | struct kmem_cache *anon_vma_cachep; | 56 | #include "internal.h" |
56 | 57 | ||
57 | /* This must be called under the mmap_sem. */ | 58 | static struct kmem_cache *anon_vma_cachep; |
59 | |||
60 | static inline struct anon_vma *anon_vma_alloc(void) | ||
61 | { | ||
62 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | ||
63 | } | ||
64 | |||
65 | static inline void anon_vma_free(struct anon_vma *anon_vma) | ||
66 | { | ||
67 | kmem_cache_free(anon_vma_cachep, anon_vma); | ||
68 | } | ||
69 | |||
70 | /** | ||
71 | * anon_vma_prepare - attach an anon_vma to a memory region | ||
72 | * @vma: the memory region in question | ||
73 | * | ||
74 | * This makes sure the memory mapping described by 'vma' has | ||
75 | * an 'anon_vma' attached to it, so that we can associate the | ||
76 | * anonymous pages mapped into it with that anon_vma. | ||
77 | * | ||
78 | * The common case will be that we already have one, but if | ||
79 | * if not we either need to find an adjacent mapping that we | ||
80 | * not we either need to find an adjacent mapping that we | ||
81 | * reason for splitting a vma has been mprotect()), or we | ||
82 | * allocate a new one. | ||
83 | * | ||
84 | * Anon-vma allocations are very subtle, because we may have | ||
85 | * optimistically looked up an anon_vma in page_lock_anon_vma() | ||
86 | * and that may actually touch the spinlock even in the newly | ||
87 | * allocated vma (it depends on RCU to make sure that the | ||
88 | * anon_vma isn't actually destroyed). | ||
89 | * | ||
90 | * As a result, we need to do proper anon_vma locking even | ||
91 | * for the new allocation. At the same time, we do not want | ||
92 | * to do any locking for the common case of already having | ||
93 | * an anon_vma. | ||
94 | * | ||
95 | * This must be called with the mmap_sem held for reading. | ||
96 | */ | ||
58 | int anon_vma_prepare(struct vm_area_struct *vma) | 97 | int anon_vma_prepare(struct vm_area_struct *vma) |
59 | { | 98 | { |
60 | struct anon_vma *anon_vma = vma->anon_vma; | 99 | struct anon_vma *anon_vma = vma->anon_vma; |
@@ -62,20 +101,17 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
62 | might_sleep(); | 101 | might_sleep(); |
63 | if (unlikely(!anon_vma)) { | 102 | if (unlikely(!anon_vma)) { |
64 | struct mm_struct *mm = vma->vm_mm; | 103 | struct mm_struct *mm = vma->vm_mm; |
65 | struct anon_vma *allocated, *locked; | 104 | struct anon_vma *allocated; |
66 | 105 | ||
67 | anon_vma = find_mergeable_anon_vma(vma); | 106 | anon_vma = find_mergeable_anon_vma(vma); |
68 | if (anon_vma) { | 107 | allocated = NULL; |
69 | allocated = NULL; | 108 | if (!anon_vma) { |
70 | locked = anon_vma; | ||
71 | spin_lock(&locked->lock); | ||
72 | } else { | ||
73 | anon_vma = anon_vma_alloc(); | 109 | anon_vma = anon_vma_alloc(); |
74 | if (unlikely(!anon_vma)) | 110 | if (unlikely(!anon_vma)) |
75 | return -ENOMEM; | 111 | return -ENOMEM; |
76 | allocated = anon_vma; | 112 | allocated = anon_vma; |
77 | locked = NULL; | ||
78 | } | 113 | } |
114 | spin_lock(&anon_vma->lock); | ||
79 | 115 | ||
80 | /* page_table_lock to protect against threads */ | 116 | /* page_table_lock to protect against threads */ |
81 | spin_lock(&mm->page_table_lock); | 117 | spin_lock(&mm->page_table_lock); |
@@ -86,8 +122,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
86 | } | 122 | } |
87 | spin_unlock(&mm->page_table_lock); | 123 | spin_unlock(&mm->page_table_lock); |
88 | 124 | ||
89 | if (locked) | 125 | spin_unlock(&anon_vma->lock); |
90 | spin_unlock(&locked->lock); | ||
91 | if (unlikely(allocated)) | 126 | if (unlikely(allocated)) |
92 | anon_vma_free(allocated); | 127 | anon_vma_free(allocated); |
93 | } | 128 | } |
@@ -138,7 +173,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
138 | anon_vma_free(anon_vma); | 173 | anon_vma_free(anon_vma); |
139 | } | 174 | } |
140 | 175 | ||
141 | static void anon_vma_ctor(struct kmem_cache *cachep, void *data) | 176 | static void anon_vma_ctor(void *data) |
142 | { | 177 | { |
143 | struct anon_vma *anon_vma = data; | 178 | struct anon_vma *anon_vma = data; |
144 | 179 | ||
@@ -156,7 +191,7 @@ void __init anon_vma_init(void) | |||
156 | * Getting a lock on a stable anon_vma from a page off the LRU is | 191 | * Getting a lock on a stable anon_vma from a page off the LRU is |
157 | * tricky: page_lock_anon_vma relies on RCU to guard against the races. | 192 | * tricky: page_lock_anon_vma relies on RCU to guard against the races. |
158 | */ | 193 | */ |
159 | static struct anon_vma *page_lock_anon_vma(struct page *page) | 194 | struct anon_vma *page_lock_anon_vma(struct page *page) |
160 | { | 195 | { |
161 | struct anon_vma *anon_vma; | 196 | struct anon_vma *anon_vma; |
162 | unsigned long anon_mapping; | 197 | unsigned long anon_mapping; |
@@ -176,7 +211,7 @@ out: | |||
176 | return NULL; | 211 | return NULL; |
177 | } | 212 | } |
178 | 213 | ||
179 | static void page_unlock_anon_vma(struct anon_vma *anon_vma) | 214 | void page_unlock_anon_vma(struct anon_vma *anon_vma) |
180 | { | 215 | { |
181 | spin_unlock(&anon_vma->lock); | 216 | spin_unlock(&anon_vma->lock); |
182 | rcu_read_unlock(); | 217 | rcu_read_unlock(); |
@@ -223,10 +258,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
223 | /* | 258 | /* |
224 | * Check that @page is mapped at @address into @mm. | 259 | * Check that @page is mapped at @address into @mm. |
225 | * | 260 | * |
261 | * If @sync is false, page_check_address may perform a racy check to avoid | ||
262 | * the page table lock when the pte is not present (helpful when reclaiming | ||
263 | * highly shared pages). | ||
264 | * | ||
226 | * On success returns with pte mapped and locked. | 265 | * On success returns with pte mapped and locked. |
227 | */ | 266 | */ |
228 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, | 267 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, |
229 | unsigned long address, spinlock_t **ptlp) | 268 | unsigned long address, spinlock_t **ptlp, int sync) |
230 | { | 269 | { |
231 | pgd_t *pgd; | 270 | pgd_t *pgd; |
232 | pud_t *pud; | 271 | pud_t *pud; |
@@ -248,7 +287,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, | |||
248 | 287 | ||
249 | pte = pte_offset_map(pmd, address); | 288 | pte = pte_offset_map(pmd, address); |
250 | /* Make a quick check before getting the lock */ | 289 | /* Make a quick check before getting the lock */ |
251 | if (!pte_present(*pte)) { | 290 | if (!sync && !pte_present(*pte)) { |
252 | pte_unmap(pte); | 291 | pte_unmap(pte); |
253 | return NULL; | 292 | return NULL; |
254 | } | 293 | } |
@@ -263,6 +302,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, | |||
263 | return NULL; | 302 | return NULL; |
264 | } | 303 | } |
265 | 304 | ||
305 | /** | ||
306 | * page_mapped_in_vma - check whether a page is really mapped in a VMA | ||
307 | * @page: the page to test | ||
308 | * @vma: the VMA to test | ||
309 | * | ||
310 | * Returns 1 if the page is mapped into the page tables of the VMA, 0 | ||
311 | * if the page is not mapped into the page tables of this VMA. Only | ||
312 | * valid for normal file or anonymous VMAs. | ||
313 | */ | ||
314 | static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | ||
315 | { | ||
316 | unsigned long address; | ||
317 | pte_t *pte; | ||
318 | spinlock_t *ptl; | ||
319 | |||
320 | address = vma_address(page, vma); | ||
321 | if (address == -EFAULT) /* out of vma range */ | ||
322 | return 0; | ||
323 | pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); | ||
324 | if (!pte) /* the page is not in this mm */ | ||
325 | return 0; | ||
326 | pte_unmap_unlock(pte, ptl); | ||
327 | |||
328 | return 1; | ||
329 | } | ||
330 | |||
266 | /* | 331 | /* |
267 | * Subfunctions of page_referenced: page_referenced_one called | 332 | * Subfunctions of page_referenced: page_referenced_one called |
268 | * repeatedly from either page_referenced_anon or page_referenced_file. | 333 | * repeatedly from either page_referenced_anon or page_referenced_file. |
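Editorial note on the hunk above: page_check_address() grows a "sync" argument. Callers that only want a hint (page_referenced_one(), try_to_unmap_one()) pass 0 and keep the cheap unlocked pte_present() pre-check; callers that need a definitive answer (page_mkclean_one(), the new page_mapped_in_vma()) pass 1 and always take the locked path. The pattern itself is ordinary "optimistic check, then re-check under the lock"; a self-contained user-space sketch with invented names (the real function additionally returns with the pte lock still held on success):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy model of the page_check_address() pattern: an optional racy pre-check
 * ("sync == false") lets hot callers skip the lock when the entry looks
 * absent; "sync == true" always checks under the lock. All names and types
 * here are invented for the example. */
struct entry {
	pthread_mutex_t lock;
	bool present;
	int value;
};

static int lookup(struct entry *e, int *out, bool sync)
{
	/* racy fast path: cheap, but may miss a concurrent update */
	if (!sync && !e->present)
		return -1;

	pthread_mutex_lock(&e->lock);
	if (!e->present) {			/* authoritative re-check */
		pthread_mutex_unlock(&e->lock);
		return -1;
	}
	*out = e->value;
	pthread_mutex_unlock(&e->lock);		/* page_check_address() instead
						 * returns with the lock held */
	return 0;
}

int main(void)
{
	struct entry e = { PTHREAD_MUTEX_INITIALIZER, true, 42 };
	int v = 0;

	printf("racy lookup: %d, sync lookup: %d, value %d\n",
	       lookup(&e, &v, false), lookup(&e, &v, true), v);
	return 0;
}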
@@ -280,14 +345,21 @@ static int page_referenced_one(struct page *page, | |||
280 | if (address == -EFAULT) | 345 | if (address == -EFAULT) |
281 | goto out; | 346 | goto out; |
282 | 347 | ||
283 | pte = page_check_address(page, mm, address, &ptl); | 348 | pte = page_check_address(page, mm, address, &ptl, 0); |
284 | if (!pte) | 349 | if (!pte) |
285 | goto out; | 350 | goto out; |
286 | 351 | ||
352 | /* | ||
353 | * Don't want to elevate referenced for mlocked page that gets this far, | ||
354 | * in order that it progresses to try_to_unmap and is moved to the | ||
355 | * unevictable list. | ||
356 | */ | ||
287 | if (vma->vm_flags & VM_LOCKED) { | 357 | if (vma->vm_flags & VM_LOCKED) { |
288 | referenced++; | ||
289 | *mapcount = 1; /* break early from loop */ | 358 | *mapcount = 1; /* break early from loop */ |
290 | } else if (ptep_clear_flush_young(vma, address, pte)) | 359 | goto out_unmap; |
360 | } | ||
361 | |||
362 | if (ptep_clear_flush_young_notify(vma, address, pte)) | ||
291 | referenced++; | 363 | referenced++; |
292 | 364 | ||
293 | /* Pretend the page is referenced if the task has the | 365 | /* Pretend the page is referenced if the task has the |
@@ -296,6 +368,7 @@ static int page_referenced_one(struct page *page, | |||
296 | rwsem_is_locked(&mm->mmap_sem)) | 368 | rwsem_is_locked(&mm->mmap_sem)) |
297 | referenced++; | 369 | referenced++; |
298 | 370 | ||
371 | out_unmap: | ||
299 | (*mapcount)--; | 372 | (*mapcount)--; |
300 | pte_unmap_unlock(pte, ptl); | 373 | pte_unmap_unlock(pte, ptl); |
301 | out: | 374 | out: |
@@ -385,11 +458,6 @@ static int page_referenced_file(struct page *page, | |||
385 | */ | 458 | */ |
386 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 459 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
387 | continue; | 460 | continue; |
388 | if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) | ||
389 | == (VM_LOCKED|VM_MAYSHARE)) { | ||
390 | referenced++; | ||
391 | break; | ||
392 | } | ||
393 | referenced += page_referenced_one(page, vma, &mapcount); | 461 | referenced += page_referenced_one(page, vma, &mapcount); |
394 | if (!mapcount) | 462 | if (!mapcount) |
395 | break; | 463 | break; |
@@ -421,7 +489,7 @@ int page_referenced(struct page *page, int is_locked, | |||
421 | referenced += page_referenced_anon(page, mem_cont); | 489 | referenced += page_referenced_anon(page, mem_cont); |
422 | else if (is_locked) | 490 | else if (is_locked) |
423 | referenced += page_referenced_file(page, mem_cont); | 491 | referenced += page_referenced_file(page, mem_cont); |
424 | else if (TestSetPageLocked(page)) | 492 | else if (!trylock_page(page)) |
425 | referenced++; | 493 | referenced++; |
426 | else { | 494 | else { |
427 | if (page->mapping) | 495 | if (page->mapping) |
@@ -449,7 +517,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
449 | if (address == -EFAULT) | 517 | if (address == -EFAULT) |
450 | goto out; | 518 | goto out; |
451 | 519 | ||
452 | pte = page_check_address(page, mm, address, &ptl); | 520 | pte = page_check_address(page, mm, address, &ptl, 1); |
453 | if (!pte) | 521 | if (!pte) |
454 | goto out; | 522 | goto out; |
455 | 523 | ||
@@ -457,7 +525,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
457 | pte_t entry; | 525 | pte_t entry; |
458 | 526 | ||
459 | flush_cache_page(vma, address, pte_pfn(*pte)); | 527 | flush_cache_page(vma, address, pte_pfn(*pte)); |
460 | entry = ptep_clear_flush(vma, address, pte); | 528 | entry = ptep_clear_flush_notify(vma, address, pte); |
461 | entry = pte_wrprotect(entry); | 529 | entry = pte_wrprotect(entry); |
462 | entry = pte_mkclean(entry); | 530 | entry = pte_mkclean(entry); |
463 | set_pte_at(mm, address, pte, entry); | 531 | set_pte_at(mm, address, pte, entry); |
@@ -576,14 +644,8 @@ void page_add_anon_rmap(struct page *page, | |||
576 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 644 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
577 | if (atomic_inc_and_test(&page->_mapcount)) | 645 | if (atomic_inc_and_test(&page->_mapcount)) |
578 | __page_set_anon_rmap(page, vma, address); | 646 | __page_set_anon_rmap(page, vma, address); |
579 | else { | 647 | else |
580 | __page_check_anon_rmap(page, vma, address); | 648 | __page_check_anon_rmap(page, vma, address); |
581 | /* | ||
582 | * We unconditionally charged during prepare, we uncharge here | ||
583 | * This takes care of balancing the reference counts | ||
584 | */ | ||
585 | mem_cgroup_uncharge_page(page); | ||
586 | } | ||
587 | } | 649 | } |
588 | 650 | ||
589 | /** | 651 | /** |
@@ -614,12 +676,6 @@ void page_add_file_rmap(struct page *page) | |||
614 | { | 676 | { |
615 | if (atomic_inc_and_test(&page->_mapcount)) | 677 | if (atomic_inc_and_test(&page->_mapcount)) |
616 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 678 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
617 | else | ||
618 | /* | ||
619 | * We unconditionally charged during prepare, we uncharge here | ||
620 | * This takes care of balancing the reference counts | ||
621 | */ | ||
622 | mem_cgroup_uncharge_page(page); | ||
623 | } | 679 | } |
624 | 680 | ||
625 | #ifdef CONFIG_DEBUG_VM | 681 | #ifdef CONFIG_DEBUG_VM |
@@ -670,6 +726,22 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) | |||
670 | } | 726 | } |
671 | 727 | ||
672 | /* | 728 | /* |
729 | * Now that the last pte has gone, s390 must transfer dirty | ||
730 | * flag from storage key to struct page. We can usually skip | ||
731 | * this if the page is anon, so about to be freed; but perhaps | ||
732 | * not if it's in swapcache - there might be another pte slot | ||
733 | * containing the swap entry, but page not yet written to swap. | ||
734 | */ | ||
735 | if ((!PageAnon(page) || PageSwapCache(page)) && | ||
736 | page_test_dirty(page)) { | ||
737 | page_clear_dirty(page); | ||
738 | set_page_dirty(page); | ||
739 | } | ||
740 | if (PageAnon(page)) | ||
741 | mem_cgroup_uncharge_page(page); | ||
742 | __dec_zone_page_state(page, | ||
743 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); | ||
744 | /* | ||
673 | * It would be tidy to reset the PageAnon mapping here, | 745 | * It would be tidy to reset the PageAnon mapping here, |
674 | * but that might overwrite a racing page_add_anon_rmap | 746 | * but that might overwrite a racing page_add_anon_rmap |
675 | * which increments mapcount after us but sets mapping | 747 | * which increments mapcount after us but sets mapping |
@@ -678,14 +750,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) | |||
678 | * Leaving it set also helps swapoff to reinstate ptes | 750 | * Leaving it set also helps swapoff to reinstate ptes |
679 | * faster for those pages still in swapcache. | 751 | * faster for those pages still in swapcache. |
680 | */ | 752 | */ |
681 | if (page_test_dirty(page)) { | ||
682 | page_clear_dirty(page); | ||
683 | set_page_dirty(page); | ||
684 | } | ||
685 | mem_cgroup_uncharge_page(page); | ||
686 | |||
687 | __dec_zone_page_state(page, | ||
688 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); | ||
689 | } | 753 | } |
690 | } | 754 | } |
691 | 755 | ||
@@ -707,7 +771,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
707 | if (address == -EFAULT) | 771 | if (address == -EFAULT) |
708 | goto out; | 772 | goto out; |
709 | 773 | ||
710 | pte = page_check_address(page, mm, address, &ptl); | 774 | pte = page_check_address(page, mm, address, &ptl, 0); |
711 | if (!pte) | 775 | if (!pte) |
712 | goto out; | 776 | goto out; |
713 | 777 | ||
@@ -716,15 +780,20 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
716 | * If it's recently referenced (perhaps page_referenced | 780 | * If it's recently referenced (perhaps page_referenced |
717 | * skipped over this mm) then we should reactivate it. | 781 | * skipped over this mm) then we should reactivate it. |
718 | */ | 782 | */ |
719 | if (!migration && ((vma->vm_flags & VM_LOCKED) || | 783 | if (!migration) { |
720 | (ptep_clear_flush_young(vma, address, pte)))) { | 784 | if (vma->vm_flags & VM_LOCKED) { |
721 | ret = SWAP_FAIL; | 785 | ret = SWAP_MLOCK; |
722 | goto out_unmap; | 786 | goto out_unmap; |
723 | } | 787 | } |
788 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | ||
789 | ret = SWAP_FAIL; | ||
790 | goto out_unmap; | ||
791 | } | ||
792 | } | ||
724 | 793 | ||
725 | /* Nuke the page table entry. */ | 794 | /* Nuke the page table entry. */ |
726 | flush_cache_page(vma, address, page_to_pfn(page)); | 795 | flush_cache_page(vma, address, page_to_pfn(page)); |
727 | pteval = ptep_clear_flush(vma, address, pte); | 796 | pteval = ptep_clear_flush_notify(vma, address, pte); |
728 | 797 | ||
729 | /* Move the dirty bit to the physical page now the pte is gone. */ | 798 | /* Move the dirty bit to the physical page now the pte is gone. */ |
730 | if (pte_dirty(pteval)) | 799 | if (pte_dirty(pteval)) |
@@ -801,12 +870,17 @@ out: | |||
801 | * For very sparsely populated VMAs this is a little inefficient - chances are | 870 | * For very sparsely populated VMAs this is a little inefficient - chances are |
802 | * there won't be many ptes located within the scan cluster. In this case | 871 | * there won't be many ptes located within the scan cluster. In this case |
803 | * maybe we could scan further - to the end of the pte page, perhaps. | 872 | * maybe we could scan further - to the end of the pte page, perhaps. |
873 | * | ||
874 | * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can | ||
875 | * acquire it without blocking. If vma locked, mlock the pages in the cluster, | ||
876 | * rather than unmapping them. If we encounter the "check_page" that vmscan is | ||
877 | * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. | ||
804 | */ | 878 | */ |
805 | #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) | 879 | #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) |
806 | #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) | 880 | #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) |
807 | 881 | ||
808 | static void try_to_unmap_cluster(unsigned long cursor, | 882 | static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, |
809 | unsigned int *mapcount, struct vm_area_struct *vma) | 883 | struct vm_area_struct *vma, struct page *check_page) |
810 | { | 884 | { |
811 | struct mm_struct *mm = vma->vm_mm; | 885 | struct mm_struct *mm = vma->vm_mm; |
812 | pgd_t *pgd; | 886 | pgd_t *pgd; |
@@ -818,6 +892,8 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
818 | struct page *page; | 892 | struct page *page; |
819 | unsigned long address; | 893 | unsigned long address; |
820 | unsigned long end; | 894 | unsigned long end; |
895 | int ret = SWAP_AGAIN; | ||
896 | int locked_vma = 0; | ||
821 | 897 | ||
822 | address = (vma->vm_start + cursor) & CLUSTER_MASK; | 898 | address = (vma->vm_start + cursor) & CLUSTER_MASK; |
823 | end = address + CLUSTER_SIZE; | 899 | end = address + CLUSTER_SIZE; |
@@ -828,15 +904,26 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
828 | 904 | ||
829 | pgd = pgd_offset(mm, address); | 905 | pgd = pgd_offset(mm, address); |
830 | if (!pgd_present(*pgd)) | 906 | if (!pgd_present(*pgd)) |
831 | return; | 907 | return ret; |
832 | 908 | ||
833 | pud = pud_offset(pgd, address); | 909 | pud = pud_offset(pgd, address); |
834 | if (!pud_present(*pud)) | 910 | if (!pud_present(*pud)) |
835 | return; | 911 | return ret; |
836 | 912 | ||
837 | pmd = pmd_offset(pud, address); | 913 | pmd = pmd_offset(pud, address); |
838 | if (!pmd_present(*pmd)) | 914 | if (!pmd_present(*pmd)) |
839 | return; | 915 | return ret; |
916 | |||
917 | /* | ||
918 | * MLOCK_PAGES => feature is configured. | ||
919 | * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, | ||
920 | * keep the sem while scanning the cluster for mlocking pages. | ||
921 | */ | ||
922 | if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
923 | locked_vma = (vma->vm_flags & VM_LOCKED); | ||
924 | if (!locked_vma) | ||
925 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ | ||
926 | } | ||
840 | 927 | ||
841 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 928 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
842 | 929 | ||
@@ -849,12 +936,19 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
849 | page = vm_normal_page(vma, address, *pte); | 936 | page = vm_normal_page(vma, address, *pte); |
850 | BUG_ON(!page || PageAnon(page)); | 937 | BUG_ON(!page || PageAnon(page)); |
851 | 938 | ||
852 | if (ptep_clear_flush_young(vma, address, pte)) | 939 | if (locked_vma) { |
940 | mlock_vma_page(page); /* no-op if already mlocked */ | ||
941 | if (page == check_page) | ||
942 | ret = SWAP_MLOCK; | ||
943 | continue; /* don't unmap */ | ||
944 | } | ||
945 | |||
946 | if (ptep_clear_flush_young_notify(vma, address, pte)) | ||
853 | continue; | 947 | continue; |
854 | 948 | ||
855 | /* Nuke the page table entry. */ | 949 | /* Nuke the page table entry. */ |
856 | flush_cache_page(vma, address, pte_pfn(*pte)); | 950 | flush_cache_page(vma, address, pte_pfn(*pte)); |
857 | pteval = ptep_clear_flush(vma, address, pte); | 951 | pteval = ptep_clear_flush_notify(vma, address, pte); |
858 | 952 | ||
859 | /* If nonlinear, store the file page offset in the pte. */ | 953 | /* If nonlinear, store the file page offset in the pte. */ |
860 | if (page->index != linear_page_index(vma, address)) | 954 | if (page->index != linear_page_index(vma, address)) |
@@ -870,39 +964,104 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
870 | (*mapcount)--; | 964 | (*mapcount)--; |
871 | } | 965 | } |
872 | pte_unmap_unlock(pte - 1, ptl); | 966 | pte_unmap_unlock(pte - 1, ptl); |
967 | if (locked_vma) | ||
968 | up_read(&vma->vm_mm->mmap_sem); | ||
969 | return ret; | ||
873 | } | 970 | } |
874 | 971 | ||
875 | static int try_to_unmap_anon(struct page *page, int migration) | 972 | /* |
973 | * common handling for pages mapped in VM_LOCKED vmas | ||
974 | */ | ||
975 | static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) | ||
976 | { | ||
977 | int mlocked = 0; | ||
978 | |||
979 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
980 | if (vma->vm_flags & VM_LOCKED) { | ||
981 | mlock_vma_page(page); | ||
982 | mlocked++; /* really mlocked the page */ | ||
983 | } | ||
984 | up_read(&vma->vm_mm->mmap_sem); | ||
985 | } | ||
986 | return mlocked; | ||
987 | } | ||
988 | |||
989 | /** | ||
990 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | ||
991 | * rmap method | ||
992 | * @page: the page to unmap/unlock | ||
993 | * @unlock: request for unlock rather than unmap [unlikely] | ||
994 | * @migration: unmapping for migration - ignored if @unlock | ||
995 | * | ||
996 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
997 | * contained in the anon_vma struct it points to. | ||
998 | * | ||
999 | * This function is only called from try_to_unmap/try_to_munlock for | ||
1000 | * anonymous pages. | ||
1001 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1002 | * where the page was found will be held for write. So, we won't recheck | ||
1003 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1004 | * VM_LOCKED. | ||
1005 | */ | ||
1006 | static int try_to_unmap_anon(struct page *page, int unlock, int migration) | ||
876 | { | 1007 | { |
877 | struct anon_vma *anon_vma; | 1008 | struct anon_vma *anon_vma; |
878 | struct vm_area_struct *vma; | 1009 | struct vm_area_struct *vma; |
1010 | unsigned int mlocked = 0; | ||
879 | int ret = SWAP_AGAIN; | 1011 | int ret = SWAP_AGAIN; |
880 | 1012 | ||
1013 | if (MLOCK_PAGES && unlikely(unlock)) | ||
1014 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
1015 | |||
881 | anon_vma = page_lock_anon_vma(page); | 1016 | anon_vma = page_lock_anon_vma(page); |
882 | if (!anon_vma) | 1017 | if (!anon_vma) |
883 | return ret; | 1018 | return ret; |
884 | 1019 | ||
885 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1020 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
886 | ret = try_to_unmap_one(page, vma, migration); | 1021 | if (MLOCK_PAGES && unlikely(unlock)) { |
887 | if (ret == SWAP_FAIL || !page_mapped(page)) | 1022 | if (!((vma->vm_flags & VM_LOCKED) && |
888 | break; | 1023 | page_mapped_in_vma(page, vma))) |
1024 | continue; /* must visit all unlocked vmas */ | ||
1025 | ret = SWAP_MLOCK; /* saw at least one mlocked vma */ | ||
1026 | } else { | ||
1027 | ret = try_to_unmap_one(page, vma, migration); | ||
1028 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
1029 | break; | ||
1030 | } | ||
1031 | if (ret == SWAP_MLOCK) { | ||
1032 | mlocked = try_to_mlock_page(page, vma); | ||
1033 | if (mlocked) | ||
1034 | break; /* stop if actually mlocked page */ | ||
1035 | } | ||
889 | } | 1036 | } |
890 | 1037 | ||
891 | page_unlock_anon_vma(anon_vma); | 1038 | page_unlock_anon_vma(anon_vma); |
1039 | |||
1040 | if (mlocked) | ||
1041 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
1042 | else if (ret == SWAP_MLOCK) | ||
1043 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
1044 | |||
892 | return ret; | 1045 | return ret; |
893 | } | 1046 | } |
894 | 1047 | ||
895 | /** | 1048 | /** |
896 | * try_to_unmap_file - unmap file page using the object-based rmap method | 1049 | * try_to_unmap_file - unmap/unlock file page using the object-based rmap method |
897 | * @page: the page to unmap | 1050 | * @page: the page to unmap/unlock |
898 | * @migration: migration flag | 1051 | * @unlock: request for unlock rather than unmap [unlikely] |
1052 | * @migration: unmapping for migration - ignored if @unlock | ||
899 | * | 1053 | * |
900 | * Find all the mappings of a page using the mapping pointer and the vma chains | 1054 | * Find all the mappings of a page using the mapping pointer and the vma chains |
901 | * contained in the address_space struct it points to. | 1055 | * contained in the address_space struct it points to. |
902 | * | 1056 | * |
903 | * This function is only called from try_to_unmap for object-based pages. | 1057 | * This function is only called from try_to_unmap/try_to_munlock for |
1058 | * object-based pages. | ||
1059 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1060 | * where the page was found will be held for write. So, we won't recheck | ||
1061 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1062 | * VM_LOCKED. | ||
904 | */ | 1063 | */ |
905 | static int try_to_unmap_file(struct page *page, int migration) | 1064 | static int try_to_unmap_file(struct page *page, int unlock, int migration) |
906 | { | 1065 | { |
907 | struct address_space *mapping = page->mapping; | 1066 | struct address_space *mapping = page->mapping; |
908 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1067 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
@@ -913,20 +1072,44 @@ static int try_to_unmap_file(struct page *page, int migration) | |||
913 | unsigned long max_nl_cursor = 0; | 1072 | unsigned long max_nl_cursor = 0; |
914 | unsigned long max_nl_size = 0; | 1073 | unsigned long max_nl_size = 0; |
915 | unsigned int mapcount; | 1074 | unsigned int mapcount; |
1075 | unsigned int mlocked = 0; | ||
1076 | |||
1077 | if (MLOCK_PAGES && unlikely(unlock)) | ||
1078 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
916 | 1079 | ||
917 | spin_lock(&mapping->i_mmap_lock); | 1080 | spin_lock(&mapping->i_mmap_lock); |
918 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1081 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
919 | ret = try_to_unmap_one(page, vma, migration); | 1082 | if (MLOCK_PAGES && unlikely(unlock)) { |
920 | if (ret == SWAP_FAIL || !page_mapped(page)) | 1083 | if (!(vma->vm_flags & VM_LOCKED)) |
921 | goto out; | 1084 | continue; /* must visit all vmas */ |
1085 | ret = SWAP_MLOCK; | ||
1086 | } else { | ||
1087 | ret = try_to_unmap_one(page, vma, migration); | ||
1088 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
1089 | goto out; | ||
1090 | } | ||
1091 | if (ret == SWAP_MLOCK) { | ||
1092 | mlocked = try_to_mlock_page(page, vma); | ||
1093 | if (mlocked) | ||
1094 | break; /* stop if actually mlocked page */ | ||
1095 | } | ||
922 | } | 1096 | } |
923 | 1097 | ||
1098 | if (mlocked) | ||
1099 | goto out; | ||
1100 | |||
924 | if (list_empty(&mapping->i_mmap_nonlinear)) | 1101 | if (list_empty(&mapping->i_mmap_nonlinear)) |
925 | goto out; | 1102 | goto out; |
926 | 1103 | ||
927 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1104 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
928 | shared.vm_set.list) { | 1105 | shared.vm_set.list) { |
929 | if ((vma->vm_flags & VM_LOCKED) && !migration) | 1106 | if (MLOCK_PAGES && unlikely(unlock)) { |
1107 | if (!(vma->vm_flags & VM_LOCKED)) | ||
1108 | continue; /* must visit all vmas */ | ||
1109 | ret = SWAP_MLOCK; /* leave mlocked == 0 */ | ||
1110 | goto out; /* no need to look further */ | ||
1111 | } | ||
1112 | if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) | ||
930 | continue; | 1113 | continue; |
931 | cursor = (unsigned long) vma->vm_private_data; | 1114 | cursor = (unsigned long) vma->vm_private_data; |
932 | if (cursor > max_nl_cursor) | 1115 | if (cursor > max_nl_cursor) |
@@ -936,7 +1119,7 @@ static int try_to_unmap_file(struct page *page, int migration) | |||
936 | max_nl_size = cursor; | 1119 | max_nl_size = cursor; |
937 | } | 1120 | } |
938 | 1121 | ||
939 | if (max_nl_size == 0) { /* any nonlinears locked or reserved */ | 1122 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ |
940 | ret = SWAP_FAIL; | 1123 | ret = SWAP_FAIL; |
941 | goto out; | 1124 | goto out; |
942 | } | 1125 | } |
@@ -960,12 +1143,16 @@ static int try_to_unmap_file(struct page *page, int migration) | |||
960 | do { | 1143 | do { |
961 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1144 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
962 | shared.vm_set.list) { | 1145 | shared.vm_set.list) { |
963 | if ((vma->vm_flags & VM_LOCKED) && !migration) | 1146 | if (!MLOCK_PAGES && !migration && |
1147 | (vma->vm_flags & VM_LOCKED)) | ||
964 | continue; | 1148 | continue; |
965 | cursor = (unsigned long) vma->vm_private_data; | 1149 | cursor = (unsigned long) vma->vm_private_data; |
966 | while ( cursor < max_nl_cursor && | 1150 | while ( cursor < max_nl_cursor && |
967 | cursor < vma->vm_end - vma->vm_start) { | 1151 | cursor < vma->vm_end - vma->vm_start) { |
968 | try_to_unmap_cluster(cursor, &mapcount, vma); | 1152 | ret = try_to_unmap_cluster(cursor, &mapcount, |
1153 | vma, page); | ||
1154 | if (ret == SWAP_MLOCK) | ||
1155 | mlocked = 2; /* to return below */ | ||
969 | cursor += CLUSTER_SIZE; | 1156 | cursor += CLUSTER_SIZE; |
970 | vma->vm_private_data = (void *) cursor; | 1157 | vma->vm_private_data = (void *) cursor; |
971 | if ((int)mapcount <= 0) | 1158 | if ((int)mapcount <= 0) |
@@ -986,6 +1173,10 @@ static int try_to_unmap_file(struct page *page, int migration) | |||
986 | vma->vm_private_data = NULL; | 1173 | vma->vm_private_data = NULL; |
987 | out: | 1174 | out: |
988 | spin_unlock(&mapping->i_mmap_lock); | 1175 | spin_unlock(&mapping->i_mmap_lock); |
1176 | if (mlocked) | ||
1177 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
1178 | else if (ret == SWAP_MLOCK) | ||
1179 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
989 | return ret; | 1180 | return ret; |
990 | } | 1181 | } |
991 | 1182 | ||
@@ -1001,6 +1192,7 @@ out: | |||
1001 | * SWAP_SUCCESS - we succeeded in removing all mappings | 1192 | * SWAP_SUCCESS - we succeeded in removing all mappings |
1002 | * SWAP_AGAIN - we missed a mapping, try again later | 1193 | * SWAP_AGAIN - we missed a mapping, try again later |
1003 | * SWAP_FAIL - the page is unswappable | 1194 | * SWAP_FAIL - the page is unswappable |
1195 | * SWAP_MLOCK - page is mlocked. | ||
1004 | */ | 1196 | */ |
1005 | int try_to_unmap(struct page *page, int migration) | 1197 | int try_to_unmap(struct page *page, int migration) |
1006 | { | 1198 | { |
@@ -1009,12 +1201,36 @@ int try_to_unmap(struct page *page, int migration) | |||
1009 | BUG_ON(!PageLocked(page)); | 1201 | BUG_ON(!PageLocked(page)); |
1010 | 1202 | ||
1011 | if (PageAnon(page)) | 1203 | if (PageAnon(page)) |
1012 | ret = try_to_unmap_anon(page, migration); | 1204 | ret = try_to_unmap_anon(page, 0, migration); |
1013 | else | 1205 | else |
1014 | ret = try_to_unmap_file(page, migration); | 1206 | ret = try_to_unmap_file(page, 0, migration); |
1015 | 1207 | if (ret != SWAP_MLOCK && !page_mapped(page)) | |
1016 | if (!page_mapped(page)) | ||
1017 | ret = SWAP_SUCCESS; | 1208 | ret = SWAP_SUCCESS; |
1018 | return ret; | 1209 | return ret; |
1019 | } | 1210 | } |
1020 | 1211 | ||
1212 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1213 | /** | ||
1214 | * try_to_munlock - try to munlock a page | ||
1215 | * @page: the page to be munlocked | ||
1216 | * | ||
1217 | * Called from munlock code. Checks all of the VMAs mapping the page | ||
1218 | * to make sure nobody else has this page mlocked. The page will be | ||
1219 | * returned with PG_mlocked cleared if no other vmas have it mlocked. | ||
1220 | * | ||
1221 | * Return values are: | ||
1222 | * | ||
1223 | * SWAP_SUCCESS - no vma's holding page mlocked. | ||
1224 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem | ||
1225 | * SWAP_MLOCK - page is now mlocked. | ||
1226 | */ | ||
1227 | int try_to_munlock(struct page *page) | ||
1228 | { | ||
1229 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | ||
1230 | |||
1231 | if (PageAnon(page)) | ||
1232 | return try_to_unmap_anon(page, 1, 0); | ||
1233 | else | ||
1234 | return try_to_unmap_file(page, 1, 0); | ||
1235 | } | ||
1236 | #endif | ||
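Editorial note on the rmap.c changes above: try_to_unmap() and the new try_to_munlock() now report a fourth outcome, SWAP_MLOCK, alongside SWAP_SUCCESS/SWAP_AGAIN/SWAP_FAIL. The sketch below only restates, as a hypothetical caller-side dispatch, how the comments in the patch say each value is to be treated; it is not the actual vmscan or mlock code, and the enum values are illustrative:

#include <stdio.h>

/* Return codes used by try_to_unmap()/try_to_munlock() after this series;
 * the code -> action mapping paraphrases the comments in the patch. */
enum { SWAP_SUCCESS, SWAP_AGAIN, SWAP_FAIL, SWAP_MLOCK };

static const char *action(int ret)
{
	switch (ret) {
	case SWAP_SUCCESS:
		return "all mappings removed (unmap) / no vma holds it mlocked (munlock)";
	case SWAP_AGAIN:
		return "missed a mapping or could not take mmap_sem: try again later";
	case SWAP_FAIL:
		return "page is unswappable: give up on it for now";
	case SWAP_MLOCK:
		return "page is mapped in a VM_LOCKED vma: treat it as unevictable";
	default:
		return "unexpected";
	}
}

int main(void)
{
	for (int ret = SWAP_SUCCESS; ret <= SWAP_MLOCK; ret++)
		printf("%d: %s\n", ret, action(ret));
	return 0;
}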
diff --git a/mm/shmem.c b/mm/shmem.c index e2a6ae1a44e9..d38d7e61fcd0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -50,14 +50,12 @@ | |||
50 | #include <linux/migrate.h> | 50 | #include <linux/migrate.h> |
51 | #include <linux/highmem.h> | 51 | #include <linux/highmem.h> |
52 | #include <linux/seq_file.h> | 52 | #include <linux/seq_file.h> |
53 | #include <linux/magic.h> | ||
53 | 54 | ||
54 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
55 | #include <asm/div64.h> | 56 | #include <asm/div64.h> |
56 | #include <asm/pgtable.h> | 57 | #include <asm/pgtable.h> |
57 | 58 | ||
58 | /* This magic number is used in glibc for posix shared memory */ | ||
59 | #define TMPFS_MAGIC 0x01021994 | ||
60 | |||
61 | #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) | 59 | #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) |
62 | #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) | 60 | #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) |
63 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) | 61 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) |
@@ -201,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops; | |||
201 | 199 | ||
202 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | 200 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { |
203 | .ra_pages = 0, /* No readahead */ | 201 | .ra_pages = 0, /* No readahead */ |
204 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 202 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
205 | .unplug_io_fn = default_unplug_io_fn, | 203 | .unplug_io_fn = default_unplug_io_fn, |
206 | }; | 204 | }; |
207 | 205 | ||
@@ -922,20 +920,26 @@ found: | |||
922 | error = 1; | 920 | error = 1; |
923 | if (!inode) | 921 | if (!inode) |
924 | goto out; | 922 | goto out; |
925 | /* Precharge page while we can wait, compensate afterwards */ | 923 | /* Precharge page using GFP_KERNEL while we can wait */ |
926 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | 924 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); |
927 | if (error) | 925 | if (error) |
928 | goto out; | 926 | goto out; |
929 | error = radix_tree_preload(GFP_KERNEL); | 927 | error = radix_tree_preload(GFP_KERNEL); |
930 | if (error) | 928 | if (error) { |
931 | goto uncharge; | 929 | mem_cgroup_uncharge_cache_page(page); |
930 | goto out; | ||
931 | } | ||
932 | error = 1; | 932 | error = 1; |
933 | 933 | ||
934 | spin_lock(&info->lock); | 934 | spin_lock(&info->lock); |
935 | ptr = shmem_swp_entry(info, idx, NULL); | 935 | ptr = shmem_swp_entry(info, idx, NULL); |
936 | if (ptr && ptr->val == entry.val) | 936 | if (ptr && ptr->val == entry.val) { |
937 | error = add_to_page_cache(page, inode->i_mapping, | 937 | error = add_to_page_cache_locked(page, inode->i_mapping, |
938 | idx, GFP_NOWAIT); | 938 | idx, GFP_NOWAIT); |
939 | /* does mem_cgroup_uncharge_cache_page on error */ | ||
940 | } else /* we must compensate for our precharge above */ | ||
941 | mem_cgroup_uncharge_cache_page(page); | ||
942 | |||
939 | if (error == -EEXIST) { | 943 | if (error == -EEXIST) { |
940 | struct page *filepage = find_get_page(inode->i_mapping, idx); | 944 | struct page *filepage = find_get_page(inode->i_mapping, idx); |
941 | error = 1; | 945 | error = 1; |
@@ -961,8 +965,6 @@ found: | |||
961 | shmem_swp_unmap(ptr); | 965 | shmem_swp_unmap(ptr); |
962 | spin_unlock(&info->lock); | 966 | spin_unlock(&info->lock); |
963 | radix_tree_preload_end(); | 967 | radix_tree_preload_end(); |
964 | uncharge: | ||
965 | mem_cgroup_uncharge_page(page); | ||
966 | out: | 968 | out: |
967 | unlock_page(page); | 969 | unlock_page(page); |
968 | page_cache_release(page); | 970 | page_cache_release(page); |
@@ -1261,7 +1263,7 @@ repeat: | |||
1261 | } | 1263 | } |
1262 | 1264 | ||
1263 | /* We have to do this with page locked to prevent races */ | 1265 | /* We have to do this with page locked to prevent races */ |
1264 | if (TestSetPageLocked(swappage)) { | 1266 | if (!trylock_page(swappage)) { |
1265 | shmem_swp_unmap(entry); | 1267 | shmem_swp_unmap(entry); |
1266 | spin_unlock(&info->lock); | 1268 | spin_unlock(&info->lock); |
1267 | wait_on_page_locked(swappage); | 1269 | wait_on_page_locked(swappage); |
@@ -1297,8 +1299,8 @@ repeat: | |||
1297 | SetPageUptodate(filepage); | 1299 | SetPageUptodate(filepage); |
1298 | set_page_dirty(filepage); | 1300 | set_page_dirty(filepage); |
1299 | swap_free(swap); | 1301 | swap_free(swap); |
1300 | } else if (!(error = add_to_page_cache( | 1302 | } else if (!(error = add_to_page_cache_locked(swappage, mapping, |
1301 | swappage, mapping, idx, GFP_NOWAIT))) { | 1303 | idx, GFP_NOWAIT))) { |
1302 | info->flags |= SHMEM_PAGEIN; | 1304 | info->flags |= SHMEM_PAGEIN; |
1303 | shmem_swp_set(info, entry, 0); | 1305 | shmem_swp_set(info, entry, 0); |
1304 | shmem_swp_unmap(entry); | 1306 | shmem_swp_unmap(entry); |
@@ -1311,24 +1313,21 @@ repeat: | |||
1311 | shmem_swp_unmap(entry); | 1313 | shmem_swp_unmap(entry); |
1312 | spin_unlock(&info->lock); | 1314 | spin_unlock(&info->lock); |
1313 | unlock_page(swappage); | 1315 | unlock_page(swappage); |
1316 | page_cache_release(swappage); | ||
1314 | if (error == -ENOMEM) { | 1317 | if (error == -ENOMEM) { |
1315 | /* allow reclaim from this memory cgroup */ | 1318 | /* allow reclaim from this memory cgroup */ |
1316 | error = mem_cgroup_cache_charge(swappage, | 1319 | error = mem_cgroup_shrink_usage(current->mm, |
1317 | current->mm, gfp & ~__GFP_HIGHMEM); | 1320 | gfp); |
1318 | if (error) { | 1321 | if (error) |
1319 | page_cache_release(swappage); | ||
1320 | goto failed; | 1322 | goto failed; |
1321 | } | ||
1322 | mem_cgroup_uncharge_page(swappage); | ||
1323 | } | 1323 | } |
1324 | page_cache_release(swappage); | ||
1325 | goto repeat; | 1324 | goto repeat; |
1326 | } | 1325 | } |
1327 | } else if (sgp == SGP_READ && !filepage) { | 1326 | } else if (sgp == SGP_READ && !filepage) { |
1328 | shmem_swp_unmap(entry); | 1327 | shmem_swp_unmap(entry); |
1329 | filepage = find_get_page(mapping, idx); | 1328 | filepage = find_get_page(mapping, idx); |
1330 | if (filepage && | 1329 | if (filepage && |
1331 | (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { | 1330 | (!PageUptodate(filepage) || !trylock_page(filepage))) { |
1332 | spin_unlock(&info->lock); | 1331 | spin_unlock(&info->lock); |
1333 | wait_on_page_locked(filepage); | 1332 | wait_on_page_locked(filepage); |
1334 | page_cache_release(filepage); | 1333 | page_cache_release(filepage); |
@@ -1358,6 +1357,8 @@ repeat: | |||
1358 | } | 1357 | } |
1359 | 1358 | ||
1360 | if (!filepage) { | 1359 | if (!filepage) { |
1360 | int ret; | ||
1361 | |||
1361 | spin_unlock(&info->lock); | 1362 | spin_unlock(&info->lock); |
1362 | filepage = shmem_alloc_page(gfp, info, idx); | 1363 | filepage = shmem_alloc_page(gfp, info, idx); |
1363 | if (!filepage) { | 1364 | if (!filepage) { |
@@ -1366,6 +1367,7 @@ repeat: | |||
1366 | error = -ENOMEM; | 1367 | error = -ENOMEM; |
1367 | goto failed; | 1368 | goto failed; |
1368 | } | 1369 | } |
1370 | SetPageSwapBacked(filepage); | ||
1369 | 1371 | ||
1370 | /* Precharge page while we can wait, compensate after */ | 1372 | /* Precharge page while we can wait, compensate after */ |
1371 | error = mem_cgroup_cache_charge(filepage, current->mm, | 1373 | error = mem_cgroup_cache_charge(filepage, current->mm, |
@@ -1386,10 +1388,18 @@ repeat: | |||
1386 | swap = *entry; | 1388 | swap = *entry; |
1387 | shmem_swp_unmap(entry); | 1389 | shmem_swp_unmap(entry); |
1388 | } | 1390 | } |
1389 | if (error || swap.val || 0 != add_to_page_cache_lru( | 1391 | ret = error || swap.val; |
1390 | filepage, mapping, idx, GFP_NOWAIT)) { | 1392 | if (ret) |
1393 | mem_cgroup_uncharge_cache_page(filepage); | ||
1394 | else | ||
1395 | ret = add_to_page_cache_lru(filepage, mapping, | ||
1396 | idx, GFP_NOWAIT); | ||
1397 | /* | ||
1398 | * At add_to_page_cache_lru() failure, uncharge will | ||
1399 | * be done automatically. | ||
1400 | */ | ||
1401 | if (ret) { | ||
1391 | spin_unlock(&info->lock); | 1402 | spin_unlock(&info->lock); |
1392 | mem_cgroup_uncharge_page(filepage); | ||
1393 | page_cache_release(filepage); | 1403 | page_cache_release(filepage); |
1394 | shmem_unacct_blocks(info->flags, 1); | 1404 | shmem_unacct_blocks(info->flags, 1); |
1395 | shmem_free_blocks(inode, 1); | 1405 | shmem_free_blocks(inode, 1); |
@@ -1398,7 +1408,6 @@ repeat: | |||
1398 | goto failed; | 1408 | goto failed; |
1399 | goto repeat; | 1409 | goto repeat; |
1400 | } | 1410 | } |
1401 | mem_cgroup_uncharge_page(filepage); | ||
1402 | info->flags |= SHMEM_PAGEIN; | 1411 | info->flags |= SHMEM_PAGEIN; |
1403 | } | 1412 | } |
1404 | 1413 | ||
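The mm/shmem.c hunks above charge the freshly allocated page against the memory cgroup up front ("precharge while we can wait"), and afterwards either uncharge by hand when the page turns out not to be needed (an earlier error, or the swap entry reappeared) or let add_to_page_cache_lru() drop the charge itself when the insert fails. Below is a minimal, userspace-style sketch of that "charge early, compensate on every path that never consumes the charge" idea; all names here are illustrative stand-ins, not kernel API.

#include <stdbool.h>
#include <stdio.h>

static long charged;

static int charge(void)    { charged++; return 0; }
static void uncharge(void) { charged--; }

/* insert() is assumed to drop the charge itself when it fails. */
static int insert(bool fail)
{
        if (fail) {
                uncharge();             /* failure path consumes the charge */
                return -1;
        }
        return 0;                       /* success path keeps the charge */
}

static int add_page(bool race_lost, bool insert_fails)
{
        int err = charge();             /* precharge while we may still sleep */
        if (err)
                return err;

        if (race_lost) {                /* someone else instantiated the page */
                uncharge();             /* we never handed the charge on */
                return 0;
        }
        return insert(insert_fails);
}

int main(void)
{
        add_page(false, false);
        add_page(true, false);
        add_page(false, true);
        printf("outstanding charges: %ld\n", charged);  /* expect 1 */
        return 0;
}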
@@ -1468,12 +1477,16 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
1468 | if (!user_shm_lock(inode->i_size, user)) | 1477 | if (!user_shm_lock(inode->i_size, user)) |
1469 | goto out_nomem; | 1478 | goto out_nomem; |
1470 | info->flags |= VM_LOCKED; | 1479 | info->flags |= VM_LOCKED; |
1480 | mapping_set_unevictable(file->f_mapping); | ||
1471 | } | 1481 | } |
1472 | if (!lock && (info->flags & VM_LOCKED) && user) { | 1482 | if (!lock && (info->flags & VM_LOCKED) && user) { |
1473 | user_shm_unlock(inode->i_size, user); | 1483 | user_shm_unlock(inode->i_size, user); |
1474 | info->flags &= ~VM_LOCKED; | 1484 | info->flags &= ~VM_LOCKED; |
1485 | mapping_clear_unevictable(file->f_mapping); | ||
1486 | scan_mapping_unevictable_pages(file->f_mapping); | ||
1475 | } | 1487 | } |
1476 | retval = 0; | 1488 | retval = 0; |
1489 | |||
1477 | out_nomem: | 1490 | out_nomem: |
1478 | spin_unlock(&info->lock); | 1491 | spin_unlock(&info->lock); |
1479 | return retval; | 1492 | return retval; |
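In the shmem_lock() hunk above, SHM_LOCK now marks the whole mapping unevictable, and SHM_UNLOCK clears the flag and then rescans the mapping so pages already stranded on the unevictable list can be moved back to the normal LRUs. A toy sketch of that flag-plus-rescan idea, using made-up types rather than the kernel's mapping API:

#include <stdbool.h>
#include <stdio.h>

struct toy_mapping {
        bool unevictable;       /* stands in for the mapping-wide flag */
        int  stranded_pages;    /* pages still parked on the unevictable list */
};

static void toy_shm_lock(struct toy_mapping *m)
{
        m->unevictable = true;          /* new faults go straight to the
                                           unevictable list */
}

static void toy_shm_unlock(struct toy_mapping *m)
{
        m->unevictable = false;         /* future faults are evictable again */
        m->stranded_pages = 0;          /* rescan: move resident pages back */
}

int main(void)
{
        struct toy_mapping m = { .unevictable = false, .stranded_pages = 0 };

        toy_shm_lock(&m);
        m.stranded_pages = 3;           /* pages faulted in while locked */
        toy_shm_unlock(&m);
        printf("unevictable=%d stranded=%d\n", m.unevictable, m.stranded_pages);
        return 0;
}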
@@ -1503,7 +1516,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1503 | inode->i_uid = current->fsuid; | 1516 | inode->i_uid = current->fsuid; |
1504 | inode->i_gid = current->fsgid; | 1517 | inode->i_gid = current->fsgid; |
1505 | inode->i_blocks = 0; | 1518 | inode->i_blocks = 0; |
1506 | inode->i_mapping->a_ops = &shmem_aops; | ||
1507 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; | 1519 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; |
1508 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 1520 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
1509 | inode->i_generation = get_seconds(); | 1521 | inode->i_generation = get_seconds(); |
@@ -1518,6 +1530,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1518 | init_special_inode(inode, mode, dev); | 1530 | init_special_inode(inode, mode, dev); |
1519 | break; | 1531 | break; |
1520 | case S_IFREG: | 1532 | case S_IFREG: |
1533 | inode->i_mapping->a_ops = &shmem_aops; | ||
1521 | inode->i_op = &shmem_inode_operations; | 1534 | inode->i_op = &shmem_inode_operations; |
1522 | inode->i_fop = &shmem_file_operations; | 1535 | inode->i_fop = &shmem_file_operations; |
1523 | mpol_shared_policy_init(&info->policy, | 1536 | mpol_shared_policy_init(&info->policy, |
@@ -1690,26 +1703,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
1690 | file_accessed(filp); | 1703 | file_accessed(filp); |
1691 | } | 1704 | } |
1692 | 1705 | ||
1693 | static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) | 1706 | static ssize_t shmem_file_aio_read(struct kiocb *iocb, |
1707 | const struct iovec *iov, unsigned long nr_segs, loff_t pos) | ||
1694 | { | 1708 | { |
1695 | read_descriptor_t desc; | 1709 | struct file *filp = iocb->ki_filp; |
1710 | ssize_t retval; | ||
1711 | unsigned long seg; | ||
1712 | size_t count; | ||
1713 | loff_t *ppos = &iocb->ki_pos; | ||
1696 | 1714 | ||
1697 | if ((ssize_t) count < 0) | 1715 | retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); |
1698 | return -EINVAL; | 1716 | if (retval) |
1699 | if (!access_ok(VERIFY_WRITE, buf, count)) | 1717 | return retval; |
1700 | return -EFAULT; | ||
1701 | if (!count) | ||
1702 | return 0; | ||
1703 | 1718 | ||
1704 | desc.written = 0; | 1719 | for (seg = 0; seg < nr_segs; seg++) { |
1705 | desc.count = count; | 1720 | read_descriptor_t desc; |
1706 | desc.arg.buf = buf; | ||
1707 | desc.error = 0; | ||
1708 | 1721 | ||
1709 | do_shmem_file_read(filp, ppos, &desc, file_read_actor); | 1722 | desc.written = 0; |
1710 | if (desc.written) | 1723 | desc.arg.buf = iov[seg].iov_base; |
1711 | return desc.written; | 1724 | desc.count = iov[seg].iov_len; |
1712 | return desc.error; | 1725 | if (desc.count == 0) |
1726 | continue; | ||
1727 | desc.error = 0; | ||
1728 | do_shmem_file_read(filp, ppos, &desc, file_read_actor); | ||
1729 | retval += desc.written; | ||
1730 | if (desc.error) { | ||
1731 | retval = retval ?: desc.error; | ||
1732 | break; | ||
1733 | } | ||
1734 | if (desc.count > 0) | ||
1735 | break; | ||
1736 | } | ||
1737 | return retval; | ||
1713 | } | 1738 | } |
1714 | 1739 | ||
1715 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | 1740 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
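The new shmem_file_aio_read() above walks the iovec array one segment at a time, accumulating the bytes copied, and stops early either on an error (reporting the error only if nothing was copied at all) or on a short read. The same accumulation rule is easy to check in a standalone C sketch; read_segment() below is just a stand-in for the per-segment actor, not the kernel helper.

#include <stddef.h>
#include <stdio.h>

struct seg { size_t len; size_t copied; int error; };

/* Stand-in for the per-segment read actor; fields are pre-filled below. */
static void read_segment(struct seg *s) { (void)s; }

static long aio_read_like(struct seg *segs, int nr_segs)
{
        long retval = 0;
        int i;

        for (i = 0; i < nr_segs; i++) {
                struct seg *s = &segs[i];

                if (s->len == 0)
                        continue;               /* skip empty iovec entries */
                read_segment(s);
                retval += s->copied;
                if (s->error) {
                        /* report the error only if nothing was copied yet */
                        retval = retval ? retval : s->error;
                        break;
                }
                if (s->copied < s->len)
                        break;                  /* short read: stop here */
        }
        return retval;
}

int main(void)
{
        struct seg segs[] = {
                { .len = 10, .copied = 10, .error = 0 },
                { .len = 10, .copied =  4, .error = 0 },   /* short read */
                { .len = 10, .copied =  0, .error = -5 },  /* never reached */
        };
        printf("%ld\n", aio_read_like(segs, 3));           /* prints 14 */
        return 0;
}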
@@ -1907,6 +1932,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1907 | return error; | 1932 | return error; |
1908 | } | 1933 | } |
1909 | unlock_page(page); | 1934 | unlock_page(page); |
1935 | inode->i_mapping->a_ops = &shmem_aops; | ||
1910 | inode->i_op = &shmem_symlink_inode_operations; | 1936 | inode->i_op = &shmem_symlink_inode_operations; |
1911 | kaddr = kmap_atomic(page, KM_USER0); | 1937 | kaddr = kmap_atomic(page, KM_USER0); |
1912 | memcpy(kaddr, symname, len); | 1938 | memcpy(kaddr, symname, len); |
@@ -2330,7 +2356,7 @@ static void shmem_destroy_inode(struct inode *inode) | |||
2330 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2356 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); |
2331 | } | 2357 | } |
2332 | 2358 | ||
2333 | static void init_once(struct kmem_cache *cachep, void *foo) | 2359 | static void init_once(void *foo) |
2334 | { | 2360 | { |
2335 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2361 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; |
2336 | 2362 | ||
@@ -2369,8 +2395,9 @@ static const struct file_operations shmem_file_operations = { | |||
2369 | .mmap = shmem_mmap, | 2395 | .mmap = shmem_mmap, |
2370 | #ifdef CONFIG_TMPFS | 2396 | #ifdef CONFIG_TMPFS |
2371 | .llseek = generic_file_llseek, | 2397 | .llseek = generic_file_llseek, |
2372 | .read = shmem_file_read, | 2398 | .read = do_sync_read, |
2373 | .write = do_sync_write, | 2399 | .write = do_sync_write, |
2400 | .aio_read = shmem_file_aio_read, | ||
2374 | .aio_write = generic_file_aio_write, | 2401 | .aio_write = generic_file_aio_write, |
2375 | .fsync = simple_sync_file, | 2402 | .fsync = simple_sync_file, |
2376 | .splice_read = generic_file_splice_read, | 2403 | .splice_read = generic_file_splice_read, |
@@ -2558,6 +2585,7 @@ put_memory: | |||
2558 | shmem_unacct_size(flags, size); | 2585 | shmem_unacct_size(flags, size); |
2559 | return ERR_PTR(error); | 2586 | return ERR_PTR(error); |
2560 | } | 2587 | } |
2588 | EXPORT_SYMBOL_GPL(shmem_file_setup); | ||
2561 | 2589 | ||
2562 | /** | 2590 | /** |
2563 | * shmem_zero_setup - setup a shared anonymous mapping | 2591 | * shmem_zero_setup - setup a shared anonymous mapping |
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index f5664c5b9eb1..8e5aadd7dcd6 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c | |||
@@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask) | |||
191 | * shmem_permission - permission() inode operation | 191 | * shmem_permission - permission() inode operation |
192 | */ | 192 | */ |
193 | int | 193 | int |
194 | shmem_permission(struct inode *inode, int mask, struct nameidata *nd) | 194 | shmem_permission(struct inode *inode, int mask) |
195 | { | 195 | { |
196 | return generic_permission(inode, mask, shmem_check_acl); | 196 | return generic_permission(inode, mask, shmem_check_acl); |
197 | } | 197 | } |
@@ -95,6 +95,7 @@ | |||
95 | #include <linux/init.h> | 95 | #include <linux/init.h> |
96 | #include <linux/compiler.h> | 96 | #include <linux/compiler.h> |
97 | #include <linux/cpuset.h> | 97 | #include <linux/cpuset.h> |
98 | #include <linux/proc_fs.h> | ||
98 | #include <linux/seq_file.h> | 99 | #include <linux/seq_file.h> |
99 | #include <linux/notifier.h> | 100 | #include <linux/notifier.h> |
100 | #include <linux/kallsyms.h> | 101 | #include <linux/kallsyms.h> |
@@ -406,7 +407,7 @@ struct kmem_cache { | |||
406 | unsigned int dflags; /* dynamic flags */ | 407 | unsigned int dflags; /* dynamic flags */ |
407 | 408 | ||
408 | /* constructor func */ | 409 | /* constructor func */ |
409 | void (*ctor)(struct kmem_cache *, void *); | 410 | void (*ctor)(void *obj); |
410 | 411 | ||
411 | /* 5) cache creation/removal */ | 412 | /* 5) cache creation/removal */ |
412 | const char *name; | 413 | const char *name; |
@@ -2137,8 +2138,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
2137 | */ | 2138 | */ |
2138 | struct kmem_cache * | 2139 | struct kmem_cache * |
2139 | kmem_cache_create (const char *name, size_t size, size_t align, | 2140 | kmem_cache_create (const char *name, size_t size, size_t align, |
2140 | unsigned long flags, | 2141 | unsigned long flags, void (*ctor)(void *)) |
2141 | void (*ctor)(struct kmem_cache *, void *)) | ||
2142 | { | 2142 | { |
2143 | size_t left_over, slab_size, ralign; | 2143 | size_t left_over, slab_size, ralign; |
2144 | struct kmem_cache *cachep = NULL, *pc; | 2144 | struct kmem_cache *cachep = NULL, *pc; |
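This mm/slab.c hunk (and the matching ones in mm/slob.c and mm/slub.c further down) drops the struct kmem_cache * argument from slab constructors, so kmem_cache_create() now takes a plain void (*ctor)(void *). A short kernel-style usage sketch of the new prototype; struct foo and its cache name are illustrative only, the calls themselves are the ordinary slab API.

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>

struct foo {
        int state;
};

/* New-style constructor: only the object pointer is passed in. */
static void foo_ctor(void *obj)
{
        struct foo *f = obj;

        f->state = 0;
}

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
        foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
                                       0, SLAB_HWCACHE_ALIGN, foo_ctor);
        return foo_cachep ? 0 : -ENOMEM;
}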
@@ -2653,7 +2653,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2653 | * They must also be threaded. | 2653 | * They must also be threaded. |
2654 | */ | 2654 | */ |
2655 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2655 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
2656 | cachep->ctor(cachep, objp + obj_offset(cachep)); | 2656 | cachep->ctor(objp + obj_offset(cachep)); |
2657 | 2657 | ||
2658 | if (cachep->flags & SLAB_RED_ZONE) { | 2658 | if (cachep->flags & SLAB_RED_ZONE) { |
2659 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 2659 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
@@ -2669,7 +2669,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2669 | cachep->buffer_size / PAGE_SIZE, 0); | 2669 | cachep->buffer_size / PAGE_SIZE, 0); |
2670 | #else | 2670 | #else |
2671 | if (cachep->ctor) | 2671 | if (cachep->ctor) |
2672 | cachep->ctor(cachep, objp); | 2672 | cachep->ctor(objp); |
2673 | #endif | 2673 | #endif |
2674 | slab_bufctl(slabp)[i] = i + 1; | 2674 | slab_bufctl(slabp)[i] = i + 1; |
2675 | } | 2675 | } |
@@ -3093,7 +3093,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3093 | #endif | 3093 | #endif |
3094 | objp += obj_offset(cachep); | 3094 | objp += obj_offset(cachep); |
3095 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 3095 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
3096 | cachep->ctor(cachep, objp); | 3096 | cachep->ctor(objp); |
3097 | #if ARCH_SLAB_MINALIGN | 3097 | #if ARCH_SLAB_MINALIGN |
3098 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | 3098 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { |
3099 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | 3099 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", |
@@ -4259,7 +4259,7 @@ static int s_show(struct seq_file *m, void *p) | |||
4259 | * + further values on SMP and with statistics enabled | 4259 | * + further values on SMP and with statistics enabled |
4260 | */ | 4260 | */ |
4261 | 4261 | ||
4262 | const struct seq_operations slabinfo_op = { | 4262 | static const struct seq_operations slabinfo_op = { |
4263 | .start = s_start, | 4263 | .start = s_start, |
4264 | .next = s_next, | 4264 | .next = s_next, |
4265 | .stop = s_stop, | 4265 | .stop = s_stop, |
@@ -4316,6 +4316,19 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
4316 | return res; | 4316 | return res; |
4317 | } | 4317 | } |
4318 | 4318 | ||
4319 | static int slabinfo_open(struct inode *inode, struct file *file) | ||
4320 | { | ||
4321 | return seq_open(file, &slabinfo_op); | ||
4322 | } | ||
4323 | |||
4324 | static const struct file_operations proc_slabinfo_operations = { | ||
4325 | .open = slabinfo_open, | ||
4326 | .read = seq_read, | ||
4327 | .write = slabinfo_write, | ||
4328 | .llseek = seq_lseek, | ||
4329 | .release = seq_release, | ||
4330 | }; | ||
4331 | |||
4319 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 4332 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
4320 | 4333 | ||
4321 | static void *leaks_start(struct seq_file *m, loff_t *pos) | 4334 | static void *leaks_start(struct seq_file *m, loff_t *pos) |
@@ -4444,13 +4457,47 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4444 | return 0; | 4457 | return 0; |
4445 | } | 4458 | } |
4446 | 4459 | ||
4447 | const struct seq_operations slabstats_op = { | 4460 | static const struct seq_operations slabstats_op = { |
4448 | .start = leaks_start, | 4461 | .start = leaks_start, |
4449 | .next = s_next, | 4462 | .next = s_next, |
4450 | .stop = s_stop, | 4463 | .stop = s_stop, |
4451 | .show = leaks_show, | 4464 | .show = leaks_show, |
4452 | }; | 4465 | }; |
4466 | |||
4467 | static int slabstats_open(struct inode *inode, struct file *file) | ||
4468 | { | ||
4469 | unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); | ||
4470 | int ret = -ENOMEM; | ||
4471 | if (n) { | ||
4472 | ret = seq_open(file, &slabstats_op); | ||
4473 | if (!ret) { | ||
4474 | struct seq_file *m = file->private_data; | ||
4475 | *n = PAGE_SIZE / (2 * sizeof(unsigned long)); | ||
4476 | m->private = n; | ||
4477 | n = NULL; | ||
4478 | } | ||
4479 | kfree(n); | ||
4480 | } | ||
4481 | return ret; | ||
4482 | } | ||
4483 | |||
4484 | static const struct file_operations proc_slabstats_operations = { | ||
4485 | .open = slabstats_open, | ||
4486 | .read = seq_read, | ||
4487 | .llseek = seq_lseek, | ||
4488 | .release = seq_release_private, | ||
4489 | }; | ||
4490 | #endif | ||
4491 | |||
4492 | static int __init slab_proc_init(void) | ||
4493 | { | ||
4494 | proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); | ||
4495 | #ifdef CONFIG_DEBUG_SLAB_LEAK | ||
4496 | proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); | ||
4453 | #endif | 4497 | #endif |
4498 | return 0; | ||
4499 | } | ||
4500 | module_init(slab_proc_init); | ||
4454 | #endif | 4501 | #endif |
4455 | 4502 | ||
4456 | /** | 4503 | /** |
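The mm/slab.c hunk above makes slabinfo_op/slabstats_op static and registers the proc files from slab itself via proc_create(); slabstats_open() additionally hangs a kzalloc'd buffer off m->private and pairs that with seq_release_private so the buffer is freed on close. The sketch below shows the same open/release pairing in generic form; "myinfo" and its iterator are placeholders, only the seq_file and proc calls are real API.

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

/* Trivial single-record iterator; m->private is the per-open buffer. */
static void *myinfo_start(struct seq_file *m, loff_t *pos)
{
        return *pos ? NULL : m->private;
}

static void *myinfo_next(struct seq_file *m, void *v, loff_t *pos)
{
        (*pos)++;
        return NULL;
}

static void myinfo_stop(struct seq_file *m, void *v)
{
}

static int myinfo_show(struct seq_file *m, void *v)
{
        seq_printf(m, "per-open buffer at %p\n", v);
        return 0;
}

static const struct seq_operations myinfo_op = {
        .start  = myinfo_start,
        .next   = myinfo_next,
        .stop   = myinfo_stop,
        .show   = myinfo_show,
};

static int myinfo_open(struct inode *inode, struct file *file)
{
        /* Per-open scratch space, same pattern as slabstats_open(). */
        unsigned long *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
        int ret = -ENOMEM;

        if (buf) {
                ret = seq_open(file, &myinfo_op);
                if (!ret) {
                        struct seq_file *m = file->private_data;
                        m->private = buf;
                        buf = NULL;     /* ownership moved to the seq_file */
                }
                kfree(buf);             /* only non-NULL if seq_open() failed */
        }
        return ret;
}

static const struct file_operations proc_myinfo_operations = {
        .open           = myinfo_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_private, /* also frees m->private */
};

static int __init myinfo_proc_init(void)
{
        proc_create("myinfo", 0, NULL, &proc_myinfo_operations);
        return 0;
}
module_init(myinfo_proc_init);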
@@ -4473,4 +4520,3 @@ size_t ksize(const void *objp) | |||
4473 | 4520 | ||
4474 | return obj_size(virt_to_cache(objp)); | 4521 | return obj_size(virt_to_cache(objp)); |
4475 | } | 4522 | } |
4476 | EXPORT_SYMBOL(ksize); | ||
@@ -130,17 +130,17 @@ static LIST_HEAD(free_slob_large); | |||
130 | */ | 130 | */ |
131 | static inline int slob_page(struct slob_page *sp) | 131 | static inline int slob_page(struct slob_page *sp) |
132 | { | 132 | { |
133 | return test_bit(PG_active, &sp->flags); | 133 | return PageSlobPage((struct page *)sp); |
134 | } | 134 | } |
135 | 135 | ||
136 | static inline void set_slob_page(struct slob_page *sp) | 136 | static inline void set_slob_page(struct slob_page *sp) |
137 | { | 137 | { |
138 | __set_bit(PG_active, &sp->flags); | 138 | __SetPageSlobPage((struct page *)sp); |
139 | } | 139 | } |
140 | 140 | ||
141 | static inline void clear_slob_page(struct slob_page *sp) | 141 | static inline void clear_slob_page(struct slob_page *sp) |
142 | { | 142 | { |
143 | __clear_bit(PG_active, &sp->flags); | 143 | __ClearPageSlobPage((struct page *)sp); |
144 | } | 144 | } |
145 | 145 | ||
146 | /* | 146 | /* |
@@ -148,19 +148,19 @@ static inline void clear_slob_page(struct slob_page *sp) | |||
148 | */ | 148 | */ |
149 | static inline int slob_page_free(struct slob_page *sp) | 149 | static inline int slob_page_free(struct slob_page *sp) |
150 | { | 150 | { |
151 | return test_bit(PG_private, &sp->flags); | 151 | return PageSlobFree((struct page *)sp); |
152 | } | 152 | } |
153 | 153 | ||
154 | static void set_slob_page_free(struct slob_page *sp, struct list_head *list) | 154 | static void set_slob_page_free(struct slob_page *sp, struct list_head *list) |
155 | { | 155 | { |
156 | list_add(&sp->list, list); | 156 | list_add(&sp->list, list); |
157 | __set_bit(PG_private, &sp->flags); | 157 | __SetPageSlobFree((struct page *)sp); |
158 | } | 158 | } |
159 | 159 | ||
160 | static inline void clear_slob_page_free(struct slob_page *sp) | 160 | static inline void clear_slob_page_free(struct slob_page *sp) |
161 | { | 161 | { |
162 | list_del(&sp->list); | 162 | list_del(&sp->list); |
163 | __clear_bit(PG_private, &sp->flags); | 163 | __ClearPageSlobFree((struct page *)sp); |
164 | } | 164 | } |
165 | 165 | ||
166 | #define SLOB_UNIT sizeof(slob_t) | 166 | #define SLOB_UNIT sizeof(slob_t) |
@@ -514,23 +514,23 @@ size_t ksize(const void *block) | |||
514 | return 0; | 514 | return 0; |
515 | 515 | ||
516 | sp = (struct slob_page *)virt_to_page(block); | 516 | sp = (struct slob_page *)virt_to_page(block); |
517 | if (slob_page(sp)) | 517 | if (slob_page(sp)) { |
518 | return ((slob_t *)block - 1)->units + SLOB_UNIT; | 518 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
519 | else | 519 | unsigned int *m = (unsigned int *)(block - align); |
520 | return SLOB_UNITS(*m) * SLOB_UNIT; | ||
521 | } else | ||
520 | return sp->page.private; | 522 | return sp->page.private; |
521 | } | 523 | } |
522 | EXPORT_SYMBOL(ksize); | ||
523 | 524 | ||
524 | struct kmem_cache { | 525 | struct kmem_cache { |
525 | unsigned int size, align; | 526 | unsigned int size, align; |
526 | unsigned long flags; | 527 | unsigned long flags; |
527 | const char *name; | 528 | const char *name; |
528 | void (*ctor)(struct kmem_cache *, void *); | 529 | void (*ctor)(void *); |
529 | }; | 530 | }; |
530 | 531 | ||
531 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 532 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
532 | size_t align, unsigned long flags, | 533 | size_t align, unsigned long flags, void (*ctor)(void *)) |
533 | void (*ctor)(struct kmem_cache *, void *)) | ||
534 | { | 534 | { |
535 | struct kmem_cache *c; | 535 | struct kmem_cache *c; |
536 | 536 | ||
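The slob ksize() change above recovers a kmalloc'ed block's size from a small header stored just below the returned pointer, at an alignment-dependent offset (the max of ARCH_KMALLOC_MINALIGN and ARCH_SLAB_MINALIGN), instead of reading the slob_t bookkeeping directly. A self-contained sketch of that "size word in front of the aligned payload" layout; it illustrates the idea only and skips slob's unit accounting.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MIN_ALIGN 8  /* stand-in for max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN) */

/* Allocate size bytes, recording the request MIN_ALIGN bytes below the
 * pointer handed back to the caller. */
static void *toy_malloc(size_t size)
{
        unsigned char *base = malloc(size + MIN_ALIGN);

        if (!base)
                return NULL;
        memcpy(base, &size, sizeof(size));      /* size word at the front */
        return base + MIN_ALIGN;                /* payload stays aligned */
}

static size_t toy_ksize(const void *block)
{
        size_t size;

        memcpy(&size, (const unsigned char *)block - MIN_ALIGN, sizeof(size));
        return size;
}

static void toy_free(void *block)
{
        free((unsigned char *)block - MIN_ALIGN);
}

int main(void)
{
        void *p = toy_malloc(100);

        printf("%zu\n", toy_ksize(p));          /* prints 100 */
        toy_free(p);
        return 0;
}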
@@ -575,7 +575,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
575 | b = slob_new_page(flags, get_order(c->size), node); | 575 | b = slob_new_page(flags, get_order(c->size), node); |
576 | 576 | ||
577 | if (c->ctor) | 577 | if (c->ctor) |
578 | c->ctor(c, b); | 578 | c->ctor(b); |
579 | 579 | ||
580 | return b; | 580 | return b; |
581 | } | 581 | } |
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/interrupt.h> | 14 | #include <linux/interrupt.h> |
15 | #include <linux/bitops.h> | 15 | #include <linux/bitops.h> |
16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
17 | #include <linux/proc_fs.h> | ||
17 | #include <linux/seq_file.h> | 18 | #include <linux/seq_file.h> |
18 | #include <linux/cpu.h> | 19 | #include <linux/cpu.h> |
19 | #include <linux/cpuset.h> | 20 | #include <linux/cpuset.h> |
@@ -102,44 +103,12 @@ | |||
102 | * the fast path and disables lockless freelists. | 103 | * the fast path and disables lockless freelists. |
103 | */ | 104 | */ |
104 | 105 | ||
105 | #define FROZEN (1 << PG_active) | ||
106 | |||
107 | #ifdef CONFIG_SLUB_DEBUG | 106 | #ifdef CONFIG_SLUB_DEBUG |
108 | #define SLABDEBUG (1 << PG_error) | 107 | #define SLABDEBUG 1 |
109 | #else | 108 | #else |
110 | #define SLABDEBUG 0 | 109 | #define SLABDEBUG 0 |
111 | #endif | 110 | #endif |
112 | 111 | ||
113 | static inline int SlabFrozen(struct page *page) | ||
114 | { | ||
115 | return page->flags & FROZEN; | ||
116 | } | ||
117 | |||
118 | static inline void SetSlabFrozen(struct page *page) | ||
119 | { | ||
120 | page->flags |= FROZEN; | ||
121 | } | ||
122 | |||
123 | static inline void ClearSlabFrozen(struct page *page) | ||
124 | { | ||
125 | page->flags &= ~FROZEN; | ||
126 | } | ||
127 | |||
128 | static inline int SlabDebug(struct page *page) | ||
129 | { | ||
130 | return page->flags & SLABDEBUG; | ||
131 | } | ||
132 | |||
133 | static inline void SetSlabDebug(struct page *page) | ||
134 | { | ||
135 | page->flags |= SLABDEBUG; | ||
136 | } | ||
137 | |||
138 | static inline void ClearSlabDebug(struct page *page) | ||
139 | { | ||
140 | page->flags &= ~SLABDEBUG; | ||
141 | } | ||
142 | |||
143 | /* | 112 | /* |
144 | * Issues still to be resolved: | 113 | * Issues still to be resolved: |
145 | * | 114 | * |
@@ -492,7 +461,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
492 | if (p > addr + 16) | 461 | if (p > addr + 16) |
493 | print_section("Bytes b4", p - 16, 16); | 462 | print_section("Bytes b4", p - 16, 16); |
494 | 463 | ||
495 | print_section("Object", p, min(s->objsize, 128)); | 464 | print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); |
496 | 465 | ||
497 | if (s->flags & SLAB_RED_ZONE) | 466 | if (s->flags & SLAB_RED_ZONE) |
498 | print_section("Redzone", p + s->objsize, | 467 | print_section("Redzone", p + s->objsize, |
@@ -971,7 +940,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, | |||
971 | } | 940 | } |
972 | 941 | ||
973 | /* Special debug activities for freeing objects */ | 942 | /* Special debug activities for freeing objects */ |
974 | if (!SlabFrozen(page) && !page->freelist) | 943 | if (!PageSlubFrozen(page) && !page->freelist) |
975 | remove_full(s, page); | 944 | remove_full(s, page); |
976 | if (s->flags & SLAB_STORE_USER) | 945 | if (s->flags & SLAB_STORE_USER) |
977 | set_track(s, object, TRACK_FREE, addr); | 946 | set_track(s, object, TRACK_FREE, addr); |
@@ -1044,7 +1013,7 @@ __setup("slub_debug", setup_slub_debug); | |||
1044 | 1013 | ||
1045 | static unsigned long kmem_cache_flags(unsigned long objsize, | 1014 | static unsigned long kmem_cache_flags(unsigned long objsize, |
1046 | unsigned long flags, const char *name, | 1015 | unsigned long flags, const char *name, |
1047 | void (*ctor)(struct kmem_cache *, void *)) | 1016 | void (*ctor)(void *)) |
1048 | { | 1017 | { |
1049 | /* | 1018 | /* |
1050 | * Enable debugging if selected on the kernel commandline. | 1019 | * Enable debugging if selected on the kernel commandline. |
@@ -1072,7 +1041,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, | |||
1072 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | 1041 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} |
1073 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1042 | static inline unsigned long kmem_cache_flags(unsigned long objsize, |
1074 | unsigned long flags, const char *name, | 1043 | unsigned long flags, const char *name, |
1075 | void (*ctor)(struct kmem_cache *, void *)) | 1044 | void (*ctor)(void *)) |
1076 | { | 1045 | { |
1077 | return flags; | 1046 | return flags; |
1078 | } | 1047 | } |
@@ -1135,7 +1104,7 @@ static void setup_object(struct kmem_cache *s, struct page *page, | |||
1135 | { | 1104 | { |
1136 | setup_object_debug(s, page, object); | 1105 | setup_object_debug(s, page, object); |
1137 | if (unlikely(s->ctor)) | 1106 | if (unlikely(s->ctor)) |
1138 | s->ctor(s, object); | 1107 | s->ctor(object); |
1139 | } | 1108 | } |
1140 | 1109 | ||
1141 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | 1110 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) |
@@ -1157,7 +1126,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1157 | page->flags |= 1 << PG_slab; | 1126 | page->flags |= 1 << PG_slab; |
1158 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | | 1127 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | |
1159 | SLAB_STORE_USER | SLAB_TRACE)) | 1128 | SLAB_STORE_USER | SLAB_TRACE)) |
1160 | SetSlabDebug(page); | 1129 | __SetPageSlubDebug(page); |
1161 | 1130 | ||
1162 | start = page_address(page); | 1131 | start = page_address(page); |
1163 | 1132 | ||
@@ -1184,14 +1153,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1184 | int order = compound_order(page); | 1153 | int order = compound_order(page); |
1185 | int pages = 1 << order; | 1154 | int pages = 1 << order; |
1186 | 1155 | ||
1187 | if (unlikely(SlabDebug(page))) { | 1156 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) { |
1188 | void *p; | 1157 | void *p; |
1189 | 1158 | ||
1190 | slab_pad_check(s, page); | 1159 | slab_pad_check(s, page); |
1191 | for_each_object(p, s, page_address(page), | 1160 | for_each_object(p, s, page_address(page), |
1192 | page->objects) | 1161 | page->objects) |
1193 | check_object(s, page, p, 0); | 1162 | check_object(s, page, p, 0); |
1194 | ClearSlabDebug(page); | 1163 | __ClearPageSlubDebug(page); |
1195 | } | 1164 | } |
1196 | 1165 | ||
1197 | mod_zone_page_state(page_zone(page), | 1166 | mod_zone_page_state(page_zone(page), |
@@ -1288,7 +1257,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, | |||
1288 | if (slab_trylock(page)) { | 1257 | if (slab_trylock(page)) { |
1289 | list_del(&page->lru); | 1258 | list_del(&page->lru); |
1290 | n->nr_partial--; | 1259 | n->nr_partial--; |
1291 | SetSlabFrozen(page); | 1260 | __SetPageSlubFrozen(page); |
1292 | return 1; | 1261 | return 1; |
1293 | } | 1262 | } |
1294 | return 0; | 1263 | return 0; |
@@ -1361,7 +1330,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1361 | n = get_node(s, zone_to_nid(zone)); | 1330 | n = get_node(s, zone_to_nid(zone)); |
1362 | 1331 | ||
1363 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1332 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1364 | n->nr_partial > MIN_PARTIAL) { | 1333 | n->nr_partial > n->min_partial) { |
1365 | page = get_partial_node(n); | 1334 | page = get_partial_node(n); |
1366 | if (page) | 1335 | if (page) |
1367 | return page; | 1336 | return page; |
@@ -1398,7 +1367,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | |||
1398 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | 1367 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1399 | struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); | 1368 | struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); |
1400 | 1369 | ||
1401 | ClearSlabFrozen(page); | 1370 | __ClearPageSlubFrozen(page); |
1402 | if (page->inuse) { | 1371 | if (page->inuse) { |
1403 | 1372 | ||
1404 | if (page->freelist) { | 1373 | if (page->freelist) { |
@@ -1406,13 +1375,14 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | |||
1406 | stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | 1375 | stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); |
1407 | } else { | 1376 | } else { |
1408 | stat(c, DEACTIVATE_FULL); | 1377 | stat(c, DEACTIVATE_FULL); |
1409 | if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) | 1378 | if (SLABDEBUG && PageSlubDebug(page) && |
1379 | (s->flags & SLAB_STORE_USER)) | ||
1410 | add_full(n, page); | 1380 | add_full(n, page); |
1411 | } | 1381 | } |
1412 | slab_unlock(page); | 1382 | slab_unlock(page); |
1413 | } else { | 1383 | } else { |
1414 | stat(c, DEACTIVATE_EMPTY); | 1384 | stat(c, DEACTIVATE_EMPTY); |
1415 | if (n->nr_partial < MIN_PARTIAL) { | 1385 | if (n->nr_partial < n->min_partial) { |
1416 | /* | 1386 | /* |
1417 | * Adding an empty slab to the partial slabs in order | 1387 | * Adding an empty slab to the partial slabs in order |
1418 | * to avoid page allocator overhead. This slab needs | 1388 | * to avoid page allocator overhead. This slab needs |
@@ -1495,15 +1465,7 @@ static void flush_cpu_slab(void *d) | |||
1495 | 1465 | ||
1496 | static void flush_all(struct kmem_cache *s) | 1466 | static void flush_all(struct kmem_cache *s) |
1497 | { | 1467 | { |
1498 | #ifdef CONFIG_SMP | ||
1499 | on_each_cpu(flush_cpu_slab, s, 1); | 1468 | on_each_cpu(flush_cpu_slab, s, 1); |
1500 | #else | ||
1501 | unsigned long flags; | ||
1502 | |||
1503 | local_irq_save(flags); | ||
1504 | flush_cpu_slab(s); | ||
1505 | local_irq_restore(flags); | ||
1506 | #endif | ||
1507 | } | 1469 | } |
1508 | 1470 | ||
1509 | /* | 1471 | /* |
@@ -1559,7 +1521,7 @@ load_freelist: | |||
1559 | object = c->page->freelist; | 1521 | object = c->page->freelist; |
1560 | if (unlikely(!object)) | 1522 | if (unlikely(!object)) |
1561 | goto another_slab; | 1523 | goto another_slab; |
1562 | if (unlikely(SlabDebug(c->page))) | 1524 | if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) |
1563 | goto debug; | 1525 | goto debug; |
1564 | 1526 | ||
1565 | c->freelist = object[c->offset]; | 1527 | c->freelist = object[c->offset]; |
@@ -1596,7 +1558,7 @@ new_slab: | |||
1596 | if (c->page) | 1558 | if (c->page) |
1597 | flush_slab(s, c); | 1559 | flush_slab(s, c); |
1598 | slab_lock(new); | 1560 | slab_lock(new); |
1599 | SetSlabFrozen(new); | 1561 | __SetPageSlubFrozen(new); |
1600 | c->page = new; | 1562 | c->page = new; |
1601 | goto load_freelist; | 1563 | goto load_freelist; |
1602 | } | 1564 | } |
@@ -1682,7 +1644,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
1682 | stat(c, FREE_SLOWPATH); | 1644 | stat(c, FREE_SLOWPATH); |
1683 | slab_lock(page); | 1645 | slab_lock(page); |
1684 | 1646 | ||
1685 | if (unlikely(SlabDebug(page))) | 1647 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) |
1686 | goto debug; | 1648 | goto debug; |
1687 | 1649 | ||
1688 | checks_ok: | 1650 | checks_ok: |
@@ -1690,7 +1652,7 @@ checks_ok: | |||
1690 | page->freelist = object; | 1652 | page->freelist = object; |
1691 | page->inuse--; | 1653 | page->inuse--; |
1692 | 1654 | ||
1693 | if (unlikely(SlabFrozen(page))) { | 1655 | if (unlikely(PageSlubFrozen(page))) { |
1694 | stat(c, FREE_FROZEN); | 1656 | stat(c, FREE_FROZEN); |
1695 | goto out_unlock; | 1657 | goto out_unlock; |
1696 | } | 1658 | } |
@@ -1952,13 +1914,26 @@ static void init_kmem_cache_cpu(struct kmem_cache *s, | |||
1952 | #endif | 1914 | #endif |
1953 | } | 1915 | } |
1954 | 1916 | ||
1955 | static void init_kmem_cache_node(struct kmem_cache_node *n) | 1917 | static void |
1918 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | ||
1956 | { | 1919 | { |
1957 | n->nr_partial = 0; | 1920 | n->nr_partial = 0; |
1921 | |||
1922 | /* | ||
1923 | * The larger the object size is, the more pages we want on the partial | ||
1924 | * list to avoid pounding the page allocator excessively. | ||
1925 | */ | ||
1926 | n->min_partial = ilog2(s->size); | ||
1927 | if (n->min_partial < MIN_PARTIAL) | ||
1928 | n->min_partial = MIN_PARTIAL; | ||
1929 | else if (n->min_partial > MAX_PARTIAL) | ||
1930 | n->min_partial = MAX_PARTIAL; | ||
1931 | |||
1958 | spin_lock_init(&n->list_lock); | 1932 | spin_lock_init(&n->list_lock); |
1959 | INIT_LIST_HEAD(&n->partial); | 1933 | INIT_LIST_HEAD(&n->partial); |
1960 | #ifdef CONFIG_SLUB_DEBUG | 1934 | #ifdef CONFIG_SLUB_DEBUG |
1961 | atomic_long_set(&n->nr_slabs, 0); | 1935 | atomic_long_set(&n->nr_slabs, 0); |
1936 | atomic_long_set(&n->total_objects, 0); | ||
1962 | INIT_LIST_HEAD(&n->full); | 1937 | INIT_LIST_HEAD(&n->full); |
1963 | #endif | 1938 | #endif |
1964 | } | 1939 | } |
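The init_kmem_cache_node() hunk above makes the per-node partial-list floor depend on object size: min_partial = ilog2(s->size), clamped to [MIN_PARTIAL, MAX_PARTIAL], so caches with large objects keep more partial slabs around and go back to the page allocator less often. A quick standalone check of that formula; the clamp bounds used here (5 and 10) are assumed values, since the defines themselves are not visible in this hunk.

#include <stdio.h>

#define MIN_PARTIAL 5   /* assumed clamp bounds; not shown in this hunk */
#define MAX_PARTIAL 10

static unsigned int ilog2(unsigned long n)
{
        unsigned int log = 0;

        while (n >>= 1)
                log++;
        return log;
}

static unsigned long min_partial(unsigned long size)
{
        unsigned long n = ilog2(size);

        if (n < MIN_PARTIAL)
                n = MIN_PARTIAL;
        else if (n > MAX_PARTIAL)
                n = MAX_PARTIAL;
        return n;
}

int main(void)
{
        unsigned long sizes[] = { 8, 64, 256, 4096, 65536 };
        int i;

        for (i = 0; i < 5; i++)
                printf("size %6lu -> min_partial %lu\n",
                       sizes[i], min_partial(sizes[i]));
        /* 8 -> 5, 64 -> 6, 256 -> 8, 4096 -> 10 (clamped), 65536 -> 10 */
        return 0;
}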
@@ -2126,7 +2101,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, | |||
2126 | init_object(kmalloc_caches, n, 1); | 2101 | init_object(kmalloc_caches, n, 1); |
2127 | init_tracking(kmalloc_caches, n); | 2102 | init_tracking(kmalloc_caches, n); |
2128 | #endif | 2103 | #endif |
2129 | init_kmem_cache_node(n); | 2104 | init_kmem_cache_node(n, kmalloc_caches); |
2130 | inc_slabs_node(kmalloc_caches, node, page->objects); | 2105 | inc_slabs_node(kmalloc_caches, node, page->objects); |
2131 | 2106 | ||
2132 | /* | 2107 | /* |
@@ -2183,7 +2158,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | |||
2183 | 2158 | ||
2184 | } | 2159 | } |
2185 | s->node[node] = n; | 2160 | s->node[node] = n; |
2186 | init_kmem_cache_node(n); | 2161 | init_kmem_cache_node(n, s); |
2187 | } | 2162 | } |
2188 | return 1; | 2163 | return 1; |
2189 | } | 2164 | } |
@@ -2194,7 +2169,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) | |||
2194 | 2169 | ||
2195 | static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | 2170 | static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) |
2196 | { | 2171 | { |
2197 | init_kmem_cache_node(&s->local_node); | 2172 | init_kmem_cache_node(&s->local_node, s); |
2198 | return 1; | 2173 | return 1; |
2199 | } | 2174 | } |
2200 | #endif | 2175 | #endif |
@@ -2325,7 +2300,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2325 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | 2300 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, |
2326 | const char *name, size_t size, | 2301 | const char *name, size_t size, |
2327 | size_t align, unsigned long flags, | 2302 | size_t align, unsigned long flags, |
2328 | void (*ctor)(struct kmem_cache *, void *)) | 2303 | void (*ctor)(void *)) |
2329 | { | 2304 | { |
2330 | memset(s, 0, kmem_size); | 2305 | memset(s, 0, kmem_size); |
2331 | s->name = name; | 2306 | s->name = name; |
@@ -2339,7 +2314,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
2339 | 2314 | ||
2340 | s->refcount = 1; | 2315 | s->refcount = 1; |
2341 | #ifdef CONFIG_NUMA | 2316 | #ifdef CONFIG_NUMA |
2342 | s->remote_node_defrag_ratio = 100; | 2317 | s->remote_node_defrag_ratio = 1000; |
2343 | #endif | 2318 | #endif |
2344 | if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) | 2319 | if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) |
2345 | goto error; | 2320 | goto error; |
@@ -2754,7 +2729,6 @@ size_t ksize(const void *object) | |||
2754 | */ | 2729 | */ |
2755 | return s->size; | 2730 | return s->size; |
2756 | } | 2731 | } |
2757 | EXPORT_SYMBOL(ksize); | ||
2758 | 2732 | ||
2759 | void kfree(const void *x) | 2733 | void kfree(const void *x) |
2760 | { | 2734 | { |
@@ -2929,7 +2903,7 @@ static int slab_mem_going_online_callback(void *arg) | |||
2929 | ret = -ENOMEM; | 2903 | ret = -ENOMEM; |
2930 | goto out; | 2904 | goto out; |
2931 | } | 2905 | } |
2932 | init_kmem_cache_node(n); | 2906 | init_kmem_cache_node(n, s); |
2933 | s->node[nid] = n; | 2907 | s->node[nid] = n; |
2934 | } | 2908 | } |
2935 | out: | 2909 | out: |
@@ -3081,7 +3055,7 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
3081 | 3055 | ||
3082 | static struct kmem_cache *find_mergeable(size_t size, | 3056 | static struct kmem_cache *find_mergeable(size_t size, |
3083 | size_t align, unsigned long flags, const char *name, | 3057 | size_t align, unsigned long flags, const char *name, |
3084 | void (*ctor)(struct kmem_cache *, void *)) | 3058 | void (*ctor)(void *)) |
3085 | { | 3059 | { |
3086 | struct kmem_cache *s; | 3060 | struct kmem_cache *s; |
3087 | 3061 | ||
@@ -3121,8 +3095,7 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
3121 | } | 3095 | } |
3122 | 3096 | ||
3123 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 3097 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
3124 | size_t align, unsigned long flags, | 3098 | size_t align, unsigned long flags, void (*ctor)(void *)) |
3125 | void (*ctor)(struct kmem_cache *, void *)) | ||
3126 | { | 3099 | { |
3127 | struct kmem_cache *s; | 3100 | struct kmem_cache *s; |
3128 | 3101 | ||
@@ -3325,12 +3298,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page, | |||
3325 | s->name, page); | 3298 | s->name, page); |
3326 | 3299 | ||
3327 | if (s->flags & DEBUG_DEFAULT_FLAGS) { | 3300 | if (s->flags & DEBUG_DEFAULT_FLAGS) { |
3328 | if (!SlabDebug(page)) | 3301 | if (!PageSlubDebug(page)) |
3329 | printk(KERN_ERR "SLUB %s: SlabDebug not set " | 3302 | printk(KERN_ERR "SLUB %s: SlubDebug not set " |
3330 | "on slab 0x%p\n", s->name, page); | 3303 | "on slab 0x%p\n", s->name, page); |
3331 | } else { | 3304 | } else { |
3332 | if (SlabDebug(page)) | 3305 | if (PageSlubDebug(page)) |
3333 | printk(KERN_ERR "SLUB %s: SlabDebug set on " | 3306 | printk(KERN_ERR "SLUB %s: SlubDebug set on " |
3334 | "slab 0x%p\n", s->name, page); | 3307 | "slab 0x%p\n", s->name, page); |
3335 | } | 3308 | } |
3336 | } | 3309 | } |
@@ -4087,7 +4060,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, | |||
4087 | if (err) | 4060 | if (err) |
4088 | return err; | 4061 | return err; |
4089 | 4062 | ||
4090 | if (ratio < 100) | 4063 | if (ratio <= 100) |
4091 | s->remote_node_defrag_ratio = ratio * 10; | 4064 | s->remote_node_defrag_ratio = ratio * 10; |
4092 | 4065 | ||
4093 | return length; | 4066 | return length; |
@@ -4445,14 +4418,6 @@ __initcall(slab_sysfs_init); | |||
4445 | * The /proc/slabinfo ABI | 4418 | * The /proc/slabinfo ABI |
4446 | */ | 4419 | */ |
4447 | #ifdef CONFIG_SLABINFO | 4420 | #ifdef CONFIG_SLABINFO |
4448 | |||
4449 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, | ||
4450 | size_t count, loff_t *ppos) | ||
4451 | { | ||
4452 | return -EINVAL; | ||
4453 | } | ||
4454 | |||
4455 | |||
4456 | static void print_slabinfo_header(struct seq_file *m) | 4421 | static void print_slabinfo_header(struct seq_file *m) |
4457 | { | 4422 | { |
4458 | seq_puts(m, "slabinfo - version: 2.1\n"); | 4423 | seq_puts(m, "slabinfo - version: 2.1\n"); |
@@ -4520,11 +4485,29 @@ static int s_show(struct seq_file *m, void *p) | |||
4520 | return 0; | 4485 | return 0; |
4521 | } | 4486 | } |
4522 | 4487 | ||
4523 | const struct seq_operations slabinfo_op = { | 4488 | static const struct seq_operations slabinfo_op = { |
4524 | .start = s_start, | 4489 | .start = s_start, |
4525 | .next = s_next, | 4490 | .next = s_next, |
4526 | .stop = s_stop, | 4491 | .stop = s_stop, |
4527 | .show = s_show, | 4492 | .show = s_show, |
4528 | }; | 4493 | }; |
4529 | 4494 | ||
4495 | static int slabinfo_open(struct inode *inode, struct file *file) | ||
4496 | { | ||
4497 | return seq_open(file, &slabinfo_op); | ||
4498 | } | ||
4499 | |||
4500 | static const struct file_operations proc_slabinfo_operations = { | ||
4501 | .open = slabinfo_open, | ||
4502 | .read = seq_read, | ||
4503 | .llseek = seq_lseek, | ||
4504 | .release = seq_release, | ||
4505 | }; | ||
4506 | |||
4507 | static int __init slab_proc_init(void) | ||
4508 | { | ||
4509 | proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); | ||
4510 | return 0; | ||
4511 | } | ||
4512 | module_init(slab_proc_init); | ||
4530 | #endif /* CONFIG_SLABINFO */ | 4513 | #endif /* CONFIG_SLABINFO */ |
diff --git a/mm/sparse.c b/mm/sparse.c index 36511c7b5e2c..39db301b920d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -147,22 +147,41 @@ static inline int sparse_early_nid(struct mem_section *section) | |||
147 | return (section->section_mem_map >> SECTION_NID_SHIFT); | 147 | return (section->section_mem_map >> SECTION_NID_SHIFT); |
148 | } | 148 | } |
149 | 149 | ||
150 | /* Record a memory area against a node. */ | 150 | /* Validate the physical addressing limitations of the model */ |
151 | void __init memory_present(int nid, unsigned long start, unsigned long end) | 151 | void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, |
152 | unsigned long *end_pfn) | ||
152 | { | 153 | { |
153 | unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); | 154 | unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); |
154 | unsigned long pfn; | ||
155 | 155 | ||
156 | /* | 156 | /* |
157 | * Sanity checks - do not allow an architecture to pass | 157 | * Sanity checks - do not allow an architecture to pass |
158 | * in larger pfns than the maximum scope of sparsemem: | 158 | * in larger pfns than the maximum scope of sparsemem: |
159 | */ | 159 | */ |
160 | if (start >= max_arch_pfn) | 160 | if (*start_pfn > max_sparsemem_pfn) { |
161 | return; | 161 | mminit_dprintk(MMINIT_WARNING, "pfnvalidation", |
162 | if (end >= max_arch_pfn) | 162 | "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n", |
163 | end = max_arch_pfn; | 163 | *start_pfn, *end_pfn, max_sparsemem_pfn); |
164 | WARN_ON_ONCE(1); | ||
165 | *start_pfn = max_sparsemem_pfn; | ||
166 | *end_pfn = max_sparsemem_pfn; | ||
167 | } | ||
168 | |||
169 | if (*end_pfn > max_sparsemem_pfn) { | ||
170 | mminit_dprintk(MMINIT_WARNING, "pfnvalidation", | ||
171 | "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n", | ||
172 | *start_pfn, *end_pfn, max_sparsemem_pfn); | ||
173 | WARN_ON_ONCE(1); | ||
174 | *end_pfn = max_sparsemem_pfn; | ||
175 | } | ||
176 | } | ||
177 | |||
178 | /* Record a memory area against a node. */ | ||
179 | void __init memory_present(int nid, unsigned long start, unsigned long end) | ||
180 | { | ||
181 | unsigned long pfn; | ||
164 | 182 | ||
165 | start &= PAGE_SECTION_MASK; | 183 | start &= PAGE_SECTION_MASK; |
184 | mminit_validate_memmodel_limits(&start, &end); | ||
166 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { | 185 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { |
167 | unsigned long section = pfn_to_section_nr(pfn); | 186 | unsigned long section = pfn_to_section_nr(pfn); |
168 | struct mem_section *ms; | 187 | struct mem_section *ms; |
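The new mminit_validate_memmodel_limits() above clamps a pfn range to the maximum pfn SPARSEMEM can address, 1UL << (MAX_PHYSMEM_BITS - PAGE_SHIFT), and warns once per offending range instead of silently truncating as the old memory_present() checks did. The clamp itself is simple enough to show standalone; the bit widths below are illustrative values, not any particular architecture's.

#include <stdio.h>

#define MAX_PHYSMEM_BITS 46     /* illustrative, arch-dependent */
#define PAGE_SHIFT       12

static void validate_limits(unsigned long *start_pfn, unsigned long *end_pfn)
{
        unsigned long max_pfn = 1UL << (MAX_PHYSMEM_BITS - PAGE_SHIFT);

        if (*start_pfn > max_pfn) {
                fprintf(stderr, "start %lu beyond sparsemem max %lu\n",
                        *start_pfn, max_pfn);
                *start_pfn = max_pfn;   /* empty the range entirely */
                *end_pfn = max_pfn;
        }
        if (*end_pfn > max_pfn) {
                fprintf(stderr, "end %lu beyond sparsemem max %lu\n",
                        *end_pfn, max_pfn);
                *end_pfn = max_pfn;     /* keep only the addressable part */
        }
}

int main(void)
{
        unsigned long start = 0, end = 1UL << 40;

        validate_limits(&start, &end);
        printf("clamped range: %lu -> %lu\n", start, end);
        return 0;
}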
@@ -187,6 +206,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, | |||
187 | unsigned long pfn; | 206 | unsigned long pfn; |
188 | unsigned long nr_pages = 0; | 207 | unsigned long nr_pages = 0; |
189 | 208 | ||
209 | mminit_validate_memmodel_limits(&start_pfn, &end_pfn); | ||
190 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | 210 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { |
191 | if (nid != early_pfn_to_nid(pfn)) | 211 | if (nid != early_pfn_to_nid(pfn)) |
192 | continue; | 212 | continue; |
@@ -248,16 +268,92 @@ static unsigned long *__kmalloc_section_usemap(void) | |||
248 | } | 268 | } |
249 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 269 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
250 | 270 | ||
271 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
272 | static unsigned long * __init | ||
273 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | ||
274 | { | ||
275 | unsigned long section_nr; | ||
276 | |||
277 | /* | ||
278 | * A page may contain usemaps for other sections preventing the | ||
279 | * page being freed and making a section unremovable while | ||
280 | * other sections referencing the usemap remain active. Similarly, | ||
281 | * a pgdat can prevent a section being removed. If section A | ||
282 | * contains a pgdat and section B contains the usemap, both | ||
283 | * sections become inter-dependent. This allocates usemaps | ||
284 | * from the same section as the pgdat where possible to avoid | ||
285 | * this problem. | ||
286 | */ | ||
287 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); | ||
288 | return alloc_bootmem_section(usemap_size(), section_nr); | ||
289 | } | ||
290 | |||
291 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | ||
292 | { | ||
293 | unsigned long usemap_snr, pgdat_snr; | ||
294 | static unsigned long old_usemap_snr = NR_MEM_SECTIONS; | ||
295 | static unsigned long old_pgdat_snr = NR_MEM_SECTIONS; | ||
296 | struct pglist_data *pgdat = NODE_DATA(nid); | ||
297 | int usemap_nid; | ||
298 | |||
299 | usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT); | ||
300 | pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); | ||
301 | if (usemap_snr == pgdat_snr) | ||
302 | return; | ||
303 | |||
304 | if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr) | ||
305 | /* skip redundant message */ | ||
306 | return; | ||
307 | |||
308 | old_usemap_snr = usemap_snr; | ||
309 | old_pgdat_snr = pgdat_snr; | ||
310 | |||
311 | usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr)); | ||
312 | if (usemap_nid != nid) { | ||
313 | printk(KERN_INFO | ||
314 | "node %d must be removed before remove section %ld\n", | ||
315 | nid, usemap_snr); | ||
316 | return; | ||
317 | } | ||
318 | /* | ||
319 | * There is a circular dependency. | ||
320 | * Some platforms allow un-removable section because they will just | ||
321 | * gather other removable sections for dynamic partitioning. | ||
322 | * Just notify un-removable section's number here. | ||
323 | */ | ||
324 | printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr, | ||
325 | pgdat_snr, nid); | ||
326 | printk(KERN_CONT | ||
327 | " have a circular dependency on usemap and pgdat allocations\n"); | ||
328 | } | ||
329 | #else | ||
330 | static unsigned long * __init | ||
331 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | ||
332 | { | ||
333 | return NULL; | ||
334 | } | ||
335 | |||
336 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | ||
337 | { | ||
338 | } | ||
339 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
340 | |||
251 | static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) | 341 | static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) |
252 | { | 342 | { |
253 | unsigned long *usemap; | 343 | unsigned long *usemap; |
254 | struct mem_section *ms = __nr_to_section(pnum); | 344 | struct mem_section *ms = __nr_to_section(pnum); |
255 | int nid = sparse_early_nid(ms); | 345 | int nid = sparse_early_nid(ms); |
256 | 346 | ||
257 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | 347 | usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); |
258 | if (usemap) | 348 | if (usemap) |
259 | return usemap; | 349 | return usemap; |
260 | 350 | ||
351 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | ||
352 | if (usemap) { | ||
353 | check_usemap_section_nr(nid, usemap); | ||
354 | return usemap; | ||
355 | } | ||
356 | |||
261 | /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ | 357 | /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ |
262 | nid = 0; | 358 | nid = 0; |
263 | 359 | ||
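sparse_early_usemap_alloc() now tries sparse_early_usemap_alloc_pgdat_section() first, placing the usemap in the same memory section as the node's pgdat so that removing a section does not leave a usemap and a pgdat depending on each other across sections; check_usemap_section_nr() only warns when the two still land in different sections. The comparison boils down to turning a physical address into a section number by shifting away the per-section page count. A toy version of that arithmetic, with made-up shift values:

#include <stdio.h>

#define PAGE_SHIFT    12
#define SECTION_SHIFT 27        /* illustrative: 128 MB sections */

/* pfn_to_section_nr()-style helper on plain numbers. */
static unsigned long phys_to_section_nr(unsigned long long phys)
{
        unsigned long pfn = phys >> PAGE_SHIFT;

        return pfn >> (SECTION_SHIFT - PAGE_SHIFT);
}

int main(void)
{
        unsigned long long pgdat_phys  = 0x40001000ULL; /* where the pgdat lives */
        unsigned long long usemap_phys = 0x40200000ULL; /* where the usemap landed */

        unsigned long pgdat_snr  = phys_to_section_nr(pgdat_phys);
        unsigned long usemap_snr = phys_to_section_nr(usemap_phys);

        if (usemap_snr == pgdat_snr)
                printf("usemap shares section %lu with pgdat: removable together\n",
                       pgdat_snr);
        else
                printf("sections %lu and %lu depend on each other\n",
                       usemap_snr, pgdat_snr);
        return 0;
}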
@@ -280,7 +376,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
280 | } | 376 | } |
281 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 377 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ |
282 | 378 | ||
283 | struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | 379 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) |
284 | { | 380 | { |
285 | struct page *map; | 381 | struct page *map; |
286 | struct mem_section *ms = __nr_to_section(pnum); | 382 | struct mem_section *ms = __nr_to_section(pnum); |
@@ -31,12 +31,13 @@ | |||
31 | #include <linux/backing-dev.h> | 31 | #include <linux/backing-dev.h> |
32 | #include <linux/memcontrol.h> | 32 | #include <linux/memcontrol.h> |
33 | 33 | ||
34 | #include "internal.h" | ||
35 | |||
34 | /* How many pages do we try to swap or page in/out together? */ | 36 | /* How many pages do we try to swap or page in/out together? */ |
35 | int page_cluster; | 37 | int page_cluster; |
36 | 38 | ||
37 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; | 39 | static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); |
38 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; | 40 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); |
39 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; | ||
40 | 41 | ||
41 | /* | 42 | /* |
42 | * This path almost never happens for VM activity - pages are normally | 43 | * This path almost never happens for VM activity - pages are normally |
@@ -116,8 +117,9 @@ static void pagevec_move_tail(struct pagevec *pvec) | |||
116 | zone = pagezone; | 117 | zone = pagezone; |
117 | spin_lock(&zone->lru_lock); | 118 | spin_lock(&zone->lru_lock); |
118 | } | 119 | } |
119 | if (PageLRU(page) && !PageActive(page)) { | 120 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
120 | list_move_tail(&page->lru, &zone->inactive_list); | 121 | int lru = page_is_file_cache(page); |
122 | list_move_tail(&page->lru, &zone->lru[lru].list); | ||
121 | pgmoved++; | 123 | pgmoved++; |
122 | } | 124 | } |
123 | } | 125 | } |
@@ -136,7 +138,7 @@ static void pagevec_move_tail(struct pagevec *pvec) | |||
136 | void rotate_reclaimable_page(struct page *page) | 138 | void rotate_reclaimable_page(struct page *page) |
137 | { | 139 | { |
138 | if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && | 140 | if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && |
139 | PageLRU(page)) { | 141 | !PageUnevictable(page) && PageLRU(page)) { |
140 | struct pagevec *pvec; | 142 | struct pagevec *pvec; |
141 | unsigned long flags; | 143 | unsigned long flags; |
142 | 144 | ||
@@ -157,12 +159,19 @@ void activate_page(struct page *page) | |||
157 | struct zone *zone = page_zone(page); | 159 | struct zone *zone = page_zone(page); |
158 | 160 | ||
159 | spin_lock_irq(&zone->lru_lock); | 161 | spin_lock_irq(&zone->lru_lock); |
160 | if (PageLRU(page) && !PageActive(page)) { | 162 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
161 | del_page_from_inactive_list(zone, page); | 163 | int file = page_is_file_cache(page); |
164 | int lru = LRU_BASE + file; | ||
165 | del_page_from_lru_list(zone, page, lru); | ||
166 | |||
162 | SetPageActive(page); | 167 | SetPageActive(page); |
163 | add_page_to_active_list(zone, page); | 168 | lru += LRU_ACTIVE; |
169 | add_page_to_lru_list(zone, page, lru); | ||
164 | __count_vm_event(PGACTIVATE); | 170 | __count_vm_event(PGACTIVATE); |
165 | mem_cgroup_move_lists(page, true); | 171 | mem_cgroup_move_lists(page, lru); |
172 | |||
173 | zone->recent_rotated[!!file]++; | ||
174 | zone->recent_scanned[!!file]++; | ||
166 | } | 175 | } |
167 | spin_unlock_irq(&zone->lru_lock); | 176 | spin_unlock_irq(&zone->lru_lock); |
168 | } | 177 | } |
@@ -176,7 +185,8 @@ void activate_page(struct page *page) | |||
176 | */ | 185 | */ |
177 | void mark_page_accessed(struct page *page) | 186 | void mark_page_accessed(struct page *page) |
178 | { | 187 | { |
179 | if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { | 188 | if (!PageActive(page) && !PageUnevictable(page) && |
189 | PageReferenced(page) && PageLRU(page)) { | ||
180 | activate_page(page); | 190 | activate_page(page); |
181 | ClearPageReferenced(page); | 191 | ClearPageReferenced(page); |
182 | } else if (!PageReferenced(page)) { | 192 | } else if (!PageReferenced(page)) { |
@@ -186,28 +196,73 @@ void mark_page_accessed(struct page *page) | |||
186 | 196 | ||
187 | EXPORT_SYMBOL(mark_page_accessed); | 197 | EXPORT_SYMBOL(mark_page_accessed); |
188 | 198 | ||
189 | /** | 199 | void __lru_cache_add(struct page *page, enum lru_list lru) |
190 | * lru_cache_add: add a page to the page lists | ||
191 | * @page: the page to add | ||
192 | */ | ||
193 | void lru_cache_add(struct page *page) | ||
194 | { | 200 | { |
195 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); | 201 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; |
196 | 202 | ||
197 | page_cache_get(page); | 203 | page_cache_get(page); |
198 | if (!pagevec_add(pvec, page)) | 204 | if (!pagevec_add(pvec, page)) |
199 | __pagevec_lru_add(pvec); | 205 | ____pagevec_lru_add(pvec, lru); |
200 | put_cpu_var(lru_add_pvecs); | 206 | put_cpu_var(lru_add_pvecs); |
201 | } | 207 | } |
202 | 208 | ||
203 | void lru_cache_add_active(struct page *page) | 209 | /** |
210 | * lru_cache_add_lru - add a page to a page list | ||
211 | * @page: the page to be added to the LRU. | ||
212 | * @lru: the LRU list to which the page is added. | ||
213 | */ | ||
214 | void lru_cache_add_lru(struct page *page, enum lru_list lru) | ||
204 | { | 215 | { |
205 | struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); | 216 | if (PageActive(page)) { |
217 | VM_BUG_ON(PageUnevictable(page)); | ||
218 | ClearPageActive(page); | ||
219 | } else if (PageUnevictable(page)) { | ||
220 | VM_BUG_ON(PageActive(page)); | ||
221 | ClearPageUnevictable(page); | ||
222 | } | ||
206 | 223 | ||
207 | page_cache_get(page); | 224 | VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); |
208 | if (!pagevec_add(pvec, page)) | 225 | __lru_cache_add(page, lru); |
209 | __pagevec_lru_add_active(pvec); | 226 | } |
210 | put_cpu_var(lru_add_active_pvecs); | 227 | |
228 | /** | ||
229 | * add_page_to_unevictable_list - add a page to the unevictable list | ||
230 | * @page: the page to be added to the unevictable list | ||
231 | * | ||
232 | * Add page directly to its zone's unevictable list. To avoid races with | ||
233 | * tasks that might be making the page evictable, through eg. munlock, | ||
234 | * munmap or exit, while it's not on the lru, we want to add the page | ||
235 | * while it's locked or otherwise "invisible" to other tasks. This is | ||
236 | * difficult to do when using the pagevec cache, so bypass that. | ||
237 | */ | ||
238 | void add_page_to_unevictable_list(struct page *page) | ||
239 | { | ||
240 | struct zone *zone = page_zone(page); | ||
241 | |||
242 | spin_lock_irq(&zone->lru_lock); | ||
243 | SetPageUnevictable(page); | ||
244 | SetPageLRU(page); | ||
245 | add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); | ||
246 | spin_unlock_irq(&zone->lru_lock); | ||
247 | } | ||
248 | |||
249 | /** | ||
250 | * lru_cache_add_active_or_unevictable | ||
251 | * @page: the page to be added to LRU | ||
252 | * @vma: vma in which page is mapped for determining reclaimability | ||
253 | * | ||
254 | * place @page on active or unevictable LRU list, depending on | ||
255 | * page_evictable(). Note that if the page is not evictable, | ||
256 | * it goes directly back onto its zone's unevictable list. It does | ||
257 | * NOT use a per cpu pagevec. | ||
258 | */ | ||
259 | void lru_cache_add_active_or_unevictable(struct page *page, | ||
260 | struct vm_area_struct *vma) | ||
261 | { | ||
262 | if (page_evictable(page, vma)) | ||
263 | lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page)); | ||
264 | else | ||
265 | add_page_to_unevictable_list(page); | ||
211 | } | 266 | } |
212 | 267 | ||
213 | /* | 268 | /* |
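The mm/swap.c hunks above replace the separate lru_add_pvecs/lru_add_active_pvecs with one per-CPU pagevec per evictable LRU list: __lru_cache_add() stashes the page in the batch for its target list and only takes zone->lru_lock when the vector fills (or when drain_cpu_pagevecs() flushes it), while unevictable pages bypass the batching entirely via add_page_to_unevictable_list(). The batching idea in miniature, with a plain array standing in for a pagevec:

#include <stdio.h>

#define PAGEVEC_SIZE 14 /* the kernel pagevec batches 14 pages */
#define NR_LISTS     4  /* stand-in for the number of evictable LRU lists */

struct toy_pagevec {
        int nr;
        int pages[PAGEVEC_SIZE];
};

static struct toy_pagevec pvecs[NR_LISTS];      /* "per-CPU" batch per list */
static int flushes;

/* Flush one batch under what would be zone->lru_lock. */
static void flush(struct toy_pagevec *pvec, int lru)
{
        flushes++;
        printf("flushed %d pages to list %d\n", pvec->nr, lru);
        pvec->nr = 0;
}

static void lru_cache_add_like(int page, int lru)
{
        struct toy_pagevec *pvec = &pvecs[lru];

        pvec->pages[pvec->nr++] = page;
        if (pvec->nr == PAGEVEC_SIZE)   /* batch full: take the lock once */
                flush(pvec, lru);
}

int main(void)
{
        int i;

        for (i = 0; i < 30; i++)
                lru_cache_add_like(i, i & 1);   /* alternate between two lists */
        printf("lock acquisitions for 30 pages: %d\n", flushes);  /* prints 2 */
        return 0;
}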
@@ -217,15 +272,15 @@ void lru_cache_add_active(struct page *page) | |||
217 | */ | 272 | */ |
218 | static void drain_cpu_pagevecs(int cpu) | 273 | static void drain_cpu_pagevecs(int cpu) |
219 | { | 274 | { |
275 | struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); | ||
220 | struct pagevec *pvec; | 276 | struct pagevec *pvec; |
277 | int lru; | ||
221 | 278 | ||
222 | pvec = &per_cpu(lru_add_pvecs, cpu); | 279 | for_each_lru(lru) { |
223 | if (pagevec_count(pvec)) | 280 | pvec = &pvecs[lru - LRU_BASE]; |
224 | __pagevec_lru_add(pvec); | 281 | if (pagevec_count(pvec)) |
225 | 282 | ____pagevec_lru_add(pvec, lru); | |
226 | pvec = &per_cpu(lru_add_active_pvecs, cpu); | 283 | } |
227 | if (pagevec_count(pvec)) | ||
228 | __pagevec_lru_add_active(pvec); | ||
229 | 284 | ||
230 | pvec = &per_cpu(lru_rotate_pvecs, cpu); | 285 | pvec = &per_cpu(lru_rotate_pvecs, cpu); |
231 | if (pagevec_count(pvec)) { | 286 | if (pagevec_count(pvec)) { |
@@ -244,7 +299,7 @@ void lru_add_drain(void) | |||
244 | put_cpu(); | 299 | put_cpu(); |
245 | } | 300 | } |
246 | 301 | ||
247 | #ifdef CONFIG_NUMA | 302 | #if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU) |
248 | static void lru_add_drain_per_cpu(struct work_struct *dummy) | 303 | static void lru_add_drain_per_cpu(struct work_struct *dummy) |
249 | { | 304 | { |
250 | lru_add_drain(); | 305 | lru_add_drain(); |
@@ -278,9 +333,10 @@ int lru_add_drain_all(void) | |||
278 | * Avoid taking zone->lru_lock if possible, but if it is taken, retain it | 333 | * Avoid taking zone->lru_lock if possible, but if it is taken, retain it |
279 | * for the remainder of the operation. | 334 | * for the remainder of the operation. |
280 | * | 335 | * |
281 | * The locking in this function is against shrink_cache(): we recheck the | 336 | * The locking in this function is against shrink_inactive_list(): we recheck |
282 | * page count inside the lock to see whether shrink_cache grabbed the page | 337 | * the page count inside the lock to see whether shrink_inactive_list() |
283 | * via the LRU. If it did, give up: shrink_cache will free it. | 338 | * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() |
339 | * will free it. | ||
284 | */ | 340 | */ |
285 | void release_pages(struct page **pages, int nr, int cold) | 341 | void release_pages(struct page **pages, int nr, int cold) |
286 | { | 342 | { |
@@ -307,6 +363,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
307 | 363 | ||
308 | if (PageLRU(page)) { | 364 | if (PageLRU(page)) { |
309 | struct zone *pagezone = page_zone(page); | 365 | struct zone *pagezone = page_zone(page); |
366 | |||
310 | if (pagezone != zone) { | 367 | if (pagezone != zone) { |
311 | if (zone) | 368 | if (zone) |
312 | spin_unlock_irqrestore(&zone->lru_lock, | 369 | spin_unlock_irqrestore(&zone->lru_lock, |
@@ -379,10 +436,11 @@ void __pagevec_release_nonlru(struct pagevec *pvec) | |||
379 | * Add the passed pages to the LRU, then drop the caller's refcount | 436 | * Add the passed pages to the LRU, then drop the caller's refcount |
380 | * on them. Reinitialises the caller's pagevec. | 437 | * on them. Reinitialises the caller's pagevec. |
381 | */ | 438 | */ |
382 | void __pagevec_lru_add(struct pagevec *pvec) | 439 | void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) |
383 | { | 440 | { |
384 | int i; | 441 | int i; |
385 | struct zone *zone = NULL; | 442 | struct zone *zone = NULL; |
443 | VM_BUG_ON(is_unevictable_lru(lru)); | ||
386 | 444 | ||
387 | for (i = 0; i < pagevec_count(pvec); i++) { | 445 | for (i = 0; i < pagevec_count(pvec); i++) { |
388 | struct page *page = pvec->pages[i]; | 446 | struct page *page = pvec->pages[i]; |
@@ -394,9 +452,13 @@ void __pagevec_lru_add(struct pagevec *pvec) | |||
394 | zone = pagezone; | 452 | zone = pagezone; |
395 | spin_lock_irq(&zone->lru_lock); | 453 | spin_lock_irq(&zone->lru_lock); |
396 | } | 454 | } |
455 | VM_BUG_ON(PageActive(page)); | ||
456 | VM_BUG_ON(PageUnevictable(page)); | ||
397 | VM_BUG_ON(PageLRU(page)); | 457 | VM_BUG_ON(PageLRU(page)); |
398 | SetPageLRU(page); | 458 | SetPageLRU(page); |
399 | add_page_to_inactive_list(zone, page); | 459 | if (is_active_lru(lru)) |
460 | SetPageActive(page); | ||
461 | add_page_to_lru_list(zone, page, lru); | ||
400 | } | 462 | } |
401 | if (zone) | 463 | if (zone) |
402 | spin_unlock_irq(&zone->lru_lock); | 464 | spin_unlock_irq(&zone->lru_lock); |
@@ -404,48 +466,45 @@ void __pagevec_lru_add(struct pagevec *pvec) | |||
404 | pagevec_reinit(pvec); | 466 | pagevec_reinit(pvec); |
405 | } | 467 | } |
406 | 468 | ||
407 | EXPORT_SYMBOL(__pagevec_lru_add); | 469 | EXPORT_SYMBOL(____pagevec_lru_add); |
408 | 470 | ||
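____pagevec_lru_add() above batches LRU insertions and retakes zone->lru_lock only when consecutive pages belong to different zones. A minimal userspace model of that lock-batching pattern follows; the zone ids and the printf stand-ins for lock/unlock are illustrative only.

#include <stdio.h>

/* Userspace model of the zone-batching pattern in ____pagevec_lru_add():
 * the "lock" is dropped and retaken only when consecutive items belong
 * to different zones. */
static void process_batch(const int *zone_of_page, int n)
{
        int locked_zone = -1;
        int i;

        for (i = 0; i < n; i++) {
                int zone = zone_of_page[i];

                if (zone != locked_zone) {
                        if (locked_zone >= 0)
                                printf("unlock zone %d\n", locked_zone);
                        locked_zone = zone;
                        printf("lock zone %d\n", locked_zone);
                }
                printf("  add page %d to LRU of zone %d\n", i, zone);
        }
        if (locked_zone >= 0)
                printf("unlock zone %d\n", locked_zone);
}

int main(void)
{
        int zones[] = { 0, 0, 1, 1, 1, 0 };

        process_batch(zones, 6);
        return 0;
}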
409 | void __pagevec_lru_add_active(struct pagevec *pvec) | 471 | /* |
472 | * Try to drop buffers from the pages in a pagevec | ||
473 | */ | ||
474 | void pagevec_strip(struct pagevec *pvec) | ||
410 | { | 475 | { |
411 | int i; | 476 | int i; |
412 | struct zone *zone = NULL; | ||
413 | 477 | ||
414 | for (i = 0; i < pagevec_count(pvec); i++) { | 478 | for (i = 0; i < pagevec_count(pvec); i++) { |
415 | struct page *page = pvec->pages[i]; | 479 | struct page *page = pvec->pages[i]; |
416 | struct zone *pagezone = page_zone(page); | ||
417 | 480 | ||
418 | if (pagezone != zone) { | 481 | if (PagePrivate(page) && trylock_page(page)) { |
419 | if (zone) | 482 | if (PagePrivate(page)) |
420 | spin_unlock_irq(&zone->lru_lock); | 483 | try_to_release_page(page, 0); |
421 | zone = pagezone; | 484 | unlock_page(page); |
422 | spin_lock_irq(&zone->lru_lock); | ||
423 | } | 485 | } |
424 | VM_BUG_ON(PageLRU(page)); | ||
425 | SetPageLRU(page); | ||
426 | VM_BUG_ON(PageActive(page)); | ||
427 | SetPageActive(page); | ||
428 | add_page_to_active_list(zone, page); | ||
429 | } | 486 | } |
430 | if (zone) | ||
431 | spin_unlock_irq(&zone->lru_lock); | ||
432 | release_pages(pvec->pages, pvec->nr, pvec->cold); | ||
433 | pagevec_reinit(pvec); | ||
434 | } | 487 | } |
435 | 488 | ||
436 | /* | 489 | /** |
437 | * Try to drop buffers from the pages in a pagevec | 490 | * pagevec_swap_free - try to free swap space from the pages in a pagevec |
491 | * @pvec: pagevec with swapcache pages to free the swap space of | ||
492 | * | ||
493 | * The caller needs to hold an extra reference to each page and | ||
494 | * not hold the page lock on the pages. This function uses a | ||
495 | * trylock on the page lock so it may not always free the swap | ||
496 | * space associated with a page. | ||
438 | */ | 497 | */ |
439 | void pagevec_strip(struct pagevec *pvec) | 498 | void pagevec_swap_free(struct pagevec *pvec) |
440 | { | 499 | { |
441 | int i; | 500 | int i; |
442 | 501 | ||
443 | for (i = 0; i < pagevec_count(pvec); i++) { | 502 | for (i = 0; i < pagevec_count(pvec); i++) { |
444 | struct page *page = pvec->pages[i]; | 503 | struct page *page = pvec->pages[i]; |
445 | 504 | ||
446 | if (PagePrivate(page) && !TestSetPageLocked(page)) { | 505 | if (PageSwapCache(page) && trylock_page(page)) { |
447 | if (PagePrivate(page)) | 506 | if (PageSwapCache(page)) |
448 | try_to_release_page(page, 0); | 507 | remove_exclusive_swap_page_ref(page); |
449 | unlock_page(page); | 508 | unlock_page(page); |
450 | } | 509 | } |
451 | } | 510 | } |
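Both pagevec_strip() and the new pagevec_swap_free() rely on the same check/trylock/recheck idiom: the cheap unlocked test is repeated under the lock because the page state can change in between. A userspace model of the idiom, using a pthread mutex as the stand-in lock (struct item and its fields are invented for illustration):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Userspace model of the check -> trylock -> recheck pattern used by
 * pagevec_strip()/pagevec_swap_free(): the condition is tested again
 * under the lock because it can change in between. */
struct item {
        pthread_mutex_t lock;
        bool has_private_data;
};

static void try_strip(struct item *it)
{
        if (it->has_private_data && pthread_mutex_trylock(&it->lock) == 0) {
                if (it->has_private_data) {     /* recheck under the lock */
                        it->has_private_data = false;
                        printf("released private data\n");
                }
                pthread_mutex_unlock(&it->lock);
        } else {
                printf("skipped: nothing to do or lock contended\n");
        }
}

int main(void)
{
        struct item it = { PTHREAD_MUTEX_INITIALIZER, true };

        try_strip(&it);
        try_strip(&it);
        return 0;
}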
@@ -493,7 +552,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag); | |||
493 | */ | 552 | */ |
494 | #define ACCT_THRESHOLD max(16, NR_CPUS * 2) | 553 | #define ACCT_THRESHOLD max(16, NR_CPUS * 2) |
495 | 554 | ||
496 | static DEFINE_PER_CPU(long, committed_space) = 0; | 555 | static DEFINE_PER_CPU(long, committed_space); |
497 | 556 | ||
498 | void vm_acct_memory(long pages) | 557 | void vm_acct_memory(long pages) |
499 | { | 558 | { |
diff --git a/mm/swap_state.c b/mm/swap_state.c index d8aadaf2a0ba..3353c9029cef 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -33,13 +33,13 @@ static const struct address_space_operations swap_aops = { | |||
33 | }; | 33 | }; |
34 | 34 | ||
35 | static struct backing_dev_info swap_backing_dev_info = { | 35 | static struct backing_dev_info swap_backing_dev_info = { |
36 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 36 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
37 | .unplug_io_fn = swap_unplug_io_fn, | 37 | .unplug_io_fn = swap_unplug_io_fn, |
38 | }; | 38 | }; |
39 | 39 | ||
40 | struct address_space swapper_space = { | 40 | struct address_space swapper_space = { |
41 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | 41 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), |
42 | .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), | 42 | .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), |
43 | .a_ops = &swap_aops, | 43 | .a_ops = &swap_aops, |
44 | .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), | 44 | .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), |
45 | .backing_dev_info = &swap_backing_dev_info, | 45 | .backing_dev_info = &swap_backing_dev_info, |
@@ -56,15 +56,16 @@ static struct { | |||
56 | 56 | ||
57 | void show_swap_cache_info(void) | 57 | void show_swap_cache_info(void) |
58 | { | 58 | { |
59 | printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", | 59 | printk("%lu pages in swap cache\n", total_swapcache_pages); |
60 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", | ||
60 | swap_cache_info.add_total, swap_cache_info.del_total, | 61 | swap_cache_info.add_total, swap_cache_info.del_total, |
61 | swap_cache_info.find_success, swap_cache_info.find_total); | 62 | swap_cache_info.find_success, swap_cache_info.find_total); |
62 | printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); | 63 | printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); |
63 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); | 64 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); |
64 | } | 65 | } |
65 | 66 | ||
66 | /* | 67 | /* |
67 | * add_to_swap_cache resembles add_to_page_cache on swapper_space, | 68 | * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, |
68 | * but sets SwapCache flag and private instead of mapping and index. | 69 | * but sets SwapCache flag and private instead of mapping and index. |
69 | */ | 70 | */ |
70 | int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | 71 | int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) |
@@ -74,21 +75,29 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
74 | BUG_ON(!PageLocked(page)); | 75 | BUG_ON(!PageLocked(page)); |
75 | BUG_ON(PageSwapCache(page)); | 76 | BUG_ON(PageSwapCache(page)); |
76 | BUG_ON(PagePrivate(page)); | 77 | BUG_ON(PagePrivate(page)); |
78 | BUG_ON(!PageSwapBacked(page)); | ||
77 | error = radix_tree_preload(gfp_mask); | 79 | error = radix_tree_preload(gfp_mask); |
78 | if (!error) { | 80 | if (!error) { |
79 | write_lock_irq(&swapper_space.tree_lock); | 81 | page_cache_get(page); |
82 | SetPageSwapCache(page); | ||
83 | set_page_private(page, entry.val); | ||
84 | |||
85 | spin_lock_irq(&swapper_space.tree_lock); | ||
80 | error = radix_tree_insert(&swapper_space.page_tree, | 86 | error = radix_tree_insert(&swapper_space.page_tree, |
81 | entry.val, page); | 87 | entry.val, page); |
82 | if (!error) { | 88 | if (likely(!error)) { |
83 | page_cache_get(page); | ||
84 | SetPageSwapCache(page); | ||
85 | set_page_private(page, entry.val); | ||
86 | total_swapcache_pages++; | 89 | total_swapcache_pages++; |
87 | __inc_zone_page_state(page, NR_FILE_PAGES); | 90 | __inc_zone_page_state(page, NR_FILE_PAGES); |
88 | INC_CACHE_INFO(add_total); | 91 | INC_CACHE_INFO(add_total); |
89 | } | 92 | } |
90 | write_unlock_irq(&swapper_space.tree_lock); | 93 | spin_unlock_irq(&swapper_space.tree_lock); |
91 | radix_tree_preload_end(); | 94 | radix_tree_preload_end(); |
95 | |||
96 | if (unlikely(error)) { | ||
97 | set_page_private(page, 0UL); | ||
98 | ClearPageSwapCache(page); | ||
99 | page_cache_release(page); | ||
100 | } | ||
92 | } | 101 | } |
93 | return error; | 102 | return error; |
94 | } | 103 | } |
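The reworked add_to_swap_cache() now sets up the page (reference, SwapCache flag, private field) before the radix tree insert and undoes each step if the insert fails under the spinlock. Below is a userspace sketch of that prepare/insert/rollback ordering; cache_insert() is a made-up stand-in for radix_tree_insert() and nothing here models the real radix tree API.

#include <stdbool.h>
#include <stdio.h>

struct obj {
        int refcount;
        bool in_cache;
        unsigned long key;
};

static bool cache_insert(struct obj *o)
{
        /* Pretend the indexed insert can fail, e.g. the slot is taken. */
        return o->key % 2 == 0;
}

static int add_to_cache(struct obj *o, unsigned long key)
{
        o->refcount++;          /* grab a reference for the cache */
        o->in_cache = true;     /* mark it as cached */
        o->key = key;

        if (cache_insert(o))
                return 0;

        /* Insert failed: undo every preparation step in reverse order. */
        o->key = 0;
        o->in_cache = false;
        o->refcount--;
        return -1;
}

int main(void)
{
        struct obj a = { 0, false, 0 };
        struct obj b = { 0, false, 0 };

        printf("even key: %d (refcount %d)\n", add_to_cache(&a, 42), a.refcount);
        printf("odd key:  %d (refcount %d)\n", add_to_cache(&b, 43), b.refcount);
        return 0;
}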
@@ -175,9 +184,9 @@ void delete_from_swap_cache(struct page *page) | |||
175 | 184 | ||
176 | entry.val = page_private(page); | 185 | entry.val = page_private(page); |
177 | 186 | ||
178 | write_lock_irq(&swapper_space.tree_lock); | 187 | spin_lock_irq(&swapper_space.tree_lock); |
179 | __delete_from_swap_cache(page); | 188 | __delete_from_swap_cache(page); |
180 | write_unlock_irq(&swapper_space.tree_lock); | 189 | spin_unlock_irq(&swapper_space.tree_lock); |
181 | 190 | ||
182 | swap_free(entry); | 191 | swap_free(entry); |
183 | page_cache_release(page); | 192 | page_cache_release(page); |
@@ -193,7 +202,7 @@ void delete_from_swap_cache(struct page *page) | |||
193 | */ | 202 | */ |
194 | static inline void free_swap_cache(struct page *page) | 203 | static inline void free_swap_cache(struct page *page) |
195 | { | 204 | { |
196 | if (PageSwapCache(page) && !TestSetPageLocked(page)) { | 205 | if (PageSwapCache(page) && trylock_page(page)) { |
197 | remove_exclusive_swap_page(page); | 206 | remove_exclusive_swap_page(page); |
198 | unlock_page(page); | 207 | unlock_page(page); |
199 | } | 208 | } |
@@ -294,17 +303,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
294 | * re-using the just freed swap entry for an existing page. | 303 | * re-using the just freed swap entry for an existing page. |
295 | * May fail (-ENOMEM) if radix-tree node allocation failed. | 304 | * May fail (-ENOMEM) if radix-tree node allocation failed. |
296 | */ | 305 | */ |
297 | SetPageLocked(new_page); | 306 | __set_page_locked(new_page); |
307 | SetPageSwapBacked(new_page); | ||
298 | err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); | 308 | err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); |
299 | if (!err) { | 309 | if (likely(!err)) { |
300 | /* | 310 | /* |
301 | * Initiate read into locked page and return. | 311 | * Initiate read into locked page and return. |
302 | */ | 312 | */ |
303 | lru_cache_add_active(new_page); | 313 | lru_cache_add_anon(new_page); |
304 | swap_readpage(NULL, new_page); | 314 | swap_readpage(NULL, new_page); |
305 | return new_page; | 315 | return new_page; |
306 | } | 316 | } |
307 | ClearPageLocked(new_page); | 317 | ClearPageSwapBacked(new_page); |
318 | __clear_page_locked(new_page); | ||
308 | swap_free(entry); | 319 | swap_free(entry); |
309 | } while (err != -ENOMEM); | 320 | } while (err != -ENOMEM); |
310 | 321 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index bd1bb5920306..90cb67a5417c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -33,17 +33,18 @@ | |||
33 | #include <asm/tlbflush.h> | 33 | #include <asm/tlbflush.h> |
34 | #include <linux/swapops.h> | 34 | #include <linux/swapops.h> |
35 | 35 | ||
36 | DEFINE_SPINLOCK(swap_lock); | 36 | static DEFINE_SPINLOCK(swap_lock); |
37 | unsigned int nr_swapfiles; | 37 | static unsigned int nr_swapfiles; |
38 | long total_swap_pages; | 38 | long total_swap_pages; |
39 | static int swap_overflow; | 39 | static int swap_overflow; |
40 | static int least_priority; | ||
40 | 41 | ||
41 | static const char Bad_file[] = "Bad swap file entry "; | 42 | static const char Bad_file[] = "Bad swap file entry "; |
42 | static const char Unused_file[] = "Unused swap file entry "; | 43 | static const char Unused_file[] = "Unused swap file entry "; |
43 | static const char Bad_offset[] = "Bad swap offset entry "; | 44 | static const char Bad_offset[] = "Bad swap offset entry "; |
44 | static const char Unused_offset[] = "Unused swap offset entry "; | 45 | static const char Unused_offset[] = "Unused swap offset entry "; |
45 | 46 | ||
46 | struct swap_list_t swap_list = {-1, -1}; | 47 | static struct swap_list_t swap_list = {-1, -1}; |
47 | 48 | ||
48 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; | 49 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; |
49 | 50 | ||
@@ -343,7 +344,7 @@ int can_share_swap_page(struct page *page) | |||
343 | * Work out if there are any other processes sharing this | 344 | * Work out if there are any other processes sharing this |
344 | * swap cache page. Free it if you can. Return success. | 345 | * swap cache page. Free it if you can. Return success. |
345 | */ | 346 | */ |
346 | int remove_exclusive_swap_page(struct page *page) | 347 | static int remove_exclusive_swap_page_count(struct page *page, int count) |
347 | { | 348 | { |
348 | int retval; | 349 | int retval; |
349 | struct swap_info_struct * p; | 350 | struct swap_info_struct * p; |
@@ -356,7 +357,7 @@ int remove_exclusive_swap_page(struct page *page) | |||
356 | return 0; | 357 | return 0; |
357 | if (PageWriteback(page)) | 358 | if (PageWriteback(page)) |
358 | return 0; | 359 | return 0; |
359 | if (page_count(page) != 2) /* 2: us + cache */ | 360 | if (page_count(page) != count) /* us + cache + ptes */ |
360 | return 0; | 361 | return 0; |
361 | 362 | ||
362 | entry.val = page_private(page); | 363 | entry.val = page_private(page); |
@@ -368,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page) | |||
368 | retval = 0; | 369 | retval = 0; |
369 | if (p->swap_map[swp_offset(entry)] == 1) { | 370 | if (p->swap_map[swp_offset(entry)] == 1) { |
370 | /* Recheck the page count with the swapcache lock held.. */ | 371 | /* Recheck the page count with the swapcache lock held.. */ |
371 | write_lock_irq(&swapper_space.tree_lock); | 372 | spin_lock_irq(&swapper_space.tree_lock); |
372 | if ((page_count(page) == 2) && !PageWriteback(page)) { | 373 | if ((page_count(page) == count) && !PageWriteback(page)) { |
373 | __delete_from_swap_cache(page); | 374 | __delete_from_swap_cache(page); |
374 | SetPageDirty(page); | 375 | SetPageDirty(page); |
375 | retval = 1; | 376 | retval = 1; |
376 | } | 377 | } |
377 | write_unlock_irq(&swapper_space.tree_lock); | 378 | spin_unlock_irq(&swapper_space.tree_lock); |
378 | } | 379 | } |
379 | spin_unlock(&swap_lock); | 380 | spin_unlock(&swap_lock); |
380 | 381 | ||
@@ -387,6 +388,25 @@ int remove_exclusive_swap_page(struct page *page) | |||
387 | } | 388 | } |
388 | 389 | ||
389 | /* | 390 | /* |
391 | * Most of the time the page should have two references: one for the | ||
392 | * process and one for the swap cache. | ||
393 | */ | ||
394 | int remove_exclusive_swap_page(struct page *page) | ||
395 | { | ||
396 | return remove_exclusive_swap_page_count(page, 2); | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * The pageout code holds an extra reference to the page. That raises | ||
401 | * the reference count we test against to 2 for a page that is only in | ||
402 | * the swap cache, plus 1 for each process that maps the page. | ||
403 | */ | ||
404 | int remove_exclusive_swap_page_ref(struct page *page) | ||
405 | { | ||
406 | return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page)); | ||
407 | } | ||
408 | |||
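The two wrappers above differ only in the page count they expect. A small worked example of that arithmetic, with purely illustrative numbers:

#include <stdio.h>

/* Worked example of the count that remove_exclusive_swap_page_count()
 * compares against.  The numbers are illustrative, not kernel state. */
int main(void)
{
        int swap_cache_ref = 1;   /* the swap cache itself */
        int caller_ref     = 1;   /* "us": the code holding the page */
        int mapcount       = 2;   /* processes that map the page */

        /* remove_exclusive_swap_page():     expects us + cache */
        printf("plain case:   page_count must be %d\n",
               caller_ref + swap_cache_ref);

        /* remove_exclusive_swap_page_ref(): expects us + cache + ptes */
        printf("pageout case: page_count must be %d\n",
               caller_ref + swap_cache_ref + mapcount);
        return 0;
}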
409 | /* | ||
390 | * Free the swap entry like above, but also try to | 410 | * Free the swap entry like above, but also try to |
391 | * free the page cache entry if it is the last user. | 411 | * free the page cache entry if it is the last user. |
392 | */ | 412 | */ |
@@ -402,7 +422,7 @@ void free_swap_and_cache(swp_entry_t entry) | |||
402 | if (p) { | 422 | if (p) { |
403 | if (swap_entry_free(p, swp_offset(entry)) == 1) { | 423 | if (swap_entry_free(p, swp_offset(entry)) == 1) { |
404 | page = find_get_page(&swapper_space, entry.val); | 424 | page = find_get_page(&swapper_space, entry.val); |
405 | if (page && unlikely(TestSetPageLocked(page))) { | 425 | if (page && !trylock_page(page)) { |
406 | page_cache_release(page); | 426 | page_cache_release(page); |
407 | page = NULL; | 427 | page = NULL; |
408 | } | 428 | } |
@@ -655,8 +675,8 @@ static int unuse_mm(struct mm_struct *mm, | |||
655 | 675 | ||
656 | if (!down_read_trylock(&mm->mmap_sem)) { | 676 | if (!down_read_trylock(&mm->mmap_sem)) { |
657 | /* | 677 | /* |
658 | * Activate page so shrink_cache is unlikely to unmap its | 678 | * Activate page so shrink_inactive_list is unlikely to unmap |
659 | * ptes while lock is dropped, so swapoff can make progress. | 679 | * its ptes while lock is dropped, so swapoff can make progress. |
660 | */ | 680 | */ |
661 | activate_page(page); | 681 | activate_page(page); |
662 | unlock_page(page); | 682 | unlock_page(page); |
@@ -1260,6 +1280,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
1260 | /* just pick something that's safe... */ | 1280 | /* just pick something that's safe... */ |
1261 | swap_list.next = swap_list.head; | 1281 | swap_list.next = swap_list.head; |
1262 | } | 1282 | } |
1283 | if (p->prio < 0) { | ||
1284 | for (i = p->next; i >= 0; i = swap_info[i].next) | ||
1285 | swap_info[i].prio = p->prio--; | ||
1286 | least_priority++; | ||
1287 | } | ||
1263 | nr_swap_pages -= p->pages; | 1288 | nr_swap_pages -= p->pages; |
1264 | total_swap_pages -= p->pages; | 1289 | total_swap_pages -= p->pages; |
1265 | p->flags &= ~SWP_WRITEOK; | 1290 | p->flags &= ~SWP_WRITEOK; |
@@ -1272,9 +1297,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
1272 | if (err) { | 1297 | if (err) { |
1273 | /* re-insert swap space back into swap_list */ | 1298 | /* re-insert swap space back into swap_list */ |
1274 | spin_lock(&swap_lock); | 1299 | spin_lock(&swap_lock); |
1275 | for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) | 1300 | if (p->prio < 0) |
1301 | p->prio = --least_priority; | ||
1302 | prev = -1; | ||
1303 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | ||
1276 | if (p->prio >= swap_info[i].prio) | 1304 | if (p->prio >= swap_info[i].prio) |
1277 | break; | 1305 | break; |
1306 | prev = i; | ||
1307 | } | ||
1278 | p->next = i; | 1308 | p->next = i; |
1279 | if (prev < 0) | 1309 | if (prev < 0) |
1280 | swap_list.head = swap_list.next = p - swap_info; | 1310 | swap_list.head = swap_list.next = p - swap_info; |
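The swapoff hunk above keeps the implicit (negative) priorities dense: areas behind the removed one are renumbered and least_priority is bumped. The following userspace model mirrors that renumbering with a plain array; the three-area setup and the values are illustrative only.

#include <stdio.h>

/* Userspace model of the implicit (negative) swap priority bookkeeping
 * in sys_swapoff(): areas added without SWAP_FLAG_PREFER were given
 * --least_priority, and removing one renumbers the areas behind it so
 * the sequence stays dense. */
int main(void)
{
        int prio[3];
        int least_priority = 0;
        int i;

        for (i = 0; i < 3; i++)              /* three swapons, no PREFER */
                prio[i] = --least_priority;  /* -1, -2, -3 */

        /* swapoff of area 0 (prio -1): renumber the areas behind it */
        for (i = 1; i < 3; i++)
                prio[i] = prio[0]--;         /* area 1 -> -1, area 2 -> -2 */
        least_priority++;                    /* -3 -> -2, next swapon gets -3 */

        printf("area1=%d area2=%d least_priority=%d\n",
               prio[1], prio[2], least_priority);
        return 0;
}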
@@ -1447,7 +1477,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1447 | unsigned int type; | 1477 | unsigned int type; |
1448 | int i, prev; | 1478 | int i, prev; |
1449 | int error; | 1479 | int error; |
1450 | static int least_priority; | ||
1451 | union swap_header *swap_header = NULL; | 1480 | union swap_header *swap_header = NULL; |
1452 | int swap_header_version; | 1481 | int swap_header_version; |
1453 | unsigned int nr_good_pages = 0; | 1482 | unsigned int nr_good_pages = 0; |
@@ -1455,7 +1484,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1455 | sector_t span; | 1484 | sector_t span; |
1456 | unsigned long maxpages = 1; | 1485 | unsigned long maxpages = 1; |
1457 | int swapfilesize; | 1486 | int swapfilesize; |
1458 | unsigned short *swap_map; | 1487 | unsigned short *swap_map = NULL; |
1459 | struct page *page = NULL; | 1488 | struct page *page = NULL; |
1460 | struct inode *inode = NULL; | 1489 | struct inode *inode = NULL; |
1461 | int did_down = 0; | 1490 | int did_down = 0; |
@@ -1474,22 +1503,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1474 | } | 1503 | } |
1475 | if (type >= nr_swapfiles) | 1504 | if (type >= nr_swapfiles) |
1476 | nr_swapfiles = type+1; | 1505 | nr_swapfiles = type+1; |
1506 | memset(p, 0, sizeof(*p)); | ||
1477 | INIT_LIST_HEAD(&p->extent_list); | 1507 | INIT_LIST_HEAD(&p->extent_list); |
1478 | p->flags = SWP_USED; | 1508 | p->flags = SWP_USED; |
1479 | p->swap_file = NULL; | ||
1480 | p->old_block_size = 0; | ||
1481 | p->swap_map = NULL; | ||
1482 | p->lowest_bit = 0; | ||
1483 | p->highest_bit = 0; | ||
1484 | p->cluster_nr = 0; | ||
1485 | p->inuse_pages = 0; | ||
1486 | p->next = -1; | 1509 | p->next = -1; |
1487 | if (swap_flags & SWAP_FLAG_PREFER) { | ||
1488 | p->prio = | ||
1489 | (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; | ||
1490 | } else { | ||
1491 | p->prio = --least_priority; | ||
1492 | } | ||
1493 | spin_unlock(&swap_lock); | 1510 | spin_unlock(&swap_lock); |
1494 | name = getname(specialfile); | 1511 | name = getname(specialfile); |
1495 | error = PTR_ERR(name); | 1512 | error = PTR_ERR(name); |
@@ -1632,19 +1649,20 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1632 | goto bad_swap; | 1649 | goto bad_swap; |
1633 | 1650 | ||
1634 | /* OK, set up the swap map and apply the bad block list */ | 1651 | /* OK, set up the swap map and apply the bad block list */ |
1635 | if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { | 1652 | swap_map = vmalloc(maxpages * sizeof(short)); |
1653 | if (!swap_map) { | ||
1636 | error = -ENOMEM; | 1654 | error = -ENOMEM; |
1637 | goto bad_swap; | 1655 | goto bad_swap; |
1638 | } | 1656 | } |
1639 | 1657 | ||
1640 | error = 0; | 1658 | error = 0; |
1641 | memset(p->swap_map, 0, maxpages * sizeof(short)); | 1659 | memset(swap_map, 0, maxpages * sizeof(short)); |
1642 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1660 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
1643 | int page_nr = swap_header->info.badpages[i]; | 1661 | int page_nr = swap_header->info.badpages[i]; |
1644 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) | 1662 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) |
1645 | error = -EINVAL; | 1663 | error = -EINVAL; |
1646 | else | 1664 | else |
1647 | p->swap_map[page_nr] = SWAP_MAP_BAD; | 1665 | swap_map[page_nr] = SWAP_MAP_BAD; |
1648 | } | 1666 | } |
1649 | nr_good_pages = swap_header->info.last_page - | 1667 | nr_good_pages = swap_header->info.last_page - |
1650 | swap_header->info.nr_badpages - | 1668 | swap_header->info.nr_badpages - |
@@ -1654,7 +1672,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1654 | } | 1672 | } |
1655 | 1673 | ||
1656 | if (nr_good_pages) { | 1674 | if (nr_good_pages) { |
1657 | p->swap_map[0] = SWAP_MAP_BAD; | 1675 | swap_map[0] = SWAP_MAP_BAD; |
1658 | p->max = maxpages; | 1676 | p->max = maxpages; |
1659 | p->pages = nr_good_pages; | 1677 | p->pages = nr_good_pages; |
1660 | nr_extents = setup_swap_extents(p, &span); | 1678 | nr_extents = setup_swap_extents(p, &span); |
@@ -1672,6 +1690,12 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1672 | 1690 | ||
1673 | mutex_lock(&swapon_mutex); | 1691 | mutex_lock(&swapon_mutex); |
1674 | spin_lock(&swap_lock); | 1692 | spin_lock(&swap_lock); |
1693 | if (swap_flags & SWAP_FLAG_PREFER) | ||
1694 | p->prio = | ||
1695 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; | ||
1696 | else | ||
1697 | p->prio = --least_priority; | ||
1698 | p->swap_map = swap_map; | ||
1675 | p->flags = SWP_ACTIVE; | 1699 | p->flags = SWP_ACTIVE; |
1676 | nr_swap_pages += nr_good_pages; | 1700 | nr_swap_pages += nr_good_pages; |
1677 | total_swap_pages += nr_good_pages; | 1701 | total_swap_pages += nr_good_pages; |
@@ -1707,12 +1731,8 @@ bad_swap: | |||
1707 | destroy_swap_extents(p); | 1731 | destroy_swap_extents(p); |
1708 | bad_swap_2: | 1732 | bad_swap_2: |
1709 | spin_lock(&swap_lock); | 1733 | spin_lock(&swap_lock); |
1710 | swap_map = p->swap_map; | ||
1711 | p->swap_file = NULL; | 1734 | p->swap_file = NULL; |
1712 | p->swap_map = NULL; | ||
1713 | p->flags = 0; | 1735 | p->flags = 0; |
1714 | if (!(swap_flags & SWAP_FLAG_PREFER)) | ||
1715 | ++least_priority; | ||
1716 | spin_unlock(&swap_lock); | 1736 | spin_unlock(&swap_lock); |
1717 | vfree(swap_map); | 1737 | vfree(swap_map); |
1718 | if (swap_file) | 1738 | if (swap_file) |
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index ae532f501943..3e67d575ee6e 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
@@ -65,36 +65,37 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
65 | if (!dentry) | 65 | if (!dentry) |
66 | goto put_memory; | 66 | goto put_memory; |
67 | 67 | ||
68 | error = -ENFILE; | ||
69 | file = get_empty_filp(); | ||
70 | if (!file) | ||
71 | goto put_dentry; | ||
72 | |||
68 | error = -ENOSPC; | 73 | error = -ENOSPC; |
69 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); | 74 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); |
70 | if (!inode) | 75 | if (!inode) |
71 | goto put_dentry; | 76 | goto close_file; |
72 | 77 | ||
73 | d_instantiate(dentry, inode); | 78 | d_instantiate(dentry, inode); |
74 | error = -ENFILE; | 79 | inode->i_size = size; |
75 | file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
76 | &ramfs_file_operations); | ||
77 | if (!file) | ||
78 | goto put_dentry; | ||
79 | |||
80 | inode->i_nlink = 0; /* It is unlinked */ | 80 | inode->i_nlink = 0; /* It is unlinked */ |
81 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
82 | &ramfs_file_operations); | ||
81 | 83 | ||
82 | /* notify everyone as to the change of file size */ | 84 | #ifndef CONFIG_MMU |
83 | error = do_truncate(dentry, size, 0, file); | 85 | error = ramfs_nommu_expand_for_mapping(inode, size); |
84 | if (error < 0) | 86 | if (error) |
85 | goto close_file; | 87 | goto close_file; |
86 | 88 | #endif | |
87 | return file; | 89 | return file; |
88 | 90 | ||
89 | close_file: | 91 | close_file: |
90 | put_filp(file); | 92 | put_filp(file); |
91 | return ERR_PTR(error); | ||
92 | |||
93 | put_dentry: | 93 | put_dentry: |
94 | dput(dentry); | 94 | dput(dentry); |
95 | put_memory: | 95 | put_memory: |
96 | return ERR_PTR(error); | 96 | return ERR_PTR(error); |
97 | } | 97 | } |
98 | EXPORT_SYMBOL_GPL(shmem_file_setup); | ||
98 | 99 | ||
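For context, a hedged kernel-side sketch of how shmem_file_setup() is typically consumed: create an unlinked, size-limited file and drop the final reference when done. This is not part of the patch; make_anonymous_file() and its reduced error handling are assumptions for illustration.

#include <linux/err.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>

static int make_anonymous_file(loff_t size)
{
        struct file *file = shmem_file_setup("anon-buffer", size, 0);

        if (IS_ERR(file))
                return PTR_ERR(file);

        /* ... hand the file to whoever needs a pageable buffer ... */

        fput(file);     /* final reference: the unlinked inode goes away */
        return 0;
}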
99 | /** | 100 | /** |
100 | * shmem_zero_setup - setup a shared anonymous mapping | 101 | * shmem_zero_setup - setup a shared anonymous mapping |
diff --git a/mm/truncate.c b/mm/truncate.c index b8961cb63414..1229211104f8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Copyright (C) 2002, Linus Torvalds | 4 | * Copyright (C) 2002, Linus Torvalds |
5 | * | 5 | * |
6 | * 10Sep2002 akpm@zip.com.au | 6 | * 10Sep2002 Andrew Morton |
7 | * Initial version. | 7 | * Initial version. |
8 | */ | 8 | */ |
9 | 9 | ||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/task_io_accounting_ops.h> | 18 | #include <linux/task_io_accounting_ops.h> |
19 | #include <linux/buffer_head.h> /* grr. try_to_release_page, | 19 | #include <linux/buffer_head.h> /* grr. try_to_release_page, |
20 | do_invalidatepage */ | 20 | do_invalidatepage */ |
21 | #include "internal.h" | ||
21 | 22 | ||
22 | 23 | ||
23 | /** | 24 | /** |
@@ -103,8 +104,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
103 | 104 | ||
104 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 105 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
105 | 106 | ||
107 | clear_page_mlock(page); | ||
106 | remove_from_page_cache(page); | 108 | remove_from_page_cache(page); |
107 | ClearPageUptodate(page); | ||
108 | ClearPageMappedToDisk(page); | 109 | ClearPageMappedToDisk(page); |
109 | page_cache_release(page); /* pagecache ref */ | 110 | page_cache_release(page); /* pagecache ref */ |
110 | } | 111 | } |
@@ -128,6 +129,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
128 | if (PagePrivate(page) && !try_to_release_page(page, 0)) | 129 | if (PagePrivate(page) && !try_to_release_page(page, 0)) |
129 | return 0; | 130 | return 0; |
130 | 131 | ||
132 | clear_page_mlock(page); | ||
131 | ret = remove_mapping(mapping, page); | 133 | ret = remove_mapping(mapping, page); |
132 | 134 | ||
133 | return ret; | 135 | return ret; |
@@ -188,7 +190,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
188 | if (page_index > next) | 190 | if (page_index > next) |
189 | next = page_index; | 191 | next = page_index; |
190 | next++; | 192 | next++; |
191 | if (TestSetPageLocked(page)) | 193 | if (!trylock_page(page)) |
192 | continue; | 194 | continue; |
193 | if (PageWriteback(page)) { | 195 | if (PageWriteback(page)) { |
194 | unlock_page(page); | 196 | unlock_page(page); |
@@ -281,7 +283,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping, | |||
281 | pgoff_t index; | 283 | pgoff_t index; |
282 | int lock_failed; | 284 | int lock_failed; |
283 | 285 | ||
284 | lock_failed = TestSetPageLocked(page); | 286 | lock_failed = !trylock_page(page); |
285 | 287 | ||
286 | /* | 288 | /* |
287 | * We really shouldn't be looking at the ->index of an | 289 | * We really shouldn't be looking at the ->index of an |
@@ -349,18 +351,18 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
349 | if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) | 351 | if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) |
350 | return 0; | 352 | return 0; |
351 | 353 | ||
352 | write_lock_irq(&mapping->tree_lock); | 354 | spin_lock_irq(&mapping->tree_lock); |
353 | if (PageDirty(page)) | 355 | if (PageDirty(page)) |
354 | goto failed; | 356 | goto failed; |
355 | 357 | ||
358 | clear_page_mlock(page); | ||
356 | BUG_ON(PagePrivate(page)); | 359 | BUG_ON(PagePrivate(page)); |
357 | __remove_from_page_cache(page); | 360 | __remove_from_page_cache(page); |
358 | write_unlock_irq(&mapping->tree_lock); | 361 | spin_unlock_irq(&mapping->tree_lock); |
359 | ClearPageUptodate(page); | ||
360 | page_cache_release(page); /* pagecache ref */ | 362 | page_cache_release(page); /* pagecache ref */ |
361 | return 1; | 363 | return 1; |
362 | failed: | 364 | failed: |
363 | write_unlock_irq(&mapping->tree_lock); | 365 | spin_unlock_irq(&mapping->tree_lock); |
364 | return 0; | 366 | return 0; |
365 | } | 367 | } |
366 | 368 | ||
@@ -382,7 +384,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) | |||
382 | * Any pages which are found to be mapped into pagetables are unmapped prior to | 384 | * Any pages which are found to be mapped into pagetables are unmapped prior to |
383 | * invalidation. | 385 | * invalidation. |
384 | * | 386 | * |
385 | * Returns -EIO if any pages could not be invalidated. | 387 | * Returns -EBUSY if any pages could not be invalidated. |
386 | */ | 388 | */ |
387 | int invalidate_inode_pages2_range(struct address_space *mapping, | 389 | int invalidate_inode_pages2_range(struct address_space *mapping, |
388 | pgoff_t start, pgoff_t end) | 390 | pgoff_t start, pgoff_t end) |
@@ -442,7 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
442 | ret2 = do_launder_page(mapping, page); | 444 | ret2 = do_launder_page(mapping, page); |
443 | if (ret2 == 0) { | 445 | if (ret2 == 0) { |
444 | if (!invalidate_complete_page2(mapping, page)) | 446 | if (!invalidate_complete_page2(mapping, page)) |
445 | ret2 = -EIO; | 447 | ret2 = -EBUSY; |
446 | } | 448 | } |
447 | if (ret2 < 0) | 449 | if (ret2 < 0) |
448 | ret = ret2; | 450 | ret = ret2; |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -1,7 +1,9 @@ | |||
1 | #include <linux/mm.h> | ||
1 | #include <linux/slab.h> | 2 | #include <linux/slab.h> |
2 | #include <linux/string.h> | 3 | #include <linux/string.h> |
3 | #include <linux/module.h> | 4 | #include <linux/module.h> |
4 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/sched.h> | ||
5 | #include <asm/uaccess.h> | 7 | #include <asm/uaccess.h> |
6 | 8 | ||
7 | /** | 9 | /** |
@@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) | |||
68 | EXPORT_SYMBOL(kmemdup); | 70 | EXPORT_SYMBOL(kmemdup); |
69 | 71 | ||
70 | /** | 72 | /** |
71 | * krealloc - reallocate memory. The contents will remain unchanged. | 73 | * __krealloc - like krealloc() but don't free @p. |
72 | * @p: object to reallocate memory for. | 74 | * @p: object to reallocate memory for. |
73 | * @new_size: how many bytes of memory are required. | 75 | * @new_size: how many bytes of memory are required. |
74 | * @flags: the type of memory to allocate. | 76 | * @flags: the type of memory to allocate. |
75 | * | 77 | * |
76 | * The contents of the object pointed to are preserved up to the | 78 | * This function is like krealloc() except it never frees the originally |
77 | * lesser of the new and old sizes. If @p is %NULL, krealloc() | 79 | * allocated buffer. Use this if you don't want to free the buffer immediately |
78 | * behaves exactly like kmalloc(). If @size is 0 and @p is not a | 80 | * like, for example, with RCU. |
79 | * %NULL pointer, the object pointed to is freed. | ||
80 | */ | 81 | */ |
81 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | 82 | void *__krealloc(const void *p, size_t new_size, gfp_t flags) |
82 | { | 83 | { |
83 | void *ret; | 84 | void *ret; |
84 | size_t ks = 0; | 85 | size_t ks = 0; |
85 | 86 | ||
86 | if (unlikely(!new_size)) { | 87 | if (unlikely(!new_size)) |
87 | kfree(p); | ||
88 | return ZERO_SIZE_PTR; | 88 | return ZERO_SIZE_PTR; |
89 | } | ||
90 | 89 | ||
91 | if (p) | 90 | if (p) |
92 | ks = ksize(p); | 91 | ks = ksize(p); |
@@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) | |||
95 | return (void *)p; | 94 | return (void *)p; |
96 | 95 | ||
97 | ret = kmalloc_track_caller(new_size, flags); | 96 | ret = kmalloc_track_caller(new_size, flags); |
98 | if (ret && p) { | 97 | if (ret && p) |
99 | memcpy(ret, p, ks); | 98 | memcpy(ret, p, ks); |
99 | |||
100 | return ret; | ||
101 | } | ||
102 | EXPORT_SYMBOL(__krealloc); | ||
103 | |||
104 | /** | ||
105 | * krealloc - reallocate memory. The contents will remain unchanged. | ||
106 | * @p: object to reallocate memory for. | ||
107 | * @new_size: how many bytes of memory are required. | ||
108 | * @flags: the type of memory to allocate. | ||
109 | * | ||
110 | * The contents of the object pointed to are preserved up to the | ||
111 | * lesser of the new and old sizes. If @p is %NULL, krealloc() | ||
112 | * behaves exactly like kmalloc(). If @size is 0 and @p is not a | ||
113 | * %NULL pointer, the object pointed to is freed. | ||
114 | */ | ||
115 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | ||
116 | { | ||
117 | void *ret; | ||
118 | |||
119 | if (unlikely(!new_size)) { | ||
100 | kfree(p); | 120 | kfree(p); |
121 | return ZERO_SIZE_PTR; | ||
101 | } | 122 | } |
123 | |||
124 | ret = __krealloc(p, new_size, flags); | ||
125 | if (ret && p != ret) | ||
126 | kfree(p); | ||
127 | |||
102 | return ret; | 128 | return ret; |
103 | } | 129 | } |
104 | EXPORT_SYMBOL(krealloc); | 130 | EXPORT_SYMBOL(krealloc); |
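The split above gives callers a variant that never frees the old buffer (so it can stay alive, e.g. across an RCU grace period) and keeps krealloc() as the convenience wrapper that does free it. A userspace model of the two behaviours, using malloc/free as stand-ins for the slab allocator; grow_keep() and grow() are invented names.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* grow_keep() mirrors __krealloc(): never frees the old buffer.
 * grow() mirrors krealloc(): frees the old buffer once the data moved. */
static void *grow_keep(const void *p, size_t old_size, size_t new_size)
{
        void *ret = malloc(new_size);

        if (ret && p)
                memcpy(ret, p, old_size < new_size ? old_size : new_size);
        return ret;
}

static void *grow(void *p, size_t old_size, size_t new_size)
{
        void *ret = grow_keep(p, old_size, new_size);

        if (ret && ret != p)
                free(p);        /* old buffer is no longer needed */
        return ret;
}

int main(void)
{
        char *buf = malloc(8);

        strcpy(buf, "swap");
        buf = grow(buf, 8, 64);         /* contents preserved, old buffer freed */
        printf("%s\n", buf);
        free(buf);
        return 0;
}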
@@ -136,3 +162,27 @@ char *strndup_user(const char __user *s, long n) | |||
136 | return p; | 162 | return p; |
137 | } | 163 | } |
138 | EXPORT_SYMBOL(strndup_user); | 164 | EXPORT_SYMBOL(strndup_user); |
165 | |||
166 | #ifndef HAVE_ARCH_PICK_MMAP_LAYOUT | ||
167 | void arch_pick_mmap_layout(struct mm_struct *mm) | ||
168 | { | ||
169 | mm->mmap_base = TASK_UNMAPPED_BASE; | ||
170 | mm->get_unmapped_area = arch_get_unmapped_area; | ||
171 | mm->unmap_area = arch_unmap_area; | ||
172 | } | ||
173 | #endif | ||
174 | |||
175 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, | ||
176 | int nr_pages, int write, struct page **pages) | ||
177 | { | ||
178 | struct mm_struct *mm = current->mm; | ||
179 | int ret; | ||
180 | |||
181 | down_read(&mm->mmap_sem); | ||
182 | ret = get_user_pages(current, mm, start, nr_pages, | ||
183 | write, 0, pages, NULL); | ||
184 | up_read(&mm->mmap_sem); | ||
185 | |||
186 | return ret; | ||
187 | } | ||
188 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | ||
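The weak get_user_pages_fast() above simply falls back to get_user_pages() under mmap_sem, so callers can use one interface everywhere. A hedged kernel-context sketch of a typical caller follows; pin_user_buffer() and its simplified error handling are assumptions, not code from this patch.

#include <linux/errno.h>
#include <linux/mm.h>

static int pin_user_buffer(unsigned long uaddr, int nr_pages,
                           struct page **pages)
{
        int pinned, i;

        pinned = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
        if (pinned < 0)
                return pinned;

        /* ... use the pinned pages for I/O here ... */

        for (i = 0; i < pinned; i++)
                put_page(pages[i]);
        return pinned == nr_pages ? 0 : -EFAULT;
}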
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6e45b0f3d125..036536945dd9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -8,26 +8,28 @@ | |||
8 | * Numa awareness, Christoph Lameter, SGI, June 2005 | 8 | * Numa awareness, Christoph Lameter, SGI, June 2005 |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/vmalloc.h> | ||
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
12 | #include <linux/module.h> | 13 | #include <linux/module.h> |
13 | #include <linux/highmem.h> | 14 | #include <linux/highmem.h> |
14 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
15 | #include <linux/spinlock.h> | 16 | #include <linux/spinlock.h> |
16 | #include <linux/interrupt.h> | 17 | #include <linux/interrupt.h> |
18 | #include <linux/proc_fs.h> | ||
17 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
18 | #include <linux/debugobjects.h> | 20 | #include <linux/debugobjects.h> |
19 | #include <linux/vmalloc.h> | ||
20 | #include <linux/kallsyms.h> | 21 | #include <linux/kallsyms.h> |
22 | #include <linux/list.h> | ||
23 | #include <linux/rbtree.h> | ||
24 | #include <linux/radix-tree.h> | ||
25 | #include <linux/rcupdate.h> | ||
21 | 26 | ||
27 | #include <asm/atomic.h> | ||
22 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
23 | #include <asm/tlbflush.h> | 29 | #include <asm/tlbflush.h> |
24 | 30 | ||
25 | 31 | ||
26 | DEFINE_RWLOCK(vmlist_lock); | 32 | /*** Page table manipulation functions ***/ |
27 | struct vm_struct *vmlist; | ||
28 | |||
29 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | ||
30 | int node, void *caller); | ||
31 | 33 | ||
32 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | 34 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) |
33 | { | 35 | { |
@@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | |||
40 | } while (pte++, addr += PAGE_SIZE, addr != end); | 42 | } while (pte++, addr += PAGE_SIZE, addr != end); |
41 | } | 43 | } |
42 | 44 | ||
43 | static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, | 45 | static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) |
44 | unsigned long end) | ||
45 | { | 46 | { |
46 | pmd_t *pmd; | 47 | pmd_t *pmd; |
47 | unsigned long next; | 48 | unsigned long next; |
@@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, | |||
55 | } while (pmd++, addr = next, addr != end); | 56 | } while (pmd++, addr = next, addr != end); |
56 | } | 57 | } |
57 | 58 | ||
58 | static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, | 59 | static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) |
59 | unsigned long end) | ||
60 | { | 60 | { |
61 | pud_t *pud; | 61 | pud_t *pud; |
62 | unsigned long next; | 62 | unsigned long next; |
@@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, | |||
70 | } while (pud++, addr = next, addr != end); | 70 | } while (pud++, addr = next, addr != end); |
71 | } | 71 | } |
72 | 72 | ||
73 | void unmap_kernel_range(unsigned long addr, unsigned long size) | 73 | static void vunmap_page_range(unsigned long addr, unsigned long end) |
74 | { | 74 | { |
75 | pgd_t *pgd; | 75 | pgd_t *pgd; |
76 | unsigned long next; | 76 | unsigned long next; |
77 | unsigned long start = addr; | ||
78 | unsigned long end = addr + size; | ||
79 | 77 | ||
80 | BUG_ON(addr >= end); | 78 | BUG_ON(addr >= end); |
81 | pgd = pgd_offset_k(addr); | 79 | pgd = pgd_offset_k(addr); |
@@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) | |||
86 | continue; | 84 | continue; |
87 | vunmap_pud_range(pgd, addr, next); | 85 | vunmap_pud_range(pgd, addr, next); |
88 | } while (pgd++, addr = next, addr != end); | 86 | } while (pgd++, addr = next, addr != end); |
89 | flush_tlb_kernel_range(start, end); | ||
90 | } | ||
91 | |||
92 | static void unmap_vm_area(struct vm_struct *area) | ||
93 | { | ||
94 | unmap_kernel_range((unsigned long)area->addr, area->size); | ||
95 | } | 87 | } |
96 | 88 | ||
97 | static int vmap_pte_range(pmd_t *pmd, unsigned long addr, | 89 | static int vmap_pte_range(pmd_t *pmd, unsigned long addr, |
98 | unsigned long end, pgprot_t prot, struct page ***pages) | 90 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
99 | { | 91 | { |
100 | pte_t *pte; | 92 | pte_t *pte; |
101 | 93 | ||
94 | /* | ||
95 | * nr is a running index into the array which helps higher level | ||
96 | * callers keep track of where we're up to. | ||
97 | */ | ||
98 | |||
102 | pte = pte_alloc_kernel(pmd, addr); | 99 | pte = pte_alloc_kernel(pmd, addr); |
103 | if (!pte) | 100 | if (!pte) |
104 | return -ENOMEM; | 101 | return -ENOMEM; |
105 | do { | 102 | do { |
106 | struct page *page = **pages; | 103 | struct page *page = pages[*nr]; |
107 | WARN_ON(!pte_none(*pte)); | 104 | |
108 | if (!page) | 105 | if (WARN_ON(!pte_none(*pte))) |
106 | return -EBUSY; | ||
107 | if (WARN_ON(!page)) | ||
109 | return -ENOMEM; | 108 | return -ENOMEM; |
110 | set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); | 109 | set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); |
111 | (*pages)++; | 110 | (*nr)++; |
112 | } while (pte++, addr += PAGE_SIZE, addr != end); | 111 | } while (pte++, addr += PAGE_SIZE, addr != end); |
113 | return 0; | 112 | return 0; |
114 | } | 113 | } |
115 | 114 | ||
116 | static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, | 115 | static int vmap_pmd_range(pud_t *pud, unsigned long addr, |
117 | unsigned long end, pgprot_t prot, struct page ***pages) | 116 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
118 | { | 117 | { |
119 | pmd_t *pmd; | 118 | pmd_t *pmd; |
120 | unsigned long next; | 119 | unsigned long next; |
@@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, | |||
124 | return -ENOMEM; | 123 | return -ENOMEM; |
125 | do { | 124 | do { |
126 | next = pmd_addr_end(addr, end); | 125 | next = pmd_addr_end(addr, end); |
127 | if (vmap_pte_range(pmd, addr, next, prot, pages)) | 126 | if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) |
128 | return -ENOMEM; | 127 | return -ENOMEM; |
129 | } while (pmd++, addr = next, addr != end); | 128 | } while (pmd++, addr = next, addr != end); |
130 | return 0; | 129 | return 0; |
131 | } | 130 | } |
132 | 131 | ||
133 | static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, | 132 | static int vmap_pud_range(pgd_t *pgd, unsigned long addr, |
134 | unsigned long end, pgprot_t prot, struct page ***pages) | 133 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
135 | { | 134 | { |
136 | pud_t *pud; | 135 | pud_t *pud; |
137 | unsigned long next; | 136 | unsigned long next; |
@@ -141,50 +140,78 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, | |||
141 | return -ENOMEM; | 140 | return -ENOMEM; |
142 | do { | 141 | do { |
143 | next = pud_addr_end(addr, end); | 142 | next = pud_addr_end(addr, end); |
144 | if (vmap_pmd_range(pud, addr, next, prot, pages)) | 143 | if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) |
145 | return -ENOMEM; | 144 | return -ENOMEM; |
146 | } while (pud++, addr = next, addr != end); | 145 | } while (pud++, addr = next, addr != end); |
147 | return 0; | 146 | return 0; |
148 | } | 147 | } |
149 | 148 | ||
150 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | 149 | /* |
150 | * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and | ||
151 | * will have pfns corresponding to the "pages" array. | ||
152 | * | ||
153 | * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] | ||
154 | */ | ||
155 | static int vmap_page_range(unsigned long addr, unsigned long end, | ||
156 | pgprot_t prot, struct page **pages) | ||
151 | { | 157 | { |
152 | pgd_t *pgd; | 158 | pgd_t *pgd; |
153 | unsigned long next; | 159 | unsigned long next; |
154 | unsigned long addr = (unsigned long) area->addr; | 160 | int err = 0; |
155 | unsigned long end = addr + area->size - PAGE_SIZE; | 161 | int nr = 0; |
156 | int err; | ||
157 | 162 | ||
158 | BUG_ON(addr >= end); | 163 | BUG_ON(addr >= end); |
159 | pgd = pgd_offset_k(addr); | 164 | pgd = pgd_offset_k(addr); |
160 | do { | 165 | do { |
161 | next = pgd_addr_end(addr, end); | 166 | next = pgd_addr_end(addr, end); |
162 | err = vmap_pud_range(pgd, addr, next, prot, pages); | 167 | err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); |
163 | if (err) | 168 | if (err) |
164 | break; | 169 | break; |
165 | } while (pgd++, addr = next, addr != end); | 170 | } while (pgd++, addr = next, addr != end); |
166 | flush_cache_vmap((unsigned long) area->addr, end); | 171 | flush_cache_vmap(addr, end); |
167 | return err; | 172 | |
173 | if (unlikely(err)) | ||
174 | return err; | ||
175 | return nr; | ||
176 | } | ||
177 | |||
178 | static inline int is_vmalloc_or_module_addr(const void *x) | ||
179 | { | ||
180 | /* | ||
181 | * x86-64 and sparc64 put modules in a special place, | ||
182 | * and fall back on vmalloc() if that fails. Others | ||
183 | * just put it in the vmalloc space. | ||
184 | */ | ||
185 | #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) | ||
186 | unsigned long addr = (unsigned long)x; | ||
187 | if (addr >= MODULES_VADDR && addr < MODULES_END) | ||
188 | return 1; | ||
189 | #endif | ||
190 | return is_vmalloc_addr(x); | ||
168 | } | 191 | } |
169 | EXPORT_SYMBOL_GPL(map_vm_area); | ||
170 | 192 | ||
171 | /* | 193 | /* |
172 | * Map a vmalloc()-space virtual address to the physical page. | 194 | * Walk a vmap address to the struct page it maps. |
173 | */ | 195 | */ |
174 | struct page *vmalloc_to_page(const void *vmalloc_addr) | 196 | struct page *vmalloc_to_page(const void *vmalloc_addr) |
175 | { | 197 | { |
176 | unsigned long addr = (unsigned long) vmalloc_addr; | 198 | unsigned long addr = (unsigned long) vmalloc_addr; |
177 | struct page *page = NULL; | 199 | struct page *page = NULL; |
178 | pgd_t *pgd = pgd_offset_k(addr); | 200 | pgd_t *pgd = pgd_offset_k(addr); |
179 | pud_t *pud; | 201 | |
180 | pmd_t *pmd; | 202 | /* |
181 | pte_t *ptep, pte; | 203 | * XXX we might need to change this if we add VIRTUAL_BUG_ON for |
204 | * architectures that do not vmalloc module space | ||
205 | */ | ||
206 | VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); | ||
182 | 207 | ||
183 | if (!pgd_none(*pgd)) { | 208 | if (!pgd_none(*pgd)) { |
184 | pud = pud_offset(pgd, addr); | 209 | pud_t *pud = pud_offset(pgd, addr); |
185 | if (!pud_none(*pud)) { | 210 | if (!pud_none(*pud)) { |
186 | pmd = pmd_offset(pud, addr); | 211 | pmd_t *pmd = pmd_offset(pud, addr); |
187 | if (!pmd_none(*pmd)) { | 212 | if (!pmd_none(*pmd)) { |
213 | pte_t *ptep, pte; | ||
214 | |||
188 | ptep = pte_offset_map(pmd, addr); | 215 | ptep = pte_offset_map(pmd, addr); |
189 | pte = *ptep; | 216 | pte = *ptep; |
190 | if (pte_present(pte)) | 217 | if (pte_present(pte)) |
@@ -206,13 +233,751 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) | |||
206 | } | 233 | } |
207 | EXPORT_SYMBOL(vmalloc_to_pfn); | 234 | EXPORT_SYMBOL(vmalloc_to_pfn); |
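vmalloc_to_page() above walks pgd, pud, pmd and pte to reach the backing page. Below is a hedged kernel-context usage sketch that resolves every page of a vmalloc()ed buffer to a pfn; walk_vmalloc_buffer() is an invented helper and assumes the buffer really came from vmalloc().

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void walk_vmalloc_buffer(void *buf, size_t len)
{
        size_t off;

        for (off = 0; off < len; off += PAGE_SIZE) {
                struct page *page = vmalloc_to_page(buf + off);

                printk(KERN_INFO "vaddr %p -> pfn %lu\n",
                       buf + off, (unsigned long)page_to_pfn(page));
        }
}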
208 | 235 | ||
209 | static struct vm_struct * | 236 | |
210 | __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | 237 | /*** Global kva allocator ***/ |
211 | unsigned long end, int node, gfp_t gfp_mask, void *caller) | 238 | |
239 | #define VM_LAZY_FREE 0x01 | ||
240 | #define VM_LAZY_FREEING 0x02 | ||
241 | #define VM_VM_AREA 0x04 | ||
242 | |||
243 | struct vmap_area { | ||
244 | unsigned long va_start; | ||
245 | unsigned long va_end; | ||
246 | unsigned long flags; | ||
247 | struct rb_node rb_node; /* address sorted rbtree */ | ||
248 | struct list_head list; /* address sorted list */ | ||
249 | struct list_head purge_list; /* "lazy purge" list */ | ||
250 | void *private; | ||
251 | struct rcu_head rcu_head; | ||
252 | }; | ||
253 | |||
254 | static DEFINE_SPINLOCK(vmap_area_lock); | ||
255 | static struct rb_root vmap_area_root = RB_ROOT; | ||
256 | static LIST_HEAD(vmap_area_list); | ||
257 | |||
258 | static struct vmap_area *__find_vmap_area(unsigned long addr) | ||
212 | { | 259 | { |
213 | struct vm_struct **p, *tmp, *area; | 260 | struct rb_node *n = vmap_area_root.rb_node; |
214 | unsigned long align = 1; | 261 | |
262 | while (n) { | ||
263 | struct vmap_area *va; | ||
264 | |||
265 | va = rb_entry(n, struct vmap_area, rb_node); | ||
266 | if (addr < va->va_start) | ||
267 | n = n->rb_left; | ||
268 | else if (addr > va->va_start) | ||
269 | n = n->rb_right; | ||
270 | else | ||
271 | return va; | ||
272 | } | ||
273 | |||
274 | return NULL; | ||
275 | } | ||
276 | |||
277 | static void __insert_vmap_area(struct vmap_area *va) | ||
278 | { | ||
279 | struct rb_node **p = &vmap_area_root.rb_node; | ||
280 | struct rb_node *parent = NULL; | ||
281 | struct rb_node *tmp; | ||
282 | |||
283 | while (*p) { | ||
284 | struct vmap_area *tmp; | ||
285 | |||
286 | parent = *p; | ||
287 | tmp = rb_entry(parent, struct vmap_area, rb_node); | ||
288 | if (va->va_start < tmp->va_end) | ||
289 | p = &(*p)->rb_left; | ||
290 | else if (va->va_end > tmp->va_start) | ||
291 | p = &(*p)->rb_right; | ||
292 | else | ||
293 | BUG(); | ||
294 | } | ||
295 | |||
296 | rb_link_node(&va->rb_node, parent, p); | ||
297 | rb_insert_color(&va->rb_node, &vmap_area_root); | ||
298 | |||
299 | /* address-sort this list so it is usable like the vmlist */ | ||
300 | tmp = rb_prev(&va->rb_node); | ||
301 | if (tmp) { | ||
302 | struct vmap_area *prev; | ||
303 | prev = rb_entry(tmp, struct vmap_area, rb_node); | ||
304 | list_add_rcu(&va->list, &prev->list); | ||
305 | } else | ||
306 | list_add_rcu(&va->list, &vmap_area_list); | ||
307 | } | ||
308 | |||
309 | static void purge_vmap_area_lazy(void); | ||
310 | |||
311 | /* | ||
312 | * Allocate a region of KVA of the specified size and alignment, within the | ||
313 | * vstart and vend. | ||
314 | */ | ||
315 | static struct vmap_area *alloc_vmap_area(unsigned long size, | ||
316 | unsigned long align, | ||
317 | unsigned long vstart, unsigned long vend, | ||
318 | int node, gfp_t gfp_mask) | ||
319 | { | ||
320 | struct vmap_area *va; | ||
321 | struct rb_node *n; | ||
322 | unsigned long addr; | ||
323 | int purged = 0; | ||
324 | |||
325 | BUG_ON(size & ~PAGE_MASK); | ||
326 | |||
327 | addr = ALIGN(vstart, align); | ||
328 | |||
329 | va = kmalloc_node(sizeof(struct vmap_area), | ||
330 | gfp_mask & GFP_RECLAIM_MASK, node); | ||
331 | if (unlikely(!va)) | ||
332 | return ERR_PTR(-ENOMEM); | ||
333 | |||
334 | retry: | ||
335 | spin_lock(&vmap_area_lock); | ||
336 | /* XXX: could have a last_hole cache */ | ||
337 | n = vmap_area_root.rb_node; | ||
338 | if (n) { | ||
339 | struct vmap_area *first = NULL; | ||
340 | |||
341 | do { | ||
342 | struct vmap_area *tmp; | ||
343 | tmp = rb_entry(n, struct vmap_area, rb_node); | ||
344 | if (tmp->va_end >= addr) { | ||
345 | if (!first && tmp->va_start < addr + size) | ||
346 | first = tmp; | ||
347 | n = n->rb_left; | ||
348 | } else { | ||
349 | first = tmp; | ||
350 | n = n->rb_right; | ||
351 | } | ||
352 | } while (n); | ||
353 | |||
354 | if (!first) | ||
355 | goto found; | ||
356 | |||
357 | if (first->va_end < addr) { | ||
358 | n = rb_next(&first->rb_node); | ||
359 | if (n) | ||
360 | first = rb_entry(n, struct vmap_area, rb_node); | ||
361 | else | ||
362 | goto found; | ||
363 | } | ||
364 | |||
365 | while (addr + size >= first->va_start && addr + size <= vend) { | ||
366 | addr = ALIGN(first->va_end + PAGE_SIZE, align); | ||
367 | |||
368 | n = rb_next(&first->rb_node); | ||
369 | if (n) | ||
370 | first = rb_entry(n, struct vmap_area, rb_node); | ||
371 | else | ||
372 | goto found; | ||
373 | } | ||
374 | } | ||
375 | found: | ||
376 | if (addr + size > vend) { | ||
377 | spin_unlock(&vmap_area_lock); | ||
378 | if (!purged) { | ||
379 | purge_vmap_area_lazy(); | ||
380 | purged = 1; | ||
381 | goto retry; | ||
382 | } | ||
383 | if (printk_ratelimit()) | ||
384 | printk(KERN_WARNING "vmap allocation failed: " | ||
385 | "use vmalloc=<size> to increase size.\n"); | ||
386 | return ERR_PTR(-EBUSY); | ||
387 | } | ||
388 | |||
389 | BUG_ON(addr & (align-1)); | ||
390 | |||
391 | va->va_start = addr; | ||
392 | va->va_end = addr + size; | ||
393 | va->flags = 0; | ||
394 | __insert_vmap_area(va); | ||
395 | spin_unlock(&vmap_area_lock); | ||
396 | |||
397 | return va; | ||
398 | } | ||
399 | |||
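alloc_vmap_area() above is essentially a first-fit search over address-sorted allocated ranges, starting at the aligned vstart. The userspace model below captures that search with a sorted array instead of the rbtree; addresses and sizes are illustrative only.

#include <stdio.h>

/* Userspace model of the first-fit hole search in alloc_vmap_area():
 * walk allocated ranges in address order and place the new range in
 * the first gap that is large enough. */
struct range { unsigned long start, end; };

static unsigned long first_fit(const struct range *busy, int n,
                               unsigned long size,
                               unsigned long vstart, unsigned long vend)
{
        unsigned long addr = vstart;
        int i;

        for (i = 0; i < n; i++) {
                if (addr + size <= busy[i].start)
                        break;                   /* hole before this range */
                if (busy[i].end > addr)
                        addr = busy[i].end;      /* skip past it */
        }
        return (addr + size <= vend) ? addr : 0; /* 0: no room */
}

int main(void)
{
        struct range busy[] = { { 0x1000, 0x3000 }, { 0x4000, 0x6000 } };

        printf("0x%lx\n", first_fit(busy, 2, 0x1000, 0x1000, 0x10000));
        return 0;
}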
400 | static void rcu_free_va(struct rcu_head *head) | ||
401 | { | ||
402 | struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); | ||
403 | |||
404 | kfree(va); | ||
405 | } | ||
406 | |||
407 | static void __free_vmap_area(struct vmap_area *va) | ||
408 | { | ||
409 | BUG_ON(RB_EMPTY_NODE(&va->rb_node)); | ||
410 | rb_erase(&va->rb_node, &vmap_area_root); | ||
411 | RB_CLEAR_NODE(&va->rb_node); | ||
412 | list_del_rcu(&va->list); | ||
413 | |||
414 | call_rcu(&va->rcu_head, rcu_free_va); | ||
415 | } | ||
416 | |||
417 | /* | ||
418 | * Free a region of KVA allocated by alloc_vmap_area | ||
419 | */ | ||
420 | static void free_vmap_area(struct vmap_area *va) | ||
421 | { | ||
422 | spin_lock(&vmap_area_lock); | ||
423 | __free_vmap_area(va); | ||
424 | spin_unlock(&vmap_area_lock); | ||
425 | } | ||
426 | |||
427 | /* | ||
428 | * Clear the pagetable entries of a given vmap_area | ||
429 | */ | ||
430 | static void unmap_vmap_area(struct vmap_area *va) | ||
431 | { | ||
432 | vunmap_page_range(va->va_start, va->va_end); | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * lazy_max_pages is the maximum amount of virtual address space we gather up | ||
437 | * before attempting to purge with a TLB flush. | ||
438 | * | ||
439 | * There is a tradeoff here: a larger number will cover more kernel page tables | ||
440 | * and take slightly longer to purge, but it will linearly reduce the number of | ||
441 | * global TLB flushes that must be performed. It would seem natural to scale | ||
442 | * this number up linearly with the number of CPUs (because vmapping activity | ||
443 | * could also scale linearly with the number of CPUs), however it is likely | ||
444 | * that in practice, workloads might be constrained in other ways that mean | ||
445 | * vmap activity will not scale linearly with CPUs. Also, I want to be | ||
446 | * conservative and not introduce a big latency on huge systems, so go with | ||
447 | * a less aggressive log scale. It will still be an improvement over the old | ||
448 | * code, and it will be simple to change the scale factor if we find that it | ||
449 | * becomes a problem on bigger systems. | ||
450 | */ | ||
451 | static unsigned long lazy_max_pages(void) | ||
452 | { | ||
453 | unsigned int log; | ||
454 | |||
455 | log = fls(num_online_cpus()); | ||
456 | |||
457 | return log * (32UL * 1024 * 1024 / PAGE_SIZE); | ||
458 | } | ||
459 | |||
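A quick worked example of the threshold that lazy_max_pages() produces; the CPU count and page size are assumptions and fls_model() is a portable stand-in for the kernel's fls().

#include <stdio.h>

/* With 4 online CPUs and 4 KiB pages, fls(4) == 3, so up to 3 * 32 MiB
 * of stale vmap space is accumulated before a global TLB flush. */
static int fls_model(unsigned int x)
{
        int r = 0;

        while (x) {
                r++;
                x >>= 1;
        }
        return r;
}

int main(void)
{
        unsigned long page_size = 4096;
        unsigned int cpus = 4;
        unsigned long pages = fls_model(cpus) * (32UL * 1024 * 1024 / page_size);

        printf("purge threshold: %lu pages (%lu MiB)\n",
               pages, pages * page_size >> 20);
        return 0;
}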
460 | static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); | ||
461 | |||
462 | /* | ||
463 | * Purges all lazily-freed vmap areas. | ||
464 | * | ||
465 | * If sync is 0 then don't purge if there is already a purge in progress. | ||
466 | * If force_flush is 1, then flush kernel TLBs between *start and *end even | ||
467 | * if we found no lazy vmap areas to unmap (callers can use this to optimise | ||
468 | * their own TLB flushing). | ||
469 | * Returns with *start = min(*start, lowest purged address) | ||
470 | * *end = max(*end, highest purged address) | ||
471 | */ | ||
472 | static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | ||
473 | int sync, int force_flush) | ||
474 | { | ||
475 | static DEFINE_SPINLOCK(purge_lock); | ||
476 | LIST_HEAD(valist); | ||
477 | struct vmap_area *va; | ||
478 | int nr = 0; | ||
479 | |||
480 | /* | ||
481 | * If sync is 0 but force_flush is 1, we'll go sync anyway but callers | ||
482 | * should not expect such behaviour. This just simplifies locking for | ||
483 | * the case that isn't actually used at the moment anyway. | ||
484 | */ | ||
485 | if (!sync && !force_flush) { | ||
486 | if (!spin_trylock(&purge_lock)) | ||
487 | return; | ||
488 | } else | ||
489 | spin_lock(&purge_lock); | ||
490 | |||
491 | rcu_read_lock(); | ||
492 | list_for_each_entry_rcu(va, &vmap_area_list, list) { | ||
493 | if (va->flags & VM_LAZY_FREE) { | ||
494 | if (va->va_start < *start) | ||
495 | *start = va->va_start; | ||
496 | if (va->va_end > *end) | ||
497 | *end = va->va_end; | ||
498 | nr += (va->va_end - va->va_start) >> PAGE_SHIFT; | ||
499 | unmap_vmap_area(va); | ||
500 | list_add_tail(&va->purge_list, &valist); | ||
501 | va->flags |= VM_LAZY_FREEING; | ||
502 | va->flags &= ~VM_LAZY_FREE; | ||
503 | } | ||
504 | } | ||
505 | rcu_read_unlock(); | ||
506 | |||
507 | if (nr) { | ||
508 | BUG_ON(nr > atomic_read(&vmap_lazy_nr)); | ||
509 | atomic_sub(nr, &vmap_lazy_nr); | ||
510 | } | ||
511 | |||
512 | if (nr || force_flush) | ||
513 | flush_tlb_kernel_range(*start, *end); | ||
514 | |||
515 | if (nr) { | ||
516 | spin_lock(&vmap_area_lock); | ||
517 | list_for_each_entry(va, &valist, purge_list) | ||
518 | __free_vmap_area(va); | ||
519 | spin_unlock(&vmap_area_lock); | ||
520 | } | ||
521 | spin_unlock(&purge_lock); | ||
522 | } | ||
523 | |||
524 | /* | ||
525 | * Kick off a purge of the outstanding lazy areas. | ||
526 | */ | ||
527 | static void purge_vmap_area_lazy(void) | ||
528 | { | ||
529 | unsigned long start = ULONG_MAX, end = 0; | ||
530 | |||
531 | __purge_vmap_area_lazy(&start, &end, 0, 0); | ||
532 | } | ||
533 | |||
534 | /* | ||
535 | * Free and unmap a vmap area | ||
536 | */ | ||
537 | static void free_unmap_vmap_area(struct vmap_area *va) | ||
538 | { | ||
539 | va->flags |= VM_LAZY_FREE; | ||
540 | atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); | ||
541 | if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) | ||
542 | purge_vmap_area_lazy(); | ||
543 | } | ||
544 | |||
545 | static struct vmap_area *find_vmap_area(unsigned long addr) | ||
546 | { | ||
547 | struct vmap_area *va; | ||
548 | |||
549 | spin_lock(&vmap_area_lock); | ||
550 | va = __find_vmap_area(addr); | ||
551 | spin_unlock(&vmap_area_lock); | ||
552 | |||
553 | return va; | ||
554 | } | ||
555 | |||
556 | static void free_unmap_vmap_area_addr(unsigned long addr) | ||
557 | { | ||
558 | struct vmap_area *va; | ||
559 | |||
560 | va = find_vmap_area(addr); | ||
561 | BUG_ON(!va); | ||
562 | free_unmap_vmap_area(va); | ||
563 | } | ||
564 | |||
565 | |||
566 | /*** Per cpu kva allocator ***/ | ||
567 | |||
568 | /* | ||
569 | * vmap space is limited especially on 32 bit architectures. Ensure there is | ||
570 | * room for at least 16 percpu vmap blocks per CPU. | ||
571 | */ | ||
572 | /* | ||
573 | * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able | ||
574 | * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess | ||
575 | * instead (we just need a rough idea) | ||
576 | */ | ||
577 | #if BITS_PER_LONG == 32 | ||
578 | #define VMALLOC_SPACE (128UL*1024*1024) | ||
579 | #else | ||
580 | #define VMALLOC_SPACE (128UL*1024*1024*1024) | ||
581 | #endif | ||
582 | |||
583 | #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) | ||
584 | #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ | ||
585 | #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ | ||
586 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) | ||
587 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ | ||
588 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ | ||
589 | #define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ | ||
590 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ | ||
591 | VMALLOC_PAGES / NR_CPUS / 16)) | ||
592 | |||
593 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) | ||
594 | |||
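VMAP_BBMAP_BITS simply clamps VMALLOC_PAGES / NR_CPUS / 16 between VMAP_MAX_ALLOC*2 and 1024. Evaluating the same macros standalone, under the assumed 32-bit values above (4KB pages, the 128MB VMALLOC_SPACE guess), shows how the per-CPU block size shrinks as NR_CPUS grows:

    #include <stdio.h>

    #define PAGE_SIZE            4096UL                  /* assumption: 4KB pages   */
    #define VMALLOC_SPACE        (128UL * 1024 * 1024)   /* the 32-bit guess above  */
    #define VMALLOC_PAGES        (VMALLOC_SPACE / PAGE_SIZE)
    #define VMAP_MAX_ALLOC       32                      /* BITS_PER_LONG on 32-bit */
    #define VMAP_BBMAP_BITS_MAX  1024
    #define VMAP_BBMAP_BITS_MIN  (VMAP_MAX_ALLOC * 2)
    #define VMAP_MIN(x, y)       ((x) < (y) ? (x) : (y))
    #define VMAP_MAX(x, y)       ((x) > (y) ? (x) : (y))

    static unsigned long bbmap_bits(unsigned long nr_cpus)
    {
        return VMAP_MIN(VMAP_BBMAP_BITS_MAX,
                        VMAP_MAX(VMAP_BBMAP_BITS_MIN,
                                 VMALLOC_PAGES / nr_cpus / 16));
    }

    int main(void)
    {
        unsigned long cpus;

        for (cpus = 1; cpus <= 64; cpus *= 4) {
            unsigned long bits = bbmap_bits(cpus);

            printf("NR_CPUS=%-3lu -> %4lu bits per block (%lu KB blocks)\n",
                   cpus, bits, bits * PAGE_SIZE / 1024);
        }
        return 0;
    }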
595 | struct vmap_block_queue { | ||
596 | spinlock_t lock; | ||
597 | struct list_head free; | ||
598 | struct list_head dirty; | ||
599 | unsigned int nr_dirty; | ||
600 | }; | ||
601 | |||
602 | struct vmap_block { | ||
603 | spinlock_t lock; | ||
604 | struct vmap_area *va; | ||
605 | struct vmap_block_queue *vbq; | ||
606 | unsigned long free, dirty; | ||
607 | DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); | ||
608 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); | ||
609 | union { | ||
610 | struct { | ||
611 | struct list_head free_list; | ||
612 | struct list_head dirty_list; | ||
613 | }; | ||
614 | struct rcu_head rcu_head; | ||
615 | }; | ||
616 | }; | ||
617 | |||
618 | /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ | ||
619 | static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); | ||
620 | |||
621 | /* | ||
622 | * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block | ||
623 | * in the free path. Could get rid of this if we change the API to return a | ||
624 | * "cookie" from alloc, to be passed to free. But no big deal yet. | ||
625 | */ | ||
626 | static DEFINE_SPINLOCK(vmap_block_tree_lock); | ||
627 | static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); | ||
628 | |||
629 | /* | ||
630 | * We should probably have a fallback mechanism to allocate virtual memory | ||
631 | * out of partially filled vmap blocks. However vmap block sizing should be | ||
632 | * fairly reasonable according to the vmalloc size, so it shouldn't be a | ||
633 | * big problem. | ||
634 | */ | ||
635 | |||
636 | static unsigned long addr_to_vb_idx(unsigned long addr) | ||
637 | { | ||
638 | addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); | ||
639 | addr /= VMAP_BLOCK_SIZE; | ||
640 | return addr; | ||
641 | } | ||
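addr_to_vb_idx() maps every address inside one VMAP_BLOCK_SIZE-sized block to the same radix-tree key, measured from VMALLOC_START rounded down to a block boundary. A standalone sketch with hypothetical VMALLOC_START and block-size values:

    #include <stdio.h>

    #define VMAP_BLOCK_SIZE (1024UL * 4096)     /* assumption: 1024 pages of 4KB */
    #define VMALLOC_START   0xf7800000UL        /* hypothetical 32-bit layout    */

    static unsigned long addr_to_vb_idx(unsigned long addr)
    {
        addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE - 1);
        addr /= VMAP_BLOCK_SIZE;
        return addr;
    }

    int main(void)
    {
        unsigned long a;

        /* Every address inside one 4MB block yields the same index. */
        for (a = VMALLOC_START; a < VMALLOC_START + 3 * VMAP_BLOCK_SIZE;
             a += VMAP_BLOCK_SIZE / 2)
            printf("addr %#lx -> block index %lu\n", a, addr_to_vb_idx(a));
        return 0;
    }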
642 | |||
643 | static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | ||
644 | { | ||
645 | struct vmap_block_queue *vbq; | ||
646 | struct vmap_block *vb; | ||
647 | struct vmap_area *va; | ||
648 | unsigned long vb_idx; | ||
649 | int node, err; | ||
650 | |||
651 | node = numa_node_id(); | ||
652 | |||
653 | vb = kmalloc_node(sizeof(struct vmap_block), | ||
654 | gfp_mask & GFP_RECLAIM_MASK, node); | ||
655 | if (unlikely(!vb)) | ||
656 | return ERR_PTR(-ENOMEM); | ||
657 | |||
658 | va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, | ||
659 | VMALLOC_START, VMALLOC_END, | ||
660 | node, gfp_mask); | ||
661 | if (unlikely(IS_ERR(va))) { | ||
662 | kfree(vb); | ||
663 | return ERR_PTR(PTR_ERR(va)); | ||
664 | } | ||
665 | |||
666 | err = radix_tree_preload(gfp_mask); | ||
667 | if (unlikely(err)) { | ||
668 | kfree(vb); | ||
669 | free_vmap_area(va); | ||
670 | return ERR_PTR(err); | ||
671 | } | ||
672 | |||
673 | spin_lock_init(&vb->lock); | ||
674 | vb->va = va; | ||
675 | vb->free = VMAP_BBMAP_BITS; | ||
676 | vb->dirty = 0; | ||
677 | bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); | ||
678 | bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); | ||
679 | INIT_LIST_HEAD(&vb->free_list); | ||
680 | INIT_LIST_HEAD(&vb->dirty_list); | ||
681 | |||
682 | vb_idx = addr_to_vb_idx(va->va_start); | ||
683 | spin_lock(&vmap_block_tree_lock); | ||
684 | err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); | ||
685 | spin_unlock(&vmap_block_tree_lock); | ||
686 | BUG_ON(err); | ||
687 | radix_tree_preload_end(); | ||
688 | |||
689 | vbq = &get_cpu_var(vmap_block_queue); | ||
690 | vb->vbq = vbq; | ||
691 | spin_lock(&vbq->lock); | ||
692 | list_add(&vb->free_list, &vbq->free); | ||
693 | spin_unlock(&vbq->lock); | ||
694 | put_cpu_var(vmap_block_queue); | ||
695 | |||
696 | return vb; | ||
697 | } | ||
698 | |||
699 | static void rcu_free_vb(struct rcu_head *head) | ||
700 | { | ||
701 | struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); | ||
702 | |||
703 | kfree(vb); | ||
704 | } | ||
705 | |||
706 | static void free_vmap_block(struct vmap_block *vb) | ||
707 | { | ||
708 | struct vmap_block *tmp; | ||
709 | unsigned long vb_idx; | ||
710 | |||
711 | spin_lock(&vb->vbq->lock); | ||
712 | if (!list_empty(&vb->free_list)) | ||
713 | list_del(&vb->free_list); | ||
714 | if (!list_empty(&vb->dirty_list)) | ||
715 | list_del(&vb->dirty_list); | ||
716 | spin_unlock(&vb->vbq->lock); | ||
717 | |||
718 | vb_idx = addr_to_vb_idx(vb->va->va_start); | ||
719 | spin_lock(&vmap_block_tree_lock); | ||
720 | tmp = radix_tree_delete(&vmap_block_tree, vb_idx); | ||
721 | spin_unlock(&vmap_block_tree_lock); | ||
722 | BUG_ON(tmp != vb); | ||
723 | |||
724 | free_unmap_vmap_area(vb->va); | ||
725 | call_rcu(&vb->rcu_head, rcu_free_vb); | ||
726 | } | ||
727 | |||
728 | static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | ||
729 | { | ||
730 | struct vmap_block_queue *vbq; | ||
731 | struct vmap_block *vb; | ||
732 | unsigned long addr = 0; | ||
733 | unsigned int order; | ||
734 | |||
735 | BUG_ON(size & ~PAGE_MASK); | ||
736 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | ||
737 | order = get_order(size); | ||
738 | |||
739 | again: | ||
740 | rcu_read_lock(); | ||
741 | vbq = &get_cpu_var(vmap_block_queue); | ||
742 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | ||
743 | int i; | ||
744 | |||
745 | spin_lock(&vb->lock); | ||
746 | i = bitmap_find_free_region(vb->alloc_map, | ||
747 | VMAP_BBMAP_BITS, order); | ||
748 | |||
749 | if (i >= 0) { | ||
750 | addr = vb->va->va_start + (i << PAGE_SHIFT); | ||
751 | BUG_ON(addr_to_vb_idx(addr) != | ||
752 | addr_to_vb_idx(vb->va->va_start)); | ||
753 | vb->free -= 1UL << order; | ||
754 | if (vb->free == 0) { | ||
755 | spin_lock(&vbq->lock); | ||
756 | list_del_init(&vb->free_list); | ||
757 | spin_unlock(&vbq->lock); | ||
758 | } | ||
759 | spin_unlock(&vb->lock); | ||
760 | break; | ||
761 | } | ||
762 | spin_unlock(&vb->lock); | ||
763 | } | ||
764 | put_cpu_var(vmap_block_queue); | ||
765 | rcu_read_unlock(); | ||
766 | |||
767 | if (!addr) { | ||
768 | vb = new_vmap_block(gfp_mask); | ||
769 | if (IS_ERR(vb)) | ||
770 | return vb; | ||
771 | goto again; | ||
772 | } | ||
773 | |||
774 | return (void *)addr; | ||
775 | } | ||
776 | |||
777 | static void vb_free(const void *addr, unsigned long size) | ||
778 | { | ||
779 | unsigned long offset; | ||
780 | unsigned long vb_idx; | ||
781 | unsigned int order; | ||
782 | struct vmap_block *vb; | ||
783 | |||
784 | BUG_ON(size & ~PAGE_MASK); | ||
785 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | ||
786 | order = get_order(size); | ||
787 | |||
788 | offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); | ||
789 | |||
790 | vb_idx = addr_to_vb_idx((unsigned long)addr); | ||
791 | rcu_read_lock(); | ||
792 | vb = radix_tree_lookup(&vmap_block_tree, vb_idx); | ||
793 | rcu_read_unlock(); | ||
794 | BUG_ON(!vb); | ||
795 | |||
796 | spin_lock(&vb->lock); | ||
797 | bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); | ||
798 | if (!vb->dirty) { | ||
799 | spin_lock(&vb->vbq->lock); | ||
800 | list_add(&vb->dirty_list, &vb->vbq->dirty); | ||
801 | spin_unlock(&vb->vbq->lock); | ||
802 | } | ||
803 | vb->dirty += 1UL << order; | ||
804 | if (vb->dirty == VMAP_BBMAP_BITS) { | ||
805 | BUG_ON(vb->free || !list_empty(&vb->free_list)); | ||
806 | spin_unlock(&vb->lock); | ||
807 | free_vmap_block(vb); | ||
808 | } else | ||
809 | spin_unlock(&vb->lock); | ||
810 | } | ||
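vb_alloc() and vb_free() round every request up to a power-of-two number of pages via get_order() and account for it as 1 << order bits in the per-block bitmaps, which is why anything above VMAP_MAX_ALLOC pages bypasses the blocks entirely. A standalone sketch of that size-to-order-to-bits bookkeeping (get_order_() is a simplified stand-in for the kernel helper):

    #include <stdio.h>

    #define PAGE_SHIFT      12                  /* assumption: 4KB pages   */
    #define PAGE_SIZE       (1UL << PAGE_SHIFT)
    #define VMAP_MAX_ALLOC  64                  /* BITS_PER_LONG on 64-bit */

    /* Smallest order such that (1 << order) pages cover 'size' bytes. */
    static unsigned int get_order_(unsigned long size)
    {
        unsigned long pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned int order = 0;

        while ((1UL << order) < pages)
            order++;
        return order;
    }

    int main(void)
    {
        unsigned long sizes[] = { PAGE_SIZE, 3 * PAGE_SIZE, 16 * PAGE_SIZE,
                                  VMAP_MAX_ALLOC * PAGE_SIZE };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
            unsigned int order = get_order_(sizes[i]);

            printf("size %6lu bytes -> order %u -> %lu bitmap bits\n",
                   sizes[i], order, 1UL << order);
        }
        return 0;
    }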
811 | |||
812 | /** | ||
813 | * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer | ||
814 | * | ||
815 | * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily | ||
816 | * to amortize TLB flushing overheads. What this means is that any page you | ||
817 | * have now may, in a former life, have been mapped into a kernel virtual | ||
818 | * address by the vmap layer, and so there might be some CPUs with TLB entries | ||
819 | * still referencing that page (in addition to the regular 1:1 kernel mapping). | ||
820 | * | ||
821 | * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can | ||
822 | * be sure that none of the pages we have control over will have any aliases | ||
823 | * from the vmap layer. | ||
824 | */ | ||
825 | void vm_unmap_aliases(void) | ||
826 | { | ||
827 | unsigned long start = ULONG_MAX, end = 0; | ||
828 | int cpu; | ||
829 | int flush = 0; | ||
830 | |||
831 | for_each_possible_cpu(cpu) { | ||
832 | struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); | ||
833 | struct vmap_block *vb; | ||
834 | |||
835 | rcu_read_lock(); | ||
836 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | ||
837 | int i; | ||
838 | |||
839 | spin_lock(&vb->lock); | ||
840 | i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); | ||
841 | while (i < VMAP_BBMAP_BITS) { | ||
842 | unsigned long s, e; | ||
843 | int j; | ||
844 | j = find_next_zero_bit(vb->dirty_map, | ||
845 | VMAP_BBMAP_BITS, i); | ||
846 | |||
847 | s = vb->va->va_start + (i << PAGE_SHIFT); | ||
848 | e = vb->va->va_start + (j << PAGE_SHIFT); | ||
849 | vunmap_page_range(s, e); | ||
850 | flush = 1; | ||
851 | |||
852 | if (s < start) | ||
853 | start = s; | ||
854 | if (e > end) | ||
855 | end = e; | ||
856 | |||
857 | i = j; | ||
858 | i = find_next_bit(vb->dirty_map, | ||
859 | VMAP_BBMAP_BITS, i); | ||
860 | } | ||
861 | spin_unlock(&vb->lock); | ||
862 | } | ||
863 | rcu_read_unlock(); | ||
864 | } | ||
865 | |||
866 | __purge_vmap_area_lazy(&start, &end, 1, flush); | ||
867 | } | ||
868 | EXPORT_SYMBOL_GPL(vm_unmap_aliases); | ||
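The dirty-map walk in vm_unmap_aliases() extracts each maximal run of set bits as an [i, j) page range, unmaps it, and folds it into one global [start, end) span so a single TLB flush covers everything. A standalone sketch of the same run extraction over a small hypothetical bitmap (next_bit() is a toy stand-in for find_next_bit()/find_next_zero_bit()):

    #include <stdio.h>

    #define NBITS 32

    /* Next bit at or after 'start' whose value equals 'want_set'; NBITS if none. */
    static int next_bit(unsigned long map, int start, int want_set)
    {
        int i;

        for (i = start; i < NBITS; i++)
            if (!!(map & (1UL << i)) == want_set)
                return i;
        return NBITS;
    }

    int main(void)
    {
        /* Hypothetical dirty map: pages 2-4 and 9-10 were freed via vb_free(). */
        unsigned long dirty_map = (0x7UL << 2) | (0x3UL << 9);
        int i = next_bit(dirty_map, 0, 1);

        while (i < NBITS) {
            int j = next_bit(dirty_map, i, 0);

            printf("unmap and flush pages [%d, %d)\n", i, j);
            i = next_bit(dirty_map, j, 1);
        }
        return 0;
    }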
869 | |||
870 | /** | ||
871 | * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram | ||
872 | * @mem: the pointer returned by vm_map_ram | ||
873 | * @count: the count passed to that vm_map_ram call (cannot unmap partial) | ||
874 | */ | ||
875 | void vm_unmap_ram(const void *mem, unsigned int count) | ||
876 | { | ||
877 | unsigned long size = count << PAGE_SHIFT; | ||
878 | unsigned long addr = (unsigned long)mem; | ||
879 | |||
880 | BUG_ON(!addr); | ||
881 | BUG_ON(addr < VMALLOC_START); | ||
882 | BUG_ON(addr > VMALLOC_END); | ||
883 | BUG_ON(addr & (PAGE_SIZE-1)); | ||
884 | |||
885 | debug_check_no_locks_freed(mem, size); | ||
886 | |||
887 | if (likely(count <= VMAP_MAX_ALLOC)) | ||
888 | vb_free(mem, size); | ||
889 | else | ||
890 | free_unmap_vmap_area_addr(addr); | ||
891 | } | ||
892 | EXPORT_SYMBOL(vm_unmap_ram); | ||
893 | |||
894 | /** | ||
895 | * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) | ||
896 | * @pages: an array of pointers to the pages to be mapped | ||
897 | * @count: number of pages | ||
898 | * @node: prefer to allocate data structures on this node | ||
899 | * @prot: memory protection to use. PAGE_KERNEL for regular RAM | ||
900 | * @returns: a pointer to the address that has been mapped, or NULL on failure | ||
901 | */ | ||
902 | void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) | ||
903 | { | ||
904 | unsigned long size = count << PAGE_SHIFT; | ||
215 | unsigned long addr; | 905 | unsigned long addr; |
906 | void *mem; | ||
907 | |||
908 | if (likely(count <= VMAP_MAX_ALLOC)) { | ||
909 | mem = vb_alloc(size, GFP_KERNEL); | ||
910 | if (IS_ERR(mem)) | ||
911 | return NULL; | ||
912 | addr = (unsigned long)mem; | ||
913 | } else { | ||
914 | struct vmap_area *va; | ||
915 | va = alloc_vmap_area(size, PAGE_SIZE, | ||
916 | VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); | ||
917 | if (IS_ERR(va)) | ||
918 | return NULL; | ||
919 | |||
920 | addr = va->va_start; | ||
921 | mem = (void *)addr; | ||
922 | } | ||
923 | if (vmap_page_range(addr, addr + size, prot, pages) < 0) { | ||
924 | vm_unmap_ram(mem, count); | ||
925 | return NULL; | ||
926 | } | ||
927 | return mem; | ||
928 | } | ||
929 | EXPORT_SYMBOL(vm_map_ram); | ||
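A minimal sketch of how a caller might use this pair of interfaces, assuming the caller already owns the pages. example_map_ram() and its error handling are hypothetical; only vm_map_ram(), vm_unmap_ram() and the page allocator calls are existing kernel interfaces:

    #include <linux/errno.h>
    #include <linux/gfp.h>
    #include <linux/mm.h>
    #include <linux/string.h>
    #include <linux/vmalloc.h>

    static int example_map_ram(void)
    {
        struct page *pages[4];
        void *va;
        int ret = -ENOMEM;
        int i;

        for (i = 0; i < 4; i++) {
            pages[i] = alloc_page(GFP_KERNEL);
            if (!pages[i])
                goto out_free;
        }

        /* 4 <= VMAP_MAX_ALLOC, so this is served from the per-cpu vmap blocks */
        va = vm_map_ram(pages, 4, -1, PAGE_KERNEL);
        if (!va)
            goto out_free;

        memset(va, 0, 4 * PAGE_SIZE);   /* pages are now virtually contiguous */

        vm_unmap_ram(va, 4);            /* same count as mapped; unmap is lazy */
        ret = 0;

    out_free:
        while (--i >= 0)
            __free_page(pages[i]);
        return ret;
    }

Because the unmap is lazy, a caller that needs the kernel aliases gone right away, for instance before changing the pages' caching attributes or handing them to a device, would follow up with vm_unmap_aliases().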
930 | |||
931 | void __init vmalloc_init(void) | ||
932 | { | ||
933 | int i; | ||
934 | |||
935 | for_each_possible_cpu(i) { | ||
936 | struct vmap_block_queue *vbq; | ||
937 | |||
938 | vbq = &per_cpu(vmap_block_queue, i); | ||
939 | spin_lock_init(&vbq->lock); | ||
940 | INIT_LIST_HEAD(&vbq->free); | ||
941 | INIT_LIST_HEAD(&vbq->dirty); | ||
942 | vbq->nr_dirty = 0; | ||
943 | } | ||
944 | } | ||
945 | |||
946 | void unmap_kernel_range(unsigned long addr, unsigned long size) | ||
947 | { | ||
948 | unsigned long end = addr + size; | ||
949 | vunmap_page_range(addr, end); | ||
950 | flush_tlb_kernel_range(addr, end); | ||
951 | } | ||
952 | |||
953 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | ||
954 | { | ||
955 | unsigned long addr = (unsigned long)area->addr; | ||
956 | unsigned long end = addr + area->size - PAGE_SIZE; | ||
957 | int err; | ||
958 | |||
959 | err = vmap_page_range(addr, end, prot, *pages); | ||
960 | if (err > 0) { | ||
961 | *pages += err; | ||
962 | err = 0; | ||
963 | } | ||
964 | |||
965 | return err; | ||
966 | } | ||
967 | EXPORT_SYMBOL_GPL(map_vm_area); | ||
968 | |||
969 | /*** Old vmalloc interfaces ***/ | ||
970 | DEFINE_RWLOCK(vmlist_lock); | ||
971 | struct vm_struct *vmlist; | ||
972 | |||
973 | static struct vm_struct *__get_vm_area_node(unsigned long size, | ||
974 | unsigned long flags, unsigned long start, unsigned long end, | ||
975 | int node, gfp_t gfp_mask, void *caller) | ||
976 | { | ||
977 | struct vmap_area *va; | ||
978 | struct vm_struct *area; | ||
979 | struct vm_struct *tmp, **p; | ||
980 | unsigned long align = 1; | ||
216 | 981 | ||
217 | BUG_ON(in_interrupt()); | 982 | BUG_ON(in_interrupt()); |
218 | if (flags & VM_IOREMAP) { | 983 | if (flags & VM_IOREMAP) { |
@@ -225,13 +990,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | |||
225 | 990 | ||
226 | align = 1ul << bit; | 991 | align = 1ul << bit; |
227 | } | 992 | } |
228 | addr = ALIGN(start, align); | 993 | |
229 | size = PAGE_ALIGN(size); | 994 | size = PAGE_ALIGN(size); |
230 | if (unlikely(!size)) | 995 | if (unlikely(!size)) |
231 | return NULL; | 996 | return NULL; |
232 | 997 | ||
233 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); | 998 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); |
234 | |||
235 | if (unlikely(!area)) | 999 | if (unlikely(!area)) |
236 | return NULL; | 1000 | return NULL; |
237 | 1001 | ||
@@ -240,48 +1004,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | |||
240 | */ | 1004 | */ |
241 | size += PAGE_SIZE; | 1005 | size += PAGE_SIZE; |
242 | 1006 | ||
243 | write_lock(&vmlist_lock); | 1007 | va = alloc_vmap_area(size, align, start, end, node, gfp_mask); |
244 | for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { | 1008 | if (IS_ERR(va)) { |
245 | if ((unsigned long)tmp->addr < addr) { | 1009 | kfree(area); |
246 | if((unsigned long)tmp->addr + tmp->size >= addr) | 1010 | return NULL; |
247 | addr = ALIGN(tmp->size + | ||
248 | (unsigned long)tmp->addr, align); | ||
249 | continue; | ||
250 | } | ||
251 | if ((size + addr) < addr) | ||
252 | goto out; | ||
253 | if (size + addr <= (unsigned long)tmp->addr) | ||
254 | goto found; | ||
255 | addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align); | ||
256 | if (addr > end - size) | ||
257 | goto out; | ||
258 | } | 1011 | } |
259 | if ((size + addr) < addr) | ||
260 | goto out; | ||
261 | if (addr > end - size) | ||
262 | goto out; | ||
263 | |||
264 | found: | ||
265 | area->next = *p; | ||
266 | *p = area; | ||
267 | 1012 | ||
268 | area->flags = flags; | 1013 | area->flags = flags; |
269 | area->addr = (void *)addr; | 1014 | area->addr = (void *)va->va_start; |
270 | area->size = size; | 1015 | area->size = size; |
271 | area->pages = NULL; | 1016 | area->pages = NULL; |
272 | area->nr_pages = 0; | 1017 | area->nr_pages = 0; |
273 | area->phys_addr = 0; | 1018 | area->phys_addr = 0; |
274 | area->caller = caller; | 1019 | area->caller = caller; |
1020 | va->private = area; | ||
1021 | va->flags |= VM_VM_AREA; | ||
1022 | |||
1023 | write_lock(&vmlist_lock); | ||
1024 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { | ||
1025 | if (tmp->addr >= area->addr) | ||
1026 | break; | ||
1027 | } | ||
1028 | area->next = *p; | ||
1029 | *p = area; | ||
275 | write_unlock(&vmlist_lock); | 1030 | write_unlock(&vmlist_lock); |
276 | 1031 | ||
277 | return area; | 1032 | return area; |
278 | |||
279 | out: | ||
280 | write_unlock(&vmlist_lock); | ||
281 | kfree(area); | ||
282 | if (printk_ratelimit()) | ||
283 | printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n"); | ||
284 | return NULL; | ||
285 | } | 1033 | } |
286 | 1034 | ||
287 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 1035 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
@@ -321,39 +1069,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, | |||
321 | gfp_mask, __builtin_return_address(0)); | 1069 | gfp_mask, __builtin_return_address(0)); |
322 | } | 1070 | } |
323 | 1071 | ||
324 | /* Caller must hold vmlist_lock */ | 1072 | static struct vm_struct *find_vm_area(const void *addr) |
325 | static struct vm_struct *__find_vm_area(const void *addr) | ||
326 | { | 1073 | { |
327 | struct vm_struct *tmp; | 1074 | struct vmap_area *va; |
328 | 1075 | ||
329 | for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { | 1076 | va = find_vmap_area((unsigned long)addr); |
330 | if (tmp->addr == addr) | 1077 | if (va && va->flags & VM_VM_AREA) |
331 | break; | 1078 | return va->private; |
332 | } | ||
333 | 1079 | ||
334 | return tmp; | ||
335 | } | ||
336 | |||
337 | /* Caller must hold vmlist_lock */ | ||
338 | static struct vm_struct *__remove_vm_area(const void *addr) | ||
339 | { | ||
340 | struct vm_struct **p, *tmp; | ||
341 | |||
342 | for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) { | ||
343 | if (tmp->addr == addr) | ||
344 | goto found; | ||
345 | } | ||
346 | return NULL; | 1080 | return NULL; |
347 | |||
348 | found: | ||
349 | unmap_vm_area(tmp); | ||
350 | *p = tmp->next; | ||
351 | |||
352 | /* | ||
353 | * Remove the guard page. | ||
354 | */ | ||
355 | tmp->size -= PAGE_SIZE; | ||
356 | return tmp; | ||
357 | } | 1081 | } |
358 | 1082 | ||
359 | /** | 1083 | /** |
@@ -366,11 +1090,24 @@ found: | |||
366 | */ | 1090 | */ |
367 | struct vm_struct *remove_vm_area(const void *addr) | 1091 | struct vm_struct *remove_vm_area(const void *addr) |
368 | { | 1092 | { |
369 | struct vm_struct *v; | 1093 | struct vmap_area *va; |
370 | write_lock(&vmlist_lock); | 1094 | |
371 | v = __remove_vm_area(addr); | 1095 | va = find_vmap_area((unsigned long)addr); |
372 | write_unlock(&vmlist_lock); | 1096 | if (va && va->flags & VM_VM_AREA) { |
373 | return v; | 1097 | struct vm_struct *vm = va->private; |
1098 | struct vm_struct *tmp, **p; | ||
1099 | free_unmap_vmap_area(va); | ||
1100 | vm->size -= PAGE_SIZE; | ||
1101 | |||
1102 | write_lock(&vmlist_lock); | ||
1103 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) | ||
1104 | ; | ||
1105 | *p = tmp->next; | ||
1106 | write_unlock(&vmlist_lock); | ||
1107 | |||
1108 | return vm; | ||
1109 | } | ||
1110 | return NULL; | ||
374 | } | 1111 | } |
375 | 1112 | ||
376 | static void __vunmap(const void *addr, int deallocate_pages) | 1113 | static void __vunmap(const void *addr, int deallocate_pages) |
@@ -381,16 +1118,14 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
381 | return; | 1118 | return; |
382 | 1119 | ||
383 | if ((PAGE_SIZE-1) & (unsigned long)addr) { | 1120 | if ((PAGE_SIZE-1) & (unsigned long)addr) { |
384 | printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); | 1121 | WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); |
385 | WARN_ON(1); | ||
386 | return; | 1122 | return; |
387 | } | 1123 | } |
388 | 1124 | ||
389 | area = remove_vm_area(addr); | 1125 | area = remove_vm_area(addr); |
390 | if (unlikely(!area)) { | 1126 | if (unlikely(!area)) { |
391 | printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", | 1127 | WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", |
392 | addr); | 1128 | addr); |
393 | WARN_ON(1); | ||
394 | return; | 1129 | return; |
395 | } | 1130 | } |
396 | 1131 | ||
@@ -482,6 +1217,8 @@ void *vmap(struct page **pages, unsigned int count, | |||
482 | } | 1217 | } |
483 | EXPORT_SYMBOL(vmap); | 1218 | EXPORT_SYMBOL(vmap); |
484 | 1219 | ||
1220 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | ||
1221 | int node, void *caller); | ||
485 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1222 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
486 | pgprot_t prot, int node, void *caller) | 1223 | pgprot_t prot, int node, void *caller) |
487 | { | 1224 | { |
@@ -608,10 +1345,8 @@ void *vmalloc_user(unsigned long size) | |||
608 | 1345 | ||
609 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); | 1346 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); |
610 | if (ret) { | 1347 | if (ret) { |
611 | write_lock(&vmlist_lock); | 1348 | area = find_vm_area(ret); |
612 | area = __find_vm_area(ret); | ||
613 | area->flags |= VM_USERMAP; | 1349 | area->flags |= VM_USERMAP; |
614 | write_unlock(&vmlist_lock); | ||
615 | } | 1350 | } |
616 | return ret; | 1351 | return ret; |
617 | } | 1352 | } |
@@ -691,10 +1426,8 @@ void *vmalloc_32_user(unsigned long size) | |||
691 | 1426 | ||
692 | ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); | 1427 | ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); |
693 | if (ret) { | 1428 | if (ret) { |
694 | write_lock(&vmlist_lock); | 1429 | area = find_vm_area(ret); |
695 | area = __find_vm_area(ret); | ||
696 | area->flags |= VM_USERMAP; | 1430 | area->flags |= VM_USERMAP; |
697 | write_unlock(&vmlist_lock); | ||
698 | } | 1431 | } |
699 | return ret; | 1432 | return ret; |
700 | } | 1433 | } |
@@ -795,26 +1528,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
795 | struct vm_struct *area; | 1528 | struct vm_struct *area; |
796 | unsigned long uaddr = vma->vm_start; | 1529 | unsigned long uaddr = vma->vm_start; |
797 | unsigned long usize = vma->vm_end - vma->vm_start; | 1530 | unsigned long usize = vma->vm_end - vma->vm_start; |
798 | int ret; | ||
799 | 1531 | ||
800 | if ((PAGE_SIZE-1) & (unsigned long)addr) | 1532 | if ((PAGE_SIZE-1) & (unsigned long)addr) |
801 | return -EINVAL; | 1533 | return -EINVAL; |
802 | 1534 | ||
803 | read_lock(&vmlist_lock); | 1535 | area = find_vm_area(addr); |
804 | area = __find_vm_area(addr); | ||
805 | if (!area) | 1536 | if (!area) |
806 | goto out_einval_locked; | 1537 | return -EINVAL; |
807 | 1538 | ||
808 | if (!(area->flags & VM_USERMAP)) | 1539 | if (!(area->flags & VM_USERMAP)) |
809 | goto out_einval_locked; | 1540 | return -EINVAL; |
810 | 1541 | ||
811 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) | 1542 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) |
812 | goto out_einval_locked; | 1543 | return -EINVAL; |
813 | read_unlock(&vmlist_lock); | ||
814 | 1544 | ||
815 | addr += pgoff << PAGE_SHIFT; | 1545 | addr += pgoff << PAGE_SHIFT; |
816 | do { | 1546 | do { |
817 | struct page *page = vmalloc_to_page(addr); | 1547 | struct page *page = vmalloc_to_page(addr); |
1548 | int ret; | ||
1549 | |||
818 | ret = vm_insert_page(vma, uaddr, page); | 1550 | ret = vm_insert_page(vma, uaddr, page); |
819 | if (ret) | 1551 | if (ret) |
820 | return ret; | 1552 | return ret; |
@@ -827,11 +1559,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
827 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ | 1559 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ |
828 | vma->vm_flags |= VM_RESERVED; | 1560 | vma->vm_flags |= VM_RESERVED; |
829 | 1561 | ||
830 | return ret; | 1562 | return 0; |
831 | |||
832 | out_einval_locked: | ||
833 | read_unlock(&vmlist_lock); | ||
834 | return -EINVAL; | ||
835 | } | 1563 | } |
836 | EXPORT_SYMBOL(remap_vmalloc_range); | 1564 | EXPORT_SYMBOL(remap_vmalloc_range); |
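The usual pairing for remap_vmalloc_range() is a buffer obtained from vmalloc_user(), since that is what sets VM_USERMAP (now located through find_vm_area() above). A hedged sketch of a driver mmap handler; the example_* names are hypothetical, only vmalloc_user() and remap_vmalloc_range() come from the API shown here:

    #include <linux/errno.h>
    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/vmalloc.h>

    static void *example_buf;   /* hypothetical driver buffer */

    static int example_alloc_buf(unsigned long size)
    {
        /* vmalloc_user() zeroes the memory and marks the area VM_USERMAP */
        example_buf = vmalloc_user(size);
        return example_buf ? 0 : -ENOMEM;
    }

    static int example_mmap(struct file *file, struct vm_area_struct *vma)
    {
        /* Fails with -EINVAL unless the area has VM_USERMAP and the
         * requested window fits inside it (the checks shown above). */
        return remap_vmalloc_range(vma, example_buf, vma->vm_pgoff);
    }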
837 | 1565 | ||
@@ -931,6 +1659,25 @@ static void s_stop(struct seq_file *m, void *p) | |||
931 | read_unlock(&vmlist_lock); | 1659 | read_unlock(&vmlist_lock); |
932 | } | 1660 | } |
933 | 1661 | ||
1662 | static void show_numa_info(struct seq_file *m, struct vm_struct *v) | ||
1663 | { | ||
1664 | if (NUMA_BUILD) { | ||
1665 | unsigned int nr, *counters = m->private; | ||
1666 | |||
1667 | if (!counters) | ||
1668 | return; | ||
1669 | |||
1670 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); | ||
1671 | |||
1672 | for (nr = 0; nr < v->nr_pages; nr++) | ||
1673 | counters[page_to_nid(v->pages[nr])]++; | ||
1674 | |||
1675 | for_each_node_state(nr, N_HIGH_MEMORY) | ||
1676 | if (counters[nr]) | ||
1677 | seq_printf(m, " N%u=%u", nr, counters[nr]); | ||
1678 | } | ||
1679 | } | ||
1680 | |||
934 | static int s_show(struct seq_file *m, void *p) | 1681 | static int s_show(struct seq_file *m, void *p) |
935 | { | 1682 | { |
936 | struct vm_struct *v = p; | 1683 | struct vm_struct *v = p; |
@@ -967,15 +1714,46 @@ static int s_show(struct seq_file *m, void *p) | |||
967 | if (v->flags & VM_VPAGES) | 1714 | if (v->flags & VM_VPAGES) |
968 | seq_printf(m, " vpages"); | 1715 | seq_printf(m, " vpages"); |
969 | 1716 | ||
1717 | show_numa_info(m, v); | ||
970 | seq_putc(m, '\n'); | 1718 | seq_putc(m, '\n'); |
971 | return 0; | 1719 | return 0; |
972 | } | 1720 | } |
973 | 1721 | ||
974 | const struct seq_operations vmalloc_op = { | 1722 | static const struct seq_operations vmalloc_op = { |
975 | .start = s_start, | 1723 | .start = s_start, |
976 | .next = s_next, | 1724 | .next = s_next, |
977 | .stop = s_stop, | 1725 | .stop = s_stop, |
978 | .show = s_show, | 1726 | .show = s_show, |
979 | }; | 1727 | }; |
1728 | |||
1729 | static int vmalloc_open(struct inode *inode, struct file *file) | ||
1730 | { | ||
1731 | unsigned int *ptr = NULL; | ||
1732 | int ret; | ||
1733 | |||
1734 | if (NUMA_BUILD) | ||
1735 | ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); | ||
1736 | ret = seq_open(file, &vmalloc_op); | ||
1737 | if (!ret) { | ||
1738 | struct seq_file *m = file->private_data; | ||
1739 | m->private = ptr; | ||
1740 | } else | ||
1741 | kfree(ptr); | ||
1742 | return ret; | ||
1743 | } | ||
1744 | |||
1745 | static const struct file_operations proc_vmalloc_operations = { | ||
1746 | .open = vmalloc_open, | ||
1747 | .read = seq_read, | ||
1748 | .llseek = seq_lseek, | ||
1749 | .release = seq_release_private, | ||
1750 | }; | ||
1751 | |||
1752 | static int __init proc_vmalloc_init(void) | ||
1753 | { | ||
1754 | proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations); | ||
1755 | return 0; | ||
1756 | } | ||
1757 | module_init(proc_vmalloc_init); | ||
980 | #endif | 1758 | #endif |
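proc_vmalloc_init() above turns the previously exported vmalloc_op seq_file into an internally registered /proc/vmallocinfo, and show_numa_info() appends per-node "N<id>=<count>" columns to each entry. A small userspace reader (plain C; it assumes nothing beyond the file existing and being readable, which normally requires root given S_IRUSR):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char line[512];
        FILE *f = fopen("/proc/vmallocinfo", "r");

        if (!f) {
            perror("fopen /proc/vmallocinfo");
            return 1;
        }

        /* Show only entries that report a page count or node 0 usage. */
        while (fgets(line, sizeof(line), f)) {
            if (strstr(line, "pages=") || strstr(line, " N0="))
                fputs(line, stdout);
        }

        fclose(f);
        return 0;
    }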
981 | 1759 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index 967d30ccd92b..3b5860294bb6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -38,6 +38,8 @@ | |||
38 | #include <linux/kthread.h> | 38 | #include <linux/kthread.h> |
39 | #include <linux/freezer.h> | 39 | #include <linux/freezer.h> |
40 | #include <linux/memcontrol.h> | 40 | #include <linux/memcontrol.h> |
41 | #include <linux/delayacct.h> | ||
42 | #include <linux/sysctl.h> | ||
41 | 43 | ||
42 | #include <asm/tlbflush.h> | 44 | #include <asm/tlbflush.h> |
43 | #include <asm/div64.h> | 45 | #include <asm/div64.h> |
@@ -77,7 +79,7 @@ struct scan_control { | |||
77 | unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, | 79 | unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, |
78 | unsigned long *scanned, int order, int mode, | 80 | unsigned long *scanned, int order, int mode, |
79 | struct zone *z, struct mem_cgroup *mem_cont, | 81 | struct zone *z, struct mem_cgroup *mem_cont, |
80 | int active); | 82 | int active, int file); |
81 | }; | 83 | }; |
82 | 84 | ||
83 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 85 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
@@ -390,17 +392,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
390 | } | 392 | } |
391 | 393 | ||
392 | /* | 394 | /* |
393 | * Attempt to detach a locked page from its ->mapping. If it is dirty or if | 395 | * Same as remove_mapping, but if the page is removed from the mapping, it |
394 | * someone else has a ref on the page, abort and return 0. If it was | 396 | * gets returned with a refcount of 0. |
395 | * successfully detached, return 1. Assumes the caller has a single ref on | ||
396 | * this page. | ||
397 | */ | 397 | */ |
398 | int remove_mapping(struct address_space *mapping, struct page *page) | 398 | static int __remove_mapping(struct address_space *mapping, struct page *page) |
399 | { | 399 | { |
400 | BUG_ON(!PageLocked(page)); | 400 | BUG_ON(!PageLocked(page)); |
401 | BUG_ON(mapping != page_mapping(page)); | 401 | BUG_ON(mapping != page_mapping(page)); |
402 | 402 | ||
403 | write_lock_irq(&mapping->tree_lock); | 403 | spin_lock_irq(&mapping->tree_lock); |
404 | /* | 404 | /* |
405 | * The non racy check for a busy page. | 405 | * The non racy check for a busy page. |
406 | * | 406 | * |
@@ -426,32 +426,131 @@ int remove_mapping(struct address_space *mapping, struct page *page) | |||
426 | * Note that if SetPageDirty is always performed via set_page_dirty, | 426 | * Note that if SetPageDirty is always performed via set_page_dirty, |
427 | * and thus under tree_lock, then this ordering is not required. | 427 | * and thus under tree_lock, then this ordering is not required. |
428 | */ | 428 | */ |
429 | if (unlikely(page_count(page) != 2)) | 429 | if (!page_freeze_refs(page, 2)) |
430 | goto cannot_free; | 430 | goto cannot_free; |
431 | smp_rmb(); | 431 | /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ |
432 | if (unlikely(PageDirty(page))) | 432 | if (unlikely(PageDirty(page))) { |
433 | page_unfreeze_refs(page, 2); | ||
433 | goto cannot_free; | 434 | goto cannot_free; |
435 | } | ||
434 | 436 | ||
435 | if (PageSwapCache(page)) { | 437 | if (PageSwapCache(page)) { |
436 | swp_entry_t swap = { .val = page_private(page) }; | 438 | swp_entry_t swap = { .val = page_private(page) }; |
437 | __delete_from_swap_cache(page); | 439 | __delete_from_swap_cache(page); |
438 | write_unlock_irq(&mapping->tree_lock); | 440 | spin_unlock_irq(&mapping->tree_lock); |
439 | swap_free(swap); | 441 | swap_free(swap); |
440 | __put_page(page); /* The pagecache ref */ | 442 | } else { |
441 | return 1; | 443 | __remove_from_page_cache(page); |
444 | spin_unlock_irq(&mapping->tree_lock); | ||
442 | } | 445 | } |
443 | 446 | ||
444 | __remove_from_page_cache(page); | ||
445 | write_unlock_irq(&mapping->tree_lock); | ||
446 | __put_page(page); | ||
447 | return 1; | 447 | return 1; |
448 | 448 | ||
449 | cannot_free: | 449 | cannot_free: |
450 | write_unlock_irq(&mapping->tree_lock); | 450 | spin_unlock_irq(&mapping->tree_lock); |
451 | return 0; | 451 | return 0; |
452 | } | 452 | } |
453 | 453 | ||
454 | /* | 454 | /* |
455 | * Attempt to detach a locked page from its ->mapping. If it is dirty or if | ||
456 | * someone else has a ref on the page, abort and return 0. If it was | ||
457 | * successfully detached, return 1. Assumes the caller has a single ref on | ||
458 | * this page. | ||
459 | */ | ||
460 | int remove_mapping(struct address_space *mapping, struct page *page) | ||
461 | { | ||
462 | if (__remove_mapping(mapping, page)) { | ||
463 | /* | ||
464 | * Unfreezing the refcount with 1 rather than 2 effectively | ||
465 | * drops the pagecache ref for us without requiring another | ||
466 | * atomic operation. | ||
467 | */ | ||
468 | page_unfreeze_refs(page, 1); | ||
469 | return 1; | ||
470 | } | ||
471 | return 0; | ||
472 | } | ||
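page_freeze_refs()/page_unfreeze_refs() boil down to a compare-and-exchange on the page refcount: the freeze succeeds only when the count equals the expected value (2 here, the caller's reference plus the pagecache reference) and drops it to 0, so concurrent speculative lookups cannot take a new reference. A standalone illustration of that pattern with C11 atomics; fake_page and the underscored helpers are stand-ins, not the kernel's implementation:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct fake_page {
        atomic_int refcount;
    };

    /* Succeeds only if the count is exactly 'expected'; it then becomes 0. */
    static bool page_freeze_refs_(struct fake_page *p, int expected)
    {
        return atomic_compare_exchange_strong(&p->refcount, &expected, 0);
    }

    static void page_unfreeze_refs_(struct fake_page *p, int count)
    {
        atomic_store(&p->refcount, count);
    }

    int main(void)
    {
        struct fake_page page = { .refcount = 2 };
        bool dirty = false;     /* pretend the PageDirty() recheck passed */

        if (!page_freeze_refs_(&page, 2)) {
            printf("someone else holds a reference, leave the page alone\n");
            return 0;
        }

        if (dirty) {
            /* back out, as __remove_mapping() does for a dirty page */
            page_unfreeze_refs_(&page, 2);
            printf("page turned out dirty, refs restored to %d\n",
                   atomic_load(&page.refcount));
        } else {
            printf("page removed, refcount frozen at %d\n",
                   atomic_load(&page.refcount));
        }
        return 0;
    }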
473 | |||
474 | /** | ||
475 | * putback_lru_page - put previously isolated page onto appropriate LRU list | ||
476 | * @page: page to be put back to appropriate lru list | ||
477 | * | ||
478 | * Add previously isolated @page to appropriate LRU list. | ||
479 | * Page may still be unevictable for other reasons. | ||
480 | * | ||
481 | * lru_lock must not be held, interrupts must be enabled. | ||
482 | */ | ||
483 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
484 | void putback_lru_page(struct page *page) | ||
485 | { | ||
486 | int lru; | ||
487 | int active = !!TestClearPageActive(page); | ||
488 | int was_unevictable = PageUnevictable(page); | ||
489 | |||
490 | VM_BUG_ON(PageLRU(page)); | ||
491 | |||
492 | redo: | ||
493 | ClearPageUnevictable(page); | ||
494 | |||
495 | if (page_evictable(page, NULL)) { | ||
496 | /* | ||
497 | * For evictable pages, we can use the cache. | ||
498 | * In event of a race, worst case is we end up with an | ||
499 | * unevictable page on [in]active list. | ||
500 | * We know how to handle that. | ||
501 | */ | ||
502 | lru = active + page_is_file_cache(page); | ||
503 | lru_cache_add_lru(page, lru); | ||
504 | } else { | ||
505 | /* | ||
506 | * Put unevictable pages directly on zone's unevictable | ||
507 | * list. | ||
508 | */ | ||
509 | lru = LRU_UNEVICTABLE; | ||
510 | add_page_to_unevictable_list(page); | ||
511 | } | ||
512 | mem_cgroup_move_lists(page, lru); | ||
513 | |||
514 | /* | ||
515 | * page's status can change while we move it among the lru lists. If an | ||
516 | * evictable page ends up on the unevictable list, it will never be freed. | ||
517 | * To avoid that, check again after adding it to the list. | ||
518 | */ | ||
519 | if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { | ||
520 | if (!isolate_lru_page(page)) { | ||
521 | put_page(page); | ||
522 | goto redo; | ||
523 | } | ||
524 | /* This means someone else dropped this page from the LRU, | ||
525 | * so it will be freed or put back on the LRU again. There is | ||
526 | * nothing to do here. | ||
527 | */ | ||
528 | } | ||
529 | |||
530 | if (was_unevictable && lru != LRU_UNEVICTABLE) | ||
531 | count_vm_event(UNEVICTABLE_PGRESCUED); | ||
532 | else if (!was_unevictable && lru == LRU_UNEVICTABLE) | ||
533 | count_vm_event(UNEVICTABLE_PGCULLED); | ||
534 | |||
535 | put_page(page); /* drop ref from isolate */ | ||
536 | } | ||
537 | |||
538 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
539 | |||
540 | void putback_lru_page(struct page *page) | ||
541 | { | ||
542 | int lru; | ||
543 | VM_BUG_ON(PageLRU(page)); | ||
544 | |||
545 | lru = !!TestClearPageActive(page) + page_is_file_cache(page); | ||
546 | lru_cache_add_lru(page, lru); | ||
547 | mem_cgroup_move_lists(page, lru); | ||
548 | put_page(page); | ||
549 | } | ||
550 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
551 | |||
552 | |||
553 | /* | ||
455 | * shrink_page_list() returns the number of reclaimed pages | 554 | * shrink_page_list() returns the number of reclaimed pages |
456 | */ | 555 | */ |
457 | static unsigned long shrink_page_list(struct list_head *page_list, | 556 | static unsigned long shrink_page_list(struct list_head *page_list, |
@@ -477,13 +576,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
477 | page = lru_to_page(page_list); | 576 | page = lru_to_page(page_list); |
478 | list_del(&page->lru); | 577 | list_del(&page->lru); |
479 | 578 | ||
480 | if (TestSetPageLocked(page)) | 579 | if (!trylock_page(page)) |
481 | goto keep; | 580 | goto keep; |
482 | 581 | ||
483 | VM_BUG_ON(PageActive(page)); | 582 | VM_BUG_ON(PageActive(page)); |
484 | 583 | ||
485 | sc->nr_scanned++; | 584 | sc->nr_scanned++; |
486 | 585 | ||
586 | if (unlikely(!page_evictable(page, NULL))) | ||
587 | goto cull_mlocked; | ||
588 | |||
487 | if (!sc->may_swap && page_mapped(page)) | 589 | if (!sc->may_swap && page_mapped(page)) |
488 | goto keep_locked; | 590 | goto keep_locked; |
489 | 591 | ||
@@ -520,9 +622,19 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
520 | * Anonymous process memory has backing store? | 622 | * Anonymous process memory has backing store? |
521 | * Try to allocate it some swap space here. | 623 | * Try to allocate it some swap space here. |
522 | */ | 624 | */ |
523 | if (PageAnon(page) && !PageSwapCache(page)) | 625 | if (PageAnon(page) && !PageSwapCache(page)) { |
626 | switch (try_to_munlock(page)) { | ||
627 | case SWAP_FAIL: /* shouldn't happen */ | ||
628 | case SWAP_AGAIN: | ||
629 | goto keep_locked; | ||
630 | case SWAP_MLOCK: | ||
631 | goto cull_mlocked; | ||
632 | case SWAP_SUCCESS: | ||
633 | ; /* fall thru'; add to swap cache */ | ||
634 | } | ||
524 | if (!add_to_swap(page, GFP_ATOMIC)) | 635 | if (!add_to_swap(page, GFP_ATOMIC)) |
525 | goto activate_locked; | 636 | goto activate_locked; |
637 | } | ||
526 | #endif /* CONFIG_SWAP */ | 638 | #endif /* CONFIG_SWAP */ |
527 | 639 | ||
528 | mapping = page_mapping(page); | 640 | mapping = page_mapping(page); |
@@ -537,6 +649,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
537 | goto activate_locked; | 649 | goto activate_locked; |
538 | case SWAP_AGAIN: | 650 | case SWAP_AGAIN: |
539 | goto keep_locked; | 651 | goto keep_locked; |
652 | case SWAP_MLOCK: | ||
653 | goto cull_mlocked; | ||
540 | case SWAP_SUCCESS: | 654 | case SWAP_SUCCESS: |
541 | ; /* try to free the page below */ | 655 | ; /* try to free the page below */ |
542 | } | 656 | } |
@@ -563,7 +677,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
563 | * A synchronous write - probably a ramdisk. Go | 677 | * A synchronous write - probably a ramdisk. Go |
564 | * ahead and try to reclaim the page. | 678 | * ahead and try to reclaim the page. |
565 | */ | 679 | */ |
566 | if (TestSetPageLocked(page)) | 680 | if (!trylock_page(page)) |
567 | goto keep; | 681 | goto keep; |
568 | if (PageDirty(page) || PageWriteback(page)) | 682 | if (PageDirty(page) || PageWriteback(page)) |
569 | goto keep_locked; | 683 | goto keep_locked; |
@@ -583,7 +697,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
583 | * possible for a page to have PageDirty set, but it is actually | 697 | * possible for a page to have PageDirty set, but it is actually |
584 | * clean (all its buffers are clean). This happens if the | 698 | * clean (all its buffers are clean). This happens if the |
585 | * buffers were written out directly, with submit_bh(). ext3 | 699 | * buffers were written out directly, with submit_bh(). ext3 |
586 | * will do this, as well as the blockdev mapping. | 700 | * will do this, as well as the blockdev mapping. |
587 | * try_to_release_page() will discover that cleanness and will | 701 | * try_to_release_page() will discover that cleanness and will |
588 | * drop the buffers and mark the page clean - it can be freed. | 702 | * drop the buffers and mark the page clean - it can be freed. |
589 | * | 703 | * |
@@ -597,32 +711,64 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
597 | if (PagePrivate(page)) { | 711 | if (PagePrivate(page)) { |
598 | if (!try_to_release_page(page, sc->gfp_mask)) | 712 | if (!try_to_release_page(page, sc->gfp_mask)) |
599 | goto activate_locked; | 713 | goto activate_locked; |
600 | if (!mapping && page_count(page) == 1) | 714 | if (!mapping && page_count(page) == 1) { |
601 | goto free_it; | 715 | unlock_page(page); |
716 | if (put_page_testzero(page)) | ||
717 | goto free_it; | ||
718 | else { | ||
719 | /* | ||
720 | * rare race with speculative reference. | ||
721 | * the speculative reference will free | ||
722 | * this page shortly, so we may | ||
723 | * increment nr_reclaimed here (and | ||
724 | * leave it off the LRU). | ||
725 | */ | ||
726 | nr_reclaimed++; | ||
727 | continue; | ||
728 | } | ||
729 | } | ||
602 | } | 730 | } |
603 | 731 | ||
604 | if (!mapping || !remove_mapping(mapping, page)) | 732 | if (!mapping || !__remove_mapping(mapping, page)) |
605 | goto keep_locked; | 733 | goto keep_locked; |
606 | 734 | ||
735 | /* | ||
736 | * At this point, we have no other references and there is | ||
737 | * no way to pick any more up (removed from LRU, removed | ||
738 | * from pagecache). Can use non-atomic bitops now (and | ||
739 | * we obviously don't have to worry about waking up a process | ||
740 | * waiting on the page lock, because there are no references. | ||
741 | */ | ||
742 | __clear_page_locked(page); | ||
607 | free_it: | 743 | free_it: |
608 | unlock_page(page); | ||
609 | nr_reclaimed++; | 744 | nr_reclaimed++; |
610 | if (!pagevec_add(&freed_pvec, page)) | 745 | if (!pagevec_add(&freed_pvec, page)) { |
611 | __pagevec_release_nonlru(&freed_pvec); | 746 | __pagevec_free(&freed_pvec); |
747 | pagevec_reinit(&freed_pvec); | ||
748 | } | ||
749 | continue; | ||
750 | |||
751 | cull_mlocked: | ||
752 | unlock_page(page); | ||
753 | putback_lru_page(page); | ||
612 | continue; | 754 | continue; |
613 | 755 | ||
614 | activate_locked: | 756 | activate_locked: |
757 | /* Not a candidate for swapping, so reclaim swap space. */ | ||
758 | if (PageSwapCache(page) && vm_swap_full()) | ||
759 | remove_exclusive_swap_page_ref(page); | ||
760 | VM_BUG_ON(PageActive(page)); | ||
615 | SetPageActive(page); | 761 | SetPageActive(page); |
616 | pgactivate++; | 762 | pgactivate++; |
617 | keep_locked: | 763 | keep_locked: |
618 | unlock_page(page); | 764 | unlock_page(page); |
619 | keep: | 765 | keep: |
620 | list_add(&page->lru, &ret_pages); | 766 | list_add(&page->lru, &ret_pages); |
621 | VM_BUG_ON(PageLRU(page)); | 767 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
622 | } | 768 | } |
623 | list_splice(&ret_pages, page_list); | 769 | list_splice(&ret_pages, page_list); |
624 | if (pagevec_count(&freed_pvec)) | 770 | if (pagevec_count(&freed_pvec)) |
625 | __pagevec_release_nonlru(&freed_pvec); | 771 | __pagevec_free(&freed_pvec); |
626 | count_vm_events(PGACTIVATE, pgactivate); | 772 | count_vm_events(PGACTIVATE, pgactivate); |
627 | return nr_reclaimed; | 773 | return nr_reclaimed; |
628 | } | 774 | } |
@@ -642,7 +788,7 @@ keep: | |||
642 | * | 788 | * |
643 | * returns 0 on success, -ve errno on failure. | 789 | * returns 0 on success, -ve errno on failure. |
644 | */ | 790 | */ |
645 | int __isolate_lru_page(struct page *page, int mode) | 791 | int __isolate_lru_page(struct page *page, int mode, int file) |
646 | { | 792 | { |
647 | int ret = -EINVAL; | 793 | int ret = -EINVAL; |
648 | 794 | ||
@@ -658,6 +804,17 @@ int __isolate_lru_page(struct page *page, int mode) | |||
658 | if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) | 804 | if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) |
659 | return ret; | 805 | return ret; |
660 | 806 | ||
807 | if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) | ||
808 | return ret; | ||
809 | |||
810 | /* | ||
811 | * When this function is being called for lumpy reclaim, we | ||
812 | * initially look into all LRU pages, active, inactive and | ||
813 | * unevictable; only give shrink_page_list evictable pages. | ||
814 | */ | ||
815 | if (PageUnevictable(page)) | ||
816 | return ret; | ||
817 | |||
661 | ret = -EBUSY; | 818 | ret = -EBUSY; |
662 | if (likely(get_page_unless_zero(page))) { | 819 | if (likely(get_page_unless_zero(page))) { |
663 | /* | 820 | /* |
@@ -688,12 +845,13 @@ int __isolate_lru_page(struct page *page, int mode) | |||
688 | * @scanned: The number of pages that were scanned. | 845 | * @scanned: The number of pages that were scanned. |
689 | * @order: The caller's attempted allocation order | 846 | * @order: The caller's attempted allocation order |
690 | * @mode: One of the LRU isolation modes | 847 | * @mode: One of the LRU isolation modes |
848 | * @file: True [1] if isolating file [!anon] pages | ||
691 | * | 849 | * |
692 | * returns how many pages were moved onto *@dst. | 850 | * returns how many pages were moved onto *@dst. |
693 | */ | 851 | */ |
694 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 852 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
695 | struct list_head *src, struct list_head *dst, | 853 | struct list_head *src, struct list_head *dst, |
696 | unsigned long *scanned, int order, int mode) | 854 | unsigned long *scanned, int order, int mode, int file) |
697 | { | 855 | { |
698 | unsigned long nr_taken = 0; | 856 | unsigned long nr_taken = 0; |
699 | unsigned long scan; | 857 | unsigned long scan; |
@@ -710,7 +868,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
710 | 868 | ||
711 | VM_BUG_ON(!PageLRU(page)); | 869 | VM_BUG_ON(!PageLRU(page)); |
712 | 870 | ||
713 | switch (__isolate_lru_page(page, mode)) { | 871 | switch (__isolate_lru_page(page, mode, file)) { |
714 | case 0: | 872 | case 0: |
715 | list_move(&page->lru, dst); | 873 | list_move(&page->lru, dst); |
716 | nr_taken++; | 874 | nr_taken++; |
@@ -753,10 +911,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
753 | break; | 911 | break; |
754 | 912 | ||
755 | cursor_page = pfn_to_page(pfn); | 913 | cursor_page = pfn_to_page(pfn); |
914 | |||
756 | /* Check that we have not crossed a zone boundary. */ | 915 | /* Check that we have not crossed a zone boundary. */ |
757 | if (unlikely(page_zone_id(cursor_page) != zone_id)) | 916 | if (unlikely(page_zone_id(cursor_page) != zone_id)) |
758 | continue; | 917 | continue; |
759 | switch (__isolate_lru_page(cursor_page, mode)) { | 918 | switch (__isolate_lru_page(cursor_page, mode, file)) { |
760 | case 0: | 919 | case 0: |
761 | list_move(&cursor_page->lru, dst); | 920 | list_move(&cursor_page->lru, dst); |
762 | nr_taken++; | 921 | nr_taken++; |
@@ -767,7 +926,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
767 | /* else it is being freed elsewhere */ | 926 | /* else it is being freed elsewhere */ |
768 | list_move(&cursor_page->lru, src); | 927 | list_move(&cursor_page->lru, src); |
769 | default: | 928 | default: |
770 | break; | 929 | break; /* ! on LRU or wrong list */ |
771 | } | 930 | } |
772 | } | 931 | } |
773 | } | 932 | } |
@@ -781,40 +940,93 @@ static unsigned long isolate_pages_global(unsigned long nr, | |||
781 | unsigned long *scanned, int order, | 940 | unsigned long *scanned, int order, |
782 | int mode, struct zone *z, | 941 | int mode, struct zone *z, |
783 | struct mem_cgroup *mem_cont, | 942 | struct mem_cgroup *mem_cont, |
784 | int active) | 943 | int active, int file) |
785 | { | 944 | { |
945 | int lru = LRU_BASE; | ||
786 | if (active) | 946 | if (active) |
787 | return isolate_lru_pages(nr, &z->active_list, dst, | 947 | lru += LRU_ACTIVE; |
788 | scanned, order, mode); | 948 | if (file) |
789 | else | 949 | lru += LRU_FILE; |
790 | return isolate_lru_pages(nr, &z->inactive_list, dst, | 950 | return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, |
791 | scanned, order, mode); | 951 | mode, !!file); |
792 | } | 952 | } |
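The lru index computed above is just LRU_BASE plus an offset of LRU_ACTIVE for active pages and LRU_FILE for file-backed pages, which is how the four anon/file x inactive/active lists (plus the unevictable list) are laid out in this series. A standalone sketch of that indexing; the enum values are assumed to follow the split-LRU layout, not copied from a header:

    #include <stdio.h>

    /* Assumed split-LRU layout: base + active offset + file offset. */
    enum lru_list {
        LRU_BASE,
        LRU_ACTIVE = 1,
        LRU_FILE = 2,
        LRU_INACTIVE_ANON = LRU_BASE,
        LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
        LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
        LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
        LRU_UNEVICTABLE,
        NR_LRU_LISTS
    };

    static const char *lru_name[NR_LRU_LISTS] = {
        "inactive_anon", "active_anon",
        "inactive_file", "active_file",
        "unevictable",
    };

    int main(void)
    {
        int active, file;

        for (file = 0; file <= 1; file++)
            for (active = 0; active <= 1; active++) {
                int lru = LRU_BASE;

                if (active)
                    lru += LRU_ACTIVE;
                if (file)
                    lru += LRU_FILE;
                printf("active=%d file=%d -> lru %d (%s)\n",
                       active, file, lru, lru_name[lru]);
            }
        return 0;
    }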
793 | 953 | ||
794 | /* | 954 | /* |
795 | * clear_active_flags() is a helper for shrink_active_list(), clearing | 955 | * clear_active_flags() is a helper for shrink_active_list(), clearing |
796 | * any active bits from the pages in the list. | 956 | * any active bits from the pages in the list. |
797 | */ | 957 | */ |
798 | static unsigned long clear_active_flags(struct list_head *page_list) | 958 | static unsigned long clear_active_flags(struct list_head *page_list, |
959 | unsigned int *count) | ||
799 | { | 960 | { |
800 | int nr_active = 0; | 961 | int nr_active = 0; |
962 | int lru; | ||
801 | struct page *page; | 963 | struct page *page; |
802 | 964 | ||
803 | list_for_each_entry(page, page_list, lru) | 965 | list_for_each_entry(page, page_list, lru) { |
966 | lru = page_is_file_cache(page); | ||
804 | if (PageActive(page)) { | 967 | if (PageActive(page)) { |
968 | lru += LRU_ACTIVE; | ||
805 | ClearPageActive(page); | 969 | ClearPageActive(page); |
806 | nr_active++; | 970 | nr_active++; |
807 | } | 971 | } |
972 | count[lru]++; | ||
973 | } | ||
808 | 974 | ||
809 | return nr_active; | 975 | return nr_active; |
810 | } | 976 | } |
811 | 977 | ||
978 | /** | ||
979 | * isolate_lru_page - tries to isolate a page from its LRU list | ||
980 | * @page: page to isolate from its LRU list | ||
981 | * | ||
982 | * Isolates a @page from an LRU list, clears PageLRU and adjusts the | ||
983 | * vmstat statistic corresponding to whatever LRU list the page was on. | ||
984 | * | ||
985 | * Returns 0 if the page was removed from an LRU list. | ||
986 | * Returns -EBUSY if the page was not on an LRU list. | ||
987 | * | ||
988 | * The returned page will have PageLRU() cleared. If it was found on | ||
989 | * the active list, it will have PageActive set. If it was found on | ||
990 | * the unevictable list, it will have the PageUnevictable bit set. That flag | ||
991 | * may need to be cleared by the caller before letting the page go. | ||
992 | * | ||
993 | * The vmstat statistic corresponding to the list on which the page was | ||
994 | * found will be decremented. | ||
995 | * | ||
996 | * Restrictions: | ||
997 | * (1) Must be called with an elevated refcount on the page. This is a | ||
998 | * fundamental difference from isolate_lru_pages (which is called | ||
999 | * without a stable reference). | ||
1000 | * (2) the lru_lock must not be held. | ||
1001 | * (3) interrupts must be enabled. | ||
1002 | */ | ||
1003 | int isolate_lru_page(struct page *page) | ||
1004 | { | ||
1005 | int ret = -EBUSY; | ||
1006 | |||
1007 | if (PageLRU(page)) { | ||
1008 | struct zone *zone = page_zone(page); | ||
1009 | |||
1010 | spin_lock_irq(&zone->lru_lock); | ||
1011 | if (PageLRU(page) && get_page_unless_zero(page)) { | ||
1012 | int lru = page_lru(page); | ||
1013 | ret = 0; | ||
1014 | ClearPageLRU(page); | ||
1015 | |||
1016 | del_page_from_lru_list(zone, page, lru); | ||
1017 | } | ||
1018 | spin_unlock_irq(&zone->lru_lock); | ||
1019 | } | ||
1020 | return ret; | ||
1021 | } | ||
1022 | |||
812 | /* | 1023 | /* |
813 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number | 1024 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number |
814 | * of reclaimed pages | 1025 | * of reclaimed pages |
815 | */ | 1026 | */ |
816 | static unsigned long shrink_inactive_list(unsigned long max_scan, | 1027 | static unsigned long shrink_inactive_list(unsigned long max_scan, |
817 | struct zone *zone, struct scan_control *sc) | 1028 | struct zone *zone, struct scan_control *sc, |
1029 | int priority, int file) | ||
818 | { | 1030 | { |
819 | LIST_HEAD(page_list); | 1031 | LIST_HEAD(page_list); |
820 | struct pagevec pvec; | 1032 | struct pagevec pvec; |
@@ -831,20 +1043,43 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
831 | unsigned long nr_scan; | 1043 | unsigned long nr_scan; |
832 | unsigned long nr_freed; | 1044 | unsigned long nr_freed; |
833 | unsigned long nr_active; | 1045 | unsigned long nr_active; |
1046 | unsigned int count[NR_LRU_LISTS] = { 0, }; | ||
1047 | int mode = ISOLATE_INACTIVE; | ||
1048 | |||
1049 | /* | ||
1050 | * If we need a large contiguous chunk of memory, or have | ||
1051 | * trouble getting a small set of contiguous pages, we | ||
1052 | * will reclaim both active and inactive pages. | ||
1053 | * | ||
1054 | * We use the same threshold as pageout congestion_wait below. | ||
1055 | */ | ||
1056 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
1057 | mode = ISOLATE_BOTH; | ||
1058 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
1059 | mode = ISOLATE_BOTH; | ||
834 | 1060 | ||
835 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, | 1061 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, |
836 | &page_list, &nr_scan, sc->order, | 1062 | &page_list, &nr_scan, sc->order, mode, |
837 | (sc->order > PAGE_ALLOC_COSTLY_ORDER)? | 1063 | zone, sc->mem_cgroup, 0, file); |
838 | ISOLATE_BOTH : ISOLATE_INACTIVE, | 1064 | nr_active = clear_active_flags(&page_list, count); |
839 | zone, sc->mem_cgroup, 0); | ||
840 | nr_active = clear_active_flags(&page_list); | ||
841 | __count_vm_events(PGDEACTIVATE, nr_active); | 1065 | __count_vm_events(PGDEACTIVATE, nr_active); |
842 | 1066 | ||
843 | __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); | 1067 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, |
844 | __mod_zone_page_state(zone, NR_INACTIVE, | 1068 | -count[LRU_ACTIVE_FILE]); |
845 | -(nr_taken - nr_active)); | 1069 | __mod_zone_page_state(zone, NR_INACTIVE_FILE, |
846 | if (scan_global_lru(sc)) | 1070 | -count[LRU_INACTIVE_FILE]); |
1071 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, | ||
1072 | -count[LRU_ACTIVE_ANON]); | ||
1073 | __mod_zone_page_state(zone, NR_INACTIVE_ANON, | ||
1074 | -count[LRU_INACTIVE_ANON]); | ||
1075 | |||
1076 | if (scan_global_lru(sc)) { | ||
847 | zone->pages_scanned += nr_scan; | 1077 | zone->pages_scanned += nr_scan; |
1078 | zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; | ||
1079 | zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; | ||
1080 | zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; | ||
1081 | zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; | ||
1082 | } | ||
848 | spin_unlock_irq(&zone->lru_lock); | 1083 | spin_unlock_irq(&zone->lru_lock); |
849 | 1084 | ||
850 | nr_scanned += nr_scan; | 1085 | nr_scanned += nr_scan; |
@@ -864,7 +1099,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
864 | * The attempt at page out may have made some | 1099 | * The attempt at page out may have made some |
865 | * of the pages active, mark them inactive again. | 1100 | * of the pages active, mark them inactive again. |
866 | */ | 1101 | */ |
867 | nr_active = clear_active_flags(&page_list); | 1102 | nr_active = clear_active_flags(&page_list, count); |
868 | count_vm_events(PGDEACTIVATE, nr_active); | 1103 | count_vm_events(PGDEACTIVATE, nr_active); |
869 | 1104 | ||
870 | nr_freed += shrink_page_list(&page_list, sc, | 1105 | nr_freed += shrink_page_list(&page_list, sc, |
@@ -889,14 +1124,24 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
889 | * Put back any unfreeable pages. | 1124 | * Put back any unfreeable pages. |
890 | */ | 1125 | */ |
891 | while (!list_empty(&page_list)) { | 1126 | while (!list_empty(&page_list)) { |
1127 | int lru; | ||
892 | page = lru_to_page(&page_list); | 1128 | page = lru_to_page(&page_list); |
893 | VM_BUG_ON(PageLRU(page)); | 1129 | VM_BUG_ON(PageLRU(page)); |
894 | SetPageLRU(page); | ||
895 | list_del(&page->lru); | 1130 | list_del(&page->lru); |
896 | if (PageActive(page)) | 1131 | if (unlikely(!page_evictable(page, NULL))) { |
897 | add_page_to_active_list(zone, page); | 1132 | spin_unlock_irq(&zone->lru_lock); |
898 | else | 1133 | putback_lru_page(page); |
899 | add_page_to_inactive_list(zone, page); | 1134 | spin_lock_irq(&zone->lru_lock); |
1135 | continue; | ||
1136 | } | ||
1137 | SetPageLRU(page); | ||
1138 | lru = page_lru(page); | ||
1139 | add_page_to_lru_list(zone, page, lru); | ||
1140 | mem_cgroup_move_lists(page, lru); | ||
1141 | if (PageActive(page) && scan_global_lru(sc)) { | ||
1142 | int file = !!page_is_file_cache(page); | ||
1143 | zone->recent_rotated[file]++; | ||
1144 | } | ||
900 | if (!pagevec_add(&pvec, page)) { | 1145 | if (!pagevec_add(&pvec, page)) { |
901 | spin_unlock_irq(&zone->lru_lock); | 1146 | spin_unlock_irq(&zone->lru_lock); |
902 | __pagevec_release(&pvec); | 1147 | __pagevec_release(&pvec); |
@@ -927,115 +1172,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) | |||
927 | 1172 | ||
928 | static inline int zone_is_near_oom(struct zone *zone) | 1173 | static inline int zone_is_near_oom(struct zone *zone) |
929 | { | 1174 | { |
930 | return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) | 1175 | return zone->pages_scanned >= (zone_lru_pages(zone) * 3); |
931 | + zone_page_state(zone, NR_INACTIVE))*3; | ||
932 | } | ||
933 | |||
934 | /* | ||
935 | * Determine we should try to reclaim mapped pages. | ||
936 | * This is called only when sc->mem_cgroup is NULL. | ||
937 | */ | ||
938 | static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, | ||
939 | int priority) | ||
940 | { | ||
941 | long mapped_ratio; | ||
942 | long distress; | ||
943 | long swap_tendency; | ||
944 | long imbalance; | ||
945 | int reclaim_mapped = 0; | ||
946 | int prev_priority; | ||
947 | |||
948 | if (scan_global_lru(sc) && zone_is_near_oom(zone)) | ||
949 | return 1; | ||
950 | /* | ||
951 | * `distress' is a measure of how much trouble we're having | ||
952 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. | ||
953 | */ | ||
954 | if (scan_global_lru(sc)) | ||
955 | prev_priority = zone->prev_priority; | ||
956 | else | ||
957 | prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup); | ||
958 | |||
959 | distress = 100 >> min(prev_priority, priority); | ||
960 | |||
961 | /* | ||
962 | * The point of this algorithm is to decide when to start | ||
963 | * reclaiming mapped memory instead of just pagecache. Work out | ||
964 | * how much memory | ||
965 | * is mapped. | ||
966 | */ | ||
967 | if (scan_global_lru(sc)) | ||
968 | mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + | ||
969 | global_page_state(NR_ANON_PAGES)) * 100) / | ||
970 | vm_total_pages; | ||
971 | else | ||
972 | mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup); | ||
973 | |||
974 | /* | ||
975 | * Now decide how much we really want to unmap some pages. The | ||
976 | * mapped ratio is downgraded - just because there's a lot of | ||
977 | * mapped memory doesn't necessarily mean that page reclaim | ||
978 | * isn't succeeding. | ||
979 | * | ||
980 | * The distress ratio is important - we don't want to start | ||
981 | * going oom. | ||
982 | * | ||
983 | * A 100% value of vm_swappiness overrides this algorithm | ||
984 | * altogether. | ||
985 | */ | ||
986 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; | ||
987 | |||
988 | /* | ||
989 | * If there's huge imbalance between active and inactive | ||
990 | * (think active 100 times larger than inactive) we should | ||
991 | * become more permissive, or the system will take too much | ||
992 | * cpu before it start swapping during memory pressure. | ||
993 | * Distress is about avoiding early-oom, this is about | ||
994 | * making swappiness graceful despite setting it to low | ||
995 | * values. | ||
996 | * | ||
997 | * Avoid div by zero with nr_inactive+1, and max resulting | ||
998 | * value is vm_total_pages. | ||
999 | */ | ||
1000 | if (scan_global_lru(sc)) { | ||
1001 | imbalance = zone_page_state(zone, NR_ACTIVE); | ||
1002 | imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; | ||
1003 | } else | ||
1004 | imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup); | ||
1005 | |||
1006 | /* | ||
1007 | * Reduce the effect of imbalance if swappiness is low, | ||
1008 | * this means for a swappiness very low, the imbalance | ||
1009 | * must be much higher than 100 for this logic to make | ||
1010 | * the difference. | ||
1011 | * | ||
1012 | * Max temporary value is vm_total_pages*100. | ||
1013 | */ | ||
1014 | imbalance *= (vm_swappiness + 1); | ||
1015 | imbalance /= 100; | ||
1016 | |||
1017 | /* | ||
1018 | * If not much of the ram is mapped, makes the imbalance | ||
1019 | * less relevant, it's high priority we refill the inactive | ||
1020 | * list with mapped pages only in presence of high ratio of | ||
1021 | * mapped pages. | ||
1022 | * | ||
1023 | * Max temporary value is vm_total_pages*100. | ||
1024 | */ | ||
1025 | imbalance *= mapped_ratio; | ||
1026 | imbalance /= 100; | ||
1027 | |||
1028 | /* apply imbalance feedback to swap_tendency */ | ||
1029 | swap_tendency += imbalance; | ||
1030 | |||
1031 | /* | ||
1032 | * Now use this metric to decide whether to start moving mapped | ||
1033 | * memory onto the inactive list. | ||
1034 | */ | ||
1035 | if (swap_tendency >= 100) | ||
1036 | reclaim_mapped = 1; | ||
1037 | |||
1038 | return reclaim_mapped; | ||
1039 | } | 1176 | } |
1040 | 1177 | ||
1041 | /* | 1178 | /* |
@@ -1058,53 +1195,71 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, | |||
1058 | 1195 | ||
1059 | 1196 | ||
1060 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | 1197 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
1061 | struct scan_control *sc, int priority) | 1198 | struct scan_control *sc, int priority, int file) |
1062 | { | 1199 | { |
1063 | unsigned long pgmoved; | 1200 | unsigned long pgmoved; |
1064 | int pgdeactivate = 0; | 1201 | int pgdeactivate = 0; |
1065 | unsigned long pgscanned; | 1202 | unsigned long pgscanned; |
1066 | LIST_HEAD(l_hold); /* The pages which were snipped off */ | 1203 | LIST_HEAD(l_hold); /* The pages which were snipped off */ |
1067 | LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ | 1204 | LIST_HEAD(l_inactive); |
1068 | LIST_HEAD(l_active); /* Pages to go onto the active_list */ | ||
1069 | struct page *page; | 1205 | struct page *page; |
1070 | struct pagevec pvec; | 1206 | struct pagevec pvec; |
1071 | int reclaim_mapped = 0; | 1207 | enum lru_list lru; |
1072 | |||
1073 | if (sc->may_swap) | ||
1074 | reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); | ||
1075 | 1208 | ||
1076 | lru_add_drain(); | 1209 | lru_add_drain(); |
1077 | spin_lock_irq(&zone->lru_lock); | 1210 | spin_lock_irq(&zone->lru_lock); |
1078 | pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, | 1211 | pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, |
1079 | ISOLATE_ACTIVE, zone, | 1212 | ISOLATE_ACTIVE, zone, |
1080 | sc->mem_cgroup, 1); | 1213 | sc->mem_cgroup, 1, file); |
1081 | /* | 1214 | /* |
1082 | * zone->pages_scanned is used to detect zone's oom | 1215 | * zone->pages_scanned is used to detect zone's oom |
1083 | * mem_cgroup remembers nr_scan by itself. | 1216 | * mem_cgroup remembers nr_scan by itself. |
1084 | */ | 1217 | */ |
1085 | if (scan_global_lru(sc)) | 1218 | if (scan_global_lru(sc)) { |
1086 | zone->pages_scanned += pgscanned; | 1219 | zone->pages_scanned += pgscanned; |
1220 | zone->recent_scanned[!!file] += pgmoved; | ||
1221 | } | ||
1087 | 1222 | ||
1088 | __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); | 1223 | if (file) |
1224 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); | ||
1225 | else | ||
1226 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); | ||
1089 | spin_unlock_irq(&zone->lru_lock); | 1227 | spin_unlock_irq(&zone->lru_lock); |
1090 | 1228 | ||
1229 | pgmoved = 0; | ||
1091 | while (!list_empty(&l_hold)) { | 1230 | while (!list_empty(&l_hold)) { |
1092 | cond_resched(); | 1231 | cond_resched(); |
1093 | page = lru_to_page(&l_hold); | 1232 | page = lru_to_page(&l_hold); |
1094 | list_del(&page->lru); | 1233 | list_del(&page->lru); |
1095 | if (page_mapped(page)) { | 1234 | |
1096 | if (!reclaim_mapped || | 1235 | if (unlikely(!page_evictable(page, NULL))) { |
1097 | (total_swap_pages == 0 && PageAnon(page)) || | 1236 | putback_lru_page(page); |
1098 | page_referenced(page, 0, sc->mem_cgroup)) { | 1237 | continue; |
1099 | list_add(&page->lru, &l_active); | ||
1100 | continue; | ||
1101 | } | ||
1102 | } | 1238 | } |
1239 | |||
1240 | /* page_referenced clears PageReferenced */ | ||
1241 | if (page_mapping_inuse(page) && | ||
1242 | page_referenced(page, 0, sc->mem_cgroup)) | ||
1243 | pgmoved++; | ||
1244 | |||
1103 | list_add(&page->lru, &l_inactive); | 1245 | list_add(&page->lru, &l_inactive); |
1104 | } | 1246 | } |
1105 | 1247 | ||
1248 | /* | ||
1249 | * Count referenced pages from currently used mappings as | ||
1250 | * rotated, even though they are moved to the inactive list. | ||
1251 | * This helps balance scan pressure between file and anonymous | ||
1252 | * pages in get_scan_ratio. | ||
1253 | */ | ||
1254 | zone->recent_rotated[!!file] += pgmoved; | ||
1255 | |||
1256 | /* | ||
1257 | * Move the pages to the [file or anon] inactive list. | ||
1258 | */ | ||
1106 | pagevec_init(&pvec, 1); | 1259 | pagevec_init(&pvec, 1); |
1260 | |||
1107 | pgmoved = 0; | 1261 | pgmoved = 0; |
1262 | lru = LRU_BASE + file * LRU_FILE; | ||
1108 | spin_lock_irq(&zone->lru_lock); | 1263 | spin_lock_irq(&zone->lru_lock); |
1109 | while (!list_empty(&l_inactive)) { | 1264 | while (!list_empty(&l_inactive)) { |
1110 | page = lru_to_page(&l_inactive); | 1265 | page = lru_to_page(&l_inactive); |
@@ -1114,11 +1269,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1114 | VM_BUG_ON(!PageActive(page)); | 1269 | VM_BUG_ON(!PageActive(page)); |
1115 | ClearPageActive(page); | 1270 | ClearPageActive(page); |
1116 | 1271 | ||
1117 | list_move(&page->lru, &zone->inactive_list); | 1272 | list_move(&page->lru, &zone->lru[lru].list); |
1118 | mem_cgroup_move_lists(page, false); | 1273 | mem_cgroup_move_lists(page, lru); |
1119 | pgmoved++; | 1274 | pgmoved++; |
1120 | if (!pagevec_add(&pvec, page)) { | 1275 | if (!pagevec_add(&pvec, page)) { |
1121 | __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); | 1276 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); |
1122 | spin_unlock_irq(&zone->lru_lock); | 1277 | spin_unlock_irq(&zone->lru_lock); |
1123 | pgdeactivate += pgmoved; | 1278 | pgdeactivate += pgmoved; |
1124 | pgmoved = 0; | 1279 | pgmoved = 0; |
@@ -1128,104 +1283,189 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1128 | spin_lock_irq(&zone->lru_lock); | 1283 | spin_lock_irq(&zone->lru_lock); |
1129 | } | 1284 | } |
1130 | } | 1285 | } |
1131 | __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); | 1286 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); |
1132 | pgdeactivate += pgmoved; | 1287 | pgdeactivate += pgmoved; |
1133 | if (buffer_heads_over_limit) { | 1288 | if (buffer_heads_over_limit) { |
1134 | spin_unlock_irq(&zone->lru_lock); | 1289 | spin_unlock_irq(&zone->lru_lock); |
1135 | pagevec_strip(&pvec); | 1290 | pagevec_strip(&pvec); |
1136 | spin_lock_irq(&zone->lru_lock); | 1291 | spin_lock_irq(&zone->lru_lock); |
1137 | } | 1292 | } |
1138 | |||
1139 | pgmoved = 0; | ||
1140 | while (!list_empty(&l_active)) { | ||
1141 | page = lru_to_page(&l_active); | ||
1142 | prefetchw_prev_lru_page(page, &l_active, flags); | ||
1143 | VM_BUG_ON(PageLRU(page)); | ||
1144 | SetPageLRU(page); | ||
1145 | VM_BUG_ON(!PageActive(page)); | ||
1146 | |||
1147 | list_move(&page->lru, &zone->active_list); | ||
1148 | mem_cgroup_move_lists(page, true); | ||
1149 | pgmoved++; | ||
1150 | if (!pagevec_add(&pvec, page)) { | ||
1151 | __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); | ||
1152 | pgmoved = 0; | ||
1153 | spin_unlock_irq(&zone->lru_lock); | ||
1154 | __pagevec_release(&pvec); | ||
1155 | spin_lock_irq(&zone->lru_lock); | ||
1156 | } | ||
1157 | } | ||
1158 | __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); | ||
1159 | |||
1160 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | 1293 | __count_zone_vm_events(PGREFILL, zone, pgscanned); |
1161 | __count_vm_events(PGDEACTIVATE, pgdeactivate); | 1294 | __count_vm_events(PGDEACTIVATE, pgdeactivate); |
1162 | spin_unlock_irq(&zone->lru_lock); | 1295 | spin_unlock_irq(&zone->lru_lock); |
1296 | if (vm_swap_full()) | ||
1297 | pagevec_swap_free(&pvec); | ||
1163 | 1298 | ||
1164 | pagevec_release(&pvec); | 1299 | pagevec_release(&pvec); |
1165 | } | 1300 | } |
1166 | 1301 | ||
1302 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | ||
1303 | struct zone *zone, struct scan_control *sc, int priority) | ||
1304 | { | ||
1305 | int file = is_file_lru(lru); | ||
1306 | |||
1307 | if (lru == LRU_ACTIVE_FILE) { | ||
1308 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
1309 | return 0; | ||
1310 | } | ||
1311 | |||
1312 | if (lru == LRU_ACTIVE_ANON && | ||
1313 | (!scan_global_lru(sc) || inactive_anon_is_low(zone))) { | ||
1314 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
1315 | return 0; | ||
1316 | } | ||
1317 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); | ||
1318 | } | ||
1319 | |||
1320 | /* | ||
1321 | * Determine how aggressively the anon and file LRU lists should be | ||
1322 | * scanned. The relative value of each set of LRU lists is determined | ||
1323 | * by looking at the fraction of the scanned pages that were rotated back | ||
1324 | * onto the active list instead of being evicted. | ||
1325 | * | ||
1326 | * percent[0] specifies how much pressure to put on ram/swap backed | ||
1327 | * memory, while percent[1] determines pressure on the file LRUs. | ||
1328 | */ | ||
1329 | static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | ||
1330 | unsigned long *percent) | ||
1331 | { | ||
1332 | unsigned long anon, file, free; | ||
1333 | unsigned long anon_prio, file_prio; | ||
1334 | unsigned long ap, fp; | ||
1335 | |||
1336 | anon = zone_page_state(zone, NR_ACTIVE_ANON) + | ||
1337 | zone_page_state(zone, NR_INACTIVE_ANON); | ||
1338 | file = zone_page_state(zone, NR_ACTIVE_FILE) + | ||
1339 | zone_page_state(zone, NR_INACTIVE_FILE); | ||
1340 | free = zone_page_state(zone, NR_FREE_PAGES); | ||
1341 | |||
1342 | /* If we have no swap space, do not bother scanning anon pages. */ | ||
1343 | if (nr_swap_pages <= 0) { | ||
1344 | percent[0] = 0; | ||
1345 | percent[1] = 100; | ||
1346 | return; | ||
1347 | } | ||
1348 | |||
1349 | /* If we have very few page cache pages, force-scan anon pages. */ | ||
1350 | if (unlikely(file + free <= zone->pages_high)) { | ||
1351 | percent[0] = 100; | ||
1352 | percent[1] = 0; | ||
1353 | return; | ||
1354 | } | ||
1355 | |||
1356 | /* | ||
1357 | * OK, so we have swap space and a fair amount of page cache | ||
1358 | * pages. We use the recently rotated / recently scanned | ||
1359 | * ratios to determine how valuable each cache is. | ||
1360 | * | ||
1361 | * Because workloads change over time (and to avoid overflow) | ||
1362 | * we keep these statistics as a floating average, which ends | ||
1363 | * up weighing recent references more than old ones. | ||
1364 | * | ||
1365 | * anon in [0], file in [1] | ||
1366 | */ | ||
1367 | if (unlikely(zone->recent_scanned[0] > anon / 4)) { | ||
1368 | spin_lock_irq(&zone->lru_lock); | ||
1369 | zone->recent_scanned[0] /= 2; | ||
1370 | zone->recent_rotated[0] /= 2; | ||
1371 | spin_unlock_irq(&zone->lru_lock); | ||
1372 | } | ||
1373 | |||
1374 | if (unlikely(zone->recent_scanned[1] > file / 4)) { | ||
1375 | spin_lock_irq(&zone->lru_lock); | ||
1376 | zone->recent_scanned[1] /= 2; | ||
1377 | zone->recent_rotated[1] /= 2; | ||
1378 | spin_unlock_irq(&zone->lru_lock); | ||
1379 | } | ||
1380 | |||
1381 | /* | ||
1382 | * With swappiness at 100, anonymous and file have the same priority. | ||
1383 | * This scanning priority is essentially the inverse of IO cost. | ||
1384 | */ | ||
1385 | anon_prio = sc->swappiness; | ||
1386 | file_prio = 200 - sc->swappiness; | ||
1387 | |||
1388 | /* | ||
1389 | *                  anon       recent_rotated[0] | ||
1390 | * %anon = 100 * ----------- / ----------------- * IO cost | ||
1391 | *              anon + file      rotate_sum | ||
1392 | */ | ||
1393 | ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); | ||
1394 | ap /= zone->recent_rotated[0] + 1; | ||
1395 | |||
1396 | fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); | ||
1397 | fp /= zone->recent_rotated[1] + 1; | ||
1398 | |||
1399 | /* Normalize to percentages */ | ||
1400 | percent[0] = 100 * ap / (ap + fp + 1); | ||
1401 | percent[1] = 100 - percent[0]; | ||
1402 | } | ||
1403 | |||
1404 | |||
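A quick standalone illustration of the arithmetic above, using made-up recent_scanned/recent_rotated numbers (ordinary userspace C, not kernel code): a file LRU that is rarely rotated ends up absorbing almost all of the scan pressure at the default swappiness of 60.

#include <stdio.h>

int main(void)
{
	unsigned long swappiness = 60;			/* default vm.swappiness   */
	unsigned long anon_prio = swappiness;		/* 60                      */
	unsigned long file_prio = 200 - swappiness;	/* 140                     */

	/* invented sample zone statistics (pages) */
	unsigned long recent_scanned_anon = 1000, recent_rotated_anon = 800;
	unsigned long recent_scanned_file = 4000, recent_rotated_file = 200;

	/* same shape as the kernel code: scanned/rotated weighted by priority */
	unsigned long ap = (anon_prio + 1) * (recent_scanned_anon + 1)
					/ (recent_rotated_anon + 1);
	unsigned long fp = (file_prio + 1) * (recent_scanned_file + 1)
					/ (recent_rotated_file + 1);

	unsigned long percent_anon = 100 * ap / (ap + fp + 1);

	/* ap ~ 76, fp ~ 2800 -> roughly 2% anon, 98% file scanning */
	printf("anon %lu%%  file %lu%%\n", percent_anon, 100 - percent_anon);
	return 0;
}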
1167 | /* | 1405 | /* |
1168 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1406 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
1169 | */ | 1407 | */ |
1170 | static unsigned long shrink_zone(int priority, struct zone *zone, | 1408 | static unsigned long shrink_zone(int priority, struct zone *zone, |
1171 | struct scan_control *sc) | 1409 | struct scan_control *sc) |
1172 | { | 1410 | { |
1173 | unsigned long nr_active; | 1411 | unsigned long nr[NR_LRU_LISTS]; |
1174 | unsigned long nr_inactive; | ||
1175 | unsigned long nr_to_scan; | 1412 | unsigned long nr_to_scan; |
1176 | unsigned long nr_reclaimed = 0; | 1413 | unsigned long nr_reclaimed = 0; |
1414 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ | ||
1415 | enum lru_list l; | ||
1177 | 1416 | ||
1178 | if (scan_global_lru(sc)) { | 1417 | get_scan_ratio(zone, sc, percent); |
1179 | /* | ||
1180 | * Add one to nr_to_scan just to make sure that the kernel | ||
1181 | * will slowly sift through the active list. | ||
1182 | */ | ||
1183 | zone->nr_scan_active += | ||
1184 | (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; | ||
1185 | nr_active = zone->nr_scan_active; | ||
1186 | zone->nr_scan_inactive += | ||
1187 | (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; | ||
1188 | nr_inactive = zone->nr_scan_inactive; | ||
1189 | if (nr_inactive >= sc->swap_cluster_max) | ||
1190 | zone->nr_scan_inactive = 0; | ||
1191 | else | ||
1192 | nr_inactive = 0; | ||
1193 | |||
1194 | if (nr_active >= sc->swap_cluster_max) | ||
1195 | zone->nr_scan_active = 0; | ||
1196 | else | ||
1197 | nr_active = 0; | ||
1198 | } else { | ||
1199 | /* | ||
1200 | * This reclaim occurs not because zone memory shortage but | ||
1201 | * because memory controller hits its limit. | ||
1202 | * Then, don't modify zone reclaim related data. | ||
1203 | */ | ||
1204 | nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup, | ||
1205 | zone, priority); | ||
1206 | |||
1207 | nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup, | ||
1208 | zone, priority); | ||
1209 | } | ||
1210 | 1418 | ||
1419 | for_each_evictable_lru(l) { | ||
1420 | if (scan_global_lru(sc)) { | ||
1421 | int file = is_file_lru(l); | ||
1422 | int scan; | ||
1211 | 1423 | ||
1212 | while (nr_active || nr_inactive) { | 1424 | scan = zone_page_state(zone, NR_LRU_BASE + l); |
1213 | if (nr_active) { | 1425 | if (priority) { |
1214 | nr_to_scan = min(nr_active, | 1426 | scan >>= priority; |
1215 | (unsigned long)sc->swap_cluster_max); | 1427 | scan = (scan * percent[file]) / 100; |
1216 | nr_active -= nr_to_scan; | 1428 | } |
1217 | shrink_active_list(nr_to_scan, zone, sc, priority); | 1429 | zone->lru[l].nr_scan += scan; |
1430 | nr[l] = zone->lru[l].nr_scan; | ||
1431 | if (nr[l] >= sc->swap_cluster_max) | ||
1432 | zone->lru[l].nr_scan = 0; | ||
1433 | else | ||
1434 | nr[l] = 0; | ||
1435 | } else { | ||
1436 | /* | ||
1437 | * This reclaim occurs not because zone memory shortage | ||
1438 | * but because memory controller hits its limit. | ||
1439 | * Don't modify zone reclaim related data. | ||
1440 | */ | ||
1441 | nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone, | ||
1442 | priority, l); | ||
1218 | } | 1443 | } |
1444 | } | ||
1219 | 1445 | ||
1220 | if (nr_inactive) { | 1446 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
1221 | nr_to_scan = min(nr_inactive, | 1447 | nr[LRU_INACTIVE_FILE]) { |
1448 | for_each_evictable_lru(l) { | ||
1449 | if (nr[l]) { | ||
1450 | nr_to_scan = min(nr[l], | ||
1222 | (unsigned long)sc->swap_cluster_max); | 1451 | (unsigned long)sc->swap_cluster_max); |
1223 | nr_inactive -= nr_to_scan; | 1452 | nr[l] -= nr_to_scan; |
1224 | nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, | 1453 | |
1225 | sc); | 1454 | nr_reclaimed += shrink_list(l, nr_to_scan, |
1455 | zone, sc, priority); | ||
1456 | } | ||
1226 | } | 1457 | } |
1227 | } | 1458 | } |
1228 | 1459 | ||
1460 | /* | ||
1461 | * Even if we did not try to evict anon pages at all, we want to | ||
1462 | * rebalance the anon lru active/inactive ratio. | ||
1463 | */ | ||
1464 | if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) | ||
1465 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | ||
1466 | else if (!scan_global_lru(sc)) | ||
1467 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | ||
1468 | |||
1229 | throttle_vm_writeout(sc->gfp_mask); | 1469 | throttle_vm_writeout(sc->gfp_mask); |
1230 | return nr_reclaimed; | 1470 | return nr_reclaimed; |
1231 | } | 1471 | } |
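To make the per-LRU scan-target computation above concrete, here is a small userspace C sketch with invented numbers; DEF_PRIORITY (12) and SWAP_CLUSTER_MAX (32) are assumed to match their usual values in this kernel. It shows how the shift by priority plus the percent weighting accumulate in nr_scan until the swap_cluster_max threshold triggers a batch of scanning.

#include <stdio.h>

int main(void)
{
	unsigned long lru_size = 262144;	/* e.g. 1GB of 4k pages on one LRU */
	unsigned long percent = 98;		/* from get_scan_ratio()           */
	unsigned long swap_cluster_max = 32;	/* typical sc->swap_cluster_max    */
	unsigned long nr_scan = 0;		/* mirrors zone->lru[l].nr_scan    */
	int priority;

	/* DEF_PRIORITY is 12: start with ~1/4096th of the list and double
	 * the pressure each time a pass fails to reclaim enough. */
	for (priority = 12; priority >= 0; priority--) {
		unsigned long scan = lru_size;

		if (priority) {
			scan >>= priority;
			scan = scan * percent / 100;
		}
		nr_scan += scan;
		if (nr_scan >= swap_cluster_max) {
			printf("priority %d: scan %lu pages\n", priority, nr_scan);
			nr_scan = 0;	/* batch handed to shrink_list() */
		}
	}
	return 0;
}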
@@ -1286,7 +1526,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | |||
1286 | 1526 | ||
1287 | return nr_reclaimed; | 1527 | return nr_reclaimed; |
1288 | } | 1528 | } |
1289 | 1529 | ||
1290 | /* | 1530 | /* |
1291 | * This is the main entry point to direct page reclaim. | 1531 | * This is the main entry point to direct page reclaim. |
1292 | * | 1532 | * |
@@ -1316,6 +1556,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1316 | struct zone *zone; | 1556 | struct zone *zone; |
1317 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | 1557 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); |
1318 | 1558 | ||
1559 | delayacct_freepages_start(); | ||
1560 | |||
1319 | if (scan_global_lru(sc)) | 1561 | if (scan_global_lru(sc)) |
1320 | count_vm_event(ALLOCSTALL); | 1562 | count_vm_event(ALLOCSTALL); |
1321 | /* | 1563 | /* |
@@ -1327,8 +1569,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1327 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1569 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1328 | continue; | 1570 | continue; |
1329 | 1571 | ||
1330 | lru_pages += zone_page_state(zone, NR_ACTIVE) | 1572 | lru_pages += zone_lru_pages(zone); |
1331 | + zone_page_state(zone, NR_INACTIVE); | ||
1332 | } | 1573 | } |
1333 | } | 1574 | } |
1334 | 1575 | ||
@@ -1371,7 +1612,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1371 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) | 1612 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) |
1372 | congestion_wait(WRITE, HZ/10); | 1613 | congestion_wait(WRITE, HZ/10); |
1373 | } | 1614 | } |
1374 | /* top priority shrink_caches still had more to do? don't OOM, then */ | 1615 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
1375 | if (!sc->all_unreclaimable && scan_global_lru(sc)) | 1616 | if (!sc->all_unreclaimable && scan_global_lru(sc)) |
1376 | ret = nr_reclaimed; | 1617 | ret = nr_reclaimed; |
1377 | out: | 1618 | out: |
@@ -1396,6 +1637,8 @@ out: | |||
1396 | } else | 1637 | } else |
1397 | mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); | 1638 | mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); |
1398 | 1639 | ||
1640 | delayacct_freepages_end(); | ||
1641 | |||
1399 | return ret; | 1642 | return ret; |
1400 | } | 1643 | } |
1401 | 1644 | ||
@@ -1516,6 +1759,14 @@ loop_again: | |||
1516 | priority != DEF_PRIORITY) | 1759 | priority != DEF_PRIORITY) |
1517 | continue; | 1760 | continue; |
1518 | 1761 | ||
1762 | /* | ||
1763 | * Do some background aging of the anon list, to give | ||
1764 | * pages a chance to be referenced before reclaiming. | ||
1765 | */ | ||
1766 | if (inactive_anon_is_low(zone)) | ||
1767 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | ||
1768 | &sc, priority, 0); | ||
1769 | |||
1519 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1770 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
1520 | 0, 0)) { | 1771 | 0, 0)) { |
1521 | end_zone = i; | 1772 | end_zone = i; |
@@ -1528,8 +1779,7 @@ loop_again: | |||
1528 | for (i = 0; i <= end_zone; i++) { | 1779 | for (i = 0; i <= end_zone; i++) { |
1529 | struct zone *zone = pgdat->node_zones + i; | 1780 | struct zone *zone = pgdat->node_zones + i; |
1530 | 1781 | ||
1531 | lru_pages += zone_page_state(zone, NR_ACTIVE) | 1782 | lru_pages += zone_lru_pages(zone); |
1532 | + zone_page_state(zone, NR_INACTIVE); | ||
1533 | } | 1783 | } |
1534 | 1784 | ||
1535 | /* | 1785 | /* |
@@ -1573,8 +1823,7 @@ loop_again: | |||
1573 | if (zone_is_all_unreclaimable(zone)) | 1823 | if (zone_is_all_unreclaimable(zone)) |
1574 | continue; | 1824 | continue; |
1575 | if (nr_slab == 0 && zone->pages_scanned >= | 1825 | if (nr_slab == 0 && zone->pages_scanned >= |
1576 | (zone_page_state(zone, NR_ACTIVE) | 1826 | (zone_lru_pages(zone) * 6)) |
1577 | + zone_page_state(zone, NR_INACTIVE)) * 6) | ||
1578 | zone_set_flag(zone, | 1827 | zone_set_flag(zone, |
1579 | ZONE_ALL_UNRECLAIMABLE); | 1828 | ZONE_ALL_UNRECLAIMABLE); |
1580 | /* | 1829 | /* |
@@ -1628,7 +1877,7 @@ out: | |||
1628 | 1877 | ||
1629 | /* | 1878 | /* |
1630 | * The background pageout daemon, started as a kernel thread | 1879 | * The background pageout daemon, started as a kernel thread |
1631 | * from the init process. | 1880 | * from the init process. |
1632 | * | 1881 | * |
1633 | * This basically trickles out pages so that we have _some_ | 1882 | * This basically trickles out pages so that we have _some_ |
1634 | * free memory available even if there is no other activity | 1883 | * free memory available even if there is no other activity |
@@ -1722,6 +1971,14 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
1722 | wake_up_interruptible(&pgdat->kswapd_wait); | 1971 | wake_up_interruptible(&pgdat->kswapd_wait); |
1723 | } | 1972 | } |
1724 | 1973 | ||
1974 | unsigned long global_lru_pages(void) | ||
1975 | { | ||
1976 | return global_page_state(NR_ACTIVE_ANON) | ||
1977 | + global_page_state(NR_ACTIVE_FILE) | ||
1978 | + global_page_state(NR_INACTIVE_ANON) | ||
1979 | + global_page_state(NR_INACTIVE_FILE); | ||
1980 | } | ||
1981 | |||
1725 | #ifdef CONFIG_PM | 1982 | #ifdef CONFIG_PM |
1726 | /* | 1983 | /* |
1727 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages | 1984 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages |
@@ -1735,6 +1992,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, | |||
1735 | { | 1992 | { |
1736 | struct zone *zone; | 1993 | struct zone *zone; |
1737 | unsigned long nr_to_scan, ret = 0; | 1994 | unsigned long nr_to_scan, ret = 0; |
1995 | enum lru_list l; | ||
1738 | 1996 | ||
1739 | for_each_zone(zone) { | 1997 | for_each_zone(zone) { |
1740 | 1998 | ||
@@ -1744,38 +2002,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, | |||
1744 | if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) | 2002 | if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) |
1745 | continue; | 2003 | continue; |
1746 | 2004 | ||
1747 | /* For pass = 0 we don't shrink the active list */ | 2005 | for_each_evictable_lru(l) { |
1748 | if (pass > 0) { | 2006 | /* For pass = 0, we don't shrink the active list */ |
1749 | zone->nr_scan_active += | 2007 | if (pass == 0 && |
1750 | (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; | 2008 | (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE)) |
1751 | if (zone->nr_scan_active >= nr_pages || pass > 3) { | 2009 | continue; |
1752 | zone->nr_scan_active = 0; | 2010 | |
2011 | zone->lru[l].nr_scan += | ||
2012 | (zone_page_state(zone, NR_LRU_BASE + l) | ||
2013 | >> prio) + 1; | ||
2014 | if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { | ||
2015 | zone->lru[l].nr_scan = 0; | ||
1753 | nr_to_scan = min(nr_pages, | 2016 | nr_to_scan = min(nr_pages, |
1754 | zone_page_state(zone, NR_ACTIVE)); | 2017 | zone_page_state(zone, |
1755 | shrink_active_list(nr_to_scan, zone, sc, prio); | 2018 | NR_LRU_BASE + l)); |
2019 | ret += shrink_list(l, nr_to_scan, zone, | ||
2020 | sc, prio); | ||
2021 | if (ret >= nr_pages) | ||
2022 | return ret; | ||
1756 | } | 2023 | } |
1757 | } | 2024 | } |
1758 | |||
1759 | zone->nr_scan_inactive += | ||
1760 | (zone_page_state(zone, NR_INACTIVE) >> prio) + 1; | ||
1761 | if (zone->nr_scan_inactive >= nr_pages || pass > 3) { | ||
1762 | zone->nr_scan_inactive = 0; | ||
1763 | nr_to_scan = min(nr_pages, | ||
1764 | zone_page_state(zone, NR_INACTIVE)); | ||
1765 | ret += shrink_inactive_list(nr_to_scan, zone, sc); | ||
1766 | if (ret >= nr_pages) | ||
1767 | return ret; | ||
1768 | } | ||
1769 | } | 2025 | } |
1770 | 2026 | ||
1771 | return ret; | 2027 | return ret; |
1772 | } | 2028 | } |
1773 | 2029 | ||
1774 | static unsigned long count_lru_pages(void) | ||
1775 | { | ||
1776 | return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE); | ||
1777 | } | ||
1778 | |||
1779 | /* | 2030 | /* |
1780 | * Try to free `nr_pages' of memory, system-wide, and return the number of | 2031 | * Try to free `nr_pages' of memory, system-wide, and return the number of |
1781 | * freed pages. | 2032 | * freed pages. |
@@ -1801,7 +2052,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1801 | 2052 | ||
1802 | current->reclaim_state = &reclaim_state; | 2053 | current->reclaim_state = &reclaim_state; |
1803 | 2054 | ||
1804 | lru_pages = count_lru_pages(); | 2055 | lru_pages = global_lru_pages(); |
1805 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); | 2056 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); |
1806 | /* If slab caches are huge, it's better to hit them first */ | 2057 | /* If slab caches are huge, it's better to hit them first */ |
1807 | while (nr_slab >= lru_pages) { | 2058 | while (nr_slab >= lru_pages) { |
@@ -1844,7 +2095,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1844 | 2095 | ||
1845 | reclaim_state.reclaimed_slab = 0; | 2096 | reclaim_state.reclaimed_slab = 0; |
1846 | shrink_slab(sc.nr_scanned, sc.gfp_mask, | 2097 | shrink_slab(sc.nr_scanned, sc.gfp_mask, |
1847 | count_lru_pages()); | 2098 | global_lru_pages()); |
1848 | ret += reclaim_state.reclaimed_slab; | 2099 | ret += reclaim_state.reclaimed_slab; |
1849 | if (ret >= nr_pages) | 2100 | if (ret >= nr_pages) |
1850 | goto out; | 2101 | goto out; |
@@ -1861,7 +2112,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1861 | if (!ret) { | 2112 | if (!ret) { |
1862 | do { | 2113 | do { |
1863 | reclaim_state.reclaimed_slab = 0; | 2114 | reclaim_state.reclaimed_slab = 0; |
1864 | shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); | 2115 | shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); |
1865 | ret += reclaim_state.reclaimed_slab; | 2116 | ret += reclaim_state.reclaimed_slab; |
1866 | } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); | 2117 | } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); |
1867 | } | 2118 | } |
@@ -1940,7 +2191,7 @@ module_init(kswapd_init) | |||
1940 | int zone_reclaim_mode __read_mostly; | 2191 | int zone_reclaim_mode __read_mostly; |
1941 | 2192 | ||
1942 | #define RECLAIM_OFF 0 | 2193 | #define RECLAIM_OFF 0 |
1943 | #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ | 2194 | #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ |
1944 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ | 2195 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ |
1945 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ | 2196 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ |
1946 | 2197 | ||
@@ -2089,3 +2340,285 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2089 | return ret; | 2340 | return ret; |
2090 | } | 2341 | } |
2091 | #endif | 2342 | #endif |
2343 | |||
2344 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
2345 | /* | ||
2346 | * page_evictable - test whether a page is evictable | ||
2347 | * @page: the page to test | ||
2348 | * @vma: the VMA in which the page is or will be mapped, may be NULL | ||
2349 | * | ||
2350 | * Test whether page is evictable--i.e., should be placed on active/inactive | ||
2351 | * lists vs unevictable list. The vma argument is !NULL when called from the | ||
2352 | * fault path to determine how to instantiate a new page. | ||
2353 | * | ||
2354 | * Reasons page might not be evictable: | ||
2355 | * (1) page's mapping marked unevictable | ||
2356 | * (2) page is part of an mlocked VMA | ||
2357 | * | ||
2358 | */ | ||
2359 | int page_evictable(struct page *page, struct vm_area_struct *vma) | ||
2360 | { | ||
2361 | |||
2362 | if (mapping_unevictable(page_mapping(page))) | ||
2363 | return 0; | ||
2364 | |||
2365 | if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) | ||
2366 | return 0; | ||
2367 | |||
2368 | return 1; | ||
2369 | } | ||
2370 | |||
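As an illustration only (zone->lru_lock handling is omitted, and the helper name is invented), this is roughly how the putback paths earlier in this patch consume page_evictable(): a page that fails the test is handed to putback_lru_page(), which parks it on the unevictable list, while an evictable page goes back onto whichever LRU page_lru() selects.

/* Hedged sketch only; locking around the list manipulation is elided. */
static void example_putback(struct zone *zone, struct page *page)
{
	if (unlikely(!page_evictable(page, NULL))) {
		putback_lru_page(page);		/* culled to LRU_UNEVICTABLE */
		return;
	}
	SetPageLRU(page);
	add_page_to_lru_list(zone, page, page_lru(page));
}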
2371 | static void show_page_path(struct page *page) | ||
2372 | { | ||
2373 | char buf[256]; | ||
2374 | if (page_is_file_cache(page)) { | ||
2375 | struct address_space *mapping = page->mapping; | ||
2376 | struct dentry *dentry; | ||
2377 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
2378 | |||
2379 | spin_lock(&mapping->i_mmap_lock); | ||
2380 | dentry = d_find_alias(mapping->host); | ||
2381 | printk(KERN_INFO "rescued: %s %lu\n", | ||
2382 | dentry_path(dentry, buf, 256), pgoff); | ||
2383 | spin_unlock(&mapping->i_mmap_lock); | ||
2384 | } else { | ||
2385 | #if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU) | ||
2386 | struct anon_vma *anon_vma; | ||
2387 | struct vm_area_struct *vma; | ||
2388 | |||
2389 | anon_vma = page_lock_anon_vma(page); | ||
2390 | if (!anon_vma) | ||
2391 | return; | ||
2392 | |||
2393 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
2394 | printk(KERN_INFO "rescued: anon %s\n", | ||
2395 | vma->vm_mm->owner->comm); | ||
2396 | break; | ||
2397 | } | ||
2398 | page_unlock_anon_vma(anon_vma); | ||
2399 | #endif | ||
2400 | } | ||
2401 | } | ||
2402 | |||
2403 | |||
2404 | /** | ||
2405 | * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list | ||
2406 | * @page: page to check evictability and move to appropriate lru list | ||
2407 | * @zone: zone page is in | ||
2408 | * | ||
2409 | * Checks a page for evictability and moves the page to the appropriate | ||
2410 | * zone lru list. | ||
2411 | * | ||
2412 | * Restrictions: zone->lru_lock must be held, page must be on LRU and must | ||
2413 | * have PageUnevictable set. | ||
2414 | */ | ||
2415 | static void check_move_unevictable_page(struct page *page, struct zone *zone) | ||
2416 | { | ||
2417 | VM_BUG_ON(PageActive(page)); | ||
2418 | |||
2419 | retry: | ||
2420 | ClearPageUnevictable(page); | ||
2421 | if (page_evictable(page, NULL)) { | ||
2422 | enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); | ||
2423 | |||
2424 | show_page_path(page); | ||
2425 | |||
2426 | __dec_zone_state(zone, NR_UNEVICTABLE); | ||
2427 | list_move(&page->lru, &zone->lru[l].list); | ||
2428 | __inc_zone_state(zone, NR_INACTIVE_ANON + l); | ||
2429 | __count_vm_event(UNEVICTABLE_PGRESCUED); | ||
2430 | } else { | ||
2431 | /* | ||
2432 | * rotate unevictable list | ||
2433 | */ | ||
2434 | SetPageUnevictable(page); | ||
2435 | list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); | ||
2436 | if (page_evictable(page, NULL)) | ||
2437 | goto retry; | ||
2438 | } | ||
2439 | } | ||
2440 | |||
2441 | /** | ||
2442 | * scan_mapping_unevictable_pages - scan an address space for evictable pages | ||
2443 | * @mapping: struct address_space to scan for evictable pages | ||
2444 | * | ||
2445 | * Scan all pages in mapping. Check unevictable pages for | ||
2446 | * evictability and move them to the appropriate zone lru list. | ||
2447 | */ | ||
2448 | void scan_mapping_unevictable_pages(struct address_space *mapping) | ||
2449 | { | ||
2450 | pgoff_t next = 0; | ||
2451 | pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> | ||
2452 | PAGE_CACHE_SHIFT; | ||
2453 | struct zone *zone; | ||
2454 | struct pagevec pvec; | ||
2455 | |||
2456 | if (mapping->nrpages == 0) | ||
2457 | return; | ||
2458 | |||
2459 | pagevec_init(&pvec, 0); | ||
2460 | while (next < end && | ||
2461 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | ||
2462 | int i; | ||
2463 | int pg_scanned = 0; | ||
2464 | |||
2465 | zone = NULL; | ||
2466 | |||
2467 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
2468 | struct page *page = pvec.pages[i]; | ||
2469 | pgoff_t page_index = page->index; | ||
2470 | struct zone *pagezone = page_zone(page); | ||
2471 | |||
2472 | pg_scanned++; | ||
2473 | if (page_index > next) | ||
2474 | next = page_index; | ||
2475 | next++; | ||
2476 | |||
2477 | if (pagezone != zone) { | ||
2478 | if (zone) | ||
2479 | spin_unlock_irq(&zone->lru_lock); | ||
2480 | zone = pagezone; | ||
2481 | spin_lock_irq(&zone->lru_lock); | ||
2482 | } | ||
2483 | |||
2484 | if (PageLRU(page) && PageUnevictable(page)) | ||
2485 | check_move_unevictable_page(page, zone); | ||
2486 | } | ||
2487 | if (zone) | ||
2488 | spin_unlock_irq(&zone->lru_lock); | ||
2489 | pagevec_release(&pvec); | ||
2490 | |||
2491 | count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); | ||
2492 | } | ||
2493 | |||
2494 | } | ||
2495 | |||
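A hedged usage sketch, not part of this hunk: the motivating caller in this series is the SHM_UNLOCK path, where a previously locked shared-memory segment becomes evictable again. mapping_clear_unevictable() is assumed here as the counterpart of the mapping_unevictable() test used by page_evictable() above, and the function name is invented.

static void example_mapping_unlocked(struct address_space *mapping)
{
	/* the mapping is no longer pinned in memory ... */
	mapping_clear_unevictable(mapping);
	/* ... so give its culled pages a way back onto the normal LRUs */
	scan_mapping_unevictable_pages(mapping);
}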
2496 | /** | ||
2497 | * scan_zone_unevictable_pages - check unevictable list for evictable pages | ||
2498 | * @zone - zone whose unevictable list is to be scanned | ||
2499 | * | ||
2500 | * Scan @zone's unevictable LRU lists to check for pages that have become | ||
2501 | * evictable. Any that have become evictable are moved to @zone's inactive | ||
2502 | * list, where they become candidates for reclaim unless shrink_inactive_list() | ||
2503 | * decides to reactivate them. Pages that are still unevictable are rotated | ||
2504 | * back onto @zone's unevictable list. | ||
2505 | */ | ||
2506 | #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ | ||
2507 | void scan_zone_unevictable_pages(struct zone *zone) | ||
2508 | { | ||
2509 | struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; | ||
2510 | unsigned long scan; | ||
2511 | unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); | ||
2512 | |||
2513 | while (nr_to_scan > 0) { | ||
2514 | unsigned long batch_size = min(nr_to_scan, | ||
2515 | SCAN_UNEVICTABLE_BATCH_SIZE); | ||
2516 | |||
2517 | spin_lock_irq(&zone->lru_lock); | ||
2518 | for (scan = 0; scan < batch_size; scan++) { | ||
2519 | struct page *page = lru_to_page(l_unevictable); | ||
2520 | |||
2521 | if (!trylock_page(page)) | ||
2522 | continue; | ||
2523 | |||
2524 | prefetchw_prev_lru_page(page, l_unevictable, flags); | ||
2525 | |||
2526 | if (likely(PageLRU(page) && PageUnevictable(page))) | ||
2527 | check_move_unevictable_page(page, zone); | ||
2528 | |||
2529 | unlock_page(page); | ||
2530 | } | ||
2531 | spin_unlock_irq(&zone->lru_lock); | ||
2532 | |||
2533 | nr_to_scan -= batch_size; | ||
2534 | } | ||
2535 | } | ||
2536 | |||
2537 | |||
2538 | /** | ||
2539 | * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages | ||
2540 | * | ||
2541 | * A really big hammer: scan all zones' unevictable LRU lists to check for | ||
2542 | * pages that have become evictable. Move those back to the zones' | ||
2543 | * inactive list where they become candidates for reclaim. | ||
2544 | * This occurs when, e.g., we have unswappable pages on the unevictable lists, | ||
2545 | * and we add swap to the system. As such, it runs in the context of a task | ||
2546 | * that has possibly/probably made some previously unevictable pages | ||
2547 | * evictable. | ||
2548 | */ | ||
2549 | void scan_all_zones_unevictable_pages(void) | ||
2550 | { | ||
2551 | struct zone *zone; | ||
2552 | |||
2553 | for_each_zone(zone) { | ||
2554 | scan_zone_unevictable_pages(zone); | ||
2555 | } | ||
2556 | } | ||
2557 | |||
2558 | /* | ||
2559 | * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of | ||
2560 | * all nodes' unevictable lists for evictable pages | ||
2561 | */ | ||
2562 | unsigned long scan_unevictable_pages; | ||
2563 | |||
2564 | int scan_unevictable_handler(struct ctl_table *table, int write, | ||
2565 | struct file *file, void __user *buffer, | ||
2566 | size_t *length, loff_t *ppos) | ||
2567 | { | ||
2568 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | ||
2569 | |||
2570 | if (write && *(unsigned long *)table->data) | ||
2571 | scan_all_zones_unevictable_pages(); | ||
2572 | |||
2573 | scan_unevictable_pages = 0; | ||
2574 | return 0; | ||
2575 | } | ||
2576 | |||
2577 | /* | ||
2578 | * per node 'scan_unevictable_pages' attribute. On demand re-scan of | ||
2579 | * a specified node's per zone unevictable lists for evictable pages. | ||
2580 | */ | ||
2581 | |||
2582 | static ssize_t read_scan_unevictable_node(struct sys_device *dev, | ||
2583 | struct sysdev_attribute *attr, | ||
2584 | char *buf) | ||
2585 | { | ||
2586 | return sprintf(buf, "0\n"); /* always zero; should fit... */ | ||
2587 | } | ||
2588 | |||
2589 | static ssize_t write_scan_unevictable_node(struct sys_device *dev, | ||
2590 | struct sysdev_attribute *attr, | ||
2591 | const char *buf, size_t count) | ||
2592 | { | ||
2593 | struct zone *node_zones = NODE_DATA(dev->id)->node_zones; | ||
2594 | struct zone *zone; | ||
2595 | unsigned long res; | ||
2596 | unsigned long req = strict_strtoul(buf, 10, &res); | ||
2597 | |||
2598 | if (!req) | ||
2599 | return 1; /* zero is no-op */ | ||
2600 | |||
2601 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
2602 | if (!populated_zone(zone)) | ||
2603 | continue; | ||
2604 | scan_zone_unevictable_pages(zone); | ||
2605 | } | ||
2606 | return 1; | ||
2607 | } | ||
2608 | |||
2609 | |||
2610 | static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, | ||
2611 | read_scan_unevictable_node, | ||
2612 | write_scan_unevictable_node); | ||
2613 | |||
2614 | int scan_unevictable_register_node(struct node *node) | ||
2615 | { | ||
2616 | return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); | ||
2617 | } | ||
2618 | |||
2619 | void scan_unevictable_unregister_node(struct node *node) | ||
2620 | { | ||
2621 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); | ||
2622 | } | ||
2623 | |||
2624 | #endif | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index db9eabb2c5b3..c3ccfda23adc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -8,11 +8,12 @@ | |||
8 | * Copyright (C) 2006 Silicon Graphics, Inc., | 8 | * Copyright (C) 2006 Silicon Graphics, Inc., |
9 | * Christoph Lameter <christoph@lameter.com> | 9 | * Christoph Lameter <christoph@lameter.com> |
10 | */ | 10 | */ |
11 | 11 | #include <linux/fs.h> | |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/err.h> | 13 | #include <linux/err.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
16 | #include <linux/vmstat.h> | ||
16 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
17 | 18 | ||
18 | #ifdef CONFIG_VM_EVENT_COUNTERS | 19 | #ifdef CONFIG_VM_EVENT_COUNTERS |
@@ -26,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) | |||
26 | 27 | ||
27 | memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); | 28 | memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); |
28 | 29 | ||
29 | for_each_cpu_mask(cpu, *cpumask) { | 30 | for_each_cpu_mask_nr(cpu, *cpumask) { |
30 | struct vm_event_state *this = &per_cpu(vm_event_states, cpu); | 31 | struct vm_event_state *this = &per_cpu(vm_event_states, cpu); |
31 | 32 | ||
32 | for (i = 0; i < NR_VM_EVENT_ITEMS; i++) | 33 | for (i = 0; i < NR_VM_EVENT_ITEMS; i++) |
@@ -383,7 +384,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) | |||
383 | #endif | 384 | #endif |
384 | 385 | ||
385 | #ifdef CONFIG_PROC_FS | 386 | #ifdef CONFIG_PROC_FS |
386 | 387 | #include <linux/proc_fs.h> | |
387 | #include <linux/seq_file.h> | 388 | #include <linux/seq_file.h> |
388 | 389 | ||
389 | static char * const migratetype_names[MIGRATE_TYPES] = { | 390 | static char * const migratetype_names[MIGRATE_TYPES] = { |
@@ -515,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, | |||
515 | continue; | 516 | continue; |
516 | 517 | ||
517 | page = pfn_to_page(pfn); | 518 | page = pfn_to_page(pfn); |
519 | #ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES | ||
520 | /* | ||
521 | * Ordinarily, memory holes in flatmem still have a valid | ||
522 | * memmap for the PFN range. However, an architecture for | ||
523 | * embedded systems (e.g. ARM) can free up the memmap backing | ||
524 | * holes to save memory on the assumption the memmap is | ||
525 | * never used. The page_zone linkages are then broken even | ||
526 | * though pfn_valid() returns true. Skip the page if the | ||
527 | * linkages are broken. Even if this test passed, the impact | ||
528 | * is that the counters for the movable type are off but | ||
529 | * fragmentation monitoring is likely meaningless on small | ||
530 | * systems. | ||
531 | */ | ||
532 | if (page_zone(page) != zone) | ||
533 | continue; | ||
534 | #endif | ||
518 | mtype = get_pageblock_migratetype(page); | 535 | mtype = get_pageblock_migratetype(page); |
519 | 536 | ||
520 | count[mtype]++; | 537 | if (mtype < MIGRATE_TYPES) |
538 | count[mtype]++; | ||
521 | } | 539 | } |
522 | 540 | ||
523 | /* Print counts */ | 541 | /* Print counts */ |
@@ -563,20 +581,44 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) | |||
563 | return 0; | 581 | return 0; |
564 | } | 582 | } |
565 | 583 | ||
566 | const struct seq_operations fragmentation_op = { | 584 | static const struct seq_operations fragmentation_op = { |
567 | .start = frag_start, | 585 | .start = frag_start, |
568 | .next = frag_next, | 586 | .next = frag_next, |
569 | .stop = frag_stop, | 587 | .stop = frag_stop, |
570 | .show = frag_show, | 588 | .show = frag_show, |
571 | }; | 589 | }; |
572 | 590 | ||
573 | const struct seq_operations pagetypeinfo_op = { | 591 | static int fragmentation_open(struct inode *inode, struct file *file) |
592 | { | ||
593 | return seq_open(file, &fragmentation_op); | ||
594 | } | ||
595 | |||
596 | static const struct file_operations fragmentation_file_operations = { | ||
597 | .open = fragmentation_open, | ||
598 | .read = seq_read, | ||
599 | .llseek = seq_lseek, | ||
600 | .release = seq_release, | ||
601 | }; | ||
602 | |||
603 | static const struct seq_operations pagetypeinfo_op = { | ||
574 | .start = frag_start, | 604 | .start = frag_start, |
575 | .next = frag_next, | 605 | .next = frag_next, |
576 | .stop = frag_stop, | 606 | .stop = frag_stop, |
577 | .show = pagetypeinfo_show, | 607 | .show = pagetypeinfo_show, |
578 | }; | 608 | }; |
579 | 609 | ||
610 | static int pagetypeinfo_open(struct inode *inode, struct file *file) | ||
611 | { | ||
612 | return seq_open(file, &pagetypeinfo_op); | ||
613 | } | ||
614 | |||
615 | static const struct file_operations pagetypeinfo_file_ops = { | ||
616 | .open = pagetypeinfo_open, | ||
617 | .read = seq_read, | ||
618 | .llseek = seq_lseek, | ||
619 | .release = seq_release, | ||
620 | }; | ||
621 | |||
580 | #ifdef CONFIG_ZONE_DMA | 622 | #ifdef CONFIG_ZONE_DMA |
581 | #define TEXT_FOR_DMA(xx) xx "_dma", | 623 | #define TEXT_FOR_DMA(xx) xx "_dma", |
582 | #else | 624 | #else |
@@ -601,8 +643,14 @@ const struct seq_operations pagetypeinfo_op = { | |||
601 | static const char * const vmstat_text[] = { | 643 | static const char * const vmstat_text[] = { |
602 | /* Zoned VM counters */ | 644 | /* Zoned VM counters */ |
603 | "nr_free_pages", | 645 | "nr_free_pages", |
604 | "nr_inactive", | 646 | "nr_inactive_anon", |
605 | "nr_active", | 647 | "nr_active_anon", |
648 | "nr_inactive_file", | ||
649 | "nr_active_file", | ||
650 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
651 | "nr_unevictable", | ||
652 | "nr_mlock", | ||
653 | #endif | ||
606 | "nr_anon_pages", | 654 | "nr_anon_pages", |
607 | "nr_mapped", | 655 | "nr_mapped", |
608 | "nr_file_pages", | 656 | "nr_file_pages", |
@@ -657,6 +705,16 @@ static const char * const vmstat_text[] = { | |||
657 | "htlb_buddy_alloc_success", | 705 | "htlb_buddy_alloc_success", |
658 | "htlb_buddy_alloc_fail", | 706 | "htlb_buddy_alloc_fail", |
659 | #endif | 707 | #endif |
708 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
709 | "unevictable_pgs_culled", | ||
710 | "unevictable_pgs_scanned", | ||
711 | "unevictable_pgs_rescued", | ||
712 | "unevictable_pgs_mlocked", | ||
713 | "unevictable_pgs_munlocked", | ||
714 | "unevictable_pgs_cleared", | ||
715 | "unevictable_pgs_stranded", | ||
716 | "unevictable_pgs_mlockfreed", | ||
717 | #endif | ||
660 | #endif | 718 | #endif |
661 | }; | 719 | }; |
662 | 720 | ||
@@ -670,7 +728,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
670 | "\n min %lu" | 728 | "\n min %lu" |
671 | "\n low %lu" | 729 | "\n low %lu" |
672 | "\n high %lu" | 730 | "\n high %lu" |
673 | "\n scanned %lu (a: %lu i: %lu)" | 731 | "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" |
674 | "\n spanned %lu" | 732 | "\n spanned %lu" |
675 | "\n present %lu", | 733 | "\n present %lu", |
676 | zone_page_state(zone, NR_FREE_PAGES), | 734 | zone_page_state(zone, NR_FREE_PAGES), |
@@ -678,7 +736,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
678 | zone->pages_low, | 736 | zone->pages_low, |
679 | zone->pages_high, | 737 | zone->pages_high, |
680 | zone->pages_scanned, | 738 | zone->pages_scanned, |
681 | zone->nr_scan_active, zone->nr_scan_inactive, | 739 | zone->lru[LRU_ACTIVE_ANON].nr_scan, |
740 | zone->lru[LRU_INACTIVE_ANON].nr_scan, | ||
741 | zone->lru[LRU_ACTIVE_FILE].nr_scan, | ||
742 | zone->lru[LRU_INACTIVE_FILE].nr_scan, | ||
682 | zone->spanned_pages, | 743 | zone->spanned_pages, |
683 | zone->present_pages); | 744 | zone->present_pages); |
684 | 745 | ||
@@ -715,10 +776,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
715 | seq_printf(m, | 776 | seq_printf(m, |
716 | "\n all_unreclaimable: %u" | 777 | "\n all_unreclaimable: %u" |
717 | "\n prev_priority: %i" | 778 | "\n prev_priority: %i" |
718 | "\n start_pfn: %lu", | 779 | "\n start_pfn: %lu" |
780 | "\n inactive_ratio: %u", | ||
719 | zone_is_all_unreclaimable(zone), | 781 | zone_is_all_unreclaimable(zone), |
720 | zone->prev_priority, | 782 | zone->prev_priority, |
721 | zone->zone_start_pfn); | 783 | zone->zone_start_pfn, |
784 | zone->inactive_ratio); | ||
722 | seq_putc(m, '\n'); | 785 | seq_putc(m, '\n'); |
723 | } | 786 | } |
724 | 787 | ||
@@ -732,7 +795,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
732 | return 0; | 795 | return 0; |
733 | } | 796 | } |
734 | 797 | ||
735 | const struct seq_operations zoneinfo_op = { | 798 | static const struct seq_operations zoneinfo_op = { |
736 | .start = frag_start, /* iterate over all zones. The same as in | 799 | .start = frag_start, /* iterate over all zones. The same as in |
737 | * fragmentation. */ | 800 | * fragmentation. */ |
738 | .next = frag_next, | 801 | .next = frag_next, |
@@ -740,6 +803,18 @@ const struct seq_operations zoneinfo_op = { | |||
740 | .show = zoneinfo_show, | 803 | .show = zoneinfo_show, |
741 | }; | 804 | }; |
742 | 805 | ||
806 | static int zoneinfo_open(struct inode *inode, struct file *file) | ||
807 | { | ||
808 | return seq_open(file, &zoneinfo_op); | ||
809 | } | ||
810 | |||
811 | static const struct file_operations proc_zoneinfo_file_operations = { | ||
812 | .open = zoneinfo_open, | ||
813 | .read = seq_read, | ||
814 | .llseek = seq_lseek, | ||
815 | .release = seq_release, | ||
816 | }; | ||
817 | |||
743 | static void *vmstat_start(struct seq_file *m, loff_t *pos) | 818 | static void *vmstat_start(struct seq_file *m, loff_t *pos) |
744 | { | 819 | { |
745 | unsigned long *v; | 820 | unsigned long *v; |
@@ -795,13 +870,24 @@ static void vmstat_stop(struct seq_file *m, void *arg) | |||
795 | m->private = NULL; | 870 | m->private = NULL; |
796 | } | 871 | } |
797 | 872 | ||
798 | const struct seq_operations vmstat_op = { | 873 | static const struct seq_operations vmstat_op = { |
799 | .start = vmstat_start, | 874 | .start = vmstat_start, |
800 | .next = vmstat_next, | 875 | .next = vmstat_next, |
801 | .stop = vmstat_stop, | 876 | .stop = vmstat_stop, |
802 | .show = vmstat_show, | 877 | .show = vmstat_show, |
803 | }; | 878 | }; |
804 | 879 | ||
880 | static int vmstat_open(struct inode *inode, struct file *file) | ||
881 | { | ||
882 | return seq_open(file, &vmstat_op); | ||
883 | } | ||
884 | |||
885 | static const struct file_operations proc_vmstat_file_operations = { | ||
886 | .open = vmstat_open, | ||
887 | .read = seq_read, | ||
888 | .llseek = seq_lseek, | ||
889 | .release = seq_release, | ||
890 | }; | ||
805 | #endif /* CONFIG_PROC_FS */ | 891 | #endif /* CONFIG_PROC_FS */ |
806 | 892 | ||
807 | #ifdef CONFIG_SMP | 893 | #ifdef CONFIG_SMP |
@@ -859,9 +945,11 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
859 | 945 | ||
860 | static struct notifier_block __cpuinitdata vmstat_notifier = | 946 | static struct notifier_block __cpuinitdata vmstat_notifier = |
861 | { &vmstat_cpuup_callback, NULL, 0 }; | 947 | { &vmstat_cpuup_callback, NULL, 0 }; |
948 | #endif | ||
862 | 949 | ||
863 | static int __init setup_vmstat(void) | 950 | static int __init setup_vmstat(void) |
864 | { | 951 | { |
952 | #ifdef CONFIG_SMP | ||
865 | int cpu; | 953 | int cpu; |
866 | 954 | ||
867 | refresh_zone_stat_thresholds(); | 955 | refresh_zone_stat_thresholds(); |
@@ -869,7 +957,13 @@ static int __init setup_vmstat(void) | |||
869 | 957 | ||
870 | for_each_online_cpu(cpu) | 958 | for_each_online_cpu(cpu) |
871 | start_cpu_timer(cpu); | 959 | start_cpu_timer(cpu); |
960 | #endif | ||
961 | #ifdef CONFIG_PROC_FS | ||
962 | proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); | ||
963 | proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); | ||
964 | proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); | ||
965 | proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); | ||
966 | #endif | ||
872 | return 0; | 967 | return 0; |
873 | } | 968 | } |
874 | module_init(setup_vmstat) | 969 | module_init(setup_vmstat) |
875 | #endif | ||