author     David Woodhouse <dwmw2@infradead.org>   2006-10-01 12:55:53 -0400
committer  David Woodhouse <dwmw2@infradead.org>   2006-10-01 12:55:53 -0400
commit     8a84fc15ae5cafcc366dd85cf8e1ab2040679abc (patch)
tree       5d8dce194c9667fa92e9ec9f545cec867a9a1e0d /mm
parent     28b79ff9661b22e4c41c0d00d4ab8503e810f13d (diff)
parent     82965addad66fce61a92c5f03104ea90b0b87124 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Manually resolve conflict in include/mtd/Kbuild
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |    7
-rw-r--r--  mm/Makefile         |    6
-rw-r--r--  mm/allocpercpu.c    |  129
-rw-r--r--  mm/bootmem.c        |  202
-rw-r--r--  mm/bounce.c         |  302
-rw-r--r--  mm/filemap.c        |  211
-rw-r--r--  mm/fremap.c         |    6
-rw-r--r--  mm/highmem.c        |  294
-rw-r--r--  mm/hugetlb.c        |   10
-rw-r--r--  mm/internal.h       |    4
-rw-r--r--  mm/memory.c         |  215
-rw-r--r--  mm/memory_hotplug.c |   71
-rw-r--r--  mm/mempolicy.c      |   26
-rw-r--r--  mm/migrate.c        |    6
-rw-r--r--  mm/mmap.c           |   19
-rw-r--r--  mm/mprotect.c       |   53
-rw-r--r--  mm/mremap.c         |    2
-rw-r--r--  mm/msync.c          |  196
-rw-r--r--  mm/nommu.c          |  250
-rw-r--r--  mm/oom_kill.c       |  126
-rw-r--r--  mm/page-writeback.c |  198
-rw-r--r--  mm/page_alloc.c     |  976
-rw-r--r--  mm/page_io.c        |   48
-rw-r--r--  mm/rmap.c           |   65
-rw-r--r--  mm/shmem.c          |  122
-rw-r--r--  mm/shmem_acl.c      |  197
-rw-r--r--  mm/slab.c           |  458
-rw-r--r--  mm/slob.c           |   52
-rw-r--r--  mm/swap.c           |   49
-rw-r--r--  mm/swapfile.c       |    7
-rw-r--r--  mm/truncate.c       |   85
-rw-r--r--  mm/util.c           |   18
-rw-r--r--  mm/vmalloc.c        |   38
-rw-r--r--  mm/vmscan.c         |  140
-rw-r--r--  mm/vmstat.c         |   52
35 files changed, 3199 insertions, 1441 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8f5b45615f..5d88489ef2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,12 +115,17 @@ config SPARSEMEM_EXTREME | |||
115 | # eventually, we can have this option just 'select SPARSEMEM' | 115 | # eventually, we can have this option just 'select SPARSEMEM' |
116 | config MEMORY_HOTPLUG | 116 | config MEMORY_HOTPLUG |
117 | bool "Allow for memory hot-add" | 117 | bool "Allow for memory hot-add" |
118 | depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG | 118 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
119 | depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG | ||
119 | depends on (IA64 || X86 || PPC64) | 120 | depends on (IA64 || X86 || PPC64) |
120 | 121 | ||
121 | comment "Memory hotplug is currently incompatible with Software Suspend" | 122 | comment "Memory hotplug is currently incompatible with Software Suspend" |
122 | depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND | 123 | depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND |
123 | 124 | ||
125 | config MEMORY_HOTPLUG_SPARSE | ||
126 | def_bool y | ||
127 | depends on SPARSEMEM && MEMORY_HOTPLUG | ||
128 | |||
124 | # Heavily threaded applications may benefit from splitting the mm-wide | 129 | # Heavily threaded applications may benefit from splitting the mm-wide |
125 | # page_table_lock, so that faults on different parts of the user address | 130 | # page_table_lock, so that faults on different parts of the user address |
126 | # space can be handled with less contention: split it at this NR_CPUS. | 131 | # space can be handled with less contention: split it at this NR_CPUS. |
diff --git a/mm/Makefile b/mm/Makefile
index 9dd824c11e..12b3a4eee8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,15 +12,19 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | |||
12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o \ |
13 | prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) | 13 | prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) |
14 | 14 | ||
15 | ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) | ||
16 | obj-y += bounce.o | ||
17 | endif | ||
15 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 18 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o |
16 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 19 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
17 | obj-$(CONFIG_NUMA) += mempolicy.o | 20 | obj-$(CONFIG_NUMA) += mempolicy.o |
18 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 21 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
19 | obj-$(CONFIG_SHMEM) += shmem.o | 22 | obj-$(CONFIG_SHMEM) += shmem.o |
23 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | ||
20 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 24 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o |
21 | obj-$(CONFIG_SLOB) += slob.o | 25 | obj-$(CONFIG_SLOB) += slob.o |
22 | obj-$(CONFIG_SLAB) += slab.o | 26 | obj-$(CONFIG_SLAB) += slab.o |
23 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 27 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
24 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 28 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
25 | obj-$(CONFIG_MIGRATION) += migrate.o | 29 | obj-$(CONFIG_MIGRATION) += migrate.o |
26 | 30 | obj-$(CONFIG_SMP) += allocpercpu.o | |
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
new file mode 100644
index 0000000000..eaa9abeea5
--- /dev/null
+++ b/mm/allocpercpu.c
@@ -0,0 +1,129 @@ | |||
1 | /* | ||
2 | * linux/mm/allocpercpu.c | ||
3 | * | ||
4 | * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com> | ||
5 | */ | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/module.h> | ||
8 | |||
9 | /** | ||
10 | * percpu_depopulate - depopulate per-cpu data for given cpu | ||
11 | * @__pdata: per-cpu data to depopulate | ||
12 | * @cpu: depopulate per-cpu data for this cpu | ||
13 | * | ||
14 | * Depopulating per-cpu data for a cpu going offline would be a typical | ||
15 | * use case. You need to register a cpu hotplug handler for that purpose. | ||
16 | */ | ||
17 | void percpu_depopulate(void *__pdata, int cpu) | ||
18 | { | ||
19 | struct percpu_data *pdata = __percpu_disguise(__pdata); | ||
20 | if (pdata->ptrs[cpu]) { | ||
21 | kfree(pdata->ptrs[cpu]); | ||
22 | pdata->ptrs[cpu] = NULL; | ||
23 | } | ||
24 | } | ||
25 | EXPORT_SYMBOL_GPL(percpu_depopulate); | ||
26 | |||
27 | /** | ||
28 | * percpu_depopulate_mask - depopulate per-cpu data for some cpu's | ||
29 | * @__pdata: per-cpu data to depopulate | ||
30 | * @mask: depopulate per-cpu data for cpu's selected through mask bits | ||
31 | */ | ||
32 | void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) | ||
33 | { | ||
34 | int cpu; | ||
35 | for_each_cpu_mask(cpu, *mask) | ||
36 | percpu_depopulate(__pdata, cpu); | ||
37 | } | ||
38 | EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); | ||
39 | |||
40 | /** | ||
41 | * percpu_populate - populate per-cpu data for given cpu | ||
42 | * @__pdata: per-cpu data to populate further | ||
43 | * @size: size of per-cpu object | ||
44 | * @gfp: may sleep or not etc. | ||
45 | * @cpu: populate per-data for this cpu | ||
46 | * | ||
47 | * Populating per-cpu data for a cpu coming online would be a typical | ||
48 | * use case. You need to register a cpu hotplug handler for that purpose. | ||
49 | * Per-cpu object is populated with zeroed buffer. | ||
50 | */ | ||
51 | void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) | ||
52 | { | ||
53 | struct percpu_data *pdata = __percpu_disguise(__pdata); | ||
54 | int node = cpu_to_node(cpu); | ||
55 | |||
56 | BUG_ON(pdata->ptrs[cpu]); | ||
57 | if (node_online(node)) { | ||
58 | /* FIXME: kzalloc_node(size, gfp, node) */ | ||
59 | pdata->ptrs[cpu] = kmalloc_node(size, gfp, node); | ||
60 | if (pdata->ptrs[cpu]) | ||
61 | memset(pdata->ptrs[cpu], 0, size); | ||
62 | } else | ||
63 | pdata->ptrs[cpu] = kzalloc(size, gfp); | ||
64 | return pdata->ptrs[cpu]; | ||
65 | } | ||
66 | EXPORT_SYMBOL_GPL(percpu_populate); | ||
67 | |||
68 | /** | ||
69 | * percpu_populate_mask - populate per-cpu data for more cpu's | ||
70 | * @__pdata: per-cpu data to populate further | ||
71 | * @size: size of per-cpu object | ||
72 | * @gfp: may sleep or not etc. | ||
73 | * @mask: populate per-cpu data for cpu's selected through mask bits | ||
74 | * | ||
75 | * Per-cpu objects are populated with zeroed buffers. | ||
76 | */ | ||
77 | int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, | ||
78 | cpumask_t *mask) | ||
79 | { | ||
80 | cpumask_t populated = CPU_MASK_NONE; | ||
81 | int cpu; | ||
82 | |||
83 | for_each_cpu_mask(cpu, *mask) | ||
84 | if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { | ||
85 | __percpu_depopulate_mask(__pdata, &populated); | ||
86 | return -ENOMEM; | ||
87 | } else | ||
88 | cpu_set(cpu, populated); | ||
89 | return 0; | ||
90 | } | ||
91 | EXPORT_SYMBOL_GPL(__percpu_populate_mask); | ||
92 | |||
93 | /** | ||
94 | * percpu_alloc_mask - initial setup of per-cpu data | ||
95 | * @size: size of per-cpu object | ||
96 | * @gfp: may sleep or not etc. | ||
97 | * @mask: populate per-data for cpu's selected through mask bits | ||
98 | * | ||
99 | * Populating per-cpu data for all online cpu's would be a typical use case, | ||
100 | * which is simplified by the percpu_alloc() wrapper. | ||
101 | * Per-cpu objects are populated with zeroed buffers. | ||
102 | */ | ||
103 | void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) | ||
104 | { | ||
105 | void *pdata = kzalloc(sizeof(struct percpu_data), gfp); | ||
106 | void *__pdata = __percpu_disguise(pdata); | ||
107 | |||
108 | if (unlikely(!pdata)) | ||
109 | return NULL; | ||
110 | if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) | ||
111 | return __pdata; | ||
112 | kfree(pdata); | ||
113 | return NULL; | ||
114 | } | ||
115 | EXPORT_SYMBOL_GPL(__percpu_alloc_mask); | ||
116 | |||
117 | /** | ||
118 | * percpu_free - final cleanup of per-cpu data | ||
119 | * @__pdata: object to clean up | ||
120 | * | ||
121 | * We simply clean up any per-cpu object left. No need for the client to | ||
122 | * track and specify through a bis mask which per-cpu objects are to free. | ||
123 | */ | ||
124 | void percpu_free(void *__pdata) | ||
125 | { | ||
126 | __percpu_depopulate_mask(__pdata, &cpu_possible_map); | ||
127 | kfree(__percpu_disguise(__pdata)); | ||
128 | } | ||
129 | EXPORT_SYMBOL_GPL(percpu_free); | ||
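
The kerneldoc comments above spell out the interface that the new mm/allocpercpu.c exports. As a rough usage sketch (not part of this commit), a client would allocate for the online CPUs, reach its own CPU's copy through an accessor, and release everything in one call. The per_cpu_ptr() accessor, cpu_online_map and get_cpu()/put_cpu() are assumed to come from <linux/percpu.h>, <linux/cpumask.h> and <linux/smp.h>; the example_* names are invented.

/*
 * Illustrative only, not part of the patch: a zeroed per-CPU counter
 * allocated for all online CPUs, updated locally, then released.
 */
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/gfp.h>
#include <linux/errno.h>

struct hit_counter {
	unsigned long hits;
};

static struct hit_counter *counters;		/* "disguised" per-cpu pointer */

static int example_init(void)
{
	/* one zeroed object per online CPU; wraps percpu_populate() per CPU */
	counters = __percpu_alloc_mask(sizeof(*counters), GFP_KERNEL,
				       &cpu_online_map);
	return counters ? 0 : -ENOMEM;
}

static void example_hit(void)
{
	int cpu = get_cpu();			/* pin to this CPU */

	per_cpu_ptr(counters, cpu)->hits++;	/* assumed accessor from <linux/percpu.h> */
	put_cpu();
}

static void example_exit(void)
{
	percpu_free(counters);	/* depopulates every CPU, then frees the descriptor */
}
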
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 50353e0dac..d53112fcb4 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -8,17 +8,15 @@ | |||
8 | * free memory collector. It's used to deal with reserved | 8 | * free memory collector. It's used to deal with reserved |
9 | * system memory and memory holes as well. | 9 | * system memory and memory holes as well. |
10 | */ | 10 | */ |
11 | |||
12 | #include <linux/mm.h> | ||
13 | #include <linux/kernel_stat.h> | ||
14 | #include <linux/swap.h> | ||
15 | #include <linux/interrupt.h> | ||
16 | #include <linux/init.h> | 11 | #include <linux/init.h> |
12 | #include <linux/pfn.h> | ||
17 | #include <linux/bootmem.h> | 13 | #include <linux/bootmem.h> |
18 | #include <linux/mmzone.h> | ||
19 | #include <linux/module.h> | 14 | #include <linux/module.h> |
20 | #include <asm/dma.h> | 15 | |
16 | #include <asm/bug.h> | ||
21 | #include <asm/io.h> | 17 | #include <asm/io.h> |
18 | #include <asm/processor.h> | ||
19 | |||
22 | #include "internal.h" | 20 | #include "internal.h" |
23 | 21 | ||
24 | /* | 22 | /* |
@@ -41,7 +39,7 @@ unsigned long saved_max_pfn; | |||
41 | #endif | 39 | #endif |
42 | 40 | ||
43 | /* return the number of _pages_ that will be allocated for the boot bitmap */ | 41 | /* return the number of _pages_ that will be allocated for the boot bitmap */ |
44 | unsigned long __init bootmem_bootmap_pages (unsigned long pages) | 42 | unsigned long __init bootmem_bootmap_pages(unsigned long pages) |
45 | { | 43 | { |
46 | unsigned long mapsize; | 44 | unsigned long mapsize; |
47 | 45 | ||
@@ -51,12 +49,14 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages) | |||
51 | 49 | ||
52 | return mapsize; | 50 | return mapsize; |
53 | } | 51 | } |
52 | |||
54 | /* | 53 | /* |
55 | * link bdata in order | 54 | * link bdata in order |
56 | */ | 55 | */ |
57 | static void link_bootmem(bootmem_data_t *bdata) | 56 | static void __init link_bootmem(bootmem_data_t *bdata) |
58 | { | 57 | { |
59 | bootmem_data_t *ent; | 58 | bootmem_data_t *ent; |
59 | |||
60 | if (list_empty(&bdata_list)) { | 60 | if (list_empty(&bdata_list)) { |
61 | list_add(&bdata->list, &bdata_list); | 61 | list_add(&bdata->list, &bdata_list); |
62 | return; | 62 | return; |
@@ -69,22 +69,32 @@ static void link_bootmem(bootmem_data_t *bdata) | |||
69 | } | 69 | } |
70 | } | 70 | } |
71 | list_add_tail(&bdata->list, &bdata_list); | 71 | list_add_tail(&bdata->list, &bdata_list); |
72 | return; | ||
73 | } | 72 | } |
74 | 73 | ||
74 | /* | ||
75 | * Given an initialised bdata, it returns the size of the boot bitmap | ||
76 | */ | ||
77 | static unsigned long __init get_mapsize(bootmem_data_t *bdata) | ||
78 | { | ||
79 | unsigned long mapsize; | ||
80 | unsigned long start = PFN_DOWN(bdata->node_boot_start); | ||
81 | unsigned long end = bdata->node_low_pfn; | ||
82 | |||
83 | mapsize = ((end - start) + 7) / 8; | ||
84 | return ALIGN(mapsize, sizeof(long)); | ||
85 | } | ||
75 | 86 | ||
76 | /* | 87 | /* |
77 | * Called once to set up the allocator itself. | 88 | * Called once to set up the allocator itself. |
78 | */ | 89 | */ |
79 | static unsigned long __init init_bootmem_core (pg_data_t *pgdat, | 90 | static unsigned long __init init_bootmem_core(pg_data_t *pgdat, |
80 | unsigned long mapstart, unsigned long start, unsigned long end) | 91 | unsigned long mapstart, unsigned long start, unsigned long end) |
81 | { | 92 | { |
82 | bootmem_data_t *bdata = pgdat->bdata; | 93 | bootmem_data_t *bdata = pgdat->bdata; |
83 | unsigned long mapsize = ((end - start)+7)/8; | 94 | unsigned long mapsize; |
84 | 95 | ||
85 | mapsize = ALIGN(mapsize, sizeof(long)); | 96 | bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); |
86 | bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); | 97 | bdata->node_boot_start = PFN_PHYS(start); |
87 | bdata->node_boot_start = (start << PAGE_SHIFT); | ||
88 | bdata->node_low_pfn = end; | 98 | bdata->node_low_pfn = end; |
89 | link_bootmem(bdata); | 99 | link_bootmem(bdata); |
90 | 100 | ||
@@ -92,6 +102,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, | |||
92 | * Initially all pages are reserved - setup_arch() has to | 102 | * Initially all pages are reserved - setup_arch() has to |
93 | * register free RAM areas explicitly. | 103 | * register free RAM areas explicitly. |
94 | */ | 104 | */ |
105 | mapsize = get_mapsize(bdata); | ||
95 | memset(bdata->node_bootmem_map, 0xff, mapsize); | 106 | memset(bdata->node_bootmem_map, 0xff, mapsize); |
96 | 107 | ||
97 | return mapsize; | 108 | return mapsize; |
@@ -102,22 +113,22 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, | |||
102 | * might be used for boot-time allocations - or it might get added | 113 | * might be used for boot-time allocations - or it might get added |
103 | * to the free page pool later on. | 114 | * to the free page pool later on. |
104 | */ | 115 | */ |
105 | static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) | 116 | static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, |
117 | unsigned long size) | ||
106 | { | 118 | { |
119 | unsigned long sidx, eidx; | ||
107 | unsigned long i; | 120 | unsigned long i; |
121 | |||
108 | /* | 122 | /* |
109 | * round up, partially reserved pages are considered | 123 | * round up, partially reserved pages are considered |
110 | * fully reserved. | 124 | * fully reserved. |
111 | */ | 125 | */ |
112 | unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE; | ||
113 | unsigned long eidx = (addr + size - bdata->node_boot_start + | ||
114 | PAGE_SIZE-1)/PAGE_SIZE; | ||
115 | unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; | ||
116 | |||
117 | BUG_ON(!size); | 126 | BUG_ON(!size); |
118 | BUG_ON(sidx >= eidx); | 127 | BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn); |
119 | BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn); | 128 | BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn); |
120 | BUG_ON(end > bdata->node_low_pfn); | 129 | |
130 | sidx = PFN_DOWN(addr - bdata->node_boot_start); | ||
131 | eidx = PFN_UP(addr + size - bdata->node_boot_start); | ||
121 | 132 | ||
122 | for (i = sidx; i < eidx; i++) | 133 | for (i = sidx; i < eidx; i++) |
123 | if (test_and_set_bit(i, bdata->node_bootmem_map)) { | 134 | if (test_and_set_bit(i, bdata->node_bootmem_map)) { |
@@ -127,20 +138,18 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long add | |||
127 | } | 138 | } |
128 | } | 139 | } |
129 | 140 | ||
130 | static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) | 141 | static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, |
142 | unsigned long size) | ||
131 | { | 143 | { |
144 | unsigned long sidx, eidx; | ||
132 | unsigned long i; | 145 | unsigned long i; |
133 | unsigned long start; | 146 | |
134 | /* | 147 | /* |
135 | * round down end of usable mem, partially free pages are | 148 | * round down end of usable mem, partially free pages are |
136 | * considered reserved. | 149 | * considered reserved. |
137 | */ | 150 | */ |
138 | unsigned long sidx; | ||
139 | unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE; | ||
140 | unsigned long end = (addr + size)/PAGE_SIZE; | ||
141 | |||
142 | BUG_ON(!size); | 151 | BUG_ON(!size); |
143 | BUG_ON(end > bdata->node_low_pfn); | 152 | BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn); |
144 | 153 | ||
145 | if (addr < bdata->last_success) | 154 | if (addr < bdata->last_success) |
146 | bdata->last_success = addr; | 155 | bdata->last_success = addr; |
@@ -148,8 +157,8 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, | |||
148 | /* | 157 | /* |
149 | * Round up the beginning of the address. | 158 | * Round up the beginning of the address. |
150 | */ | 159 | */ |
151 | start = (addr + PAGE_SIZE-1) / PAGE_SIZE; | 160 | sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start); |
152 | sidx = start - (bdata->node_boot_start/PAGE_SIZE); | 161 | eidx = PFN_DOWN(addr + size - bdata->node_boot_start); |
153 | 162 | ||
154 | for (i = sidx; i < eidx; i++) { | 163 | for (i = sidx; i < eidx; i++) { |
155 | if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) | 164 | if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) |
@@ -175,10 +184,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | |||
175 | unsigned long align, unsigned long goal, unsigned long limit) | 184 | unsigned long align, unsigned long goal, unsigned long limit) |
176 | { | 185 | { |
177 | unsigned long offset, remaining_size, areasize, preferred; | 186 | unsigned long offset, remaining_size, areasize, preferred; |
178 | unsigned long i, start = 0, incr, eidx, end_pfn = bdata->node_low_pfn; | 187 | unsigned long i, start = 0, incr, eidx, end_pfn; |
179 | void *ret; | 188 | void *ret; |
180 | 189 | ||
181 | if(!size) { | 190 | if (!size) { |
182 | printk("__alloc_bootmem_core(): zero-sized request\n"); | 191 | printk("__alloc_bootmem_core(): zero-sized request\n"); |
183 | BUG(); | 192 | BUG(); |
184 | } | 193 | } |
@@ -187,23 +196,22 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | |||
187 | if (limit && bdata->node_boot_start >= limit) | 196 | if (limit && bdata->node_boot_start >= limit) |
188 | return NULL; | 197 | return NULL; |
189 | 198 | ||
190 | limit >>=PAGE_SHIFT; | 199 | end_pfn = bdata->node_low_pfn; |
200 | limit = PFN_DOWN(limit); | ||
191 | if (limit && end_pfn > limit) | 201 | if (limit && end_pfn > limit) |
192 | end_pfn = limit; | 202 | end_pfn = limit; |
193 | 203 | ||
194 | eidx = end_pfn - (bdata->node_boot_start >> PAGE_SHIFT); | 204 | eidx = end_pfn - PFN_DOWN(bdata->node_boot_start); |
195 | offset = 0; | 205 | offset = 0; |
196 | if (align && | 206 | if (align && (bdata->node_boot_start & (align - 1UL)) != 0) |
197 | (bdata->node_boot_start & (align - 1UL)) != 0) | 207 | offset = align - (bdata->node_boot_start & (align - 1UL)); |
198 | offset = (align - (bdata->node_boot_start & (align - 1UL))); | 208 | offset = PFN_DOWN(offset); |
199 | offset >>= PAGE_SHIFT; | ||
200 | 209 | ||
201 | /* | 210 | /* |
202 | * We try to allocate bootmem pages above 'goal' | 211 | * We try to allocate bootmem pages above 'goal' |
203 | * first, then we try to allocate lower pages. | 212 | * first, then we try to allocate lower pages. |
204 | */ | 213 | */ |
205 | if (goal && (goal >= bdata->node_boot_start) && | 214 | if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) { |
206 | ((goal >> PAGE_SHIFT) < end_pfn)) { | ||
207 | preferred = goal - bdata->node_boot_start; | 215 | preferred = goal - bdata->node_boot_start; |
208 | 216 | ||
209 | if (bdata->last_success >= preferred) | 217 | if (bdata->last_success >= preferred) |
@@ -212,9 +220,8 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | |||
212 | } else | 220 | } else |
213 | preferred = 0; | 221 | preferred = 0; |
214 | 222 | ||
215 | preferred = ALIGN(preferred, align) >> PAGE_SHIFT; | 223 | preferred = PFN_DOWN(ALIGN(preferred, align)) + offset; |
216 | preferred += offset; | 224 | areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; |
217 | areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; | ||
218 | incr = align >> PAGE_SHIFT ? : 1; | 225 | incr = align >> PAGE_SHIFT ? : 1; |
219 | 226 | ||
220 | restart_scan: | 227 | restart_scan: |
@@ -229,7 +236,7 @@ restart_scan: | |||
229 | for (j = i + 1; j < i + areasize; ++j) { | 236 | for (j = i + 1; j < i + areasize; ++j) { |
230 | if (j >= eidx) | 237 | if (j >= eidx) |
231 | goto fail_block; | 238 | goto fail_block; |
232 | if (test_bit (j, bdata->node_bootmem_map)) | 239 | if (test_bit(j, bdata->node_bootmem_map)) |
233 | goto fail_block; | 240 | goto fail_block; |
234 | } | 241 | } |
235 | start = i; | 242 | start = i; |
@@ -245,7 +252,7 @@ restart_scan: | |||
245 | return NULL; | 252 | return NULL; |
246 | 253 | ||
247 | found: | 254 | found: |
248 | bdata->last_success = start << PAGE_SHIFT; | 255 | bdata->last_success = PFN_PHYS(start); |
249 | BUG_ON(start >= eidx); | 256 | BUG_ON(start >= eidx); |
250 | 257 | ||
251 | /* | 258 | /* |
@@ -257,19 +264,21 @@ found: | |||
257 | bdata->last_offset && bdata->last_pos+1 == start) { | 264 | bdata->last_offset && bdata->last_pos+1 == start) { |
258 | offset = ALIGN(bdata->last_offset, align); | 265 | offset = ALIGN(bdata->last_offset, align); |
259 | BUG_ON(offset > PAGE_SIZE); | 266 | BUG_ON(offset > PAGE_SIZE); |
260 | remaining_size = PAGE_SIZE-offset; | 267 | remaining_size = PAGE_SIZE - offset; |
261 | if (size < remaining_size) { | 268 | if (size < remaining_size) { |
262 | areasize = 0; | 269 | areasize = 0; |
263 | /* last_pos unchanged */ | 270 | /* last_pos unchanged */ |
264 | bdata->last_offset = offset+size; | 271 | bdata->last_offset = offset + size; |
265 | ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + | 272 | ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + |
266 | bdata->node_boot_start); | 273 | offset + |
274 | bdata->node_boot_start); | ||
267 | } else { | 275 | } else { |
268 | remaining_size = size - remaining_size; | 276 | remaining_size = size - remaining_size; |
269 | areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; | 277 | areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE; |
270 | ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + | 278 | ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + |
271 | bdata->node_boot_start); | 279 | offset + |
272 | bdata->last_pos = start+areasize-1; | 280 | bdata->node_boot_start); |
281 | bdata->last_pos = start + areasize - 1; | ||
273 | bdata->last_offset = remaining_size; | 282 | bdata->last_offset = remaining_size; |
274 | } | 283 | } |
275 | bdata->last_offset &= ~PAGE_MASK; | 284 | bdata->last_offset &= ~PAGE_MASK; |
@@ -282,7 +291,7 @@ found: | |||
282 | /* | 291 | /* |
283 | * Reserve the area now: | 292 | * Reserve the area now: |
284 | */ | 293 | */ |
285 | for (i = start; i < start+areasize; i++) | 294 | for (i = start; i < start + areasize; i++) |
286 | if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) | 295 | if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) |
287 | BUG(); | 296 | BUG(); |
288 | memset(ret, 0, size); | 297 | memset(ret, 0, size); |
@@ -303,8 +312,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | |||
303 | 312 | ||
304 | count = 0; | 313 | count = 0; |
305 | /* first extant page of the node */ | 314 | /* first extant page of the node */ |
306 | pfn = bdata->node_boot_start >> PAGE_SHIFT; | 315 | pfn = PFN_DOWN(bdata->node_boot_start); |
307 | idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); | 316 | idx = bdata->node_low_pfn - pfn; |
308 | map = bdata->node_bootmem_map; | 317 | map = bdata->node_bootmem_map; |
309 | /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ | 318 | /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ |
310 | if (bdata->node_boot_start == 0 || | 319 | if (bdata->node_boot_start == 0 || |
@@ -333,7 +342,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | |||
333 | } | 342 | } |
334 | } | 343 | } |
335 | } else { | 344 | } else { |
336 | i+=BITS_PER_LONG; | 345 | i += BITS_PER_LONG; |
337 | } | 346 | } |
338 | pfn += BITS_PER_LONG; | 347 | pfn += BITS_PER_LONG; |
339 | } | 348 | } |
@@ -345,9 +354,10 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | |||
345 | */ | 354 | */ |
346 | page = virt_to_page(bdata->node_bootmem_map); | 355 | page = virt_to_page(bdata->node_bootmem_map); |
347 | count = 0; | 356 | count = 0; |
348 | for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { | 357 | idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT; |
349 | count++; | 358 | for (i = 0; i < idx; i++, page++) { |
350 | __free_pages_bootmem(page, 0); | 359 | __free_pages_bootmem(page, 0); |
360 | count++; | ||
351 | } | 361 | } |
352 | total += count; | 362 | total += count; |
353 | bdata->node_bootmem_map = NULL; | 363 | bdata->node_bootmem_map = NULL; |
@@ -355,64 +365,72 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | |||
355 | return total; | 365 | return total; |
356 | } | 366 | } |
357 | 367 | ||
358 | unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) | 368 | unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, |
369 | unsigned long startpfn, unsigned long endpfn) | ||
359 | { | 370 | { |
360 | return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); | 371 | return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); |
361 | } | 372 | } |
362 | 373 | ||
363 | void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) | 374 | void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
375 | unsigned long size) | ||
364 | { | 376 | { |
365 | reserve_bootmem_core(pgdat->bdata, physaddr, size); | 377 | reserve_bootmem_core(pgdat->bdata, physaddr, size); |
366 | } | 378 | } |
367 | 379 | ||
368 | void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) | 380 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
381 | unsigned long size) | ||
369 | { | 382 | { |
370 | free_bootmem_core(pgdat->bdata, physaddr, size); | 383 | free_bootmem_core(pgdat->bdata, physaddr, size); |
371 | } | 384 | } |
372 | 385 | ||
373 | unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) | 386 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) |
374 | { | 387 | { |
375 | return(free_all_bootmem_core(pgdat)); | 388 | return free_all_bootmem_core(pgdat); |
376 | } | 389 | } |
377 | 390 | ||
378 | unsigned long __init init_bootmem (unsigned long start, unsigned long pages) | 391 | unsigned long __init init_bootmem(unsigned long start, unsigned long pages) |
379 | { | 392 | { |
380 | max_low_pfn = pages; | 393 | max_low_pfn = pages; |
381 | min_low_pfn = start; | 394 | min_low_pfn = start; |
382 | return(init_bootmem_core(NODE_DATA(0), start, 0, pages)); | 395 | return init_bootmem_core(NODE_DATA(0), start, 0, pages); |
383 | } | 396 | } |
384 | 397 | ||
385 | #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE | 398 | #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE |
386 | void __init reserve_bootmem (unsigned long addr, unsigned long size) | 399 | void __init reserve_bootmem(unsigned long addr, unsigned long size) |
387 | { | 400 | { |
388 | reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size); | 401 | reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size); |
389 | } | 402 | } |
390 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ | 403 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ |
391 | 404 | ||
392 | void __init free_bootmem (unsigned long addr, unsigned long size) | 405 | void __init free_bootmem(unsigned long addr, unsigned long size) |
393 | { | 406 | { |
394 | free_bootmem_core(NODE_DATA(0)->bdata, addr, size); | 407 | free_bootmem_core(NODE_DATA(0)->bdata, addr, size); |
395 | } | 408 | } |
396 | 409 | ||
397 | unsigned long __init free_all_bootmem (void) | 410 | unsigned long __init free_all_bootmem(void) |
398 | { | 411 | { |
399 | return(free_all_bootmem_core(NODE_DATA(0))); | 412 | return free_all_bootmem_core(NODE_DATA(0)); |
400 | } | 413 | } |
401 | 414 | ||
402 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) | 415 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, |
416 | unsigned long goal) | ||
403 | { | 417 | { |
404 | bootmem_data_t *bdata; | 418 | bootmem_data_t *bdata; |
405 | void *ptr; | 419 | void *ptr; |
406 | 420 | ||
407 | list_for_each_entry(bdata, &bdata_list, list) | 421 | list_for_each_entry(bdata, &bdata_list, list) { |
408 | if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0))) | 422 | ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); |
409 | return(ptr); | 423 | if (ptr) |
424 | return ptr; | ||
425 | } | ||
410 | return NULL; | 426 | return NULL; |
411 | } | 427 | } |
412 | 428 | ||
413 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) | 429 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, |
430 | unsigned long goal) | ||
414 | { | 431 | { |
415 | void *mem = __alloc_bootmem_nopanic(size,align,goal); | 432 | void *mem = __alloc_bootmem_nopanic(size,align,goal); |
433 | |||
416 | if (mem) | 434 | if (mem) |
417 | return mem; | 435 | return mem; |
418 | /* | 436 | /* |
@@ -424,29 +442,34 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned | |||
424 | } | 442 | } |
425 | 443 | ||
426 | 444 | ||
427 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, | 445 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, |
428 | unsigned long goal) | 446 | unsigned long align, unsigned long goal) |
429 | { | 447 | { |
430 | void *ptr; | 448 | void *ptr; |
431 | 449 | ||
432 | ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); | 450 | ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); |
433 | if (ptr) | 451 | if (ptr) |
434 | return (ptr); | 452 | return ptr; |
435 | 453 | ||
436 | return __alloc_bootmem(size, align, goal); | 454 | return __alloc_bootmem(size, align, goal); |
437 | } | 455 | } |
438 | 456 | ||
439 | #define LOW32LIMIT 0xffffffff | 457 | #ifndef ARCH_LOW_ADDRESS_LIMIT |
458 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL | ||
459 | #endif | ||
440 | 460 | ||
441 | void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) | 461 | void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, |
462 | unsigned long goal) | ||
442 | { | 463 | { |
443 | bootmem_data_t *bdata; | 464 | bootmem_data_t *bdata; |
444 | void *ptr; | 465 | void *ptr; |
445 | 466 | ||
446 | list_for_each_entry(bdata, &bdata_list, list) | 467 | list_for_each_entry(bdata, &bdata_list, list) { |
447 | if ((ptr = __alloc_bootmem_core(bdata, size, | 468 | ptr = __alloc_bootmem_core(bdata, size, align, goal, |
448 | align, goal, LOW32LIMIT))) | 469 | ARCH_LOW_ADDRESS_LIMIT); |
449 | return(ptr); | 470 | if (ptr) |
471 | return ptr; | ||
472 | } | ||
450 | 473 | ||
451 | /* | 474 | /* |
452 | * Whoops, we cannot satisfy the allocation request. | 475 | * Whoops, we cannot satisfy the allocation request. |
@@ -459,5 +482,6 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsig | |||
459 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | 482 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, |
460 | unsigned long align, unsigned long goal) | 483 | unsigned long align, unsigned long goal) |
461 | { | 484 | { |
462 | return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT); | 485 | return __alloc_bootmem_core(pgdat->bdata, size, align, goal, |
486 | ARCH_LOW_ADDRESS_LIMIT); | ||
463 | } | 487 | } |
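
Most of the bootmem.c churn above is a mechanical conversion from open-coded PAGE_SHIFT arithmetic to the PFN helpers provided by the newly included <linux/pfn.h>, plus the new get_mapsize() helper for the bitmap size. For orientation, the helpers amount to roughly the following (approximate; see include/linux/pfn.h for the authoritative definitions):

/* Approximate restatement of the <linux/pfn.h> helpers used above. */
#define PFN_ALIGN(x)	(((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)	/* address rounded up to a frame number */
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)			/* address rounded down to a frame number */
#define PFN_PHYS(x)	((x) << PAGE_SHIFT)			/* frame number back to a physical address */

With these, the old "(addr + size - bdata->node_boot_start + PAGE_SIZE-1)/PAGE_SIZE" in reserve_bootmem_core() becomes PFN_UP(addr + size - bdata->node_boot_start) with identical rounding, and the remaining "<< PAGE_SHIFT" / ">> PAGE_SHIFT" pairs become PFN_PHYS()/PFN_DOWN().
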
diff --git a/mm/bounce.c b/mm/bounce.c
new file mode 100644
index 0000000000..e4b62d2a40
--- /dev/null
+++ b/mm/bounce.c
@@ -0,0 +1,302 @@ | |||
1 | /* bounce buffer handling for block devices | ||
2 | * | ||
3 | * - Split from highmem.c | ||
4 | */ | ||
5 | |||
6 | #include <linux/mm.h> | ||
7 | #include <linux/module.h> | ||
8 | #include <linux/swap.h> | ||
9 | #include <linux/bio.h> | ||
10 | #include <linux/pagemap.h> | ||
11 | #include <linux/mempool.h> | ||
12 | #include <linux/blkdev.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/hash.h> | ||
15 | #include <linux/highmem.h> | ||
16 | #include <linux/blktrace_api.h> | ||
17 | #include <asm/tlbflush.h> | ||
18 | |||
19 | #define POOL_SIZE 64 | ||
20 | #define ISA_POOL_SIZE 16 | ||
21 | |||
22 | static mempool_t *page_pool, *isa_page_pool; | ||
23 | |||
24 | #ifdef CONFIG_HIGHMEM | ||
25 | static __init int init_emergency_pool(void) | ||
26 | { | ||
27 | struct sysinfo i; | ||
28 | si_meminfo(&i); | ||
29 | si_swapinfo(&i); | ||
30 | |||
31 | if (!i.totalhigh) | ||
32 | return 0; | ||
33 | |||
34 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); | ||
35 | BUG_ON(!page_pool); | ||
36 | printk("highmem bounce pool size: %d pages\n", POOL_SIZE); | ||
37 | |||
38 | return 0; | ||
39 | } | ||
40 | |||
41 | __initcall(init_emergency_pool); | ||
42 | |||
43 | /* | ||
44 | * highmem version, map in to vec | ||
45 | */ | ||
46 | static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) | ||
47 | { | ||
48 | unsigned long flags; | ||
49 | unsigned char *vto; | ||
50 | |||
51 | local_irq_save(flags); | ||
52 | vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); | ||
53 | memcpy(vto + to->bv_offset, vfrom, to->bv_len); | ||
54 | kunmap_atomic(vto, KM_BOUNCE_READ); | ||
55 | local_irq_restore(flags); | ||
56 | } | ||
57 | |||
58 | #else /* CONFIG_HIGHMEM */ | ||
59 | |||
60 | #define bounce_copy_vec(to, vfrom) \ | ||
61 | memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) | ||
62 | |||
63 | #endif /* CONFIG_HIGHMEM */ | ||
64 | |||
65 | /* | ||
66 | * allocate pages in the DMA region for the ISA pool | ||
67 | */ | ||
68 | static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) | ||
69 | { | ||
70 | return mempool_alloc_pages(gfp_mask | GFP_DMA, data); | ||
71 | } | ||
72 | |||
73 | /* | ||
74 | * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA | ||
75 | * as the max address, so check if the pool has already been created. | ||
76 | */ | ||
77 | int init_emergency_isa_pool(void) | ||
78 | { | ||
79 | if (isa_page_pool) | ||
80 | return 0; | ||
81 | |||
82 | isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, | ||
83 | mempool_free_pages, (void *) 0); | ||
84 | BUG_ON(!isa_page_pool); | ||
85 | |||
86 | printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); | ||
87 | return 0; | ||
88 | } | ||
89 | |||
90 | /* | ||
91 | * Simple bounce buffer support for highmem pages. Depending on the | ||
92 | * queue gfp mask set, *to may or may not be a highmem page. kmap it | ||
93 | * always, it will do the Right Thing | ||
94 | */ | ||
95 | static void copy_to_high_bio_irq(struct bio *to, struct bio *from) | ||
96 | { | ||
97 | unsigned char *vfrom; | ||
98 | struct bio_vec *tovec, *fromvec; | ||
99 | int i; | ||
100 | |||
101 | __bio_for_each_segment(tovec, to, i, 0) { | ||
102 | fromvec = from->bi_io_vec + i; | ||
103 | |||
104 | /* | ||
105 | * not bounced | ||
106 | */ | ||
107 | if (tovec->bv_page == fromvec->bv_page) | ||
108 | continue; | ||
109 | |||
110 | /* | ||
111 | * fromvec->bv_offset and fromvec->bv_len might have been | ||
112 | * modified by the block layer, so use the original copy, | ||
113 | * bounce_copy_vec already uses tovec->bv_len | ||
114 | */ | ||
115 | vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; | ||
116 | |||
117 | flush_dcache_page(tovec->bv_page); | ||
118 | bounce_copy_vec(tovec, vfrom); | ||
119 | } | ||
120 | } | ||
121 | |||
122 | static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) | ||
123 | { | ||
124 | struct bio *bio_orig = bio->bi_private; | ||
125 | struct bio_vec *bvec, *org_vec; | ||
126 | int i; | ||
127 | |||
128 | if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) | ||
129 | set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); | ||
130 | |||
131 | /* | ||
132 | * free up bounce indirect pages used | ||
133 | */ | ||
134 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
135 | org_vec = bio_orig->bi_io_vec + i; | ||
136 | if (bvec->bv_page == org_vec->bv_page) | ||
137 | continue; | ||
138 | |||
139 | dec_zone_page_state(bvec->bv_page, NR_BOUNCE); | ||
140 | mempool_free(bvec->bv_page, pool); | ||
141 | } | ||
142 | |||
143 | bio_endio(bio_orig, bio_orig->bi_size, err); | ||
144 | bio_put(bio); | ||
145 | } | ||
146 | |||
147 | static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err) | ||
148 | { | ||
149 | if (bio->bi_size) | ||
150 | return 1; | ||
151 | |||
152 | bounce_end_io(bio, page_pool, err); | ||
153 | return 0; | ||
154 | } | ||
155 | |||
156 | static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err) | ||
157 | { | ||
158 | if (bio->bi_size) | ||
159 | return 1; | ||
160 | |||
161 | bounce_end_io(bio, isa_page_pool, err); | ||
162 | return 0; | ||
163 | } | ||
164 | |||
165 | static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) | ||
166 | { | ||
167 | struct bio *bio_orig = bio->bi_private; | ||
168 | |||
169 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
170 | copy_to_high_bio_irq(bio_orig, bio); | ||
171 | |||
172 | bounce_end_io(bio, pool, err); | ||
173 | } | ||
174 | |||
175 | static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err) | ||
176 | { | ||
177 | if (bio->bi_size) | ||
178 | return 1; | ||
179 | |||
180 | __bounce_end_io_read(bio, page_pool, err); | ||
181 | return 0; | ||
182 | } | ||
183 | |||
184 | static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err) | ||
185 | { | ||
186 | if (bio->bi_size) | ||
187 | return 1; | ||
188 | |||
189 | __bounce_end_io_read(bio, isa_page_pool, err); | ||
190 | return 0; | ||
191 | } | ||
192 | |||
193 | static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, | ||
194 | mempool_t *pool) | ||
195 | { | ||
196 | struct page *page; | ||
197 | struct bio *bio = NULL; | ||
198 | int i, rw = bio_data_dir(*bio_orig); | ||
199 | struct bio_vec *to, *from; | ||
200 | |||
201 | bio_for_each_segment(from, *bio_orig, i) { | ||
202 | page = from->bv_page; | ||
203 | |||
204 | /* | ||
205 | * is destination page below bounce pfn? | ||
206 | */ | ||
207 | if (page_to_pfn(page) < q->bounce_pfn) | ||
208 | continue; | ||
209 | |||
210 | /* | ||
211 | * irk, bounce it | ||
212 | */ | ||
213 | if (!bio) | ||
214 | bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt); | ||
215 | |||
216 | to = bio->bi_io_vec + i; | ||
217 | |||
218 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); | ||
219 | to->bv_len = from->bv_len; | ||
220 | to->bv_offset = from->bv_offset; | ||
221 | inc_zone_page_state(to->bv_page, NR_BOUNCE); | ||
222 | |||
223 | if (rw == WRITE) { | ||
224 | char *vto, *vfrom; | ||
225 | |||
226 | flush_dcache_page(from->bv_page); | ||
227 | vto = page_address(to->bv_page) + to->bv_offset; | ||
228 | vfrom = kmap(from->bv_page) + from->bv_offset; | ||
229 | memcpy(vto, vfrom, to->bv_len); | ||
230 | kunmap(from->bv_page); | ||
231 | } | ||
232 | } | ||
233 | |||
234 | /* | ||
235 | * no pages bounced | ||
236 | */ | ||
237 | if (!bio) | ||
238 | return; | ||
239 | |||
240 | /* | ||
241 | * at least one page was bounced, fill in possible non-highmem | ||
242 | * pages | ||
243 | */ | ||
244 | __bio_for_each_segment(from, *bio_orig, i, 0) { | ||
245 | to = bio_iovec_idx(bio, i); | ||
246 | if (!to->bv_page) { | ||
247 | to->bv_page = from->bv_page; | ||
248 | to->bv_len = from->bv_len; | ||
249 | to->bv_offset = from->bv_offset; | ||
250 | } | ||
251 | } | ||
252 | |||
253 | bio->bi_bdev = (*bio_orig)->bi_bdev; | ||
254 | bio->bi_flags |= (1 << BIO_BOUNCED); | ||
255 | bio->bi_sector = (*bio_orig)->bi_sector; | ||
256 | bio->bi_rw = (*bio_orig)->bi_rw; | ||
257 | |||
258 | bio->bi_vcnt = (*bio_orig)->bi_vcnt; | ||
259 | bio->bi_idx = (*bio_orig)->bi_idx; | ||
260 | bio->bi_size = (*bio_orig)->bi_size; | ||
261 | |||
262 | if (pool == page_pool) { | ||
263 | bio->bi_end_io = bounce_end_io_write; | ||
264 | if (rw == READ) | ||
265 | bio->bi_end_io = bounce_end_io_read; | ||
266 | } else { | ||
267 | bio->bi_end_io = bounce_end_io_write_isa; | ||
268 | if (rw == READ) | ||
269 | bio->bi_end_io = bounce_end_io_read_isa; | ||
270 | } | ||
271 | |||
272 | bio->bi_private = *bio_orig; | ||
273 | *bio_orig = bio; | ||
274 | } | ||
275 | |||
276 | void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) | ||
277 | { | ||
278 | mempool_t *pool; | ||
279 | |||
280 | /* | ||
281 | * for non-isa bounce case, just check if the bounce pfn is equal | ||
282 | * to or bigger than the highest pfn in the system -- in that case, | ||
283 | * don't waste time iterating over bio segments | ||
284 | */ | ||
285 | if (!(q->bounce_gfp & GFP_DMA)) { | ||
286 | if (q->bounce_pfn >= blk_max_pfn) | ||
287 | return; | ||
288 | pool = page_pool; | ||
289 | } else { | ||
290 | BUG_ON(!isa_page_pool); | ||
291 | pool = isa_page_pool; | ||
292 | } | ||
293 | |||
294 | blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); | ||
295 | |||
296 | /* | ||
297 | * slow path | ||
298 | */ | ||
299 | __blk_queue_bounce(q, bio_orig, pool); | ||
300 | } | ||
301 | |||
302 | EXPORT_SYMBOL(blk_queue_bounce); | ||
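
blk_queue_bounce() above only does real work when a bio carries pages at or above the queue's bounce_pfn, and that threshold is declared by the device driver rather than by mm/bounce.c itself. As a hypothetical illustration (the exampledrv_* hook is invented; blk_queue_bounce_limit() and the BLK_BOUNCE_* constants are assumed from the existing block layer), a driver whose hardware cannot reach highmem would configure its queue roughly like this:

#include <linux/blkdev.h>

/* Invented driver hook, shown for illustration only. */
static void exampledrv_init_queue(request_queue_t *q)
{
	/*
	 * Pages above the low-memory boundary are then copied into pages
	 * from the page_pool defined above before the bio is issued; the
	 * block layer calls blk_queue_bounce() for each bio it queues.
	 */
	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);

	/*
	 * A device restricted to the ISA 16MB window would pass
	 * BLK_BOUNCE_ISA instead, which adds GFP_DMA to q->bounce_gfp and
	 * so selects the isa_page_pool path in blk_queue_bounce().
	 */
}
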
diff --git a/mm/filemap.c b/mm/filemap.c
index b9a60c43b6..ec46923598 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -488,6 +488,12 @@ struct page *page_cache_alloc_cold(struct address_space *x) | |||
488 | EXPORT_SYMBOL(page_cache_alloc_cold); | 488 | EXPORT_SYMBOL(page_cache_alloc_cold); |
489 | #endif | 489 | #endif |
490 | 490 | ||
491 | static int __sleep_on_page_lock(void *word) | ||
492 | { | ||
493 | io_schedule(); | ||
494 | return 0; | ||
495 | } | ||
496 | |||
491 | /* | 497 | /* |
492 | * In order to wait for pages to become available there must be | 498 | * In order to wait for pages to become available there must be |
493 | * waitqueues associated with pages. By using a hash table of | 499 | * waitqueues associated with pages. By using a hash table of |
@@ -577,13 +583,24 @@ void fastcall __lock_page(struct page *page) | |||
577 | } | 583 | } |
578 | EXPORT_SYMBOL(__lock_page); | 584 | EXPORT_SYMBOL(__lock_page); |
579 | 585 | ||
586 | /* | ||
587 | * Variant of lock_page that does not require the caller to hold a reference | ||
588 | * on the page's mapping. | ||
589 | */ | ||
590 | void fastcall __lock_page_nosync(struct page *page) | ||
591 | { | ||
592 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); | ||
593 | __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, | ||
594 | TASK_UNINTERRUPTIBLE); | ||
595 | } | ||
596 | |||
580 | /** | 597 | /** |
581 | * find_get_page - find and get a page reference | 598 | * find_get_page - find and get a page reference |
582 | * @mapping: the address_space to search | 599 | * @mapping: the address_space to search |
583 | * @offset: the page index | 600 | * @offset: the page index |
584 | * | 601 | * |
585 | * A rather lightweight function, finding and getting a reference to a | 602 | * Is there a pagecache struct page at the given (mapping, offset) tuple? |
586 | * hashed page atomically. | 603 | * If yes, increment its refcount and return it; if no, return NULL. |
587 | */ | 604 | */ |
588 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) | 605 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) |
589 | { | 606 | { |
@@ -970,7 +987,7 @@ page_not_up_to_date: | |||
970 | /* Get exclusive access to the page ... */ | 987 | /* Get exclusive access to the page ... */ |
971 | lock_page(page); | 988 | lock_page(page); |
972 | 989 | ||
973 | /* Did it get unhashed before we got the lock? */ | 990 | /* Did it get truncated before we got the lock? */ |
974 | if (!page->mapping) { | 991 | if (!page->mapping) { |
975 | unlock_page(page); | 992 | unlock_page(page); |
976 | page_cache_release(page); | 993 | page_cache_release(page); |
@@ -1132,13 +1149,14 @@ success: | |||
1132 | * that can use the page cache directly. | 1149 | * that can use the page cache directly. |
1133 | */ | 1150 | */ |
1134 | ssize_t | 1151 | ssize_t |
1135 | __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | 1152 | generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, |
1136 | unsigned long nr_segs, loff_t *ppos) | 1153 | unsigned long nr_segs, loff_t pos) |
1137 | { | 1154 | { |
1138 | struct file *filp = iocb->ki_filp; | 1155 | struct file *filp = iocb->ki_filp; |
1139 | ssize_t retval; | 1156 | ssize_t retval; |
1140 | unsigned long seg; | 1157 | unsigned long seg; |
1141 | size_t count; | 1158 | size_t count; |
1159 | loff_t *ppos = &iocb->ki_pos; | ||
1142 | 1160 | ||
1143 | count = 0; | 1161 | count = 0; |
1144 | for (seg = 0; seg < nr_segs; seg++) { | 1162 | for (seg = 0; seg < nr_segs; seg++) { |
@@ -1162,7 +1180,7 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1162 | 1180 | ||
1163 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 1181 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ |
1164 | if (filp->f_flags & O_DIRECT) { | 1182 | if (filp->f_flags & O_DIRECT) { |
1165 | loff_t pos = *ppos, size; | 1183 | loff_t size; |
1166 | struct address_space *mapping; | 1184 | struct address_space *mapping; |
1167 | struct inode *inode; | 1185 | struct inode *inode; |
1168 | 1186 | ||
@@ -1206,33 +1224,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1206 | out: | 1224 | out: |
1207 | return retval; | 1225 | return retval; |
1208 | } | 1226 | } |
1209 | EXPORT_SYMBOL(__generic_file_aio_read); | ||
1210 | |||
1211 | ssize_t | ||
1212 | generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) | ||
1213 | { | ||
1214 | struct iovec local_iov = { .iov_base = buf, .iov_len = count }; | ||
1215 | |||
1216 | BUG_ON(iocb->ki_pos != pos); | ||
1217 | return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); | ||
1218 | } | ||
1219 | EXPORT_SYMBOL(generic_file_aio_read); | 1227 | EXPORT_SYMBOL(generic_file_aio_read); |
1220 | 1228 | ||
1221 | ssize_t | ||
1222 | generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) | ||
1223 | { | ||
1224 | struct iovec local_iov = { .iov_base = buf, .iov_len = count }; | ||
1225 | struct kiocb kiocb; | ||
1226 | ssize_t ret; | ||
1227 | |||
1228 | init_sync_kiocb(&kiocb, filp); | ||
1229 | ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); | ||
1230 | if (-EIOCBQUEUED == ret) | ||
1231 | ret = wait_on_sync_kiocb(&kiocb); | ||
1232 | return ret; | ||
1233 | } | ||
1234 | EXPORT_SYMBOL(generic_file_read); | ||
1235 | |||
1236 | int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) | 1229 | int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) |
1237 | { | 1230 | { |
1238 | ssize_t written; | 1231 | ssize_t written; |
@@ -1454,7 +1447,7 @@ outside_data_content: | |||
1454 | * accessible.. | 1447 | * accessible.. |
1455 | */ | 1448 | */ |
1456 | if (area->vm_mm == current->mm) | 1449 | if (area->vm_mm == current->mm) |
1457 | return NULL; | 1450 | return NOPAGE_SIGBUS; |
1458 | /* Fall through to the non-read-ahead case */ | 1451 | /* Fall through to the non-read-ahead case */ |
1459 | no_cached_page: | 1452 | no_cached_page: |
1460 | /* | 1453 | /* |
@@ -1479,7 +1472,7 @@ no_cached_page: | |||
1479 | */ | 1472 | */ |
1480 | if (error == -ENOMEM) | 1473 | if (error == -ENOMEM) |
1481 | return NOPAGE_OOM; | 1474 | return NOPAGE_OOM; |
1482 | return NULL; | 1475 | return NOPAGE_SIGBUS; |
1483 | 1476 | ||
1484 | page_not_uptodate: | 1477 | page_not_uptodate: |
1485 | if (!did_readaround) { | 1478 | if (!did_readaround) { |
@@ -1548,7 +1541,7 @@ page_not_uptodate: | |||
1548 | */ | 1541 | */ |
1549 | shrink_readahead_size_eio(file, ra); | 1542 | shrink_readahead_size_eio(file, ra); |
1550 | page_cache_release(page); | 1543 | page_cache_release(page); |
1551 | return NULL; | 1544 | return NOPAGE_SIGBUS; |
1552 | } | 1545 | } |
1553 | EXPORT_SYMBOL(filemap_nopage); | 1546 | EXPORT_SYMBOL(filemap_nopage); |
1554 | 1547 | ||
@@ -1610,7 +1603,7 @@ no_cached_page: | |||
1610 | page_not_uptodate: | 1603 | page_not_uptodate: |
1611 | lock_page(page); | 1604 | lock_page(page); |
1612 | 1605 | ||
1613 | /* Did it get unhashed while we waited for it? */ | 1606 | /* Did it get truncated while we waited for it? */ |
1614 | if (!page->mapping) { | 1607 | if (!page->mapping) { |
1615 | unlock_page(page); | 1608 | unlock_page(page); |
1616 | goto err; | 1609 | goto err; |
@@ -2003,6 +1996,7 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i | |||
2003 | if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) | 1996 | if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) |
2004 | *count = inode->i_sb->s_maxbytes - *pos; | 1997 | *count = inode->i_sb->s_maxbytes - *pos; |
2005 | } else { | 1998 | } else { |
1999 | #ifdef CONFIG_BLOCK | ||
2006 | loff_t isize; | 2000 | loff_t isize; |
2007 | if (bdev_read_only(I_BDEV(inode))) | 2001 | if (bdev_read_only(I_BDEV(inode))) |
2008 | return -EPERM; | 2002 | return -EPERM; |
@@ -2014,6 +2008,9 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i | |||
2014 | 2008 | ||
2015 | if (*pos + *count > isize) | 2009 | if (*pos + *count > isize) |
2016 | *count = isize - *pos; | 2010 | *count = isize - *pos; |
2011 | #else | ||
2012 | return -EPERM; | ||
2013 | #endif | ||
2017 | } | 2014 | } |
2018 | return 0; | 2015 | return 0; |
2019 | } | 2016 | } |
@@ -2294,22 +2291,22 @@ out: | |||
2294 | current->backing_dev_info = NULL; | 2291 | current->backing_dev_info = NULL; |
2295 | return written ? written : err; | 2292 | return written ? written : err; |
2296 | } | 2293 | } |
2297 | EXPORT_SYMBOL(generic_file_aio_write_nolock); | ||
2298 | 2294 | ||
2299 | ssize_t | 2295 | ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, |
2300 | generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | 2296 | const struct iovec *iov, unsigned long nr_segs, loff_t pos) |
2301 | unsigned long nr_segs, loff_t *ppos) | ||
2302 | { | 2297 | { |
2303 | struct file *file = iocb->ki_filp; | 2298 | struct file *file = iocb->ki_filp; |
2304 | struct address_space *mapping = file->f_mapping; | 2299 | struct address_space *mapping = file->f_mapping; |
2305 | struct inode *inode = mapping->host; | 2300 | struct inode *inode = mapping->host; |
2306 | ssize_t ret; | 2301 | ssize_t ret; |
2307 | loff_t pos = *ppos; | ||
2308 | 2302 | ||
2309 | ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos); | 2303 | BUG_ON(iocb->ki_pos != pos); |
2304 | |||
2305 | ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, | ||
2306 | &iocb->ki_pos); | ||
2310 | 2307 | ||
2311 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2308 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
2312 | int err; | 2309 | ssize_t err; |
2313 | 2310 | ||
2314 | err = sync_page_range_nolock(inode, mapping, pos, ret); | 2311 | err = sync_page_range_nolock(inode, mapping, pos, ret); |
2315 | if (err < 0) | 2312 | if (err < 0) |
@@ -2317,51 +2314,21 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2317 | } | 2314 | } |
2318 | return ret; | 2315 | return ret; |
2319 | } | 2316 | } |
2317 | EXPORT_SYMBOL(generic_file_aio_write_nolock); | ||
2320 | 2318 | ||
2321 | static ssize_t | 2319 | ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, |
2322 | __generic_file_write_nolock(struct file *file, const struct iovec *iov, | 2320 | unsigned long nr_segs, loff_t pos) |
2323 | unsigned long nr_segs, loff_t *ppos) | ||
2324 | { | ||
2325 | struct kiocb kiocb; | ||
2326 | ssize_t ret; | ||
2327 | |||
2328 | init_sync_kiocb(&kiocb, file); | ||
2329 | ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); | ||
2330 | if (ret == -EIOCBQUEUED) | ||
2331 | ret = wait_on_sync_kiocb(&kiocb); | ||
2332 | return ret; | ||
2333 | } | ||
2334 | |||
2335 | ssize_t | ||
2336 | generic_file_write_nolock(struct file *file, const struct iovec *iov, | ||
2337 | unsigned long nr_segs, loff_t *ppos) | ||
2338 | { | ||
2339 | struct kiocb kiocb; | ||
2340 | ssize_t ret; | ||
2341 | |||
2342 | init_sync_kiocb(&kiocb, file); | ||
2343 | ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); | ||
2344 | if (-EIOCBQUEUED == ret) | ||
2345 | ret = wait_on_sync_kiocb(&kiocb); | ||
2346 | return ret; | ||
2347 | } | ||
2348 | EXPORT_SYMBOL(generic_file_write_nolock); | ||
2349 | |||
2350 | ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, | ||
2351 | size_t count, loff_t pos) | ||
2352 | { | 2321 | { |
2353 | struct file *file = iocb->ki_filp; | 2322 | struct file *file = iocb->ki_filp; |
2354 | struct address_space *mapping = file->f_mapping; | 2323 | struct address_space *mapping = file->f_mapping; |
2355 | struct inode *inode = mapping->host; | 2324 | struct inode *inode = mapping->host; |
2356 | ssize_t ret; | 2325 | ssize_t ret; |
2357 | struct iovec local_iov = { .iov_base = (void __user *)buf, | ||
2358 | .iov_len = count }; | ||
2359 | 2326 | ||
2360 | BUG_ON(iocb->ki_pos != pos); | 2327 | BUG_ON(iocb->ki_pos != pos); |
2361 | 2328 | ||
2362 | mutex_lock(&inode->i_mutex); | 2329 | mutex_lock(&inode->i_mutex); |
2363 | ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, | 2330 | ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, |
2364 | &iocb->ki_pos); | 2331 | &iocb->ki_pos); |
2365 | mutex_unlock(&inode->i_mutex); | 2332 | mutex_unlock(&inode->i_mutex); |
2366 | 2333 | ||
2367 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2334 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
@@ -2375,66 +2342,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, | |||
2375 | } | 2342 | } |
2376 | EXPORT_SYMBOL(generic_file_aio_write); | 2343 | EXPORT_SYMBOL(generic_file_aio_write); |
2377 | 2344 | ||
2378 | ssize_t generic_file_write(struct file *file, const char __user *buf, | ||
2379 | size_t count, loff_t *ppos) | ||
2380 | { | ||
2381 | struct address_space *mapping = file->f_mapping; | ||
2382 | struct inode *inode = mapping->host; | ||
2383 | ssize_t ret; | ||
2384 | struct iovec local_iov = { .iov_base = (void __user *)buf, | ||
2385 | .iov_len = count }; | ||
2386 | |||
2387 | mutex_lock(&inode->i_mutex); | ||
2388 | ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); | ||
2389 | mutex_unlock(&inode->i_mutex); | ||
2390 | |||
2391 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
2392 | ssize_t err; | ||
2393 | |||
2394 | err = sync_page_range(inode, mapping, *ppos - ret, ret); | ||
2395 | if (err < 0) | ||
2396 | ret = err; | ||
2397 | } | ||
2398 | return ret; | ||
2399 | } | ||
2400 | EXPORT_SYMBOL(generic_file_write); | ||
2401 | |||
2402 | ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, | ||
2403 | unsigned long nr_segs, loff_t *ppos) | ||
2404 | { | ||
2405 | struct kiocb kiocb; | ||
2406 | ssize_t ret; | ||
2407 | |||
2408 | init_sync_kiocb(&kiocb, filp); | ||
2409 | ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos); | ||
2410 | if (-EIOCBQUEUED == ret) | ||
2411 | ret = wait_on_sync_kiocb(&kiocb); | ||
2412 | return ret; | ||
2413 | } | ||
2414 | EXPORT_SYMBOL(generic_file_readv); | ||
2415 | |||
2416 | ssize_t generic_file_writev(struct file *file, const struct iovec *iov, | ||
2417 | unsigned long nr_segs, loff_t *ppos) | ||
2418 | { | ||
2419 | struct address_space *mapping = file->f_mapping; | ||
2420 | struct inode *inode = mapping->host; | ||
2421 | ssize_t ret; | ||
2422 | |||
2423 | mutex_lock(&inode->i_mutex); | ||
2424 | ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); | ||
2425 | mutex_unlock(&inode->i_mutex); | ||
2426 | |||
2427 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
2428 | int err; | ||
2429 | |||
2430 | err = sync_page_range(inode, mapping, *ppos - ret, ret); | ||
2431 | if (err < 0) | ||
2432 | ret = err; | ||
2433 | } | ||
2434 | return ret; | ||
2435 | } | ||
2436 | EXPORT_SYMBOL(generic_file_writev); | ||
2437 | |||
2438 | /* | 2345 | /* |
2439 | * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something | 2346 | * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something |
2440 | * went wrong during pagecache shootdown. | 2347 | * went wrong during pagecache shootdown. |
@@ -2474,3 +2381,33 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
2474 | } | 2381 | } |
2475 | return retval; | 2382 | return retval; |
2476 | } | 2383 | } |
2384 | |||
2385 | /** | ||
2386 | * try_to_release_page() - release old fs-specific metadata on a page | ||
2387 | * | ||
2388 | * @page: the page which the kernel is trying to free | ||
2389 | * @gfp_mask: memory allocation flags (and I/O mode) | ||
2390 | * | ||
2391 | * The address_space is asked to try to release any data against the page | ||
2392 | * (presumably at page->private). If the release was successful, return `1'. | ||
2393 | * Otherwise return zero. | ||
2394 | * | ||
2395 | * The @gfp_mask argument specifies whether I/O may be performed to release | ||
2396 | * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). | ||
2397 | * | ||
2398 | * NOTE: @gfp_mask may go away, and this function may become non-blocking. | ||
2399 | */ | ||
2400 | int try_to_release_page(struct page *page, gfp_t gfp_mask) | ||
2401 | { | ||
2402 | struct address_space * const mapping = page->mapping; | ||
2403 | |||
2404 | BUG_ON(!PageLocked(page)); | ||
2405 | if (PageWriteback(page)) | ||
2406 | return 0; | ||
2407 | |||
2408 | if (mapping && mapping->a_ops->releasepage) | ||
2409 | return mapping->a_ops->releasepage(page, gfp_mask); | ||
2410 | return try_to_free_buffers(page); | ||
2411 | } | ||
2412 | |||
2413 | EXPORT_SYMBOL(try_to_release_page); | ||
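As a rough illustration of the try_to_release_page() interface documented above, here is a minimal, hypothetical caller; the page locking around the call is an assumption of the sketch (the function itself only requires PageLocked), not something this patch adds.

    #include <linux/mm.h>
    #include <linux/pagemap.h>

    /*
     * Illustrative sketch: ask the owning filesystem (or the buffer layer)
     * to drop page->private data from a page we want to free or migrate.
     * Returns 1 if the private data is gone, 0 if the page must be kept.
     */
    static int example_strip_page_private(struct page *page)
    {
            int released = 1;

            lock_page(page);        /* try_to_release_page() insists on PageLocked */
            if (PagePrivate(page))
                    released = try_to_release_page(page, GFP_KERNEL);
            unlock_page(page);

            return released;
    }

GFP_KERNEL carries both __GFP_IO and __GFP_WAIT, so this caller permits the release path to block and perform I/O, matching the gfp_mask semantics described in the kerneldoc above.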
diff --git a/mm/fremap.c b/mm/fremap.c index 21b7d0cbc9..7a9d0f5d24 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -39,7 +39,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
39 | } else { | 39 | } else { |
40 | if (!pte_file(pte)) | 40 | if (!pte_file(pte)) |
41 | free_swap_and_cache(pte_to_swp_entry(pte)); | 41 | free_swap_and_cache(pte_to_swp_entry(pte)); |
42 | pte_clear(mm, addr, ptep); | 42 | pte_clear_not_present_full(mm, addr, ptep, 0); |
43 | } | 43 | } |
44 | return !!page; | 44 | return !!page; |
45 | } | 45 | } |
@@ -79,9 +79,9 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
79 | inc_mm_counter(mm, file_rss); | 79 | inc_mm_counter(mm, file_rss); |
80 | 80 | ||
81 | flush_icache_page(vma, page); | 81 | flush_icache_page(vma, page); |
82 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 82 | pte_val = mk_pte(page, prot); |
83 | set_pte_at(mm, addr, pte, pte_val); | ||
83 | page_add_file_rmap(page); | 84 | page_add_file_rmap(page); |
84 | pte_val = *pte; | ||
85 | update_mmu_cache(vma, addr, pte_val); | 85 | update_mmu_cache(vma, addr, pte_val); |
86 | lazy_mmu_prot_update(pte_val); | 86 | lazy_mmu_prot_update(pte_val); |
87 | err = 0; | 87 | err = 0; |
diff --git a/mm/highmem.c b/mm/highmem.c index 9b2a5403c4..0206e7e501 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -29,13 +29,6 @@ | |||
29 | #include <linux/blktrace_api.h> | 29 | #include <linux/blktrace_api.h> |
30 | #include <asm/tlbflush.h> | 30 | #include <asm/tlbflush.h> |
31 | 31 | ||
32 | static mempool_t *page_pool, *isa_page_pool; | ||
33 | |||
34 | static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) | ||
35 | { | ||
36 | return mempool_alloc_pages(gfp_mask | GFP_DMA, data); | ||
37 | } | ||
38 | |||
39 | /* | 32 | /* |
40 | * Virtual_count is not a pure "count". | 33 | * Virtual_count is not a pure "count". |
41 | * 0 means that it is not mapped, and has not been mapped | 34 | * 0 means that it is not mapped, and has not been mapped |
@@ -46,6 +39,19 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) | |||
46 | */ | 39 | */ |
47 | #ifdef CONFIG_HIGHMEM | 40 | #ifdef CONFIG_HIGHMEM |
48 | 41 | ||
42 | unsigned long totalhigh_pages __read_mostly; | ||
43 | |||
44 | unsigned int nr_free_highpages (void) | ||
45 | { | ||
46 | pg_data_t *pgdat; | ||
47 | unsigned int pages = 0; | ||
48 | |||
49 | for_each_online_pgdat(pgdat) | ||
50 | pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; | ||
51 | |||
52 | return pages; | ||
53 | } | ||
54 | |||
49 | static int pkmap_count[LAST_PKMAP]; | 55 | static int pkmap_count[LAST_PKMAP]; |
50 | static unsigned int last_pkmap_nr; | 56 | static unsigned int last_pkmap_nr; |
51 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); | 57 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); |
@@ -204,282 +210,8 @@ void fastcall kunmap_high(struct page *page) | |||
204 | } | 210 | } |
205 | 211 | ||
206 | EXPORT_SYMBOL(kunmap_high); | 212 | EXPORT_SYMBOL(kunmap_high); |
207 | |||
208 | #define POOL_SIZE 64 | ||
209 | |||
210 | static __init int init_emergency_pool(void) | ||
211 | { | ||
212 | struct sysinfo i; | ||
213 | si_meminfo(&i); | ||
214 | si_swapinfo(&i); | ||
215 | |||
216 | if (!i.totalhigh) | ||
217 | return 0; | ||
218 | |||
219 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); | ||
220 | BUG_ON(!page_pool); | ||
221 | printk("highmem bounce pool size: %d pages\n", POOL_SIZE); | ||
222 | |||
223 | return 0; | ||
224 | } | ||
225 | |||
226 | __initcall(init_emergency_pool); | ||
227 | |||
228 | /* | ||
229 | * highmem version, map in to vec | ||
230 | */ | ||
231 | static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) | ||
232 | { | ||
233 | unsigned long flags; | ||
234 | unsigned char *vto; | ||
235 | |||
236 | local_irq_save(flags); | ||
237 | vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); | ||
238 | memcpy(vto + to->bv_offset, vfrom, to->bv_len); | ||
239 | kunmap_atomic(vto, KM_BOUNCE_READ); | ||
240 | local_irq_restore(flags); | ||
241 | } | ||
242 | |||
243 | #else /* CONFIG_HIGHMEM */ | ||
244 | |||
245 | #define bounce_copy_vec(to, vfrom) \ | ||
246 | memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) | ||
247 | |||
248 | #endif | 213 | #endif |
249 | 214 | ||
250 | #define ISA_POOL_SIZE 16 | ||
251 | |||
252 | /* | ||
253 | * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA | ||
254 | * as the max address, so check if the pool has already been created. | ||
255 | */ | ||
256 | int init_emergency_isa_pool(void) | ||
257 | { | ||
258 | if (isa_page_pool) | ||
259 | return 0; | ||
260 | |||
261 | isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, | ||
262 | mempool_free_pages, (void *) 0); | ||
263 | BUG_ON(!isa_page_pool); | ||
264 | |||
265 | printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); | ||
266 | return 0; | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * Simple bounce buffer support for highmem pages. Depending on the | ||
271 | * queue gfp mask set, *to may or may not be a highmem page. kmap it | ||
272 | * always, it will do the Right Thing | ||
273 | */ | ||
274 | static void copy_to_high_bio_irq(struct bio *to, struct bio *from) | ||
275 | { | ||
276 | unsigned char *vfrom; | ||
277 | struct bio_vec *tovec, *fromvec; | ||
278 | int i; | ||
279 | |||
280 | __bio_for_each_segment(tovec, to, i, 0) { | ||
281 | fromvec = from->bi_io_vec + i; | ||
282 | |||
283 | /* | ||
284 | * not bounced | ||
285 | */ | ||
286 | if (tovec->bv_page == fromvec->bv_page) | ||
287 | continue; | ||
288 | |||
289 | /* | ||
290 | * fromvec->bv_offset and fromvec->bv_len might have been | ||
291 | * modified by the block layer, so use the original copy, | ||
292 | * bounce_copy_vec already uses tovec->bv_len | ||
293 | */ | ||
294 | vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; | ||
295 | |||
296 | flush_dcache_page(tovec->bv_page); | ||
297 | bounce_copy_vec(tovec, vfrom); | ||
298 | } | ||
299 | } | ||
300 | |||
301 | static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) | ||
302 | { | ||
303 | struct bio *bio_orig = bio->bi_private; | ||
304 | struct bio_vec *bvec, *org_vec; | ||
305 | int i; | ||
306 | |||
307 | if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) | ||
308 | set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); | ||
309 | |||
310 | /* | ||
311 | * free up bounce indirect pages used | ||
312 | */ | ||
313 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
314 | org_vec = bio_orig->bi_io_vec + i; | ||
315 | if (bvec->bv_page == org_vec->bv_page) | ||
316 | continue; | ||
317 | |||
318 | dec_zone_page_state(bvec->bv_page, NR_BOUNCE); | ||
319 | mempool_free(bvec->bv_page, pool); | ||
320 | } | ||
321 | |||
322 | bio_endio(bio_orig, bio_orig->bi_size, err); | ||
323 | bio_put(bio); | ||
324 | } | ||
325 | |||
326 | static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err) | ||
327 | { | ||
328 | if (bio->bi_size) | ||
329 | return 1; | ||
330 | |||
331 | bounce_end_io(bio, page_pool, err); | ||
332 | return 0; | ||
333 | } | ||
334 | |||
335 | static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err) | ||
336 | { | ||
337 | if (bio->bi_size) | ||
338 | return 1; | ||
339 | |||
340 | bounce_end_io(bio, isa_page_pool, err); | ||
341 | return 0; | ||
342 | } | ||
343 | |||
344 | static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) | ||
345 | { | ||
346 | struct bio *bio_orig = bio->bi_private; | ||
347 | |||
348 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
349 | copy_to_high_bio_irq(bio_orig, bio); | ||
350 | |||
351 | bounce_end_io(bio, pool, err); | ||
352 | } | ||
353 | |||
354 | static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err) | ||
355 | { | ||
356 | if (bio->bi_size) | ||
357 | return 1; | ||
358 | |||
359 | __bounce_end_io_read(bio, page_pool, err); | ||
360 | return 0; | ||
361 | } | ||
362 | |||
363 | static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err) | ||
364 | { | ||
365 | if (bio->bi_size) | ||
366 | return 1; | ||
367 | |||
368 | __bounce_end_io_read(bio, isa_page_pool, err); | ||
369 | return 0; | ||
370 | } | ||
371 | |||
372 | static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, | ||
373 | mempool_t *pool) | ||
374 | { | ||
375 | struct page *page; | ||
376 | struct bio *bio = NULL; | ||
377 | int i, rw = bio_data_dir(*bio_orig); | ||
378 | struct bio_vec *to, *from; | ||
379 | |||
380 | bio_for_each_segment(from, *bio_orig, i) { | ||
381 | page = from->bv_page; | ||
382 | |||
383 | /* | ||
384 | * is destination page below bounce pfn? | ||
385 | */ | ||
386 | if (page_to_pfn(page) < q->bounce_pfn) | ||
387 | continue; | ||
388 | |||
389 | /* | ||
390 | * irk, bounce it | ||
391 | */ | ||
392 | if (!bio) | ||
393 | bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt); | ||
394 | |||
395 | to = bio->bi_io_vec + i; | ||
396 | |||
397 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); | ||
398 | to->bv_len = from->bv_len; | ||
399 | to->bv_offset = from->bv_offset; | ||
400 | inc_zone_page_state(to->bv_page, NR_BOUNCE); | ||
401 | |||
402 | if (rw == WRITE) { | ||
403 | char *vto, *vfrom; | ||
404 | |||
405 | flush_dcache_page(from->bv_page); | ||
406 | vto = page_address(to->bv_page) + to->bv_offset; | ||
407 | vfrom = kmap(from->bv_page) + from->bv_offset; | ||
408 | memcpy(vto, vfrom, to->bv_len); | ||
409 | kunmap(from->bv_page); | ||
410 | } | ||
411 | } | ||
412 | |||
413 | /* | ||
414 | * no pages bounced | ||
415 | */ | ||
416 | if (!bio) | ||
417 | return; | ||
418 | |||
419 | /* | ||
420 | * at least one page was bounced, fill in possible non-highmem | ||
421 | * pages | ||
422 | */ | ||
423 | __bio_for_each_segment(from, *bio_orig, i, 0) { | ||
424 | to = bio_iovec_idx(bio, i); | ||
425 | if (!to->bv_page) { | ||
426 | to->bv_page = from->bv_page; | ||
427 | to->bv_len = from->bv_len; | ||
428 | to->bv_offset = from->bv_offset; | ||
429 | } | ||
430 | } | ||
431 | |||
432 | bio->bi_bdev = (*bio_orig)->bi_bdev; | ||
433 | bio->bi_flags |= (1 << BIO_BOUNCED); | ||
434 | bio->bi_sector = (*bio_orig)->bi_sector; | ||
435 | bio->bi_rw = (*bio_orig)->bi_rw; | ||
436 | |||
437 | bio->bi_vcnt = (*bio_orig)->bi_vcnt; | ||
438 | bio->bi_idx = (*bio_orig)->bi_idx; | ||
439 | bio->bi_size = (*bio_orig)->bi_size; | ||
440 | |||
441 | if (pool == page_pool) { | ||
442 | bio->bi_end_io = bounce_end_io_write; | ||
443 | if (rw == READ) | ||
444 | bio->bi_end_io = bounce_end_io_read; | ||
445 | } else { | ||
446 | bio->bi_end_io = bounce_end_io_write_isa; | ||
447 | if (rw == READ) | ||
448 | bio->bi_end_io = bounce_end_io_read_isa; | ||
449 | } | ||
450 | |||
451 | bio->bi_private = *bio_orig; | ||
452 | *bio_orig = bio; | ||
453 | } | ||
454 | |||
455 | void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) | ||
456 | { | ||
457 | mempool_t *pool; | ||
458 | |||
459 | /* | ||
460 | * for non-isa bounce case, just check if the bounce pfn is equal | ||
461 | * to or bigger than the highest pfn in the system -- in that case, | ||
462 | * don't waste time iterating over bio segments | ||
463 | */ | ||
464 | if (!(q->bounce_gfp & GFP_DMA)) { | ||
465 | if (q->bounce_pfn >= blk_max_pfn) | ||
466 | return; | ||
467 | pool = page_pool; | ||
468 | } else { | ||
469 | BUG_ON(!isa_page_pool); | ||
470 | pool = isa_page_pool; | ||
471 | } | ||
472 | |||
473 | blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); | ||
474 | |||
475 | /* | ||
476 | * slow path | ||
477 | */ | ||
478 | __blk_queue_bounce(q, bio_orig, pool); | ||
479 | } | ||
480 | |||
481 | EXPORT_SYMBOL(blk_queue_bounce); | ||
482 | |||
483 | #if defined(HASHED_PAGE_VIRTUAL) | 215 | #if defined(HASHED_PAGE_VIRTUAL) |
484 | 216 | ||
485 | #define PA_HASH_ORDER 7 | 217 | #define PA_HASH_ORDER 7 |
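The bounce-buffer code removed above is not deleted outright; per the diffstat it moves into the new mm/bounce.c. For readers following the kmap side that stays in highmem.c, here is a small sketch of the same kmap_atomic copy pattern the relocated bounce_copy_vec() uses; the helper name and its arguments are illustrative only.

    #include <linux/highmem.h>
    #include <linux/string.h>
    #include <linux/types.h>

    /*
     * Illustrative only: copy a buffer into a page that may live in highmem,
     * mirroring the kmap_atomic pattern used by the bounce code this patch
     * relocates to mm/bounce.c.
     */
    static void example_copy_to_page(struct page *page, unsigned int offset,
                                     const void *src, size_t len)
    {
            unsigned long flags;
            char *vto;

            local_irq_save(flags);
            vto = kmap_atomic(page, KM_BOUNCE_READ);  /* 2.6-era API takes a KM slot */
            memcpy(vto + offset, src, len);
            kunmap_atomic(vto, KM_BOUNCE_READ);
            local_irq_restore(flags);
    }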
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index df49997325..7c7d03dbf7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -72,7 +72,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
72 | struct zone **z; | 72 | struct zone **z; |
73 | 73 | ||
74 | for (z = zonelist->zones; *z; z++) { | 74 | for (z = zonelist->zones; *z; z++) { |
75 | nid = (*z)->zone_pgdat->node_id; | 75 | nid = zone_to_nid(*z); |
76 | if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && | 76 | if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && |
77 | !list_empty(&hugepage_freelists[nid])) | 77 | !list_empty(&hugepage_freelists[nid])) |
78 | break; | 78 | break; |
@@ -177,7 +177,7 @@ static void update_and_free_page(struct page *page) | |||
177 | { | 177 | { |
178 | int i; | 178 | int i; |
179 | nr_huge_pages--; | 179 | nr_huge_pages--; |
180 | nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--; | 180 | nr_huge_pages_node[page_to_nid(page)]--; |
181 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | 181 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { |
182 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | 182 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | |
183 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | 183 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | |
@@ -191,7 +191,8 @@ static void update_and_free_page(struct page *page) | |||
191 | #ifdef CONFIG_HIGHMEM | 191 | #ifdef CONFIG_HIGHMEM |
192 | static void try_to_free_low(unsigned long count) | 192 | static void try_to_free_low(unsigned long count) |
193 | { | 193 | { |
194 | int i, nid; | 194 | int i; |
195 | |||
195 | for (i = 0; i < MAX_NUMNODES; ++i) { | 196 | for (i = 0; i < MAX_NUMNODES; ++i) { |
196 | struct page *page, *next; | 197 | struct page *page, *next; |
197 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { | 198 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { |
@@ -199,9 +200,8 @@ static void try_to_free_low(unsigned long count) | |||
199 | continue; | 200 | continue; |
200 | list_del(&page->lru); | 201 | list_del(&page->lru); |
201 | update_and_free_page(page); | 202 | update_and_free_page(page); |
202 | nid = page_zone(page)->zone_pgdat->node_id; | ||
203 | free_huge_pages--; | 203 | free_huge_pages--; |
204 | free_huge_pages_node[nid]--; | 204 | free_huge_pages_node[page_to_nid(page)]--; |
205 | if (count >= nr_huge_pages) | 205 | if (count >= nr_huge_pages) |
206 | return; | 206 | return; |
207 | } | 207 | } |
diff --git a/mm/internal.h b/mm/internal.h index d20e3cc4ae..d527b80b29 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -24,8 +24,8 @@ static inline void set_page_count(struct page *page, int v) | |||
24 | */ | 24 | */ |
25 | static inline void set_page_refcounted(struct page *page) | 25 | static inline void set_page_refcounted(struct page *page) |
26 | { | 26 | { |
27 | BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); | 27 | VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); |
28 | BUG_ON(atomic_read(&page->_count)); | 28 | VM_BUG_ON(atomic_read(&page->_count)); |
29 | set_page_count(page, 1); | 29 | set_page_count(page, 1); |
30 | } | 30 | } |
31 | 31 | ||
diff --git a/mm/memory.c b/mm/memory.c index 109e986623..9cf3f341a2 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
50 | #include <linux/delayacct.h> | 50 | #include <linux/delayacct.h> |
51 | #include <linux/init.h> | 51 | #include <linux/init.h> |
52 | #include <linux/writeback.h> | ||
52 | 53 | ||
53 | #include <asm/pgalloc.h> | 54 | #include <asm/pgalloc.h> |
54 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
@@ -466,7 +467,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
466 | */ | 467 | */ |
467 | if (is_cow_mapping(vm_flags)) { | 468 | if (is_cow_mapping(vm_flags)) { |
468 | ptep_set_wrprotect(src_mm, addr, src_pte); | 469 | ptep_set_wrprotect(src_mm, addr, src_pte); |
469 | pte = *src_pte; | 470 | pte = pte_wrprotect(pte); |
470 | } | 471 | } |
471 | 472 | ||
472 | /* | 473 | /* |
@@ -505,6 +506,7 @@ again: | |||
505 | src_pte = pte_offset_map_nested(src_pmd, addr); | 506 | src_pte = pte_offset_map_nested(src_pmd, addr); |
506 | src_ptl = pte_lockptr(src_mm, src_pmd); | 507 | src_ptl = pte_lockptr(src_mm, src_pmd); |
507 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); | 508 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); |
509 | arch_enter_lazy_mmu_mode(); | ||
508 | 510 | ||
509 | do { | 511 | do { |
510 | /* | 512 | /* |
@@ -526,6 +528,7 @@ again: | |||
526 | progress += 8; | 528 | progress += 8; |
527 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); | 529 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); |
528 | 530 | ||
531 | arch_leave_lazy_mmu_mode(); | ||
529 | spin_unlock(src_ptl); | 532 | spin_unlock(src_ptl); |
530 | pte_unmap_nested(src_pte - 1); | 533 | pte_unmap_nested(src_pte - 1); |
531 | add_mm_rss(dst_mm, rss[0], rss[1]); | 534 | add_mm_rss(dst_mm, rss[0], rss[1]); |
@@ -627,6 +630,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
627 | int anon_rss = 0; | 630 | int anon_rss = 0; |
628 | 631 | ||
629 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 632 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
633 | arch_enter_lazy_mmu_mode(); | ||
630 | do { | 634 | do { |
631 | pte_t ptent = *pte; | 635 | pte_t ptent = *pte; |
632 | if (pte_none(ptent)) { | 636 | if (pte_none(ptent)) { |
@@ -689,10 +693,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
689 | continue; | 693 | continue; |
690 | if (!pte_file(ptent)) | 694 | if (!pte_file(ptent)) |
691 | free_swap_and_cache(pte_to_swp_entry(ptent)); | 695 | free_swap_and_cache(pte_to_swp_entry(ptent)); |
692 | pte_clear_full(mm, addr, pte, tlb->fullmm); | 696 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
693 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); | 697 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); |
694 | 698 | ||
695 | add_mm_rss(mm, file_rss, anon_rss); | 699 | add_mm_rss(mm, file_rss, anon_rss); |
700 | arch_leave_lazy_mmu_mode(); | ||
696 | pte_unmap_unlock(pte - 1, ptl); | 701 | pte_unmap_unlock(pte - 1, ptl); |
697 | 702 | ||
698 | return addr; | 703 | return addr; |
@@ -1108,6 +1113,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1108 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | 1113 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); |
1109 | if (!pte) | 1114 | if (!pte) |
1110 | return -ENOMEM; | 1115 | return -ENOMEM; |
1116 | arch_enter_lazy_mmu_mode(); | ||
1111 | do { | 1117 | do { |
1112 | struct page *page = ZERO_PAGE(addr); | 1118 | struct page *page = ZERO_PAGE(addr); |
1113 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); | 1119 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); |
@@ -1117,6 +1123,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1117 | BUG_ON(!pte_none(*pte)); | 1123 | BUG_ON(!pte_none(*pte)); |
1118 | set_pte_at(mm, addr, pte, zero_pte); | 1124 | set_pte_at(mm, addr, pte, zero_pte); |
1119 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1125 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1126 | arch_leave_lazy_mmu_mode(); | ||
1120 | pte_unmap_unlock(pte - 1, ptl); | 1127 | pte_unmap_unlock(pte - 1, ptl); |
1121 | return 0; | 1128 | return 0; |
1122 | } | 1129 | } |
@@ -1226,7 +1233,12 @@ out: | |||
1226 | return retval; | 1233 | return retval; |
1227 | } | 1234 | } |
1228 | 1235 | ||
1229 | /* | 1236 | /** |
1237 | * vm_insert_page - insert single page into user vma | ||
1238 | * @vma: user vma to map to | ||
1239 | * @addr: target user address of this page | ||
1240 | * @page: source kernel page | ||
1241 | * | ||
1230 | * This allows drivers to insert individual pages they've allocated | 1242 | * This allows drivers to insert individual pages they've allocated |
1231 | * into a user vma. | 1243 | * into a user vma. |
1232 | * | 1244 | * |
@@ -1269,11 +1281,13 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1269 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | 1281 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); |
1270 | if (!pte) | 1282 | if (!pte) |
1271 | return -ENOMEM; | 1283 | return -ENOMEM; |
1284 | arch_enter_lazy_mmu_mode(); | ||
1272 | do { | 1285 | do { |
1273 | BUG_ON(!pte_none(*pte)); | 1286 | BUG_ON(!pte_none(*pte)); |
1274 | set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); | 1287 | set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); |
1275 | pfn++; | 1288 | pfn++; |
1276 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1289 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1290 | arch_leave_lazy_mmu_mode(); | ||
1277 | pte_unmap_unlock(pte - 1, ptl); | 1291 | pte_unmap_unlock(pte - 1, ptl); |
1278 | return 0; | 1292 | return 0; |
1279 | } | 1293 | } |
@@ -1318,7 +1332,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, | |||
1318 | return 0; | 1332 | return 0; |
1319 | } | 1333 | } |
1320 | 1334 | ||
1321 | /* Note: this is only safe if the mm semaphore is held when called. */ | 1335 | /** |
1336 | * remap_pfn_range - remap kernel memory to userspace | ||
1337 | * @vma: user vma to map to | ||
1338 | * @addr: target user address to start at | ||
1339 | * @pfn: physical address of kernel memory | ||
1340 | * @size: size of map area | ||
1341 | * @prot: page protection flags for this mapping | ||
1342 | * | ||
1343 | * Note: this is only safe if the mm semaphore is held when called. | ||
1344 | */ | ||
1322 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | 1345 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, |
1323 | unsigned long pfn, unsigned long size, pgprot_t prot) | 1346 | unsigned long pfn, unsigned long size, pgprot_t prot) |
1324 | { | 1347 | { |
@@ -1458,14 +1481,29 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1458 | { | 1481 | { |
1459 | struct page *old_page, *new_page; | 1482 | struct page *old_page, *new_page; |
1460 | pte_t entry; | 1483 | pte_t entry; |
1461 | int reuse, ret = VM_FAULT_MINOR; | 1484 | int reuse = 0, ret = VM_FAULT_MINOR; |
1485 | struct page *dirty_page = NULL; | ||
1462 | 1486 | ||
1463 | old_page = vm_normal_page(vma, address, orig_pte); | 1487 | old_page = vm_normal_page(vma, address, orig_pte); |
1464 | if (!old_page) | 1488 | if (!old_page) |
1465 | goto gotten; | 1489 | goto gotten; |
1466 | 1490 | ||
1467 | if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) == | 1491 | /* |
1468 | (VM_SHARED|VM_WRITE))) { | 1492 | * Take out anonymous pages first, anonymous shared vmas are |
1493 | * not dirty accountable. | ||
1494 | */ | ||
1495 | if (PageAnon(old_page)) { | ||
1496 | if (!TestSetPageLocked(old_page)) { | ||
1497 | reuse = can_share_swap_page(old_page); | ||
1498 | unlock_page(old_page); | ||
1499 | } | ||
1500 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | ||
1501 | (VM_WRITE|VM_SHARED))) { | ||
1502 | /* | ||
1503 | * Only catch write-faults on shared writable pages, | ||
1504 | * read-only shared pages can get COWed by | ||
1505 | * get_user_pages(.write=1, .force=1). | ||
1506 | */ | ||
1469 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 1507 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
1470 | /* | 1508 | /* |
1471 | * Notify the address space that the page is about to | 1509 | * Notify the address space that the page is about to |
@@ -1494,13 +1532,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1494 | if (!pte_same(*page_table, orig_pte)) | 1532 | if (!pte_same(*page_table, orig_pte)) |
1495 | goto unlock; | 1533 | goto unlock; |
1496 | } | 1534 | } |
1497 | 1535 | dirty_page = old_page; | |
1536 | get_page(dirty_page); | ||
1498 | reuse = 1; | 1537 | reuse = 1; |
1499 | } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { | ||
1500 | reuse = can_share_swap_page(old_page); | ||
1501 | unlock_page(old_page); | ||
1502 | } else { | ||
1503 | reuse = 0; | ||
1504 | } | 1538 | } |
1505 | 1539 | ||
1506 | if (reuse) { | 1540 | if (reuse) { |
@@ -1551,7 +1585,14 @@ gotten: | |||
1551 | entry = mk_pte(new_page, vma->vm_page_prot); | 1585 | entry = mk_pte(new_page, vma->vm_page_prot); |
1552 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1586 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1553 | lazy_mmu_prot_update(entry); | 1587 | lazy_mmu_prot_update(entry); |
1554 | ptep_establish(vma, address, page_table, entry); | 1588 | /* |
1589 | * Clear the pte entry and flush it first, before updating the | ||
1590 | * pte with the new entry. This will avoid a race condition | ||
1591 | * seen in the presence of one thread doing SMC and another | ||
1592 | * thread doing COW. | ||
1593 | */ | ||
1594 | ptep_clear_flush(vma, address, page_table); | ||
1595 | set_pte_at(mm, address, page_table, entry); | ||
1555 | update_mmu_cache(vma, address, entry); | 1596 | update_mmu_cache(vma, address, entry); |
1556 | lru_cache_add_active(new_page); | 1597 | lru_cache_add_active(new_page); |
1557 | page_add_new_anon_rmap(new_page, vma, address); | 1598 | page_add_new_anon_rmap(new_page, vma, address); |
@@ -1566,6 +1607,10 @@ gotten: | |||
1566 | page_cache_release(old_page); | 1607 | page_cache_release(old_page); |
1567 | unlock: | 1608 | unlock: |
1568 | pte_unmap_unlock(page_table, ptl); | 1609 | pte_unmap_unlock(page_table, ptl); |
1610 | if (dirty_page) { | ||
1611 | set_page_dirty_balance(dirty_page); | ||
1612 | put_page(dirty_page); | ||
1613 | } | ||
1569 | return ret; | 1614 | return ret; |
1570 | oom: | 1615 | oom: |
1571 | if (old_page) | 1616 | if (old_page) |
@@ -1785,9 +1830,10 @@ void unmap_mapping_range(struct address_space *mapping, | |||
1785 | } | 1830 | } |
1786 | EXPORT_SYMBOL(unmap_mapping_range); | 1831 | EXPORT_SYMBOL(unmap_mapping_range); |
1787 | 1832 | ||
1788 | /* | 1833 | /** |
1789 | * Handle all mappings that got truncated by a "truncate()" | 1834 | * vmtruncate - unmap mappings "freed" by truncate() syscall |
1790 | * system call. | 1835 | * @inode: inode of the file used |
1836 | * @offset: file offset to start truncating | ||
1791 | * | 1837 | * |
1792 | * NOTE! We have to be ready to update the memory sharing | 1838 | * NOTE! We have to be ready to update the memory sharing |
1793 | * between the file and the memory map for a potential last | 1839 | * between the file and the memory map for a potential last |
@@ -1856,11 +1902,16 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | |||
1856 | } | 1902 | } |
1857 | EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ | 1903 | EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ |
1858 | 1904 | ||
1859 | /* | 1905 | /** |
1906 | * swapin_readahead - swap in pages in hope we need them soon | ||
1907 | * @entry: swap entry of this memory | ||
1908 | * @addr: address to start | ||
1908 | * @vma: user vma this address belongs to | ||
1910 | * | ||
1860 | * Primitive swap readahead code. We simply read an aligned block of | 1911 | * Primitive swap readahead code. We simply read an aligned block of |
1861 | * (1 << page_cluster) entries in the swap area. This method is chosen | 1912 | * (1 << page_cluster) entries in the swap area. This method is chosen |
1862 | * because it doesn't cost us any seek time. We also make sure to queue | 1913 | * because it doesn't cost us any seek time. We also make sure to queue |
1863 | * the 'original' request together with the readahead ones... | 1914 | * the 'original' request together with the readahead ones... |
1864 | * | 1915 | * |
1865 | * This has been extended to use the NUMA policies from the mm triggering | 1916 | * This has been extended to use the NUMA policies from the mm triggering |
1866 | * the readahead. | 1917 | * the readahead. |
@@ -2098,6 +2149,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2098 | unsigned int sequence = 0; | 2149 | unsigned int sequence = 0; |
2099 | int ret = VM_FAULT_MINOR; | 2150 | int ret = VM_FAULT_MINOR; |
2100 | int anon = 0; | 2151 | int anon = 0; |
2152 | struct page *dirty_page = NULL; | ||
2101 | 2153 | ||
2102 | pte_unmap(page_table); | 2154 | pte_unmap(page_table); |
2103 | BUG_ON(vma->vm_flags & VM_PFNMAP); | 2155 | BUG_ON(vma->vm_flags & VM_PFNMAP); |
@@ -2192,6 +2244,10 @@ retry: | |||
2192 | } else { | 2244 | } else { |
2193 | inc_mm_counter(mm, file_rss); | 2245 | inc_mm_counter(mm, file_rss); |
2194 | page_add_file_rmap(new_page); | 2246 | page_add_file_rmap(new_page); |
2247 | if (write_access) { | ||
2248 | dirty_page = new_page; | ||
2249 | get_page(dirty_page); | ||
2250 | } | ||
2195 | } | 2251 | } |
2196 | } else { | 2252 | } else { |
2197 | /* One of our sibling threads was faster, back out. */ | 2253 | /* One of our sibling threads was faster, back out. */ |
@@ -2204,6 +2260,10 @@ retry: | |||
2204 | lazy_mmu_prot_update(entry); | 2260 | lazy_mmu_prot_update(entry); |
2205 | unlock: | 2261 | unlock: |
2206 | pte_unmap_unlock(page_table, ptl); | 2262 | pte_unmap_unlock(page_table, ptl); |
2263 | if (dirty_page) { | ||
2264 | set_page_dirty_balance(dirty_page); | ||
2265 | put_page(dirty_page); | ||
2266 | } | ||
2207 | return ret; | 2267 | return ret; |
2208 | oom: | 2268 | oom: |
2209 | page_cache_release(new_page); | 2269 | page_cache_release(new_page); |
@@ -2211,6 +2271,54 @@ oom: | |||
2211 | } | 2271 | } |
2212 | 2272 | ||
2213 | /* | 2273 | /* |
2274 | * do_no_pfn() tries to create a new page mapping for a page without | ||
2275 | * a struct page backing it | ||
2276 | * | ||
2277 | * As this is called only for pages that do not currently exist, we | ||
2278 | * do not need to flush old virtual caches or the TLB. | ||
2279 | * | ||
2280 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
2281 | * but allow concurrent faults), and pte mapped but not yet locked. | ||
2282 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
2283 | * | ||
2284 | * It is expected that the ->nopfn handler always returns the same pfn | ||
2285 | * for a given virtual mapping. | ||
2286 | * | ||
2287 | * Mark this `noinline' to prevent it from bloating the main pagefault code. | ||
2288 | */ | ||
2289 | static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, | ||
2290 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
2291 | int write_access) | ||
2292 | { | ||
2293 | spinlock_t *ptl; | ||
2294 | pte_t entry; | ||
2295 | unsigned long pfn; | ||
2296 | int ret = VM_FAULT_MINOR; | ||
2297 | |||
2298 | pte_unmap(page_table); | ||
2299 | BUG_ON(!(vma->vm_flags & VM_PFNMAP)); | ||
2300 | BUG_ON(is_cow_mapping(vma->vm_flags)); | ||
2301 | |||
2302 | pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); | ||
2303 | if (pfn == NOPFN_OOM) | ||
2304 | return VM_FAULT_OOM; | ||
2305 | if (pfn == NOPFN_SIGBUS) | ||
2306 | return VM_FAULT_SIGBUS; | ||
2307 | |||
2308 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
2309 | |||
2310 | /* Only go through if we didn't race with anybody else... */ | ||
2311 | if (pte_none(*page_table)) { | ||
2312 | entry = pfn_pte(pfn, vma->vm_page_prot); | ||
2313 | if (write_access) | ||
2314 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
2315 | set_pte_at(mm, address, page_table, entry); | ||
2316 | } | ||
2317 | pte_unmap_unlock(page_table, ptl); | ||
2318 | return ret; | ||
2319 | } | ||
2320 | |||
2321 | /* | ||
2214 | * Fault of a previously existing named mapping. Repopulate the pte | 2322 | * Fault of a previously existing named mapping. Repopulate the pte |
2215 | * from the encoded file_pte if possible. This enables swappable | 2323 | * from the encoded file_pte if possible. This enables swappable |
2216 | * nonlinear vmas. | 2324 | * nonlinear vmas. |
@@ -2272,11 +2380,17 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
2272 | old_entry = entry = *pte; | 2380 | old_entry = entry = *pte; |
2273 | if (!pte_present(entry)) { | 2381 | if (!pte_present(entry)) { |
2274 | if (pte_none(entry)) { | 2382 | if (pte_none(entry)) { |
2275 | if (!vma->vm_ops || !vma->vm_ops->nopage) | 2383 | if (vma->vm_ops) { |
2276 | return do_anonymous_page(mm, vma, address, | 2384 | if (vma->vm_ops->nopage) |
2277 | pte, pmd, write_access); | 2385 | return do_no_page(mm, vma, address, |
2278 | return do_no_page(mm, vma, address, | 2386 | pte, pmd, |
2279 | pte, pmd, write_access); | 2387 | write_access); |
2388 | if (unlikely(vma->vm_ops->nopfn)) | ||
2389 | return do_no_pfn(mm, vma, address, pte, | ||
2390 | pmd, write_access); | ||
2391 | } | ||
2392 | return do_anonymous_page(mm, vma, address, | ||
2393 | pte, pmd, write_access); | ||
2280 | } | 2394 | } |
2281 | if (pte_file(entry)) | 2395 | if (pte_file(entry)) |
2282 | return do_file_page(mm, vma, address, | 2396 | return do_file_page(mm, vma, address, |
@@ -2505,3 +2619,56 @@ int in_gate_area_no_task(unsigned long addr) | |||
2505 | } | 2619 | } |
2506 | 2620 | ||
2507 | #endif /* __HAVE_ARCH_GATE_AREA */ | 2621 | #endif /* __HAVE_ARCH_GATE_AREA */ |
2622 | |||
2623 | /* | ||
2624 | * Access another process' address space. | ||
2625 | * Source/target buffer must be kernel space, | ||
2626 | * Do not walk the page table directly, use get_user_pages | ||
2627 | */ | ||
2628 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | ||
2629 | { | ||
2630 | struct mm_struct *mm; | ||
2631 | struct vm_area_struct *vma; | ||
2632 | struct page *page; | ||
2633 | void *old_buf = buf; | ||
2634 | |||
2635 | mm = get_task_mm(tsk); | ||
2636 | if (!mm) | ||
2637 | return 0; | ||
2638 | |||
2639 | down_read(&mm->mmap_sem); | ||
2640 | /* ignore errors, just check how much was successfully transferred */ | ||
2641 | while (len) { | ||
2642 | int bytes, ret, offset; | ||
2643 | void *maddr; | ||
2644 | |||
2645 | ret = get_user_pages(tsk, mm, addr, 1, | ||
2646 | write, 1, &page, &vma); | ||
2647 | if (ret <= 0) | ||
2648 | break; | ||
2649 | |||
2650 | bytes = len; | ||
2651 | offset = addr & (PAGE_SIZE-1); | ||
2652 | if (bytes > PAGE_SIZE-offset) | ||
2653 | bytes = PAGE_SIZE-offset; | ||
2654 | |||
2655 | maddr = kmap(page); | ||
2656 | if (write) { | ||
2657 | copy_to_user_page(vma, page, addr, | ||
2658 | maddr + offset, buf, bytes); | ||
2659 | set_page_dirty_lock(page); | ||
2660 | } else { | ||
2661 | copy_from_user_page(vma, page, addr, | ||
2662 | buf, maddr + offset, bytes); | ||
2663 | } | ||
2664 | kunmap(page); | ||
2665 | page_cache_release(page); | ||
2666 | len -= bytes; | ||
2667 | buf += bytes; | ||
2668 | addr += bytes; | ||
2669 | } | ||
2670 | up_read(&mm->mmap_sem); | ||
2671 | mmput(mm); | ||
2672 | |||
2673 | return buf - old_buf; | ||
2674 | } | ||
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c37319542b..fd678a662e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/compiler.h> | 13 | #include <linux/compiler.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/pagevec.h> | 15 | #include <linux/pagevec.h> |
16 | #include <linux/writeback.h> | ||
16 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
17 | #include <linux/sysctl.h> | 18 | #include <linux/sysctl.h> |
18 | #include <linux/cpu.h> | 19 | #include <linux/cpu.h> |
@@ -21,11 +22,41 @@ | |||
21 | #include <linux/highmem.h> | 22 | #include <linux/highmem.h> |
22 | #include <linux/vmalloc.h> | 23 | #include <linux/vmalloc.h> |
23 | #include <linux/ioport.h> | 24 | #include <linux/ioport.h> |
25 | #include <linux/cpuset.h> | ||
24 | 26 | ||
25 | #include <asm/tlbflush.h> | 27 | #include <asm/tlbflush.h> |
26 | 28 | ||
27 | extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, | 29 | /* add this memory to iomem resource */ |
28 | unsigned long size); | 30 | static struct resource *register_memory_resource(u64 start, u64 size) |
31 | { | ||
32 | struct resource *res; | ||
33 | res = kzalloc(sizeof(struct resource), GFP_KERNEL); | ||
34 | BUG_ON(!res); | ||
35 | |||
36 | res->name = "System RAM"; | ||
37 | res->start = start; | ||
38 | res->end = start + size - 1; | ||
39 | res->flags = IORESOURCE_MEM; | ||
40 | if (request_resource(&iomem_resource, res) < 0) { | ||
41 | printk("System RAM resource %llx - %llx cannot be added\n", | ||
42 | (unsigned long long)res->start, (unsigned long long)res->end); | ||
43 | kfree(res); | ||
44 | res = NULL; | ||
45 | } | ||
46 | return res; | ||
47 | } | ||
48 | |||
49 | static void release_memory_resource(struct resource *res) | ||
50 | { | ||
51 | if (!res) | ||
52 | return; | ||
53 | release_resource(res); | ||
54 | kfree(res); | ||
55 | return; | ||
56 | } | ||
57 | |||
58 | |||
59 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE | ||
29 | static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) | 60 | static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) |
30 | { | 61 | { |
31 | struct pglist_data *pgdat = zone->zone_pgdat; | 62 | struct pglist_data *pgdat = zone->zone_pgdat; |
@@ -45,8 +76,6 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
45 | return 0; | 76 | return 0; |
46 | } | 77 | } |
47 | 78 | ||
48 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | ||
49 | int nr_pages); | ||
50 | static int __add_section(struct zone *zone, unsigned long phys_start_pfn) | 79 | static int __add_section(struct zone *zone, unsigned long phys_start_pfn) |
51 | { | 80 | { |
52 | int nr_pages = PAGES_PER_SECTION; | 81 | int nr_pages = PAGES_PER_SECTION; |
@@ -191,8 +220,10 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
191 | if (need_zonelists_rebuild) | 220 | if (need_zonelists_rebuild) |
192 | build_all_zonelists(); | 221 | build_all_zonelists(); |
193 | vm_total_pages = nr_free_pagecache_pages(); | 222 | vm_total_pages = nr_free_pagecache_pages(); |
223 | writeback_set_ratelimit(); | ||
194 | return 0; | 224 | return 0; |
195 | } | 225 | } |
226 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ | ||
196 | 227 | ||
197 | static pg_data_t *hotadd_new_pgdat(int nid, u64 start) | 228 | static pg_data_t *hotadd_new_pgdat(int nid, u64 start) |
198 | { | 229 | { |
@@ -222,36 +253,6 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) | |||
222 | return; | 253 | return; |
223 | } | 254 | } |
224 | 255 | ||
225 | /* add this memory to iomem resource */ | ||
226 | static struct resource *register_memory_resource(u64 start, u64 size) | ||
227 | { | ||
228 | struct resource *res; | ||
229 | res = kzalloc(sizeof(struct resource), GFP_KERNEL); | ||
230 | BUG_ON(!res); | ||
231 | |||
232 | res->name = "System RAM"; | ||
233 | res->start = start; | ||
234 | res->end = start + size - 1; | ||
235 | res->flags = IORESOURCE_MEM; | ||
236 | if (request_resource(&iomem_resource, res) < 0) { | ||
237 | printk("System RAM resource %llx - %llx cannot be added\n", | ||
238 | (unsigned long long)res->start, (unsigned long long)res->end); | ||
239 | kfree(res); | ||
240 | res = NULL; | ||
241 | } | ||
242 | return res; | ||
243 | } | ||
244 | |||
245 | static void release_memory_resource(struct resource *res) | ||
246 | { | ||
247 | if (!res) | ||
248 | return; | ||
249 | release_resource(res); | ||
250 | kfree(res); | ||
251 | return; | ||
252 | } | ||
253 | |||
254 | |||
255 | 256 | ||
256 | int add_memory(int nid, u64 start, u64 size) | 257 | int add_memory(int nid, u64 start, u64 size) |
257 | { | 258 | { |
@@ -283,6 +284,8 @@ int add_memory(int nid, u64 start, u64 size) | |||
283 | /* we online node here. we can't roll back from here. */ | 284 | /* we online node here. we can't roll back from here. */ |
284 | node_set_online(nid); | 285 | node_set_online(nid); |
285 | 286 | ||
287 | cpuset_track_online_nodes(); | ||
288 | |||
286 | if (new_pgdat) { | 289 | if (new_pgdat) { |
287 | ret = register_one_node(nid); | 290 | ret = register_one_node(nid); |
288 | /* | 291 | /* |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a9963ceddd..25788b1b7f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -105,7 +105,7 @@ static struct kmem_cache *sn_cache; | |||
105 | 105 | ||
106 | /* Highest zone. An specific allocation for a zone below that is not | 106 | /* Highest zone. An specific allocation for a zone below that is not |
107 | policied. */ | 107 | policied. */ |
108 | int policy_zone = ZONE_DMA; | 108 | enum zone_type policy_zone = ZONE_DMA; |
109 | 109 | ||
110 | struct mempolicy default_policy = { | 110 | struct mempolicy default_policy = { |
111 | .refcnt = ATOMIC_INIT(1), /* never free it */ | 111 | .refcnt = ATOMIC_INIT(1), /* never free it */ |
@@ -137,7 +137,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) | |||
137 | static struct zonelist *bind_zonelist(nodemask_t *nodes) | 137 | static struct zonelist *bind_zonelist(nodemask_t *nodes) |
138 | { | 138 | { |
139 | struct zonelist *zl; | 139 | struct zonelist *zl; |
140 | int num, max, nd, k; | 140 | int num, max, nd; |
141 | enum zone_type k; | ||
141 | 142 | ||
142 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); | 143 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); |
143 | zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); | 144 | zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); |
@@ -148,12 +149,16 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) | |||
148 | lower zones etc. Avoid empty zones because the memory allocator | 149 | lower zones etc. Avoid empty zones because the memory allocator |
149 | doesn't like them. If you implement node hot removal you | 150 | doesn't like them. If you implement node hot removal you |
150 | have to fix that. */ | 151 | have to fix that. */ |
151 | for (k = policy_zone; k >= 0; k--) { | 152 | k = policy_zone; |
153 | while (1) { | ||
152 | for_each_node_mask(nd, *nodes) { | 154 | for_each_node_mask(nd, *nodes) { |
153 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; | 155 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; |
154 | if (z->present_pages > 0) | 156 | if (z->present_pages > 0) |
155 | zl->zones[num++] = z; | 157 | zl->zones[num++] = z; |
156 | } | 158 | } |
159 | if (k == 0) | ||
160 | break; | ||
161 | k--; | ||
157 | } | 162 | } |
158 | zl->zones[num] = NULL; | 163 | zl->zones[num] = NULL; |
159 | return zl; | 164 | return zl; |
@@ -482,7 +487,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) | |||
482 | switch (p->policy) { | 487 | switch (p->policy) { |
483 | case MPOL_BIND: | 488 | case MPOL_BIND: |
484 | for (i = 0; p->v.zonelist->zones[i]; i++) | 489 | for (i = 0; p->v.zonelist->zones[i]; i++) |
485 | node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, | 490 | node_set(zone_to_nid(p->v.zonelist->zones[i]), |
486 | *nodes); | 491 | *nodes); |
487 | break; | 492 | break; |
488 | case MPOL_DEFAULT: | 493 | case MPOL_DEFAULT: |
@@ -1131,7 +1136,9 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
1131 | */ | 1136 | */ |
1132 | unsigned slab_node(struct mempolicy *policy) | 1137 | unsigned slab_node(struct mempolicy *policy) |
1133 | { | 1138 | { |
1134 | switch (policy->policy) { | 1139 | int pol = policy ? policy->policy : MPOL_DEFAULT; |
1140 | |||
1141 | switch (pol) { | ||
1135 | case MPOL_INTERLEAVE: | 1142 | case MPOL_INTERLEAVE: |
1136 | return interleave_nodes(policy); | 1143 | return interleave_nodes(policy); |
1137 | 1144 | ||
@@ -1140,7 +1147,7 @@ unsigned slab_node(struct mempolicy *policy) | |||
1140 | * Follow bind policy behavior and start allocation at the | 1147 | * Follow bind policy behavior and start allocation at the |
1141 | * first node. | 1148 | * first node. |
1142 | */ | 1149 | */ |
1143 | return policy->v.zonelist->zones[0]->zone_pgdat->node_id; | 1150 | return zone_to_nid(policy->v.zonelist->zones[0]); |
1144 | 1151 | ||
1145 | case MPOL_PREFERRED: | 1152 | case MPOL_PREFERRED: |
1146 | if (policy->v.preferred_node >= 0) | 1153 | if (policy->v.preferred_node >= 0) |
@@ -1285,7 +1292,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1285 | 1292 | ||
1286 | if ((gfp & __GFP_WAIT) && !in_interrupt()) | 1293 | if ((gfp & __GFP_WAIT) && !in_interrupt()) |
1287 | cpuset_update_task_memory_state(); | 1294 | cpuset_update_task_memory_state(); |
1288 | if (!pol || in_interrupt()) | 1295 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
1289 | pol = &default_policy; | 1296 | pol = &default_policy; |
1290 | if (pol->policy == MPOL_INTERLEAVE) | 1297 | if (pol->policy == MPOL_INTERLEAVE) |
1291 | return alloc_page_interleave(gfp, order, interleave_nodes(pol)); | 1298 | return alloc_page_interleave(gfp, order, interleave_nodes(pol)); |
@@ -1317,12 +1324,11 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) | |||
1317 | atomic_set(&new->refcnt, 1); | 1324 | atomic_set(&new->refcnt, 1); |
1318 | if (new->policy == MPOL_BIND) { | 1325 | if (new->policy == MPOL_BIND) { |
1319 | int sz = ksize(old->v.zonelist); | 1326 | int sz = ksize(old->v.zonelist); |
1320 | new->v.zonelist = kmalloc(sz, SLAB_KERNEL); | 1327 | new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL); |
1321 | if (!new->v.zonelist) { | 1328 | if (!new->v.zonelist) { |
1322 | kmem_cache_free(policy_cache, new); | 1329 | kmem_cache_free(policy_cache, new); |
1323 | return ERR_PTR(-ENOMEM); | 1330 | return ERR_PTR(-ENOMEM); |
1324 | } | 1331 | } |
1325 | memcpy(new->v.zonelist, old->v.zonelist, sz); | ||
1326 | } | 1332 | } |
1327 | return new; | 1333 | return new; |
1328 | } | 1334 | } |
@@ -1644,7 +1650,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) | |||
1644 | 1650 | ||
1645 | nodes_clear(nodes); | 1651 | nodes_clear(nodes); |
1646 | for (z = pol->v.zonelist->zones; *z; z++) | 1652 | for (z = pol->v.zonelist->zones; *z; z++) |
1647 | node_set((*z)->zone_pgdat->node_id, nodes); | 1653 | node_set(zone_to_nid(*z), nodes); |
1648 | nodes_remap(tmp, nodes, *mpolmask, *newmask); | 1654 | nodes_remap(tmp, nodes, *mpolmask, *newmask); |
1649 | nodes = tmp; | 1655 | nodes = tmp; |
1650 | 1656 | ||
diff --git a/mm/migrate.c b/mm/migrate.c index 3f1e0c2c94..ba2453f948 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -409,6 +409,7 @@ int migrate_page(struct address_space *mapping, | |||
409 | } | 409 | } |
410 | EXPORT_SYMBOL(migrate_page); | 410 | EXPORT_SYMBOL(migrate_page); |
411 | 411 | ||
412 | #ifdef CONFIG_BLOCK | ||
412 | /* | 413 | /* |
413 | * Migration function for pages with buffers. This function can only be used | 414 | * Migration function for pages with buffers. This function can only be used |
414 | * if the underlying filesystem guarantees that no other references to "page" | 415 | * if the underlying filesystem guarantees that no other references to "page" |
@@ -466,6 +467,7 @@ int buffer_migrate_page(struct address_space *mapping, | |||
466 | return 0; | 467 | return 0; |
467 | } | 468 | } |
468 | EXPORT_SYMBOL(buffer_migrate_page); | 469 | EXPORT_SYMBOL(buffer_migrate_page); |
470 | #endif | ||
469 | 471 | ||
470 | /* | 472 | /* |
471 | * Writeback a page to clean the dirty state | 473 | * Writeback a page to clean the dirty state |
@@ -525,7 +527,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
525 | * Buffers may be managed in a filesystem specific way. | 527 | * Buffers may be managed in a filesystem specific way. |
526 | * We must have no buffers or drop them. | 528 | * We must have no buffers or drop them. |
527 | */ | 529 | */ |
528 | if (page_has_buffers(page) && | 530 | if (PagePrivate(page) && |
529 | !try_to_release_page(page, GFP_KERNEL)) | 531 | !try_to_release_page(page, GFP_KERNEL)) |
530 | return -EAGAIN; | 532 | return -EAGAIN; |
531 | 533 | ||
@@ -741,7 +743,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, | |||
741 | 743 | ||
742 | *result = &pm->status; | 744 | *result = &pm->status; |
743 | 745 | ||
744 | return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); | 746 | return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0); |
745 | } | 747 | } |
746 | 748 | ||
747 | /* | 749 | /* |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -64,6 +64,13 @@ pgprot_t protection_map[16] = { | |||
64 | __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 | 64 | __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 |
65 | }; | 65 | }; |
66 | 66 | ||
67 | pgprot_t vm_get_page_prot(unsigned long vm_flags) | ||
68 | { | ||
69 | return protection_map[vm_flags & | ||
70 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; | ||
71 | } | ||
72 | EXPORT_SYMBOL(vm_get_page_prot); | ||
73 | |||
67 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 74 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
68 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 75 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
69 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 76 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
@@ -109,7 +116,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) | |||
109 | * which are reclaimable, under pressure. The dentry | 116 | * which are reclaimable, under pressure. The dentry |
110 | * cache and most inode caches should fall into this | 117 | * cache and most inode caches should fall into this |
111 | */ | 118 | */ |
112 | free += atomic_read(&slab_reclaim_pages); | 119 | free += global_page_state(NR_SLAB_RECLAIMABLE); |
113 | 120 | ||
114 | /* | 121 | /* |
115 | * Leave the last 3% for root | 122 | * Leave the last 3% for root |
@@ -1098,12 +1105,6 @@ munmap_back: | |||
1098 | goto free_vma; | 1105 | goto free_vma; |
1099 | } | 1106 | } |
1100 | 1107 | ||
1101 | /* Don't make the VMA automatically writable if it's shared, but the | ||
1102 | * backer wishes to know when pages are first written to */ | ||
1103 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) | ||
1104 | vma->vm_page_prot = | ||
1105 | protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)]; | ||
1106 | |||
1107 | /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform | 1108 | /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform |
1108 | * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) | 1109 | * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) |
1109 | * that memory reservation must be checked; but that reservation | 1110 | * that memory reservation must be checked; but that reservation |
@@ -1121,6 +1122,10 @@ munmap_back: | |||
1121 | pgoff = vma->vm_pgoff; | 1122 | pgoff = vma->vm_pgoff; |
1122 | vm_flags = vma->vm_flags; | 1123 | vm_flags = vma->vm_flags; |
1123 | 1124 | ||
1125 | if (vma_wants_writenotify(vma)) | ||
1126 | vma->vm_page_prot = | ||
1127 | protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)]; | ||
1128 | |||
1124 | if (!file || !vma_merge(mm, prev, addr, vma->vm_end, | 1129 | if (!file || !vma_merge(mm, prev, addr, vma->vm_end, |
1125 | vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { | 1130 | vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { |
1126 | file = vma->vm_file; | 1131 | file = vma->vm_file; |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 638edabaff..3b8f3c0c63 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -27,12 +27,14 @@ | |||
27 | #include <asm/tlbflush.h> | 27 | #include <asm/tlbflush.h> |
28 | 28 | ||
29 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | 29 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, |
30 | unsigned long addr, unsigned long end, pgprot_t newprot) | 30 | unsigned long addr, unsigned long end, pgprot_t newprot, |
31 | int dirty_accountable) | ||
31 | { | 32 | { |
32 | pte_t *pte, oldpte; | 33 | pte_t *pte, oldpte; |
33 | spinlock_t *ptl; | 34 | spinlock_t *ptl; |
34 | 35 | ||
35 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 36 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
37 | arch_enter_lazy_mmu_mode(); | ||
36 | do { | 38 | do { |
37 | oldpte = *pte; | 39 | oldpte = *pte; |
38 | if (pte_present(oldpte)) { | 40 | if (pte_present(oldpte)) { |
@@ -42,7 +44,14 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
42 | * bits by wiping the pte and then setting the new pte | 44 | * bits by wiping the pte and then setting the new pte |
43 | * into place. | 45 | * into place. |
44 | */ | 46 | */ |
45 | ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); | 47 | ptent = ptep_get_and_clear(mm, addr, pte); |
48 | ptent = pte_modify(ptent, newprot); | ||
49 | /* | ||
50 | * Avoid taking write faults for pages we know to be | ||
51 | * dirty. | ||
52 | */ | ||
53 | if (dirty_accountable && pte_dirty(ptent)) | ||
54 | ptent = pte_mkwrite(ptent); | ||
46 | set_pte_at(mm, addr, pte, ptent); | 55 | set_pte_at(mm, addr, pte, ptent); |
47 | lazy_mmu_prot_update(ptent); | 56 | lazy_mmu_prot_update(ptent); |
48 | #ifdef CONFIG_MIGRATION | 57 | #ifdef CONFIG_MIGRATION |
@@ -62,11 +71,13 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
62 | } | 71 | } |
63 | 72 | ||
64 | } while (pte++, addr += PAGE_SIZE, addr != end); | 73 | } while (pte++, addr += PAGE_SIZE, addr != end); |
74 | arch_leave_lazy_mmu_mode(); | ||
65 | pte_unmap_unlock(pte - 1, ptl); | 75 | pte_unmap_unlock(pte - 1, ptl); |
66 | } | 76 | } |
67 | 77 | ||
68 | static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | 78 | static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, |
69 | unsigned long addr, unsigned long end, pgprot_t newprot) | 79 | unsigned long addr, unsigned long end, pgprot_t newprot, |
80 | int dirty_accountable) | ||
70 | { | 81 | { |
71 | pmd_t *pmd; | 82 | pmd_t *pmd; |
72 | unsigned long next; | 83 | unsigned long next; |
@@ -76,12 +87,13 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
76 | next = pmd_addr_end(addr, end); | 87 | next = pmd_addr_end(addr, end); |
77 | if (pmd_none_or_clear_bad(pmd)) | 88 | if (pmd_none_or_clear_bad(pmd)) |
78 | continue; | 89 | continue; |
79 | change_pte_range(mm, pmd, addr, next, newprot); | 90 | change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); |
80 | } while (pmd++, addr = next, addr != end); | 91 | } while (pmd++, addr = next, addr != end); |
81 | } | 92 | } |
82 | 93 | ||
83 | static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, | 94 | static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, |
84 | unsigned long addr, unsigned long end, pgprot_t newprot) | 95 | unsigned long addr, unsigned long end, pgprot_t newprot, |
96 | int dirty_accountable) | ||
85 | { | 97 | { |
86 | pud_t *pud; | 98 | pud_t *pud; |
87 | unsigned long next; | 99 | unsigned long next; |
@@ -91,12 +103,13 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, | |||
91 | next = pud_addr_end(addr, end); | 103 | next = pud_addr_end(addr, end); |
92 | if (pud_none_or_clear_bad(pud)) | 104 | if (pud_none_or_clear_bad(pud)) |
93 | continue; | 105 | continue; |
94 | change_pmd_range(mm, pud, addr, next, newprot); | 106 | change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); |
95 | } while (pud++, addr = next, addr != end); | 107 | } while (pud++, addr = next, addr != end); |
96 | } | 108 | } |
97 | 109 | ||
98 | static void change_protection(struct vm_area_struct *vma, | 110 | static void change_protection(struct vm_area_struct *vma, |
99 | unsigned long addr, unsigned long end, pgprot_t newprot) | 111 | unsigned long addr, unsigned long end, pgprot_t newprot, |
112 | int dirty_accountable) | ||
100 | { | 113 | { |
101 | struct mm_struct *mm = vma->vm_mm; | 114 | struct mm_struct *mm = vma->vm_mm; |
102 | pgd_t *pgd; | 115 | pgd_t *pgd; |
@@ -110,7 +123,7 @@ static void change_protection(struct vm_area_struct *vma, | |||
110 | next = pgd_addr_end(addr, end); | 123 | next = pgd_addr_end(addr, end); |
111 | if (pgd_none_or_clear_bad(pgd)) | 124 | if (pgd_none_or_clear_bad(pgd)) |
112 | continue; | 125 | continue; |
113 | change_pud_range(mm, pgd, addr, next, newprot); | 126 | change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); |
114 | } while (pgd++, addr = next, addr != end); | 127 | } while (pgd++, addr = next, addr != end); |
115 | flush_tlb_range(vma, start, end); | 128 | flush_tlb_range(vma, start, end); |
116 | } | 129 | } |
@@ -123,10 +136,9 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
123 | unsigned long oldflags = vma->vm_flags; | 136 | unsigned long oldflags = vma->vm_flags; |
124 | long nrpages = (end - start) >> PAGE_SHIFT; | 137 | long nrpages = (end - start) >> PAGE_SHIFT; |
125 | unsigned long charged = 0; | 138 | unsigned long charged = 0; |
126 | unsigned int mask; | ||
127 | pgprot_t newprot; | ||
128 | pgoff_t pgoff; | 139 | pgoff_t pgoff; |
129 | int error; | 140 | int error; |
141 | int dirty_accountable = 0; | ||
130 | 142 | ||
131 | if (newflags == oldflags) { | 143 | if (newflags == oldflags) { |
132 | *pprev = vma; | 144 | *pprev = vma; |
@@ -176,24 +188,23 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
176 | } | 188 | } |
177 | 189 | ||
178 | success: | 190 | success: |
179 | /* Don't make the VMA automatically writable if it's shared, but the | ||
180 | * backer wishes to know when pages are first written to */ | ||
181 | mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED; | ||
182 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) | ||
183 | mask &= ~VM_SHARED; | ||
184 | |||
185 | newprot = protection_map[newflags & mask]; | ||
186 | |||
187 | /* | 191 | /* |
188 | * vm_flags and vm_page_prot are protected by the mmap_sem | 192 | * vm_flags and vm_page_prot are protected by the mmap_sem |
189 | * held in write mode. | 193 | * held in write mode. |
190 | */ | 194 | */ |
191 | vma->vm_flags = newflags; | 195 | vma->vm_flags = newflags; |
192 | vma->vm_page_prot = newprot; | 196 | vma->vm_page_prot = protection_map[newflags & |
197 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; | ||
198 | if (vma_wants_writenotify(vma)) { | ||
199 | vma->vm_page_prot = protection_map[newflags & | ||
200 | (VM_READ|VM_WRITE|VM_EXEC)]; | ||
201 | dirty_accountable = 1; | ||
202 | } | ||
203 | |||
193 | if (is_vm_hugetlb_page(vma)) | 204 | if (is_vm_hugetlb_page(vma)) |
194 | hugetlb_change_protection(vma, start, end, newprot); | 205 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); |
195 | else | 206 | else |
196 | change_protection(vma, start, end, newprot); | 207 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); |
197 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 208 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
198 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 209 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
199 | return 0; | 210 | return 0; |
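
The mprotect.c change above threads a dirty_accountable flag down the page-table walk and, whenever a shared mapping wants write notification, indexes protection_map[] with VM_SHARED masked out. The PTEs therefore start out write-protected, so the first store faults and the kernel can run page_mkwrite() and account the page as dirty. A minimal user-space sketch of the masking idea, assuming the kernel's VM_* bit values but with purely illustrative table entries:

    /*
     * Sketch only: why dropping VM_SHARED from the protection_map index
     * write-protects a shared mapping's PTEs.  Flag values match the
     * kernel's; the table entries are labels, not real pgprot_t values.
     */
    #include <stdio.h>

    #define VM_READ   0x00000001UL
    #define VM_WRITE  0x00000002UL
    #define VM_EXEC   0x00000004UL
    #define VM_SHARED 0x00000008UL

    static const char *protection_map[16] = {
        [VM_READ | VM_WRITE]             = "PAGE_COPY (read-only, COW on write)",
        [VM_READ | VM_WRITE | VM_SHARED] = "PAGE_SHARED (writable)",
    };

    int main(void)
    {
        unsigned long newflags = VM_READ | VM_WRITE | VM_SHARED;

        /* default: shared writable PTE protection */
        printf("%s\n", protection_map[newflags &
                (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)]);

        /* write-notify case: VM_SHARED is masked out, the PTE is not
         * writable, and the first store faults so the kernel can call
         * page_mkwrite() and account the dirty page */
        printf("%s\n", protection_map[newflags &
                (VM_READ | VM_WRITE | VM_EXEC)]);
        return 0;
    }
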
diff --git a/mm/mremap.c b/mm/mremap.c index 7c15cf3373..9c769fa29f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -98,6 +98,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
98 | new_ptl = pte_lockptr(mm, new_pmd); | 98 | new_ptl = pte_lockptr(mm, new_pmd); |
99 | if (new_ptl != old_ptl) | 99 | if (new_ptl != old_ptl) |
100 | spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); | 100 | spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); |
101 | arch_enter_lazy_mmu_mode(); | ||
101 | 102 | ||
102 | for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, | 103 | for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, |
103 | new_pte++, new_addr += PAGE_SIZE) { | 104 | new_pte++, new_addr += PAGE_SIZE) { |
@@ -109,6 +110,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
109 | set_pte_at(mm, new_addr, new_pte, pte); | 110 | set_pte_at(mm, new_addr, new_pte, pte); |
110 | } | 111 | } |
111 | 112 | ||
113 | arch_leave_lazy_mmu_mode(); | ||
112 | if (new_ptl != old_ptl) | 114 | if (new_ptl != old_ptl) |
113 | spin_unlock(new_ptl); | 115 | spin_unlock(new_ptl); |
114 | pte_unmap_nested(new_pte - 1); | 116 | pte_unmap_nested(new_pte - 1); |
diff --git a/mm/msync.c b/mm/msync.c index d083544df2..358d73cf7b 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -7,149 +7,33 @@ | |||
7 | /* | 7 | /* |
8 | * The msync() system call. | 8 | * The msync() system call. |
9 | */ | 9 | */ |
10 | #include <linux/slab.h> | ||
11 | #include <linux/pagemap.h> | ||
12 | #include <linux/fs.h> | 10 | #include <linux/fs.h> |
13 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
14 | #include <linux/mman.h> | 12 | #include <linux/mman.h> |
15 | #include <linux/hugetlb.h> | ||
16 | #include <linux/writeback.h> | ||
17 | #include <linux/file.h> | 13 | #include <linux/file.h> |
18 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
19 | 15 | ||
20 | #include <asm/pgtable.h> | ||
21 | #include <asm/tlbflush.h> | ||
22 | |||
23 | static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | ||
24 | unsigned long addr, unsigned long end) | ||
25 | { | ||
26 | pte_t *pte; | ||
27 | spinlock_t *ptl; | ||
28 | int progress = 0; | ||
29 | unsigned long ret = 0; | ||
30 | |||
31 | again: | ||
32 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
33 | do { | ||
34 | struct page *page; | ||
35 | |||
36 | if (progress >= 64) { | ||
37 | progress = 0; | ||
38 | if (need_resched() || need_lockbreak(ptl)) | ||
39 | break; | ||
40 | } | ||
41 | progress++; | ||
42 | if (!pte_present(*pte)) | ||
43 | continue; | ||
44 | if (!pte_maybe_dirty(*pte)) | ||
45 | continue; | ||
46 | page = vm_normal_page(vma, addr, *pte); | ||
47 | if (!page) | ||
48 | continue; | ||
49 | if (ptep_clear_flush_dirty(vma, addr, pte) || | ||
50 | page_test_and_clear_dirty(page)) | ||
51 | ret += set_page_dirty(page); | ||
52 | progress += 3; | ||
53 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
54 | pte_unmap_unlock(pte - 1, ptl); | ||
55 | cond_resched(); | ||
56 | if (addr != end) | ||
57 | goto again; | ||
58 | return ret; | ||
59 | } | ||
60 | |||
61 | static inline unsigned long msync_pmd_range(struct vm_area_struct *vma, | ||
62 | pud_t *pud, unsigned long addr, unsigned long end) | ||
63 | { | ||
64 | pmd_t *pmd; | ||
65 | unsigned long next; | ||
66 | unsigned long ret = 0; | ||
67 | |||
68 | pmd = pmd_offset(pud, addr); | ||
69 | do { | ||
70 | next = pmd_addr_end(addr, end); | ||
71 | if (pmd_none_or_clear_bad(pmd)) | ||
72 | continue; | ||
73 | ret += msync_pte_range(vma, pmd, addr, next); | ||
74 | } while (pmd++, addr = next, addr != end); | ||
75 | return ret; | ||
76 | } | ||
77 | |||
78 | static inline unsigned long msync_pud_range(struct vm_area_struct *vma, | ||
79 | pgd_t *pgd, unsigned long addr, unsigned long end) | ||
80 | { | ||
81 | pud_t *pud; | ||
82 | unsigned long next; | ||
83 | unsigned long ret = 0; | ||
84 | |||
85 | pud = pud_offset(pgd, addr); | ||
86 | do { | ||
87 | next = pud_addr_end(addr, end); | ||
88 | if (pud_none_or_clear_bad(pud)) | ||
89 | continue; | ||
90 | ret += msync_pmd_range(vma, pud, addr, next); | ||
91 | } while (pud++, addr = next, addr != end); | ||
92 | return ret; | ||
93 | } | ||
94 | |||
95 | static unsigned long msync_page_range(struct vm_area_struct *vma, | ||
96 | unsigned long addr, unsigned long end) | ||
97 | { | ||
98 | pgd_t *pgd; | ||
99 | unsigned long next; | ||
100 | unsigned long ret = 0; | ||
101 | |||
102 | /* For hugepages we can't go walking the page table normally, | ||
103 | * but that's ok, hugetlbfs is memory based, so we don't need | ||
104 | * to do anything more on an msync(). | ||
105 | */ | ||
106 | if (vma->vm_flags & VM_HUGETLB) | ||
107 | return 0; | ||
108 | |||
109 | BUG_ON(addr >= end); | ||
110 | pgd = pgd_offset(vma->vm_mm, addr); | ||
111 | flush_cache_range(vma, addr, end); | ||
112 | do { | ||
113 | next = pgd_addr_end(addr, end); | ||
114 | if (pgd_none_or_clear_bad(pgd)) | ||
115 | continue; | ||
116 | ret += msync_pud_range(vma, pgd, addr, next); | ||
117 | } while (pgd++, addr = next, addr != end); | ||
118 | return ret; | ||
119 | } | ||
120 | |||
121 | /* | 16 | /* |
122 | * MS_SYNC syncs the entire file - including mappings. | 17 | * MS_SYNC syncs the entire file - including mappings. |
123 | * | 18 | * |
124 | * MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just | 19 | * MS_ASYNC does not start I/O (it used to, up to 2.5.67). |
125 | * marks the relevant pages dirty. The application may now run fsync() to | 20 | * Nor does it mark the relevant pages dirty (it used to, up to 2.6.17). |
21 | * Now it doesn't do anything, since dirty pages are properly tracked. | ||
22 | * | ||
23 | * The application may now run fsync() to | ||
126 | * write out the dirty pages and wait on the writeout and check the result. | 24 | * write out the dirty pages and wait on the writeout and check the result. |
127 | * Or the application may run fadvise(FADV_DONTNEED) against the fd to start | 25 | * Or the application may run fadvise(FADV_DONTNEED) against the fd to start |
128 | * async writeout immediately. | 26 | * async writeout immediately. |
129 | * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to | 27 | * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to |
130 | * applications. | 28 | * applications. |
131 | */ | 29 | */ |
132 | static int msync_interval(struct vm_area_struct *vma, unsigned long addr, | ||
133 | unsigned long end, int flags, | ||
134 | unsigned long *nr_pages_dirtied) | ||
135 | { | ||
136 | struct file *file = vma->vm_file; | ||
137 | |||
138 | if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) | ||
139 | return -EBUSY; | ||
140 | |||
141 | if (file && (vma->vm_flags & VM_SHARED)) | ||
142 | *nr_pages_dirtied = msync_page_range(vma, addr, end); | ||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | 30 | asmlinkage long sys_msync(unsigned long start, size_t len, int flags) |
147 | { | 31 | { |
148 | unsigned long end; | 32 | unsigned long end; |
33 | struct mm_struct *mm = current->mm; | ||
149 | struct vm_area_struct *vma; | 34 | struct vm_area_struct *vma; |
150 | int unmapped_error = 0; | 35 | int unmapped_error = 0; |
151 | int error = -EINVAL; | 36 | int error = -EINVAL; |
152 | int done = 0; | ||
153 | 37 | ||
154 | if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) | 38 | if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) |
155 | goto out; | 39 | goto out; |
@@ -169,64 +53,50 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | |||
169 | * If the interval [start,end) covers some unmapped address ranges, | 53 | * If the interval [start,end) covers some unmapped address ranges, |
170 | * just ignore them, but return -ENOMEM at the end. | 54 | * just ignore them, but return -ENOMEM at the end. |
171 | */ | 55 | */ |
172 | down_read(¤t->mm->mmap_sem); | 56 | down_read(&mm->mmap_sem); |
173 | vma = find_vma(current->mm, start); | 57 | vma = find_vma(mm, start); |
174 | if (!vma) { | 58 | for (;;) { |
175 | error = -ENOMEM; | ||
176 | goto out_unlock; | ||
177 | } | ||
178 | do { | ||
179 | unsigned long nr_pages_dirtied = 0; | ||
180 | struct file *file; | 59 | struct file *file; |
181 | 60 | ||
61 | /* Still start < end. */ | ||
62 | error = -ENOMEM; | ||
63 | if (!vma) | ||
64 | goto out_unlock; | ||
182 | /* Here start < vma->vm_end. */ | 65 | /* Here start < vma->vm_end. */ |
183 | if (start < vma->vm_start) { | 66 | if (start < vma->vm_start) { |
184 | unmapped_error = -ENOMEM; | ||
185 | start = vma->vm_start; | 67 | start = vma->vm_start; |
68 | if (start >= end) | ||
69 | goto out_unlock; | ||
70 | unmapped_error = -ENOMEM; | ||
186 | } | 71 | } |
187 | /* Here vma->vm_start <= start < vma->vm_end. */ | 72 | /* Here vma->vm_start <= start < vma->vm_end. */ |
188 | if (end <= vma->vm_end) { | 73 | if ((flags & MS_INVALIDATE) && |
189 | if (start < end) { | 74 | (vma->vm_flags & VM_LOCKED)) { |
190 | error = msync_interval(vma, start, end, flags, | 75 | error = -EBUSY; |
191 | &nr_pages_dirtied); | 76 | goto out_unlock; |
192 | if (error) | ||
193 | goto out_unlock; | ||
194 | } | ||
195 | error = unmapped_error; | ||
196 | done = 1; | ||
197 | } else { | ||
198 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | ||
199 | error = msync_interval(vma, start, vma->vm_end, flags, | ||
200 | &nr_pages_dirtied); | ||
201 | if (error) | ||
202 | goto out_unlock; | ||
203 | } | 77 | } |
204 | file = vma->vm_file; | 78 | file = vma->vm_file; |
205 | start = vma->vm_end; | 79 | start = vma->vm_end; |
206 | if ((flags & MS_ASYNC) && file && nr_pages_dirtied) { | 80 | if ((flags & MS_SYNC) && file && |
207 | get_file(file); | ||
208 | up_read(¤t->mm->mmap_sem); | ||
209 | balance_dirty_pages_ratelimited_nr(file->f_mapping, | ||
210 | nr_pages_dirtied); | ||
211 | fput(file); | ||
212 | down_read(¤t->mm->mmap_sem); | ||
213 | vma = find_vma(current->mm, start); | ||
214 | } else if ((flags & MS_SYNC) && file && | ||
215 | (vma->vm_flags & VM_SHARED)) { | 81 | (vma->vm_flags & VM_SHARED)) { |
216 | get_file(file); | 82 | get_file(file); |
217 | up_read(¤t->mm->mmap_sem); | 83 | up_read(&mm->mmap_sem); |
218 | error = do_fsync(file, 0); | 84 | error = do_fsync(file, 0); |
219 | fput(file); | 85 | fput(file); |
220 | down_read(¤t->mm->mmap_sem); | 86 | if (error || start >= end) |
221 | if (error) | 87 | goto out; |
222 | goto out_unlock; | 88 | down_read(&mm->mmap_sem); |
223 | vma = find_vma(current->mm, start); | 89 | vma = find_vma(mm, start); |
224 | } else { | 90 | } else { |
91 | if (start >= end) { | ||
92 | error = 0; | ||
93 | goto out_unlock; | ||
94 | } | ||
225 | vma = vma->vm_next; | 95 | vma = vma->vm_next; |
226 | } | 96 | } |
227 | } while (vma && !done); | 97 | } |
228 | out_unlock: | 98 | out_unlock: |
229 | up_read(¤t->mm->mmap_sem); | 99 | up_read(&mm->mmap_sem); |
230 | out: | 100 | out: |
231 | return error; | 101 | return error ? : unmapped_error; |
232 | } | 102 | } |
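
With the rewrite above, msync() no longer walks page tables at all: MS_ASYNC becomes a no-op because dirtying is tracked when the page is written, and MS_SYNC simply runs do_fsync() for each shared, file-backed VMA in the range. A hedged user-space sketch of the resulting semantics (the file name is illustrative):

    /* Sketch of the semantics described in the comment block above:
     * MS_ASYNC starts no I/O and marks nothing dirty; MS_SYNC (or an
     * explicit fsync()) is what provides data integrity. */
    #include <fcntl.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("data.bin", O_RDWR | O_CREAT, 0644);

        if (fd < 0 || ftruncate(fd, 4096) < 0)
            return 1;

        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
            return 1;

        memcpy(p, "hello", 5);        /* dirties the page; the kernel tracks it */

        msync(p, 4096, MS_ASYNC);     /* cheap hint only - no I/O is started */
        msync(p, 4096, MS_SYNC);      /* waits until the dirty page reaches disk */
        /* fsync(fd) would give the same integrity guarantee */

        munmap(p, 4096);
        close(fd);
        return 0;
    }
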
diff --git a/mm/nommu.c b/mm/nommu.c index c576df71e3..365019599d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -122,26 +122,50 @@ unsigned int kobjsize(const void *objp) | |||
122 | } | 122 | } |
123 | 123 | ||
124 | /* | 124 | /* |
125 | * The nommu dodgy version :-) | 125 | * get a list of pages in an address range belonging to the specified process |
126 | * and indicate the VMA that covers each page | ||
127 | * - this is potentially dodgy as we may end up incrementing the page count of a | ||
128 | * slab page or a secondary page from a compound page | ||
129 | * - don't permit access to VMAs that don't support it, such as I/O mappings | ||
126 | */ | 130 | */ |
127 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 131 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
128 | unsigned long start, int len, int write, int force, | 132 | unsigned long start, int len, int write, int force, |
129 | struct page **pages, struct vm_area_struct **vmas) | 133 | struct page **pages, struct vm_area_struct **vmas) |
130 | { | 134 | { |
135 | struct vm_area_struct *vma; | ||
136 | unsigned long vm_flags; | ||
131 | int i; | 137 | int i; |
132 | static struct vm_area_struct dummy_vma; | 138 | |
139 | /* calculate required read or write permissions. | ||
140 | * - if 'force' is set, we only require the "MAY" flags. | ||
141 | */ | ||
142 | vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | ||
143 | vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | ||
133 | 144 | ||
134 | for (i = 0; i < len; i++) { | 145 | for (i = 0; i < len; i++) { |
146 | vma = find_vma(mm, start); | ||
147 | if (!vma) | ||
148 | goto finish_or_fault; | ||
149 | |||
150 | /* protect what we can, including chardevs */ | ||
151 | if (vma->vm_flags & (VM_IO | VM_PFNMAP) || | ||
152 | !(vm_flags & vma->vm_flags)) | ||
153 | goto finish_or_fault; | ||
154 | |||
135 | if (pages) { | 155 | if (pages) { |
136 | pages[i] = virt_to_page(start); | 156 | pages[i] = virt_to_page(start); |
137 | if (pages[i]) | 157 | if (pages[i]) |
138 | page_cache_get(pages[i]); | 158 | page_cache_get(pages[i]); |
139 | } | 159 | } |
140 | if (vmas) | 160 | if (vmas) |
141 | vmas[i] = &dummy_vma; | 161 | vmas[i] = vma; |
142 | start += PAGE_SIZE; | 162 | start += PAGE_SIZE; |
143 | } | 163 | } |
144 | return(i); | 164 | |
165 | return i; | ||
166 | |||
167 | finish_or_fault: | ||
168 | return i ? : -EFAULT; | ||
145 | } | 169 | } |
146 | 170 | ||
147 | EXPORT_SYMBOL(get_user_pages); | 171 | EXPORT_SYMBOL(get_user_pages); |
@@ -286,6 +310,77 @@ static void show_process_blocks(void) | |||
286 | } | 310 | } |
287 | #endif /* DEBUG */ | 311 | #endif /* DEBUG */ |
288 | 312 | ||
313 | /* | ||
314 | * add a VMA into a process's mm_struct in the appropriate place in the list | ||
315 | * - should be called with mm->mmap_sem held writelocked | ||
316 | */ | ||
317 | static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) | ||
318 | { | ||
319 | struct vm_list_struct **ppv; | ||
320 | |||
321 | for (ppv = ¤t->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) | ||
322 | if ((*ppv)->vma->vm_start > vml->vma->vm_start) | ||
323 | break; | ||
324 | |||
325 | vml->next = *ppv; | ||
326 | *ppv = vml; | ||
327 | } | ||
328 | |||
329 | /* | ||
330 | * look up the first VMA in which addr resides, NULL if none | ||
331 | * - should be called with mm->mmap_sem at least held readlocked | ||
332 | */ | ||
333 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | ||
334 | { | ||
335 | struct vm_list_struct *loop, *vml; | ||
336 | |||
337 | /* search the vm_start ordered list */ | ||
338 | vml = NULL; | ||
339 | for (loop = mm->context.vmlist; loop; loop = loop->next) { | ||
340 | if (loop->vma->vm_start > addr) | ||
341 | break; | ||
342 | vml = loop; | ||
343 | } | ||
344 | |||
345 | if (vml && vml->vma->vm_end > addr) | ||
346 | return vml->vma; | ||
347 | |||
348 | return NULL; | ||
349 | } | ||
350 | EXPORT_SYMBOL(find_vma); | ||
351 | |||
352 | /* | ||
353 | * find a VMA | ||
354 | * - we don't extend stack VMAs under NOMMU conditions | ||
355 | */ | ||
356 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | ||
357 | { | ||
358 | return find_vma(mm, addr); | ||
359 | } | ||
360 | |||
361 | /* | ||
362 | * look up the first VMA that exactly matches addr | ||
363 | * - should be called with mm->mmap_sem at least held readlocked | ||
364 | */ | ||
365 | static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | ||
366 | unsigned long addr) | ||
367 | { | ||
368 | struct vm_list_struct *vml; | ||
369 | |||
370 | /* search the vm_start ordered list */ | ||
371 | for (vml = mm->context.vmlist; vml; vml = vml->next) { | ||
372 | if (vml->vma->vm_start == addr) | ||
373 | return vml->vma; | ||
374 | if (vml->vma->vm_start > addr) | ||
375 | break; | ||
376 | } | ||
377 | |||
378 | return NULL; | ||
379 | } | ||
380 | |||
381 | /* | ||
382 | * find a VMA in the global tree | ||
383 | */ | ||
289 | static inline struct vm_area_struct *find_nommu_vma(unsigned long start) | 384 | static inline struct vm_area_struct *find_nommu_vma(unsigned long start) |
290 | { | 385 | { |
291 | struct vm_area_struct *vma; | 386 | struct vm_area_struct *vma; |
@@ -305,6 +400,9 @@ static inline struct vm_area_struct *find_nommu_vma(unsigned long start) | |||
305 | return NULL; | 400 | return NULL; |
306 | } | 401 | } |
307 | 402 | ||
403 | /* | ||
404 | * add a VMA in the global tree | ||
405 | */ | ||
308 | static void add_nommu_vma(struct vm_area_struct *vma) | 406 | static void add_nommu_vma(struct vm_area_struct *vma) |
309 | { | 407 | { |
310 | struct vm_area_struct *pvma; | 408 | struct vm_area_struct *pvma; |
@@ -351,6 +449,9 @@ static void add_nommu_vma(struct vm_area_struct *vma) | |||
351 | rb_insert_color(&vma->vm_rb, &nommu_vma_tree); | 449 | rb_insert_color(&vma->vm_rb, &nommu_vma_tree); |
352 | } | 450 | } |
353 | 451 | ||
452 | /* | ||
453 | * delete a VMA from the global list | ||
454 | */ | ||
354 | static void delete_nommu_vma(struct vm_area_struct *vma) | 455 | static void delete_nommu_vma(struct vm_area_struct *vma) |
355 | { | 456 | { |
356 | struct address_space *mapping; | 457 | struct address_space *mapping; |
@@ -828,8 +929,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
828 | realalloc += kobjsize(vml); | 929 | realalloc += kobjsize(vml); |
829 | askedalloc += sizeof(*vml); | 930 | askedalloc += sizeof(*vml); |
830 | 931 | ||
831 | vml->next = current->mm->context.vmlist; | 932 | add_vma_to_mm(current->mm, vml); |
832 | current->mm->context.vmlist = vml; | ||
833 | 933 | ||
834 | up_write(&nommu_vma_sem); | 934 | up_write(&nommu_vma_sem); |
835 | 935 | ||
@@ -848,7 +948,8 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
848 | up_write(&nommu_vma_sem); | 948 | up_write(&nommu_vma_sem); |
849 | kfree(vml); | 949 | kfree(vml); |
850 | if (vma) { | 950 | if (vma) { |
851 | fput(vma->vm_file); | 951 | if (vma->vm_file) |
952 | fput(vma->vm_file); | ||
852 | kfree(vma); | 953 | kfree(vma); |
853 | } | 954 | } |
854 | return ret; | 955 | return ret; |
@@ -908,6 +1009,11 @@ static void put_vma(struct vm_area_struct *vma) | |||
908 | } | 1009 | } |
909 | } | 1010 | } |
910 | 1011 | ||
1012 | /* | ||
1013 | * release a mapping | ||
1014 | * - under NOMMU conditions the parameters must exactly match the mapping to | ||
1015 | * be removed | ||
1016 | */ | ||
911 | int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | 1017 | int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) |
912 | { | 1018 | { |
913 | struct vm_list_struct *vml, **parent; | 1019 | struct vm_list_struct *vml, **parent; |
@@ -917,10 +1023,13 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | |||
917 | printk("do_munmap:\n"); | 1023 | printk("do_munmap:\n"); |
918 | #endif | 1024 | #endif |
919 | 1025 | ||
920 | for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) | 1026 | for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { |
1027 | if ((*parent)->vma->vm_start > addr) | ||
1028 | break; | ||
921 | if ((*parent)->vma->vm_start == addr && | 1029 | if ((*parent)->vma->vm_start == addr && |
922 | ((len == 0) || ((*parent)->vma->vm_end == end))) | 1030 | ((len == 0) || ((*parent)->vma->vm_end == end))) |
923 | goto found; | 1031 | goto found; |
1032 | } | ||
924 | 1033 | ||
925 | printk("munmap of non-mmaped memory by process %d (%s): %p\n", | 1034 | printk("munmap of non-mmaped memory by process %d (%s): %p\n", |
926 | current->pid, current->comm, (void *) addr); | 1035 | current->pid, current->comm, (void *) addr); |
@@ -946,7 +1055,20 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | |||
946 | return 0; | 1055 | return 0; |
947 | } | 1056 | } |
948 | 1057 | ||
949 | /* Release all mmaps. */ | 1058 | asmlinkage long sys_munmap(unsigned long addr, size_t len) |
1059 | { | ||
1060 | int ret; | ||
1061 | struct mm_struct *mm = current->mm; | ||
1062 | |||
1063 | down_write(&mm->mmap_sem); | ||
1064 | ret = do_munmap(mm, addr, len); | ||
1065 | up_write(&mm->mmap_sem); | ||
1066 | return ret; | ||
1067 | } | ||
1068 | |||
1069 | /* | ||
1070 | * Release all mappings | ||
1071 | */ | ||
950 | void exit_mmap(struct mm_struct * mm) | 1072 | void exit_mmap(struct mm_struct * mm) |
951 | { | 1073 | { |
952 | struct vm_list_struct *tmp; | 1074 | struct vm_list_struct *tmp; |
@@ -973,37 +1095,26 @@ void exit_mmap(struct mm_struct * mm) | |||
973 | } | 1095 | } |
974 | } | 1096 | } |
975 | 1097 | ||
976 | asmlinkage long sys_munmap(unsigned long addr, size_t len) | ||
977 | { | ||
978 | int ret; | ||
979 | struct mm_struct *mm = current->mm; | ||
980 | |||
981 | down_write(&mm->mmap_sem); | ||
982 | ret = do_munmap(mm, addr, len); | ||
983 | up_write(&mm->mmap_sem); | ||
984 | return ret; | ||
985 | } | ||
986 | |||
987 | unsigned long do_brk(unsigned long addr, unsigned long len) | 1098 | unsigned long do_brk(unsigned long addr, unsigned long len) |
988 | { | 1099 | { |
989 | return -ENOMEM; | 1100 | return -ENOMEM; |
990 | } | 1101 | } |
991 | 1102 | ||
992 | /* | 1103 | /* |
993 | * Expand (or shrink) an existing mapping, potentially moving it at the | 1104 | * expand (or shrink) an existing mapping, potentially moving it at the same |
994 | * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) | 1105 | * time (controlled by the MREMAP_MAYMOVE flag and available VM space) |
995 | * | 1106 | * |
996 | * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise | 1107 | * under NOMMU conditions, we only permit changing a mapping's size, and only |
997 | * This option implies MREMAP_MAYMOVE. | 1108 | * as long as it stays within the hole allocated by the kmalloc() call in |
1109 | * do_mmap_pgoff() and the block is not shareable | ||
998 | * | 1110 | * |
999 | * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the | 1111 | * MREMAP_FIXED is not supported under NOMMU conditions |
1000 | * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable | ||
1001 | */ | 1112 | */ |
1002 | unsigned long do_mremap(unsigned long addr, | 1113 | unsigned long do_mremap(unsigned long addr, |
1003 | unsigned long old_len, unsigned long new_len, | 1114 | unsigned long old_len, unsigned long new_len, |
1004 | unsigned long flags, unsigned long new_addr) | 1115 | unsigned long flags, unsigned long new_addr) |
1005 | { | 1116 | { |
1006 | struct vm_list_struct *vml = NULL; | 1117 | struct vm_area_struct *vma; |
1007 | 1118 | ||
1008 | /* insanity checks first */ | 1119 | /* insanity checks first */ |
1009 | if (new_len == 0) | 1120 | if (new_len == 0) |
@@ -1012,58 +1123,46 @@ unsigned long do_mremap(unsigned long addr, | |||
1012 | if (flags & MREMAP_FIXED && new_addr != addr) | 1123 | if (flags & MREMAP_FIXED && new_addr != addr) |
1013 | return (unsigned long) -EINVAL; | 1124 | return (unsigned long) -EINVAL; |
1014 | 1125 | ||
1015 | for (vml = current->mm->context.vmlist; vml; vml = vml->next) | 1126 | vma = find_vma_exact(current->mm, addr); |
1016 | if (vml->vma->vm_start == addr) | 1127 | if (!vma) |
1017 | goto found; | 1128 | return (unsigned long) -EINVAL; |
1018 | |||
1019 | return (unsigned long) -EINVAL; | ||
1020 | 1129 | ||
1021 | found: | 1130 | if (vma->vm_end != vma->vm_start + old_len) |
1022 | if (vml->vma->vm_end != vml->vma->vm_start + old_len) | ||
1023 | return (unsigned long) -EFAULT; | 1131 | return (unsigned long) -EFAULT; |
1024 | 1132 | ||
1025 | if (vml->vma->vm_flags & VM_MAYSHARE) | 1133 | if (vma->vm_flags & VM_MAYSHARE) |
1026 | return (unsigned long) -EPERM; | 1134 | return (unsigned long) -EPERM; |
1027 | 1135 | ||
1028 | if (new_len > kobjsize((void *) addr)) | 1136 | if (new_len > kobjsize((void *) addr)) |
1029 | return (unsigned long) -ENOMEM; | 1137 | return (unsigned long) -ENOMEM; |
1030 | 1138 | ||
1031 | /* all checks complete - do it */ | 1139 | /* all checks complete - do it */ |
1032 | vml->vma->vm_end = vml->vma->vm_start + new_len; | 1140 | vma->vm_end = vma->vm_start + new_len; |
1033 | 1141 | ||
1034 | askedalloc -= old_len; | 1142 | askedalloc -= old_len; |
1035 | askedalloc += new_len; | 1143 | askedalloc += new_len; |
1036 | 1144 | ||
1037 | return vml->vma->vm_start; | 1145 | return vma->vm_start; |
1038 | } | 1146 | } |
1039 | 1147 | ||
1040 | /* | 1148 | asmlinkage unsigned long sys_mremap(unsigned long addr, |
1041 | * Look up the first VMA which satisfies addr < vm_end, NULL if none | 1149 | unsigned long old_len, unsigned long new_len, |
1042 | */ | 1150 | unsigned long flags, unsigned long new_addr) |
1043 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | ||
1044 | { | 1151 | { |
1045 | struct vm_list_struct *vml; | 1152 | unsigned long ret; |
1046 | |||
1047 | for (vml = mm->context.vmlist; vml; vml = vml->next) | ||
1048 | if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end) | ||
1049 | return vml->vma; | ||
1050 | 1153 | ||
1051 | return NULL; | 1154 | down_write(¤t->mm->mmap_sem); |
1155 | ret = do_mremap(addr, old_len, new_len, flags, new_addr); | ||
1156 | up_write(¤t->mm->mmap_sem); | ||
1157 | return ret; | ||
1052 | } | 1158 | } |
1053 | 1159 | ||
1054 | EXPORT_SYMBOL(find_vma); | ||
1055 | |||
1056 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | 1160 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, |
1057 | unsigned int foll_flags) | 1161 | unsigned int foll_flags) |
1058 | { | 1162 | { |
1059 | return NULL; | 1163 | return NULL; |
1060 | } | 1164 | } |
1061 | 1165 | ||
1062 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | ||
1063 | { | ||
1064 | return NULL; | ||
1065 | } | ||
1066 | |||
1067 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, | 1166 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, |
1068 | unsigned long to, unsigned long size, pgprot_t prot) | 1167 | unsigned long to, unsigned long size, pgprot_t prot) |
1069 | { | 1168 | { |
@@ -1133,7 +1232,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) | |||
1133 | * which are reclaimable, under pressure. The dentry | 1232 | * which are reclaimable, under pressure. The dentry |
1134 | * cache and most inode caches should fall into this | 1233 | * cache and most inode caches should fall into this |
1135 | */ | 1234 | */ |
1136 | free += atomic_read(&slab_reclaim_pages); | 1235 | free += global_page_state(NR_SLAB_RECLAIMABLE); |
1137 | 1236 | ||
1138 | /* | 1237 | /* |
1139 | * Leave the last 3% for root | 1238 | * Leave the last 3% for root |
@@ -1206,3 +1305,44 @@ struct page *filemap_nopage(struct vm_area_struct *area, | |||
1206 | BUG(); | 1305 | BUG(); |
1207 | return NULL; | 1306 | return NULL; |
1208 | } | 1307 | } |
1308 | |||
1309 | /* | ||
1310 | * Access another process' address space. | ||
1311 | * - source/target buffer must be kernel space | ||
1312 | */ | ||
1313 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | ||
1314 | { | ||
1315 | struct vm_area_struct *vma; | ||
1316 | struct mm_struct *mm; | ||
1317 | |||
1318 | if (addr + len < addr) | ||
1319 | return 0; | ||
1320 | |||
1321 | mm = get_task_mm(tsk); | ||
1322 | if (!mm) | ||
1323 | return 0; | ||
1324 | |||
1325 | down_read(&mm->mmap_sem); | ||
1326 | |||
1327 | /* the access must start within one of the target process's mappings */ | ||
1328 | vma = find_vma(mm, addr); | ||
1329 | if (vma) { | ||
1330 | /* don't overrun this mapping */ | ||
1331 | if (addr + len >= vma->vm_end) | ||
1332 | len = vma->vm_end - addr; | ||
1333 | |||
1334 | /* only read or write mappings where it is permitted */ | ||
1335 | if (write && vma->vm_flags & VM_MAYWRITE) | ||
1336 | len -= copy_to_user((void *) addr, buf, len); | ||
1337 | else if (!write && vma->vm_flags & VM_MAYREAD) | ||
1338 | len -= copy_from_user(buf, (void *) addr, len); | ||
1339 | else | ||
1340 | len = 0; | ||
1341 | } else { | ||
1342 | len = 0; | ||
1343 | } | ||
1344 | |||
1345 | up_read(&mm->mmap_sem); | ||
1346 | mmput(mm); | ||
1347 | return len; | ||
1348 | } | ||
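
The nommu helpers added above keep each process's VMAs on a singly linked list sorted by vm_start, so add_vma_to_mm(), find_vma() and find_vma_exact() are all short linear scans that stop as soon as vm_start passes the target address. A simplified, self-contained sketch of that list discipline (the node type is a stand-in, not the kernel's vm_list_struct):

    #include <stddef.h>

    struct vml {
        unsigned long start, end;   /* the region [start, end) */
        struct vml *next;           /* next node, ordered by start */
    };

    /* insert while keeping the list sorted by start (cf. add_vma_to_mm) */
    static void add_sorted(struct vml **head, struct vml *new)
    {
        struct vml **pp;

        for (pp = head; *pp; pp = &(*pp)->next)
            if ((*pp)->start > new->start)
                break;
        new->next = *pp;
        *pp = new;
    }

    /* return the node covering addr, or NULL (cf. the rewritten find_vma) */
    static struct vml *find_covering(struct vml *head, unsigned long addr)
    {
        struct vml *candidate = NULL, *loop;

        for (loop = head; loop; loop = loop->next) {
            if (loop->start > addr)
                break;
            candidate = loop;
        }
        return (candidate && candidate->end > addr) ? candidate : NULL;
    }
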
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b9af136e5c..20f41b082e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -21,6 +21,8 @@ | |||
21 | #include <linux/timex.h> | 21 | #include <linux/timex.h> |
22 | #include <linux/jiffies.h> | 22 | #include <linux/jiffies.h> |
23 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
24 | #include <linux/module.h> | ||
25 | #include <linux/notifier.h> | ||
24 | 26 | ||
25 | int sysctl_panic_on_oom; | 27 | int sysctl_panic_on_oom; |
26 | /* #define DEBUG */ | 28 | /* #define DEBUG */ |
@@ -58,6 +60,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
58 | } | 60 | } |
59 | 61 | ||
60 | /* | 62 | /* |
63 | * swapoff can easily use up all memory, so kill those first. | ||
64 | */ | ||
65 | if (p->flags & PF_SWAPOFF) | ||
66 | return ULONG_MAX; | ||
67 | |||
68 | /* | ||
61 | * The memory size of the process is the basis for the badness. | 69 | * The memory size of the process is the basis for the badness. |
62 | */ | 70 | */ |
63 | points = mm->total_vm; | 71 | points = mm->total_vm; |
@@ -127,6 +135,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
127 | points /= 4; | 135 | points /= 4; |
128 | 136 | ||
129 | /* | 137 | /* |
138 | * If p's nodes don't overlap ours, it may still help to kill p | ||
139 | * because p may have allocated or otherwise mapped memory on | ||
140 | * this node before. However it will be less likely. | ||
141 | */ | ||
142 | if (!cpuset_excl_nodes_overlap(p)) | ||
143 | points /= 8; | ||
144 | |||
145 | /* | ||
130 | * Adjust the score by oomkilladj. | 146 | * Adjust the score by oomkilladj. |
131 | */ | 147 | */ |
132 | if (p->oomkilladj) { | 148 | if (p->oomkilladj) { |
@@ -161,8 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) | |||
161 | 177 | ||
162 | for (z = zonelist->zones; *z; z++) | 178 | for (z = zonelist->zones; *z; z++) |
163 | if (cpuset_zone_allowed(*z, gfp_mask)) | 179 | if (cpuset_zone_allowed(*z, gfp_mask)) |
164 | node_clear((*z)->zone_pgdat->node_id, | 180 | node_clear(zone_to_nid(*z), nodes); |
165 | nodes); | ||
166 | else | 181 | else |
167 | return CONSTRAINT_CPUSET; | 182 | return CONSTRAINT_CPUSET; |
168 | 183 | ||
@@ -189,27 +204,49 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) | |||
189 | do_posix_clock_monotonic_gettime(&uptime); | 204 | do_posix_clock_monotonic_gettime(&uptime); |
190 | do_each_thread(g, p) { | 205 | do_each_thread(g, p) { |
191 | unsigned long points; | 206 | unsigned long points; |
192 | int releasing; | ||
193 | 207 | ||
194 | /* skip the init task with pid == 1 */ | 208 | /* |
195 | if (p->pid == 1) | 209 | * skip kernel threads and tasks which have already released |
196 | continue; | 210 | * their mm. |
197 | if (p->oomkilladj == OOM_DISABLE) | 211 | */ |
212 | if (!p->mm) | ||
198 | continue; | 213 | continue; |
199 | /* If p's nodes don't overlap ours, it won't help to kill p. */ | 214 | /* skip the init task */ |
200 | if (!cpuset_excl_nodes_overlap(p)) | 215 | if (is_init(p)) |
201 | continue; | 216 | continue; |
202 | 217 | ||
203 | /* | 218 | /* |
219 | * This task already has access to memory reserves and is | ||
220 | * being killed. Don't allow any other task access to the | ||
221 | * memory reserve. | ||
222 | * | ||
223 | * Note: this may have a chance of deadlock if it gets | ||
224 | * blocked waiting for another task which itself is waiting | ||
225 | * for memory. Is there a better alternative? | ||
226 | */ | ||
227 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) | ||
228 | return ERR_PTR(-1UL); | ||
229 | |||
230 | /* | ||
204 | * This is in the process of releasing memory so wait for it | 231 | * This is in the process of releasing memory so wait for it |
205 | * to finish before killing some other task by mistake. | 232 | * to finish before killing some other task by mistake. |
233 | * | ||
234 | * However, if p is the current task, we allow the 'kill' to | ||
235 | * go ahead if it is exiting: this will simply set TIF_MEMDIE, | ||
236 | * which will allow it to gain access to memory reserves in | ||
237 | * the process of exiting and releasing its resources. | ||
238 | * Otherwise we could get an easy OOM deadlock. | ||
206 | */ | 239 | */ |
207 | releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || | 240 | if (p->flags & PF_EXITING) { |
208 | p->flags & PF_EXITING; | 241 | if (p != current) |
209 | if (releasing && !(p->flags & PF_DEAD)) | 242 | return ERR_PTR(-1UL); |
210 | return ERR_PTR(-1UL); | 243 | |
211 | if (p->flags & PF_SWAPOFF) | 244 | chosen = p; |
212 | return p; | 245 | *ppoints = ULONG_MAX; |
246 | } | ||
247 | |||
248 | if (p->oomkilladj == OOM_DISABLE) | ||
249 | continue; | ||
213 | 250 | ||
214 | points = badness(p, uptime.tv_sec); | 251 | points = badness(p, uptime.tv_sec); |
215 | if (points > *ppoints || !chosen) { | 252 | if (points > *ppoints || !chosen) { |
@@ -217,32 +254,33 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) | |||
217 | *ppoints = points; | 254 | *ppoints = points; |
218 | } | 255 | } |
219 | } while_each_thread(g, p); | 256 | } while_each_thread(g, p); |
257 | |||
220 | return chosen; | 258 | return chosen; |
221 | } | 259 | } |
222 | 260 | ||
223 | /** | 261 | /** |
224 | * We must be careful though to never send SIGKILL a process with | 262 | * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO |
225 | * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that | 263 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO |
226 | * we select a process with CAP_SYS_RAW_IO set). | 264 | * set. |
227 | */ | 265 | */ |
228 | static void __oom_kill_task(struct task_struct *p, const char *message) | 266 | static void __oom_kill_task(struct task_struct *p, const char *message) |
229 | { | 267 | { |
230 | if (p->pid == 1) { | 268 | if (is_init(p)) { |
231 | WARN_ON(1); | 269 | WARN_ON(1); |
232 | printk(KERN_WARNING "tried to kill init!\n"); | 270 | printk(KERN_WARNING "tried to kill init!\n"); |
233 | return; | 271 | return; |
234 | } | 272 | } |
235 | 273 | ||
236 | task_lock(p); | 274 | if (!p->mm) { |
237 | if (!p->mm || p->mm == &init_mm) { | ||
238 | WARN_ON(1); | 275 | WARN_ON(1); |
239 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | 276 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); |
240 | task_unlock(p); | ||
241 | return; | 277 | return; |
242 | } | 278 | } |
243 | task_unlock(p); | 279 | |
244 | printk(KERN_ERR "%s: Killed process %d (%s).\n", | 280 | if (message) { |
281 | printk(KERN_ERR "%s: Killed process %d (%s).\n", | ||
245 | message, p->pid, p->comm); | 282 | message, p->pid, p->comm); |
283 | } | ||
246 | 284 | ||
247 | /* | 285 | /* |
248 | * We give our sacrificial lamb high priority and access to | 286 | * We give our sacrificial lamb high priority and access to |
@@ -271,7 +309,7 @@ static int oom_kill_task(struct task_struct *p, const char *message) | |||
271 | * However, this is of no concern to us. | 309 | * However, this is of no concern to us. |
272 | */ | 310 | */ |
273 | 311 | ||
274 | if (mm == NULL || mm == &init_mm) | 312 | if (mm == NULL) |
275 | return 1; | 313 | return 1; |
276 | 314 | ||
277 | __oom_kill_task(p, message); | 315 | __oom_kill_task(p, message); |
@@ -293,8 +331,17 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, | |||
293 | struct task_struct *c; | 331 | struct task_struct *c; |
294 | struct list_head *tsk; | 332 | struct list_head *tsk; |
295 | 333 | ||
296 | printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and " | 334 | /* |
297 | "children.\n", p->pid, p->comm, points); | 335 | * If the task is already exiting, don't alarm the sysadmin or kill |
336 | * its children or threads, just set TIF_MEMDIE so it can die quickly | ||
337 | */ | ||
338 | if (p->flags & PF_EXITING) { | ||
339 | __oom_kill_task(p, NULL); | ||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li" | ||
344 | " and children.\n", p->pid, p->comm, points); | ||
298 | /* Try to kill a child first */ | 345 | /* Try to kill a child first */ |
299 | list_for_each(tsk, &p->children) { | 346 | list_for_each(tsk, &p->children) { |
300 | c = list_entry(tsk, struct task_struct, sibling); | 347 | c = list_entry(tsk, struct task_struct, sibling); |
@@ -306,6 +353,20 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, | |||
306 | return oom_kill_task(p, message); | 353 | return oom_kill_task(p, message); |
307 | } | 354 | } |
308 | 355 | ||
356 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); | ||
357 | |||
358 | int register_oom_notifier(struct notifier_block *nb) | ||
359 | { | ||
360 | return blocking_notifier_chain_register(&oom_notify_list, nb); | ||
361 | } | ||
362 | EXPORT_SYMBOL_GPL(register_oom_notifier); | ||
363 | |||
364 | int unregister_oom_notifier(struct notifier_block *nb) | ||
365 | { | ||
366 | return blocking_notifier_chain_unregister(&oom_notify_list, nb); | ||
367 | } | ||
368 | EXPORT_SYMBOL_GPL(unregister_oom_notifier); | ||
369 | |||
309 | /** | 370 | /** |
310 | * out_of_memory - kill the "best" process when we run out of memory | 371 | * out_of_memory - kill the "best" process when we run out of memory |
311 | * | 372 | * |
@@ -318,10 +379,17 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
318 | { | 379 | { |
319 | struct task_struct *p; | 380 | struct task_struct *p; |
320 | unsigned long points = 0; | 381 | unsigned long points = 0; |
382 | unsigned long freed = 0; | ||
383 | |||
384 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); | ||
385 | if (freed > 0) | ||
386 | /* Got some memory back in the last second. */ | ||
387 | return; | ||
321 | 388 | ||
322 | if (printk_ratelimit()) { | 389 | if (printk_ratelimit()) { |
323 | printk("oom-killer: gfp_mask=0x%x, order=%d\n", | 390 | printk(KERN_WARNING "%s invoked oom-killer: " |
324 | gfp_mask, order); | 391 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", |
392 | current->comm, gfp_mask, order, current->oomkilladj); | ||
325 | dump_stack(); | 393 | dump_stack(); |
326 | show_mem(); | 394 | show_mem(); |
327 | } | 395 | } |
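
The new oom_notify_list gives other subsystems a chance to give memory back before a victim is chosen: out_of_memory() calls the chain with a pointer to a page count and returns early if anything was freed. A hedged sketch of a client, where shrink_my_cache() is hypothetical and the header declaring register_oom_notifier() is assumed to be linux/swap.h in this tree:

    #include <linux/notifier.h>
    #include <linux/swap.h>    /* assumed location of register_oom_notifier() */

    /* hypothetical helper: drop cached objects, return the pages released */
    extern unsigned long shrink_my_cache(void);

    static int my_oom_notify(struct notifier_block *nb,
                             unsigned long unused, void *arg)
    {
        unsigned long *freed = arg;

        /* report pages given back; a non-zero total aborts the OOM kill */
        *freed += shrink_my_cache();
        return NOTIFY_OK;
    }

    static struct notifier_block my_oom_nb = {
        .notifier_call = my_oom_notify,
    };

    /* module init/exit would pair these calls:
     *   register_oom_notifier(&my_oom_nb);
     *   unregister_oom_notifier(&my_oom_nb);
     */
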
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e630188ccc..c0d4ce144d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -23,12 +23,15 @@ | |||
23 | #include <linux/backing-dev.h> | 23 | #include <linux/backing-dev.h> |
24 | #include <linux/blkdev.h> | 24 | #include <linux/blkdev.h> |
25 | #include <linux/mpage.h> | 25 | #include <linux/mpage.h> |
26 | #include <linux/rmap.h> | ||
26 | #include <linux/percpu.h> | 27 | #include <linux/percpu.h> |
27 | #include <linux/notifier.h> | 28 | #include <linux/notifier.h> |
28 | #include <linux/smp.h> | 29 | #include <linux/smp.h> |
29 | #include <linux/sysctl.h> | 30 | #include <linux/sysctl.h> |
30 | #include <linux/cpu.h> | 31 | #include <linux/cpu.h> |
31 | #include <linux/syscalls.h> | 32 | #include <linux/syscalls.h> |
33 | #include <linux/buffer_head.h> | ||
34 | #include <linux/pagevec.h> | ||
32 | 35 | ||
33 | /* | 36 | /* |
34 | * The maximum number of pages to writeout in a single bdflush/kupdate | 37 | * The maximum number of pages to writeout in a single bdflush/kupdate |
@@ -45,7 +48,6 @@ | |||
45 | */ | 48 | */ |
46 | static long ratelimit_pages = 32; | 49 | static long ratelimit_pages = 32; |
47 | 50 | ||
48 | static long total_pages; /* The total number of pages in the machine. */ | ||
49 | static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ | 51 | static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ |
50 | 52 | ||
51 | /* | 53 | /* |
@@ -125,7 +127,7 @@ get_dirty_limits(long *pbackground, long *pdirty, | |||
125 | int unmapped_ratio; | 127 | int unmapped_ratio; |
126 | long background; | 128 | long background; |
127 | long dirty; | 129 | long dirty; |
128 | unsigned long available_memory = total_pages; | 130 | unsigned long available_memory = vm_total_pages; |
129 | struct task_struct *tsk; | 131 | struct task_struct *tsk; |
130 | 132 | ||
131 | #ifdef CONFIG_HIGHMEM | 133 | #ifdef CONFIG_HIGHMEM |
@@ -140,7 +142,7 @@ get_dirty_limits(long *pbackground, long *pdirty, | |||
140 | 142 | ||
141 | unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + | 143 | unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + |
142 | global_page_state(NR_ANON_PAGES)) * 100) / | 144 | global_page_state(NR_ANON_PAGES)) * 100) / |
143 | total_pages; | 145 | vm_total_pages; |
144 | 146 | ||
145 | dirty_ratio = vm_dirty_ratio; | 147 | dirty_ratio = vm_dirty_ratio; |
146 | if (dirty_ratio > unmapped_ratio / 2) | 148 | if (dirty_ratio > unmapped_ratio / 2) |
@@ -243,6 +245,16 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
243 | pdflush_operation(background_writeout, 0); | 245 | pdflush_operation(background_writeout, 0); |
244 | } | 246 | } |
245 | 247 | ||
248 | void set_page_dirty_balance(struct page *page) | ||
249 | { | ||
250 | if (set_page_dirty(page)) { | ||
251 | struct address_space *mapping = page_mapping(page); | ||
252 | |||
253 | if (mapping) | ||
254 | balance_dirty_pages_ratelimited(mapping); | ||
255 | } | ||
256 | } | ||
257 | |||
246 | /** | 258 | /** |
247 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state | 259 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state |
248 | * @mapping: address_space which was dirtied | 260 | * @mapping: address_space which was dirtied |
@@ -491,9 +503,9 @@ void laptop_sync_completion(void) | |||
491 | * will write six megabyte chunks, max. | 503 | * will write six megabyte chunks, max. |
492 | */ | 504 | */ |
493 | 505 | ||
494 | static void set_ratelimit(void) | 506 | void writeback_set_ratelimit(void) |
495 | { | 507 | { |
496 | ratelimit_pages = total_pages / (num_online_cpus() * 32); | 508 | ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); |
497 | if (ratelimit_pages < 16) | 509 | if (ratelimit_pages < 16) |
498 | ratelimit_pages = 16; | 510 | ratelimit_pages = 16; |
499 | if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) | 511 | if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) |
@@ -503,7 +515,7 @@ static void set_ratelimit(void) | |||
503 | static int __cpuinit | 515 | static int __cpuinit |
504 | ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) | 516 | ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) |
505 | { | 517 | { |
506 | set_ratelimit(); | 518 | writeback_set_ratelimit(); |
507 | return 0; | 519 | return 0; |
508 | } | 520 | } |
509 | 521 | ||
@@ -522,9 +534,7 @@ void __init page_writeback_init(void) | |||
522 | long buffer_pages = nr_free_buffer_pages(); | 534 | long buffer_pages = nr_free_buffer_pages(); |
523 | long correction; | 535 | long correction; |
524 | 536 | ||
525 | total_pages = nr_free_pagecache_pages(); | 537 | correction = (100 * 4 * buffer_pages) / vm_total_pages; |
526 | |||
527 | correction = (100 * 4 * buffer_pages) / total_pages; | ||
528 | 538 | ||
529 | if (correction < 100) { | 539 | if (correction < 100) { |
530 | dirty_background_ratio *= correction; | 540 | dirty_background_ratio *= correction; |
@@ -538,10 +548,143 @@ void __init page_writeback_init(void) | |||
538 | vm_dirty_ratio = 1; | 548 | vm_dirty_ratio = 1; |
539 | } | 549 | } |
540 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); | 550 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); |
541 | set_ratelimit(); | 551 | writeback_set_ratelimit(); |
542 | register_cpu_notifier(&ratelimit_nb); | 552 | register_cpu_notifier(&ratelimit_nb); |
543 | } | 553 | } |
544 | 554 | ||
555 | /** | ||
556 | * generic_writepages - walk the list of dirty pages of the given | ||
557 | * address space and writepage() all of them. | ||
558 | * | ||
559 | * @mapping: address space structure to write | ||
560 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write | ||
561 | * | ||
562 | * This is a library function, which implements the writepages() | ||
563 | * address_space_operation. | ||
564 | * | ||
565 | * If a page is already under I/O, generic_writepages() skips it, even | ||
566 | * if it's dirty. This is desirable behaviour for memory-cleaning writeback, | ||
567 | * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() | ||
568 | * and msync() need to guarantee that all the data which was dirty at the time | ||
569 | * the call was made get new I/O started against them. If wbc->sync_mode is | ||
570 | * WB_SYNC_ALL then we were called for data integrity and we must wait for | ||
571 | * existing IO to complete. | ||
572 | * | ||
573 | * Derived from mpage_writepages() - if you fix this you should check that | ||
574 | * also! | ||
575 | */ | ||
576 | int generic_writepages(struct address_space *mapping, | ||
577 | struct writeback_control *wbc) | ||
578 | { | ||
579 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
580 | int ret = 0; | ||
581 | int done = 0; | ||
582 | int (*writepage)(struct page *page, struct writeback_control *wbc); | ||
583 | struct pagevec pvec; | ||
584 | int nr_pages; | ||
585 | pgoff_t index; | ||
586 | pgoff_t end; /* Inclusive */ | ||
587 | int scanned = 0; | ||
588 | int range_whole = 0; | ||
589 | |||
590 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | ||
591 | wbc->encountered_congestion = 1; | ||
592 | return 0; | ||
593 | } | ||
594 | |||
595 | writepage = mapping->a_ops->writepage; | ||
596 | |||
597 | /* deal with chardevs and other special file */ | ||
598 | if (!writepage) | ||
599 | return 0; | ||
600 | |||
601 | pagevec_init(&pvec, 0); | ||
602 | if (wbc->range_cyclic) { | ||
603 | index = mapping->writeback_index; /* Start from prev offset */ | ||
604 | end = -1; | ||
605 | } else { | ||
606 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | ||
607 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | ||
608 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | ||
609 | range_whole = 1; | ||
610 | scanned = 1; | ||
611 | } | ||
612 | retry: | ||
613 | while (!done && (index <= end) && | ||
614 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | ||
615 | PAGECACHE_TAG_DIRTY, | ||
616 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { | ||
617 | unsigned i; | ||
618 | |||
619 | scanned = 1; | ||
620 | for (i = 0; i < nr_pages; i++) { | ||
621 | struct page *page = pvec.pages[i]; | ||
622 | |||
623 | /* | ||
624 | * At this point we hold neither mapping->tree_lock nor | ||
625 | * lock on the page itself: the page may be truncated or | ||
626 | * invalidated (changing page->mapping to NULL), or even | ||
627 | * swizzled back from swapper_space to tmpfs file | ||
628 | * mapping | ||
629 | */ | ||
630 | lock_page(page); | ||
631 | |||
632 | if (unlikely(page->mapping != mapping)) { | ||
633 | unlock_page(page); | ||
634 | continue; | ||
635 | } | ||
636 | |||
637 | if (!wbc->range_cyclic && page->index > end) { | ||
638 | done = 1; | ||
639 | unlock_page(page); | ||
640 | continue; | ||
641 | } | ||
642 | |||
643 | if (wbc->sync_mode != WB_SYNC_NONE) | ||
644 | wait_on_page_writeback(page); | ||
645 | |||
646 | if (PageWriteback(page) || | ||
647 | !clear_page_dirty_for_io(page)) { | ||
648 | unlock_page(page); | ||
649 | continue; | ||
650 | } | ||
651 | |||
652 | ret = (*writepage)(page, wbc); | ||
653 | if (ret) { | ||
654 | if (ret == -ENOSPC) | ||
655 | set_bit(AS_ENOSPC, &mapping->flags); | ||
656 | else | ||
657 | set_bit(AS_EIO, &mapping->flags); | ||
658 | } | ||
659 | |||
660 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) | ||
661 | unlock_page(page); | ||
662 | if (ret || (--(wbc->nr_to_write) <= 0)) | ||
663 | done = 1; | ||
664 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | ||
665 | wbc->encountered_congestion = 1; | ||
666 | done = 1; | ||
667 | } | ||
668 | } | ||
669 | pagevec_release(&pvec); | ||
670 | cond_resched(); | ||
671 | } | ||
672 | if (!scanned && !done) { | ||
673 | /* | ||
674 | * We hit the last page and there is more work to be done: wrap | ||
675 | * back to the start of the file | ||
676 | */ | ||
677 | scanned = 1; | ||
678 | index = 0; | ||
679 | goto retry; | ||
680 | } | ||
681 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | ||
682 | mapping->writeback_index = index; | ||
683 | return ret; | ||
684 | } | ||
685 | |||
686 | EXPORT_SYMBOL(generic_writepages); | ||
687 | |||
545 | int do_writepages(struct address_space *mapping, struct writeback_control *wbc) | 688 | int do_writepages(struct address_space *mapping, struct writeback_control *wbc) |
546 | { | 689 | { |
547 | int ret; | 690 | int ret; |
@@ -550,7 +693,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) | |||
550 | return 0; | 693 | return 0; |
551 | wbc->for_writepages = 1; | 694 | wbc->for_writepages = 1; |
552 | if (mapping->a_ops->writepages) | 695 | if (mapping->a_ops->writepages) |
553 | ret = mapping->a_ops->writepages(mapping, wbc); | 696 | ret = mapping->a_ops->writepages(mapping, wbc); |
554 | else | 697 | else |
555 | ret = generic_writepages(mapping, wbc); | 698 | ret = generic_writepages(mapping, wbc); |
556 | wbc->for_writepages = 0; | 699 | wbc->for_writepages = 0; |
@@ -664,9 +807,11 @@ int fastcall set_page_dirty(struct page *page) | |||
664 | 807 | ||
665 | if (likely(mapping)) { | 808 | if (likely(mapping)) { |
666 | int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; | 809 | int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; |
667 | if (spd) | 810 | #ifdef CONFIG_BLOCK |
668 | return (*spd)(page); | 811 | if (!spd) |
669 | return __set_page_dirty_buffers(page); | 812 | spd = __set_page_dirty_buffers; |
813 | #endif | ||
814 | return (*spd)(page); | ||
670 | } | 815 | } |
671 | if (!PageDirty(page)) { | 816 | if (!PageDirty(page)) { |
672 | if (!TestSetPageDirty(page)) | 817 | if (!TestSetPageDirty(page)) |
@@ -690,7 +835,7 @@ int set_page_dirty_lock(struct page *page) | |||
690 | { | 835 | { |
691 | int ret; | 836 | int ret; |
692 | 837 | ||
693 | lock_page(page); | 838 | lock_page_nosync(page); |
694 | ret = set_page_dirty(page); | 839 | ret = set_page_dirty(page); |
695 | unlock_page(page); | 840 | unlock_page(page); |
696 | return ret; | 841 | return ret; |
@@ -712,9 +857,15 @@ int test_clear_page_dirty(struct page *page) | |||
712 | radix_tree_tag_clear(&mapping->page_tree, | 857 | radix_tree_tag_clear(&mapping->page_tree, |
713 | page_index(page), | 858 | page_index(page), |
714 | PAGECACHE_TAG_DIRTY); | 859 | PAGECACHE_TAG_DIRTY); |
715 | if (mapping_cap_account_dirty(mapping)) | ||
716 | __dec_zone_page_state(page, NR_FILE_DIRTY); | ||
717 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 860 | write_unlock_irqrestore(&mapping->tree_lock, flags); |
861 | /* | ||
862 | * We can continue to use `mapping' here because the | ||
863 | * page is locked, which pins the address_space | ||
864 | */ | ||
865 | if (mapping_cap_account_dirty(mapping)) { | ||
866 | page_mkclean(page); | ||
867 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
868 | } | ||
718 | return 1; | 869 | return 1; |
719 | } | 870 | } |
720 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 871 | write_unlock_irqrestore(&mapping->tree_lock, flags); |
@@ -744,8 +895,10 @@ int clear_page_dirty_for_io(struct page *page) | |||
744 | 895 | ||
745 | if (mapping) { | 896 | if (mapping) { |
746 | if (TestClearPageDirty(page)) { | 897 | if (TestClearPageDirty(page)) { |
747 | if (mapping_cap_account_dirty(mapping)) | 898 | if (mapping_cap_account_dirty(mapping)) { |
899 | page_mkclean(page); | ||
748 | dec_zone_page_state(page, NR_FILE_DIRTY); | 900 | dec_zone_page_state(page, NR_FILE_DIRTY); |
901 | } | ||
749 | return 1; | 902 | return 1; |
750 | } | 903 | } |
751 | return 0; | 904 | return 0; |
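Both test_clear_page_dirty() and clear_page_dirty_for_io() now follow the same accounting pattern: only when the dirty bit was actually set do they write-protect the mappings (page_mkclean) and drop the NR_FILE_DIRTY counter. A simplified, single-threaded illustration with stand-in types:

/* Accounting sketch: clear the dirty bit, then mkclean + counter decrement. */
#include <stdio.h>
#include <stdbool.h>

static long nr_file_dirty = 1;          /* NR_FILE_DIRTY stand-in */

struct fake_page { bool dirty; bool accounted; };

static void page_mkclean_stub(struct fake_page *page)
{
    (void)page;                         /* kernel: clear pte_dirty in every mapping */
}

static int clear_dirty_for_io(struct fake_page *page)
{
    if (page->dirty) {                  /* TestClearPageDirty() analogue */
        page->dirty = false;
        if (page->accounted) {          /* mapping_cap_account_dirty() analogue */
            page_mkclean_stub(page);
            nr_file_dirty--;
        }
        return 1;
    }
    return 0;
}

int main(void)
{
    struct fake_page p = { .dirty = true, .accounted = true };
    printf("cleared=%d nr_file_dirty=%ld\n", clear_dirty_for_io(&p), nr_file_dirty);
    return 0;
}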
@@ -803,6 +956,15 @@ int test_set_page_writeback(struct page *page) | |||
803 | EXPORT_SYMBOL(test_set_page_writeback); | 956 | EXPORT_SYMBOL(test_set_page_writeback); |
804 | 957 | ||
805 | /* | 958 | /* |
959 | * Wakes up tasks that are being throttled due to writeback congestion | ||
960 | */ | ||
961 | void writeback_congestion_end(void) | ||
962 | { | ||
963 | blk_congestion_end(WRITE); | ||
964 | } | ||
965 | EXPORT_SYMBOL(writeback_congestion_end); | ||
966 | |||
967 | /* | ||
806 | * Return true if any of the pages in the mapping are marked with the | 968 | * Return true if any of the pages in the mapping are marked with the |
807 | * passed tag. | 969 | * passed tag. |
808 | */ | 970 | */ |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 54a4f5375b..4f59d90b81 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -37,6 +37,8 @@ | |||
37 | #include <linux/vmalloc.h> | 37 | #include <linux/vmalloc.h> |
38 | #include <linux/mempolicy.h> | 38 | #include <linux/mempolicy.h> |
39 | #include <linux/stop_machine.h> | 39 | #include <linux/stop_machine.h> |
40 | #include <linux/sort.h> | ||
41 | #include <linux/pfn.h> | ||
40 | 42 | ||
41 | #include <asm/tlbflush.h> | 43 | #include <asm/tlbflush.h> |
42 | #include <asm/div64.h> | 44 | #include <asm/div64.h> |
@@ -51,7 +53,6 @@ EXPORT_SYMBOL(node_online_map); | |||
51 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; | 53 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; |
52 | EXPORT_SYMBOL(node_possible_map); | 54 | EXPORT_SYMBOL(node_possible_map); |
53 | unsigned long totalram_pages __read_mostly; | 55 | unsigned long totalram_pages __read_mostly; |
54 | unsigned long totalhigh_pages __read_mostly; | ||
55 | unsigned long totalreserve_pages __read_mostly; | 56 | unsigned long totalreserve_pages __read_mostly; |
56 | long nr_swap_pages; | 57 | long nr_swap_pages; |
57 | int percpu_pagelist_fraction; | 58 | int percpu_pagelist_fraction; |
@@ -69,7 +70,15 @@ static void __free_pages_ok(struct page *page, unsigned int order); | |||
69 | * TBD: should special case ZONE_DMA32 machines here - in those we normally | 70 | * TBD: should special case ZONE_DMA32 machines here - in those we normally |
70 | * don't need any ZONE_NORMAL reservation | 71 | * don't need any ZONE_NORMAL reservation |
71 | */ | 72 | */ |
72 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; | 73 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { |
74 | 256, | ||
75 | #ifdef CONFIG_ZONE_DMA32 | ||
76 | 256, | ||
77 | #endif | ||
78 | #ifdef CONFIG_HIGHMEM | ||
79 | 32 | ||
80 | #endif | ||
81 | }; | ||
73 | 82 | ||
74 | EXPORT_SYMBOL(totalram_pages); | 83 | EXPORT_SYMBOL(totalram_pages); |
75 | 84 | ||
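The sysctl_lowmem_reserve_ratio[] array now only carries entries for zones that are actually configured. As a reminder of what the ratios mean, here is a rough sketch of the reserve calculation they feed (the real work happens in setup_per_zone_lowmem_reserve(); zone layout and page counts below are made up):

/* Rough sketch: each lower zone holds back (higher-zone pages) / ratio pages. */
#include <stdio.h>

#define NR_ZONES 3
static const char *zone_names[NR_ZONES] = { "DMA", "Normal", "HighMem" };
static unsigned long present_pages[NR_ZONES] = { 4096, 225280, 32768 };
static int ratio[NR_ZONES - 1] = { 256, 32 };   /* DMA, Normal */

int main(void)
{
    for (int j = 0; j < NR_ZONES - 1; j++) {
        unsigned long higher = 0;
        for (int i = j + 1; i < NR_ZONES; i++)
            higher += present_pages[i];         /* pages that could go higher up */
        printf("%s keeps back roughly %lu pages from higher-zone allocations\n",
               zone_names[j], higher / ratio[j]);
    }
    return 0;
}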
@@ -80,11 +89,53 @@ EXPORT_SYMBOL(totalram_pages); | |||
80 | struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; | 89 | struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; |
81 | EXPORT_SYMBOL(zone_table); | 90 | EXPORT_SYMBOL(zone_table); |
82 | 91 | ||
83 | static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; | 92 | static char *zone_names[MAX_NR_ZONES] = { |
93 | "DMA", | ||
94 | #ifdef CONFIG_ZONE_DMA32 | ||
95 | "DMA32", | ||
96 | #endif | ||
97 | "Normal", | ||
98 | #ifdef CONFIG_HIGHMEM | ||
99 | "HighMem" | ||
100 | #endif | ||
101 | }; | ||
102 | |||
84 | int min_free_kbytes = 1024; | 103 | int min_free_kbytes = 1024; |
85 | 104 | ||
86 | unsigned long __meminitdata nr_kernel_pages; | 105 | unsigned long __meminitdata nr_kernel_pages; |
87 | unsigned long __meminitdata nr_all_pages; | 106 | unsigned long __meminitdata nr_all_pages; |
107 | static unsigned long __initdata dma_reserve; | ||
108 | |||
109 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
110 | /* | ||
111 | * MAX_ACTIVE_REGIONS determines the maximum number of distinct | ||
112 | * ranges of memory (RAM) that may be registered with add_active_range(). | ||
113 | * Ranges passed to add_active_range() will be merged if possible | ||
114 | * so the number of times add_active_range() can be called is | ||
115 | * related to the number of nodes and the number of holes | ||
116 | */ | ||
117 | #ifdef CONFIG_MAX_ACTIVE_REGIONS | ||
118 | /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ | ||
119 | #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS | ||
120 | #else | ||
121 | #if MAX_NUMNODES >= 32 | ||
122 | /* If there can be many nodes, allow up to 50 holes per node */ | ||
123 | #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) | ||
124 | #else | ||
125 | /* By default, allow up to 256 distinct regions */ | ||
126 | #define MAX_ACTIVE_REGIONS 256 | ||
127 | #endif | ||
128 | #endif | ||
129 | |||
130 | struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; | ||
131 | int __initdata nr_nodemap_entries; | ||
132 | unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | ||
133 | unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | ||
134 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
135 | unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; | ||
136 | unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; | ||
137 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
138 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
88 | 139 | ||
89 | #ifdef CONFIG_DEBUG_VM | 140 | #ifdef CONFIG_DEBUG_VM |
90 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 141 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
@@ -127,7 +178,6 @@ static int bad_range(struct zone *zone, struct page *page) | |||
127 | 178 | ||
128 | return 0; | 179 | return 0; |
129 | } | 180 | } |
130 | |||
131 | #else | 181 | #else |
132 | static inline int bad_range(struct zone *zone, struct page *page) | 182 | static inline int bad_range(struct zone *zone, struct page *page) |
133 | { | 183 | { |
@@ -218,12 +268,12 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | |||
218 | { | 268 | { |
219 | int i; | 269 | int i; |
220 | 270 | ||
221 | BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); | 271 | VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); |
222 | /* | 272 | /* |
223 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO | 273 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO |
224 | * and __GFP_HIGHMEM from hard or soft interrupt context. | 274 | * and __GFP_HIGHMEM from hard or soft interrupt context. |
225 | */ | 275 | */ |
226 | BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); | 276 | VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); |
227 | for (i = 0; i < (1 << order); i++) | 277 | for (i = 0; i < (1 << order); i++) |
228 | clear_highpage(page + i); | 278 | clear_highpage(page + i); |
229 | } | 279 | } |
@@ -347,8 +397,8 @@ static inline void __free_one_page(struct page *page, | |||
347 | 397 | ||
348 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 398 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
349 | 399 | ||
350 | BUG_ON(page_idx & (order_size - 1)); | 400 | VM_BUG_ON(page_idx & (order_size - 1)); |
351 | BUG_ON(bad_range(zone, page)); | 401 | VM_BUG_ON(bad_range(zone, page)); |
352 | 402 | ||
353 | zone->free_pages += order_size; | 403 | zone->free_pages += order_size; |
354 | while (order < MAX_ORDER-1) { | 404 | while (order < MAX_ORDER-1) { |
@@ -421,7 +471,7 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
421 | while (count--) { | 471 | while (count--) { |
422 | struct page *page; | 472 | struct page *page; |
423 | 473 | ||
424 | BUG_ON(list_empty(list)); | 474 | VM_BUG_ON(list_empty(list)); |
425 | page = list_entry(list->prev, struct page, lru); | 475 | page = list_entry(list->prev, struct page, lru); |
426 | /* have to delete it as __free_one_page list manipulates */ | 476 | /* have to delete it as __free_one_page list manipulates */ |
427 | list_del(&page->lru); | 477 | list_del(&page->lru); |
@@ -432,9 +482,11 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
432 | 482 | ||
433 | static void free_one_page(struct zone *zone, struct page *page, int order) | 483 | static void free_one_page(struct zone *zone, struct page *page, int order) |
434 | { | 484 | { |
435 | LIST_HEAD(list); | 485 | spin_lock(&zone->lock); |
436 | list_add(&page->lru, &list); | 486 | zone->all_unreclaimable = 0; |
437 | free_pages_bulk(zone, 1, &list, order); | 487 | zone->pages_scanned = 0; |
488 | __free_one_page(page, zone, order); | ||
489 | spin_unlock(&zone->lock); | ||
438 | } | 490 | } |
439 | 491 | ||
440 | static void __free_pages_ok(struct page *page, unsigned int order) | 492 | static void __free_pages_ok(struct page *page, unsigned int order) |
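free_one_page() now takes zone->lock itself and hands the page straight to __free_one_page(), the buddy coalescing routine, instead of going through the bulk path. For readers who do not have the buddy allocator in their head, a very small user-space sketch of that merging step, with free-list bookkeeping reduced to an array:

/* Buddy merge sketch: buddy of block idx at order o is idx ^ (1 << o). */
#include <stdio.h>

#define MAX_ORDER 4
#define NPAGES (1 << MAX_ORDER)

static int free_order[NPAGES];      /* order of a free block head, -1 otherwise */

static void buddy_free(unsigned long idx, int order)
{
    while (order < MAX_ORDER - 1) {
        unsigned long buddy = idx ^ (1UL << order);

        if (free_order[buddy] != order)
            break;                  /* buddy busy or split differently: stop */
        free_order[buddy] = -1;     /* pull the buddy off its free list */
        idx &= buddy;               /* head of the combined, larger block */
        order++;
    }
    free_order[idx] = order;
    printf("freed block at %lu, order %d\n", idx, order);
}

int main(void)
{
    for (int i = 0; i < NPAGES; i++)
        free_order[i] = -1;
    buddy_free(2, 1);               /* pages 2-3 stay order 1 */
    buddy_free(0, 1);               /* pages 0-1 merge with 2-3 into order 2 */
    return 0;
}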
@@ -512,7 +564,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
512 | area--; | 564 | area--; |
513 | high--; | 565 | high--; |
514 | size >>= 1; | 566 | size >>= 1; |
515 | BUG_ON(bad_range(zone, &page[size])); | 567 | VM_BUG_ON(bad_range(zone, &page[size])); |
516 | list_add(&page[size].lru, &area->free_list); | 568 | list_add(&page[size].lru, &area->free_list); |
517 | area->nr_free++; | 569 | area->nr_free++; |
518 | set_page_order(&page[size], high); | 570 | set_page_order(&page[size], high); |
@@ -615,19 +667,23 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
615 | #ifdef CONFIG_NUMA | 667 | #ifdef CONFIG_NUMA |
616 | /* | 668 | /* |
617 | * Called from the slab reaper to drain pagesets on a particular node that | 669 | * Called from the slab reaper to drain pagesets on a particular node that |
618 | * belong to the currently executing processor. | 670 | * belongs to the currently executing processor. |
619 | * Note that this function must be called with the thread pinned to | 671 | * Note that this function must be called with the thread pinned to |
620 | * a single processor. | 672 | * a single processor. |
621 | */ | 673 | */ |
622 | void drain_node_pages(int nodeid) | 674 | void drain_node_pages(int nodeid) |
623 | { | 675 | { |
624 | int i, z; | 676 | int i; |
677 | enum zone_type z; | ||
625 | unsigned long flags; | 678 | unsigned long flags; |
626 | 679 | ||
627 | for (z = 0; z < MAX_NR_ZONES; z++) { | 680 | for (z = 0; z < MAX_NR_ZONES; z++) { |
628 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; | 681 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; |
629 | struct per_cpu_pageset *pset; | 682 | struct per_cpu_pageset *pset; |
630 | 683 | ||
684 | if (!populated_zone(zone)) | ||
685 | continue; | ||
686 | |||
631 | pset = zone_pcp(zone, smp_processor_id()); | 687 | pset = zone_pcp(zone, smp_processor_id()); |
632 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 688 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { |
633 | struct per_cpu_pages *pcp; | 689 | struct per_cpu_pages *pcp; |
@@ -672,7 +728,8 @@ static void __drain_pages(unsigned int cpu) | |||
672 | 728 | ||
673 | void mark_free_pages(struct zone *zone) | 729 | void mark_free_pages(struct zone *zone) |
674 | { | 730 | { |
675 | unsigned long zone_pfn, flags; | 731 | unsigned long pfn, max_zone_pfn; |
732 | unsigned long flags; | ||
676 | int order; | 733 | int order; |
677 | struct list_head *curr; | 734 | struct list_head *curr; |
678 | 735 | ||
@@ -680,18 +737,25 @@ void mark_free_pages(struct zone *zone) | |||
680 | return; | 737 | return; |
681 | 738 | ||
682 | spin_lock_irqsave(&zone->lock, flags); | 739 | spin_lock_irqsave(&zone->lock, flags); |
683 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | 740 | |
684 | ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); | 741 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
742 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | ||
743 | if (pfn_valid(pfn)) { | ||
744 | struct page *page = pfn_to_page(pfn); | ||
745 | |||
746 | if (!PageNosave(page)) | ||
747 | ClearPageNosaveFree(page); | ||
748 | } | ||
685 | 749 | ||
686 | for (order = MAX_ORDER - 1; order >= 0; --order) | 750 | for (order = MAX_ORDER - 1; order >= 0; --order) |
687 | list_for_each(curr, &zone->free_area[order].free_list) { | 751 | list_for_each(curr, &zone->free_area[order].free_list) { |
688 | unsigned long start_pfn, i; | 752 | unsigned long i; |
689 | 753 | ||
690 | start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); | 754 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); |
755 | for (i = 0; i < (1UL << order); i++) | ||
756 | SetPageNosaveFree(pfn_to_page(pfn + i)); | ||
757 | } | ||
691 | 758 | ||
692 | for (i=0; i < (1<<order); i++) | ||
693 | SetPageNosaveFree(pfn_to_page(start_pfn+i)); | ||
694 | } | ||
695 | spin_unlock_irqrestore(&zone->lock, flags); | 759 | spin_unlock_irqrestore(&zone->lock, flags); |
696 | } | 760 | } |
697 | 761 | ||
@@ -761,8 +825,8 @@ void split_page(struct page *page, unsigned int order) | |||
761 | { | 825 | { |
762 | int i; | 826 | int i; |
763 | 827 | ||
764 | BUG_ON(PageCompound(page)); | 828 | VM_BUG_ON(PageCompound(page)); |
765 | BUG_ON(!page_count(page)); | 829 | VM_BUG_ON(!page_count(page)); |
766 | for (i = 1; i < (1 << order); i++) | 830 | for (i = 1; i < (1 << order); i++) |
767 | set_page_refcounted(page + i); | 831 | set_page_refcounted(page + i); |
768 | } | 832 | } |
@@ -809,7 +873,7 @@ again: | |||
809 | local_irq_restore(flags); | 873 | local_irq_restore(flags); |
810 | put_cpu(); | 874 | put_cpu(); |
811 | 875 | ||
812 | BUG_ON(bad_range(zone, page)); | 876 | VM_BUG_ON(bad_range(zone, page)); |
813 | if (prep_new_page(page, order, gfp_flags)) | 877 | if (prep_new_page(page, order, gfp_flags)) |
814 | goto again; | 878 | goto again; |
815 | return page; | 879 | return page; |
@@ -870,32 +934,37 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | |||
870 | struct zone **z = zonelist->zones; | 934 | struct zone **z = zonelist->zones; |
871 | struct page *page = NULL; | 935 | struct page *page = NULL; |
872 | int classzone_idx = zone_idx(*z); | 936 | int classzone_idx = zone_idx(*z); |
937 | struct zone *zone; | ||
873 | 938 | ||
874 | /* | 939 | /* |
875 | * Go through the zonelist once, looking for a zone with enough free. | 940 | * Go through the zonelist once, looking for a zone with enough free. |
876 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 941 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
877 | */ | 942 | */ |
878 | do { | 943 | do { |
944 | zone = *z; | ||
945 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && | ||
946 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | ||
947 | break; | ||
879 | if ((alloc_flags & ALLOC_CPUSET) && | 948 | if ((alloc_flags & ALLOC_CPUSET) && |
880 | !cpuset_zone_allowed(*z, gfp_mask)) | 949 | !cpuset_zone_allowed(zone, gfp_mask)) |
881 | continue; | 950 | continue; |
882 | 951 | ||
883 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 952 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
884 | unsigned long mark; | 953 | unsigned long mark; |
885 | if (alloc_flags & ALLOC_WMARK_MIN) | 954 | if (alloc_flags & ALLOC_WMARK_MIN) |
886 | mark = (*z)->pages_min; | 955 | mark = zone->pages_min; |
887 | else if (alloc_flags & ALLOC_WMARK_LOW) | 956 | else if (alloc_flags & ALLOC_WMARK_LOW) |
888 | mark = (*z)->pages_low; | 957 | mark = zone->pages_low; |
889 | else | 958 | else |
890 | mark = (*z)->pages_high; | 959 | mark = zone->pages_high; |
891 | if (!zone_watermark_ok(*z, order, mark, | 960 | if (!zone_watermark_ok(zone, order, mark, |
892 | classzone_idx, alloc_flags)) | 961 | classzone_idx, alloc_flags)) |
893 | if (!zone_reclaim_mode || | 962 | if (!zone_reclaim_mode || |
894 | !zone_reclaim(*z, gfp_mask, order)) | 963 | !zone_reclaim(zone, gfp_mask, order)) |
895 | continue; | 964 | continue; |
896 | } | 965 | } |
897 | 966 | ||
898 | page = buffered_rmqueue(zonelist, *z, order, gfp_mask); | 967 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); |
899 | if (page) { | 968 | if (page) { |
900 | break; | 969 | break; |
901 | } | 970 | } |
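The get_page_from_freelist() hunk mostly replaces repeated *z dereferences with a local zone pointer, but the shape of the loop is worth keeping in mind: skip zones the cpuset disallows, pick whichever watermark the caller asked for, and only allocate from zones above it. A condensed sketch with toy zones and a toy watermark check:

/* Zonelist walk sketch; struct and flag names are illustrative only. */
#include <stdio.h>

struct toy_zone {
    const char *name;
    unsigned long free_pages, pages_min, pages_low, pages_high;
    int allowed;                        /* cpuset_zone_allowed() stand-in */
};

#define ALLOC_WMARK_MIN  0x1
#define ALLOC_WMARK_LOW  0x2
#define ALLOC_CPUSET     0x8

static struct toy_zone *alloc_from_zonelist(struct toy_zone **zl, int flags)
{
    for (struct toy_zone **z = zl; *z; z++) {
        unsigned long mark;

        if ((flags & ALLOC_CPUSET) && !(*z)->allowed)
            continue;
        if (flags & ALLOC_WMARK_MIN)
            mark = (*z)->pages_min;
        else if (flags & ALLOC_WMARK_LOW)
            mark = (*z)->pages_low;
        else
            mark = (*z)->pages_high;
        if ((*z)->free_pages <= mark)
            continue;                   /* below the watermark: try the next zone */
        return *z;                      /* buffered_rmqueue() would run here */
    }
    return NULL;
}

int main(void)
{
    struct toy_zone high = { "HighMem", 10, 20, 30, 40, 1 };
    struct toy_zone norm = { "Normal", 500, 20, 30, 40, 1 };
    struct toy_zone *zl[] = { &high, &norm, NULL };
    struct toy_zone *got = alloc_from_zonelist(zl, ALLOC_WMARK_LOW | ALLOC_CPUSET);

    printf("allocated from %s\n", got ? got->name : "nowhere");
    return 0;
}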
@@ -1083,7 +1152,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) | |||
1083 | * get_zeroed_page() returns a 32-bit address, which cannot represent | 1152 | * get_zeroed_page() returns a 32-bit address, which cannot represent |
1084 | * a highmem page | 1153 | * a highmem page |
1085 | */ | 1154 | */ |
1086 | BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); | 1155 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); |
1087 | 1156 | ||
1088 | page = alloc_pages(gfp_mask | __GFP_ZERO, 0); | 1157 | page = alloc_pages(gfp_mask | __GFP_ZERO, 0); |
1089 | if (page) | 1158 | if (page) |
@@ -1116,7 +1185,7 @@ EXPORT_SYMBOL(__free_pages); | |||
1116 | fastcall void free_pages(unsigned long addr, unsigned int order) | 1185 | fastcall void free_pages(unsigned long addr, unsigned int order) |
1117 | { | 1186 | { |
1118 | if (addr != 0) { | 1187 | if (addr != 0) { |
1119 | BUG_ON(!virt_addr_valid((void *)addr)); | 1188 | VM_BUG_ON(!virt_addr_valid((void *)addr)); |
1120 | __free_pages(virt_to_page((void *)addr), order); | 1189 | __free_pages(virt_to_page((void *)addr), order); |
1121 | } | 1190 | } |
1122 | } | 1191 | } |
@@ -1142,7 +1211,8 @@ EXPORT_SYMBOL(nr_free_pages); | |||
1142 | #ifdef CONFIG_NUMA | 1211 | #ifdef CONFIG_NUMA |
1143 | unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) | 1212 | unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) |
1144 | { | 1213 | { |
1145 | unsigned int i, sum = 0; | 1214 | unsigned int sum = 0; |
1215 | enum zone_type i; | ||
1146 | 1216 | ||
1147 | for (i = 0; i < MAX_NR_ZONES; i++) | 1217 | for (i = 0; i < MAX_NR_ZONES; i++) |
1148 | sum += pgdat->node_zones[i].free_pages; | 1218 | sum += pgdat->node_zones[i].free_pages; |
@@ -1187,27 +1257,11 @@ unsigned int nr_free_pagecache_pages(void) | |||
1187 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); | 1257 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); |
1188 | } | 1258 | } |
1189 | 1259 | ||
1190 | #ifdef CONFIG_HIGHMEM | 1260 | static inline void show_node(struct zone *zone) |
1191 | unsigned int nr_free_highpages (void) | ||
1192 | { | 1261 | { |
1193 | pg_data_t *pgdat; | 1262 | if (NUMA_BUILD) |
1194 | unsigned int pages = 0; | 1263 | printk("Node %ld ", zone_to_nid(zone)); |
1195 | |||
1196 | for_each_online_pgdat(pgdat) | ||
1197 | pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; | ||
1198 | |||
1199 | return pages; | ||
1200 | } | 1264 | } |
1201 | #endif | ||
1202 | |||
1203 | #ifdef CONFIG_NUMA | ||
1204 | static void show_node(struct zone *zone) | ||
1205 | { | ||
1206 | printk("Node %d ", zone->zone_pgdat->node_id); | ||
1207 | } | ||
1208 | #else | ||
1209 | #define show_node(zone) do { } while (0) | ||
1210 | #endif | ||
1211 | 1265 | ||
1212 | void si_meminfo(struct sysinfo *val) | 1266 | void si_meminfo(struct sysinfo *val) |
1213 | { | 1267 | { |
@@ -1215,13 +1269,8 @@ void si_meminfo(struct sysinfo *val) | |||
1215 | val->sharedram = 0; | 1269 | val->sharedram = 0; |
1216 | val->freeram = nr_free_pages(); | 1270 | val->freeram = nr_free_pages(); |
1217 | val->bufferram = nr_blockdev_pages(); | 1271 | val->bufferram = nr_blockdev_pages(); |
1218 | #ifdef CONFIG_HIGHMEM | ||
1219 | val->totalhigh = totalhigh_pages; | 1272 | val->totalhigh = totalhigh_pages; |
1220 | val->freehigh = nr_free_highpages(); | 1273 | val->freehigh = nr_free_highpages(); |
1221 | #else | ||
1222 | val->totalhigh = 0; | ||
1223 | val->freehigh = 0; | ||
1224 | #endif | ||
1225 | val->mem_unit = PAGE_SIZE; | 1274 | val->mem_unit = PAGE_SIZE; |
1226 | } | 1275 | } |
1227 | 1276 | ||
@@ -1234,8 +1283,13 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
1234 | 1283 | ||
1235 | val->totalram = pgdat->node_present_pages; | 1284 | val->totalram = pgdat->node_present_pages; |
1236 | val->freeram = nr_free_pages_pgdat(pgdat); | 1285 | val->freeram = nr_free_pages_pgdat(pgdat); |
1286 | #ifdef CONFIG_HIGHMEM | ||
1237 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; | 1287 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; |
1238 | val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; | 1288 | val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; |
1289 | #else | ||
1290 | val->totalhigh = 0; | ||
1291 | val->freehigh = 0; | ||
1292 | #endif | ||
1239 | val->mem_unit = PAGE_SIZE; | 1293 | val->mem_unit = PAGE_SIZE; |
1240 | } | 1294 | } |
1241 | #endif | 1295 | #endif |
@@ -1249,43 +1303,35 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
1249 | */ | 1303 | */ |
1250 | void show_free_areas(void) | 1304 | void show_free_areas(void) |
1251 | { | 1305 | { |
1252 | int cpu, temperature; | 1306 | int cpu; |
1253 | unsigned long active; | 1307 | unsigned long active; |
1254 | unsigned long inactive; | 1308 | unsigned long inactive; |
1255 | unsigned long free; | 1309 | unsigned long free; |
1256 | struct zone *zone; | 1310 | struct zone *zone; |
1257 | 1311 | ||
1258 | for_each_zone(zone) { | 1312 | for_each_zone(zone) { |
1259 | show_node(zone); | 1313 | if (!populated_zone(zone)) |
1260 | printk("%s per-cpu:", zone->name); | ||
1261 | |||
1262 | if (!populated_zone(zone)) { | ||
1263 | printk(" empty\n"); | ||
1264 | continue; | 1314 | continue; |
1265 | } else | 1315 | |
1266 | printk("\n"); | 1316 | show_node(zone); |
1317 | printk("%s per-cpu:\n", zone->name); | ||
1267 | 1318 | ||
1268 | for_each_online_cpu(cpu) { | 1319 | for_each_online_cpu(cpu) { |
1269 | struct per_cpu_pageset *pageset; | 1320 | struct per_cpu_pageset *pageset; |
1270 | 1321 | ||
1271 | pageset = zone_pcp(zone, cpu); | 1322 | pageset = zone_pcp(zone, cpu); |
1272 | 1323 | ||
1273 | for (temperature = 0; temperature < 2; temperature++) | 1324 | printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " |
1274 | printk("cpu %d %s: high %d, batch %d used:%d\n", | 1325 | "Cold: hi:%5d, btch:%4d usd:%4d\n", |
1275 | cpu, | 1326 | cpu, pageset->pcp[0].high, |
1276 | temperature ? "cold" : "hot", | 1327 | pageset->pcp[0].batch, pageset->pcp[0].count, |
1277 | pageset->pcp[temperature].high, | 1328 | pageset->pcp[1].high, pageset->pcp[1].batch, |
1278 | pageset->pcp[temperature].batch, | 1329 | pageset->pcp[1].count); |
1279 | pageset->pcp[temperature].count); | ||
1280 | } | 1330 | } |
1281 | } | 1331 | } |
1282 | 1332 | ||
1283 | get_zone_counts(&active, &inactive, &free); | 1333 | get_zone_counts(&active, &inactive, &free); |
1284 | 1334 | ||
1285 | printk("Free pages: %11ukB (%ukB HighMem)\n", | ||
1286 | K(nr_free_pages()), | ||
1287 | K(nr_free_highpages())); | ||
1288 | |||
1289 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " | 1335 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " |
1290 | "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", | 1336 | "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", |
1291 | active, | 1337 | active, |
@@ -1294,13 +1340,17 @@ void show_free_areas(void) | |||
1294 | global_page_state(NR_WRITEBACK), | 1340 | global_page_state(NR_WRITEBACK), |
1295 | global_page_state(NR_UNSTABLE_NFS), | 1341 | global_page_state(NR_UNSTABLE_NFS), |
1296 | nr_free_pages(), | 1342 | nr_free_pages(), |
1297 | global_page_state(NR_SLAB), | 1343 | global_page_state(NR_SLAB_RECLAIMABLE) + |
1344 | global_page_state(NR_SLAB_UNRECLAIMABLE), | ||
1298 | global_page_state(NR_FILE_MAPPED), | 1345 | global_page_state(NR_FILE_MAPPED), |
1299 | global_page_state(NR_PAGETABLE)); | 1346 | global_page_state(NR_PAGETABLE)); |
1300 | 1347 | ||
1301 | for_each_zone(zone) { | 1348 | for_each_zone(zone) { |
1302 | int i; | 1349 | int i; |
1303 | 1350 | ||
1351 | if (!populated_zone(zone)) | ||
1352 | continue; | ||
1353 | |||
1304 | show_node(zone); | 1354 | show_node(zone); |
1305 | printk("%s" | 1355 | printk("%s" |
1306 | " free:%lukB" | 1356 | " free:%lukB" |
@@ -1333,12 +1383,11 @@ void show_free_areas(void) | |||
1333 | for_each_zone(zone) { | 1383 | for_each_zone(zone) { |
1334 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 1384 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
1335 | 1385 | ||
1386 | if (!populated_zone(zone)) | ||
1387 | continue; | ||
1388 | |||
1336 | show_node(zone); | 1389 | show_node(zone); |
1337 | printk("%s: ", zone->name); | 1390 | printk("%s: ", zone->name); |
1338 | if (!populated_zone(zone)) { | ||
1339 | printk("empty\n"); | ||
1340 | continue; | ||
1341 | } | ||
1342 | 1391 | ||
1343 | spin_lock_irqsave(&zone->lock, flags); | 1392 | spin_lock_irqsave(&zone->lock, flags); |
1344 | for (order = 0; order < MAX_ORDER; order++) { | 1393 | for (order = 0; order < MAX_ORDER; order++) { |
@@ -1360,39 +1409,25 @@ void show_free_areas(void) | |||
1360 | * Add all populated zones of a node to the zonelist. | 1409 | * Add all populated zones of a node to the zonelist. |
1361 | */ | 1410 | */ |
1362 | static int __meminit build_zonelists_node(pg_data_t *pgdat, | 1411 | static int __meminit build_zonelists_node(pg_data_t *pgdat, |
1363 | struct zonelist *zonelist, int nr_zones, int zone_type) | 1412 | struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) |
1364 | { | 1413 | { |
1365 | struct zone *zone; | 1414 | struct zone *zone; |
1366 | 1415 | ||
1367 | BUG_ON(zone_type > ZONE_HIGHMEM); | 1416 | BUG_ON(zone_type >= MAX_NR_ZONES); |
1417 | zone_type++; | ||
1368 | 1418 | ||
1369 | do { | 1419 | do { |
1420 | zone_type--; | ||
1370 | zone = pgdat->node_zones + zone_type; | 1421 | zone = pgdat->node_zones + zone_type; |
1371 | if (populated_zone(zone)) { | 1422 | if (populated_zone(zone)) { |
1372 | #ifndef CONFIG_HIGHMEM | ||
1373 | BUG_ON(zone_type > ZONE_NORMAL); | ||
1374 | #endif | ||
1375 | zonelist->zones[nr_zones++] = zone; | 1423 | zonelist->zones[nr_zones++] = zone; |
1376 | check_highest_zone(zone_type); | 1424 | check_highest_zone(zone_type); |
1377 | } | 1425 | } |
1378 | zone_type--; | ||
1379 | 1426 | ||
1380 | } while (zone_type >= 0); | 1427 | } while (zone_type); |
1381 | return nr_zones; | 1428 | return nr_zones; |
1382 | } | 1429 | } |
1383 | 1430 | ||
1384 | static inline int highest_zone(int zone_bits) | ||
1385 | { | ||
1386 | int res = ZONE_NORMAL; | ||
1387 | if (zone_bits & (__force int)__GFP_HIGHMEM) | ||
1388 | res = ZONE_HIGHMEM; | ||
1389 | if (zone_bits & (__force int)__GFP_DMA32) | ||
1390 | res = ZONE_DMA32; | ||
1391 | if (zone_bits & (__force int)__GFP_DMA) | ||
1392 | res = ZONE_DMA; | ||
1393 | return res; | ||
1394 | } | ||
1395 | |||
1396 | #ifdef CONFIG_NUMA | 1431 | #ifdef CONFIG_NUMA |
1397 | #define MAX_NODE_LOAD (num_online_nodes()) | 1432 | #define MAX_NODE_LOAD (num_online_nodes()) |
1398 | static int __meminitdata node_load[MAX_NUMNODES]; | 1433 | static int __meminitdata node_load[MAX_NUMNODES]; |
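With highest_zone() gone, build_zonelists_node() is now indexed directly by zone type and walks from the requested zone down to zone 0, appending every populated zone. A small sketch of that reverse iteration, with invented zone names and population flags:

/* Reverse zone walk sketch mirroring the new build_zonelists_node(). */
#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

static const char *names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int populated[MAX_NR_ZONES] = { 1, 1, 0 };   /* pretend highmem is empty */

static int build_node(const char **list, int n, enum zone_type zone_type)
{
    zone_type++;
    do {
        zone_type--;
        if (populated[zone_type])
            list[n++] = names[zone_type];   /* highest allowed zone first */
    } while (zone_type);
    return n;
}

int main(void)
{
    const char *zonelist[MAX_NR_ZONES + 1] = { 0 };
    int n = build_node(zonelist, 0, ZONE_HIGHMEM);

    zonelist[n] = NULL;
    for (int i = 0; zonelist[i]; i++)
        printf("zonelist[%d] = %s\n", i, zonelist[i]);
    return 0;
}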
@@ -1458,13 +1493,14 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) | |||
1458 | 1493 | ||
1459 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1494 | static void __meminit build_zonelists(pg_data_t *pgdat) |
1460 | { | 1495 | { |
1461 | int i, j, k, node, local_node; | 1496 | int j, node, local_node; |
1497 | enum zone_type i; | ||
1462 | int prev_node, load; | 1498 | int prev_node, load; |
1463 | struct zonelist *zonelist; | 1499 | struct zonelist *zonelist; |
1464 | nodemask_t used_mask; | 1500 | nodemask_t used_mask; |
1465 | 1501 | ||
1466 | /* initialize zonelists */ | 1502 | /* initialize zonelists */ |
1467 | for (i = 0; i < GFP_ZONETYPES; i++) { | 1503 | for (i = 0; i < MAX_NR_ZONES; i++) { |
1468 | zonelist = pgdat->node_zonelists + i; | 1504 | zonelist = pgdat->node_zonelists + i; |
1469 | zonelist->zones[0] = NULL; | 1505 | zonelist->zones[0] = NULL; |
1470 | } | 1506 | } |
@@ -1494,13 +1530,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1494 | node_load[node] += load; | 1530 | node_load[node] += load; |
1495 | prev_node = node; | 1531 | prev_node = node; |
1496 | load--; | 1532 | load--; |
1497 | for (i = 0; i < GFP_ZONETYPES; i++) { | 1533 | for (i = 0; i < MAX_NR_ZONES; i++) { |
1498 | zonelist = pgdat->node_zonelists + i; | 1534 | zonelist = pgdat->node_zonelists + i; |
1499 | for (j = 0; zonelist->zones[j] != NULL; j++); | 1535 | for (j = 0; zonelist->zones[j] != NULL; j++); |
1500 | 1536 | ||
1501 | k = highest_zone(i); | 1537 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); |
1502 | |||
1503 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); | ||
1504 | zonelist->zones[j] = NULL; | 1538 | zonelist->zones[j] = NULL; |
1505 | } | 1539 | } |
1506 | } | 1540 | } |
@@ -1510,17 +1544,16 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1510 | 1544 | ||
1511 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1545 | static void __meminit build_zonelists(pg_data_t *pgdat) |
1512 | { | 1546 | { |
1513 | int i, j, k, node, local_node; | 1547 | int node, local_node; |
1548 | enum zone_type i, j; | ||
1514 | 1549 | ||
1515 | local_node = pgdat->node_id; | 1550 | local_node = pgdat->node_id; |
1516 | for (i = 0; i < GFP_ZONETYPES; i++) { | 1551 | for (i = 0; i < MAX_NR_ZONES; i++) { |
1517 | struct zonelist *zonelist; | 1552 | struct zonelist *zonelist; |
1518 | 1553 | ||
1519 | zonelist = pgdat->node_zonelists + i; | 1554 | zonelist = pgdat->node_zonelists + i; |
1520 | 1555 | ||
1521 | j = 0; | 1556 | j = build_zonelists_node(pgdat, zonelist, 0, i); |
1522 | k = highest_zone(i); | ||
1523 | j = build_zonelists_node(pgdat, zonelist, j, k); | ||
1524 | /* | 1557 | /* |
1525 | * Now we build the zonelist so that it contains the zones | 1558 | * Now we build the zonelist so that it contains the zones |
1526 | * of all the other nodes. | 1559 | * of all the other nodes. |
@@ -1532,12 +1565,12 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1532 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { | 1565 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { |
1533 | if (!node_online(node)) | 1566 | if (!node_online(node)) |
1534 | continue; | 1567 | continue; |
1535 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); | 1568 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); |
1536 | } | 1569 | } |
1537 | for (node = 0; node < local_node; node++) { | 1570 | for (node = 0; node < local_node; node++) { |
1538 | if (!node_online(node)) | 1571 | if (!node_online(node)) |
1539 | continue; | 1572 | continue; |
1540 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); | 1573 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); |
1541 | } | 1574 | } |
1542 | 1575 | ||
1543 | zonelist->zones[j] = NULL; | 1576 | zonelist->zones[j] = NULL; |
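The non-NUMA build_zonelists() keeps its existing node ordering: the local node's zones first, then nodes local+1..MAX_NUMNODES, then 0..local-1, so pressure wraps around the node IDs instead of always landing on node 0. A tiny sketch of just that ordering, with a made-up online mask:

/* Node ordering sketch for the !CONFIG_NUMA build_zonelists() loop. */
#include <stdio.h>

#define MAX_NUMNODES 4

static int node_online(int node)
{
    return node != 2;               /* pretend node 2 is offline */
}

int main(void)
{
    int local_node = 1;

    printf("zonelist node order: %d", local_node);
    for (int node = local_node + 1; node < MAX_NUMNODES; node++)
        if (node_online(node))
            printf(" %d", node);
    for (int node = 0; node < local_node; node++)
        if (node_online(node))
            printf(" %d", node);
    printf("\n");
    return 0;
}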
@@ -1558,7 +1591,7 @@ static int __meminit __build_all_zonelists(void *dummy) | |||
1558 | void __meminit build_all_zonelists(void) | 1591 | void __meminit build_all_zonelists(void) |
1559 | { | 1592 | { |
1560 | if (system_state == SYSTEM_BOOTING) { | 1593 | if (system_state == SYSTEM_BOOTING) { |
1561 | __build_all_zonelists(0); | 1594 | __build_all_zonelists(NULL); |
1562 | cpuset_init_current_mems_allowed(); | 1595 | cpuset_init_current_mems_allowed(); |
1563 | } else { | 1596 | } else { |
1564 | /* we have to stop all cpus to guarantee there is no user | 1597 | /* we have to stop all cpus to guarantee there is no user |
@@ -1639,25 +1672,6 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
1639 | 1672 | ||
1640 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 1673 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
1641 | 1674 | ||
1642 | static void __init calculate_zone_totalpages(struct pglist_data *pgdat, | ||
1643 | unsigned long *zones_size, unsigned long *zholes_size) | ||
1644 | { | ||
1645 | unsigned long realtotalpages, totalpages = 0; | ||
1646 | int i; | ||
1647 | |||
1648 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
1649 | totalpages += zones_size[i]; | ||
1650 | pgdat->node_spanned_pages = totalpages; | ||
1651 | |||
1652 | realtotalpages = totalpages; | ||
1653 | if (zholes_size) | ||
1654 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
1655 | realtotalpages -= zholes_size[i]; | ||
1656 | pgdat->node_present_pages = realtotalpages; | ||
1657 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); | ||
1658 | } | ||
1659 | |||
1660 | |||
1661 | /* | 1675 | /* |
1662 | * Initially all pages are reserved - free ones are freed | 1676 | * Initially all pages are reserved - free ones are freed |
1663 | * up by free_all_bootmem() once the early boot process is | 1677 | * up by free_all_bootmem() once the early boot process is |
@@ -1698,8 +1712,8 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | |||
1698 | } | 1712 | } |
1699 | 1713 | ||
1700 | #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) | 1714 | #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) |
1701 | void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, | 1715 | void zonetable_add(struct zone *zone, int nid, enum zone_type zid, |
1702 | unsigned long size) | 1716 | unsigned long pfn, unsigned long size) |
1703 | { | 1717 | { |
1704 | unsigned long snum = pfn_to_section_nr(pfn); | 1718 | unsigned long snum = pfn_to_section_nr(pfn); |
1705 | unsigned long end = pfn_to_section_nr(pfn + size); | 1719 | unsigned long end = pfn_to_section_nr(pfn + size); |
@@ -1815,6 +1829,9 @@ static int __cpuinit process_zones(int cpu) | |||
1815 | 1829 | ||
1816 | for_each_zone(zone) { | 1830 | for_each_zone(zone) { |
1817 | 1831 | ||
1832 | if (!populated_zone(zone)) | ||
1833 | continue; | ||
1834 | |||
1818 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 1835 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
1819 | GFP_KERNEL, cpu_to_node(cpu)); | 1836 | GFP_KERNEL, cpu_to_node(cpu)); |
1820 | if (!zone_pcp(zone, cpu)) | 1837 | if (!zone_pcp(zone, cpu)) |
@@ -1845,8 +1862,10 @@ static inline void free_zone_pagesets(int cpu) | |||
1845 | for_each_zone(zone) { | 1862 | for_each_zone(zone) { |
1846 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | 1863 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); |
1847 | 1864 | ||
1865 | /* Free per_cpu_pageset if it is slab allocated */ | ||
1866 | if (pset != &boot_pageset[cpu]) | ||
1867 | kfree(pset); | ||
1848 | zone_pcp(zone, cpu) = NULL; | 1868 | zone_pcp(zone, cpu) = NULL; |
1849 | kfree(pset); | ||
1850 | } | 1869 | } |
1851 | } | 1870 | } |
1852 | 1871 | ||
@@ -1972,6 +1991,366 @@ __meminit int init_currently_empty_zone(struct zone *zone, | |||
1972 | return 0; | 1991 | return 0; |
1973 | } | 1992 | } |
1974 | 1993 | ||
1994 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
1995 | /* | ||
1996 | * Basic iterator support. Return the first range of PFNs for a node | ||
1997 | * Note: nid == MAX_NUMNODES returns first region regardless of node | ||
1998 | */ | ||
1999 | static int __init first_active_region_index_in_nid(int nid) | ||
2000 | { | ||
2001 | int i; | ||
2002 | |||
2003 | for (i = 0; i < nr_nodemap_entries; i++) | ||
2004 | if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) | ||
2005 | return i; | ||
2006 | |||
2007 | return -1; | ||
2008 | } | ||
2009 | |||
2010 | /* | ||
2011 | * Basic iterator support. Return the next active range of PFNs for a node | ||
2012 | * Note: nid == MAX_NUMNODES returns next region regardless of node | ||
2013 | */ | ||
2014 | static int __init next_active_region_index_in_nid(int index, int nid) | ||
2015 | { | ||
2016 | for (index = index + 1; index < nr_nodemap_entries; index++) | ||
2017 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) | ||
2018 | return index; | ||
2019 | |||
2020 | return -1; | ||
2021 | } | ||
2022 | |||
2023 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | ||
2024 | /* | ||
2025 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. | ||
2026 | * Architectures may implement their own version but if add_active_range() | ||
2027 | * was used and there are no special requirements, this is a convenient | ||
2028 | * alternative | ||
2029 | */ | ||
2030 | int __init early_pfn_to_nid(unsigned long pfn) | ||
2031 | { | ||
2032 | int i; | ||
2033 | |||
2034 | for (i = 0; i < nr_nodemap_entries; i++) { | ||
2035 | unsigned long start_pfn = early_node_map[i].start_pfn; | ||
2036 | unsigned long end_pfn = early_node_map[i].end_pfn; | ||
2037 | |||
2038 | if (start_pfn <= pfn && pfn < end_pfn) | ||
2039 | return early_node_map[i].nid; | ||
2040 | } | ||
2041 | |||
2042 | return 0; | ||
2043 | } | ||
2044 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ | ||
2045 | |||
2046 | /* Basic iterator support to walk early_node_map[] */ | ||
2047 | #define for_each_active_range_index_in_nid(i, nid) \ | ||
2048 | for (i = first_active_region_index_in_nid(nid); i != -1; \ | ||
2049 | i = next_active_region_index_in_nid(i, nid)) | ||
2050 | |||
2051 | /** | ||
2052 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | ||
2053 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed | ||
2054 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | ||
2055 | * | ||
2056 | * If an architecture guarantees that all ranges registered with | ||
2057 | * add_active_ranges() contain no holes and may be freed, | ||
2058 | * this function may be used instead of calling free_bootmem() manually. | ||
2059 | */ | ||
2060 | void __init free_bootmem_with_active_regions(int nid, | ||
2061 | unsigned long max_low_pfn) | ||
2062 | { | ||
2063 | int i; | ||
2064 | |||
2065 | for_each_active_range_index_in_nid(i, nid) { | ||
2066 | unsigned long size_pages = 0; | ||
2067 | unsigned long end_pfn = early_node_map[i].end_pfn; | ||
2068 | |||
2069 | if (early_node_map[i].start_pfn >= max_low_pfn) | ||
2070 | continue; | ||
2071 | |||
2072 | if (end_pfn > max_low_pfn) | ||
2073 | end_pfn = max_low_pfn; | ||
2074 | |||
2075 | size_pages = end_pfn - early_node_map[i].start_pfn; | ||
2076 | free_bootmem_node(NODE_DATA(early_node_map[i].nid), | ||
2077 | PFN_PHYS(early_node_map[i].start_pfn), | ||
2078 | size_pages << PAGE_SHIFT); | ||
2079 | } | ||
2080 | } | ||
2081 | |||
2082 | /** | ||
2083 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | ||
2084 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used | ||
2085 | * | ||
2086 | * If an architecture guarantees that all ranges registered with | ||
2087 | * add_active_ranges() contain no holes and may be freed, | ||
2088 | * this function may be used instead of calling memory_present() manually. | ||
2089 | */ | ||
2090 | void __init sparse_memory_present_with_active_regions(int nid) | ||
2091 | { | ||
2092 | int i; | ||
2093 | |||
2094 | for_each_active_range_index_in_nid(i, nid) | ||
2095 | memory_present(early_node_map[i].nid, | ||
2096 | early_node_map[i].start_pfn, | ||
2097 | early_node_map[i].end_pfn); | ||
2098 | } | ||
2099 | |||
2100 | /** | ||
2101 | * push_node_boundaries - Push node boundaries to at least the requested boundary | ||
2102 | * @nid: The nid of the node to push the boundary for | ||
2103 | * @start_pfn: The start pfn of the node | ||
2104 | * @end_pfn: The end pfn of the node | ||
2105 | * | ||
2106 | * In reserve-based hot-add, mem_map is allocated that is unused until hotadd | ||
2107 | * time. Specifically, on x86_64, SRAT will report ranges that can potentially | ||
2108 | * be hotplugged even though no physical memory exists. This function allows | ||
2109 | * an arch to push out the node boundaries so mem_map is allocated that can | ||
2110 | * be used later. | ||
2111 | */ | ||
2112 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
2113 | void __init push_node_boundaries(unsigned int nid, | ||
2114 | unsigned long start_pfn, unsigned long end_pfn) | ||
2115 | { | ||
2116 | printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", | ||
2117 | nid, start_pfn, end_pfn); | ||
2118 | |||
2119 | /* Initialise the boundary for this node if necessary */ | ||
2120 | if (node_boundary_end_pfn[nid] == 0) | ||
2121 | node_boundary_start_pfn[nid] = -1UL; | ||
2122 | |||
2123 | /* Update the boundaries */ | ||
2124 | if (node_boundary_start_pfn[nid] > start_pfn) | ||
2125 | node_boundary_start_pfn[nid] = start_pfn; | ||
2126 | if (node_boundary_end_pfn[nid] < end_pfn) | ||
2127 | node_boundary_end_pfn[nid] = end_pfn; | ||
2128 | } | ||
2129 | |||
2130 | /* If necessary, push the node boundary out for reserve hotadd */ | ||
2131 | static void __init account_node_boundary(unsigned int nid, | ||
2132 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
2133 | { | ||
2134 | printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", | ||
2135 | nid, *start_pfn, *end_pfn); | ||
2136 | |||
2137 | /* Return if boundary information has not been provided */ | ||
2138 | if (node_boundary_end_pfn[nid] == 0) | ||
2139 | return; | ||
2140 | |||
2141 | /* Check the boundaries and update if necessary */ | ||
2142 | if (node_boundary_start_pfn[nid] < *start_pfn) | ||
2143 | *start_pfn = node_boundary_start_pfn[nid]; | ||
2144 | if (node_boundary_end_pfn[nid] > *end_pfn) | ||
2145 | *end_pfn = node_boundary_end_pfn[nid]; | ||
2146 | } | ||
2147 | #else | ||
2148 | void __init push_node_boundaries(unsigned int nid, | ||
2149 | unsigned long start_pfn, unsigned long end_pfn) {} | ||
2150 | |||
2151 | static void __init account_node_boundary(unsigned int nid, | ||
2152 | unsigned long *start_pfn, unsigned long *end_pfn) {} | ||
2153 | #endif | ||
2154 | |||
2155 | |||
2156 | /** | ||
2157 | * get_pfn_range_for_nid - Return the start and end page frames for a node | ||
2158 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned | ||
2159 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn | ||
2160 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn | ||
2161 | * | ||
2162 | * It returns the start and end page frame of a node based on information | ||
2163 | * provided by an arch calling add_active_range(). If called for a node | ||
2164 | * with no available memory, a warning is printed and the start and end | ||
2165 | * PFNs will be 0 | ||
2166 | */ | ||
2167 | void __init get_pfn_range_for_nid(unsigned int nid, | ||
2168 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
2169 | { | ||
2170 | int i; | ||
2171 | *start_pfn = -1UL; | ||
2172 | *end_pfn = 0; | ||
2173 | |||
2174 | for_each_active_range_index_in_nid(i, nid) { | ||
2175 | *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); | ||
2176 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); | ||
2177 | } | ||
2178 | |||
2179 | if (*start_pfn == -1UL) { | ||
2180 | printk(KERN_WARNING "Node %u active with no memory\n", nid); | ||
2181 | *start_pfn = 0; | ||
2182 | } | ||
2183 | |||
2184 | /* Push the node boundaries out if requested */ | ||
2185 | account_node_boundary(nid, start_pfn, end_pfn); | ||
2186 | } | ||
2187 | |||
2188 | /* | ||
2189 | * Return the number of pages a zone spans in a node, including holes | ||
2190 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | ||
2191 | */ | ||
2192 | unsigned long __init zone_spanned_pages_in_node(int nid, | ||
2193 | unsigned long zone_type, | ||
2194 | unsigned long *ignored) | ||
2195 | { | ||
2196 | unsigned long node_start_pfn, node_end_pfn; | ||
2197 | unsigned long zone_start_pfn, zone_end_pfn; | ||
2198 | |||
2199 | /* Get the start and end of the node and zone */ | ||
2200 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | ||
2201 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; | ||
2202 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; | ||
2203 | |||
2204 | /* Check that this node has pages within the zone's required range */ | ||
2205 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) | ||
2206 | return 0; | ||
2207 | |||
2208 | /* Move the zone boundaries inside the node if necessary */ | ||
2209 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); | ||
2210 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); | ||
2211 | |||
2212 | /* Return the spanned pages */ | ||
2213 | return zone_end_pfn - zone_start_pfn; | ||
2214 | } | ||
2215 | |||
2216 | /* | ||
2217 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | ||
2218 | * then all holes in the requested range will be accounted for | ||
2219 | */ | ||
2220 | unsigned long __init __absent_pages_in_range(int nid, | ||
2221 | unsigned long range_start_pfn, | ||
2222 | unsigned long range_end_pfn) | ||
2223 | { | ||
2224 | int i = 0; | ||
2225 | unsigned long prev_end_pfn = 0, hole_pages = 0; | ||
2226 | unsigned long start_pfn; | ||
2227 | |||
2228 | /* Find the end_pfn of the first active range of pfns in the node */ | ||
2229 | i = first_active_region_index_in_nid(nid); | ||
2230 | if (i == -1) | ||
2231 | return 0; | ||
2232 | |||
2233 | /* Account for ranges before physical memory on this node */ | ||
2234 | if (early_node_map[i].start_pfn > range_start_pfn) | ||
2235 | hole_pages = early_node_map[i].start_pfn - range_start_pfn; | ||
2236 | |||
2237 | prev_end_pfn = early_node_map[i].start_pfn; | ||
2238 | |||
2239 | /* Find all holes for the zone within the node */ | ||
2240 | for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { | ||
2241 | |||
2242 | /* No need to continue if prev_end_pfn is outside the zone */ | ||
2243 | if (prev_end_pfn >= range_end_pfn) | ||
2244 | break; | ||
2245 | |||
2246 | /* Make sure the end of the zone is not within the hole */ | ||
2247 | start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); | ||
2248 | prev_end_pfn = max(prev_end_pfn, range_start_pfn); | ||
2249 | |||
2250 | * Update the hole size count and move on | ||
2251 | if (start_pfn > range_start_pfn) { | ||
2252 | BUG_ON(prev_end_pfn > start_pfn); | ||
2253 | hole_pages += start_pfn - prev_end_pfn; | ||
2254 | } | ||
2255 | prev_end_pfn = early_node_map[i].end_pfn; | ||
2256 | } | ||
2257 | |||
2258 | /* Account for ranges past physical memory on this node */ | ||
2259 | if (range_end_pfn > prev_end_pfn) | ||
2260 | hole_pages = range_end_pfn - | ||
2261 | max(range_start_pfn, prev_end_pfn); | ||
2262 | |||
2263 | return hole_pages; | ||
2264 | } | ||
2265 | |||
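To make the hole accounting above concrete, here is a worked example in user space: given a sorted set of active PFN ranges for one node, the absent pages inside a requested window are the gaps before, between and after those ranges. The loop below is a simplified equivalent of __absent_pages_in_range(), not a copy of it, and the region data is invented:

/* Hole-counting sketch: [0,100) and [160,256) active, window [0,256) => 60 holes. */
#include <stdio.h>

struct region { unsigned long start_pfn, end_pfn; };

static unsigned long absent_pages(const struct region *map, int n,
                                  unsigned long start, unsigned long end)
{
    unsigned long holes = 0, prev_end = start;

    for (int i = 0; i < n; i++) {
        unsigned long s = map[i].start_pfn < start ? start : map[i].start_pfn;
        unsigned long e = map[i].end_pfn > end ? end : map[i].end_pfn;

        if (e <= prev_end)
            continue;               /* region lies entirely before the window */
        if (s > prev_end)
            holes += s - prev_end;  /* gap between the previous range and this one */
        prev_end = e;
    }
    if (end > prev_end)
        holes += end - prev_end;    /* gap after the last active range */
    return holes;
}

int main(void)
{
    struct region map[] = { { 0, 100 }, { 160, 256 } };

    printf("holes: %lu pages\n", absent_pages(map, 2, 0, 256));
    return 0;
}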
2266 | /** | ||
2267 | * absent_pages_in_range - Return number of page frames in holes within a range | ||
2268 | * @start_pfn: The start PFN to start searching for holes | ||
2269 | * @end_pfn: The end PFN to stop searching for holes | ||
2270 | * | ||
2271 | * It returns the number of page frames in memory holes within a range | ||
2272 | */ | ||
2273 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, | ||
2274 | unsigned long end_pfn) | ||
2275 | { | ||
2276 | return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); | ||
2277 | } | ||
2278 | |||
2279 | /* Return the number of page frames in holes in a zone on a node */ | ||
2280 | unsigned long __init zone_absent_pages_in_node(int nid, | ||
2281 | unsigned long zone_type, | ||
2282 | unsigned long *ignored) | ||
2283 | { | ||
2284 | unsigned long node_start_pfn, node_end_pfn; | ||
2285 | unsigned long zone_start_pfn, zone_end_pfn; | ||
2286 | |||
2287 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | ||
2288 | zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], | ||
2289 | node_start_pfn); | ||
2290 | zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], | ||
2291 | node_end_pfn); | ||
2292 | |||
2293 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | ||
2294 | } | ||
2295 | |||
2296 | /* Return the zone index a PFN is in */ | ||
2297 | int memmap_zone_idx(struct page *lmem_map) | ||
2298 | { | ||
2299 | int i; | ||
2300 | unsigned long phys_addr = virt_to_phys(lmem_map); | ||
2301 | unsigned long pfn = phys_addr >> PAGE_SHIFT; | ||
2302 | |||
2303 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
2304 | if (pfn < arch_zone_highest_possible_pfn[i]) | ||
2305 | break; | ||
2306 | |||
2307 | return i; | ||
2308 | } | ||
2309 | #else | ||
2310 | static inline unsigned long zone_spanned_pages_in_node(int nid, | ||
2311 | unsigned long zone_type, | ||
2312 | unsigned long *zones_size) | ||
2313 | { | ||
2314 | return zones_size[zone_type]; | ||
2315 | } | ||
2316 | |||
2317 | static inline unsigned long zone_absent_pages_in_node(int nid, | ||
2318 | unsigned long zone_type, | ||
2319 | unsigned long *zholes_size) | ||
2320 | { | ||
2321 | if (!zholes_size) | ||
2322 | return 0; | ||
2323 | |||
2324 | return zholes_size[zone_type]; | ||
2325 | } | ||
2326 | |||
2327 | static inline int memmap_zone_idx(struct page *lmem_map) | ||
2328 | { | ||
2329 | return MAX_NR_ZONES; | ||
2330 | } | ||
2331 | #endif | ||
2332 | |||
2333 | static void __init calculate_node_totalpages(struct pglist_data *pgdat, | ||
2334 | unsigned long *zones_size, unsigned long *zholes_size) | ||
2335 | { | ||
2336 | unsigned long realtotalpages, totalpages = 0; | ||
2337 | enum zone_type i; | ||
2338 | |||
2339 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
2340 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, | ||
2341 | zones_size); | ||
2342 | pgdat->node_spanned_pages = totalpages; | ||
2343 | |||
2344 | realtotalpages = totalpages; | ||
2345 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
2346 | realtotalpages -= | ||
2347 | zone_absent_pages_in_node(pgdat->node_id, i, | ||
2348 | zholes_size); | ||
2349 | pgdat->node_present_pages = realtotalpages; | ||
2350 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, | ||
2351 | realtotalpages); | ||
2352 | } | ||
2353 | |||
1975 | /* | 2354 | /* |
1976 | * Set up the zone data structures: | 2355 | * Set up the zone data structures: |
1977 | * - mark all pages reserved | 2356 | * - mark all pages reserved |
@@ -1981,7 +2360,7 @@ __meminit int init_currently_empty_zone(struct zone *zone, | |||
1981 | static void __meminit free_area_init_core(struct pglist_data *pgdat, | 2360 | static void __meminit free_area_init_core(struct pglist_data *pgdat, |
1982 | unsigned long *zones_size, unsigned long *zholes_size) | 2361 | unsigned long *zones_size, unsigned long *zholes_size) |
1983 | { | 2362 | { |
1984 | unsigned long j; | 2363 | enum zone_type j; |
1985 | int nid = pgdat->node_id; | 2364 | int nid = pgdat->node_id; |
1986 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | 2365 | unsigned long zone_start_pfn = pgdat->node_start_pfn; |
1987 | int ret; | 2366 | int ret; |
@@ -1993,21 +2372,46 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
1993 | 2372 | ||
1994 | for (j = 0; j < MAX_NR_ZONES; j++) { | 2373 | for (j = 0; j < MAX_NR_ZONES; j++) { |
1995 | struct zone *zone = pgdat->node_zones + j; | 2374 | struct zone *zone = pgdat->node_zones + j; |
1996 | unsigned long size, realsize; | 2375 | unsigned long size, realsize, memmap_pages; |
1997 | 2376 | ||
1998 | realsize = size = zones_size[j]; | 2377 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
1999 | if (zholes_size) | 2378 | realsize = size - zone_absent_pages_in_node(nid, j, |
2000 | realsize -= zholes_size[j]; | 2379 | zholes_size); |
2001 | 2380 | ||
2002 | if (j < ZONE_HIGHMEM) | 2381 | /* |
2382 | * Adjust realsize so that it accounts for how much memory | ||
2383 | * is used by this zone for memmap. This affects the watermark | ||
2384 | * and per-cpu initialisations | ||
2385 | */ | ||
2386 | memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; | ||
2387 | if (realsize >= memmap_pages) { | ||
2388 | realsize -= memmap_pages; | ||
2389 | printk(KERN_DEBUG | ||
2390 | " %s zone: %lu pages used for memmap\n", | ||
2391 | zone_names[j], memmap_pages); | ||
2392 | } else | ||
2393 | printk(KERN_WARNING | ||
2394 | " %s zone: %lu pages exceeds realsize %lu\n", | ||
2395 | zone_names[j], memmap_pages, realsize); | ||
2396 | |||
2397 | /* Account for reserved DMA pages */ | ||
2398 | if (j == ZONE_DMA && realsize > dma_reserve) { | ||
2399 | realsize -= dma_reserve; | ||
2400 | printk(KERN_DEBUG " DMA zone: %lu pages reserved\n", | ||
2401 | dma_reserve); | ||
2402 | } | ||
2403 | |||
2404 | if (!is_highmem_idx(j)) | ||
2003 | nr_kernel_pages += realsize; | 2405 | nr_kernel_pages += realsize; |
2004 | nr_all_pages += realsize; | 2406 | nr_all_pages += realsize; |
2005 | 2407 | ||
2006 | zone->spanned_pages = size; | 2408 | zone->spanned_pages = size; |
2007 | zone->present_pages = realsize; | 2409 | zone->present_pages = realsize; |
2008 | #ifdef CONFIG_NUMA | 2410 | #ifdef CONFIG_NUMA |
2009 | zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio) | 2411 | zone->node = nid; |
2412 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | ||
2010 | / 100; | 2413 | / 100; |
2414 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | ||
2011 | #endif | 2415 | #endif |
2012 | zone->name = zone_names[j]; | 2416 | zone->name = zone_names[j]; |
2013 | spin_lock_init(&zone->lock); | 2417 | spin_lock_init(&zone->lock); |
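The memmap accounting added above subtracts the pages consumed by a zone's struct page array from realsize before watermarks and per-cpu batch sizes are derived from it. A back-of-the-envelope version of that arithmetic; the struct page size and zone size are example values, not kernel constants:

/* memmap accounting sketch: 1GB zone of 4K pages, assumed 56-byte struct page. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define STRUCT_PAGE_SIZE 56         /* assumption for the example */

int main(void)
{
    unsigned long size = 262144;    /* spanned pages */
    unsigned long realsize = size;  /* no holes in this example */
    unsigned long memmap_pages = (size * STRUCT_PAGE_SIZE) >> PAGE_SHIFT;

    if (realsize >= memmap_pages)
        realsize -= memmap_pages;   /* watermarks are computed from this value */

    printf("memmap uses %lu pages, realsize drops to %lu\n", memmap_pages, realsize);
    return 0;
}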
@@ -2067,8 +2471,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) | |||
2067 | /* | 2471 | /* |
2068 | * With no DISCONTIG, the global mem_map is just set as node 0's | 2472 | * With no DISCONTIG, the global mem_map is just set as node 0's |
2069 | */ | 2473 | */ |
2070 | if (pgdat == NODE_DATA(0)) | 2474 | if (pgdat == NODE_DATA(0)) { |
2071 | mem_map = NODE_DATA(0)->node_mem_map; | 2475 | mem_map = NODE_DATA(0)->node_mem_map; |
2476 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
2477 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) | ||
2478 | mem_map -= pgdat->node_start_pfn; | ||
2479 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
2480 | } | ||
2072 | #endif | 2481 | #endif |
2073 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 2482 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
2074 | } | 2483 | } |
@@ -2079,13 +2488,255 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, | |||
2079 | { | 2488 | { |
2080 | pgdat->node_id = nid; | 2489 | pgdat->node_id = nid; |
2081 | pgdat->node_start_pfn = node_start_pfn; | 2490 | pgdat->node_start_pfn = node_start_pfn; |
2082 | calculate_zone_totalpages(pgdat, zones_size, zholes_size); | 2491 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
2083 | 2492 | ||
2084 | alloc_node_mem_map(pgdat); | 2493 | alloc_node_mem_map(pgdat); |
2085 | 2494 | ||
2086 | free_area_init_core(pgdat, zones_size, zholes_size); | 2495 | free_area_init_core(pgdat, zones_size, zholes_size); |
2087 | } | 2496 | } |
2088 | 2497 | ||
2498 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
2499 | /** | ||
2500 | * add_active_range - Register a range of PFNs backed by physical memory | ||
2501 | * @nid: The node ID the range resides on | ||
2502 | * @start_pfn: The start PFN of the available physical memory | ||
2503 | * @end_pfn: The end PFN of the available physical memory | ||
2504 | * | ||
2505 | * These ranges are stored in an early_node_map[] and later used by | ||
2506 | * free_area_init_nodes() to calculate zone sizes and holes. If the | ||
2507 | * range spans a memory hole, it is up to the architecture to ensure | ||
2508 | * the memory is not freed by the bootmem allocator. If possible | ||
2509 | * the range being registered will be merged with existing ranges. | ||
2510 | */ | ||
2511 | void __init add_active_range(unsigned int nid, unsigned long start_pfn, | ||
2512 | unsigned long end_pfn) | ||
2513 | { | ||
2514 | int i; | ||
2515 | |||
2516 | printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " | ||
2517 | "%d entries of %d used\n", | ||
2518 | nid, start_pfn, end_pfn, | ||
2519 | nr_nodemap_entries, MAX_ACTIVE_REGIONS); | ||
2520 | |||
2521 | /* Merge with existing active regions if possible */ | ||
2522 | for (i = 0; i < nr_nodemap_entries; i++) { | ||
2523 | if (early_node_map[i].nid != nid) | ||
2524 | continue; | ||
2525 | |||
2526 | /* Skip if an existing region covers this new one */ | ||
2527 | if (start_pfn >= early_node_map[i].start_pfn && | ||
2528 | end_pfn <= early_node_map[i].end_pfn) | ||
2529 | return; | ||
2530 | |||
2531 | /* Merge forward if suitable */ | ||
2532 | if (start_pfn <= early_node_map[i].end_pfn && | ||
2533 | end_pfn > early_node_map[i].end_pfn) { | ||
2534 | early_node_map[i].end_pfn = end_pfn; | ||
2535 | return; | ||
2536 | } | ||
2537 | |||
2538 | /* Merge backward if suitable */ | ||
2539 | if (start_pfn < early_node_map[i].end_pfn && | ||
2540 | end_pfn >= early_node_map[i].start_pfn) { | ||
2541 | early_node_map[i].start_pfn = start_pfn; | ||
2542 | return; | ||
2543 | } | ||
2544 | } | ||
2545 | |||
2546 | /* Check that early_node_map is large enough */ | ||
2547 | if (i >= MAX_ACTIVE_REGIONS) { | ||
2548 | printk(KERN_CRIT "More than %d memory regions, truncating\n", | ||
2549 | MAX_ACTIVE_REGIONS); | ||
2550 | return; | ||
2551 | } | ||
2552 | |||
2553 | early_node_map[i].nid = nid; | ||
2554 | early_node_map[i].start_pfn = start_pfn; | ||
2555 | early_node_map[i].end_pfn = end_pfn; | ||
2556 | nr_nodemap_entries = i + 1; | ||
2557 | } | ||
2558 | |||
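The kernel-doc above describes the registration side of the new zone-sizing scheme. A hedged sketch of how an architecture might call it while walking its firmware memory map follows; the boot_mem_map table, its field names and the E820_RAM check are illustrative assumptions, not taken from this patch.

/*
 * Hedged sketch (assumed arch-side caller): register every usable RAM
 * range so free_area_init_nodes() can size the zones later.
 */
static void __init register_active_regions(int nid)
{
	int i;

	for (i = 0; i < nr_boot_mem_entries; i++) {
		unsigned long start_pfn, end_pfn;

		if (boot_mem_map[i].type != E820_RAM)	/* assumed firmware map */
			continue;

		start_pfn = PFN_UP(boot_mem_map[i].addr);
		end_pfn = PFN_DOWN(boot_mem_map[i].addr + boot_mem_map[i].size);
		if (start_pfn < end_pfn)
			add_active_range(nid, start_pfn, end_pfn);
	}
}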
2559 | /** | ||
2560 | * shrink_active_range - Shrink an existing registered range of PFNs | ||
2561 | * @nid: The node id the range is on that should be shrunk | ||
2562 | * @old_end_pfn: The old end PFN of the range | ||
2563 | * @new_end_pfn: The new end PFN of the range | ||
2564 | * | ||
2565 | * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. | ||
2566 | * The map is kept at the end of the physical page range that has already been | ||
2567 | * registered with add_active_range(). This function allows an arch to shrink | ||
2568 | * an existing registered range. | ||
2569 | */ | ||
2570 | void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, | ||
2571 | unsigned long new_end_pfn) | ||
2572 | { | ||
2573 | int i; | ||
2574 | |||
2575 | /* Find the old active region end and shrink */ | ||
2576 | for_each_active_range_index_in_nid(i, nid) | ||
2577 | if (early_node_map[i].end_pfn == old_end_pfn) { | ||
2578 | early_node_map[i].end_pfn = new_end_pfn; | ||
2579 | break; | ||
2580 | } | ||
2581 | } | ||
2582 | |||
2583 | /** | ||
2584 | * remove_all_active_ranges - Remove all currently registered regions | ||
2585 | * During discovery, it may be found that a table like SRAT is invalid | ||
2586 | * and an alternative discovery method must be used. This function removes | ||
2587 | * all currently registered regions. | ||
2588 | */ | ||
2589 | void __init remove_all_active_ranges(void) | ||
2590 | { | ||
2591 | memset(early_node_map, 0, sizeof(early_node_map)); | ||
2592 | nr_nodemap_entries = 0; | ||
2593 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
2594 | memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); | ||
2595 | memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); | ||
2596 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
2597 | } | ||
2598 | |||
2599 | /* Compare two active node_active_regions */ | ||
2600 | static int __init cmp_node_active_region(const void *a, const void *b) | ||
2601 | { | ||
2602 | struct node_active_region *arange = (struct node_active_region *)a; | ||
2603 | struct node_active_region *brange = (struct node_active_region *)b; | ||
2604 | |||
2605 | /* Done this way to avoid overflows */ | ||
2606 | if (arange->start_pfn > brange->start_pfn) | ||
2607 | return 1; | ||
2608 | if (arange->start_pfn < brange->start_pfn) | ||
2609 | return -1; | ||
2610 | |||
2611 | return 0; | ||
2612 | } | ||
2613 | |||
2614 | /* sort the node_map by start_pfn */ | ||
2615 | static void __init sort_node_map(void) | ||
2616 | { | ||
2617 | sort(early_node_map, (size_t)nr_nodemap_entries, | ||
2618 | sizeof(struct node_active_region), | ||
2619 | cmp_node_active_region, NULL); | ||
2620 | } | ||
2621 | |||
2622 | /* Find the lowest pfn for a node. This depends on a sorted early_node_map */ | ||
2623 | unsigned long __init find_min_pfn_for_node(unsigned long nid) | ||
2624 | { | ||
2625 | int i; | ||
2626 | |||
2627 | /* Assuming a sorted map, the first range found has the starting pfn */ | ||
2628 | for_each_active_range_index_in_nid(i, nid) | ||
2629 | return early_node_map[i].start_pfn; | ||
2630 | |||
2631 | printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid); | ||
2632 | return 0; | ||
2633 | } | ||
2634 | |||
2635 | /** | ||
2636 | * find_min_pfn_with_active_regions - Find the minimum PFN registered | ||
2637 | * | ||
2638 | * It returns the minimum PFN based on information provided via | ||
2639 | * add_active_range() | ||
2640 | */ | ||
2641 | unsigned long __init find_min_pfn_with_active_regions(void) | ||
2642 | { | ||
2643 | return find_min_pfn_for_node(MAX_NUMNODES); | ||
2644 | } | ||
2645 | |||
2646 | /** | ||
2647 | * find_max_pfn_with_active_regions - Find the maximum PFN registered | ||
2648 | * | ||
2649 | * It returns the maximum PFN based on information provided via | ||
2650 | * add_active_range() | ||
2651 | */ | ||
2652 | unsigned long __init find_max_pfn_with_active_regions(void) | ||
2653 | { | ||
2654 | int i; | ||
2655 | unsigned long max_pfn = 0; | ||
2656 | |||
2657 | for (i = 0; i < nr_nodemap_entries; i++) | ||
2658 | max_pfn = max(max_pfn, early_node_map[i].end_pfn); | ||
2659 | |||
2660 | return max_pfn; | ||
2661 | } | ||
2662 | |||
2663 | /** | ||
2664 | * free_area_init_nodes - Initialise all pg_data_t and zone data | ||
2665 | * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA | ||
2666 | * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32 | ||
2667 | * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL | ||
2668 | * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM | ||
2669 | * | ||
2670 | * This will call free_area_init_node() for each active node in the system. | ||
2671 | * Using the page ranges provided by add_active_range(), the size of each | ||
2672 | * zone in each node and their holes is calculated. If the maximum PFN | ||
2673 | * between two adjacent zones match, it is assumed that the zone is empty. | ||
2674 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed | ||
2675 | * that arch_max_dma32_pfn has no pages. It is also assumed that a zone | ||
2676 | * starts where the previous one ended. For example, ZONE_DMA32 starts | ||
2677 | * at arch_max_dma_pfn. | ||
2678 | */ | ||
2679 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) | ||
2680 | { | ||
2681 | unsigned long nid; | ||
2682 | enum zone_type i; | ||
2683 | |||
2684 | /* Record where the zone boundaries are */ | ||
2685 | memset(arch_zone_lowest_possible_pfn, 0, | ||
2686 | sizeof(arch_zone_lowest_possible_pfn)); | ||
2687 | memset(arch_zone_highest_possible_pfn, 0, | ||
2688 | sizeof(arch_zone_highest_possible_pfn)); | ||
2689 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); | ||
2690 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; | ||
2691 | for (i = 1; i < MAX_NR_ZONES; i++) { | ||
2692 | arch_zone_lowest_possible_pfn[i] = | ||
2693 | arch_zone_highest_possible_pfn[i-1]; | ||
2694 | arch_zone_highest_possible_pfn[i] = | ||
2695 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); | ||
2696 | } | ||
2697 | |||
2698 | /* Regions in the early_node_map can be in any order */ | ||
2699 | sort_node_map(); | ||
2700 | |||
2701 | /* Print out the zone ranges */ | ||
2702 | printk("Zone PFN ranges:\n"); | ||
2703 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
2704 | printk(" %-8s %8lu -> %8lu\n", | ||
2705 | zone_names[i], | ||
2706 | arch_zone_lowest_possible_pfn[i], | ||
2707 | arch_zone_highest_possible_pfn[i]); | ||
2708 | |||
2709 | /* Print out the early_node_map[] */ | ||
2710 | printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); | ||
2711 | for (i = 0; i < nr_nodemap_entries; i++) | ||
2712 | printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, | ||
2713 | early_node_map[i].start_pfn, | ||
2714 | early_node_map[i].end_pfn); | ||
2715 | |||
2716 | /* Initialise every node */ | ||
2717 | for_each_online_node(nid) { | ||
2718 | pg_data_t *pgdat = NODE_DATA(nid); | ||
2719 | free_area_init_node(nid, pgdat, NULL, | ||
2720 | find_min_pfn_for_node(nid), NULL); | ||
2721 | } | ||
2722 | } | ||
2723 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
2724 | |||
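Taken together with the kernel-doc above, the intended calling convention is: the architecture fills a max_zone_pfns[] array and makes a single call instead of computing per-node zone sizes itself. A hedged sketch, where MAX_DMA_PFN, MAX_DMA32_PFN and end_pfn stand in for arch-provided limits:

/* Hedged sketch of an arch paging_init() helper using the new API. */
void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;		/* assumed arch constant */
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;	/* assumed arch constant */
	max_zone_pfns[ZONE_NORMAL] = end_pfn;		/* assumed arch variable */

	free_area_init_nodes(max_zone_pfns);
}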
2725 | /** | ||
2726 | * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA | ||
2727 | * @new_dma_reserve - The number of pages to mark reserved | ||
2728 | * | ||
2729 | * The per-cpu batchsize and zone watermarks are determined by present_pages. | ||
2730 | * In the DMA zone, a significant percentage may be consumed by kernel image | ||
2731 | * and other unfreeable allocations which can skew the watermarks badly. This | ||
2732 | * function may optionally be used to account for unfreeable pages in | ||
2733 | * ZONE_DMA. The effect will be lower watermarks and smaller per-cpu batchsize | ||
2734 | */ | ||
2735 | void __init set_dma_reserve(unsigned long new_dma_reserve) | ||
2736 | { | ||
2737 | dma_reserve = new_dma_reserve; | ||
2738 | } | ||
2739 | |||
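A hedged usage note: the expected caller is arch setup code that already knows how much low memory it has pinned before the zones are initialised. For example (the counter name is an illustrative assumption):

	/* Assumed arch counter of unfreeable pages reserved in ZONE_DMA. */
	set_dma_reserve(dma_reserve_pages);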
2089 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 2740 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
2090 | static bootmem_data_t contig_bootmem_data; | 2741 | static bootmem_data_t contig_bootmem_data; |
2091 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; | 2742 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; |
@@ -2129,7 +2780,7 @@ static void calculate_totalreserve_pages(void) | |||
2129 | { | 2780 | { |
2130 | struct pglist_data *pgdat; | 2781 | struct pglist_data *pgdat; |
2131 | unsigned long reserve_pages = 0; | 2782 | unsigned long reserve_pages = 0; |
2132 | int i, j; | 2783 | enum zone_type i, j; |
2133 | 2784 | ||
2134 | for_each_online_pgdat(pgdat) { | 2785 | for_each_online_pgdat(pgdat) { |
2135 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2786 | for (i = 0; i < MAX_NR_ZONES; i++) { |
@@ -2162,7 +2813,7 @@ static void calculate_totalreserve_pages(void) | |||
2162 | static void setup_per_zone_lowmem_reserve(void) | 2813 | static void setup_per_zone_lowmem_reserve(void) |
2163 | { | 2814 | { |
2164 | struct pglist_data *pgdat; | 2815 | struct pglist_data *pgdat; |
2165 | int j, idx; | 2816 | enum zone_type j, idx; |
2166 | 2817 | ||
2167 | for_each_online_pgdat(pgdat) { | 2818 | for_each_online_pgdat(pgdat) { |
2168 | for (j = 0; j < MAX_NR_ZONES; j++) { | 2819 | for (j = 0; j < MAX_NR_ZONES; j++) { |
@@ -2171,9 +2822,12 @@ static void setup_per_zone_lowmem_reserve(void) | |||
2171 | 2822 | ||
2172 | zone->lowmem_reserve[j] = 0; | 2823 | zone->lowmem_reserve[j] = 0; |
2173 | 2824 | ||
2174 | for (idx = j-1; idx >= 0; idx--) { | 2825 | idx = j; |
2826 | while (idx) { | ||
2175 | struct zone *lower_zone; | 2827 | struct zone *lower_zone; |
2176 | 2828 | ||
2829 | idx--; | ||
2830 | |||
2177 | if (sysctl_lowmem_reserve_ratio[idx] < 1) | 2831 | if (sysctl_lowmem_reserve_ratio[idx] < 1) |
2178 | sysctl_lowmem_reserve_ratio[idx] = 1; | 2832 | sysctl_lowmem_reserve_ratio[idx] = 1; |
2179 | 2833 | ||
@@ -2314,10 +2968,26 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | |||
2314 | return rc; | 2968 | return rc; |
2315 | 2969 | ||
2316 | for_each_zone(zone) | 2970 | for_each_zone(zone) |
2317 | zone->min_unmapped_ratio = (zone->present_pages * | 2971 | zone->min_unmapped_pages = (zone->present_pages * |
2318 | sysctl_min_unmapped_ratio) / 100; | 2972 | sysctl_min_unmapped_ratio) / 100; |
2319 | return 0; | 2973 | return 0; |
2320 | } | 2974 | } |
2975 | |||
2976 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | ||
2977 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | ||
2978 | { | ||
2979 | struct zone *zone; | ||
2980 | int rc; | ||
2981 | |||
2982 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | ||
2983 | if (rc) | ||
2984 | return rc; | ||
2985 | |||
2986 | for_each_zone(zone) | ||
2987 | zone->min_slab_pages = (zone->present_pages * | ||
2988 | sysctl_min_slab_ratio) / 100; | ||
2989 | return 0; | ||
2990 | } | ||
2321 | #endif | 2991 | #endif |
2322 | 2992 | ||
2323 | /* | 2993 | /* |
@@ -2363,7 +3033,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
2363 | return 0; | 3033 | return 0; |
2364 | } | 3034 | } |
2365 | 3035 | ||
2366 | __initdata int hashdist = HASHDIST_DEFAULT; | 3036 | int hashdist = HASHDIST_DEFAULT; |
2367 | 3037 | ||
2368 | #ifdef CONFIG_NUMA | 3038 | #ifdef CONFIG_NUMA |
2369 | static int __init set_hashdist(char *str) | 3039 | static int __init set_hashdist(char *str) |
diff --git a/mm/page_io.c b/mm/page_io.c index 88029948d0..d4840ecbf8 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -52,14 +52,29 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err) | |||
52 | if (bio->bi_size) | 52 | if (bio->bi_size) |
53 | return 1; | 53 | return 1; |
54 | 54 | ||
55 | if (!uptodate) | 55 | if (!uptodate) { |
56 | SetPageError(page); | 56 | SetPageError(page); |
57 | /* | ||
58 | * We failed to write the page out to swap-space. | ||
59 | * Re-dirty the page in order to avoid it being reclaimed. | ||
60 | * Also print a dire warning that things will go BAD (tm) | ||
61 | * very quickly. | ||
62 | * | ||
63 | * Also clear PG_reclaim to avoid rotate_reclaimable_page() | ||
64 | */ | ||
65 | set_page_dirty(page); | ||
66 | printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", | ||
67 | imajor(bio->bi_bdev->bd_inode), | ||
68 | iminor(bio->bi_bdev->bd_inode), | ||
69 | (unsigned long long)bio->bi_sector); | ||
70 | ClearPageReclaim(page); | ||
71 | } | ||
57 | end_page_writeback(page); | 72 | end_page_writeback(page); |
58 | bio_put(bio); | 73 | bio_put(bio); |
59 | return 0; | 74 | return 0; |
60 | } | 75 | } |
61 | 76 | ||
62 | static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) | 77 | int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) |
63 | { | 78 | { |
64 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 79 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
65 | struct page *page = bio->bi_io_vec[0].bv_page; | 80 | struct page *page = bio->bi_io_vec[0].bv_page; |
@@ -70,6 +85,10 @@ static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) | |||
70 | if (!uptodate) { | 85 | if (!uptodate) { |
71 | SetPageError(page); | 86 | SetPageError(page); |
72 | ClearPageUptodate(page); | 87 | ClearPageUptodate(page); |
88 | printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", | ||
89 | imajor(bio->bi_bdev->bd_inode), | ||
90 | iminor(bio->bi_bdev->bd_inode), | ||
91 | (unsigned long long)bio->bi_sector); | ||
73 | } else { | 92 | } else { |
74 | SetPageUptodate(page); | 93 | SetPageUptodate(page); |
75 | } | 94 | } |
@@ -137,10 +156,12 @@ out: | |||
137 | * We use end_swap_bio_read() even for writes, because it happens to do what | 156 | * We use end_swap_bio_read() even for writes, because it happens to do what |
138 | * we want. | 157 | * we want. |
139 | */ | 158 | */ |
140 | int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) | 159 | int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page, |
160 | struct bio **bio_chain) | ||
141 | { | 161 | { |
142 | struct bio *bio; | 162 | struct bio *bio; |
143 | int ret = 0; | 163 | int ret = 0; |
164 | int bio_rw; | ||
144 | 165 | ||
145 | lock_page(page); | 166 | lock_page(page); |
146 | 167 | ||
@@ -151,11 +172,22 @@ int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) | |||
151 | goto out; | 172 | goto out; |
152 | } | 173 | } |
153 | 174 | ||
154 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | 175 | bio_rw = rw; |
155 | wait_on_page_locked(page); | 176 | if (!bio_chain) |
156 | 177 | bio_rw |= (1 << BIO_RW_SYNC); | |
157 | if (!PageUptodate(page) || PageError(page)) | 178 | if (bio_chain) |
158 | ret = -EIO; | 179 | bio_get(bio); |
180 | submit_bio(bio_rw, bio); | ||
181 | if (bio_chain == NULL) { | ||
182 | wait_on_page_locked(page); | ||
183 | |||
184 | if (!PageUptodate(page) || PageError(page)) | ||
185 | ret = -EIO; | ||
186 | } | ||
187 | if (bio_chain) { | ||
188 | bio->bi_private = *bio_chain; | ||
189 | *bio_chain = bio; | ||
190 | } | ||
159 | out: | 191 | out: |
160 | return ret; | 192 | return ret; |
161 | } | 193 | } |
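The new bio_chain argument lets a caller queue many swap writes asynchronously: each submitted bio is linked through bi_private and handed back instead of being waited on. A hedged sketch of how such a caller (software suspend is the intended user) might drain the chain; the helper name and the decision not to free the pages here are illustrative assumptions:

/* Hedged sketch of a caller-side helper that waits for a bio chain. */
static int wait_on_bio_chain(struct bio **bio_chain)
{
	struct bio *bio, *next;
	int ret = 0;

	for (bio = *bio_chain; bio; bio = next) {
		struct page *page = bio->bi_io_vec[0].bv_page;

		/* the end_io handler unlocks the page when I/O completes */
		wait_on_page_locked(page);
		if (!PageUptodate(page) || PageError(page))
			ret = -EIO;
		next = bio->bi_private;	/* link written by rw_swap_page_sync() */
		bio_put(bio);		/* drop the extra bio_get() reference */
	}
	*bio_chain = NULL;
	return ret;
}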
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -434,6 +434,71 @@ int page_referenced(struct page *page, int is_locked) | |||
434 | return referenced; | 434 | return referenced; |
435 | } | 435 | } |
436 | 436 | ||
437 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | ||
438 | { | ||
439 | struct mm_struct *mm = vma->vm_mm; | ||
440 | unsigned long address; | ||
441 | pte_t *pte, entry; | ||
442 | spinlock_t *ptl; | ||
443 | int ret = 0; | ||
444 | |||
445 | address = vma_address(page, vma); | ||
446 | if (address == -EFAULT) | ||
447 | goto out; | ||
448 | |||
449 | pte = page_check_address(page, mm, address, &ptl); | ||
450 | if (!pte) | ||
451 | goto out; | ||
452 | |||
453 | if (!pte_dirty(*pte) && !pte_write(*pte)) | ||
454 | goto unlock; | ||
455 | |||
456 | entry = ptep_get_and_clear(mm, address, pte); | ||
457 | entry = pte_mkclean(entry); | ||
458 | entry = pte_wrprotect(entry); | ||
459 | ptep_establish(vma, address, pte, entry); | ||
460 | lazy_mmu_prot_update(entry); | ||
461 | ret = 1; | ||
462 | |||
463 | unlock: | ||
464 | pte_unmap_unlock(pte, ptl); | ||
465 | out: | ||
466 | return ret; | ||
467 | } | ||
468 | |||
469 | static int page_mkclean_file(struct address_space *mapping, struct page *page) | ||
470 | { | ||
471 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
472 | struct vm_area_struct *vma; | ||
473 | struct prio_tree_iter iter; | ||
474 | int ret = 0; | ||
475 | |||
476 | BUG_ON(PageAnon(page)); | ||
477 | |||
478 | spin_lock(&mapping->i_mmap_lock); | ||
479 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
480 | if (vma->vm_flags & VM_SHARED) | ||
481 | ret += page_mkclean_one(page, vma); | ||
482 | } | ||
483 | spin_unlock(&mapping->i_mmap_lock); | ||
484 | return ret; | ||
485 | } | ||
486 | |||
487 | int page_mkclean(struct page *page) | ||
488 | { | ||
489 | int ret = 0; | ||
490 | |||
491 | BUG_ON(!PageLocked(page)); | ||
492 | |||
493 | if (page_mapped(page)) { | ||
494 | struct address_space *mapping = page_mapping(page); | ||
495 | if (mapping) | ||
496 | ret = page_mkclean_file(mapping, page); | ||
497 | } | ||
498 | |||
499 | return ret; | ||
500 | } | ||
501 | |||
437 | /** | 502 | /** |
438 | * page_set_anon_rmap - setup new anonymous rmap | 503 | * page_set_anon_rmap - setup new anonymous rmap |
439 | * @page: the page to add the mapping to | 504 | * @page: the page to add the mapping to |
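page_mkclean() above is the hook for accurate dirty accounting of shared file mappings: it cleans and write-protects every pte mapping the page, so the next store faults and re-enters the dirty-tracking path. A hedged sketch of a writeback-side caller (the surrounding condition is illustrative, not part of this patch):

	/*
	 * Hedged sketch: before writing back a locked pagecache page,
	 * fold any pte dirty bits into the page's own dirty state.
	 */
	if (page_mapped(page) && page_mkclean(page))
		set_page_dirty(page);	/* at least one pte was writable/dirty */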
diff --git a/mm/shmem.c b/mm/shmem.c index db21c51531..bb8ca7ef70 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -26,6 +26,8 @@ | |||
26 | #include <linux/module.h> | 26 | #include <linux/module.h> |
27 | #include <linux/init.h> | 27 | #include <linux/init.h> |
28 | #include <linux/fs.h> | 28 | #include <linux/fs.h> |
29 | #include <linux/xattr.h> | ||
30 | #include <linux/generic_acl.h> | ||
29 | #include <linux/mm.h> | 31 | #include <linux/mm.h> |
30 | #include <linux/mman.h> | 32 | #include <linux/mman.h> |
31 | #include <linux/file.h> | 33 | #include <linux/file.h> |
@@ -45,6 +47,7 @@ | |||
45 | #include <linux/namei.h> | 47 | #include <linux/namei.h> |
46 | #include <linux/ctype.h> | 48 | #include <linux/ctype.h> |
47 | #include <linux/migrate.h> | 49 | #include <linux/migrate.h> |
50 | #include <linux/highmem.h> | ||
48 | 51 | ||
49 | #include <asm/uaccess.h> | 52 | #include <asm/uaccess.h> |
50 | #include <asm/div64.h> | 53 | #include <asm/div64.h> |
@@ -176,6 +179,7 @@ static const struct address_space_operations shmem_aops; | |||
176 | static struct file_operations shmem_file_operations; | 179 | static struct file_operations shmem_file_operations; |
177 | static struct inode_operations shmem_inode_operations; | 180 | static struct inode_operations shmem_inode_operations; |
178 | static struct inode_operations shmem_dir_inode_operations; | 181 | static struct inode_operations shmem_dir_inode_operations; |
182 | static struct inode_operations shmem_special_inode_operations; | ||
179 | static struct vm_operations_struct shmem_vm_ops; | 183 | static struct vm_operations_struct shmem_vm_ops; |
180 | 184 | ||
181 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | 185 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { |
@@ -636,7 +640,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
636 | struct page *page = NULL; | 640 | struct page *page = NULL; |
637 | int error; | 641 | int error; |
638 | 642 | ||
639 | if (attr->ia_valid & ATTR_SIZE) { | 643 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { |
640 | if (attr->ia_size < inode->i_size) { | 644 | if (attr->ia_size < inode->i_size) { |
641 | /* | 645 | /* |
642 | * If truncating down to a partial page, then | 646 | * If truncating down to a partial page, then |
@@ -669,6 +673,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
669 | error = inode_change_ok(inode, attr); | 673 | error = inode_change_ok(inode, attr); |
670 | if (!error) | 674 | if (!error) |
671 | error = inode_setattr(inode, attr); | 675 | error = inode_setattr(inode, attr); |
676 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
677 | if (!error && (attr->ia_valid & ATTR_MODE)) | ||
678 | error = generic_acl_chmod(inode, &shmem_acl_ops); | ||
679 | #endif | ||
672 | if (page) | 680 | if (page) |
673 | page_cache_release(page); | 681 | page_cache_release(page); |
674 | return error; | 682 | return error; |
@@ -1350,7 +1358,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1350 | inode->i_mode = mode; | 1358 | inode->i_mode = mode; |
1351 | inode->i_uid = current->fsuid; | 1359 | inode->i_uid = current->fsuid; |
1352 | inode->i_gid = current->fsgid; | 1360 | inode->i_gid = current->fsgid; |
1353 | inode->i_blksize = PAGE_CACHE_SIZE; | ||
1354 | inode->i_blocks = 0; | 1361 | inode->i_blocks = 0; |
1355 | inode->i_mapping->a_ops = &shmem_aops; | 1362 | inode->i_mapping->a_ops = &shmem_aops; |
1356 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; | 1363 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; |
@@ -1362,6 +1369,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1362 | 1369 | ||
1363 | switch (mode & S_IFMT) { | 1370 | switch (mode & S_IFMT) { |
1364 | default: | 1371 | default: |
1372 | inode->i_op = &shmem_special_inode_operations; | ||
1365 | init_special_inode(inode, mode, dev); | 1373 | init_special_inode(inode, mode, dev); |
1366 | break; | 1374 | break; |
1367 | case S_IFREG: | 1375 | case S_IFREG: |
@@ -1371,7 +1379,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1371 | &sbinfo->policy_nodes); | 1379 | &sbinfo->policy_nodes); |
1372 | break; | 1380 | break; |
1373 | case S_IFDIR: | 1381 | case S_IFDIR: |
1374 | inode->i_nlink++; | 1382 | inc_nlink(inode); |
1375 | /* Some things misbehave if size == 0 on a directory */ | 1383 | /* Some things misbehave if size == 0 on a directory */ |
1376 | inode->i_size = 2 * BOGO_DIRENT_SIZE; | 1384 | inode->i_size = 2 * BOGO_DIRENT_SIZE; |
1377 | inode->i_op = &shmem_dir_inode_operations; | 1385 | inode->i_op = &shmem_dir_inode_operations; |
@@ -1682,7 +1690,11 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | |||
1682 | iput(inode); | 1690 | iput(inode); |
1683 | return error; | 1691 | return error; |
1684 | } | 1692 | } |
1685 | error = 0; | 1693 | } |
1694 | error = shmem_acl_init(inode, dir); | ||
1695 | if (error) { | ||
1696 | iput(inode); | ||
1697 | return error; | ||
1686 | } | 1698 | } |
1687 | if (dir->i_mode & S_ISGID) { | 1699 | if (dir->i_mode & S_ISGID) { |
1688 | inode->i_gid = dir->i_gid; | 1700 | inode->i_gid = dir->i_gid; |
@@ -1703,7 +1715,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
1703 | 1715 | ||
1704 | if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) | 1716 | if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) |
1705 | return error; | 1717 | return error; |
1706 | dir->i_nlink++; | 1718 | inc_nlink(dir); |
1707 | return 0; | 1719 | return 0; |
1708 | } | 1720 | } |
1709 | 1721 | ||
@@ -1738,7 +1750,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr | |||
1738 | 1750 | ||
1739 | dir->i_size += BOGO_DIRENT_SIZE; | 1751 | dir->i_size += BOGO_DIRENT_SIZE; |
1740 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; | 1752 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; |
1741 | inode->i_nlink++; | 1753 | inc_nlink(inode); |
1742 | atomic_inc(&inode->i_count); /* New dentry reference */ | 1754 | atomic_inc(&inode->i_count); /* New dentry reference */ |
1743 | dget(dentry); /* Extra pinning count for the created dentry */ | 1755 | dget(dentry); /* Extra pinning count for the created dentry */ |
1744 | d_instantiate(dentry, inode); | 1756 | d_instantiate(dentry, inode); |
@@ -1760,7 +1772,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) | |||
1760 | 1772 | ||
1761 | dir->i_size -= BOGO_DIRENT_SIZE; | 1773 | dir->i_size -= BOGO_DIRENT_SIZE; |
1762 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; | 1774 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; |
1763 | inode->i_nlink--; | 1775 | drop_nlink(inode); |
1764 | dput(dentry); /* Undo the count from "create" - this does all the work */ | 1776 | dput(dentry); /* Undo the count from "create" - this does all the work */ |
1765 | return 0; | 1777 | return 0; |
1766 | } | 1778 | } |
@@ -1770,8 +1782,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) | |||
1770 | if (!simple_empty(dentry)) | 1782 | if (!simple_empty(dentry)) |
1771 | return -ENOTEMPTY; | 1783 | return -ENOTEMPTY; |
1772 | 1784 | ||
1773 | dentry->d_inode->i_nlink--; | 1785 | drop_nlink(dentry->d_inode); |
1774 | dir->i_nlink--; | 1786 | drop_nlink(dir); |
1775 | return shmem_unlink(dir, dentry); | 1787 | return shmem_unlink(dir, dentry); |
1776 | } | 1788 | } |
1777 | 1789 | ||
@@ -1792,10 +1804,10 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct | |||
1792 | if (new_dentry->d_inode) { | 1804 | if (new_dentry->d_inode) { |
1793 | (void) shmem_unlink(new_dir, new_dentry); | 1805 | (void) shmem_unlink(new_dir, new_dentry); |
1794 | if (they_are_dirs) | 1806 | if (they_are_dirs) |
1795 | old_dir->i_nlink--; | 1807 | drop_nlink(old_dir); |
1796 | } else if (they_are_dirs) { | 1808 | } else if (they_are_dirs) { |
1797 | old_dir->i_nlink--; | 1809 | drop_nlink(old_dir); |
1798 | new_dir->i_nlink++; | 1810 | inc_nlink(new_dir); |
1799 | } | 1811 | } |
1800 | 1812 | ||
1801 | old_dir->i_size -= BOGO_DIRENT_SIZE; | 1813 | old_dir->i_size -= BOGO_DIRENT_SIZE; |
@@ -1897,6 +1909,53 @@ static struct inode_operations shmem_symlink_inode_operations = { | |||
1897 | .put_link = shmem_put_link, | 1909 | .put_link = shmem_put_link, |
1898 | }; | 1910 | }; |
1899 | 1911 | ||
1912 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
1913 | /** | ||
1914 | * Superblocks without xattr inode operations will get security.* xattr | ||
1915 | * support from the VFS "for free". As soon as we have any other xattrs | ||
1916 | * like ACLs, we also need to implement the security.* handlers at | ||
1917 | * filesystem level, though. | ||
1918 | */ | ||
1919 | |||
1920 | static size_t shmem_xattr_security_list(struct inode *inode, char *list, | ||
1921 | size_t list_len, const char *name, | ||
1922 | size_t name_len) | ||
1923 | { | ||
1924 | return security_inode_listsecurity(inode, list, list_len); | ||
1925 | } | ||
1926 | |||
1927 | static int shmem_xattr_security_get(struct inode *inode, const char *name, | ||
1928 | void *buffer, size_t size) | ||
1929 | { | ||
1930 | if (strcmp(name, "") == 0) | ||
1931 | return -EINVAL; | ||
1932 | return security_inode_getsecurity(inode, name, buffer, size, | ||
1933 | -EOPNOTSUPP); | ||
1934 | } | ||
1935 | |||
1936 | static int shmem_xattr_security_set(struct inode *inode, const char *name, | ||
1937 | const void *value, size_t size, int flags) | ||
1938 | { | ||
1939 | if (strcmp(name, "") == 0) | ||
1940 | return -EINVAL; | ||
1941 | return security_inode_setsecurity(inode, name, value, size, flags); | ||
1942 | } | ||
1943 | |||
1944 | struct xattr_handler shmem_xattr_security_handler = { | ||
1945 | .prefix = XATTR_SECURITY_PREFIX, | ||
1946 | .list = shmem_xattr_security_list, | ||
1947 | .get = shmem_xattr_security_get, | ||
1948 | .set = shmem_xattr_security_set, | ||
1949 | }; | ||
1950 | |||
1951 | static struct xattr_handler *shmem_xattr_handlers[] = { | ||
1952 | &shmem_xattr_acl_access_handler, | ||
1953 | &shmem_xattr_acl_default_handler, | ||
1954 | &shmem_xattr_security_handler, | ||
1955 | NULL | ||
1956 | }; | ||
1957 | #endif | ||
1958 | |||
1900 | static int shmem_parse_options(char *options, int *mode, uid_t *uid, | 1959 | static int shmem_parse_options(char *options, int *mode, uid_t *uid, |
1901 | gid_t *gid, unsigned long *blocks, unsigned long *inodes, | 1960 | gid_t *gid, unsigned long *blocks, unsigned long *inodes, |
1902 | int *policy, nodemask_t *policy_nodes) | 1961 | int *policy, nodemask_t *policy_nodes) |
@@ -2094,6 +2153,10 @@ static int shmem_fill_super(struct super_block *sb, | |||
2094 | sb->s_magic = TMPFS_MAGIC; | 2153 | sb->s_magic = TMPFS_MAGIC; |
2095 | sb->s_op = &shmem_ops; | 2154 | sb->s_op = &shmem_ops; |
2096 | sb->s_time_gran = 1; | 2155 | sb->s_time_gran = 1; |
2156 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
2157 | sb->s_xattr = shmem_xattr_handlers; | ||
2158 | sb->s_flags |= MS_POSIXACL; | ||
2159 | #endif | ||
2097 | 2160 | ||
2098 | inode = shmem_get_inode(sb, S_IFDIR | mode, 0); | 2161 | inode = shmem_get_inode(sb, S_IFDIR | mode, 0); |
2099 | if (!inode) | 2162 | if (!inode) |
@@ -2130,6 +2193,7 @@ static void shmem_destroy_inode(struct inode *inode) | |||
2130 | /* only struct inode is valid if it's an inline symlink */ | 2193 | /* only struct inode is valid if it's an inline symlink */ |
2131 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); | 2194 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); |
2132 | } | 2195 | } |
2196 | shmem_acl_destroy_inode(inode); | ||
2133 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2197 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); |
2134 | } | 2198 | } |
2135 | 2199 | ||
@@ -2141,6 +2205,10 @@ static void init_once(void *foo, struct kmem_cache *cachep, | |||
2141 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | 2205 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == |
2142 | SLAB_CTOR_CONSTRUCTOR) { | 2206 | SLAB_CTOR_CONSTRUCTOR) { |
2143 | inode_init_once(&p->vfs_inode); | 2207 | inode_init_once(&p->vfs_inode); |
2208 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
2209 | p->i_acl = NULL; | ||
2210 | p->i_default_acl = NULL; | ||
2211 | #endif | ||
2144 | } | 2212 | } |
2145 | } | 2213 | } |
2146 | 2214 | ||
@@ -2156,8 +2224,7 @@ static int init_inodecache(void) | |||
2156 | 2224 | ||
2157 | static void destroy_inodecache(void) | 2225 | static void destroy_inodecache(void) |
2158 | { | 2226 | { |
2159 | if (kmem_cache_destroy(shmem_inode_cachep)) | 2227 | kmem_cache_destroy(shmem_inode_cachep); |
2160 | printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n"); | ||
2161 | } | 2228 | } |
2162 | 2229 | ||
2163 | static const struct address_space_operations shmem_aops = { | 2230 | static const struct address_space_operations shmem_aops = { |
@@ -2185,6 +2252,14 @@ static struct inode_operations shmem_inode_operations = { | |||
2185 | .truncate = shmem_truncate, | 2252 | .truncate = shmem_truncate, |
2186 | .setattr = shmem_notify_change, | 2253 | .setattr = shmem_notify_change, |
2187 | .truncate_range = shmem_truncate_range, | 2254 | .truncate_range = shmem_truncate_range, |
2255 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
2256 | .setxattr = generic_setxattr, | ||
2257 | .getxattr = generic_getxattr, | ||
2258 | .listxattr = generic_listxattr, | ||
2259 | .removexattr = generic_removexattr, | ||
2260 | .permission = shmem_permission, | ||
2261 | #endif | ||
2262 | |||
2188 | }; | 2263 | }; |
2189 | 2264 | ||
2190 | static struct inode_operations shmem_dir_inode_operations = { | 2265 | static struct inode_operations shmem_dir_inode_operations = { |
@@ -2199,6 +2274,25 @@ static struct inode_operations shmem_dir_inode_operations = { | |||
2199 | .mknod = shmem_mknod, | 2274 | .mknod = shmem_mknod, |
2200 | .rename = shmem_rename, | 2275 | .rename = shmem_rename, |
2201 | #endif | 2276 | #endif |
2277 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
2278 | .setattr = shmem_notify_change, | ||
2279 | .setxattr = generic_setxattr, | ||
2280 | .getxattr = generic_getxattr, | ||
2281 | .listxattr = generic_listxattr, | ||
2282 | .removexattr = generic_removexattr, | ||
2283 | .permission = shmem_permission, | ||
2284 | #endif | ||
2285 | }; | ||
2286 | |||
2287 | static struct inode_operations shmem_special_inode_operations = { | ||
2288 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
2289 | .setattr = shmem_notify_change, | ||
2290 | .setxattr = generic_setxattr, | ||
2291 | .getxattr = generic_getxattr, | ||
2292 | .listxattr = generic_listxattr, | ||
2293 | .removexattr = generic_removexattr, | ||
2294 | .permission = shmem_permission, | ||
2295 | #endif | ||
2202 | }; | 2296 | }; |
2203 | 2297 | ||
2204 | static struct super_operations shmem_ops = { | 2298 | static struct super_operations shmem_ops = { |
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c new file mode 100644 index 0000000000..c946bf4687 --- /dev/null +++ b/mm/shmem_acl.c | |||
@@ -0,0 +1,197 @@ | |||
1 | /* | ||
2 | * mm/shmem_acl.c | ||
3 | * | ||
4 | * (C) 2005 Andreas Gruenbacher <agruen@suse.de> | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #include <linux/fs.h> | ||
10 | #include <linux/shmem_fs.h> | ||
11 | #include <linux/xattr.h> | ||
12 | #include <linux/generic_acl.h> | ||
13 | |||
14 | /** | ||
15 | * shmem_get_acl - generic_acl_operations->getacl() operation | ||
16 | */ | ||
17 | static struct posix_acl * | ||
18 | shmem_get_acl(struct inode *inode, int type) | ||
19 | { | ||
20 | struct posix_acl *acl = NULL; | ||
21 | |||
22 | spin_lock(&inode->i_lock); | ||
23 | switch(type) { | ||
24 | case ACL_TYPE_ACCESS: | ||
25 | acl = posix_acl_dup(SHMEM_I(inode)->i_acl); | ||
26 | break; | ||
27 | |||
28 | case ACL_TYPE_DEFAULT: | ||
29 | acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); | ||
30 | break; | ||
31 | } | ||
32 | spin_unlock(&inode->i_lock); | ||
33 | |||
34 | return acl; | ||
35 | } | ||
36 | |||
37 | /** | ||
38 | * shmem_set_acl - generic_acl_operations->setacl() operation | ||
39 | */ | ||
40 | static void | ||
41 | shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) | ||
42 | { | ||
43 | struct posix_acl *free = NULL; | ||
44 | |||
45 | spin_lock(&inode->i_lock); | ||
46 | switch(type) { | ||
47 | case ACL_TYPE_ACCESS: | ||
48 | free = SHMEM_I(inode)->i_acl; | ||
49 | SHMEM_I(inode)->i_acl = posix_acl_dup(acl); | ||
50 | break; | ||
51 | |||
52 | case ACL_TYPE_DEFAULT: | ||
53 | free = SHMEM_I(inode)->i_default_acl; | ||
54 | SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); | ||
55 | break; | ||
56 | } | ||
57 | spin_unlock(&inode->i_lock); | ||
58 | posix_acl_release(free); | ||
59 | } | ||
60 | |||
61 | struct generic_acl_operations shmem_acl_ops = { | ||
62 | .getacl = shmem_get_acl, | ||
63 | .setacl = shmem_set_acl, | ||
64 | }; | ||
65 | |||
66 | /** | ||
67 | * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, | ||
68 | * shmem_xattr_acl_access_handler - plumbing code to implement the | ||
69 | * system.posix_acl_access xattr using the generic acl functions. | ||
70 | */ | ||
71 | |||
72 | static size_t | ||
73 | shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, | ||
74 | const char *name, size_t name_len) | ||
75 | { | ||
76 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, | ||
77 | list, list_size); | ||
78 | } | ||
79 | |||
80 | static int | ||
81 | shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, | ||
82 | size_t size) | ||
83 | { | ||
84 | if (strcmp(name, "") != 0) | ||
85 | return -EINVAL; | ||
86 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, | ||
87 | size); | ||
88 | } | ||
89 | |||
90 | static int | ||
91 | shmem_set_acl_access(struct inode *inode, const char *name, const void *value, | ||
92 | size_t size, int flags) | ||
93 | { | ||
94 | if (strcmp(name, "") != 0) | ||
95 | return -EINVAL; | ||
96 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, | ||
97 | size); | ||
98 | } | ||
99 | |||
100 | struct xattr_handler shmem_xattr_acl_access_handler = { | ||
101 | .prefix = POSIX_ACL_XATTR_ACCESS, | ||
102 | .list = shmem_list_acl_access, | ||
103 | .get = shmem_get_acl_access, | ||
104 | .set = shmem_set_acl_access, | ||
105 | }; | ||
106 | |||
107 | /** | ||
108 | * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, | ||
109 | * shmem_xattr_acl_default_handler - plumbing code to implement the | ||
110 | * system.posix_acl_default xattr using the generic acl functions. | ||
111 | */ | ||
112 | |||
113 | static size_t | ||
114 | shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, | ||
115 | const char *name, size_t name_len) | ||
116 | { | ||
117 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, | ||
118 | list, list_size); | ||
119 | } | ||
120 | |||
121 | static int | ||
122 | shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, | ||
123 | size_t size) | ||
124 | { | ||
125 | if (strcmp(name, "") != 0) | ||
126 | return -EINVAL; | ||
127 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, | ||
128 | size); | ||
129 | } | ||
130 | |||
131 | static int | ||
132 | shmem_set_acl_default(struct inode *inode, const char *name, const void *value, | ||
133 | size_t size, int flags) | ||
134 | { | ||
135 | if (strcmp(name, "") != 0) | ||
136 | return -EINVAL; | ||
137 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, | ||
138 | size); | ||
139 | } | ||
140 | |||
141 | struct xattr_handler shmem_xattr_acl_default_handler = { | ||
142 | .prefix = POSIX_ACL_XATTR_DEFAULT, | ||
143 | .list = shmem_list_acl_default, | ||
144 | .get = shmem_get_acl_default, | ||
145 | .set = shmem_set_acl_default, | ||
146 | }; | ||
147 | |||
148 | /** | ||
149 | * shmem_acl_init - Initialize the acl(s) of a new inode | ||
150 | */ | ||
151 | int | ||
152 | shmem_acl_init(struct inode *inode, struct inode *dir) | ||
153 | { | ||
154 | return generic_acl_init(inode, dir, &shmem_acl_ops); | ||
155 | } | ||
156 | |||
157 | /** | ||
158 | * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode | ||
159 | * | ||
160 | * This is done before destroying the actual inode. | ||
161 | */ | ||
162 | |||
163 | void | ||
164 | shmem_acl_destroy_inode(struct inode *inode) | ||
165 | { | ||
166 | if (SHMEM_I(inode)->i_acl) | ||
167 | posix_acl_release(SHMEM_I(inode)->i_acl); | ||
168 | SHMEM_I(inode)->i_acl = NULL; | ||
169 | if (SHMEM_I(inode)->i_default_acl) | ||
170 | posix_acl_release(SHMEM_I(inode)->i_default_acl); | ||
171 | SHMEM_I(inode)->i_default_acl = NULL; | ||
172 | } | ||
173 | |||
174 | /** | ||
175 | * shmem_check_acl - check_acl() callback for generic_permission() | ||
176 | */ | ||
177 | static int | ||
178 | shmem_check_acl(struct inode *inode, int mask) | ||
179 | { | ||
180 | struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); | ||
181 | |||
182 | if (acl) { | ||
183 | int error = posix_acl_permission(inode, acl, mask); | ||
184 | posix_acl_release(acl); | ||
185 | return error; | ||
186 | } | ||
187 | return -EAGAIN; | ||
188 | } | ||
189 | |||
190 | /** | ||
191 | * shmem_permission - permission() inode operation | ||
192 | */ | ||
193 | int | ||
194 | shmem_permission(struct inode *inode, int mask, struct nameidata *nd) | ||
195 | { | ||
196 | return generic_permission(inode, mask, shmem_check_acl); | ||
197 | } | ||
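With these handlers installed in sb->s_xattr and MS_POSIXACL set on the superblock, tmpfs ACLs are reachable through the ordinary xattr system calls. A hedged user-space illustration (the file path is an assumption):

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[256];
	ssize_t len;

	/* Raw ACL blob as stored by shmem_set_acl_access() and friends. */
	len = getxattr("/dev/shm/testfile", "system.posix_acl_access",
		       buf, sizeof(buf));
	if (len < 0)
		perror("getxattr");
	else
		printf("access ACL xattr: %zd bytes\n", len);
	return 0;
}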
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -313,7 +313,7 @@ static int drain_freelist(struct kmem_cache *cache, | |||
313 | struct kmem_list3 *l3, int tofree); | 313 | struct kmem_list3 *l3, int tofree); |
314 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | 314 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, |
315 | int node); | 315 | int node); |
316 | static void enable_cpucache(struct kmem_cache *cachep); | 316 | static int enable_cpucache(struct kmem_cache *cachep); |
317 | static void cache_reap(void *unused); | 317 | static void cache_reap(void *unused); |
318 | 318 | ||
319 | /* | 319 | /* |
@@ -674,6 +674,8 @@ static struct kmem_cache cache_cache = { | |||
674 | #endif | 674 | #endif |
675 | }; | 675 | }; |
676 | 676 | ||
677 | #define BAD_ALIEN_MAGIC 0x01020304ul | ||
678 | |||
677 | #ifdef CONFIG_LOCKDEP | 679 | #ifdef CONFIG_LOCKDEP |
678 | 680 | ||
679 | /* | 681 | /* |
@@ -682,42 +684,58 @@ static struct kmem_cache cache_cache = { | |||
682 | * The locking for this is tricky in that it nests within the locks | 684 | * The locking for this is tricky in that it nests within the locks |
683 | * of all other slabs in a few places; to deal with this special | 685 | * of all other slabs in a few places; to deal with this special |
684 | * locking we put on-slab caches into a separate lock-class. | 686 | * locking we put on-slab caches into a separate lock-class. |
687 | * | ||
688 | * We set lock class for alien array caches which are up during init. | ||
689 | * The lock annotation will be lost if all cpus of a node goes down and | ||
690 | * then comes back up during hotplug | ||
685 | */ | 691 | */ |
686 | static struct lock_class_key on_slab_key; | 692 | static struct lock_class_key on_slab_l3_key; |
693 | static struct lock_class_key on_slab_alc_key; | ||
694 | |||
695 | static inline void init_lock_keys(void) | ||
687 | 696 | ||
688 | static inline void init_lock_keys(struct cache_sizes *s) | ||
689 | { | 697 | { |
690 | int q; | 698 | int q; |
691 | 699 | struct cache_sizes *s = malloc_sizes; | |
692 | for (q = 0; q < MAX_NUMNODES; q++) { | 700 | |
693 | if (!s->cs_cachep->nodelists[q] || OFF_SLAB(s->cs_cachep)) | 701 | while (s->cs_size != ULONG_MAX) { |
694 | continue; | 702 | for_each_node(q) { |
695 | lockdep_set_class(&s->cs_cachep->nodelists[q]->list_lock, | 703 | struct array_cache **alc; |
696 | &on_slab_key); | 704 | int r; |
705 | struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; | ||
706 | if (!l3 || OFF_SLAB(s->cs_cachep)) | ||
707 | continue; | ||
708 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); | ||
709 | alc = l3->alien; | ||
710 | /* | ||
711 | * FIXME: This check for BAD_ALIEN_MAGIC | ||
712 | * should go away when common slab code is taught to | ||
713 | * work even without alien caches. | ||
714 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | ||
715 | * for alloc_alien_cache. | ||
716 | */ | ||
717 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | ||
718 | continue; | ||
719 | for_each_node(r) { | ||
720 | if (alc[r]) | ||
721 | lockdep_set_class(&alc[r]->lock, | ||
722 | &on_slab_alc_key); | ||
723 | } | ||
724 | } | ||
725 | s++; | ||
697 | } | 726 | } |
698 | } | 727 | } |
699 | |||
700 | #else | 728 | #else |
701 | static inline void init_lock_keys(struct cache_sizes *s) | 729 | static inline void init_lock_keys(void) |
702 | { | 730 | { |
703 | } | 731 | } |
704 | #endif | 732 | #endif |
705 | 733 | ||
706 | |||
707 | |||
708 | /* Guard access to the cache-chain. */ | 734 | /* Guard access to the cache-chain. */ |
709 | static DEFINE_MUTEX(cache_chain_mutex); | 735 | static DEFINE_MUTEX(cache_chain_mutex); |
710 | static struct list_head cache_chain; | 736 | static struct list_head cache_chain; |
711 | 737 | ||
712 | /* | 738 | /* |
713 | * vm_enough_memory() looks at this to determine how many slab-allocated pages | ||
714 | * are possibly freeable under pressure | ||
715 | * | ||
716 | * SLAB_RECLAIM_ACCOUNT turns this on per-slab | ||
717 | */ | ||
718 | atomic_t slab_reclaim_pages; | ||
719 | |||
720 | /* | ||
721 | * chicken and egg problem: delay the per-cpu array allocation | 739 | * chicken and egg problem: delay the per-cpu array allocation |
722 | * until the general caches are up. | 740 | * until the general caches are up. |
723 | */ | 741 | */ |
@@ -768,11 +786,10 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, | |||
768 | return csizep->cs_cachep; | 786 | return csizep->cs_cachep; |
769 | } | 787 | } |
770 | 788 | ||
771 | struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) | 789 | static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) |
772 | { | 790 | { |
773 | return __find_general_cachep(size, gfpflags); | 791 | return __find_general_cachep(size, gfpflags); |
774 | } | 792 | } |
775 | EXPORT_SYMBOL(kmem_find_general_cachep); | ||
776 | 793 | ||
777 | static size_t slab_mgmt_size(size_t nr_objs, size_t align) | 794 | static size_t slab_mgmt_size(size_t nr_objs, size_t align) |
778 | { | 795 | { |
@@ -955,7 +972,39 @@ static int transfer_objects(struct array_cache *to, | |||
955 | return nr; | 972 | return nr; |
956 | } | 973 | } |
957 | 974 | ||
958 | #ifdef CONFIG_NUMA | 975 | #ifndef CONFIG_NUMA |
976 | |||
977 | #define drain_alien_cache(cachep, alien) do { } while (0) | ||
978 | #define reap_alien(cachep, l3) do { } while (0) | ||
979 | |||
980 | static inline struct array_cache **alloc_alien_cache(int node, int limit) | ||
981 | { | ||
982 | return (struct array_cache **)BAD_ALIEN_MAGIC; | ||
983 | } | ||
984 | |||
985 | static inline void free_alien_cache(struct array_cache **ac_ptr) | ||
986 | { | ||
987 | } | ||
988 | |||
989 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
990 | { | ||
991 | return 0; | ||
992 | } | ||
993 | |||
994 | static inline void *alternate_node_alloc(struct kmem_cache *cachep, | ||
995 | gfp_t flags) | ||
996 | { | ||
997 | return NULL; | ||
998 | } | ||
999 | |||
1000 | static inline void *__cache_alloc_node(struct kmem_cache *cachep, | ||
1001 | gfp_t flags, int nodeid) | ||
1002 | { | ||
1003 | return NULL; | ||
1004 | } | ||
1005 | |||
1006 | #else /* CONFIG_NUMA */ | ||
1007 | |||
959 | static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); | 1008 | static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); |
960 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | 1009 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); |
961 | 1010 | ||
@@ -1084,26 +1133,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1084 | } | 1133 | } |
1085 | return 1; | 1134 | return 1; |
1086 | } | 1135 | } |
1087 | |||
1088 | #else | ||
1089 | |||
1090 | #define drain_alien_cache(cachep, alien) do { } while (0) | ||
1091 | #define reap_alien(cachep, l3) do { } while (0) | ||
1092 | |||
1093 | static inline struct array_cache **alloc_alien_cache(int node, int limit) | ||
1094 | { | ||
1095 | return (struct array_cache **) 0x01020304ul; | ||
1096 | } | ||
1097 | |||
1098 | static inline void free_alien_cache(struct array_cache **ac_ptr) | ||
1099 | { | ||
1100 | } | ||
1101 | |||
1102 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
1103 | { | ||
1104 | return 0; | ||
1105 | } | ||
1106 | |||
1107 | #endif | 1136 | #endif |
1108 | 1137 | ||
1109 | static int __cpuinit cpuup_callback(struct notifier_block *nfb, | 1138 | static int __cpuinit cpuup_callback(struct notifier_block *nfb, |
@@ -1422,7 +1451,6 @@ void __init kmem_cache_init(void) | |||
1422 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1451 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
1423 | NULL, NULL); | 1452 | NULL, NULL); |
1424 | } | 1453 | } |
1425 | init_lock_keys(sizes); | ||
1426 | 1454 | ||
1427 | sizes->cs_dmacachep = kmem_cache_create(names->name_dma, | 1455 | sizes->cs_dmacachep = kmem_cache_create(names->name_dma, |
1428 | sizes->cs_size, | 1456 | sizes->cs_size, |
@@ -1491,10 +1519,15 @@ void __init kmem_cache_init(void) | |||
1491 | struct kmem_cache *cachep; | 1519 | struct kmem_cache *cachep; |
1492 | mutex_lock(&cache_chain_mutex); | 1520 | mutex_lock(&cache_chain_mutex); |
1493 | list_for_each_entry(cachep, &cache_chain, next) | 1521 | list_for_each_entry(cachep, &cache_chain, next) |
1494 | enable_cpucache(cachep); | 1522 | if (enable_cpucache(cachep)) |
1523 | BUG(); | ||
1495 | mutex_unlock(&cache_chain_mutex); | 1524 | mutex_unlock(&cache_chain_mutex); |
1496 | } | 1525 | } |
1497 | 1526 | ||
1527 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1528 | init_lock_keys(); | ||
1529 | |||
1530 | |||
1498 | /* Done! */ | 1531 | /* Done! */ |
1499 | g_cpucache_up = FULL; | 1532 | g_cpucache_up = FULL; |
1500 | 1533 | ||
@@ -1543,7 +1576,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1543 | */ | 1576 | */ |
1544 | flags |= __GFP_COMP; | 1577 | flags |= __GFP_COMP; |
1545 | #endif | 1578 | #endif |
1546 | flags |= cachep->gfpflags; | 1579 | |
1580 | /* | ||
1581 | * Under NUMA we want memory on the indicated node. We will handle | ||
1582 | * the needed fallback ourselves since we want to serve from our | ||
1583 | * per node object lists first for other nodes. | ||
1584 | */ | ||
1585 | flags |= cachep->gfpflags | GFP_THISNODE; | ||
1547 | 1586 | ||
1548 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1587 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); |
1549 | if (!page) | 1588 | if (!page) |
@@ -1551,8 +1590,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1551 | 1590 | ||
1552 | nr_pages = (1 << cachep->gfporder); | 1591 | nr_pages = (1 << cachep->gfporder); |
1553 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1592 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1554 | atomic_add(nr_pages, &slab_reclaim_pages); | 1593 | add_zone_page_state(page_zone(page), |
1555 | add_zone_page_state(page_zone(page), NR_SLAB, nr_pages); | 1594 | NR_SLAB_RECLAIMABLE, nr_pages); |
1595 | else | ||
1596 | add_zone_page_state(page_zone(page), | ||
1597 | NR_SLAB_UNRECLAIMABLE, nr_pages); | ||
1556 | for (i = 0; i < nr_pages; i++) | 1598 | for (i = 0; i < nr_pages; i++) |
1557 | __SetPageSlab(page + i); | 1599 | __SetPageSlab(page + i); |
1558 | return page_address(page); | 1600 | return page_address(page); |
@@ -1567,7 +1609,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1567 | struct page *page = virt_to_page(addr); | 1609 | struct page *page = virt_to_page(addr); |
1568 | const unsigned long nr_freed = i; | 1610 | const unsigned long nr_freed = i; |
1569 | 1611 | ||
1570 | sub_zone_page_state(page_zone(page), NR_SLAB, nr_freed); | 1612 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1613 | sub_zone_page_state(page_zone(page), | ||
1614 | NR_SLAB_RECLAIMABLE, nr_freed); | ||
1615 | else | ||
1616 | sub_zone_page_state(page_zone(page), | ||
1617 | NR_SLAB_UNRECLAIMABLE, nr_freed); | ||
1571 | while (i--) { | 1618 | while (i--) { |
1572 | BUG_ON(!PageSlab(page)); | 1619 | BUG_ON(!PageSlab(page)); |
1573 | __ClearPageSlab(page); | 1620 | __ClearPageSlab(page); |
@@ -1576,8 +1623,6 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1576 | if (current->reclaim_state) | 1623 | if (current->reclaim_state) |
1577 | current->reclaim_state->reclaimed_slab += nr_freed; | 1624 | current->reclaim_state->reclaimed_slab += nr_freed; |
1578 | free_pages((unsigned long)addr, cachep->gfporder); | 1625 | free_pages((unsigned long)addr, cachep->gfporder); |
1579 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | ||
1580 | atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages); | ||
1581 | } | 1626 | } |
1582 | 1627 | ||
1583 | static void kmem_rcu_free(struct rcu_head *head) | 1628 | static void kmem_rcu_free(struct rcu_head *head) |
@@ -1638,10 +1683,32 @@ static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) | |||
1638 | static void dump_line(char *data, int offset, int limit) | 1683 | static void dump_line(char *data, int offset, int limit) |
1639 | { | 1684 | { |
1640 | int i; | 1685 | int i; |
1686 | unsigned char error = 0; | ||
1687 | int bad_count = 0; | ||
1688 | |||
1641 | printk(KERN_ERR "%03x:", offset); | 1689 | printk(KERN_ERR "%03x:", offset); |
1642 | for (i = 0; i < limit; i++) | 1690 | for (i = 0; i < limit; i++) { |
1691 | if (data[offset + i] != POISON_FREE) { | ||
1692 | error = data[offset + i]; | ||
1693 | bad_count++; | ||
1694 | } | ||
1643 | printk(" %02x", (unsigned char)data[offset + i]); | 1695 | printk(" %02x", (unsigned char)data[offset + i]); |
1696 | } | ||
1644 | printk("\n"); | 1697 | printk("\n"); |
1698 | |||
1699 | if (bad_count == 1) { | ||
1700 | error ^= POISON_FREE; | ||
1701 | if (!(error & (error - 1))) { | ||
1702 | printk(KERN_ERR "Single bit error detected. Probably " | ||
1703 | "bad RAM.\n"); | ||
1704 | #ifdef CONFIG_X86 | ||
1705 | printk(KERN_ERR "Run memtest86+ or a similar memory " | ||
1706 | "test tool.\n"); | ||
1707 | #else | ||
1708 | printk(KERN_ERR "Run a memory test tool.\n"); | ||
1709 | #endif | ||
1710 | } | ||
1711 | } | ||
1645 | } | 1712 | } |
1646 | #endif | 1713 | #endif |
1647 | 1714 | ||
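The diagnostic added to dump_line() leans on a standard bit trick: XOR the offending byte with POISON_FREE and test whether exactly one bit remains set. A hedged standalone illustration (POISON_FREE is 0x6b in the kernel's poison definitions):

/*
 * diff & (diff - 1) clears the lowest set bit, so for non-zero diff the
 * expression is zero exactly when a single bit differs from the poison.
 */
static inline int single_bit_flip(unsigned char poison, unsigned char seen)
{
	unsigned char diff = poison ^ seen;

	return diff && !(diff & (diff - 1));
}
/* e.g. poison 0x6b, seen 0x6f -> diff 0x04: one flipped bit, likely bad RAM */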
@@ -1834,6 +1901,27 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) | |||
1834 | } | 1901 | } |
1835 | } | 1902 | } |
1836 | 1903 | ||
1904 | static void __kmem_cache_destroy(struct kmem_cache *cachep) | ||
1905 | { | ||
1906 | int i; | ||
1907 | struct kmem_list3 *l3; | ||
1908 | |||
1909 | for_each_online_cpu(i) | ||
1910 | kfree(cachep->array[i]); | ||
1911 | |||
1912 | /* NUMA: free the list3 structures */ | ||
1913 | for_each_online_node(i) { | ||
1914 | l3 = cachep->nodelists[i]; | ||
1915 | if (l3) { | ||
1916 | kfree(l3->shared); | ||
1917 | free_alien_cache(l3->alien); | ||
1918 | kfree(l3); | ||
1919 | } | ||
1920 | } | ||
1921 | kmem_cache_free(&cache_cache, cachep); | ||
1922 | } | ||
1923 | |||
1924 | |||
1837 | /** | 1925 | /** |
1838 | * calculate_slab_order - calculate size (page order) of slabs | 1926 | * calculate_slab_order - calculate size (page order) of slabs |
1839 | * @cachep: pointer to the cache that is being created | 1927 | * @cachep: pointer to the cache that is being created |
@@ -1904,12 +1992,11 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
1904 | return left_over; | 1992 | return left_over; |
1905 | } | 1993 | } |
1906 | 1994 | ||
1907 | static void setup_cpu_cache(struct kmem_cache *cachep) | 1995 | static int setup_cpu_cache(struct kmem_cache *cachep) |
1908 | { | 1996 | { |
1909 | if (g_cpucache_up == FULL) { | 1997 | if (g_cpucache_up == FULL) |
1910 | enable_cpucache(cachep); | 1998 | return enable_cpucache(cachep); |
1911 | return; | 1999 | |
1912 | } | ||
1913 | if (g_cpucache_up == NONE) { | 2000 | if (g_cpucache_up == NONE) { |
1914 | /* | 2001 | /* |
1915 | * Note: the first kmem_cache_create must create the cache | 2002 | * Note: the first kmem_cache_create must create the cache |
@@ -1956,6 +2043,7 @@ static void setup_cpu_cache(struct kmem_cache *cachep) | |||
1956 | cpu_cache_get(cachep)->touched = 0; | 2043 | cpu_cache_get(cachep)->touched = 0; |
1957 | cachep->batchcount = 1; | 2044 | cachep->batchcount = 1; |
1958 | cachep->limit = BOOT_CPUCACHE_ENTRIES; | 2045 | cachep->limit = BOOT_CPUCACHE_ENTRIES; |
2046 | return 0; | ||
1959 | } | 2047 | } |
1960 | 2048 | ||
1961 | /** | 2049 | /** |
@@ -2097,6 +2185,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2097 | } else { | 2185 | } else { |
2098 | ralign = BYTES_PER_WORD; | 2186 | ralign = BYTES_PER_WORD; |
2099 | } | 2187 | } |
2188 | |||
2189 | /* | ||
2190 | * Redzoning and user store require word alignment. Note this will be | ||
2191 | * overridden by architecture or caller mandated alignment if either | ||
2192 | * is greater than BYTES_PER_WORD. | ||
2193 | */ | ||
2194 | if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) | ||
2195 | ralign = BYTES_PER_WORD; | ||
2196 | |||
2100 | /* 2) arch mandated alignment: disables debug if necessary */ | 2197 | /* 2) arch mandated alignment: disables debug if necessary */ |
2101 | if (ralign < ARCH_SLAB_MINALIGN) { | 2198 | if (ralign < ARCH_SLAB_MINALIGN) { |
2102 | ralign = ARCH_SLAB_MINALIGN; | 2199 | ralign = ARCH_SLAB_MINALIGN; |
@@ -2110,8 +2207,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2110 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | 2207 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
2111 | } | 2208 | } |
2112 | /* | 2209 | /* |
2113 | * 4) Store it. Note that the debug code below can reduce | 2210 | * 4) Store it. |
2114 | * the alignment to BYTES_PER_WORD. | ||
2115 | */ | 2211 | */ |
2116 | align = ralign; | 2212 | align = ralign; |
2117 | 2213 | ||
@@ -2123,20 +2219,19 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2123 | #if DEBUG | 2219 | #if DEBUG |
2124 | cachep->obj_size = size; | 2220 | cachep->obj_size = size; |
2125 | 2221 | ||
2222 | /* | ||
2223 | * Both debugging options require word-alignment which is calculated | ||
2224 | * into align above. | ||
2225 | */ | ||
2126 | if (flags & SLAB_RED_ZONE) { | 2226 | if (flags & SLAB_RED_ZONE) { |
2127 | /* redzoning only works with word aligned caches */ | ||
2128 | align = BYTES_PER_WORD; | ||
2129 | |||
2130 | /* add space for red zone words */ | 2227 | /* add space for red zone words */ |
2131 | cachep->obj_offset += BYTES_PER_WORD; | 2228 | cachep->obj_offset += BYTES_PER_WORD; |
2132 | size += 2 * BYTES_PER_WORD; | 2229 | size += 2 * BYTES_PER_WORD; |
2133 | } | 2230 | } |
2134 | if (flags & SLAB_STORE_USER) { | 2231 | if (flags & SLAB_STORE_USER) { |
2135 | /* user store requires word alignment and | 2232 | /* user store requires one word storage behind the end of |
2136 | * one word storage behind the end of the real | 2233 | * the real object. |
2137 | * object. | ||
2138 | */ | 2234 | */ |
2139 | align = BYTES_PER_WORD; | ||
2140 | size += BYTES_PER_WORD; | 2235 | size += BYTES_PER_WORD; |
2141 | } | 2236 | } |
2142 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | 2237 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) |
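Taken together, the alignment hunks above make red zoning and user-store debugging raise the minimum alignment up front, so an architecture- or caller-mandated alignment greater than BYTES_PER_WORD is no longer silently discarded later in the DEBUG block. A simplified sketch of the resulting precedence (slab_align is an illustrative helper, not code from this patch):

	/* Sketch only: order in which alignment constraints are applied. */
	static size_t slab_align(size_t arch_min, size_t caller_align, unsigned long flags)
	{
		size_t ralign = BYTES_PER_WORD;			/* 1) baseline */

		if (flags & (SLAB_RED_ZONE | SLAB_STORE_USER))
			ralign = BYTES_PER_WORD;		/* debug needs word alignment */
		if (ralign < arch_min)
			ralign = arch_min;			/* 2) arch minimum may raise it */
		if (ralign < caller_align)
			ralign = caller_align;			/* 3) caller request may raise it */
		return ralign;					/* 4) stored; DEBUG no longer lowers it */
	}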
@@ -2200,14 +2295,26 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2200 | cachep->gfpflags |= GFP_DMA; | 2295 | cachep->gfpflags |= GFP_DMA; |
2201 | cachep->buffer_size = size; | 2296 | cachep->buffer_size = size; |
2202 | 2297 | ||
2203 | if (flags & CFLGS_OFF_SLAB) | 2298 | if (flags & CFLGS_OFF_SLAB) { |
2204 | cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); | 2299 | cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); |
2300 | /* | ||
2301 | * This is a possibility for one of the malloc_sizes caches. | ||
2302 | * But since we go off slab only for object size greater than | ||
2303 | * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, | ||
2304 | * this should not happen at all. | ||
2305 | * But leave a BUG_ON to catch it just in case. | ||
2306 | */ | ||
2307 | BUG_ON(!cachep->slabp_cache); | ||
2308 | } | ||
2205 | cachep->ctor = ctor; | 2309 | cachep->ctor = ctor; |
2206 | cachep->dtor = dtor; | 2310 | cachep->dtor = dtor; |
2207 | cachep->name = name; | 2311 | cachep->name = name; |
2208 | 2312 | ||
2209 | 2313 | if (setup_cpu_cache(cachep)) { | |
2210 | setup_cpu_cache(cachep); | 2314 | __kmem_cache_destroy(cachep); |
2315 | cachep = NULL; | ||
2316 | goto oops; | ||
2317 | } | ||
2211 | 2318 | ||
2212 | /* cache setup completed, link it into the list */ | 2319 | /* cache setup completed, link it into the list */ |
2213 | list_add(&cachep->next, &cache_chain); | 2320 | list_add(&cachep->next, &cache_chain); |
@@ -2375,7 +2482,6 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
2375 | * @cachep: the cache to destroy | 2482 | * @cachep: the cache to destroy |
2376 | * | 2483 | * |
2377 | * Remove a struct kmem_cache object from the slab cache. | 2484 | * Remove a struct kmem_cache object from the slab cache. |
2378 | * Returns 0 on success. | ||
2379 | * | 2485 | * |
2380 | * It is expected this function will be called by a module when it is | 2486 | * It is expected this function will be called by a module when it is |
2381 | * unloaded. This will remove the cache completely, and avoid a duplicate | 2487 | * unloaded. This will remove the cache completely, and avoid a duplicate |
@@ -2387,11 +2493,8 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
2387 | * The caller must guarantee that no one will allocate memory from the cache | 2493 | * The caller must guarantee that no one will allocate memory from the cache |
2388 | * during the kmem_cache_destroy(). | 2494 | * during the kmem_cache_destroy(). |
2389 | */ | 2495 | */ |
2390 | int kmem_cache_destroy(struct kmem_cache *cachep) | 2496 | void kmem_cache_destroy(struct kmem_cache *cachep) |
2391 | { | 2497 | { |
2392 | int i; | ||
2393 | struct kmem_list3 *l3; | ||
2394 | |||
2395 | BUG_ON(!cachep || in_interrupt()); | 2498 | BUG_ON(!cachep || in_interrupt()); |
2396 | 2499 | ||
2397 | /* Don't let CPUs come and go */ | 2500 | /* Don't let CPUs come and go */ |
@@ -2411,31 +2514,28 @@ int kmem_cache_destroy(struct kmem_cache *cachep) | |||
2411 | list_add(&cachep->next, &cache_chain); | 2514 | list_add(&cachep->next, &cache_chain); |
2412 | mutex_unlock(&cache_chain_mutex); | 2515 | mutex_unlock(&cache_chain_mutex); |
2413 | unlock_cpu_hotplug(); | 2516 | unlock_cpu_hotplug(); |
2414 | return 1; | 2517 | return; |
2415 | } | 2518 | } |
2416 | 2519 | ||
2417 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) | 2520 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) |
2418 | synchronize_rcu(); | 2521 | synchronize_rcu(); |
2419 | 2522 | ||
2420 | for_each_online_cpu(i) | 2523 | __kmem_cache_destroy(cachep); |
2421 | kfree(cachep->array[i]); | ||
2422 | |||
2423 | /* NUMA: free the list3 structures */ | ||
2424 | for_each_online_node(i) { | ||
2425 | l3 = cachep->nodelists[i]; | ||
2426 | if (l3) { | ||
2427 | kfree(l3->shared); | ||
2428 | free_alien_cache(l3->alien); | ||
2429 | kfree(l3); | ||
2430 | } | ||
2431 | } | ||
2432 | kmem_cache_free(&cache_cache, cachep); | ||
2433 | unlock_cpu_hotplug(); | 2524 | unlock_cpu_hotplug(); |
2434 | return 0; | ||
2435 | } | 2525 | } |
2436 | EXPORT_SYMBOL(kmem_cache_destroy); | 2526 | EXPORT_SYMBOL(kmem_cache_destroy); |
2437 | 2527 | ||
2438 | /* Get the memory for a slab management obj. */ | 2528 | /* |
2529 | * Get the memory for a slab management obj. | ||
2530 | * For a slab cache whose slab descriptor is off-slab, the slab descriptors | ||
2531 | * always come from malloc_sizes caches. The slab descriptor cannot come | ||
2532 | * from the cache that is being created because, when we search for an | ||
2533 | * appropriate cache for these descriptors in kmem_cache_create, we | ||
2534 | * search through the malloc_sizes array. | ||
2535 | * If we are creating a malloc_sizes cache here it would not be visible to | ||
2536 | * kmem_find_general_cachep until the initialization is complete. | ||
2537 | * Hence slabp_cache can never be the same as the cache being created. | ||
2538 | */ | ||
2439 | static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | 2539 | static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, |
2440 | int colour_off, gfp_t local_flags, | 2540 | int colour_off, gfp_t local_flags, |
2441 | int nodeid) | 2541 | int nodeid) |
@@ -2968,14 +3068,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
2968 | void *objp; | 3068 | void *objp; |
2969 | struct array_cache *ac; | 3069 | struct array_cache *ac; |
2970 | 3070 | ||
2971 | #ifdef CONFIG_NUMA | ||
2972 | if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { | ||
2973 | objp = alternate_node_alloc(cachep, flags); | ||
2974 | if (objp != NULL) | ||
2975 | return objp; | ||
2976 | } | ||
2977 | #endif | ||
2978 | |||
2979 | check_irq_off(); | 3071 | check_irq_off(); |
2980 | ac = cpu_cache_get(cachep); | 3072 | ac = cpu_cache_get(cachep); |
2981 | if (likely(ac->avail)) { | 3073 | if (likely(ac->avail)) { |
@@ -2993,12 +3085,24 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, | |||
2993 | gfp_t flags, void *caller) | 3085 | gfp_t flags, void *caller) |
2994 | { | 3086 | { |
2995 | unsigned long save_flags; | 3087 | unsigned long save_flags; |
2996 | void *objp; | 3088 | void *objp = NULL; |
2997 | 3089 | ||
2998 | cache_alloc_debugcheck_before(cachep, flags); | 3090 | cache_alloc_debugcheck_before(cachep, flags); |
2999 | 3091 | ||
3000 | local_irq_save(save_flags); | 3092 | local_irq_save(save_flags); |
3001 | objp = ____cache_alloc(cachep, flags); | 3093 | |
3094 | if (unlikely(NUMA_BUILD && | ||
3095 | current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) | ||
3096 | objp = alternate_node_alloc(cachep, flags); | ||
3097 | |||
3098 | if (!objp) | ||
3099 | objp = ____cache_alloc(cachep, flags); | ||
3100 | /* | ||
3101 | * We may just have run out of memory on the local node. | ||
3102 | * __cache_alloc_node() knows how to locate memory on other nodes | ||
3103 | */ | ||
3104 | if (NUMA_BUILD && !objp) | ||
3105 | objp = __cache_alloc_node(cachep, flags, numa_node_id()); | ||
3002 | local_irq_restore(save_flags); | 3106 | local_irq_restore(save_flags); |
3003 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, | 3107 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, |
3004 | caller); | 3108 | caller); |
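The restructured __cache_alloc tries allocation sources in a fixed order: a mempolicy or cpuset directed node first (when the task asks for slab spreading), then the local per-CPU and node caches, and finally other nodes. Condensed into one illustrative helper (same identifiers as the mm/slab.c context above, but not a function this patch adds):

	/* Illustrative ordering only; debug hooks and irq handling omitted. */
	static void *cache_alloc_order(struct kmem_cache *cachep, gfp_t flags)
	{
		void *objp = NULL;

		if (NUMA_BUILD && (current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
			objp = alternate_node_alloc(cachep, flags);	/* 1) policy-chosen node */
		if (!objp)
			objp = ____cache_alloc(cachep, flags);		/* 2) local node */
		if (NUMA_BUILD && !objp)
			objp = __cache_alloc_node(cachep, flags,
						  numa_node_id());	/* 3) may fall back further */
		return objp;
	}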
@@ -3017,7 +3121,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3017 | { | 3121 | { |
3018 | int nid_alloc, nid_here; | 3122 | int nid_alloc, nid_here; |
3019 | 3123 | ||
3020 | if (in_interrupt()) | 3124 | if (in_interrupt() || (flags & __GFP_THISNODE)) |
3021 | return NULL; | 3125 | return NULL; |
3022 | nid_alloc = nid_here = numa_node_id(); | 3126 | nid_alloc = nid_here = numa_node_id(); |
3023 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3127 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
@@ -3030,6 +3134,28 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3030 | } | 3134 | } |
3031 | 3135 | ||
3032 | /* | 3136 | /* |
3137 | * Fallback function if there was no memory available and no objects on a | ||
3138 | * certain node and we are allowed to fall back. We mimic the behavior of | ||
3139 | * the page allocator. We fall back according to a zonelist determined by | ||
3140 | * the policy layer while obeying cpuset constraints. | ||
3141 | */ | ||
3142 | void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | ||
3143 | { | ||
3144 | struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy)) | ||
3145 | ->node_zonelists[gfp_zone(flags)]; | ||
3146 | struct zone **z; | ||
3147 | void *obj = NULL; | ||
3148 | |||
3149 | for (z = zonelist->zones; *z && !obj; z++) | ||
3150 | if (zone_idx(*z) <= ZONE_NORMAL && | ||
3151 | cpuset_zone_allowed(*z, flags)) | ||
3152 | obj = __cache_alloc_node(cache, | ||
3153 | flags | __GFP_THISNODE, | ||
3154 | zone_to_nid(*z)); | ||
3155 | return obj; | ||
3156 | } | ||
3157 | |||
3158 | /* | ||
3033 | * An interface to enable slab creation on nodeid | 3159 | * An interface to enable slab creation on nodeid |
3034 | */ | 3160 | */ |
3035 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | 3161 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, |
@@ -3082,11 +3208,15 @@ retry: | |||
3082 | must_grow: | 3208 | must_grow: |
3083 | spin_unlock(&l3->list_lock); | 3209 | spin_unlock(&l3->list_lock); |
3084 | x = cache_grow(cachep, flags, nodeid); | 3210 | x = cache_grow(cachep, flags, nodeid); |
3211 | if (x) | ||
3212 | goto retry; | ||
3085 | 3213 | ||
3086 | if (!x) | 3214 | if (!(flags & __GFP_THISNODE)) |
3087 | return NULL; | 3215 | /* Unable to grow the cache. Fall back to other nodes. */ |
3216 | return fallback_alloc(cachep, flags); | ||
3217 | |||
3218 | return NULL; | ||
3088 | 3219 | ||
3089 | goto retry; | ||
3090 | done: | 3220 | done: |
3091 | return obj; | 3221 | return obj; |
3092 | } | 3222 | } |
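With the retry/fallback split above, __GFP_THISNODE is how a caller opts out of cross-node fallback: if the requested node cannot grow the cache the allocation fails instead of wandering to another node, while plain GFP_KERNEL requests now reach fallback_alloc(). A hedged usage sketch (alloc_on_node and its parameters are invented for illustration):

	#include <linux/slab.h>

	/* Contrast a node-pinned request with a best-effort node-local one. */
	static void *alloc_on_node(size_t sz, int nid)
	{
		/* Pinned: may return NULL if node nid has no memory to give. */
		void *obj = kmalloc_node(sz, GFP_KERNEL | __GFP_THISNODE, nid);

		if (obj)
			return obj;
		/* Best effort: slab may fall back to other nodes internally. */
		return kmalloc_node(sz, GFP_KERNEL, nid);
	}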
@@ -3119,6 +3249,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, | |||
3119 | if (slabp->inuse == 0) { | 3249 | if (slabp->inuse == 0) { |
3120 | if (l3->free_objects > l3->free_limit) { | 3250 | if (l3->free_objects > l3->free_limit) { |
3121 | l3->free_objects -= cachep->num; | 3251 | l3->free_objects -= cachep->num; |
3252 | /* No need to drop any previously held | ||
3253 | * lock here, even if we have an off-slab slab | ||
3254 | * descriptor; it is guaranteed to come from | ||
3255 | * a different cache, refer to comments before | ||
3256 | * alloc_slabmgmt. | ||
3257 | */ | ||
3122 | slab_destroy(cachep, slabp); | 3258 | slab_destroy(cachep, slabp); |
3123 | } else { | 3259 | } else { |
3124 | list_add(&slabp->list, &l3->slabs_free); | 3260 | list_add(&slabp->list, &l3->slabs_free); |
@@ -3317,7 +3453,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3317 | } | 3453 | } |
3318 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3454 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
3319 | 3455 | ||
3320 | void *kmalloc_node(size_t size, gfp_t flags, int node) | 3456 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
3321 | { | 3457 | { |
3322 | struct kmem_cache *cachep; | 3458 | struct kmem_cache *cachep; |
3323 | 3459 | ||
@@ -3326,7 +3462,7 @@ void *kmalloc_node(size_t size, gfp_t flags, int node) | |||
3326 | return NULL; | 3462 | return NULL; |
3327 | return kmem_cache_alloc_node(cachep, flags, node); | 3463 | return kmem_cache_alloc_node(cachep, flags, node); |
3328 | } | 3464 | } |
3329 | EXPORT_SYMBOL(kmalloc_node); | 3465 | EXPORT_SYMBOL(__kmalloc_node); |
3330 | #endif | 3466 | #endif |
3331 | 3467 | ||
3332 | /** | 3468 | /** |
@@ -3370,55 +3506,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) | |||
3370 | EXPORT_SYMBOL(__kmalloc_track_caller); | 3506 | EXPORT_SYMBOL(__kmalloc_track_caller); |
3371 | #endif | 3507 | #endif |
3372 | 3508 | ||
3373 | #ifdef CONFIG_SMP | ||
3374 | /** | ||
3375 | * __alloc_percpu - allocate one copy of the object for every present | ||
3376 | * cpu in the system, zeroing them. | ||
3377 | * Objects should be dereferenced using the per_cpu_ptr macro only. | ||
3378 | * | ||
3379 | * @size: how many bytes of memory are required. | ||
3380 | */ | ||
3381 | void *__alloc_percpu(size_t size) | ||
3382 | { | ||
3383 | int i; | ||
3384 | struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL); | ||
3385 | |||
3386 | if (!pdata) | ||
3387 | return NULL; | ||
3388 | |||
3389 | /* | ||
3390 | * Cannot use for_each_online_cpu since a cpu may come online | ||
3391 | * and we have no way of figuring out how to fix the array | ||
3392 | * that we have allocated then.... | ||
3393 | */ | ||
3394 | for_each_possible_cpu(i) { | ||
3395 | int node = cpu_to_node(i); | ||
3396 | |||
3397 | if (node_online(node)) | ||
3398 | pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node); | ||
3399 | else | ||
3400 | pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); | ||
3401 | |||
3402 | if (!pdata->ptrs[i]) | ||
3403 | goto unwind_oom; | ||
3404 | memset(pdata->ptrs[i], 0, size); | ||
3405 | } | ||
3406 | |||
3407 | /* Catch derefs w/o wrappers */ | ||
3408 | return (void *)(~(unsigned long)pdata); | ||
3409 | |||
3410 | unwind_oom: | ||
3411 | while (--i >= 0) { | ||
3412 | if (!cpu_possible(i)) | ||
3413 | continue; | ||
3414 | kfree(pdata->ptrs[i]); | ||
3415 | } | ||
3416 | kfree(pdata); | ||
3417 | return NULL; | ||
3418 | } | ||
3419 | EXPORT_SYMBOL(__alloc_percpu); | ||
3420 | #endif | ||
3421 | |||
3422 | /** | 3509 | /** |
3423 | * kmem_cache_free - Deallocate an object | 3510 | * kmem_cache_free - Deallocate an object |
3424 | * @cachep: The cache the allocation was from. | 3511 | * @cachep: The cache the allocation was from. |
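This hunk drops the open-coded __alloc_percpu() from slab.c (its free_percpu() counterpart is removed a little further down); callers keep the interface the removed comment documents: allocate once, dereference only through per_cpu_ptr(), release with free_percpu(). A hedged usage sketch (struct hit_counter and the helpers are invented for illustration):

	#include <linux/percpu.h>
	#include <linux/smp.h>
	#include <linux/errno.h>

	struct hit_counter {
		unsigned long hits;
	};

	static struct hit_counter *counters;	/* one zeroed copy per possible CPU */

	static int counters_init(void)
	{
		counters = alloc_percpu(struct hit_counter);
		return counters ? 0 : -ENOMEM;
	}

	static void counters_bump(void)
	{
		int cpu = get_cpu();		/* stay on this CPU while touching its copy */

		per_cpu_ptr(counters, cpu)->hits++;
		put_cpu();
	}

	static void counters_exit(void)
	{
		free_percpu(counters);
	}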
@@ -3464,29 +3551,6 @@ void kfree(const void *objp) | |||
3464 | } | 3551 | } |
3465 | EXPORT_SYMBOL(kfree); | 3552 | EXPORT_SYMBOL(kfree); |
3466 | 3553 | ||
3467 | #ifdef CONFIG_SMP | ||
3468 | /** | ||
3469 | * free_percpu - free previously allocated percpu memory | ||
3470 | * @objp: pointer returned by alloc_percpu. | ||
3471 | * | ||
3472 | * Don't free memory not originally allocated by alloc_percpu() | ||
3473 | * The complemented objp is to check for that. | ||
3474 | */ | ||
3475 | void free_percpu(const void *objp) | ||
3476 | { | ||
3477 | int i; | ||
3478 | struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp); | ||
3479 | |||
3480 | /* | ||
3481 | * We allocate for all possible cpus, so we cannot restrict this loop to online cpus. | ||
3482 | */ | ||
3483 | for_each_possible_cpu(i) | ||
3484 | kfree(p->ptrs[i]); | ||
3485 | kfree(p); | ||
3486 | } | ||
3487 | EXPORT_SYMBOL(free_percpu); | ||
3488 | #endif | ||
3489 | |||
3490 | unsigned int kmem_cache_size(struct kmem_cache *cachep) | 3554 | unsigned int kmem_cache_size(struct kmem_cache *cachep) |
3491 | { | 3555 | { |
3492 | return obj_size(cachep); | 3556 | return obj_size(cachep); |
@@ -3603,22 +3667,26 @@ static void do_ccupdate_local(void *info) | |||
3603 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | 3667 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, |
3604 | int batchcount, int shared) | 3668 | int batchcount, int shared) |
3605 | { | 3669 | { |
3606 | struct ccupdate_struct new; | 3670 | struct ccupdate_struct *new; |
3607 | int i, err; | 3671 | int i; |
3672 | |||
3673 | new = kzalloc(sizeof(*new), GFP_KERNEL); | ||
3674 | if (!new) | ||
3675 | return -ENOMEM; | ||
3608 | 3676 | ||
3609 | memset(&new.new, 0, sizeof(new.new)); | ||
3610 | for_each_online_cpu(i) { | 3677 | for_each_online_cpu(i) { |
3611 | new.new[i] = alloc_arraycache(cpu_to_node(i), limit, | 3678 | new->new[i] = alloc_arraycache(cpu_to_node(i), limit, |
3612 | batchcount); | 3679 | batchcount); |
3613 | if (!new.new[i]) { | 3680 | if (!new->new[i]) { |
3614 | for (i--; i >= 0; i--) | 3681 | for (i--; i >= 0; i--) |
3615 | kfree(new.new[i]); | 3682 | kfree(new->new[i]); |
3683 | kfree(new); | ||
3616 | return -ENOMEM; | 3684 | return -ENOMEM; |
3617 | } | 3685 | } |
3618 | } | 3686 | } |
3619 | new.cachep = cachep; | 3687 | new->cachep = cachep; |
3620 | 3688 | ||
3621 | on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); | 3689 | on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); |
3622 | 3690 | ||
3623 | check_irq_on(); | 3691 | check_irq_on(); |
3624 | cachep->batchcount = batchcount; | 3692 | cachep->batchcount = batchcount; |
@@ -3626,7 +3694,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3626 | cachep->shared = shared; | 3694 | cachep->shared = shared; |
3627 | 3695 | ||
3628 | for_each_online_cpu(i) { | 3696 | for_each_online_cpu(i) { |
3629 | struct array_cache *ccold = new.new[i]; | 3697 | struct array_cache *ccold = new->new[i]; |
3630 | if (!ccold) | 3698 | if (!ccold) |
3631 | continue; | 3699 | continue; |
3632 | spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); | 3700 | spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); |
@@ -3634,18 +3702,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3634 | spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); | 3702 | spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); |
3635 | kfree(ccold); | 3703 | kfree(ccold); |
3636 | } | 3704 | } |
3637 | 3705 | kfree(new); | |
3638 | err = alloc_kmemlist(cachep); | 3706 | return alloc_kmemlist(cachep); |
3639 | if (err) { | ||
3640 | printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", | ||
3641 | cachep->name, -err); | ||
3642 | BUG(); | ||
3643 | } | ||
3644 | return 0; | ||
3645 | } | 3707 | } |
3646 | 3708 | ||
3647 | /* Called with cache_chain_mutex held always */ | 3709 | /* Called with cache_chain_mutex held always */ |
3648 | static void enable_cpucache(struct kmem_cache *cachep) | 3710 | static int enable_cpucache(struct kmem_cache *cachep) |
3649 | { | 3711 | { |
3650 | int err; | 3712 | int err; |
3651 | int limit, shared; | 3713 | int limit, shared; |
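struct ccupdate_struct embeds an NR_CPUS-sized pointer array, so it is now allocated with kzalloc() and freed rather than living on the stack; keeping multi-kilobyte scratch data off the kernel stack is presumably the motivation, though the hunk only shows the mechanics. The general shape of the pattern, as a sketch with invented names:

	#include <linux/slab.h>
	#include <linux/errno.h>

	struct big_scratch {
		void *slot[NR_CPUS];	/* e.g. 8 KB with NR_CPUS=1024 on 64-bit */
	};

	static int with_scratch(int (*fn)(struct big_scratch *))
	{
		struct big_scratch *s = kzalloc(sizeof(*s), GFP_KERNEL);
		int err;

		if (!s)
			return -ENOMEM;
		err = fn(s);		/* use the heap copy instead of a stack frame */
		kfree(s);
		return err;
	}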
@@ -3697,6 +3759,7 @@ static void enable_cpucache(struct kmem_cache *cachep) | |||
3697 | if (err) | 3759 | if (err) |
3698 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", | 3760 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", |
3699 | cachep->name, -err); | 3761 | cachep->name, -err); |
3762 | return err; | ||
3700 | } | 3763 | } |
3701 | 3764 | ||
3702 | /* | 3765 | /* |
@@ -4157,6 +4220,7 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4157 | show_symbol(m, n[2*i+2]); | 4220 | show_symbol(m, n[2*i+2]); |
4158 | seq_putc(m, '\n'); | 4221 | seq_putc(m, '\n'); |
4159 | } | 4222 | } |
4223 | |||
4160 | return 0; | 4224 | return 0; |
4161 | } | 4225 | } |
4162 | 4226 | ||
@@ -270,10 +270,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
270 | } | 270 | } |
271 | EXPORT_SYMBOL(kmem_cache_create); | 271 | EXPORT_SYMBOL(kmem_cache_create); |
272 | 272 | ||
273 | int kmem_cache_destroy(struct kmem_cache *c) | 273 | void kmem_cache_destroy(struct kmem_cache *c) |
274 | { | 274 | { |
275 | slob_free(c, sizeof(struct kmem_cache)); | 275 | slob_free(c, sizeof(struct kmem_cache)); |
276 | return 0; | ||
277 | } | 276 | } |
278 | EXPORT_SYMBOL(kmem_cache_destroy); | 277 | EXPORT_SYMBOL(kmem_cache_destroy); |
279 | 278 | ||
@@ -339,52 +338,3 @@ void kmem_cache_init(void) | |||
339 | 338 | ||
340 | mod_timer(&slob_timer, jiffies + HZ); | 339 | mod_timer(&slob_timer, jiffies + HZ); |
341 | } | 340 | } |
342 | |||
343 | atomic_t slab_reclaim_pages = ATOMIC_INIT(0); | ||
344 | EXPORT_SYMBOL(slab_reclaim_pages); | ||
345 | |||
346 | #ifdef CONFIG_SMP | ||
347 | |||
348 | void *__alloc_percpu(size_t size) | ||
349 | { | ||
350 | int i; | ||
351 | struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); | ||
352 | |||
353 | if (!pdata) | ||
354 | return NULL; | ||
355 | |||
356 | for_each_possible_cpu(i) { | ||
357 | pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); | ||
358 | if (!pdata->ptrs[i]) | ||
359 | goto unwind_oom; | ||
360 | memset(pdata->ptrs[i], 0, size); | ||
361 | } | ||
362 | |||
363 | /* Catch derefs w/o wrappers */ | ||
364 | return (void *) (~(unsigned long) pdata); | ||
365 | |||
366 | unwind_oom: | ||
367 | while (--i >= 0) { | ||
368 | if (!cpu_possible(i)) | ||
369 | continue; | ||
370 | kfree(pdata->ptrs[i]); | ||
371 | } | ||
372 | kfree(pdata); | ||
373 | return NULL; | ||
374 | } | ||
375 | EXPORT_SYMBOL(__alloc_percpu); | ||
376 | |||
377 | void | ||
378 | free_percpu(const void *objp) | ||
379 | { | ||
380 | int i; | ||
381 | struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); | ||
382 | |||
383 | for_each_possible_cpu(i) | ||
384 | kfree(p->ptrs[i]); | ||
385 | |||
386 | kfree(p); | ||
387 | } | ||
388 | EXPORT_SYMBOL(free_percpu); | ||
389 | |||
390 | #endif | ||
@@ -34,6 +34,25 @@ | |||
34 | /* How many pages do we try to swap or page in/out together? */ | 34 | /* How many pages do we try to swap or page in/out together? */ |
35 | int page_cluster; | 35 | int page_cluster; |
36 | 36 | ||
37 | /* | ||
38 | * This path almost never happens for VM activity - pages are normally | ||
39 | * freed via pagevecs. But it gets used by networking. | ||
40 | */ | ||
41 | static void fastcall __page_cache_release(struct page *page) | ||
42 | { | ||
43 | if (PageLRU(page)) { | ||
44 | unsigned long flags; | ||
45 | struct zone *zone = page_zone(page); | ||
46 | |||
47 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
48 | VM_BUG_ON(!PageLRU(page)); | ||
49 | __ClearPageLRU(page); | ||
50 | del_page_from_lru(zone, page); | ||
51 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
52 | } | ||
53 | free_hot_page(page); | ||
54 | } | ||
55 | |||
37 | static void put_compound_page(struct page *page) | 56 | static void put_compound_page(struct page *page) |
38 | { | 57 | { |
39 | page = (struct page *)page_private(page); | 58 | page = (struct page *)page_private(page); |
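This hunk (and several below in mm/swap.c and mm/vmscan.c) also converts BUG_ON to VM_BUG_ON for the LRU-state assertions. As far as the usage here shows, VM_BUG_ON is a config-gated assertion: it behaves like BUG_ON when CONFIG_DEBUG_VM is set and compiles to nothing otherwise, so these hot-path checks cost nothing on production builds. A plausible definition, not part of this diff:

	#ifdef CONFIG_DEBUG_VM
	#define VM_BUG_ON(cond)	BUG_ON(cond)
	#else
	#define VM_BUG_ON(cond)	do { } while (0)
	#endif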
@@ -223,26 +242,6 @@ int lru_add_drain_all(void) | |||
223 | #endif | 242 | #endif |
224 | 243 | ||
225 | /* | 244 | /* |
226 | * This path almost never happens for VM activity - pages are normally | ||
227 | * freed via pagevecs. But it gets used by networking. | ||
228 | */ | ||
229 | void fastcall __page_cache_release(struct page *page) | ||
230 | { | ||
231 | if (PageLRU(page)) { | ||
232 | unsigned long flags; | ||
233 | struct zone *zone = page_zone(page); | ||
234 | |||
235 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
236 | BUG_ON(!PageLRU(page)); | ||
237 | __ClearPageLRU(page); | ||
238 | del_page_from_lru(zone, page); | ||
239 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
240 | } | ||
241 | free_hot_page(page); | ||
242 | } | ||
243 | EXPORT_SYMBOL(__page_cache_release); | ||
244 | |||
245 | /* | ||
246 | * Batched page_cache_release(). Decrement the reference count on all the | 245 | * Batched page_cache_release(). Decrement the reference count on all the |
247 | * passed pages. If it fell to zero then remove the page from the LRU and | 246 | * passed pages. If it fell to zero then remove the page from the LRU and |
248 | * free it. | 247 | * free it. |
@@ -284,7 +283,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
284 | zone = pagezone; | 283 | zone = pagezone; |
285 | spin_lock_irq(&zone->lru_lock); | 284 | spin_lock_irq(&zone->lru_lock); |
286 | } | 285 | } |
287 | BUG_ON(!PageLRU(page)); | 286 | VM_BUG_ON(!PageLRU(page)); |
288 | __ClearPageLRU(page); | 287 | __ClearPageLRU(page); |
289 | del_page_from_lru(zone, page); | 288 | del_page_from_lru(zone, page); |
290 | } | 289 | } |
@@ -337,7 +336,7 @@ void __pagevec_release_nonlru(struct pagevec *pvec) | |||
337 | for (i = 0; i < pagevec_count(pvec); i++) { | 336 | for (i = 0; i < pagevec_count(pvec); i++) { |
338 | struct page *page = pvec->pages[i]; | 337 | struct page *page = pvec->pages[i]; |
339 | 338 | ||
340 | BUG_ON(PageLRU(page)); | 339 | VM_BUG_ON(PageLRU(page)); |
341 | if (put_page_testzero(page)) | 340 | if (put_page_testzero(page)) |
342 | pagevec_add(&pages_to_free, page); | 341 | pagevec_add(&pages_to_free, page); |
343 | } | 342 | } |
@@ -364,7 +363,7 @@ void __pagevec_lru_add(struct pagevec *pvec) | |||
364 | zone = pagezone; | 363 | zone = pagezone; |
365 | spin_lock_irq(&zone->lru_lock); | 364 | spin_lock_irq(&zone->lru_lock); |
366 | } | 365 | } |
367 | BUG_ON(PageLRU(page)); | 366 | VM_BUG_ON(PageLRU(page)); |
368 | SetPageLRU(page); | 367 | SetPageLRU(page); |
369 | add_page_to_inactive_list(zone, page); | 368 | add_page_to_inactive_list(zone, page); |
370 | } | 369 | } |
@@ -391,9 +390,9 @@ void __pagevec_lru_add_active(struct pagevec *pvec) | |||
391 | zone = pagezone; | 390 | zone = pagezone; |
392 | spin_lock_irq(&zone->lru_lock); | 391 | spin_lock_irq(&zone->lru_lock); |
393 | } | 392 | } |
394 | BUG_ON(PageLRU(page)); | 393 | VM_BUG_ON(PageLRU(page)); |
395 | SetPageLRU(page); | 394 | SetPageLRU(page); |
396 | BUG_ON(PageActive(page)); | 395 | VM_BUG_ON(PageActive(page)); |
397 | SetPageActive(page); | 396 | SetPageActive(page); |
398 | add_page_to_active_list(zone, page); | 397 | add_page_to_active_list(zone, page); |
399 | } | 398 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index f1f5ec7837..a15def63f2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1723,13 +1723,14 @@ get_swap_info_struct(unsigned type) | |||
1723 | */ | 1723 | */ |
1724 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | 1724 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) |
1725 | { | 1725 | { |
1726 | int ret = 0, i = 1 << page_cluster; | 1726 | int our_page_cluster = page_cluster; |
1727 | int ret = 0, i = 1 << our_page_cluster; | ||
1727 | unsigned long toff; | 1728 | unsigned long toff; |
1728 | struct swap_info_struct *swapdev = swp_type(entry) + swap_info; | 1729 | struct swap_info_struct *swapdev = swp_type(entry) + swap_info; |
1729 | 1730 | ||
1730 | if (!page_cluster) /* no readahead */ | 1731 | if (!our_page_cluster) /* no readahead */ |
1731 | return 0; | 1732 | return 0; |
1732 | toff = (swp_offset(entry) >> page_cluster) << page_cluster; | 1733 | toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster; |
1733 | if (!toff) /* first page is swap header */ | 1734 | if (!toff) /* first page is swap header */ |
1734 | toff++, i--; | 1735 | toff++, i--; |
1735 | *offset = toff; | 1736 | *offset = toff; |
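valid_swaphandles now reads page_cluster once into our_page_cluster and uses that snapshot throughout, so a concurrent sysctl update cannot change the shift between computing the readahead window size and computing its start offset. The same snapshot pattern in isolation (some_tunable and window_start are illustrative names):

	/* Sketch: use one consistent snapshot of a tunable that can change underneath us. */
	extern int some_tunable;		/* may be rewritten by a sysctl handler */

	static unsigned long window_start(unsigned long offset, unsigned long *len)
	{
		int shift = some_tunable;	/* read once */

		*len = 1UL << shift;
		return (offset >> shift) << shift;
	}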
diff --git a/mm/truncate.c b/mm/truncate.c index c6ab55ec68..f4edbc179d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -9,6 +9,7 @@ | |||
9 | 9 | ||
10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
12 | #include <linux/swap.h> | ||
12 | #include <linux/module.h> | 13 | #include <linux/module.h> |
13 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
14 | #include <linux/pagevec.h> | 15 | #include <linux/pagevec.h> |
@@ -16,6 +17,32 @@ | |||
16 | do_invalidatepage */ | 17 | do_invalidatepage */ |
17 | 18 | ||
18 | 19 | ||
20 | /** | ||
21 | * do_invalidatepage - invalidate part or all of a page | ||
22 | * @page: the page which is affected | ||
23 | * @offset: the index of the truncation point | ||
24 | * | ||
25 | * do_invalidatepage() is called when all or part of the page has become | ||
26 | * invalidated by a truncate operation. | ||
27 | * | ||
28 | * do_invalidatepage() does not have to release all buffers, but it must | ||
29 | * ensure that no dirty buffer is left outside @offset and that no I/O | ||
30 | * is underway against any of the blocks which are outside the truncation | ||
31 | * point, because the caller is about to free (and possibly reuse) those | ||
32 | * blocks on-disk. | ||
33 | */ | ||
34 | void do_invalidatepage(struct page *page, unsigned long offset) | ||
35 | { | ||
36 | void (*invalidatepage)(struct page *, unsigned long); | ||
37 | invalidatepage = page->mapping->a_ops->invalidatepage; | ||
38 | #ifdef CONFIG_BLOCK | ||
39 | if (!invalidatepage) | ||
40 | invalidatepage = block_invalidatepage; | ||
41 | #endif | ||
42 | if (invalidatepage) | ||
43 | (*invalidatepage)(page, offset); | ||
44 | } | ||
45 | |||
19 | static inline void truncate_partial_page(struct page *page, unsigned partial) | 46 | static inline void truncate_partial_page(struct page *page, unsigned partial) |
20 | { | 47 | { |
21 | memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); | 48 | memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); |
@@ -52,36 +79,26 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
52 | /* | 79 | /* |
53 | * This is for invalidate_inode_pages(). That function can be called at | 80 | * This is for invalidate_inode_pages(). That function can be called at |
54 | * any time, and is not supposed to throw away dirty pages. But pages can | 81 | * any time, and is not supposed to throw away dirty pages. But pages can |
55 | * be marked dirty at any time too. So we re-check the dirtiness inside | 82 | * be marked dirty at any time too, so use remove_mapping which safely |
56 | * ->tree_lock. That provides exclusion against the __set_page_dirty | 83 | * discards clean, unused pages. |
57 | * functions. | ||
58 | * | 84 | * |
59 | * Returns non-zero if the page was successfully invalidated. | 85 | * Returns non-zero if the page was successfully invalidated. |
60 | */ | 86 | */ |
61 | static int | 87 | static int |
62 | invalidate_complete_page(struct address_space *mapping, struct page *page) | 88 | invalidate_complete_page(struct address_space *mapping, struct page *page) |
63 | { | 89 | { |
90 | int ret; | ||
91 | |||
64 | if (page->mapping != mapping) | 92 | if (page->mapping != mapping) |
65 | return 0; | 93 | return 0; |
66 | 94 | ||
67 | if (PagePrivate(page) && !try_to_release_page(page, 0)) | 95 | if (PagePrivate(page) && !try_to_release_page(page, 0)) |
68 | return 0; | 96 | return 0; |
69 | 97 | ||
70 | write_lock_irq(&mapping->tree_lock); | 98 | ret = remove_mapping(mapping, page); |
71 | if (PageDirty(page)) | ||
72 | goto failed; | ||
73 | if (page_count(page) != 2) /* caller's ref + pagecache ref */ | ||
74 | goto failed; | ||
75 | |||
76 | BUG_ON(PagePrivate(page)); | ||
77 | __remove_from_page_cache(page); | ||
78 | write_unlock_irq(&mapping->tree_lock); | ||
79 | ClearPageUptodate(page); | 99 | ClearPageUptodate(page); |
80 | page_cache_release(page); /* pagecache ref */ | 100 | |
81 | return 1; | 101 | return ret; |
82 | failed: | ||
83 | write_unlock_irq(&mapping->tree_lock); | ||
84 | return 0; | ||
85 | } | 102 | } |
86 | 103 | ||
87 | /** | 104 | /** |
@@ -270,9 +287,39 @@ unsigned long invalidate_inode_pages(struct address_space *mapping) | |||
270 | { | 287 | { |
271 | return invalidate_mapping_pages(mapping, 0, ~0UL); | 288 | return invalidate_mapping_pages(mapping, 0, ~0UL); |
272 | } | 289 | } |
273 | |||
274 | EXPORT_SYMBOL(invalidate_inode_pages); | 290 | EXPORT_SYMBOL(invalidate_inode_pages); |
275 | 291 | ||
292 | /* | ||
293 | * This is like invalidate_complete_page(), except it ignores the page's | ||
294 | * refcount. We do this because invalidate_inode_pages2() needs stronger | ||
295 | * invalidation guarantees, and cannot afford to leave pages behind because | ||
296 | * shrink_list() has a temp ref on them, or because they're transiently sitting | ||
297 | * in the lru_cache_add() pagevecs. | ||
298 | */ | ||
299 | static int | ||
300 | invalidate_complete_page2(struct address_space *mapping, struct page *page) | ||
301 | { | ||
302 | if (page->mapping != mapping) | ||
303 | return 0; | ||
304 | |||
305 | if (PagePrivate(page) && !try_to_release_page(page, 0)) | ||
306 | return 0; | ||
307 | |||
308 | write_lock_irq(&mapping->tree_lock); | ||
309 | if (PageDirty(page)) | ||
310 | goto failed; | ||
311 | |||
312 | BUG_ON(PagePrivate(page)); | ||
313 | __remove_from_page_cache(page); | ||
314 | write_unlock_irq(&mapping->tree_lock); | ||
315 | ClearPageUptodate(page); | ||
316 | page_cache_release(page); /* pagecache ref */ | ||
317 | return 1; | ||
318 | failed: | ||
319 | write_unlock_irq(&mapping->tree_lock); | ||
320 | return 0; | ||
321 | } | ||
322 | |||
276 | /** | 323 | /** |
277 | * invalidate_inode_pages2_range - remove range of pages from an address_space | 324 | * invalidate_inode_pages2_range - remove range of pages from an address_space |
278 | * @mapping: the address_space | 325 | * @mapping: the address_space |
@@ -339,7 +386,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
339 | } | 386 | } |
340 | } | 387 | } |
341 | was_dirty = test_clear_page_dirty(page); | 388 | was_dirty = test_clear_page_dirty(page); |
342 | if (!invalidate_complete_page(mapping, page)) { | 389 | if (!invalidate_complete_page2(mapping, page)) { |
343 | if (was_dirty) | 390 | if (was_dirty) |
344 | set_page_dirty(page); | 391 | set_page_dirty(page); |
345 | ret = -EIO; | 392 | ret = -EIO; |
@@ -40,6 +40,24 @@ char *kstrdup(const char *s, gfp_t gfp) | |||
40 | } | 40 | } |
41 | EXPORT_SYMBOL(kstrdup); | 41 | EXPORT_SYMBOL(kstrdup); |
42 | 42 | ||
43 | /** | ||
44 | * kmemdup - duplicate region of memory | ||
45 | * | ||
46 | * @src: memory region to duplicate | ||
47 | * @len: memory region length | ||
48 | * @gfp: GFP mask to use | ||
49 | */ | ||
50 | void *kmemdup(const void *src, size_t len, gfp_t gfp) | ||
51 | { | ||
52 | void *p; | ||
53 | |||
54 | p = ____kmalloc(len, gfp); | ||
55 | if (p) | ||
56 | memcpy(p, src, len); | ||
57 | return p; | ||
58 | } | ||
59 | EXPORT_SYMBOL(kmemdup); | ||
60 | |||
43 | /* | 61 | /* |
44 | * strndup_user - duplicate an existing string from user space | 62 | * strndup_user - duplicate an existing string from user space |
45 | * | 63 | * |
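kmemdup() packages the common kmalloc-then-memcpy sequence, doing for arbitrary buffers what kstrdup() does for strings. A hedged usage sketch (struct fw_blob and blob_copy are invented for illustration):

	#include <linux/slab.h>
	#include <linux/string.h>

	struct fw_blob {
		const void *data;
		size_t len;
	};

	/* Duplicate a caller-owned buffer so the copy can outlive the original. */
	static void *blob_copy(const struct fw_blob *src)
	{
		return kmemdup(src->data, src->len, GFP_KERNEL);
	}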
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 266162d2ba..1ac191ce56 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -24,6 +24,9 @@ | |||
24 | DEFINE_RWLOCK(vmlist_lock); | 24 | DEFINE_RWLOCK(vmlist_lock); |
25 | struct vm_struct *vmlist; | 25 | struct vm_struct *vmlist; |
26 | 26 | ||
27 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | ||
28 | int node); | ||
29 | |||
27 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | 30 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) |
28 | { | 31 | { |
29 | pte_t *pte; | 32 | pte_t *pte; |
@@ -238,7 +241,6 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | |||
238 | 241 | ||
239 | /** | 242 | /** |
240 | * get_vm_area - reserve a contiguous kernel virtual area | 243 | * get_vm_area - reserve a contiguous kernel virtual area |
241 | * | ||
242 | * @size: size of the area | 244 | * @size: size of the area |
243 | * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC | 245 | * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC |
244 | * | 246 | * |
@@ -270,7 +272,7 @@ static struct vm_struct *__find_vm_area(void *addr) | |||
270 | } | 272 | } |
271 | 273 | ||
272 | /* Caller must hold vmlist_lock */ | 274 | /* Caller must hold vmlist_lock */ |
273 | struct vm_struct *__remove_vm_area(void *addr) | 275 | static struct vm_struct *__remove_vm_area(void *addr) |
274 | { | 276 | { |
275 | struct vm_struct **p, *tmp; | 277 | struct vm_struct **p, *tmp; |
276 | 278 | ||
@@ -293,7 +295,6 @@ found: | |||
293 | 295 | ||
294 | /** | 296 | /** |
295 | * remove_vm_area - find and remove a contiguous kernel virtual area | 297 | * remove_vm_area - find and remove a contiguous kernel virtual area |
296 | * | ||
297 | * @addr: base address | 298 | * @addr: base address |
298 | * | 299 | * |
299 | * Search for the kernel VM area starting at @addr, and remove it. | 300 | * Search for the kernel VM area starting at @addr, and remove it. |
@@ -352,7 +353,6 @@ void __vunmap(void *addr, int deallocate_pages) | |||
352 | 353 | ||
353 | /** | 354 | /** |
354 | * vfree - release memory allocated by vmalloc() | 355 | * vfree - release memory allocated by vmalloc() |
355 | * | ||
356 | * @addr: memory base address | 356 | * @addr: memory base address |
357 | * | 357 | * |
358 | * Free the virtually contiguous memory area starting at @addr, as | 358 | * Free the virtually contiguous memory area starting at @addr, as |
@@ -370,7 +370,6 @@ EXPORT_SYMBOL(vfree); | |||
370 | 370 | ||
371 | /** | 371 | /** |
372 | * vunmap - release virtual mapping obtained by vmap() | 372 | * vunmap - release virtual mapping obtained by vmap() |
373 | * | ||
374 | * @addr: memory base address | 373 | * @addr: memory base address |
375 | * | 374 | * |
376 | * Free the virtually contiguous memory area starting at @addr, | 375 | * Free the virtually contiguous memory area starting at @addr, |
@@ -387,7 +386,6 @@ EXPORT_SYMBOL(vunmap); | |||
387 | 386 | ||
388 | /** | 387 | /** |
389 | * vmap - map an array of pages into virtually contiguous space | 388 | * vmap - map an array of pages into virtually contiguous space |
390 | * | ||
391 | * @pages: array of page pointers | 389 | * @pages: array of page pointers |
392 | * @count: number of pages to map | 390 | * @count: number of pages to map |
393 | * @flags: vm_area->flags | 391 | * @flags: vm_area->flags |
@@ -468,7 +466,6 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
468 | 466 | ||
469 | /** | 467 | /** |
470 | * __vmalloc_node - allocate virtually contiguous memory | 468 | * __vmalloc_node - allocate virtually contiguous memory |
471 | * | ||
472 | * @size: allocation size | 469 | * @size: allocation size |
473 | * @gfp_mask: flags for the page level allocator | 470 | * @gfp_mask: flags for the page level allocator |
474 | * @prot: protection mask for the allocated pages | 471 | * @prot: protection mask for the allocated pages |
@@ -478,8 +475,8 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
478 | * allocator with @gfp_mask flags. Map them into contiguous | 475 | * allocator with @gfp_mask flags. Map them into contiguous |
479 | * kernel virtual space, using a pagetable protection of @prot. | 476 | * kernel virtual space, using a pagetable protection of @prot. |
480 | */ | 477 | */ |
481 | void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | 478 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, |
482 | int node) | 479 | int node) |
483 | { | 480 | { |
484 | struct vm_struct *area; | 481 | struct vm_struct *area; |
485 | 482 | ||
@@ -493,7 +490,6 @@ void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | |||
493 | 490 | ||
494 | return __vmalloc_area_node(area, gfp_mask, prot, node); | 491 | return __vmalloc_area_node(area, gfp_mask, prot, node); |
495 | } | 492 | } |
496 | EXPORT_SYMBOL(__vmalloc_node); | ||
497 | 493 | ||
498 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 494 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
499 | { | 495 | { |
@@ -503,9 +499,7 @@ EXPORT_SYMBOL(__vmalloc); | |||
503 | 499 | ||
504 | /** | 500 | /** |
505 | * vmalloc - allocate virtually contiguous memory | 501 | * vmalloc - allocate virtually contiguous memory |
506 | * | ||
507 | * @size: allocation size | 502 | * @size: allocation size |
508 | * | ||
509 | * Allocate enough pages to cover @size from the page level | 503 | * Allocate enough pages to cover @size from the page level |
510 | * allocator and map them into contiguous kernel virtual space. | 504 | * allocator and map them into contiguous kernel virtual space. |
511 | * | 505 | * |
@@ -519,11 +513,11 @@ void *vmalloc(unsigned long size) | |||
519 | EXPORT_SYMBOL(vmalloc); | 513 | EXPORT_SYMBOL(vmalloc); |
520 | 514 | ||
521 | /** | 515 | /** |
522 | * vmalloc_user - allocate virtually contiguous memory which has | 516 | * vmalloc_user - allocate zeroed virtually contiguous memory for userspace |
523 | * been zeroed so it can be mapped to userspace without | 517 | * @size: allocation size |
524 | * leaking data. | ||
525 | * | 518 | * |
526 | * @size: allocation size | 519 | * The resulting memory area is zeroed so it can be mapped to userspace |
520 | * without leaking data. | ||
527 | */ | 521 | */ |
528 | void *vmalloc_user(unsigned long size) | 522 | void *vmalloc_user(unsigned long size) |
529 | { | 523 | { |
@@ -542,7 +536,6 @@ EXPORT_SYMBOL(vmalloc_user); | |||
542 | 536 | ||
543 | /** | 537 | /** |
544 | * vmalloc_node - allocate memory on a specific node | 538 | * vmalloc_node - allocate memory on a specific node |
545 | * | ||
546 | * @size: allocation size | 539 | * @size: allocation size |
547 | * @node: numa node | 540 | * @node: numa node |
548 | * | 541 | * |
@@ -564,7 +557,6 @@ EXPORT_SYMBOL(vmalloc_node); | |||
564 | 557 | ||
565 | /** | 558 | /** |
566 | * vmalloc_exec - allocate virtually contiguous, executable memory | 559 | * vmalloc_exec - allocate virtually contiguous, executable memory |
567 | * | ||
568 | * @size: allocation size | 560 | * @size: allocation size |
569 | * | 561 | * |
570 | * Kernel-internal function to allocate enough pages to cover @size | 562 | * Kernel-internal function to allocate enough pages to cover @size |
@@ -582,7 +574,6 @@ void *vmalloc_exec(unsigned long size) | |||
582 | 574 | ||
583 | /** | 575 | /** |
584 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) | 576 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) |
585 | * | ||
586 | * @size: allocation size | 577 | * @size: allocation size |
587 | * | 578 | * |
588 | * Allocate enough 32bit PA addressable pages to cover @size from the | 579 | * Allocate enough 32bit PA addressable pages to cover @size from the |
@@ -595,11 +586,11 @@ void *vmalloc_32(unsigned long size) | |||
595 | EXPORT_SYMBOL(vmalloc_32); | 586 | EXPORT_SYMBOL(vmalloc_32); |
596 | 587 | ||
597 | /** | 588 | /** |
598 | * vmalloc_32_user - allocate virtually contiguous memory (32bit | 589 | * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory |
599 | * addressable) which is zeroed so it can be | ||
600 | * mapped to userspace without leaking data. | ||
601 | * | ||
602 | * @size: allocation size | 590 | * @size: allocation size |
591 | * | ||
592 | * The resulting memory area is 32bit addressable and zeroed so it can be | ||
593 | * mapped to userspace without leaking data. | ||
603 | */ | 594 | */ |
604 | void *vmalloc_32_user(unsigned long size) | 595 | void *vmalloc_32_user(unsigned long size) |
605 | { | 596 | { |
@@ -693,7 +684,6 @@ finished: | |||
693 | 684 | ||
694 | /** | 685 | /** |
695 | * remap_vmalloc_range - map vmalloc pages to userspace | 686 | * remap_vmalloc_range - map vmalloc pages to userspace |
696 | * | ||
697 | * @vma: vma to cover (map full range of vma) | 687 | * @vma: vma to cover (map full range of vma) |
698 | * @addr: vmalloc memory | 688 | * @addr: vmalloc memory |
699 | * @pgoff: number of pages into addr before first page to map | 689 | * @pgoff: number of pages into addr before first page to map |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 5d4c4d0225..eca70310ad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
20 | #include <linux/init.h> | 20 | #include <linux/init.h> |
21 | #include <linux/highmem.h> | 21 | #include <linux/highmem.h> |
22 | #include <linux/vmstat.h> | ||
22 | #include <linux/file.h> | 23 | #include <linux/file.h> |
23 | #include <linux/writeback.h> | 24 | #include <linux/writeback.h> |
24 | #include <linux/blkdev.h> | 25 | #include <linux/blkdev.h> |
@@ -62,6 +63,8 @@ struct scan_control { | |||
62 | int swap_cluster_max; | 63 | int swap_cluster_max; |
63 | 64 | ||
64 | int swappiness; | 65 | int swappiness; |
66 | |||
67 | int all_unreclaimable; | ||
65 | }; | 68 | }; |
66 | 69 | ||
67 | /* | 70 | /* |
@@ -368,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
368 | /* synchronous write or broken a_ops? */ | 371 | /* synchronous write or broken a_ops? */ |
369 | ClearPageReclaim(page); | 372 | ClearPageReclaim(page); |
370 | } | 373 | } |
371 | 374 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | |
372 | return PAGE_SUCCESS; | 375 | return PAGE_SUCCESS; |
373 | } | 376 | } |
374 | 377 | ||
@@ -377,15 +380,34 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
377 | 380 | ||
378 | int remove_mapping(struct address_space *mapping, struct page *page) | 381 | int remove_mapping(struct address_space *mapping, struct page *page) |
379 | { | 382 | { |
380 | if (!mapping) | 383 | BUG_ON(!PageLocked(page)); |
381 | return 0; /* truncate got there first */ | 384 | BUG_ON(mapping != page_mapping(page)); |
382 | 385 | ||
383 | write_lock_irq(&mapping->tree_lock); | 386 | write_lock_irq(&mapping->tree_lock); |
384 | |||
385 | /* | 387 | /* |
386 | * The non-racy check for busy page. It is critical to check | 388 | * The non-racy check for a busy page. |
387 | * PageDirty _after_ making sure that the page is freeable and | 389 | * |
388 | * not in use by anybody. (pagecache + us == 2) | 390 | * Must be careful with the order of the tests. When someone has |
391 | * a ref to the page, it may be possible that they dirty it then | ||
392 | * drop the reference. So if PageDirty is tested before page_count | ||
393 | * here, then the following race may occur: | ||
394 | * | ||
395 | * get_user_pages(&page); | ||
396 | * [user mapping goes away] | ||
397 | * write_to(page); | ||
398 | * !PageDirty(page) [good] | ||
399 | * SetPageDirty(page); | ||
400 | * put_page(page); | ||
401 | * !page_count(page) [good, discard it] | ||
402 | * | ||
403 | * [oops, our write_to data is lost] | ||
404 | * | ||
405 | * Reversing the order of the tests ensures such a situation cannot | ||
406 | * escape unnoticed. The smp_rmb is needed to ensure the page->flags | ||
407 | * load is not satisfied before that of page->_count. | ||
408 | * | ||
409 | * Note that if SetPageDirty is always performed via set_page_dirty, | ||
410 | * and thus under tree_lock, then this ordering is not required. | ||
389 | */ | 411 | */ |
390 | if (unlikely(page_count(page) != 2)) | 412 | if (unlikely(page_count(page) != 2)) |
391 | goto cannot_free; | 413 | goto cannot_free; |
@@ -440,7 +462,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
440 | if (TestSetPageLocked(page)) | 462 | if (TestSetPageLocked(page)) |
441 | goto keep; | 463 | goto keep; |
442 | 464 | ||
443 | BUG_ON(PageActive(page)); | 465 | VM_BUG_ON(PageActive(page)); |
444 | 466 | ||
445 | sc->nr_scanned++; | 467 | sc->nr_scanned++; |
446 | 468 | ||
@@ -547,7 +569,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
547 | goto free_it; | 569 | goto free_it; |
548 | } | 570 | } |
549 | 571 | ||
550 | if (!remove_mapping(mapping, page)) | 572 | if (!mapping || !remove_mapping(mapping, page)) |
551 | goto keep_locked; | 573 | goto keep_locked; |
552 | 574 | ||
553 | free_it: | 575 | free_it: |
@@ -564,7 +586,7 @@ keep_locked: | |||
564 | unlock_page(page); | 586 | unlock_page(page); |
565 | keep: | 587 | keep: |
566 | list_add(&page->lru, &ret_pages); | 588 | list_add(&page->lru, &ret_pages); |
567 | BUG_ON(PageLRU(page)); | 589 | VM_BUG_ON(PageLRU(page)); |
568 | } | 590 | } |
569 | list_splice(&ret_pages, page_list); | 591 | list_splice(&ret_pages, page_list); |
570 | if (pagevec_count(&freed_pvec)) | 592 | if (pagevec_count(&freed_pvec)) |
@@ -603,7 +625,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
603 | page = lru_to_page(src); | 625 | page = lru_to_page(src); |
604 | prefetchw_prev_lru_page(page, src, flags); | 626 | prefetchw_prev_lru_page(page, src, flags); |
605 | 627 | ||
606 | BUG_ON(!PageLRU(page)); | 628 | VM_BUG_ON(!PageLRU(page)); |
607 | 629 | ||
608 | list_del(&page->lru); | 630 | list_del(&page->lru); |
609 | target = src; | 631 | target = src; |
@@ -674,7 +696,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
674 | */ | 696 | */ |
675 | while (!list_empty(&page_list)) { | 697 | while (!list_empty(&page_list)) { |
676 | page = lru_to_page(&page_list); | 698 | page = lru_to_page(&page_list); |
677 | BUG_ON(PageLRU(page)); | 699 | VM_BUG_ON(PageLRU(page)); |
678 | SetPageLRU(page); | 700 | SetPageLRU(page); |
679 | list_del(&page->lru); | 701 | list_del(&page->lru); |
680 | if (PageActive(page)) | 702 | if (PageActive(page)) |
@@ -695,6 +717,11 @@ done: | |||
695 | return nr_reclaimed; | 717 | return nr_reclaimed; |
696 | } | 718 | } |
697 | 719 | ||
720 | static inline int zone_is_near_oom(struct zone *zone) | ||
721 | { | ||
722 | return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3; | ||
723 | } | ||
724 | |||
698 | /* | 725 | /* |
699 | * This moves pages from the active list to the inactive list. | 726 | * This moves pages from the active list to the inactive list. |
700 | * | 727 | * |
@@ -730,6 +757,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
730 | long distress; | 757 | long distress; |
731 | long swap_tendency; | 758 | long swap_tendency; |
732 | 759 | ||
760 | if (zone_is_near_oom(zone)) | ||
761 | goto force_reclaim_mapped; | ||
762 | |||
733 | /* | 763 | /* |
734 | * `distress' is a measure of how much trouble we're having | 764 | * `distress' is a measure of how much trouble we're having |
735 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. | 765 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. |
@@ -765,6 +795,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
765 | * memory onto the inactive list. | 795 | * memory onto the inactive list. |
766 | */ | 796 | */ |
767 | if (swap_tendency >= 100) | 797 | if (swap_tendency >= 100) |
798 | force_reclaim_mapped: | ||
768 | reclaim_mapped = 1; | 799 | reclaim_mapped = 1; |
769 | } | 800 | } |
770 | 801 | ||
@@ -797,9 +828,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
797 | while (!list_empty(&l_inactive)) { | 828 | while (!list_empty(&l_inactive)) { |
798 | page = lru_to_page(&l_inactive); | 829 | page = lru_to_page(&l_inactive); |
799 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 830 | prefetchw_prev_lru_page(page, &l_inactive, flags); |
800 | BUG_ON(PageLRU(page)); | 831 | VM_BUG_ON(PageLRU(page)); |
801 | SetPageLRU(page); | 832 | SetPageLRU(page); |
802 | BUG_ON(!PageActive(page)); | 833 | VM_BUG_ON(!PageActive(page)); |
803 | ClearPageActive(page); | 834 | ClearPageActive(page); |
804 | 835 | ||
805 | list_move(&page->lru, &zone->inactive_list); | 836 | list_move(&page->lru, &zone->inactive_list); |
@@ -827,9 +858,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
827 | while (!list_empty(&l_active)) { | 858 | while (!list_empty(&l_active)) { |
828 | page = lru_to_page(&l_active); | 859 | page = lru_to_page(&l_active); |
829 | prefetchw_prev_lru_page(page, &l_active, flags); | 860 | prefetchw_prev_lru_page(page, &l_active, flags); |
830 | BUG_ON(PageLRU(page)); | 861 | VM_BUG_ON(PageLRU(page)); |
831 | SetPageLRU(page); | 862 | SetPageLRU(page); |
832 | BUG_ON(!PageActive(page)); | 863 | VM_BUG_ON(!PageActive(page)); |
833 | list_move(&page->lru, &zone->active_list); | 864 | list_move(&page->lru, &zone->active_list); |
834 | pgmoved++; | 865 | pgmoved++; |
835 | if (!pagevec_add(&pvec, page)) { | 866 | if (!pagevec_add(&pvec, page)) { |
@@ -925,6 +956,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
925 | unsigned long nr_reclaimed = 0; | 956 | unsigned long nr_reclaimed = 0; |
926 | int i; | 957 | int i; |
927 | 958 | ||
959 | sc->all_unreclaimable = 1; | ||
928 | for (i = 0; zones[i] != NULL; i++) { | 960 | for (i = 0; zones[i] != NULL; i++) { |
929 | struct zone *zone = zones[i]; | 961 | struct zone *zone = zones[i]; |
930 | 962 | ||
@@ -941,6 +973,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
941 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 973 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
942 | continue; /* Let kswapd poll it */ | 974 | continue; /* Let kswapd poll it */ |
943 | 975 | ||
976 | sc->all_unreclaimable = 0; | ||
977 | |||
944 | nr_reclaimed += shrink_zone(priority, zone, sc); | 978 | nr_reclaimed += shrink_zone(priority, zone, sc); |
945 | } | 979 | } |
946 | return nr_reclaimed; | 980 | return nr_reclaimed; |
@@ -1021,6 +1055,9 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
1021 | if (sc.nr_scanned && priority < DEF_PRIORITY - 2) | 1055 | if (sc.nr_scanned && priority < DEF_PRIORITY - 2) |
1022 | blk_congestion_wait(WRITE, HZ/10); | 1056 | blk_congestion_wait(WRITE, HZ/10); |
1023 | } | 1057 | } |
1058 | /* top priority shrink_caches still had more to do? don't OOM, then */ | ||
1059 | if (!sc.all_unreclaimable) | ||
1060 | ret = 1; | ||
1024 | out: | 1061 | out: |
1025 | for (i = 0; zones[i] != 0; i++) { | 1062 | for (i = 0; zones[i] != 0; i++) { |
1026 | struct zone *zone = zones[i]; | 1063 | struct zone *zone = zones[i]; |
@@ -1153,7 +1190,7 @@ scan: | |||
1153 | if (zone->all_unreclaimable) | 1190 | if (zone->all_unreclaimable) |
1154 | continue; | 1191 | continue; |
1155 | if (nr_slab == 0 && zone->pages_scanned >= | 1192 | if (nr_slab == 0 && zone->pages_scanned >= |
1156 | (zone->nr_active + zone->nr_inactive) * 4) | 1193 | (zone->nr_active + zone->nr_inactive) * 6) |
1157 | zone->all_unreclaimable = 1; | 1194 | zone->all_unreclaimable = 1; |
1158 | /* | 1195 | /* |
1159 | * If we've done a decent amount of scanning and | 1196 | * If we've done a decent amount of scanning and |
@@ -1361,7 +1398,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1361 | for_each_zone(zone) | 1398 | for_each_zone(zone) |
1362 | lru_pages += zone->nr_active + zone->nr_inactive; | 1399 | lru_pages += zone->nr_active + zone->nr_inactive; |
1363 | 1400 | ||
1364 | nr_slab = global_page_state(NR_SLAB); | 1401 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); |
1365 | /* If slab caches are huge, it's better to hit them first */ | 1402 | /* If slab caches are huge, it's better to hit them first */ |
1366 | while (nr_slab >= lru_pages) { | 1403 | while (nr_slab >= lru_pages) { |
1367 | reclaim_state.reclaimed_slab = 0; | 1404 | reclaim_state.reclaimed_slab = 0; |
@@ -1510,7 +1547,6 @@ int zone_reclaim_mode __read_mostly; | |||
1510 | #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ | 1547 | #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ |
1511 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ | 1548 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ |
1512 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ | 1549 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ |
1513 | #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ | ||
1514 | 1550 | ||
1515 | /* | 1551 | /* |
1516 | * Priority for ZONE_RECLAIM. This determines the fraction of pages | 1552 | * Priority for ZONE_RECLAIM. This determines the fraction of pages |
@@ -1526,6 +1562,12 @@ int zone_reclaim_mode __read_mostly; | |||
1526 | int sysctl_min_unmapped_ratio = 1; | 1562 | int sysctl_min_unmapped_ratio = 1; |
1527 | 1563 | ||
1528 | /* | 1564 | /* |
1565 | * If the number of slab pages in a zone grows beyond this percentage then | ||
1566 | * slab reclaim needs to occur. | ||
1567 | */ | ||
1568 | int sysctl_min_slab_ratio = 5; | ||
1569 | |||
1570 | /* | ||
1529 | * Try to free up some pages from this zone through reclaim. | 1571 | * Try to free up some pages from this zone through reclaim. |
1530 | */ | 1572 | */ |
1531 | static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 1573 | static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
@@ -1544,6 +1586,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1544 | .gfp_mask = gfp_mask, | 1586 | .gfp_mask = gfp_mask, |
1545 | .swappiness = vm_swappiness, | 1587 | .swappiness = vm_swappiness, |
1546 | }; | 1588 | }; |
1589 | unsigned long slab_reclaimable; | ||
1547 | 1590 | ||
1548 | disable_swap_token(); | 1591 | disable_swap_token(); |
1549 | cond_resched(); | 1592 | cond_resched(); |
@@ -1556,29 +1599,43 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1556 | reclaim_state.reclaimed_slab = 0; | 1599 | reclaim_state.reclaimed_slab = 0; |
1557 | p->reclaim_state = &reclaim_state; | 1600 | p->reclaim_state = &reclaim_state; |
1558 | 1601 | ||
1559 | /* | 1602 | if (zone_page_state(zone, NR_FILE_PAGES) - |
1560 | * Free memory by calling shrink zone with increasing priorities | 1603 | zone_page_state(zone, NR_FILE_MAPPED) > |
1561 | * until we have enough memory freed. | 1604 | zone->min_unmapped_pages) { |
1562 | */ | 1605 | /* |
1563 | priority = ZONE_RECLAIM_PRIORITY; | 1606 | * Free memory by calling shrink zone with increasing |
1564 | do { | 1607 | * priorities until we have enough memory freed. |
1565 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 1608 | */ |
1566 | priority--; | 1609 | priority = ZONE_RECLAIM_PRIORITY; |
1567 | } while (priority >= 0 && nr_reclaimed < nr_pages); | 1610 | do { |
1611 | nr_reclaimed += shrink_zone(priority, zone, &sc); | ||
1612 | priority--; | ||
1613 | } while (priority >= 0 && nr_reclaimed < nr_pages); | ||
1614 | } | ||
1568 | 1615 | ||
1569 | if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { | 1616 | slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); |
1617 | if (slab_reclaimable > zone->min_slab_pages) { | ||
1570 | /* | 1618 | /* |
1571 | * shrink_slab() does not currently allow us to determine how | 1619 | * shrink_slab() does not currently allow us to determine how |
1572 | * many pages were freed in this zone. So we just shake the slab | 1620 | * many pages were freed in this zone. So we take the current |
1573 | * a bit and then go off node for this particular allocation | 1621 | * number of slab pages and shake the slab until it is reduced |
1574 | * despite possibly having freed enough memory to allocate in | 1622 | * by the same nr_pages that we used for reclaiming unmapped |
1575 | * this zone. If we freed local memory then the next | 1623 | * pages. |
1576 | * allocations will be local again. | ||
1577 | * | 1624 | * |
1578 | * shrink_slab will free memory on all zones and may take | 1625 | * Note that shrink_slab will free memory on all zones and may |
1579 | * a long time. | 1626 | * take a long time. |
1627 | */ | ||
1628 | while (shrink_slab(sc.nr_scanned, gfp_mask, order) && | ||
1629 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) > | ||
1630 | slab_reclaimable - nr_pages) | ||
1631 | ; | ||
1632 | |||
1633 | /* | ||
1634 | * Update nr_reclaimed by the number of slab pages we | ||
1635 | * reclaimed from this zone. | ||
1580 | */ | 1636 | */ |
1581 | shrink_slab(sc.nr_scanned, gfp_mask, order); | 1637 | nr_reclaimed += slab_reclaimable - |
1638 | zone_page_state(zone, NR_SLAB_RECLAIMABLE); | ||
1582 | } | 1639 | } |
1583 | 1640 | ||
1584 | p->reclaim_state = NULL; | 1641 | p->reclaim_state = NULL; |
@@ -1592,7 +1649,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1592 | int node_id; | 1649 | int node_id; |
1593 | 1650 | ||
1594 | /* | 1651 | /* |
1595 | * Zone reclaim reclaims unmapped file backed pages. | 1652 | * Zone reclaim reclaims unmapped file backed pages and |
1653 | * slab pages if we are over the defined limits. | ||
1596 | * | 1654 | * |
1597 | * A small portion of unmapped file backed pages is needed for | 1655 | * A small portion of unmapped file backed pages is needed for |
1598 | * file I/O otherwise pages read by file I/O will be immediately | 1656 | * file I/O otherwise pages read by file I/O will be immediately |
@@ -1601,7 +1659,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1601 | * unmapped file backed pages. | 1659 | * unmapped file backed pages. |
1602 | */ | 1660 | */ |
1603 | if (zone_page_state(zone, NR_FILE_PAGES) - | 1661 | if (zone_page_state(zone, NR_FILE_PAGES) - |
1604 | zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio) | 1662 | zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages |
1663 | && zone_page_state(zone, NR_SLAB_RECLAIMABLE) | ||
1664 | <= zone->min_slab_pages) | ||
1605 | return 0; | 1665 | return 0; |
1606 | 1666 | ||
1607 | /* | 1667 | /* |
@@ -1621,7 +1681,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1621 | * over remote processors and spread off node memory allocations | 1681 | * over remote processors and spread off node memory allocations |
1622 | * as wide as possible. | 1682 | * as wide as possible. |
1623 | */ | 1683 | */ |
1624 | node_id = zone->zone_pgdat->node_id; | 1684 | node_id = zone_to_nid(zone); |
1625 | mask = node_to_cpumask(node_id); | 1685 | mask = node_to_cpumask(node_id); |
1626 | if (!cpus_empty(mask) && node_id != numa_node_id()) | 1686 | if (!cpus_empty(mask) && node_id != numa_node_id()) |
1627 | return 0; | 1687 | return 0; |
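Taken together, the mm/vmscan.c hunks above rework __zone_reclaim() so that pagecache is scanned only when the zone's unmapped file pages exceed min_unmapped_pages, and slab is shaken only when reclaimable slab exceeds the new min_slab_pages limit; the old RECLAIM_SLAB bit (one shrink_slab() call, then go off-node regardless) is removed. The following is a self-contained model of the new slab loop; slab_pages() and shrink_slab_once() are invented stand-ins for zone_page_state(zone, NR_SLAB_RECLAIMABLE) and shrink_slab(), and the numbers are arbitrary.

/*
 * Minimal model of the slab-shrinking loop; not the kernel code.
 */
#include <stdio.h>

static unsigned long slab = 1000;   /* reclaimable slab pages in the zone */

static unsigned long slab_pages(void)
{
    return slab;
}

/* Pretend each pass frees a few pages; return 0 once nothing more goes. */
static int shrink_slab_once(void)
{
    if (slab < 10)
        return 0;
    slab -= 10;
    return 1;
}

int main(void)
{
    unsigned long nr_pages = 32;            /* reclaim target */
    unsigned long before = slab_pages();
    unsigned long nr_reclaimed = 0;

    /* Shake the slab until it has shrunk by nr_pages or the shrinkers
     * give up -- this replaces the old "shake it once and allocate
     * off-node anyway" behaviour. */
    while (shrink_slab_once() && slab_pages() > before - nr_pages)
        ;

    /* Credit the zone with whatever actually went away. */
    nr_reclaimed += before - slab_pages();
    printf("slab pages reclaimed: %lu\n", nr_reclaimed);
    return 0;
}

The accounting keys off the observed drop in NR_SLAB_RECLAIMABLE rather than shrink_slab()'s return value because, as the comment in the hunk notes, shrink_slab() cannot report how many of the freed pages belonged to this particular zone.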
diff --git a/mm/vmstat.c b/mm/vmstat.c index c1b5f4106b..a2b6a9f96e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -321,6 +321,9 @@ void refresh_cpu_vm_stats(int cpu) | |||
321 | for_each_zone(zone) { | 321 | for_each_zone(zone) { |
322 | struct per_cpu_pageset *pcp; | 322 | struct per_cpu_pageset *pcp; |
323 | 323 | ||
324 | if (!populated_zone(zone)) | ||
325 | continue; | ||
326 | |||
324 | pcp = zone_pcp(zone, cpu); | 327 | pcp = zone_pcp(zone, cpu); |
325 | 328 | ||
326 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 329 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
@@ -368,7 +371,7 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z) | |||
368 | __inc_zone_state(z, NUMA_MISS); | 371 | __inc_zone_state(z, NUMA_MISS); |
369 | __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); | 372 | __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); |
370 | } | 373 | } |
371 | if (z->zone_pgdat == NODE_DATA(numa_node_id())) | 374 | if (z->node == numa_node_id()) |
372 | __inc_zone_state(z, NUMA_LOCAL); | 375 | __inc_zone_state(z, NUMA_LOCAL); |
373 | else | 376 | else |
374 | __inc_zone_state(z, NUMA_OTHER); | 377 | __inc_zone_state(z, NUMA_OTHER); |
@@ -435,17 +438,34 @@ struct seq_operations fragmentation_op = { | |||
435 | .show = frag_show, | 438 | .show = frag_show, |
436 | }; | 439 | }; |
437 | 440 | ||
441 | #ifdef CONFIG_ZONE_DMA32 | ||
442 | #define TEXT_FOR_DMA32(xx) xx "_dma32", | ||
443 | #else | ||
444 | #define TEXT_FOR_DMA32(xx) | ||
445 | #endif | ||
446 | |||
447 | #ifdef CONFIG_HIGHMEM | ||
448 | #define TEXT_FOR_HIGHMEM(xx) xx "_high", | ||
449 | #else | ||
450 | #define TEXT_FOR_HIGHMEM(xx) | ||
451 | #endif | ||
452 | |||
453 | #define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \ | ||
454 | TEXT_FOR_HIGHMEM(xx) | ||
455 | |||
438 | static char *vmstat_text[] = { | 456 | static char *vmstat_text[] = { |
439 | /* Zoned VM counters */ | 457 | /* Zoned VM counters */ |
440 | "nr_anon_pages", | 458 | "nr_anon_pages", |
441 | "nr_mapped", | 459 | "nr_mapped", |
442 | "nr_file_pages", | 460 | "nr_file_pages", |
443 | "nr_slab", | 461 | "nr_slab_reclaimable", |
462 | "nr_slab_unreclaimable", | ||
444 | "nr_page_table_pages", | 463 | "nr_page_table_pages", |
445 | "nr_dirty", | 464 | "nr_dirty", |
446 | "nr_writeback", | 465 | "nr_writeback", |
447 | "nr_unstable", | 466 | "nr_unstable", |
448 | "nr_bounce", | 467 | "nr_bounce", |
468 | "nr_vmscan_write", | ||
449 | 469 | ||
450 | #ifdef CONFIG_NUMA | 470 | #ifdef CONFIG_NUMA |
451 | "numa_hit", | 471 | "numa_hit", |
@@ -462,10 +482,7 @@ static char *vmstat_text[] = { | |||
462 | "pswpin", | 482 | "pswpin", |
463 | "pswpout", | 483 | "pswpout", |
464 | 484 | ||
465 | "pgalloc_dma", | 485 | TEXTS_FOR_ZONES("pgalloc") |
466 | "pgalloc_dma32", | ||
467 | "pgalloc_normal", | ||
468 | "pgalloc_high", | ||
469 | 486 | ||
470 | "pgfree", | 487 | "pgfree", |
471 | "pgactivate", | 488 | "pgactivate", |
@@ -474,25 +491,10 @@ static char *vmstat_text[] = { | |||
474 | "pgfault", | 491 | "pgfault", |
475 | "pgmajfault", | 492 | "pgmajfault", |
476 | 493 | ||
477 | "pgrefill_dma", | 494 | TEXTS_FOR_ZONES("pgrefill") |
478 | "pgrefill_dma32", | 495 | TEXTS_FOR_ZONES("pgsteal") |
479 | "pgrefill_normal", | 496 | TEXTS_FOR_ZONES("pgscan_kswapd") |
480 | "pgrefill_high", | 497 | TEXTS_FOR_ZONES("pgscan_direct") |
481 | |||
482 | "pgsteal_dma", | ||
483 | "pgsteal_dma32", | ||
484 | "pgsteal_normal", | ||
485 | "pgsteal_high", | ||
486 | |||
487 | "pgscan_kswapd_dma", | ||
488 | "pgscan_kswapd_dma32", | ||
489 | "pgscan_kswapd_normal", | ||
490 | "pgscan_kswapd_high", | ||
491 | |||
492 | "pgscan_direct_dma", | ||
493 | "pgscan_direct_dma32", | ||
494 | "pgscan_direct_normal", | ||
495 | "pgscan_direct_high", | ||
496 | 498 | ||
497 | "pginodesteal", | 499 | "pginodesteal", |
498 | "slabs_scanned", | 500 | "slabs_scanned", |