Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig               5
-rw-r--r--  mm/Makefile              2
-rw-r--r--  mm/allocpercpu.c        24
-rw-r--r--  mm/bootmem.c           935
-rw-r--r--  mm/filemap.c           389
-rw-r--r--  mm/hugetlb.c          1630
-rw-r--r--  mm/internal.h           61
-rw-r--r--  mm/memcontrol.c        364
-rw-r--r--  mm/memory.c            246
-rw-r--r--  mm/memory_hotplug.c     80
-rw-r--r--  mm/mempolicy.c           9
-rw-r--r--  mm/migrate.c            49
-rw-r--r--  mm/mm_init.c           152
-rw-r--r--  mm/mmap.c               12
-rw-r--r--  mm/mprotect.c            6
-rw-r--r--  mm/nommu.c               4
-rw-r--r--  mm/page-writeback.c     12
-rw-r--r--  mm/page_alloc.c        152
-rw-r--r--  mm/pdflush.c             4
-rw-r--r--  mm/readahead.c           6
-rw-r--r--  mm/rmap.c               16
-rw-r--r--  mm/shmem.c              99
-rw-r--r--  mm/slab.c               11
-rw-r--r--  mm/slob.c               19
-rw-r--r--  mm/slub.c               78
-rw-r--r--  mm/sparse.c            117
-rw-r--r--  mm/swap.c                8
-rw-r--r--  mm/swap_state.c         30
-rw-r--r--  mm/swapfile.c           59
-rw-r--r--  mm/truncate.c            6
-rw-r--r--  mm/util.c               11
-rw-r--r--  mm/vmalloc.c            26
-rw-r--r--  mm/vmscan.c             85
-rw-r--r--  mm/vmstat.c              3
34 files changed, 3191 insertions(+), 1519 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index c4de85285bb4..efee5d379df4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -77,6 +77,9 @@ config FLAT_NODE_MEM_MAP
77 def_bool y 77 def_bool y
78 depends on !SPARSEMEM 78 depends on !SPARSEMEM
79 79
80config HAVE_GET_USER_PAGES_FAST
81 bool
82
80# 83#
81# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's 84# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
82# to represent different areas of memory. This variable allows 85# to represent different areas of memory. This variable allows
@@ -174,7 +177,7 @@ config SPLIT_PTLOCK_CPUS
174config MIGRATION 177config MIGRATION
175 bool "Page migration" 178 bool "Page migration"
176 def_bool y 179 def_bool y
177 depends on NUMA 180 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
178 help 181 help
179 Allows the migration of the physical location of pages of processes 182 Allows the migration of the physical location of pages of processes
180 while the virtual addresses are not changed. This is useful for 183 while the virtual addresses are not changed. This is useful for
diff --git a/mm/Makefile b/mm/Makefile
index 18c143b3c46c..06ca2381fef1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o pdflush.o \ 11 maccess.o page_alloc.o page-writeback.o pdflush.o \
12 readahead.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o $(mmu-y) 14 page_isolation.o mm_init.o $(mmu-y)
15 15
16obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o 16obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
17obj-$(CONFIG_BOUNCE) += bounce.o 17obj-$(CONFIG_BOUNCE) += bounce.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 05f2b4009ccc..4297bc41bfd2 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -18,27 +18,28 @@
18 * Depopulating per-cpu data for a cpu going offline would be a typical 18 * Depopulating per-cpu data for a cpu going offline would be a typical
19 * use case. You need to register a cpu hotplug handler for that purpose. 19 * use case. You need to register a cpu hotplug handler for that purpose.
20 */ 20 */
21void percpu_depopulate(void *__pdata, int cpu) 21static void percpu_depopulate(void *__pdata, int cpu)
22{ 22{
23 struct percpu_data *pdata = __percpu_disguise(__pdata); 23 struct percpu_data *pdata = __percpu_disguise(__pdata);
24 24
25 kfree(pdata->ptrs[cpu]); 25 kfree(pdata->ptrs[cpu]);
26 pdata->ptrs[cpu] = NULL; 26 pdata->ptrs[cpu] = NULL;
27} 27}
28EXPORT_SYMBOL_GPL(percpu_depopulate);
29 28
30/** 29/**
31 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's 30 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
32 * @__pdata: per-cpu data to depopulate 31 * @__pdata: per-cpu data to depopulate
33 * @mask: depopulate per-cpu data for cpu's selected through mask bits 32 * @mask: depopulate per-cpu data for cpu's selected through mask bits
34 */ 33 */
35void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) 34static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
36{ 35{
37 int cpu; 36 int cpu;
38 for_each_cpu_mask(cpu, *mask) 37 for_each_cpu_mask_nr(cpu, *mask)
39 percpu_depopulate(__pdata, cpu); 38 percpu_depopulate(__pdata, cpu);
40} 39}
41EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); 40
41#define percpu_depopulate_mask(__pdata, mask) \
42 __percpu_depopulate_mask((__pdata), &(mask))
42 43
43/** 44/**
44 * percpu_populate - populate per-cpu data for given cpu 45 * percpu_populate - populate per-cpu data for given cpu
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
51 * use case. You need to register a cpu hotplug handler for that purpose. 52 * use case. You need to register a cpu hotplug handler for that purpose.
52 * Per-cpu object is populated with zeroed buffer. 53 * Per-cpu object is populated with zeroed buffer.
53 */ 54 */
54void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) 55static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
55{ 56{
56 struct percpu_data *pdata = __percpu_disguise(__pdata); 57 struct percpu_data *pdata = __percpu_disguise(__pdata);
57 int node = cpu_to_node(cpu); 58 int node = cpu_to_node(cpu);
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
68 pdata->ptrs[cpu] = kzalloc(size, gfp); 69 pdata->ptrs[cpu] = kzalloc(size, gfp);
69 return pdata->ptrs[cpu]; 70 return pdata->ptrs[cpu];
70} 71}
71EXPORT_SYMBOL_GPL(percpu_populate);
72 72
73/** 73/**
74 * percpu_populate_mask - populate per-cpu data for more cpu's 74 * percpu_populate_mask - populate per-cpu data for more cpu's
@@ -79,14 +79,14 @@ EXPORT_SYMBOL_GPL(percpu_populate);
79 * 79 *
80 * Per-cpu objects are populated with zeroed buffers. 80 * Per-cpu objects are populated with zeroed buffers.
81 */ 81 */
82int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, 82static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
83 cpumask_t *mask) 83 cpumask_t *mask)
84{ 84{
85 cpumask_t populated; 85 cpumask_t populated;
86 int cpu; 86 int cpu;
87 87
88 cpus_clear(populated); 88 cpus_clear(populated);
89 for_each_cpu_mask(cpu, *mask) 89 for_each_cpu_mask_nr(cpu, *mask)
90 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { 90 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
91 __percpu_depopulate_mask(__pdata, &populated); 91 __percpu_depopulate_mask(__pdata, &populated);
92 return -ENOMEM; 92 return -ENOMEM;
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
94 cpu_set(cpu, populated); 94 cpu_set(cpu, populated);
95 return 0; 95 return 0;
96} 96}
97EXPORT_SYMBOL_GPL(__percpu_populate_mask); 97
98#define percpu_populate_mask(__pdata, size, gfp, mask) \
99 __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
98 100
99/** 101/**
100 * percpu_alloc_mask - initial setup of per-cpu data 102 * percpu_alloc_mask - initial setup of per-cpu data
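The allocpercpu.c hunk above makes the single-cpu populate/depopulate helpers static and keeps the existing rollback idiom: __percpu_populate_mask() allocates a zeroed buffer for each selected cpu and, if any allocation fails, depopulates the cpus it already handled before returning -ENOMEM. Below is a rough userspace sketch of that populate-with-rollback pattern only; the names, the fixed NCPUS and the use of plain calloc()/free() are illustrative stand-ins, not the kernel's cpumask and kzalloc() API.

/*
 * Illustration only: the populate/rollback pattern of
 * __percpu_populate_mask(), redone with hypothetical names on libc.
 */
#include <stdlib.h>
#include <errno.h>

#define NCPUS 4

static void *ptrs[NCPUS];

/* Undo the slots populated so far, mirroring __percpu_depopulate_mask(). */
static void depopulate_upto(int n)
{
	while (n-- > 0) {
		free(ptrs[n]);
		ptrs[n] = NULL;
	}
}

static int populate_all(size_t size)
{
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		ptrs[cpu] = calloc(1, size);	/* zeroed, like kzalloc() */
		if (!ptrs[cpu]) {
			depopulate_upto(cpu);	/* roll back partial work */
			return -ENOMEM;
		}
	}
	return 0;
}

int main(void)
{
	return populate_all(64) ? EXIT_FAILURE : EXIT_SUCCESS;
}

The kernel version builds up a local 'populated' cpumask as allocations succeed, so only the cpus that were actually populated get unwound on failure.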
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8d9f60e06f62..4af15d0340ad 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * linux/mm/bootmem.c 2 * bootmem - A boot-time physical memory allocator and configurator
3 * 3 *
4 * Copyright (C) 1999 Ingo Molnar 4 * Copyright (C) 1999 Ingo Molnar
5 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 5 * 1999 Kanoj Sarcar, SGI
6 * 2008 Johannes Weiner
6 * 7 *
7 * simple boot-time physical memory area allocator and 8 * Access to this subsystem has to be serialized externally (which is true
8 * free memory collector. It's used to deal with reserved 9 * for the boot process anyway).
9 * system memory and memory holes as well.
10 */ 10 */
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pfn.h> 12#include <linux/pfn.h>
@@ -19,15 +19,10 @@
19 19
20#include "internal.h" 20#include "internal.h"
21 21
22/*
23 * Access to this subsystem has to be serialized externally. (this is
24 * true for the boot process anyway)
25 */
26unsigned long max_low_pfn; 22unsigned long max_low_pfn;
27unsigned long min_low_pfn; 23unsigned long min_low_pfn;
28unsigned long max_pfn; 24unsigned long max_pfn;
29 25
30static LIST_HEAD(bdata_list);
31#ifdef CONFIG_CRASH_DUMP 26#ifdef CONFIG_CRASH_DUMP
32/* 27/*
33 * If we have booted due to a crash, max_pfn will be a very low value. We need 28 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -36,63 +31,72 @@ static LIST_HEAD(bdata_list);
36unsigned long saved_max_pfn; 31unsigned long saved_max_pfn;
37#endif 32#endif
38 33
39/* return the number of _pages_ that will be allocated for the boot bitmap */ 34bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
40unsigned long __init bootmem_bootmap_pages(unsigned long pages) 35
36static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
37
38static int bootmem_debug;
39
40static int __init bootmem_debug_setup(char *buf)
41{ 41{
42 unsigned long mapsize; 42 bootmem_debug = 1;
43 return 0;
44}
45early_param("bootmem_debug", bootmem_debug_setup);
43 46
44 mapsize = (pages+7)/8; 47#define bdebug(fmt, args...) ({ \
45 mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; 48 if (unlikely(bootmem_debug)) \
46 mapsize >>= PAGE_SHIFT; 49 printk(KERN_INFO \
50 "bootmem::%s " fmt, \
51 __FUNCTION__, ## args); \
52})
47 53
48 return mapsize; 54static unsigned long __init bootmap_bytes(unsigned long pages)
55{
56 unsigned long bytes = (pages + 7) / 8;
57
58 return ALIGN(bytes, sizeof(long));
49} 59}
50 60
51/* 61/**
52 * link bdata in order 62 * bootmem_bootmap_pages - calculate bitmap size in pages
63 * @pages: number of pages the bitmap has to represent
53 */ 64 */
54static void __init link_bootmem(bootmem_data_t *bdata) 65unsigned long __init bootmem_bootmap_pages(unsigned long pages)
55{ 66{
56 bootmem_data_t *ent; 67 unsigned long bytes = bootmap_bytes(pages);
57 68
58 if (list_empty(&bdata_list)) { 69 return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
59 list_add(&bdata->list, &bdata_list);
60 return;
61 }
62 /* insert in order */
63 list_for_each_entry(ent, &bdata_list, list) {
64 if (bdata->node_boot_start < ent->node_boot_start) {
65 list_add_tail(&bdata->list, &ent->list);
66 return;
67 }
68 }
69 list_add_tail(&bdata->list, &bdata_list);
70} 70}
71 71
72/* 72/*
73 * Given an initialised bdata, it returns the size of the boot bitmap 73 * link bdata in order
74 */ 74 */
75static unsigned long __init get_mapsize(bootmem_data_t *bdata) 75static void __init link_bootmem(bootmem_data_t *bdata)
76{ 76{
77 unsigned long mapsize; 77 struct list_head *iter;
78 unsigned long start = PFN_DOWN(bdata->node_boot_start);
79 unsigned long end = bdata->node_low_pfn;
80 78
81 mapsize = ((end - start) + 7) / 8; 79 list_for_each(iter, &bdata_list) {
82 return ALIGN(mapsize, sizeof(long)); 80 bootmem_data_t *ent;
81
82 ent = list_entry(iter, bootmem_data_t, list);
83 if (bdata->node_min_pfn < ent->node_min_pfn)
84 break;
85 }
86 list_add_tail(&bdata->list, iter);
83} 87}
84 88
85/* 89/*
86 * Called once to set up the allocator itself. 90 * Called once to set up the allocator itself.
87 */ 91 */
88static unsigned long __init init_bootmem_core(pg_data_t *pgdat, 92static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
89 unsigned long mapstart, unsigned long start, unsigned long end) 93 unsigned long mapstart, unsigned long start, unsigned long end)
90{ 94{
91 bootmem_data_t *bdata = pgdat->bdata;
92 unsigned long mapsize; 95 unsigned long mapsize;
93 96
97 mminit_validate_memmodel_limits(&start, &end);
94 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); 98 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
95 bdata->node_boot_start = PFN_PHYS(start); 99 bdata->node_min_pfn = start;
96 bdata->node_low_pfn = end; 100 bdata->node_low_pfn = end;
97 link_bootmem(bdata); 101 link_bootmem(bdata);
98 102
@@ -100,429 +104,461 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
100 * Initially all pages are reserved - setup_arch() has to 104 * Initially all pages are reserved - setup_arch() has to
101 * register free RAM areas explicitly. 105 * register free RAM areas explicitly.
102 */ 106 */
103 mapsize = get_mapsize(bdata); 107 mapsize = bootmap_bytes(end - start);
104 memset(bdata->node_bootmem_map, 0xff, mapsize); 108 memset(bdata->node_bootmem_map, 0xff, mapsize);
105 109
110 bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
111 bdata - bootmem_node_data, start, mapstart, end, mapsize);
112
106 return mapsize; 113 return mapsize;
107} 114}
108 115
109/* 116/**
110 * Marks a particular physical memory range as unallocatable. Usable RAM 117 * init_bootmem_node - register a node as boot memory
111 * might be used for boot-time allocations - or it might get added 118 * @pgdat: node to register
112 * to the free page pool later on. 119 * @freepfn: pfn where the bitmap for this node is to be placed
120 * @startpfn: first pfn on the node
121 * @endpfn: first pfn after the node
122 *
123 * Returns the number of bytes needed to hold the bitmap for this node.
113 */ 124 */
114static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, 125unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
115 unsigned long addr, unsigned long size, int flags) 126 unsigned long startpfn, unsigned long endpfn)
116{ 127{
117 unsigned long sidx, eidx; 128 return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
118 unsigned long i; 129}
119 130
120 BUG_ON(!size); 131/**
132 * init_bootmem - register boot memory
133 * @start: pfn where the bitmap is to be placed
134 * @pages: number of available physical pages
135 *
136 * Returns the number of bytes needed to hold the bitmap.
137 */
138unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
139{
140 max_low_pfn = pages;
141 min_low_pfn = start;
142 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
143}
121 144
122 /* out of range, don't hold other */ 145static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
123 if (addr + size < bdata->node_boot_start || 146{
124 PFN_DOWN(addr) > bdata->node_low_pfn) 147 int aligned;
148 struct page *page;
149 unsigned long start, end, pages, count = 0;
150
151 if (!bdata->node_bootmem_map)
125 return 0; 152 return 0;
126 153
154 start = bdata->node_min_pfn;
155 end = bdata->node_low_pfn;
156
127 /* 157 /*
128 * Round up to index to the range. 158 * If the start is aligned to the machines wordsize, we might
159 * be able to free pages in bulks of that order.
129 */ 160 */
130 if (addr > bdata->node_boot_start) 161 aligned = !(start & (BITS_PER_LONG - 1));
131 sidx= PFN_DOWN(addr - bdata->node_boot_start);
132 else
133 sidx = 0;
134 162
135 eidx = PFN_UP(addr + size - bdata->node_boot_start); 163 bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
136 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 164 bdata - bootmem_node_data, start, end, aligned);
137 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
138 165
139 for (i = sidx; i < eidx; i++) { 166 while (start < end) {
140 if (test_bit(i, bdata->node_bootmem_map)) { 167 unsigned long *map, idx, vec;
141 if (flags & BOOTMEM_EXCLUSIVE)
142 return -EBUSY;
143 }
144 }
145 168
146 return 0; 169 map = bdata->node_bootmem_map;
170 idx = start - bdata->node_min_pfn;
171 vec = ~map[idx / BITS_PER_LONG];
147 172
148} 173 if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
174 int order = ilog2(BITS_PER_LONG);
149 175
150static void __init reserve_bootmem_core(bootmem_data_t *bdata, 176 __free_pages_bootmem(pfn_to_page(start), order);
151 unsigned long addr, unsigned long size, int flags) 177 count += BITS_PER_LONG;
152{ 178 } else {
153 unsigned long sidx, eidx; 179 unsigned long off = 0;
154 unsigned long i;
155
156 BUG_ON(!size);
157 180
158 /* out of range */ 181 while (vec && off < BITS_PER_LONG) {
159 if (addr + size < bdata->node_boot_start || 182 if (vec & 1) {
160 PFN_DOWN(addr) > bdata->node_low_pfn) 183 page = pfn_to_page(start + off);
161 return; 184 __free_pages_bootmem(page, 0);
185 count++;
186 }
187 vec >>= 1;
188 off++;
189 }
190 }
191 start += BITS_PER_LONG;
192 }
162 193
163 /* 194 page = virt_to_page(bdata->node_bootmem_map);
164 * Round up to index to the range. 195 pages = bdata->node_low_pfn - bdata->node_min_pfn;
165 */ 196 pages = bootmem_bootmap_pages(pages);
166 if (addr > bdata->node_boot_start) 197 count += pages;
167 sidx= PFN_DOWN(addr - bdata->node_boot_start); 198 while (pages--)
168 else 199 __free_pages_bootmem(page++, 0);
169 sidx = 0;
170 200
171 eidx = PFN_UP(addr + size - bdata->node_boot_start); 201 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
172 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
173 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
174 202
175 for (i = sidx; i < eidx; i++) { 203 return count;
176 if (test_and_set_bit(i, bdata->node_bootmem_map)) {
177#ifdef CONFIG_DEBUG_BOOTMEM
178 printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
179#endif
180 }
181 }
182} 204}
183 205
184static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, 206/**
185 unsigned long size) 207 * free_all_bootmem_node - release a node's free pages to the buddy allocator
208 * @pgdat: node to be released
209 *
210 * Returns the number of pages actually released.
211 */
212unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
186{ 213{
187 unsigned long sidx, eidx; 214 register_page_bootmem_info_node(pgdat);
188 unsigned long i; 215 return free_all_bootmem_core(pgdat->bdata);
189 216}
190 BUG_ON(!size);
191 217
192 /* out range */ 218/**
193 if (addr + size < bdata->node_boot_start || 219 * free_all_bootmem - release free pages to the buddy allocator
194 PFN_DOWN(addr) > bdata->node_low_pfn) 220 *
195 return; 221 * Returns the number of pages actually released.
196 /* 222 */
197 * round down end of usable mem, partially free pages are 223unsigned long __init free_all_bootmem(void)
198 * considered reserved. 224{
199 */ 225 return free_all_bootmem_core(NODE_DATA(0)->bdata);
226}
200 227
201 if (addr >= bdata->node_boot_start && addr < bdata->last_success) 228static void __init __free(bootmem_data_t *bdata,
202 bdata->last_success = addr; 229 unsigned long sidx, unsigned long eidx)
230{
231 unsigned long idx;
203 232
204 /* 233 bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
205 * Round up to index to the range. 234 sidx + bdata->node_min_pfn,
206 */ 235 eidx + bdata->node_min_pfn);
207 if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start))
208 sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
209 else
210 sidx = 0;
211 236
212 eidx = PFN_DOWN(addr + size - bdata->node_boot_start); 237 if (bdata->hint_idx > sidx)
213 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 238 bdata->hint_idx = sidx;
214 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
215 239
216 for (i = sidx; i < eidx; i++) { 240 for (idx = sidx; idx < eidx; idx++)
217 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) 241 if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
218 BUG(); 242 BUG();
219 }
220} 243}
221 244
222/* 245static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
223 * We 'merge' subsequent allocations to save space. We might 'lose' 246 unsigned long eidx, int flags)
224 * some fraction of a page if allocations cannot be satisfied due to
225 * size constraints on boxes where there is physical RAM space
226 * fragmentation - in these cases (mostly large memory boxes) this
227 * is not a problem.
228 *
229 * On low memory boxes we get it right in 100% of the cases.
230 *
231 * alignment has to be a power of 2 value.
232 *
233 * NOTE: This function is _not_ reentrant.
234 */
235void * __init
236__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
237 unsigned long align, unsigned long goal, unsigned long limit)
238{ 247{
239 unsigned long areasize, preferred; 248 unsigned long idx;
240 unsigned long i, start = 0, incr, eidx, end_pfn; 249 int exclusive = flags & BOOTMEM_EXCLUSIVE;
241 void *ret; 250
242 unsigned long node_boot_start; 251 bdebug("nid=%td start=%lx end=%lx flags=%x\n",
243 void *node_bootmem_map; 252 bdata - bootmem_node_data,
244 253 sidx + bdata->node_min_pfn,
245 if (!size) { 254 eidx + bdata->node_min_pfn,
246 printk("__alloc_bootmem_core(): zero-sized request\n"); 255 flags);
247 BUG(); 256
248 } 257 for (idx = sidx; idx < eidx; idx++)
249 BUG_ON(align & (align-1)); 258 if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
250 259 if (exclusive) {
251 /* on nodes without memory - bootmem_map is NULL */ 260 __free(bdata, sidx, idx);
252 if (!bdata->node_bootmem_map) 261 return -EBUSY;
253 return NULL; 262 }
263 bdebug("silent double reserve of PFN %lx\n",
264 idx + bdata->node_min_pfn);
265 }
266 return 0;
267}
254 268
255 /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */ 269static int __init mark_bootmem_node(bootmem_data_t *bdata,
256 node_boot_start = bdata->node_boot_start; 270 unsigned long start, unsigned long end,
257 node_bootmem_map = bdata->node_bootmem_map; 271 int reserve, int flags)
258 if (align) { 272{
259 node_boot_start = ALIGN(bdata->node_boot_start, align); 273 unsigned long sidx, eidx;
260 if (node_boot_start > bdata->node_boot_start)
261 node_bootmem_map = (unsigned long *)bdata->node_bootmem_map +
262 PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG;
263 }
264 274
265 if (limit && node_boot_start >= limit) 275 bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
266 return NULL; 276 bdata - bootmem_node_data, start, end, reserve, flags);
267 277
268 end_pfn = bdata->node_low_pfn; 278 BUG_ON(start < bdata->node_min_pfn);
269 limit = PFN_DOWN(limit); 279 BUG_ON(end > bdata->node_low_pfn);
270 if (limit && end_pfn > limit)
271 end_pfn = limit;
272 280
273 eidx = end_pfn - PFN_DOWN(node_boot_start); 281 sidx = start - bdata->node_min_pfn;
282 eidx = end - bdata->node_min_pfn;
274 283
275 /* 284 if (reserve)
276 * We try to allocate bootmem pages above 'goal' 285 return __reserve(bdata, sidx, eidx, flags);
277 * first, then we try to allocate lower pages. 286 else
278 */ 287 __free(bdata, sidx, eidx);
279 preferred = 0; 288 return 0;
280 if (goal && PFN_DOWN(goal) < end_pfn) { 289}
281 if (goal > node_boot_start)
282 preferred = goal - node_boot_start;
283
284 if (bdata->last_success > node_boot_start &&
285 bdata->last_success - node_boot_start >= preferred)
286 if (!limit || (limit && limit > bdata->last_success))
287 preferred = bdata->last_success - node_boot_start;
288 }
289 290
290 preferred = PFN_DOWN(ALIGN(preferred, align)); 291static int __init mark_bootmem(unsigned long start, unsigned long end,
291 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; 292 int reserve, int flags)
292 incr = align >> PAGE_SHIFT ? : 1; 293{
294 unsigned long pos;
295 bootmem_data_t *bdata;
293 296
294restart_scan: 297 pos = start;
295 for (i = preferred; i < eidx;) { 298 list_for_each_entry(bdata, &bdata_list, list) {
296 unsigned long j; 299 int err;
300 unsigned long max;
297 301
298 i = find_next_zero_bit(node_bootmem_map, eidx, i); 302 if (pos < bdata->node_min_pfn ||
299 i = ALIGN(i, incr); 303 pos >= bdata->node_low_pfn) {
300 if (i >= eidx) 304 BUG_ON(pos != start);
301 break;
302 if (test_bit(i, node_bootmem_map)) {
303 i += incr;
304 continue; 305 continue;
305 } 306 }
306 for (j = i + 1; j < i + areasize; ++j) {
307 if (j >= eidx)
308 goto fail_block;
309 if (test_bit(j, node_bootmem_map))
310 goto fail_block;
311 }
312 start = i;
313 goto found;
314 fail_block:
315 i = ALIGN(j, incr);
316 if (i == j)
317 i += incr;
318 }
319 307
320 if (preferred > 0) { 308 max = min(bdata->node_low_pfn, end);
321 preferred = 0;
322 goto restart_scan;
323 }
324 return NULL;
325 309
326found: 310 err = mark_bootmem_node(bdata, pos, max, reserve, flags);
327 bdata->last_success = PFN_PHYS(start) + node_boot_start; 311 if (reserve && err) {
328 BUG_ON(start >= eidx); 312 mark_bootmem(start, pos, 0, 0);
329 313 return err;
330 /*
331 * Is the next page of the previous allocation-end the start
332 * of this allocation's buffer? If yes then we can 'merge'
333 * the previous partial page with this allocation.
334 */
335 if (align < PAGE_SIZE &&
336 bdata->last_offset && bdata->last_pos+1 == start) {
337 unsigned long offset, remaining_size;
338 offset = ALIGN(bdata->last_offset, align);
339 BUG_ON(offset > PAGE_SIZE);
340 remaining_size = PAGE_SIZE - offset;
341 if (size < remaining_size) {
342 areasize = 0;
343 /* last_pos unchanged */
344 bdata->last_offset = offset + size;
345 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
346 offset + node_boot_start);
347 } else {
348 remaining_size = size - remaining_size;
349 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
350 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
351 offset + node_boot_start);
352 bdata->last_pos = start + areasize - 1;
353 bdata->last_offset = remaining_size;
354 } 314 }
355 bdata->last_offset &= ~PAGE_MASK;
356 } else {
357 bdata->last_pos = start + areasize - 1;
358 bdata->last_offset = size & ~PAGE_MASK;
359 ret = phys_to_virt(start * PAGE_SIZE + node_boot_start);
360 }
361 315
362 /* 316 if (max == end)
363 * Reserve the area now: 317 return 0;
364 */ 318 pos = bdata->node_low_pfn;
365 for (i = start; i < start + areasize; i++) 319 }
366 if (unlikely(test_and_set_bit(i, node_bootmem_map))) 320 BUG();
367 BUG();
368 memset(ret, 0, size);
369 return ret;
370} 321}
371 322
372static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) 323/**
324 * free_bootmem_node - mark a page range as usable
325 * @pgdat: node the range resides on
326 * @physaddr: starting address of the range
327 * @size: size of the range in bytes
328 *
329 * Partial pages will be considered reserved and left as they are.
330 *
331 * The range must reside completely on the specified node.
332 */
333void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
334 unsigned long size)
373{ 335{
374 struct page *page; 336 unsigned long start, end;
375 unsigned long pfn;
376 bootmem_data_t *bdata = pgdat->bdata;
377 unsigned long i, count, total = 0;
378 unsigned long idx;
379 unsigned long *map;
380 int gofast = 0;
381
382 BUG_ON(!bdata->node_bootmem_map);
383
384 count = 0;
385 /* first extant page of the node */
386 pfn = PFN_DOWN(bdata->node_boot_start);
387 idx = bdata->node_low_pfn - pfn;
388 map = bdata->node_bootmem_map;
389 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
390 if (bdata->node_boot_start == 0 ||
391 ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
392 gofast = 1;
393 for (i = 0; i < idx; ) {
394 unsigned long v = ~map[i / BITS_PER_LONG];
395
396 if (gofast && v == ~0UL) {
397 int order;
398
399 page = pfn_to_page(pfn);
400 count += BITS_PER_LONG;
401 order = ffs(BITS_PER_LONG) - 1;
402 __free_pages_bootmem(page, order);
403 i += BITS_PER_LONG;
404 page += BITS_PER_LONG;
405 } else if (v) {
406 unsigned long m;
407
408 page = pfn_to_page(pfn);
409 for (m = 1; m && i < idx; m<<=1, page++, i++) {
410 if (v & m) {
411 count++;
412 __free_pages_bootmem(page, 0);
413 }
414 }
415 } else {
416 i += BITS_PER_LONG;
417 }
418 pfn += BITS_PER_LONG;
419 }
420 total += count;
421 337
422 /* 338 start = PFN_UP(physaddr);
423 * Now free the allocator bitmap itself, it's not 339 end = PFN_DOWN(physaddr + size);
424 * needed anymore:
425 */
426 page = virt_to_page(bdata->node_bootmem_map);
427 count = 0;
428 idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
429 for (i = 0; i < idx; i++, page++) {
430 __free_pages_bootmem(page, 0);
431 count++;
432 }
433 total += count;
434 bdata->node_bootmem_map = NULL;
435 340
436 return total; 341 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
437} 342}
438 343
439unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, 344/**
440 unsigned long startpfn, unsigned long endpfn) 345 * free_bootmem - mark a page range as usable
441{ 346 * @addr: starting address of the range
442 return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); 347 * @size: size of the range in bytes
443} 348 *
444 349 * Partial pages will be considered reserved and left as they are.
445int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 350 *
446 unsigned long size, int flags) 351 * The range must be contiguous but may span node boundaries.
352 */
353void __init free_bootmem(unsigned long addr, unsigned long size)
447{ 354{
448 int ret; 355 unsigned long start, end;
449 356
450 ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); 357 start = PFN_UP(addr);
451 if (ret < 0) 358 end = PFN_DOWN(addr + size);
452 return -ENOMEM;
453 reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
454 359
455 return 0; 360 mark_bootmem(start, end, 0, 0);
456} 361}
457 362
458void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 363/**
459 unsigned long size) 364 * reserve_bootmem_node - mark a page range as reserved
365 * @pgdat: node the range resides on
366 * @physaddr: starting address of the range
367 * @size: size of the range in bytes
368 * @flags: reservation flags (see linux/bootmem.h)
369 *
370 * Partial pages will be reserved.
371 *
372 * The range must reside completely on the specified node.
373 */
374int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
375 unsigned long size, int flags)
460{ 376{
461 free_bootmem_core(pgdat->bdata, physaddr, size); 377 unsigned long start, end;
462}
463 378
464unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 379 start = PFN_DOWN(physaddr);
465{ 380 end = PFN_UP(physaddr + size);
466 register_page_bootmem_info_node(pgdat);
467 return free_all_bootmem_core(pgdat);
468}
469 381
470unsigned long __init init_bootmem(unsigned long start, unsigned long pages) 382 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
471{
472 max_low_pfn = pages;
473 min_low_pfn = start;
474 return init_bootmem_core(NODE_DATA(0), start, 0, pages);
475} 383}
476 384
477#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 385#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
386/**
387 * reserve_bootmem - mark a page range as usable
388 * @addr: starting address of the range
389 * @size: size of the range in bytes
390 * @flags: reservation flags (see linux/bootmem.h)
391 *
392 * Partial pages will be reserved.
393 *
394 * The range must be contiguous but may span node boundaries.
395 */
478int __init reserve_bootmem(unsigned long addr, unsigned long size, 396int __init reserve_bootmem(unsigned long addr, unsigned long size,
479 int flags) 397 int flags)
480{ 398{
481 bootmem_data_t *bdata; 399 unsigned long start, end;
482 int ret;
483 400
484 list_for_each_entry(bdata, &bdata_list, list) { 401 start = PFN_DOWN(addr);
485 ret = can_reserve_bootmem_core(bdata, addr, size, flags); 402 end = PFN_UP(addr + size);
486 if (ret < 0)
487 return ret;
488 }
489 list_for_each_entry(bdata, &bdata_list, list)
490 reserve_bootmem_core(bdata, addr, size, flags);
491 403
492 return 0; 404 return mark_bootmem(start, end, 1, flags);
493} 405}
494#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 406#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
495 407
496void __init free_bootmem(unsigned long addr, unsigned long size) 408static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
409 unsigned long size, unsigned long align,
410 unsigned long goal, unsigned long limit)
497{ 411{
498 bootmem_data_t *bdata; 412 unsigned long fallback = 0;
499 list_for_each_entry(bdata, &bdata_list, list) 413 unsigned long min, max, start, sidx, midx, step;
500 free_bootmem_core(bdata, addr, size);
501}
502 414
503unsigned long __init free_all_bootmem(void) 415 BUG_ON(!size);
504{ 416 BUG_ON(align & (align - 1));
505 return free_all_bootmem_core(NODE_DATA(0)); 417 BUG_ON(limit && goal + size > limit);
418
419 if (!bdata->node_bootmem_map)
420 return NULL;
421
422 bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
423 bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
424 align, goal, limit);
425
426 min = bdata->node_min_pfn;
427 max = bdata->node_low_pfn;
428
429 goal >>= PAGE_SHIFT;
430 limit >>= PAGE_SHIFT;
431
432 if (limit && max > limit)
433 max = limit;
434 if (max <= min)
435 return NULL;
436
437 step = max(align >> PAGE_SHIFT, 1UL);
438
439 if (goal && min < goal && goal < max)
440 start = ALIGN(goal, step);
441 else
442 start = ALIGN(min, step);
443
444 sidx = start - bdata->node_min_pfn;;
445 midx = max - bdata->node_min_pfn;
446
447 if (bdata->hint_idx > sidx) {
448 /*
449 * Handle the valid case of sidx being zero and still
450 * catch the fallback below.
451 */
452 fallback = sidx + 1;
453 sidx = ALIGN(bdata->hint_idx, step);
454 }
455
456 while (1) {
457 int merge;
458 void *region;
459 unsigned long eidx, i, start_off, end_off;
460find_block:
461 sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
462 sidx = ALIGN(sidx, step);
463 eidx = sidx + PFN_UP(size);
464
465 if (sidx >= midx || eidx > midx)
466 break;
467
468 for (i = sidx; i < eidx; i++)
469 if (test_bit(i, bdata->node_bootmem_map)) {
470 sidx = ALIGN(i, step);
471 if (sidx == i)
472 sidx += step;
473 goto find_block;
474 }
475
476 if (bdata->last_end_off &&
477 PFN_DOWN(bdata->last_end_off) + 1 == sidx)
478 start_off = ALIGN(bdata->last_end_off, align);
479 else
480 start_off = PFN_PHYS(sidx);
481
482 merge = PFN_DOWN(start_off) < sidx;
483 end_off = start_off + size;
484
485 bdata->last_end_off = end_off;
486 bdata->hint_idx = PFN_UP(end_off);
487
488 /*
489 * Reserve the area now:
490 */
491 if (__reserve(bdata, PFN_DOWN(start_off) + merge,
492 PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
493 BUG();
494
495 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
496 start_off);
497 memset(region, 0, size);
498 return region;
499 }
500
501 if (fallback) {
502 sidx = ALIGN(fallback - 1, step);
503 fallback = 0;
504 goto find_block;
505 }
506
507 return NULL;
506} 508}
507 509
508void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, 510static void * __init ___alloc_bootmem_nopanic(unsigned long size,
509 unsigned long goal) 511 unsigned long align,
512 unsigned long goal,
513 unsigned long limit)
510{ 514{
511 bootmem_data_t *bdata; 515 bootmem_data_t *bdata;
512 void *ptr;
513 516
517restart:
514 list_for_each_entry(bdata, &bdata_list, list) { 518 list_for_each_entry(bdata, &bdata_list, list) {
515 ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); 519 void *region;
516 if (ptr) 520
517 return ptr; 521 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
522 continue;
523 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
524 break;
525
526 region = alloc_bootmem_core(bdata, size, align, goal, limit);
527 if (region)
528 return region;
529 }
530
531 if (goal) {
532 goal = 0;
533 goto restart;
518 } 534 }
535
519 return NULL; 536 return NULL;
520} 537}
521 538
522void * __init __alloc_bootmem(unsigned long size, unsigned long align, 539/**
523 unsigned long goal) 540 * __alloc_bootmem_nopanic - allocate boot memory without panicking
541 * @size: size of the request in bytes
542 * @align: alignment of the region
543 * @goal: preferred starting address of the region
544 *
545 * The goal is dropped if it can not be satisfied and the allocation will
546 * fall back to memory below @goal.
547 *
548 * Allocation may happen on any node in the system.
549 *
550 * Returns NULL on failure.
551 */
552void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
553 unsigned long goal)
524{ 554{
525 void *mem = __alloc_bootmem_nopanic(size,align,goal); 555 return ___alloc_bootmem_nopanic(size, align, goal, 0);
556}
557
558static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
559 unsigned long goal, unsigned long limit)
560{
561 void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
526 562
527 if (mem) 563 if (mem)
528 return mem; 564 return mem;
@@ -534,78 +570,135 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
534 return NULL; 570 return NULL;
535} 571}
536 572
573/**
574 * __alloc_bootmem - allocate boot memory
575 * @size: size of the request in bytes
576 * @align: alignment of the region
577 * @goal: preferred starting address of the region
578 *
579 * The goal is dropped if it can not be satisfied and the allocation will
580 * fall back to memory below @goal.
581 *
582 * Allocation may happen on any node in the system.
583 *
584 * The function panics if the request can not be satisfied.
585 */
586void * __init __alloc_bootmem(unsigned long size, unsigned long align,
587 unsigned long goal)
588{
589 return ___alloc_bootmem(size, align, goal, 0);
590}
537 591
538void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 592static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
539 unsigned long align, unsigned long goal) 593 unsigned long size, unsigned long align,
594 unsigned long goal, unsigned long limit)
540{ 595{
541 void *ptr; 596 void *ptr;
542 597
543 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 598 ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
544 if (ptr) 599 if (ptr)
545 return ptr; 600 return ptr;
546 601
547 return __alloc_bootmem(size, align, goal); 602 return ___alloc_bootmem(size, align, goal, limit);
603}
604
605/**
606 * __alloc_bootmem_node - allocate boot memory from a specific node
607 * @pgdat: node to allocate from
608 * @size: size of the request in bytes
609 * @align: alignment of the region
610 * @goal: preferred starting address of the region
611 *
612 * The goal is dropped if it can not be satisfied and the allocation will
613 * fall back to memory below @goal.
614 *
615 * Allocation may fall back to any node in the system if the specified node
616 * can not hold the requested memory.
617 *
618 * The function panics if the request can not be satisfied.
619 */
620void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
621 unsigned long align, unsigned long goal)
622{
623 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
548} 624}
549 625
550#ifdef CONFIG_SPARSEMEM 626#ifdef CONFIG_SPARSEMEM
627/**
628 * alloc_bootmem_section - allocate boot memory from a specific section
629 * @size: size of the request in bytes
630 * @section_nr: sparse map section to allocate from
631 *
632 * Return NULL on failure.
633 */
551void * __init alloc_bootmem_section(unsigned long size, 634void * __init alloc_bootmem_section(unsigned long size,
552 unsigned long section_nr) 635 unsigned long section_nr)
553{ 636{
554 void *ptr; 637 bootmem_data_t *bdata;
555 unsigned long limit, goal, start_nr, end_nr, pfn; 638 unsigned long pfn, goal, limit;
556 struct pglist_data *pgdat;
557 639
558 pfn = section_nr_to_pfn(section_nr); 640 pfn = section_nr_to_pfn(section_nr);
559 goal = PFN_PHYS(pfn); 641 goal = pfn << PAGE_SHIFT;
560 limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; 642 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
561 pgdat = NODE_DATA(early_pfn_to_nid(pfn)); 643 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
562 ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal,
563 limit);
564 644
565 if (!ptr) 645 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
566 return NULL; 646}
647#endif
567 648
568 start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); 649void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
569 end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); 650 unsigned long align, unsigned long goal)
570 if (start_nr != section_nr || end_nr != section_nr) { 651{
571 printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", 652 void *ptr;
572 section_nr);
573 free_bootmem_core(pgdat->bdata, __pa(ptr), size);
574 ptr = NULL;
575 }
576 653
577 return ptr; 654 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
655 if (ptr)
656 return ptr;
657
658 return __alloc_bootmem_nopanic(size, align, goal);
578} 659}
579#endif
580 660
581#ifndef ARCH_LOW_ADDRESS_LIMIT 661#ifndef ARCH_LOW_ADDRESS_LIMIT
582#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 662#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
583#endif 663#endif
584 664
665/**
666 * __alloc_bootmem_low - allocate low boot memory
667 * @size: size of the request in bytes
668 * @align: alignment of the region
669 * @goal: preferred starting address of the region
670 *
671 * The goal is dropped if it can not be satisfied and the allocation will
672 * fall back to memory below @goal.
673 *
674 * Allocation may happen on any node in the system.
675 *
676 * The function panics if the request can not be satisfied.
677 */
585void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, 678void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
586 unsigned long goal) 679 unsigned long goal)
587{ 680{
588 bootmem_data_t *bdata; 681 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
589 void *ptr;
590
591 list_for_each_entry(bdata, &bdata_list, list) {
592 ptr = __alloc_bootmem_core(bdata, size, align, goal,
593 ARCH_LOW_ADDRESS_LIMIT);
594 if (ptr)
595 return ptr;
596 }
597
598 /*
599 * Whoops, we cannot satisfy the allocation request.
600 */
601 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
602 panic("Out of low memory");
603 return NULL;
604} 682}
605 683
684/**
685 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
686 * @pgdat: node to allocate from
687 * @size: size of the request in bytes
688 * @align: alignment of the region
689 * @goal: preferred starting address of the region
690 *
691 * The goal is dropped if it can not be satisfied and the allocation will
692 * fall back to memory below @goal.
693 *
694 * Allocation may fall back to any node in the system if the specified node
695 * can not hold the requested memory.
696 *
697 * The function panics if the request can not be satisfied.
698 */
606void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 699void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
607 unsigned long align, unsigned long goal) 700 unsigned long align, unsigned long goal)
608{ 701{
609 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, 702 return ___alloc_bootmem_node(pgdat->bdata, size, align,
610 ARCH_LOW_ADDRESS_LIMIT); 703 goal, ARCH_LOW_ADDRESS_LIMIT);
611} 704}
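The rewritten bootmem.c above keeps the classic one-bit-per-page bitmap but centralizes the size math in bootmap_bytes() (round the bit count up to whole bytes, then to a whole long) and bootmem_bootmap_pages() (round those bytes up to whole pages). The standalone sketch below redoes that arithmetic in userspace; PAGE_SIZE, the local ALIGN() macro and the sample node size are assumptions for illustration, not values taken from the kernel headers.

/*
 * Illustration only: bitmap sizing as done by bootmap_bytes() and
 * bootmem_bootmap_pages() in the patch above.
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

/* One bit per page, rounded up to a whole long, as in bootmap_bytes(). */
static unsigned long bootmap_bytes(unsigned long pages)
{
	unsigned long bytes = (pages + 7) / 8;

	return ALIGN(bytes, sizeof(long));
}

/* Whole pages needed for that bitmap, as in bootmem_bootmap_pages(). */
static unsigned long bootmap_pages(unsigned long pages)
{
	return ALIGN(bootmap_bytes(pages), PAGE_SIZE) / PAGE_SIZE;
}

int main(void)
{
	unsigned long pages = 262144;	/* e.g. a 1 GiB node with 4 KiB pages */

	printf("%lu pages -> %lu bitmap bytes -> %lu bitmap pages\n",
	       pages, bootmap_bytes(pages), bootmap_pages(pages));
	return 0;
}

For that sample node the bitmap is 32768 bytes, i.e. 8 pages, which is the amount free_all_bootmem_core() hands back to the buddy allocator via bootmem_bootmap_pages() once the map is no longer needed.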
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d9d9e2b755..2ed8b0389c51 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,9 +42,6 @@
42 42
43#include <asm/mman.h> 43#include <asm/mman.h>
44 44
45static ssize_t
46generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
47 loff_t offset, unsigned long nr_segs);
48 45
49/* 46/*
50 * Shared mappings implemented 30.11.1994. It's not fully working yet, 47 * Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -112,13 +109,13 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
112/* 109/*
113 * Remove a page from the page cache and free it. Caller has to make 110 * Remove a page from the page cache and free it. Caller has to make
114 * sure the page is locked and that nobody else uses it - or that usage 111 * sure the page is locked and that nobody else uses it - or that usage
115 * is safe. The caller must hold a write_lock on the mapping's tree_lock. 112 * is safe. The caller must hold the mapping's tree_lock.
116 */ 113 */
117void __remove_from_page_cache(struct page *page) 114void __remove_from_page_cache(struct page *page)
118{ 115{
119 struct address_space *mapping = page->mapping; 116 struct address_space *mapping = page->mapping;
120 117
121 mem_cgroup_uncharge_page(page); 118 mem_cgroup_uncharge_cache_page(page);
122 radix_tree_delete(&mapping->page_tree, page->index); 119 radix_tree_delete(&mapping->page_tree, page->index);
123 page->mapping = NULL; 120 page->mapping = NULL;
124 mapping->nrpages--; 121 mapping->nrpages--;
@@ -144,9 +141,9 @@ void remove_from_page_cache(struct page *page)
144 141
145 BUG_ON(!PageLocked(page)); 142 BUG_ON(!PageLocked(page));
146 143
147 write_lock_irq(&mapping->tree_lock); 144 spin_lock_irq(&mapping->tree_lock);
148 __remove_from_page_cache(page); 145 __remove_from_page_cache(page);
149 write_unlock_irq(&mapping->tree_lock); 146 spin_unlock_irq(&mapping->tree_lock);
150} 147}
151 148
152static int sync_page(void *word) 149static int sync_page(void *word)
@@ -445,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping,
445} 442}
446 443
447/** 444/**
448 * add_to_page_cache - add newly allocated pagecache pages 445 * add_to_page_cache_locked - add a locked page to the pagecache
449 * @page: page to add 446 * @page: page to add
450 * @mapping: the page's address_space 447 * @mapping: the page's address_space
451 * @offset: page index 448 * @offset: page index
452 * @gfp_mask: page allocation mode 449 * @gfp_mask: page allocation mode
453 * 450 *
454 * This function is used to add newly allocated pagecache pages; 451 * This function is used to add a page to the pagecache. It must be locked.
455 * the page is new, so we can just run SetPageLocked() against it.
456 * The other page state flags were set by rmqueue().
457 *
458 * This function does not add the page to the LRU. The caller must do that. 452 * This function does not add the page to the LRU. The caller must do that.
459 */ 453 */
460int add_to_page_cache(struct page *page, struct address_space *mapping, 454int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
461 pgoff_t offset, gfp_t gfp_mask) 455 pgoff_t offset, gfp_t gfp_mask)
462{ 456{
463 int error = mem_cgroup_cache_charge(page, current->mm, 457 int error;
458
459 VM_BUG_ON(!PageLocked(page));
460
461 error = mem_cgroup_cache_charge(page, current->mm,
464 gfp_mask & ~__GFP_HIGHMEM); 462 gfp_mask & ~__GFP_HIGHMEM);
465 if (error) 463 if (error)
466 goto out; 464 goto out;
467 465
468 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 466 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
469 if (error == 0) { 467 if (error == 0) {
470 write_lock_irq(&mapping->tree_lock); 468 page_cache_get(page);
469 page->mapping = mapping;
470 page->index = offset;
471
472 spin_lock_irq(&mapping->tree_lock);
471 error = radix_tree_insert(&mapping->page_tree, offset, page); 473 error = radix_tree_insert(&mapping->page_tree, offset, page);
472 if (!error) { 474 if (likely(!error)) {
473 page_cache_get(page);
474 SetPageLocked(page);
475 page->mapping = mapping;
476 page->index = offset;
477 mapping->nrpages++; 475 mapping->nrpages++;
478 __inc_zone_page_state(page, NR_FILE_PAGES); 476 __inc_zone_page_state(page, NR_FILE_PAGES);
479 } else 477 } else {
480 mem_cgroup_uncharge_page(page); 478 page->mapping = NULL;
479 mem_cgroup_uncharge_cache_page(page);
480 page_cache_release(page);
481 }
481 482
482 write_unlock_irq(&mapping->tree_lock); 483 spin_unlock_irq(&mapping->tree_lock);
483 radix_tree_preload_end(); 484 radix_tree_preload_end();
484 } else 485 } else
485 mem_cgroup_uncharge_page(page); 486 mem_cgroup_uncharge_cache_page(page);
486out: 487out:
487 return error; 488 return error;
488} 489}
489EXPORT_SYMBOL(add_to_page_cache); 490EXPORT_SYMBOL(add_to_page_cache_locked);
490 491
491int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 492int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
492 pgoff_t offset, gfp_t gfp_mask) 493 pgoff_t offset, gfp_t gfp_mask)
@@ -636,15 +637,35 @@ void __lock_page_nosync(struct page *page)
636 * Is there a pagecache struct page at the given (mapping, offset) tuple? 637 * Is there a pagecache struct page at the given (mapping, offset) tuple?
637 * If yes, increment its refcount and return it; if no, return NULL. 638 * If yes, increment its refcount and return it; if no, return NULL.
638 */ 639 */
639struct page * find_get_page(struct address_space *mapping, pgoff_t offset) 640struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
640{ 641{
642 void **pagep;
641 struct page *page; 643 struct page *page;
642 644
643 read_lock_irq(&mapping->tree_lock); 645 rcu_read_lock();
644 page = radix_tree_lookup(&mapping->page_tree, offset); 646repeat:
645 if (page) 647 page = NULL;
646 page_cache_get(page); 648 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
647 read_unlock_irq(&mapping->tree_lock); 649 if (pagep) {
650 page = radix_tree_deref_slot(pagep);
651 if (unlikely(!page || page == RADIX_TREE_RETRY))
652 goto repeat;
653
654 if (!page_cache_get_speculative(page))
655 goto repeat;
656
657 /*
658 * Has the page moved?
659 * This is part of the lockless pagecache protocol. See
660 * include/linux/pagemap.h for details.
661 */
662 if (unlikely(page != *pagep)) {
663 page_cache_release(page);
664 goto repeat;
665 }
666 }
667 rcu_read_unlock();
668
648 return page; 669 return page;
649} 670}
650EXPORT_SYMBOL(find_get_page); 671EXPORT_SYMBOL(find_get_page);
@@ -659,32 +680,22 @@ EXPORT_SYMBOL(find_get_page);
659 * 680 *
660 * Returns zero if the page was not present. find_lock_page() may sleep. 681 * Returns zero if the page was not present. find_lock_page() may sleep.
661 */ 682 */
662struct page *find_lock_page(struct address_space *mapping, 683struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
663 pgoff_t offset)
664{ 684{
665 struct page *page; 685 struct page *page;
666 686
667repeat: 687repeat:
668 read_lock_irq(&mapping->tree_lock); 688 page = find_get_page(mapping, offset);
669 page = radix_tree_lookup(&mapping->page_tree, offset);
670 if (page) { 689 if (page) {
671 page_cache_get(page); 690 lock_page(page);
672 if (TestSetPageLocked(page)) { 691 /* Has the page been truncated? */
673 read_unlock_irq(&mapping->tree_lock); 692 if (unlikely(page->mapping != mapping)) {
674 __lock_page(page); 693 unlock_page(page);
675 694 page_cache_release(page);
676 /* Has the page been truncated while we slept? */ 695 goto repeat;
677 if (unlikely(page->mapping != mapping)) {
678 unlock_page(page);
679 page_cache_release(page);
680 goto repeat;
681 }
682 VM_BUG_ON(page->index != offset);
683 goto out;
684 } 696 }
697 VM_BUG_ON(page->index != offset);
685 } 698 }
686 read_unlock_irq(&mapping->tree_lock);
687out:
688 return page; 699 return page;
689} 700}
690EXPORT_SYMBOL(find_lock_page); 701EXPORT_SYMBOL(find_lock_page);
@@ -750,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
750{ 761{
751 unsigned int i; 762 unsigned int i;
752 unsigned int ret; 763 unsigned int ret;
764 unsigned int nr_found;
765
766 rcu_read_lock();
767restart:
768 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
769 (void ***)pages, start, nr_pages);
770 ret = 0;
771 for (i = 0; i < nr_found; i++) {
772 struct page *page;
773repeat:
774 page = radix_tree_deref_slot((void **)pages[i]);
775 if (unlikely(!page))
776 continue;
777 /*
778 * this can only trigger if nr_found == 1, making livelock
779 * a non issue.
780 */
781 if (unlikely(page == RADIX_TREE_RETRY))
782 goto restart;
783
784 if (!page_cache_get_speculative(page))
785 goto repeat;
786
787 /* Has the page moved? */
788 if (unlikely(page != *((void **)pages[i]))) {
789 page_cache_release(page);
790 goto repeat;
791 }
753 792
754 read_lock_irq(&mapping->tree_lock); 793 pages[ret] = page;
755 ret = radix_tree_gang_lookup(&mapping->page_tree, 794 ret++;
756 (void **)pages, start, nr_pages); 795 }
757 for (i = 0; i < ret; i++) 796 rcu_read_unlock();
758 page_cache_get(pages[i]);
759 read_unlock_irq(&mapping->tree_lock);
760 return ret; 797 return ret;
761} 798}
762 799
@@ -777,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
777{ 814{
778 unsigned int i; 815 unsigned int i;
779 unsigned int ret; 816 unsigned int ret;
817 unsigned int nr_found;
818
819 rcu_read_lock();
820restart:
821 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
822 (void ***)pages, index, nr_pages);
823 ret = 0;
824 for (i = 0; i < nr_found; i++) {
825 struct page *page;
826repeat:
827 page = radix_tree_deref_slot((void **)pages[i]);
828 if (unlikely(!page))
829 continue;
830 /*
831 * this can only trigger if nr_found == 1, making livelock
832 * a non issue.
833 */
834 if (unlikely(page == RADIX_TREE_RETRY))
835 goto restart;
780 836
781 read_lock_irq(&mapping->tree_lock); 837 if (page->mapping == NULL || page->index != index)
782 ret = radix_tree_gang_lookup(&mapping->page_tree,
783 (void **)pages, index, nr_pages);
784 for (i = 0; i < ret; i++) {
785 if (pages[i]->mapping == NULL || pages[i]->index != index)
786 break; 838 break;
787 839
788 page_cache_get(pages[i]); 840 if (!page_cache_get_speculative(page))
841 goto repeat;
842
843 /* Has the page moved? */
844 if (unlikely(page != *((void **)pages[i]))) {
845 page_cache_release(page);
846 goto repeat;
847 }
848
849 pages[ret] = page;
850 ret++;
789 index++; 851 index++;
790 } 852 }
791 read_unlock_irq(&mapping->tree_lock); 853 rcu_read_unlock();
792 return i; 854 return ret;
793} 855}
794EXPORT_SYMBOL(find_get_pages_contig); 856EXPORT_SYMBOL(find_get_pages_contig);
795 857
@@ -809,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
809{ 871{
810 unsigned int i; 872 unsigned int i;
811 unsigned int ret; 873 unsigned int ret;
874 unsigned int nr_found;
875
876 rcu_read_lock();
877restart:
878 nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
879 (void ***)pages, *index, nr_pages, tag);
880 ret = 0;
881 for (i = 0; i < nr_found; i++) {
882 struct page *page;
883repeat:
884 page = radix_tree_deref_slot((void **)pages[i]);
885 if (unlikely(!page))
886 continue;
887 /*
888 * this can only trigger if nr_found == 1, making livelock
889 * a non issue.
890 */
891 if (unlikely(page == RADIX_TREE_RETRY))
892 goto restart;
893
894 if (!page_cache_get_speculative(page))
895 goto repeat;
896
897 /* Has the page moved? */
898 if (unlikely(page != *((void **)pages[i]))) {
899 page_cache_release(page);
900 goto repeat;
901 }
902
903 pages[ret] = page;
904 ret++;
905 }
906 rcu_read_unlock();
812 907
813 read_lock_irq(&mapping->tree_lock);
814 ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
815 (void **)pages, *index, nr_pages, tag);
816 for (i = 0; i < ret; i++)
817 page_cache_get(pages[i]);
818 if (ret) 908 if (ret)
819 *index = pages[ret - 1]->index + 1; 909 *index = pages[ret - 1]->index + 1;
820 read_unlock_irq(&mapping->tree_lock); 910
821 return ret; 911 return ret;
822} 912}
823EXPORT_SYMBOL(find_get_pages_tag); 913EXPORT_SYMBOL(find_get_pages_tag);
@@ -1200,42 +1290,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1200 1290
1201 mapping = filp->f_mapping; 1291 mapping = filp->f_mapping;
1202 inode = mapping->host; 1292 inode = mapping->host;
1203 retval = 0;
1204 if (!count) 1293 if (!count)
1205 goto out; /* skip atime */ 1294 goto out; /* skip atime */
1206 size = i_size_read(inode); 1295 size = i_size_read(inode);
1207 if (pos < size) { 1296 if (pos < size) {
1208 retval = generic_file_direct_IO(READ, iocb, 1297 retval = filemap_write_and_wait(mapping);
1209 iov, pos, nr_segs); 1298 if (!retval) {
1299 retval = mapping->a_ops->direct_IO(READ, iocb,
1300 iov, pos, nr_segs);
1301 }
1210 if (retval > 0) 1302 if (retval > 0)
1211 *ppos = pos + retval; 1303 *ppos = pos + retval;
1212 } 1304 if (retval) {
1213 if (likely(retval != 0)) { 1305 file_accessed(filp);
1214 file_accessed(filp); 1306 goto out;
1215 goto out; 1307 }
1216 } 1308 }
1217 } 1309 }
1218 1310
1219 retval = 0; 1311 for (seg = 0; seg < nr_segs; seg++) {
1220 if (count) { 1312 read_descriptor_t desc;
1221 for (seg = 0; seg < nr_segs; seg++) {
1222 read_descriptor_t desc;
1223 1313
1224 desc.written = 0; 1314 desc.written = 0;
1225 desc.arg.buf = iov[seg].iov_base; 1315 desc.arg.buf = iov[seg].iov_base;
1226 desc.count = iov[seg].iov_len; 1316 desc.count = iov[seg].iov_len;
1227 if (desc.count == 0) 1317 if (desc.count == 0)
1228 continue; 1318 continue;
1229 desc.error = 0; 1319 desc.error = 0;
1230 do_generic_file_read(filp,ppos,&desc,file_read_actor); 1320 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1231 retval += desc.written; 1321 retval += desc.written;
1232 if (desc.error) { 1322 if (desc.error) {
1233 retval = retval ?: desc.error; 1323 retval = retval ?: desc.error;
1234 break; 1324 break;
1235 }
1236 if (desc.count > 0)
1237 break;
1238 } 1325 }
1326 if (desc.count > 0)
1327 break;
1239 } 1328 }
1240out: 1329out:
1241 return retval; 1330 return retval;
@@ -2004,11 +2093,55 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2004 struct address_space *mapping = file->f_mapping; 2093 struct address_space *mapping = file->f_mapping;
2005 struct inode *inode = mapping->host; 2094 struct inode *inode = mapping->host;
2006 ssize_t written; 2095 ssize_t written;
2096 size_t write_len;
2097 pgoff_t end;
2007 2098
2008 if (count != ocount) 2099 if (count != ocount)
2009 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2100 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2010 2101
2011 written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); 2102 /*
2103 * Unmap all mmappings of the file up-front.
2104 *
2105 * This will cause any pte dirty bits to be propagated into the
2106 * pageframes for the subsequent filemap_write_and_wait().
2107 */
2108 write_len = iov_length(iov, *nr_segs);
2109 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2110 if (mapping_mapped(mapping))
2111 unmap_mapping_range(mapping, pos, write_len, 0);
2112
2113 written = filemap_write_and_wait(mapping);
2114 if (written)
2115 goto out;
2116
2117 /*
2118 * After a write we want buffered reads to be sure to go to disk to get
2119 * the new data. We invalidate clean cached pages from the region we're
2120 * about to write. We do this *before* the write so that we can return
2121 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2122 */
2123 if (mapping->nrpages) {
2124 written = invalidate_inode_pages2_range(mapping,
2125 pos >> PAGE_CACHE_SHIFT, end);
2126 if (written)
2127 goto out;
2128 }
2129
2130 written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2131
2132 /*
2133 * Finally, try again to invalidate clean pages which might have been
2134 * cached by non-direct readahead, or faulted in by get_user_pages()
2135 * if the source of the write was an mmap'ed region of the file
2136 * we're writing. Either one is a pretty crazy thing to do,
2137 * so we don't support it 100%. If this invalidation
2138 * fails, tough, the write still worked...
2139 */
2140 if (mapping->nrpages) {
2141 invalidate_inode_pages2_range(mapping,
2142 pos >> PAGE_CACHE_SHIFT, end);
2143 }
2144
2012 if (written > 0) { 2145 if (written > 0) {
2013 loff_t end = pos + written; 2146 loff_t end = pos + written;
2014 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2147 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
@@ -2024,6 +2157,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2024 * i_mutex is held, which protects generic_osync_inode() from 2157 * i_mutex is held, which protects generic_osync_inode() from
2025 * livelocking. AIO O_DIRECT ops attempt to sync metadata here. 2158 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2026 */ 2159 */
2160out:
2027 if ((written >= 0 || written == -EIOCBQUEUED) && 2161 if ((written >= 0 || written == -EIOCBQUEUED) &&
2028 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2162 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2029 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); 2163 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
@@ -2511,66 +2645,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2511} 2645}
2512EXPORT_SYMBOL(generic_file_aio_write); 2646EXPORT_SYMBOL(generic_file_aio_write);
2513 2647
2514/*
2515 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2516 * went wrong during pagecache shootdown.
2517 */
2518static ssize_t
2519generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2520 loff_t offset, unsigned long nr_segs)
2521{
2522 struct file *file = iocb->ki_filp;
2523 struct address_space *mapping = file->f_mapping;
2524 ssize_t retval;
2525 size_t write_len;
2526 pgoff_t end = 0; /* silence gcc */
2527
2528 /*
2529 * If it's a write, unmap all mmappings of the file up-front. This
2530 * will cause any pte dirty bits to be propagated into the pageframes
2531 * for the subsequent filemap_write_and_wait().
2532 */
2533 if (rw == WRITE) {
2534 write_len = iov_length(iov, nr_segs);
2535 end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
2536 if (mapping_mapped(mapping))
2537 unmap_mapping_range(mapping, offset, write_len, 0);
2538 }
2539
2540 retval = filemap_write_and_wait(mapping);
2541 if (retval)
2542 goto out;
2543
2544 /*
2545 * After a write we want buffered reads to be sure to go to disk to get
2546 * the new data. We invalidate clean cached page from the region we're
2547 * about to write. We do this *before* the write so that we can return
2548 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2549 */
2550 if (rw == WRITE && mapping->nrpages) {
2551 retval = invalidate_inode_pages2_range(mapping,
2552 offset >> PAGE_CACHE_SHIFT, end);
2553 if (retval)
2554 goto out;
2555 }
2556
2557 retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
2558
2559 /*
2560 * Finally, try again to invalidate clean pages which might have been
2561 * cached by non-direct readahead, or faulted in by get_user_pages()
2562 * if the source of the write was an mmap'ed region of the file
2563 * we're writing. Either one is a pretty crazy thing to do,
2564 * so we don't support it 100%. If this invalidation
2565 * fails, tough, the write still worked...
2566 */
2567 if (rw == WRITE && mapping->nrpages) {
2568 invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
2569 }
2570out:
2571 return retval;
2572}
2573
2574/** 2648/**
2575 * try_to_release_page() - release old fs-specific metadata on a page 2649 * try_to_release_page() - release old fs-specific metadata on a page
2576 * 2650 *
@@ -2582,9 +2656,8 @@ out:
2582 * Otherwise return zero. 2656 * Otherwise return zero.
2583 * 2657 *
2584 * The @gfp_mask argument specifies whether I/O may be performed to release 2658 * The @gfp_mask argument specifies whether I/O may be performed to release
2585 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). 2659 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2586 * 2660 *
2587 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
2588 */ 2661 */
2589int try_to_release_page(struct page *page, gfp_t gfp_mask) 2662int try_to_release_page(struct page *page, gfp_t gfp_mask)
2590{ 2663{
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ab171274ef21..3be79dc18c5c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -14,6 +14,8 @@
14#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
15#include <linux/cpuset.h> 15#include <linux/cpuset.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/bootmem.h>
18#include <linux/sysfs.h>
17 19
18#include <asm/page.h> 20#include <asm/page.h>
19#include <asm/pgtable.h> 21#include <asm/pgtable.h>
@@ -22,30 +24,340 @@
22#include "internal.h" 24#include "internal.h"
23 25
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 26const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26static unsigned long surplus_huge_pages;
27static unsigned long nr_overcommit_huge_pages;
28unsigned long max_huge_pages;
29unsigned long sysctl_overcommit_huge_pages;
30static struct list_head hugepage_freelists[MAX_NUMNODES];
31static unsigned int nr_huge_pages_node[MAX_NUMNODES];
32static unsigned int free_huge_pages_node[MAX_NUMNODES];
33static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 27static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 28unsigned long hugepages_treat_as_movable;
36static int hugetlb_next_nid; 29
30static int max_hstate;
31unsigned int default_hstate_idx;
32struct hstate hstates[HUGE_MAX_HSTATE];
33
34__initdata LIST_HEAD(huge_boot_pages);
35
36/* for command line parsing */
37static struct hstate * __initdata parsed_hstate;
38static unsigned long __initdata default_hstate_max_huge_pages;
39static unsigned long __initdata default_hstate_size;
40
41#define for_each_hstate(h) \
42 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
37 43
38/* 44/*
39 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 45 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
40 */ 46 */
41static DEFINE_SPINLOCK(hugetlb_lock); 47static DEFINE_SPINLOCK(hugetlb_lock);
42 48
43static void clear_huge_page(struct page *page, unsigned long addr) 49/*
50 * Region tracking -- allows tracking of reservations and instantiated pages
51 * across the pages in a mapping.
52 *
53 * The region data structures are protected by a combination of the mmap_sem
54 * and the hugetlb_instantiation_mutex. To access or modify a region the caller
55 * must either hold the mmap_sem for write, or the mmap_sem for read and
56 * the hugetlb_instantiation mutex:
57 *
58 * down_write(&mm->mmap_sem);
59 * or
60 * down_read(&mm->mmap_sem);
61 * mutex_lock(&hugetlb_instantiation_mutex);
62 */
63struct file_region {
64 struct list_head link;
65 long from;
66 long to;
67};
68
69static long region_add(struct list_head *head, long f, long t)
70{
71 struct file_region *rg, *nrg, *trg;
72
73 /* Locate the region we are either in or before. */
74 list_for_each_entry(rg, head, link)
75 if (f <= rg->to)
76 break;
77
78 /* Round our left edge to the current segment if it encloses us. */
79 if (f > rg->from)
80 f = rg->from;
81
82 /* Check for and consume any regions we now overlap with. */
83 nrg = rg;
84 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
85 if (&rg->link == head)
86 break;
87 if (rg->from > t)
88 break;
89
90 /* If this area reaches higher, then extend our area to
91 * include it completely. If this is not the first area
92 * which we intend to reuse, free it. */
93 if (rg->to > t)
94 t = rg->to;
95 if (rg != nrg) {
96 list_del(&rg->link);
97 kfree(rg);
98 }
99 }
100 nrg->from = f;
101 nrg->to = t;
102 return 0;
103}
104
105static long region_chg(struct list_head *head, long f, long t)
106{
107 struct file_region *rg, *nrg;
108 long chg = 0;
109
110 /* Locate the region we are before or in. */
111 list_for_each_entry(rg, head, link)
112 if (f <= rg->to)
113 break;
114
115 /* If we are below the current region then a new region is required.
116 * Subtle: allocate a new region at the position but make it zero
117 * size such that we can guarantee to record the reservation. */
118 if (&rg->link == head || t < rg->from) {
119 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
120 if (!nrg)
121 return -ENOMEM;
122 nrg->from = f;
123 nrg->to = f;
124 INIT_LIST_HEAD(&nrg->link);
125 list_add(&nrg->link, rg->link.prev);
126
127 return t - f;
128 }
129
130 /* Round our left edge to the current segment if it encloses us. */
131 if (f > rg->from)
132 f = rg->from;
133 chg = t - f;
134
135 /* Check for and consume any regions we now overlap with. */
136 list_for_each_entry(rg, rg->link.prev, link) {
137 if (&rg->link == head)
138 break;
139 if (rg->from > t)
140 return chg;
141
142 /* We overlap with this area; if it extends further than
143 * us then we must extend ourselves. Account for its
144 * existing reservation. */
145 if (rg->to > t) {
146 chg += rg->to - t;
147 t = rg->to;
148 }
149 chg -= rg->to - rg->from;
150 }
151 return chg;
152}
153
154static long region_truncate(struct list_head *head, long end)
155{
156 struct file_region *rg, *trg;
157 long chg = 0;
158
159 /* Locate the region we are either in or before. */
160 list_for_each_entry(rg, head, link)
161 if (end <= rg->to)
162 break;
163 if (&rg->link == head)
164 return 0;
165
166 /* If we are in the middle of a region then adjust it. */
167 if (end > rg->from) {
168 chg = rg->to - end;
169 rg->to = end;
170 rg = list_entry(rg->link.next, typeof(*rg), link);
171 }
172
173 /* Drop any remaining regions. */
174 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
175 if (&rg->link == head)
176 break;
177 chg += rg->to - rg->from;
178 list_del(&rg->link);
179 kfree(rg);
180 }
181 return chg;
182}
183
184static long region_count(struct list_head *head, long f, long t)
185{
186 struct file_region *rg;
187 long chg = 0;
188
189 /* Locate each segment we overlap with, and count that overlap. */
190 list_for_each_entry(rg, head, link) {
191 int seg_from;
192 int seg_to;
193
194 if (rg->to <= f)
195 continue;
196 if (rg->from >= t)
197 break;
198
199 seg_from = max(rg->from, f);
200 seg_to = min(rg->to, t);
201
202 chg += seg_to - seg_from;
203 }
204
205 return chg;
206}
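A small sketch (not from this patch) of how the region helpers above are meant to be paired: region_chg() reports how many huge pages a range still needs and pre-allocates the bookkeeping entry, while region_add() commits the range only after quota and pool space have been secured. The example_reserve() name is hypothetical.

static long example_reserve(struct list_head *regions, long from, long to)
{
	long chg = region_chg(regions, from, to);	/* pages still uncovered */

	if (chg < 0)
		return chg;		/* -ENOMEM, nothing was recorded */

	/*
	 * ... charge the filesystem quota and grow/reserve the huge page
	 * pool by 'chg' pages here; on failure simply return without
	 * calling region_add(), no cleanup of the map is needed ...
	 */

	region_add(regions, from, to);	/* commit the reservation */
	return chg;
}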
207
208/*
209 * Convert the address within this vma to the page offset within
210 * the mapping, in pagecache page units; huge pages here.
211 */
212static pgoff_t vma_hugecache_offset(struct hstate *h,
213 struct vm_area_struct *vma, unsigned long address)
214{
215 return ((address - vma->vm_start) >> huge_page_shift(h)) +
216 (vma->vm_pgoff >> huge_page_order(h));
217}
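A quick worked example of the arithmetic above, assuming 4 KB base pages and a 2 MB huge page size (huge_page_shift = 21, huge_page_order = 9); the concrete addresses are made up.

/*
 *   vma->vm_start = 0x60000000, vma->vm_pgoff = 512  (VMA starts 2 MB into the file)
 *   address       = 0x60400000                       (4 MB past vm_start)
 *
 *   (address - vm_start) >> 21  =  0x400000 >> 21  =  2   huge pages into the VMA
 *   vm_pgoff >> 9               =  512 >> 9        =  1   huge-page offset of the VMA
 *   vma_hugecache_offset()      =  2 + 1           =  3   (fourth huge page of the file)
 */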
218
219/*
220 * Flags for MAP_PRIVATE reservations. These are stored in the bottom
221 * bits of the reservation map pointer, which are always clear due to
222 * alignment.
223 */
224#define HPAGE_RESV_OWNER (1UL << 0)
225#define HPAGE_RESV_UNMAPPED (1UL << 1)
226#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
227
228/*
229 * These helpers are used to track how many pages are reserved for
230 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
231 * is guaranteed to have its future faults succeed.
232 *
233 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
234 * the reserve counters are updated with the hugetlb_lock held. It is safe
235 * to reset the VMA at fork() time as it is not in use yet and there is no
236 * chance of the global counters getting corrupted as a result.
237 *
238 * The private mapping reservation is represented in a subtly different
239 * manner to a shared mapping. A shared mapping has a region map associated
240 * with the underlying file; this region map represents the backing file
241 * pages which have ever had a reservation assigned, and this persists even
242 * after the page is instantiated. A private mapping has a region map
243 * associated with the original mmap which is attached to all VMAs which
244 * reference it; this region map represents those offsets which have consumed
245 * a reservation, i.e. where pages have been instantiated.
246 */
247static unsigned long get_vma_private_data(struct vm_area_struct *vma)
248{
249 return (unsigned long)vma->vm_private_data;
250}
251
252static void set_vma_private_data(struct vm_area_struct *vma,
253 unsigned long value)
254{
255 vma->vm_private_data = (void *)value;
256}
257
258struct resv_map {
259 struct kref refs;
260 struct list_head regions;
261};
262
263struct resv_map *resv_map_alloc(void)
264{
265 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
266 if (!resv_map)
267 return NULL;
268
269 kref_init(&resv_map->refs);
270 INIT_LIST_HEAD(&resv_map->regions);
271
272 return resv_map;
273}
274
275void resv_map_release(struct kref *ref)
276{
277 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
278
279 /* Clear out any active regions before we release the map. */
280 region_truncate(&resv_map->regions, 0);
281 kfree(resv_map);
282}
283
284static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
285{
286 VM_BUG_ON(!is_vm_hugetlb_page(vma));
287 if (!(vma->vm_flags & VM_SHARED))
288 return (struct resv_map *)(get_vma_private_data(vma) &
289 ~HPAGE_RESV_MASK);
290 return 0;
291}
292
293static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
294{
295 VM_BUG_ON(!is_vm_hugetlb_page(vma));
296 VM_BUG_ON(vma->vm_flags & VM_SHARED);
297
298 set_vma_private_data(vma, (get_vma_private_data(vma) &
299 HPAGE_RESV_MASK) | (unsigned long)map);
300}
301
302static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
303{
304 VM_BUG_ON(!is_vm_hugetlb_page(vma));
305 VM_BUG_ON(vma->vm_flags & VM_SHARED);
306
307 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
308}
309
310static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
311{
312 VM_BUG_ON(!is_vm_hugetlb_page(vma));
313
314 return (get_vma_private_data(vma) & flag) != 0;
315}
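Illustrative only: how the owner of a private mapping would combine the helpers above, packing the resv_map pointer and the HPAGE_RESV_OWNER flag into the same vm_private_data word (the alignment of the kmalloc'ed map keeps the low bits clear, as the comment above notes). The example_attach_private_resv() name is an assumption; teardown would be a matching kref_put(&map->refs, resv_map_release).

static int example_attach_private_resv(struct vm_area_struct *vma)
{
	struct resv_map *map = resv_map_alloc();

	if (!map)
		return -ENOMEM;

	set_vma_resv_map(vma, map);			/* pointer bits of vm_private_data */
	set_vma_resv_flags(vma, HPAGE_RESV_OWNER);	/* flag bits of vm_private_data */
	return 0;
}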
316
317/* Decrement the reserved pages in the hugepage pool by one */
318static void decrement_hugepage_resv_vma(struct hstate *h,
319 struct vm_area_struct *vma)
320{
321 if (vma->vm_flags & VM_NORESERVE)
322 return;
323
324 if (vma->vm_flags & VM_SHARED) {
325 /* Shared mappings always use reserves */
326 h->resv_huge_pages--;
327 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
328 /*
329 * Only the process that called mmap() has reserves for
330 * private mappings.
331 */
332 h->resv_huge_pages--;
333 }
334}
335
336/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
337void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
338{
339 VM_BUG_ON(!is_vm_hugetlb_page(vma));
340 if (!(vma->vm_flags & VM_SHARED))
341 vma->vm_private_data = (void *)0;
342}
343
344/* Returns true if the VMA has associated reserve pages */
345static int vma_has_reserves(struct vm_area_struct *vma)
346{
347 if (vma->vm_flags & VM_SHARED)
348 return 1;
349 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
350 return 1;
351 return 0;
352}
353
354static void clear_huge_page(struct page *page,
355 unsigned long addr, unsigned long sz)
44{ 356{
45 int i; 357 int i;
46 358
47 might_sleep(); 359 might_sleep();
48 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { 360 for (i = 0; i < sz/PAGE_SIZE; i++) {
49 cond_resched(); 361 cond_resched();
50 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 362 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
51 } 363 }
@@ -55,42 +367,44 @@ static void copy_huge_page(struct page *dst, struct page *src,
55 unsigned long addr, struct vm_area_struct *vma) 367 unsigned long addr, struct vm_area_struct *vma)
56{ 368{
57 int i; 369 int i;
370 struct hstate *h = hstate_vma(vma);
58 371
59 might_sleep(); 372 might_sleep();
60 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { 373 for (i = 0; i < pages_per_huge_page(h); i++) {
61 cond_resched(); 374 cond_resched();
62 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 375 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
63 } 376 }
64} 377}
65 378
66static void enqueue_huge_page(struct page *page) 379static void enqueue_huge_page(struct hstate *h, struct page *page)
67{ 380{
68 int nid = page_to_nid(page); 381 int nid = page_to_nid(page);
69 list_add(&page->lru, &hugepage_freelists[nid]); 382 list_add(&page->lru, &h->hugepage_freelists[nid]);
70 free_huge_pages++; 383 h->free_huge_pages++;
71 free_huge_pages_node[nid]++; 384 h->free_huge_pages_node[nid]++;
72} 385}
73 386
74static struct page *dequeue_huge_page(void) 387static struct page *dequeue_huge_page(struct hstate *h)
75{ 388{
76 int nid; 389 int nid;
77 struct page *page = NULL; 390 struct page *page = NULL;
78 391
79 for (nid = 0; nid < MAX_NUMNODES; ++nid) { 392 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
80 if (!list_empty(&hugepage_freelists[nid])) { 393 if (!list_empty(&h->hugepage_freelists[nid])) {
81 page = list_entry(hugepage_freelists[nid].next, 394 page = list_entry(h->hugepage_freelists[nid].next,
82 struct page, lru); 395 struct page, lru);
83 list_del(&page->lru); 396 list_del(&page->lru);
84 free_huge_pages--; 397 h->free_huge_pages--;
85 free_huge_pages_node[nid]--; 398 h->free_huge_pages_node[nid]--;
86 break; 399 break;
87 } 400 }
88 } 401 }
89 return page; 402 return page;
90} 403}
91 404
92static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, 405static struct page *dequeue_huge_page_vma(struct hstate *h,
93 unsigned long address) 406 struct vm_area_struct *vma,
407 unsigned long address, int avoid_reserve)
94{ 408{
95 int nid; 409 int nid;
96 struct page *page = NULL; 410 struct page *page = NULL;
@@ -101,18 +415,33 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
101 struct zone *zone; 415 struct zone *zone;
102 struct zoneref *z; 416 struct zoneref *z;
103 417
418 /*
419 * A child process with MAP_PRIVATE mappings created by its parent
420 * has no page reserves. This check ensures that reservations are
421 * not "stolen". The child may still get SIGKILLed.
422 */
423 if (!vma_has_reserves(vma) &&
424 h->free_huge_pages - h->resv_huge_pages == 0)
425 return NULL;
426
427 /* If reserves cannot be used, ensure enough pages are in the pool */
428 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
429 return NULL;
430
104 for_each_zone_zonelist_nodemask(zone, z, zonelist, 431 for_each_zone_zonelist_nodemask(zone, z, zonelist,
105 MAX_NR_ZONES - 1, nodemask) { 432 MAX_NR_ZONES - 1, nodemask) {
106 nid = zone_to_nid(zone); 433 nid = zone_to_nid(zone);
107 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 434 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
108 !list_empty(&hugepage_freelists[nid])) { 435 !list_empty(&h->hugepage_freelists[nid])) {
109 page = list_entry(hugepage_freelists[nid].next, 436 page = list_entry(h->hugepage_freelists[nid].next,
110 struct page, lru); 437 struct page, lru);
111 list_del(&page->lru); 438 list_del(&page->lru);
112 free_huge_pages--; 439 h->free_huge_pages--;
113 free_huge_pages_node[nid]--; 440 h->free_huge_pages_node[nid]--;
114 if (vma && vma->vm_flags & VM_MAYSHARE) 441
115 resv_huge_pages--; 442 if (!avoid_reserve)
443 decrement_hugepage_resv_vma(h, vma);
444
116 break; 445 break;
117 } 446 }
118 } 447 }
@@ -120,12 +449,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
120 return page; 449 return page;
121} 450}
122 451
123static void update_and_free_page(struct page *page) 452static void update_and_free_page(struct hstate *h, struct page *page)
124{ 453{
125 int i; 454 int i;
126 nr_huge_pages--; 455
127 nr_huge_pages_node[page_to_nid(page)]--; 456 h->nr_huge_pages--;
128 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 457 h->nr_huge_pages_node[page_to_nid(page)]--;
458 for (i = 0; i < pages_per_huge_page(h); i++) {
129 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 459 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
130 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 460 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
131 1 << PG_private | 1<< PG_writeback); 461 1 << PG_private | 1<< PG_writeback);
@@ -133,11 +463,27 @@ static void update_and_free_page(struct page *page)
133 set_compound_page_dtor(page, NULL); 463 set_compound_page_dtor(page, NULL);
134 set_page_refcounted(page); 464 set_page_refcounted(page);
135 arch_release_hugepage(page); 465 arch_release_hugepage(page);
136 __free_pages(page, HUGETLB_PAGE_ORDER); 466 __free_pages(page, huge_page_order(h));
467}
468
469struct hstate *size_to_hstate(unsigned long size)
470{
471 struct hstate *h;
472
473 for_each_hstate(h) {
474 if (huge_page_size(h) == size)
475 return h;
476 }
477 return NULL;
137} 478}
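A trivial usage sketch (assumed, not from this patch) of the lookup above, resolving a byte size back to its hstate.

static void example_report_2mb_pool(void)
{
	struct hstate *h = size_to_hstate(2UL * 1024 * 1024);

	if (h)	/* NULL if no 2 MB hstate was registered */
		printk(KERN_DEBUG "2MB pool: %lu free huge pages\n",
		       h->free_huge_pages);
}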
138 479
139static void free_huge_page(struct page *page) 480static void free_huge_page(struct page *page)
140{ 481{
482 /*
483 * Can't pass hstate in here because it is called from the
484 * compound page destructor.
485 */
486 struct hstate *h = page_hstate(page);
141 int nid = page_to_nid(page); 487 int nid = page_to_nid(page);
142 struct address_space *mapping; 488 struct address_space *mapping;
143 489
@@ -147,12 +493,12 @@ static void free_huge_page(struct page *page)
147 INIT_LIST_HEAD(&page->lru); 493 INIT_LIST_HEAD(&page->lru);
148 494
149 spin_lock(&hugetlb_lock); 495 spin_lock(&hugetlb_lock);
150 if (surplus_huge_pages_node[nid]) { 496 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
151 update_and_free_page(page); 497 update_and_free_page(h, page);
152 surplus_huge_pages--; 498 h->surplus_huge_pages--;
153 surplus_huge_pages_node[nid]--; 499 h->surplus_huge_pages_node[nid]--;
154 } else { 500 } else {
155 enqueue_huge_page(page); 501 enqueue_huge_page(h, page);
156 } 502 }
157 spin_unlock(&hugetlb_lock); 503 spin_unlock(&hugetlb_lock);
158 if (mapping) 504 if (mapping)
@@ -164,7 +510,7 @@ static void free_huge_page(struct page *page)
164 * balanced by operating on them in a round-robin fashion. 510 * balanced by operating on them in a round-robin fashion.
165 * Returns 1 if an adjustment was made. 511 * Returns 1 if an adjustment was made.
166 */ 512 */
167static int adjust_pool_surplus(int delta) 513static int adjust_pool_surplus(struct hstate *h, int delta)
168{ 514{
169 static int prev_nid; 515 static int prev_nid;
170 int nid = prev_nid; 516 int nid = prev_nid;
@@ -177,15 +523,15 @@ static int adjust_pool_surplus(int delta)
177 nid = first_node(node_online_map); 523 nid = first_node(node_online_map);
178 524
179 /* To shrink on this node, there must be a surplus page */ 525 /* To shrink on this node, there must be a surplus page */
180 if (delta < 0 && !surplus_huge_pages_node[nid]) 526 if (delta < 0 && !h->surplus_huge_pages_node[nid])
181 continue; 527 continue;
182 /* Surplus cannot exceed the total number of pages */ 528 /* Surplus cannot exceed the total number of pages */
183 if (delta > 0 && surplus_huge_pages_node[nid] >= 529 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
184 nr_huge_pages_node[nid]) 530 h->nr_huge_pages_node[nid])
185 continue; 531 continue;
186 532
187 surplus_huge_pages += delta; 533 h->surplus_huge_pages += delta;
188 surplus_huge_pages_node[nid] += delta; 534 h->surplus_huge_pages_node[nid] += delta;
189 ret = 1; 535 ret = 1;
190 break; 536 break;
191 } while (nid != prev_nid); 537 } while (nid != prev_nid);
@@ -194,59 +540,74 @@ static int adjust_pool_surplus(int delta)
194 return ret; 540 return ret;
195} 541}
196 542
197static struct page *alloc_fresh_huge_page_node(int nid) 543static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
544{
545 set_compound_page_dtor(page, free_huge_page);
546 spin_lock(&hugetlb_lock);
547 h->nr_huge_pages++;
548 h->nr_huge_pages_node[nid]++;
549 spin_unlock(&hugetlb_lock);
550 put_page(page); /* free it into the hugepage allocator */
551}
552
553static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
198{ 554{
199 struct page *page; 555 struct page *page;
200 556
557 if (h->order >= MAX_ORDER)
558 return NULL;
559
201 page = alloc_pages_node(nid, 560 page = alloc_pages_node(nid,
202 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 561 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
203 __GFP_REPEAT|__GFP_NOWARN, 562 __GFP_REPEAT|__GFP_NOWARN,
204 HUGETLB_PAGE_ORDER); 563 huge_page_order(h));
205 if (page) { 564 if (page) {
206 if (arch_prepare_hugepage(page)) { 565 if (arch_prepare_hugepage(page)) {
207 __free_pages(page, HUGETLB_PAGE_ORDER); 566 __free_pages(page, HUGETLB_PAGE_ORDER);
208 return NULL; 567 return NULL;
209 } 568 }
210 set_compound_page_dtor(page, free_huge_page); 569 prep_new_huge_page(h, page, nid);
211 spin_lock(&hugetlb_lock);
212 nr_huge_pages++;
213 nr_huge_pages_node[nid]++;
214 spin_unlock(&hugetlb_lock);
215 put_page(page); /* free it into the hugepage allocator */
216 } 570 }
217 571
218 return page; 572 return page;
219} 573}
220 574
221static int alloc_fresh_huge_page(void) 575/*
576 * Use a helper variable to find the next node and then
577 * copy it back to hugetlb_next_nid afterwards:
578 * otherwise there's a window in which a racer might
579 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
580 * But we don't need to use a spin_lock here: it really
581 * doesn't matter if occasionally a racer chooses the
582 * same nid as we do. Move nid forward in the mask even
583 * if we just successfully allocated a hugepage so that
584 * the next caller gets hugepages on the next node.
585 */
586static int hstate_next_node(struct hstate *h)
587{
588 int next_nid;
589 next_nid = next_node(h->hugetlb_next_nid, node_online_map);
590 if (next_nid == MAX_NUMNODES)
591 next_nid = first_node(node_online_map);
592 h->hugetlb_next_nid = next_nid;
593 return next_nid;
594}
595
596static int alloc_fresh_huge_page(struct hstate *h)
222{ 597{
223 struct page *page; 598 struct page *page;
224 int start_nid; 599 int start_nid;
225 int next_nid; 600 int next_nid;
226 int ret = 0; 601 int ret = 0;
227 602
228 start_nid = hugetlb_next_nid; 603 start_nid = h->hugetlb_next_nid;
229 604
230 do { 605 do {
231 page = alloc_fresh_huge_page_node(hugetlb_next_nid); 606 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
232 if (page) 607 if (page)
233 ret = 1; 608 ret = 1;
234 /* 609 next_nid = hstate_next_node(h);
235 * Use a helper variable to find the next node and then 610 } while (!page && h->hugetlb_next_nid != start_nid);
236 * copy it back to hugetlb_next_nid afterwards:
237 * otherwise there's a window in which a racer might
238 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
239 * But we don't need to use a spin_lock here: it really
240 * doesn't matter if occasionally a racer chooses the
241 * same nid as we do. Move nid forward in the mask even
242 * if we just successfully allocated a hugepage so that
243 * the next caller gets hugepages on the next node.
244 */
245 next_nid = next_node(hugetlb_next_nid, node_online_map);
246 if (next_nid == MAX_NUMNODES)
247 next_nid = first_node(node_online_map);
248 hugetlb_next_nid = next_nid;
249 } while (!page && hugetlb_next_nid != start_nid);
250 611
251 if (ret) 612 if (ret)
252 count_vm_event(HTLB_BUDDY_PGALLOC); 613 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -256,12 +617,15 @@ static int alloc_fresh_huge_page(void)
256 return ret; 617 return ret;
257} 618}
258 619
259static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, 620static struct page *alloc_buddy_huge_page(struct hstate *h,
260 unsigned long address) 621 struct vm_area_struct *vma, unsigned long address)
261{ 622{
262 struct page *page; 623 struct page *page;
263 unsigned int nid; 624 unsigned int nid;
264 625
626 if (h->order >= MAX_ORDER)
627 return NULL;
628
265 /* 629 /*
266 * Assume we will successfully allocate the surplus page to 630 * Assume we will successfully allocate the surplus page to
267 * prevent racing processes from causing the surplus to exceed 631 * prevent racing processes from causing the surplus to exceed
@@ -286,18 +650,18 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
286 * per-node value is checked there. 650 * per-node value is checked there.
287 */ 651 */
288 spin_lock(&hugetlb_lock); 652 spin_lock(&hugetlb_lock);
289 if (surplus_huge_pages >= nr_overcommit_huge_pages) { 653 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
290 spin_unlock(&hugetlb_lock); 654 spin_unlock(&hugetlb_lock);
291 return NULL; 655 return NULL;
292 } else { 656 } else {
293 nr_huge_pages++; 657 h->nr_huge_pages++;
294 surplus_huge_pages++; 658 h->surplus_huge_pages++;
295 } 659 }
296 spin_unlock(&hugetlb_lock); 660 spin_unlock(&hugetlb_lock);
297 661
298 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 662 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
299 __GFP_REPEAT|__GFP_NOWARN, 663 __GFP_REPEAT|__GFP_NOWARN,
300 HUGETLB_PAGE_ORDER); 664 huge_page_order(h));
301 665
302 spin_lock(&hugetlb_lock); 666 spin_lock(&hugetlb_lock);
303 if (page) { 667 if (page) {
@@ -312,12 +676,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
312 /* 676 /*
313 * We incremented the global counters already 677 * We incremented the global counters already
314 */ 678 */
315 nr_huge_pages_node[nid]++; 679 h->nr_huge_pages_node[nid]++;
316 surplus_huge_pages_node[nid]++; 680 h->surplus_huge_pages_node[nid]++;
317 __count_vm_event(HTLB_BUDDY_PGALLOC); 681 __count_vm_event(HTLB_BUDDY_PGALLOC);
318 } else { 682 } else {
319 nr_huge_pages--; 683 h->nr_huge_pages--;
320 surplus_huge_pages--; 684 h->surplus_huge_pages--;
321 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 685 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
322 } 686 }
323 spin_unlock(&hugetlb_lock); 687 spin_unlock(&hugetlb_lock);
@@ -329,16 +693,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
329 * Increase the hugetlb pool such that it can accommodate a reservation 693 * Increase the hugetlb pool such that it can accommodate a reservation
330 * of size 'delta'. 694 * of size 'delta'.
331 */ 695 */
332static int gather_surplus_pages(int delta) 696static int gather_surplus_pages(struct hstate *h, int delta)
333{ 697{
334 struct list_head surplus_list; 698 struct list_head surplus_list;
335 struct page *page, *tmp; 699 struct page *page, *tmp;
336 int ret, i; 700 int ret, i;
337 int needed, allocated; 701 int needed, allocated;
338 702
339 needed = (resv_huge_pages + delta) - free_huge_pages; 703 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
340 if (needed <= 0) { 704 if (needed <= 0) {
341 resv_huge_pages += delta; 705 h->resv_huge_pages += delta;
342 return 0; 706 return 0;
343 } 707 }
344 708
@@ -349,7 +713,7 @@ static int gather_surplus_pages(int delta)
349retry: 713retry:
350 spin_unlock(&hugetlb_lock); 714 spin_unlock(&hugetlb_lock);
351 for (i = 0; i < needed; i++) { 715 for (i = 0; i < needed; i++) {
352 page = alloc_buddy_huge_page(NULL, 0); 716 page = alloc_buddy_huge_page(h, NULL, 0);
353 if (!page) { 717 if (!page) {
354 /* 718 /*
355 * We were not able to allocate enough pages to 719 * We were not able to allocate enough pages to
@@ -370,7 +734,8 @@ retry:
370 * because either resv_huge_pages or free_huge_pages may have changed. 734 * because either resv_huge_pages or free_huge_pages may have changed.
371 */ 735 */
372 spin_lock(&hugetlb_lock); 736 spin_lock(&hugetlb_lock);
373 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); 737 needed = (h->resv_huge_pages + delta) -
738 (h->free_huge_pages + allocated);
374 if (needed > 0) 739 if (needed > 0)
375 goto retry; 740 goto retry;
376 741
@@ -383,7 +748,7 @@ retry:
383 * before they are reserved. 748 * before they are reserved.
384 */ 749 */
385 needed += allocated; 750 needed += allocated;
386 resv_huge_pages += delta; 751 h->resv_huge_pages += delta;
387 ret = 0; 752 ret = 0;
388free: 753free:
389 /* Free the needed pages to the hugetlb pool */ 754 /* Free the needed pages to the hugetlb pool */
@@ -391,7 +756,7 @@ free:
391 if ((--needed) < 0) 756 if ((--needed) < 0)
392 break; 757 break;
393 list_del(&page->lru); 758 list_del(&page->lru);
394 enqueue_huge_page(page); 759 enqueue_huge_page(h, page);
395 } 760 }
396 761
397 /* Free unnecessary surplus pages to the buddy allocator */ 762 /* Free unnecessary surplus pages to the buddy allocator */
@@ -419,7 +784,8 @@ free:
419 * allocated to satisfy the reservation must be explicitly freed if they were 784 * allocated to satisfy the reservation must be explicitly freed if they were
420 * never used. 785 * never used.
421 */ 786 */
422static void return_unused_surplus_pages(unsigned long unused_resv_pages) 787static void return_unused_surplus_pages(struct hstate *h,
788 unsigned long unused_resv_pages)
423{ 789{
424 static int nid = -1; 790 static int nid = -1;
425 struct page *page; 791 struct page *page;
@@ -434,157 +800,269 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
434 unsigned long remaining_iterations = num_online_nodes(); 800 unsigned long remaining_iterations = num_online_nodes();
435 801
436 /* Uncommit the reservation */ 802 /* Uncommit the reservation */
437 resv_huge_pages -= unused_resv_pages; 803 h->resv_huge_pages -= unused_resv_pages;
804
805 /* Cannot return gigantic pages currently */
806 if (h->order >= MAX_ORDER)
807 return;
438 808
439 nr_pages = min(unused_resv_pages, surplus_huge_pages); 809 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
440 810
441 while (remaining_iterations-- && nr_pages) { 811 while (remaining_iterations-- && nr_pages) {
442 nid = next_node(nid, node_online_map); 812 nid = next_node(nid, node_online_map);
443 if (nid == MAX_NUMNODES) 813 if (nid == MAX_NUMNODES)
444 nid = first_node(node_online_map); 814 nid = first_node(node_online_map);
445 815
446 if (!surplus_huge_pages_node[nid]) 816 if (!h->surplus_huge_pages_node[nid])
447 continue; 817 continue;
448 818
449 if (!list_empty(&hugepage_freelists[nid])) { 819 if (!list_empty(&h->hugepage_freelists[nid])) {
450 page = list_entry(hugepage_freelists[nid].next, 820 page = list_entry(h->hugepage_freelists[nid].next,
451 struct page, lru); 821 struct page, lru);
452 list_del(&page->lru); 822 list_del(&page->lru);
453 update_and_free_page(page); 823 update_and_free_page(h, page);
454 free_huge_pages--; 824 h->free_huge_pages--;
455 free_huge_pages_node[nid]--; 825 h->free_huge_pages_node[nid]--;
456 surplus_huge_pages--; 826 h->surplus_huge_pages--;
457 surplus_huge_pages_node[nid]--; 827 h->surplus_huge_pages_node[nid]--;
458 nr_pages--; 828 nr_pages--;
459 remaining_iterations = num_online_nodes(); 829 remaining_iterations = num_online_nodes();
460 } 830 }
461 } 831 }
462} 832}
463 833
834/*
835 * Determine if the huge page at addr within the vma has an associated
836 * reservation. Where it does not, we will need to logically increase the
837 * reservation and actually increase quota before an allocation can occur.
838 * Where any new reservation would be required the reservation change is
839 * prepared, but not committed. Once the page has been quota'd, allocated
840 * and instantiated, the change should be committed via vma_commit_reservation.
841 * No action is required on failure.
842 */
843static int vma_needs_reservation(struct hstate *h,
844 struct vm_area_struct *vma, unsigned long addr)
845{
846 struct address_space *mapping = vma->vm_file->f_mapping;
847 struct inode *inode = mapping->host;
848
849 if (vma->vm_flags & VM_SHARED) {
850 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
851 return region_chg(&inode->i_mapping->private_list,
852 idx, idx + 1);
464 853
465static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, 854 } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
466 unsigned long addr) 855 return 1;
856
857 } else {
858 int err;
859 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
860 struct resv_map *reservations = vma_resv_map(vma);
861
862 err = region_chg(&reservations->regions, idx, idx + 1);
863 if (err < 0)
864 return err;
865 return 0;
866 }
867}
868static void vma_commit_reservation(struct hstate *h,
869 struct vm_area_struct *vma, unsigned long addr)
467{ 870{
468 struct page *page; 871 struct address_space *mapping = vma->vm_file->f_mapping;
872 struct inode *inode = mapping->host;
469 873
470 spin_lock(&hugetlb_lock); 874 if (vma->vm_flags & VM_SHARED) {
471 page = dequeue_huge_page_vma(vma, addr); 875 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
472 spin_unlock(&hugetlb_lock); 876 region_add(&inode->i_mapping->private_list, idx, idx + 1);
473 return page ? page : ERR_PTR(-VM_FAULT_OOM); 877
878 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
879 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
880 struct resv_map *reservations = vma_resv_map(vma);
881
882 /* Mark this page used in the map. */
883 region_add(&reservations->regions, idx, idx + 1);
884 }
474} 885}
475 886
476static struct page *alloc_huge_page_private(struct vm_area_struct *vma, 887static struct page *alloc_huge_page(struct vm_area_struct *vma,
477 unsigned long addr) 888 unsigned long addr, int avoid_reserve)
478{ 889{
479 struct page *page = NULL; 890 struct hstate *h = hstate_vma(vma);
891 struct page *page;
892 struct address_space *mapping = vma->vm_file->f_mapping;
893 struct inode *inode = mapping->host;
894 long chg;
480 895
481 if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) 896 /*
482 return ERR_PTR(-VM_FAULT_SIGBUS); 897 * Processes that did not create the mapping will have no reserves and
898 * will not have accounted against quota. Check that the quota can be
899 * made before satisfying the allocation
900 * MAP_NORESERVE mappings may also need pages and quota allocated
901 * if no reserve mapping overlaps.
902 */
903 chg = vma_needs_reservation(h, vma, addr);
904 if (chg < 0)
905 return ERR_PTR(chg);
906 if (chg)
907 if (hugetlb_get_quota(inode->i_mapping, chg))
908 return ERR_PTR(-ENOSPC);
483 909
484 spin_lock(&hugetlb_lock); 910 spin_lock(&hugetlb_lock);
485 if (free_huge_pages > resv_huge_pages) 911 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
486 page = dequeue_huge_page_vma(vma, addr);
487 spin_unlock(&hugetlb_lock); 912 spin_unlock(&hugetlb_lock);
913
488 if (!page) { 914 if (!page) {
489 page = alloc_buddy_huge_page(vma, addr); 915 page = alloc_buddy_huge_page(h, vma, addr);
490 if (!page) { 916 if (!page) {
491 hugetlb_put_quota(vma->vm_file->f_mapping, 1); 917 hugetlb_put_quota(inode->i_mapping, chg);
492 return ERR_PTR(-VM_FAULT_OOM); 918 return ERR_PTR(-VM_FAULT_OOM);
493 } 919 }
494 } 920 }
921
922 set_page_refcounted(page);
923 set_page_private(page, (unsigned long) mapping);
924
925 vma_commit_reservation(h, vma, addr);
926
495 return page; 927 return page;
496} 928}
497 929
498static struct page *alloc_huge_page(struct vm_area_struct *vma, 930__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
499 unsigned long addr)
500{ 931{
501 struct page *page; 932 struct huge_bootmem_page *m;
502 struct address_space *mapping = vma->vm_file->f_mapping; 933 int nr_nodes = nodes_weight(node_online_map);
503 934
504 if (vma->vm_flags & VM_MAYSHARE) 935 while (nr_nodes) {
505 page = alloc_huge_page_shared(vma, addr); 936 void *addr;
506 else 937
507 page = alloc_huge_page_private(vma, addr); 938 addr = __alloc_bootmem_node_nopanic(
939 NODE_DATA(h->hugetlb_next_nid),
940 huge_page_size(h), huge_page_size(h), 0);
508 941
509 if (!IS_ERR(page)) { 942 if (addr) {
510 set_page_refcounted(page); 943 /*
511 set_page_private(page, (unsigned long) mapping); 944 * Use the beginning of the huge page to store the
945 * huge_bootmem_page struct (until gather_bootmem
946 * puts them into the mem_map).
947 */
948 m = addr;
949 if (m)
950 goto found;
951 }
952 hstate_next_node(h);
953 nr_nodes--;
512 } 954 }
513 return page; 955 return 0;
956
957found:
958 BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
959 /* Put them into a private list first because mem_map is not up yet */
960 list_add(&m->list, &huge_boot_pages);
961 m->hstate = h;
962 return 1;
514} 963}
515 964
516static int __init hugetlb_init(void) 965/* Put bootmem huge pages into the standard lists after mem_map is up */
966static void __init gather_bootmem_prealloc(void)
517{ 967{
518 unsigned long i; 968 struct huge_bootmem_page *m;
519 969
520 if (HPAGE_SHIFT == 0) 970 list_for_each_entry(m, &huge_boot_pages, list) {
521 return 0; 971 struct page *page = virt_to_page(m);
522 972 struct hstate *h = m->hstate;
523 for (i = 0; i < MAX_NUMNODES; ++i) 973 __ClearPageReserved(page);
524 INIT_LIST_HEAD(&hugepage_freelists[i]); 974 WARN_ON(page_count(page) != 1);
975 prep_compound_page(page, h->order);
976 prep_new_huge_page(h, page, page_to_nid(page));
977 }
978}
525 979
526 hugetlb_next_nid = first_node(node_online_map); 980static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
981{
982 unsigned long i;
527 983
528 for (i = 0; i < max_huge_pages; ++i) { 984 for (i = 0; i < h->max_huge_pages; ++i) {
529 if (!alloc_fresh_huge_page()) 985 if (h->order >= MAX_ORDER) {
986 if (!alloc_bootmem_huge_page(h))
987 break;
988 } else if (!alloc_fresh_huge_page(h))
530 break; 989 break;
531 } 990 }
532 max_huge_pages = free_huge_pages = nr_huge_pages = i; 991 h->max_huge_pages = i;
533 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
534 return 0;
535} 992}
536module_init(hugetlb_init);
537 993
538static int __init hugetlb_setup(char *s) 994static void __init hugetlb_init_hstates(void)
539{ 995{
540 if (sscanf(s, "%lu", &max_huge_pages) <= 0) 996 struct hstate *h;
541 max_huge_pages = 0; 997
542 return 1; 998 for_each_hstate(h) {
999 /* oversize hugepages were init'ed in early boot */
1000 if (h->order < MAX_ORDER)
1001 hugetlb_hstate_alloc_pages(h);
1002 }
543} 1003}
544__setup("hugepages=", hugetlb_setup);
545 1004
546static unsigned int cpuset_mems_nr(unsigned int *array) 1005static char * __init memfmt(char *buf, unsigned long n)
547{ 1006{
548 int node; 1007 if (n >= (1UL << 30))
549 unsigned int nr = 0; 1008 sprintf(buf, "%lu GB", n >> 30);
550 1009 else if (n >= (1UL << 20))
551 for_each_node_mask(node, cpuset_current_mems_allowed) 1010 sprintf(buf, "%lu MB", n >> 20);
552 nr += array[node]; 1011 else
1012 sprintf(buf, "%lu KB", n >> 10);
1013 return buf;
1014}
553 1015
554 return nr; 1016static void __init report_hugepages(void)
1017{
1018 struct hstate *h;
1019
1020 for_each_hstate(h) {
1021 char buf[32];
1022 printk(KERN_INFO "HugeTLB registered %s page size, "
1023 "pre-allocated %ld pages\n",
1024 memfmt(buf, huge_page_size(h)),
1025 h->free_huge_pages);
1026 }
555} 1027}
556 1028
557#ifdef CONFIG_SYSCTL
558#ifdef CONFIG_HIGHMEM 1029#ifdef CONFIG_HIGHMEM
559static void try_to_free_low(unsigned long count) 1030static void try_to_free_low(struct hstate *h, unsigned long count)
560{ 1031{
561 int i; 1032 int i;
562 1033
1034 if (h->order >= MAX_ORDER)
1035 return;
1036
563 for (i = 0; i < MAX_NUMNODES; ++i) { 1037 for (i = 0; i < MAX_NUMNODES; ++i) {
564 struct page *page, *next; 1038 struct page *page, *next;
565 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 1039 struct list_head *freel = &h->hugepage_freelists[i];
566 if (count >= nr_huge_pages) 1040 list_for_each_entry_safe(page, next, freel, lru) {
1041 if (count >= h->nr_huge_pages)
567 return; 1042 return;
568 if (PageHighMem(page)) 1043 if (PageHighMem(page))
569 continue; 1044 continue;
570 list_del(&page->lru); 1045 list_del(&page->lru);
571 update_and_free_page(page); 1046 update_and_free_page(h, page);
572 free_huge_pages--; 1047 h->free_huge_pages--;
573 free_huge_pages_node[page_to_nid(page)]--; 1048 h->free_huge_pages_node[page_to_nid(page)]--;
574 } 1049 }
575 } 1050 }
576} 1051}
577#else 1052#else
578static inline void try_to_free_low(unsigned long count) 1053static inline void try_to_free_low(struct hstate *h, unsigned long count)
579{ 1054{
580} 1055}
581#endif 1056#endif
582 1057
583#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) 1058#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
584static unsigned long set_max_huge_pages(unsigned long count) 1059static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
585{ 1060{
586 unsigned long min_count, ret; 1061 unsigned long min_count, ret;
587 1062
1063 if (h->order >= MAX_ORDER)
1064 return h->max_huge_pages;
1065
588 /* 1066 /*
589 * Increase the pool size 1067 * Increase the pool size
590 * First take pages out of surplus state. Then make up the 1068 * First take pages out of surplus state. Then make up the
@@ -597,20 +1075,19 @@ static unsigned long set_max_huge_pages(unsigned long count)
597 * within all the constraints specified by the sysctls. 1075 * within all the constraints specified by the sysctls.
598 */ 1076 */
599 spin_lock(&hugetlb_lock); 1077 spin_lock(&hugetlb_lock);
600 while (surplus_huge_pages && count > persistent_huge_pages) { 1078 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
601 if (!adjust_pool_surplus(-1)) 1079 if (!adjust_pool_surplus(h, -1))
602 break; 1080 break;
603 } 1081 }
604 1082
605 while (count > persistent_huge_pages) { 1083 while (count > persistent_huge_pages(h)) {
606 int ret;
607 /* 1084 /*
608 * If this allocation races such that we no longer need the 1085 * If this allocation races such that we no longer need the
609 * page, free_huge_page will handle it by freeing the page 1086 * page, free_huge_page will handle it by freeing the page
610 * and reducing the surplus. 1087 * and reducing the surplus.
611 */ 1088 */
612 spin_unlock(&hugetlb_lock); 1089 spin_unlock(&hugetlb_lock);
613 ret = alloc_fresh_huge_page(); 1090 ret = alloc_fresh_huge_page(h);
614 spin_lock(&hugetlb_lock); 1091 spin_lock(&hugetlb_lock);
615 if (!ret) 1092 if (!ret)
616 goto out; 1093 goto out;
@@ -632,31 +1109,300 @@ static unsigned long set_max_huge_pages(unsigned long count)
632 * and won't grow the pool anywhere else. Not until one of the 1109 * and won't grow the pool anywhere else. Not until one of the
633 * sysctls are changed, or the surplus pages go out of use. 1110 * sysctls are changed, or the surplus pages go out of use.
634 */ 1111 */
635 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; 1112 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
636 min_count = max(count, min_count); 1113 min_count = max(count, min_count);
637 try_to_free_low(min_count); 1114 try_to_free_low(h, min_count);
638 while (min_count < persistent_huge_pages) { 1115 while (min_count < persistent_huge_pages(h)) {
639 struct page *page = dequeue_huge_page(); 1116 struct page *page = dequeue_huge_page(h);
640 if (!page) 1117 if (!page)
641 break; 1118 break;
642 update_and_free_page(page); 1119 update_and_free_page(h, page);
643 } 1120 }
644 while (count < persistent_huge_pages) { 1121 while (count < persistent_huge_pages(h)) {
645 if (!adjust_pool_surplus(1)) 1122 if (!adjust_pool_surplus(h, 1))
646 break; 1123 break;
647 } 1124 }
648out: 1125out:
649 ret = persistent_huge_pages; 1126 ret = persistent_huge_pages(h);
650 spin_unlock(&hugetlb_lock); 1127 spin_unlock(&hugetlb_lock);
651 return ret; 1128 return ret;
652} 1129}
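A worked example of the shrink path above, with made-up numbers: nr_huge_pages = 10, free_huge_pages = 4, resv_huge_pages = 1, surplus_huge_pages = 0, and a requested count of 2.

/*
 *   min_count = resv + nr - free = 1 + 10 - 4 = 7, max(count, min_count) = 7
 *
 *   The free loop runs while 7 < persistent_huge_pages (initially 10), so
 *   three of the four free pages go back to the buddy allocator and one is
 *   kept to back the outstanding reservation.  The remaining excess
 *   (persistent 7 vs. requested 2) is converted to surplus one page at a
 *   time by adjust_pool_surplus(h, 1) and is returned lazily as the in-use
 *   pages are released through free_huge_page().
 */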
653 1130
1131#define HSTATE_ATTR_RO(_name) \
1132 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1133
1134#define HSTATE_ATTR(_name) \
1135 static struct kobj_attribute _name##_attr = \
1136 __ATTR(_name, 0644, _name##_show, _name##_store)
1137
1138static struct kobject *hugepages_kobj;
1139static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1140
1141static struct hstate *kobj_to_hstate(struct kobject *kobj)
1142{
1143 int i;
1144 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1145 if (hstate_kobjs[i] == kobj)
1146 return &hstates[i];
1147 BUG();
1148 return NULL;
1149}
1150
1151static ssize_t nr_hugepages_show(struct kobject *kobj,
1152 struct kobj_attribute *attr, char *buf)
1153{
1154 struct hstate *h = kobj_to_hstate(kobj);
1155 return sprintf(buf, "%lu\n", h->nr_huge_pages);
1156}
1157static ssize_t nr_hugepages_store(struct kobject *kobj,
1158 struct kobj_attribute *attr, const char *buf, size_t count)
1159{
1160 int err;
1161 unsigned long input;
1162 struct hstate *h = kobj_to_hstate(kobj);
1163
1164 err = strict_strtoul(buf, 10, &input);
1165 if (err)
1166 return 0;
1167
1168 h->max_huge_pages = set_max_huge_pages(h, input);
1169
1170 return count;
1171}
1172HSTATE_ATTR(nr_hugepages);
1173
1174static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1175 struct kobj_attribute *attr, char *buf)
1176{
1177 struct hstate *h = kobj_to_hstate(kobj);
1178 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1179}
1180static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1181 struct kobj_attribute *attr, const char *buf, size_t count)
1182{
1183 int err;
1184 unsigned long input;
1185 struct hstate *h = kobj_to_hstate(kobj);
1186
1187 err = strict_strtoul(buf, 10, &input);
1188 if (err)
1189 return 0;
1190
1191 spin_lock(&hugetlb_lock);
1192 h->nr_overcommit_huge_pages = input;
1193 spin_unlock(&hugetlb_lock);
1194
1195 return count;
1196}
1197HSTATE_ATTR(nr_overcommit_hugepages);
1198
1199static ssize_t free_hugepages_show(struct kobject *kobj,
1200 struct kobj_attribute *attr, char *buf)
1201{
1202 struct hstate *h = kobj_to_hstate(kobj);
1203 return sprintf(buf, "%lu\n", h->free_huge_pages);
1204}
1205HSTATE_ATTR_RO(free_hugepages);
1206
1207static ssize_t resv_hugepages_show(struct kobject *kobj,
1208 struct kobj_attribute *attr, char *buf)
1209{
1210 struct hstate *h = kobj_to_hstate(kobj);
1211 return sprintf(buf, "%lu\n", h->resv_huge_pages);
1212}
1213HSTATE_ATTR_RO(resv_hugepages);
1214
1215static ssize_t surplus_hugepages_show(struct kobject *kobj,
1216 struct kobj_attribute *attr, char *buf)
1217{
1218 struct hstate *h = kobj_to_hstate(kobj);
1219 return sprintf(buf, "%lu\n", h->surplus_huge_pages);
1220}
1221HSTATE_ATTR_RO(surplus_hugepages);
1222
1223static struct attribute *hstate_attrs[] = {
1224 &nr_hugepages_attr.attr,
1225 &nr_overcommit_hugepages_attr.attr,
1226 &free_hugepages_attr.attr,
1227 &resv_hugepages_attr.attr,
1228 &surplus_hugepages_attr.attr,
1229 NULL,
1230};
1231
1232static struct attribute_group hstate_attr_group = {
1233 .attrs = hstate_attrs,
1234};
1235
1236static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
1237{
1238 int retval;
1239
1240 hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
1241 hugepages_kobj);
1242 if (!hstate_kobjs[h - hstates])
1243 return -ENOMEM;
1244
1245 retval = sysfs_create_group(hstate_kobjs[h - hstates],
1246 &hstate_attr_group);
1247 if (retval)
1248 kobject_put(hstate_kobjs[h - hstates]);
1249
1250 return retval;
1251}
1252
1253static void __init hugetlb_sysfs_init(void)
1254{
1255 struct hstate *h;
1256 int err;
1257
1258 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1259 if (!hugepages_kobj)
1260 return;
1261
1262 for_each_hstate(h) {
1263 err = hugetlb_sysfs_add_hstate(h);
1264 if (err)
1265 printk(KERN_ERR "Hugetlb: Unable to add hstate %s\n",
1266 h->name);
1267 }
1268}
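A userspace illustration (not part of this patch) of the per-hstate sysfs interface created above. The path assumes the default 2 MB hstate, whose directory name comes from h->name ("hugepages-2048kB") under the "hugepages" kobject registered below mm_kobj (/sys/kernel/mm).

#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "64\n");	/* ask for 64 persistent 2 MB huge pages */
	return fclose(f) ? 1 : 0;
}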
1269
1270static void __exit hugetlb_exit(void)
1271{
1272 struct hstate *h;
1273
1274 for_each_hstate(h) {
1275 kobject_put(hstate_kobjs[h - hstates]);
1276 }
1277
1278 kobject_put(hugepages_kobj);
1279}
1280module_exit(hugetlb_exit);
1281
1282static int __init hugetlb_init(void)
1283{
1284 BUILD_BUG_ON(HPAGE_SHIFT == 0);
1285
1286 if (!size_to_hstate(default_hstate_size)) {
1287 default_hstate_size = HPAGE_SIZE;
1288 if (!size_to_hstate(default_hstate_size))
1289 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1290 }
1291 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
1292 if (default_hstate_max_huge_pages)
1293 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1294
1295 hugetlb_init_hstates();
1296
1297 gather_bootmem_prealloc();
1298
1299 report_hugepages();
1300
1301 hugetlb_sysfs_init();
1302
1303 return 0;
1304}
1305module_init(hugetlb_init);
1306
1307/* Should be called on processing a hugepagesz=... option */
1308void __init hugetlb_add_hstate(unsigned order)
1309{
1310 struct hstate *h;
1311 unsigned long i;
1312
1313 if (size_to_hstate(PAGE_SIZE << order)) {
1314 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1315 return;
1316 }
1317 BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
1318 BUG_ON(order == 0);
1319 h = &hstates[max_hstate++];
1320 h->order = order;
1321 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1322 h->nr_huge_pages = 0;
1323 h->free_huge_pages = 0;
1324 for (i = 0; i < MAX_NUMNODES; ++i)
1325 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1326 h->hugetlb_next_nid = first_node(node_online_map);
1327 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1328 huge_page_size(h)/1024);
1329
1330 parsed_hstate = h;
1331}
1332
1333static int __init hugetlb_nrpages_setup(char *s)
1334{
1335 unsigned long *mhp;
1336 static unsigned long *last_mhp;
1337
1338 /*
1339 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
1340 * so this hugepages= parameter goes to the "default hstate".
1341 */
1342 if (!max_hstate)
1343 mhp = &default_hstate_max_huge_pages;
1344 else
1345 mhp = &parsed_hstate->max_huge_pages;
1346
1347 if (mhp == last_mhp) {
1348 printk(KERN_WARNING "hugepages= specified twice without "
1349 "interleaving hugepagesz=, ignoring\n");
1350 return 1;
1351 }
1352
1353 if (sscanf(s, "%lu", mhp) <= 0)
1354 *mhp = 0;
1355
1356 /*
1357 * Global state is always initialized later in hugetlb_init.
1358 * But we need to allocate pages for >= MAX_ORDER hstates here early,
1359 * while the bootmem allocator is still usable.
1360 */
1361 if (max_hstate && parsed_hstate->order >= MAX_ORDER)
1362 hugetlb_hstate_alloc_pages(parsed_hstate);
1363
1364 last_mhp = mhp;
1365
1366 return 1;
1367}
1368__setup("hugepages=", hugetlb_nrpages_setup);
1369
1370static int __init hugetlb_default_setup(char *s)
1371{
1372 default_hstate_size = memparse(s, &s);
1373 return 1;
1374}
1375__setup("default_hugepagesz=", hugetlb_default_setup);
1376
1377static unsigned int cpuset_mems_nr(unsigned int *array)
1378{
1379 int node;
1380 unsigned int nr = 0;
1381
1382 for_each_node_mask(node, cpuset_current_mems_allowed)
1383 nr += array[node];
1384
1385 return nr;
1386}
1387
1388#ifdef CONFIG_SYSCTL
654int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1389int hugetlb_sysctl_handler(struct ctl_table *table, int write,
655 struct file *file, void __user *buffer, 1390 struct file *file, void __user *buffer,
656 size_t *length, loff_t *ppos) 1391 size_t *length, loff_t *ppos)
657{ 1392{
1393 struct hstate *h = &default_hstate;
1394 unsigned long tmp;
1395
1396 if (!write)
1397 tmp = h->max_huge_pages;
1398
1399 table->data = &tmp;
1400 table->maxlen = sizeof(unsigned long);
658 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1401 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
659 max_huge_pages = set_max_huge_pages(max_huge_pages); 1402
1403 if (write)
1404 h->max_huge_pages = set_max_huge_pages(h, tmp);
1405
660 return 0; 1406 return 0;
661} 1407}
662 1408
@@ -676,10 +1422,22 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
676 struct file *file, void __user *buffer, 1422 struct file *file, void __user *buffer,
677 size_t *length, loff_t *ppos) 1423 size_t *length, loff_t *ppos)
678{ 1424{
1425 struct hstate *h = &default_hstate;
1426 unsigned long tmp;
1427
1428 if (!write)
1429 tmp = h->nr_overcommit_huge_pages;
1430
1431 table->data = &tmp;
1432 table->maxlen = sizeof(unsigned long);
679 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1433 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
680 spin_lock(&hugetlb_lock); 1434
681 nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; 1435 if (write) {
682 spin_unlock(&hugetlb_lock); 1436 spin_lock(&hugetlb_lock);
1437 h->nr_overcommit_huge_pages = tmp;
1438 spin_unlock(&hugetlb_lock);
1439 }
1440
683 return 0; 1441 return 0;
684} 1442}
685 1443
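
Both handlers now stage the value in a local tmp and, on write, apply it to the default hstate, so the user-visible sysctl files behave as before. A small user-space sketch, assuming the usual procfs paths /proc/sys/vm/nr_hugepages and /proc/sys/vm/nr_overcommit_hugepages:

#include <stdio.h>

/* Write an unsigned long to a sysctl file; returns 0 on success. */
static int sysctl_write_ulong(const char *path, unsigned long val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%lu\n", val);
	return fclose(f);
}

int main(void)
{
	/* Both knobs operate on the default hstate after this patch. */
	sysctl_write_ulong("/proc/sys/vm/nr_hugepages", 128);
	sysctl_write_ulong("/proc/sys/vm/nr_overcommit_hugepages", 32);
	return 0;
}
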
@@ -687,34 +1445,118 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
687 1445
688int hugetlb_report_meminfo(char *buf) 1446int hugetlb_report_meminfo(char *buf)
689{ 1447{
1448 struct hstate *h = &default_hstate;
690 return sprintf(buf, 1449 return sprintf(buf,
691 "HugePages_Total: %5lu\n" 1450 "HugePages_Total: %5lu\n"
692 "HugePages_Free: %5lu\n" 1451 "HugePages_Free: %5lu\n"
693 "HugePages_Rsvd: %5lu\n" 1452 "HugePages_Rsvd: %5lu\n"
694 "HugePages_Surp: %5lu\n" 1453 "HugePages_Surp: %5lu\n"
695 "Hugepagesize: %5lu kB\n", 1454 "Hugepagesize: %5lu kB\n",
696 nr_huge_pages, 1455 h->nr_huge_pages,
697 free_huge_pages, 1456 h->free_huge_pages,
698 resv_huge_pages, 1457 h->resv_huge_pages,
699 surplus_huge_pages, 1458 h->surplus_huge_pages,
700 HPAGE_SIZE/1024); 1459 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
701} 1460}
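
hugetlb_report_meminfo() keeps the /proc/meminfo field names but now derives Hugepagesize from the default hstate's order rather than the HPAGE_SIZE constant. A short sketch of consuming that output; the field names are taken from the sprintf() above, everything else is illustrative.

#include <stdio.h>

int main(void)
{
	char line[256];
	unsigned long val;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "HugePages_Free: %lu", &val) == 1)
			printf("free huge pages: %lu\n", val);
		else if (sscanf(line, "Hugepagesize: %lu kB", &val) == 1)
			printf("default huge page size: %lu kB\n", val);
	}
	fclose(f);
	return 0;
}
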
702 1461
703int hugetlb_report_node_meminfo(int nid, char *buf) 1462int hugetlb_report_node_meminfo(int nid, char *buf)
704{ 1463{
1464 struct hstate *h = &default_hstate;
705 return sprintf(buf, 1465 return sprintf(buf,
706 "Node %d HugePages_Total: %5u\n" 1466 "Node %d HugePages_Total: %5u\n"
707 "Node %d HugePages_Free: %5u\n" 1467 "Node %d HugePages_Free: %5u\n"
708 "Node %d HugePages_Surp: %5u\n", 1468 "Node %d HugePages_Surp: %5u\n",
709 nid, nr_huge_pages_node[nid], 1469 nid, h->nr_huge_pages_node[nid],
710 nid, free_huge_pages_node[nid], 1470 nid, h->free_huge_pages_node[nid],
711 nid, surplus_huge_pages_node[nid]); 1471 nid, h->surplus_huge_pages_node[nid]);
712} 1472}
713 1473
714/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 1474/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
715unsigned long hugetlb_total_pages(void) 1475unsigned long hugetlb_total_pages(void)
716{ 1476{
717 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); 1477 struct hstate *h = &default_hstate;
1478 return h->nr_huge_pages * pages_per_huge_page(h);
1479}
1480
1481static int hugetlb_acct_memory(struct hstate *h, long delta)
1482{
1483 int ret = -ENOMEM;
1484
1485 spin_lock(&hugetlb_lock);
1486 /*
1487 * When cpuset is configured, it breaks the strict hugetlb page
1488 * reservation as the accounting is done on a global variable. Such
1489 * reservation is completely rubbish in the presence of cpuset because
1490 * the reservation is not checked against page availability for the
 1491	 * current cpuset. An application can still be OOM-killed by the
 1492	 * kernel for lack of free htlb pages in the cpuset the task is in.
 1493	 * Attempting to enforce strict accounting with cpusets is almost
 1494	 * impossible (or too ugly) because cpusets are so fluid that a
 1495	 * task or memory node can be dynamically moved between cpusets.
 1496	 *
 1497	 * The change of semantics for shared hugetlb mappings with cpuset is
 1498	 * undesirable. However, in order to preserve some of the semantics,
 1499	 * we fall back to checking against current free page availability as
 1500	 * a best-effort attempt, hopefully minimizing the impact of the
 1501	 * semantics change that cpuset brings.
1502 */
1503 if (delta > 0) {
1504 if (gather_surplus_pages(h, delta) < 0)
1505 goto out;
1506
1507 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
1508 return_unused_surplus_pages(h, delta);
1509 goto out;
1510 }
1511 }
1512
1513 ret = 0;
1514 if (delta < 0)
1515 return_unused_surplus_pages(h, (unsigned long) -delta);
1516
1517out:
1518 spin_unlock(&hugetlb_lock);
1519 return ret;
1520}
1521
1522static void hugetlb_vm_op_open(struct vm_area_struct *vma)
1523{
1524 struct resv_map *reservations = vma_resv_map(vma);
1525
1526 /*
 1527	 * This new VMA should share its sibling's reservation map if present.
1528 * The VMA will only ever have a valid reservation map pointer where
1529 * it is being copied for another still existing VMA. As that VMA
 1530	 * has a reference to the reservation map it cannot disappear until
1531 * after this open call completes. It is therefore safe to take a
1532 * new reference here without additional locking.
1533 */
1534 if (reservations)
1535 kref_get(&reservations->refs);
1536}
1537
1538static void hugetlb_vm_op_close(struct vm_area_struct *vma)
1539{
1540 struct hstate *h = hstate_vma(vma);
1541 struct resv_map *reservations = vma_resv_map(vma);
1542 unsigned long reserve;
1543 unsigned long start;
1544 unsigned long end;
1545
1546 if (reservations) {
1547 start = vma_hugecache_offset(h, vma, vma->vm_start);
1548 end = vma_hugecache_offset(h, vma, vma->vm_end);
1549
1550 reserve = (end - start) -
1551 region_count(&reservations->regions, start, end);
1552
1553 kref_put(&reservations->refs, resv_map_release);
1554
1555 if (reserve) {
1556 hugetlb_acct_memory(h, -reserve);
1557 hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
1558 }
1559 }
718} 1560}
719 1561
720/* 1562/*
@@ -731,6 +1573,8 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
731 1573
732struct vm_operations_struct hugetlb_vm_ops = { 1574struct vm_operations_struct hugetlb_vm_ops = {
733 .fault = hugetlb_vm_op_fault, 1575 .fault = hugetlb_vm_op_fault,
1576 .open = hugetlb_vm_op_open,
1577 .close = hugetlb_vm_op_close,
734}; 1578};
735 1579
736static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 1580static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -769,14 +1613,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
769 struct page *ptepage; 1613 struct page *ptepage;
770 unsigned long addr; 1614 unsigned long addr;
771 int cow; 1615 int cow;
1616 struct hstate *h = hstate_vma(vma);
1617 unsigned long sz = huge_page_size(h);
772 1618
773 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 1619 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
774 1620
775 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 1621 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
776 src_pte = huge_pte_offset(src, addr); 1622 src_pte = huge_pte_offset(src, addr);
777 if (!src_pte) 1623 if (!src_pte)
778 continue; 1624 continue;
779 dst_pte = huge_pte_alloc(dst, addr); 1625 dst_pte = huge_pte_alloc(dst, addr, sz);
780 if (!dst_pte) 1626 if (!dst_pte)
781 goto nomem; 1627 goto nomem;
782 1628
@@ -804,7 +1650,7 @@ nomem:
804} 1650}
805 1651
806void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1652void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
807 unsigned long end) 1653 unsigned long end, struct page *ref_page)
808{ 1654{
809 struct mm_struct *mm = vma->vm_mm; 1655 struct mm_struct *mm = vma->vm_mm;
810 unsigned long address; 1656 unsigned long address;
@@ -812,6 +1658,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
812 pte_t pte; 1658 pte_t pte;
813 struct page *page; 1659 struct page *page;
814 struct page *tmp; 1660 struct page *tmp;
1661 struct hstate *h = hstate_vma(vma);
1662 unsigned long sz = huge_page_size(h);
1663
815 /* 1664 /*
816 * A page gathering list, protected by per file i_mmap_lock. The 1665 * A page gathering list, protected by per file i_mmap_lock. The
817 * lock is used to avoid list corruption from multiple unmapping 1666 * lock is used to avoid list corruption from multiple unmapping
@@ -820,11 +1669,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
820 LIST_HEAD(page_list); 1669 LIST_HEAD(page_list);
821 1670
822 WARN_ON(!is_vm_hugetlb_page(vma)); 1671 WARN_ON(!is_vm_hugetlb_page(vma));
823 BUG_ON(start & ~HPAGE_MASK); 1672 BUG_ON(start & ~huge_page_mask(h));
824 BUG_ON(end & ~HPAGE_MASK); 1673 BUG_ON(end & ~huge_page_mask(h));
825 1674
826 spin_lock(&mm->page_table_lock); 1675 spin_lock(&mm->page_table_lock);
827 for (address = start; address < end; address += HPAGE_SIZE) { 1676 for (address = start; address < end; address += sz) {
828 ptep = huge_pte_offset(mm, address); 1677 ptep = huge_pte_offset(mm, address);
829 if (!ptep) 1678 if (!ptep)
830 continue; 1679 continue;
@@ -832,6 +1681,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
832 if (huge_pmd_unshare(mm, &address, ptep)) 1681 if (huge_pmd_unshare(mm, &address, ptep))
833 continue; 1682 continue;
834 1683
1684 /*
1685 * If a reference page is supplied, it is because a specific
1686 * page is being unmapped, not a range. Ensure the page we
1687 * are about to unmap is the actual page of interest.
1688 */
1689 if (ref_page) {
1690 pte = huge_ptep_get(ptep);
1691 if (huge_pte_none(pte))
1692 continue;
1693 page = pte_page(pte);
1694 if (page != ref_page)
1695 continue;
1696
1697 /*
1698 * Mark the VMA as having unmapped its page so that
1699 * future faults in this VMA will fail rather than
1700 * looking like data was lost
1701 */
1702 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
1703 }
1704
835 pte = huge_ptep_get_and_clear(mm, address, ptep); 1705 pte = huge_ptep_get_and_clear(mm, address, ptep);
836 if (huge_pte_none(pte)) 1706 if (huge_pte_none(pte))
837 continue; 1707 continue;
@@ -850,31 +1720,71 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
850} 1720}
851 1721
852void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1722void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
853 unsigned long end) 1723 unsigned long end, struct page *ref_page)
1724{
1725 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1726 __unmap_hugepage_range(vma, start, end, ref_page);
1727 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
1728}
1729
1730/*
1731 * This is called when the original mapper is failing to COW a MAP_PRIVATE
 1732 * mapping it owns the reserve page for. The intention is to unmap the page
1733 * from other VMAs and let the children be SIGKILLed if they are faulting the
1734 * same region.
1735 */
1736int unmap_ref_private(struct mm_struct *mm,
1737 struct vm_area_struct *vma,
1738 struct page *page,
1739 unsigned long address)
854{ 1740{
1741 struct vm_area_struct *iter_vma;
1742 struct address_space *mapping;
1743 struct prio_tree_iter iter;
1744 pgoff_t pgoff;
1745
855 /* 1746 /*
856 * It is undesirable to test vma->vm_file as it should be non-null 1747 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
857 * for valid hugetlb area. However, vm_file will be NULL in the error 1748 * from page cache lookup which is in HPAGE_SIZE units.
858 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
859 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
860 * to clean up. Since no pte has actually been setup, it is safe to
861 * do nothing in this case.
862 */ 1749 */
863 if (vma->vm_file) { 1750 address = address & huge_page_mask(hstate_vma(vma));
864 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 1751 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
865 __unmap_hugepage_range(vma, start, end); 1752 + (vma->vm_pgoff >> PAGE_SHIFT);
866 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 1753 mapping = (struct address_space *)page_private(page);
1754
1755 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1756 /* Do not unmap the current VMA */
1757 if (iter_vma == vma)
1758 continue;
1759
1760 /*
1761 * Unmap the page from other VMAs without their own reserves.
1762 * They get marked to be SIGKILLed if they fault in these
1763 * areas. This is because a future no-page fault on this VMA
1764 * could insert a zeroed page instead of the data existing
1765 * from the time of fork. This would look like data corruption
1766 */
1767 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1768 unmap_hugepage_range(iter_vma,
1769 address, address + HPAGE_SIZE,
1770 page);
867 } 1771 }
1772
1773 return 1;
868} 1774}
869 1775
870static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 1776static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
871 unsigned long address, pte_t *ptep, pte_t pte) 1777 unsigned long address, pte_t *ptep, pte_t pte,
1778 struct page *pagecache_page)
872{ 1779{
1780 struct hstate *h = hstate_vma(vma);
873 struct page *old_page, *new_page; 1781 struct page *old_page, *new_page;
874 int avoidcopy; 1782 int avoidcopy;
1783 int outside_reserve = 0;
875 1784
876 old_page = pte_page(pte); 1785 old_page = pte_page(pte);
877 1786
1787retry_avoidcopy:
878 /* If no-one else is actually using this page, avoid the copy 1788 /* If no-one else is actually using this page, avoid the copy
879 * and just make the page writable */ 1789 * and just make the page writable */
880 avoidcopy = (page_count(old_page) == 1); 1790 avoidcopy = (page_count(old_page) == 1);
@@ -883,11 +1793,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
883 return 0; 1793 return 0;
884 } 1794 }
885 1795
1796 /*
1797 * If the process that created a MAP_PRIVATE mapping is about to
1798 * perform a COW due to a shared page count, attempt to satisfy
1799 * the allocation without using the existing reserves. The pagecache
1800 * page is used to determine if the reserve at this address was
1801 * consumed or not. If reserves were used, a partial faulted mapping
1802 * at the time of fork() could consume its reserves on COW instead
1803 * of the full address range.
1804 */
1805 if (!(vma->vm_flags & VM_SHARED) &&
1806 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1807 old_page != pagecache_page)
1808 outside_reserve = 1;
1809
886 page_cache_get(old_page); 1810 page_cache_get(old_page);
887 new_page = alloc_huge_page(vma, address); 1811 new_page = alloc_huge_page(vma, address, outside_reserve);
888 1812
889 if (IS_ERR(new_page)) { 1813 if (IS_ERR(new_page)) {
890 page_cache_release(old_page); 1814 page_cache_release(old_page);
1815
1816 /*
1817 * If a process owning a MAP_PRIVATE mapping fails to COW,
1818 * it is due to references held by a child and an insufficient
 1822	 * huge page pool. To guarantee the original mapper's
1820 * reliability, unmap the page from child processes. The child
1821 * may get SIGKILLed if it later faults.
1822 */
1823 if (outside_reserve) {
1824 BUG_ON(huge_pte_none(pte));
1825 if (unmap_ref_private(mm, vma, old_page, address)) {
1826 BUG_ON(page_count(old_page) != 1);
1827 BUG_ON(huge_pte_none(pte));
1828 goto retry_avoidcopy;
1829 }
1830 WARN_ON_ONCE(1);
1831 }
1832
891 return -PTR_ERR(new_page); 1833 return -PTR_ERR(new_page);
892 } 1834 }
893 1835
@@ -896,7 +1838,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
896 __SetPageUptodate(new_page); 1838 __SetPageUptodate(new_page);
897 spin_lock(&mm->page_table_lock); 1839 spin_lock(&mm->page_table_lock);
898 1840
899 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 1841 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
900 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 1842 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
901 /* Break COW */ 1843 /* Break COW */
902 huge_ptep_clear_flush(vma, address, ptep); 1844 huge_ptep_clear_flush(vma, address, ptep);
@@ -910,19 +1852,44 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
910 return 0; 1852 return 0;
911} 1853}
912 1854
1855/* Return the pagecache page at a given address within a VMA */
1856static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1857 struct vm_area_struct *vma, unsigned long address)
1858{
1859 struct address_space *mapping;
1860 pgoff_t idx;
1861
1862 mapping = vma->vm_file->f_mapping;
1863 idx = vma_hugecache_offset(h, vma, address);
1864
1865 return find_lock_page(mapping, idx);
1866}
1867
913static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1868static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
914 unsigned long address, pte_t *ptep, int write_access) 1869 unsigned long address, pte_t *ptep, int write_access)
915{ 1870{
1871 struct hstate *h = hstate_vma(vma);
916 int ret = VM_FAULT_SIGBUS; 1872 int ret = VM_FAULT_SIGBUS;
917 unsigned long idx; 1873 pgoff_t idx;
918 unsigned long size; 1874 unsigned long size;
919 struct page *page; 1875 struct page *page;
920 struct address_space *mapping; 1876 struct address_space *mapping;
921 pte_t new_pte; 1877 pte_t new_pte;
922 1878
1879 /*
1880 * Currently, we are forced to kill the process in the event the
1881 * original mapper has unmapped pages from the child due to a failed
 1882	 * COW. Warn that such a situation has occurred as it may not be obvious.
1883 */
1884 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
1885 printk(KERN_WARNING
1886 "PID %d killed due to inadequate hugepage pool\n",
1887 current->pid);
1888 return ret;
1889 }
1890
923 mapping = vma->vm_file->f_mapping; 1891 mapping = vma->vm_file->f_mapping;
924 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 1892 idx = vma_hugecache_offset(h, vma, address);
925 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
926 1893
927 /* 1894 /*
928 * Use page lock to guard against racing truncation 1895 * Use page lock to guard against racing truncation
@@ -931,15 +1898,15 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
931retry: 1898retry:
932 page = find_lock_page(mapping, idx); 1899 page = find_lock_page(mapping, idx);
933 if (!page) { 1900 if (!page) {
934 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1901 size = i_size_read(mapping->host) >> huge_page_shift(h);
935 if (idx >= size) 1902 if (idx >= size)
936 goto out; 1903 goto out;
937 page = alloc_huge_page(vma, address); 1904 page = alloc_huge_page(vma, address, 0);
938 if (IS_ERR(page)) { 1905 if (IS_ERR(page)) {
939 ret = -PTR_ERR(page); 1906 ret = -PTR_ERR(page);
940 goto out; 1907 goto out;
941 } 1908 }
942 clear_huge_page(page, address); 1909 clear_huge_page(page, address, huge_page_size(h));
943 __SetPageUptodate(page); 1910 __SetPageUptodate(page);
944 1911
945 if (vma->vm_flags & VM_SHARED) { 1912 if (vma->vm_flags & VM_SHARED) {
@@ -955,14 +1922,14 @@ retry:
955 } 1922 }
956 1923
957 spin_lock(&inode->i_lock); 1924 spin_lock(&inode->i_lock);
958 inode->i_blocks += BLOCKS_PER_HUGEPAGE; 1925 inode->i_blocks += blocks_per_huge_page(h);
959 spin_unlock(&inode->i_lock); 1926 spin_unlock(&inode->i_lock);
960 } else 1927 } else
961 lock_page(page); 1928 lock_page(page);
962 } 1929 }
963 1930
964 spin_lock(&mm->page_table_lock); 1931 spin_lock(&mm->page_table_lock);
965 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1932 size = i_size_read(mapping->host) >> huge_page_shift(h);
966 if (idx >= size) 1933 if (idx >= size)
967 goto backout; 1934 goto backout;
968 1935
@@ -976,7 +1943,7 @@ retry:
976 1943
977 if (write_access && !(vma->vm_flags & VM_SHARED)) { 1944 if (write_access && !(vma->vm_flags & VM_SHARED)) {
978 /* Optimization, do the COW without a second fault */ 1945 /* Optimization, do the COW without a second fault */
979 ret = hugetlb_cow(mm, vma, address, ptep, new_pte); 1946 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
980 } 1947 }
981 1948
982 spin_unlock(&mm->page_table_lock); 1949 spin_unlock(&mm->page_table_lock);
@@ -998,8 +1965,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
998 pte_t entry; 1965 pte_t entry;
999 int ret; 1966 int ret;
1000 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 1967 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
1968 struct hstate *h = hstate_vma(vma);
1001 1969
1002 ptep = huge_pte_alloc(mm, address); 1970 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
1003 if (!ptep) 1971 if (!ptep)
1004 return VM_FAULT_OOM; 1972 return VM_FAULT_OOM;
1005 1973
@@ -1021,14 +1989,30 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1021 spin_lock(&mm->page_table_lock); 1989 spin_lock(&mm->page_table_lock);
1022 /* Check for a racing update before calling hugetlb_cow */ 1990 /* Check for a racing update before calling hugetlb_cow */
1023 if (likely(pte_same(entry, huge_ptep_get(ptep)))) 1991 if (likely(pte_same(entry, huge_ptep_get(ptep))))
1024 if (write_access && !pte_write(entry)) 1992 if (write_access && !pte_write(entry)) {
1025 ret = hugetlb_cow(mm, vma, address, ptep, entry); 1993 struct page *page;
1994 page = hugetlbfs_pagecache_page(h, vma, address);
1995 ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
1996 if (page) {
1997 unlock_page(page);
1998 put_page(page);
1999 }
2000 }
1026 spin_unlock(&mm->page_table_lock); 2001 spin_unlock(&mm->page_table_lock);
1027 mutex_unlock(&hugetlb_instantiation_mutex); 2002 mutex_unlock(&hugetlb_instantiation_mutex);
1028 2003
1029 return ret; 2004 return ret;
1030} 2005}
1031 2006
 2007/* Can be overridden by architectures */
2008__attribute__((weak)) struct page *
2009follow_huge_pud(struct mm_struct *mm, unsigned long address,
2010 pud_t *pud, int write)
2011{
2012 BUG();
2013 return NULL;
2014}
2015
1032int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2016int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1033 struct page **pages, struct vm_area_struct **vmas, 2017 struct page **pages, struct vm_area_struct **vmas,
1034 unsigned long *position, int *length, int i, 2018 unsigned long *position, int *length, int i,
@@ -1037,6 +2021,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1037 unsigned long pfn_offset; 2021 unsigned long pfn_offset;
1038 unsigned long vaddr = *position; 2022 unsigned long vaddr = *position;
1039 int remainder = *length; 2023 int remainder = *length;
2024 struct hstate *h = hstate_vma(vma);
1040 2025
1041 spin_lock(&mm->page_table_lock); 2026 spin_lock(&mm->page_table_lock);
1042 while (vaddr < vma->vm_end && remainder) { 2027 while (vaddr < vma->vm_end && remainder) {
@@ -1048,7 +2033,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1048 * each hugepage. We have to make * sure we get the 2033 * each hugepage. We have to make * sure we get the
1049 * first, for the page indexing below to work. 2034 * first, for the page indexing below to work.
1050 */ 2035 */
1051 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 2036 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
1052 2037
1053 if (!pte || huge_pte_none(huge_ptep_get(pte)) || 2038 if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
1054 (write && !pte_write(huge_ptep_get(pte)))) { 2039 (write && !pte_write(huge_ptep_get(pte)))) {
@@ -1066,7 +2051,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1066 break; 2051 break;
1067 } 2052 }
1068 2053
1069 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; 2054 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
1070 page = pte_page(huge_ptep_get(pte)); 2055 page = pte_page(huge_ptep_get(pte));
1071same_page: 2056same_page:
1072 if (pages) { 2057 if (pages) {
@@ -1082,7 +2067,7 @@ same_page:
1082 --remainder; 2067 --remainder;
1083 ++i; 2068 ++i;
1084 if (vaddr < vma->vm_end && remainder && 2069 if (vaddr < vma->vm_end && remainder &&
1085 pfn_offset < HPAGE_SIZE/PAGE_SIZE) { 2070 pfn_offset < pages_per_huge_page(h)) {
1086 /* 2071 /*
1087 * We use pfn_offset to avoid touching the pageframes 2072 * We use pfn_offset to avoid touching the pageframes
1088 * of this compound page. 2073 * of this compound page.
@@ -1104,13 +2089,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1104 unsigned long start = address; 2089 unsigned long start = address;
1105 pte_t *ptep; 2090 pte_t *ptep;
1106 pte_t pte; 2091 pte_t pte;
2092 struct hstate *h = hstate_vma(vma);
1107 2093
1108 BUG_ON(address >= end); 2094 BUG_ON(address >= end);
1109 flush_cache_range(vma, address, end); 2095 flush_cache_range(vma, address, end);
1110 2096
1111 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2097 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1112 spin_lock(&mm->page_table_lock); 2098 spin_lock(&mm->page_table_lock);
1113 for (; address < end; address += HPAGE_SIZE) { 2099 for (; address < end; address += huge_page_size(h)) {
1114 ptep = huge_pte_offset(mm, address); 2100 ptep = huge_pte_offset(mm, address);
1115 if (!ptep) 2101 if (!ptep)
1116 continue; 2102 continue;
@@ -1128,195 +2114,59 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1128 flush_tlb_range(vma, start, end); 2114 flush_tlb_range(vma, start, end);
1129} 2115}
1130 2116
1131struct file_region { 2117int hugetlb_reserve_pages(struct inode *inode,
1132 struct list_head link; 2118 long from, long to,
1133 long from; 2119 struct vm_area_struct *vma)
1134 long to;
1135};
1136
1137static long region_add(struct list_head *head, long f, long t)
1138{
1139 struct file_region *rg, *nrg, *trg;
1140
1141 /* Locate the region we are either in or before. */
1142 list_for_each_entry(rg, head, link)
1143 if (f <= rg->to)
1144 break;
1145
1146 /* Round our left edge to the current segment if it encloses us. */
1147 if (f > rg->from)
1148 f = rg->from;
1149
1150 /* Check for and consume any regions we now overlap with. */
1151 nrg = rg;
1152 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1153 if (&rg->link == head)
1154 break;
1155 if (rg->from > t)
1156 break;
1157
1158 /* If this area reaches higher then extend our area to
1159 * include it completely. If this is not the first area
1160 * which we intend to reuse, free it. */
1161 if (rg->to > t)
1162 t = rg->to;
1163 if (rg != nrg) {
1164 list_del(&rg->link);
1165 kfree(rg);
1166 }
1167 }
1168 nrg->from = f;
1169 nrg->to = t;
1170 return 0;
1171}
1172
1173static long region_chg(struct list_head *head, long f, long t)
1174{ 2120{
1175 struct file_region *rg, *nrg; 2121 long ret, chg;
1176 long chg = 0; 2122 struct hstate *h = hstate_inode(inode);
1177
1178 /* Locate the region we are before or in. */
1179 list_for_each_entry(rg, head, link)
1180 if (f <= rg->to)
1181 break;
1182
1183 /* If we are below the current region then a new region is required.
1184 * Subtle, allocate a new region at the position but make it zero
1185 * size such that we can guarantee to record the reservation. */
1186 if (&rg->link == head || t < rg->from) {
1187 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
1188 if (!nrg)
1189 return -ENOMEM;
1190 nrg->from = f;
1191 nrg->to = f;
1192 INIT_LIST_HEAD(&nrg->link);
1193 list_add(&nrg->link, rg->link.prev);
1194
1195 return t - f;
1196 }
1197
1198 /* Round our left edge to the current segment if it encloses us. */
1199 if (f > rg->from)
1200 f = rg->from;
1201 chg = t - f;
1202
1203 /* Check for and consume any regions we now overlap with. */
1204 list_for_each_entry(rg, rg->link.prev, link) {
1205 if (&rg->link == head)
1206 break;
1207 if (rg->from > t)
1208 return chg;
1209
1210 /* We overlap with this area, if it extends futher than
1211 * us then we must extend ourselves. Account for its
1212 * existing reservation. */
1213 if (rg->to > t) {
1214 chg += rg->to - t;
1215 t = rg->to;
1216 }
1217 chg -= rg->to - rg->from;
1218 }
1219 return chg;
1220}
1221
1222static long region_truncate(struct list_head *head, long end)
1223{
1224 struct file_region *rg, *trg;
1225 long chg = 0;
1226 2123
1227 /* Locate the region we are either in or before. */ 2124 if (vma && vma->vm_flags & VM_NORESERVE)
1228 list_for_each_entry(rg, head, link)
1229 if (end <= rg->to)
1230 break;
1231 if (&rg->link == head)
1232 return 0; 2125 return 0;
1233 2126
1234 /* If we are in the middle of a region then adjust it. */
1235 if (end > rg->from) {
1236 chg = rg->to - end;
1237 rg->to = end;
1238 rg = list_entry(rg->link.next, typeof(*rg), link);
1239 }
1240
1241 /* Drop any remaining regions. */
1242 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1243 if (&rg->link == head)
1244 break;
1245 chg += rg->to - rg->from;
1246 list_del(&rg->link);
1247 kfree(rg);
1248 }
1249 return chg;
1250}
1251
1252static int hugetlb_acct_memory(long delta)
1253{
1254 int ret = -ENOMEM;
1255
1256 spin_lock(&hugetlb_lock);
1257 /* 2127 /*
1258 * When cpuset is configured, it breaks the strict hugetlb page 2128 * Shared mappings base their reservation on the number of pages that
1259 * reservation as the accounting is done on a global variable. Such 2129 * are already allocated on behalf of the file. Private mappings need
1260 * reservation is completely rubbish in the presence of cpuset because 2130 * to reserve the full area even if read-only as mprotect() may be
1261 * the reservation is not checked against page availability for the 2131 * called to make the mapping read-write. Assume !vma is a shm mapping
1262 * current cpuset. Application can still potentially OOM'ed by kernel
1263 * with lack of free htlb page in cpuset that the task is in.
1264 * Attempt to enforce strict accounting with cpuset is almost
1265 * impossible (or too ugly) because cpuset is too fluid that
1266 * task or memory node can be dynamically moved between cpusets.
1267 *
1268 * The change of semantics for shared hugetlb mapping with cpuset is
1269 * undesirable. However, in order to preserve some of the semantics,
1270 * we fall back to check against current free page availability as
1271 * a best attempt and hopefully to minimize the impact of changing
1272 * semantics that cpuset has.
1273 */ 2132 */
1274 if (delta > 0) { 2133 if (!vma || vma->vm_flags & VM_SHARED)
1275 if (gather_surplus_pages(delta) < 0) 2134 chg = region_chg(&inode->i_mapping->private_list, from, to);
1276 goto out; 2135 else {
1277 2136 struct resv_map *resv_map = resv_map_alloc();
1278 if (delta > cpuset_mems_nr(free_huge_pages_node)) { 2137 if (!resv_map)
1279 return_unused_surplus_pages(delta); 2138 return -ENOMEM;
1280 goto out;
1281 }
1282 }
1283
1284 ret = 0;
1285 if (delta < 0)
1286 return_unused_surplus_pages((unsigned long) -delta);
1287 2139
1288out: 2140 chg = to - from;
1289 spin_unlock(&hugetlb_lock);
1290 return ret;
1291}
1292 2141
1293int hugetlb_reserve_pages(struct inode *inode, long from, long to) 2142 set_vma_resv_map(vma, resv_map);
1294{ 2143 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
1295 long ret, chg; 2144 }
1296 2145
1297 chg = region_chg(&inode->i_mapping->private_list, from, to);
1298 if (chg < 0) 2146 if (chg < 0)
1299 return chg; 2147 return chg;
1300 2148
1301 if (hugetlb_get_quota(inode->i_mapping, chg)) 2149 if (hugetlb_get_quota(inode->i_mapping, chg))
1302 return -ENOSPC; 2150 return -ENOSPC;
1303 ret = hugetlb_acct_memory(chg); 2151 ret = hugetlb_acct_memory(h, chg);
1304 if (ret < 0) { 2152 if (ret < 0) {
1305 hugetlb_put_quota(inode->i_mapping, chg); 2153 hugetlb_put_quota(inode->i_mapping, chg);
1306 return ret; 2154 return ret;
1307 } 2155 }
1308 region_add(&inode->i_mapping->private_list, from, to); 2156 if (!vma || vma->vm_flags & VM_SHARED)
2157 region_add(&inode->i_mapping->private_list, from, to);
1309 return 0; 2158 return 0;
1310} 2159}
1311 2160
1312void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 2161void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
1313{ 2162{
2163 struct hstate *h = hstate_inode(inode);
1314 long chg = region_truncate(&inode->i_mapping->private_list, offset); 2164 long chg = region_truncate(&inode->i_mapping->private_list, offset);
1315 2165
1316 spin_lock(&inode->i_lock); 2166 spin_lock(&inode->i_lock);
1317 inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; 2167 inode->i_blocks -= blocks_per_huge_page(h);
1318 spin_unlock(&inode->i_lock); 2168 spin_unlock(&inode->i_lock);
1319 2169
1320 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2170 hugetlb_put_quota(inode->i_mapping, (chg - freed));
1321 hugetlb_acct_memory(-(chg - freed)); 2171 hugetlb_acct_memory(h, -(chg - freed));
1322} 2172}
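
hugetlb_reserve_pages() now splits the two mapping types: shared mappings keep charging the per-inode region list, private mappings get a per-VMA resv_map, reserve the full range up front and are marked HPAGE_RESV_OWNER, and VM_NORESERVE skips reservation entirely. The user-space sketch below exercises both cases; it assumes a hugetlbfs mount at /mnt/huge with enough pages in the pool and that LEN is a multiple of the huge page size (both are illustrative assumptions).

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define LEN (16UL * 1024 * 1024)   /* assumed multiple of the huge page size */

int main(void)
{
	int fd = open("/mnt/huge/demo", O_CREAT | O_RDWR, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Shared mapping: the reservation is charged to the file's region
	 * list by hugetlb_reserve_pages(). */
	void *shr = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	/* Private mapping: the whole range is reserved at mmap() time and
	 * tracked in a per-VMA resv_map owned by this mapping. */
	void *prv = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);

	if (shr == MAP_FAILED || prv == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(shr, 0, LEN);    /* faults consume the shared reservation */
	memset(prv, 0, LEN);    /* faults consume the private reservation */

	munmap(shr, LEN);
	munmap(prv, LEN);       /* hugetlb_vm_op_close() returns unused reserve */
	close(fd);
	unlink("/mnt/huge/demo");
	return 0;
}
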
diff --git a/mm/internal.h b/mm/internal.h
index 0034e947e4bc..1f43f7416972 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,11 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15 15
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling);
18
19extern void prep_compound_page(struct page *page, unsigned long order);
20
16static inline void set_page_count(struct page *page, int v) 21static inline void set_page_count(struct page *page, int v)
17{ 22{
18 atomic_set(&page->_count, v); 23 atomic_set(&page->_count, v);
@@ -59,4 +64,60 @@ static inline unsigned long page_order(struct page *page)
59#define __paginginit __init 64#define __paginginit __init
60#endif 65#endif
61 66
67/* Memory initialisation debug and verification */
68enum mminit_level {
69 MMINIT_WARNING,
70 MMINIT_VERIFY,
71 MMINIT_TRACE
72};
73
74#ifdef CONFIG_DEBUG_MEMORY_INIT
75
76extern int mminit_loglevel;
77
78#define mminit_dprintk(level, prefix, fmt, arg...) \
79do { \
80 if (level < mminit_loglevel) { \
81 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
82 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
83 } \
84} while (0)
85
86extern void mminit_verify_pageflags_layout(void);
87extern void mminit_verify_page_links(struct page *page,
88 enum zone_type zone, unsigned long nid, unsigned long pfn);
89extern void mminit_verify_zonelist(void);
90
91#else
92
93static inline void mminit_dprintk(enum mminit_level level,
94 const char *prefix, const char *fmt, ...)
95{
96}
97
98static inline void mminit_verify_pageflags_layout(void)
99{
100}
101
102static inline void mminit_verify_page_links(struct page *page,
103 enum zone_type zone, unsigned long nid, unsigned long pfn)
104{
105}
106
107static inline void mminit_verify_zonelist(void)
108{
109}
110#endif /* CONFIG_DEBUG_MEMORY_INIT */
111
112/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
113#if defined(CONFIG_SPARSEMEM)
114extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
115 unsigned long *end_pfn);
116#else
117static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
118 unsigned long *end_pfn)
119{
120}
121#endif /* CONFIG_SPARSEMEM */
122
62#endif 123#endif
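
The mminit_dprintk() macro above emits a message only when its level is strictly below mminit_loglevel, and collapses to an empty inline without CONFIG_DEBUG_MEMORY_INIT. The sketch below re-implements the gating in user space purely to illustrate the level test; it is not the kernel macro.

#include <stdio.h>

enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE };

static int mminit_loglevel = MMINIT_VERIFY;  /* e.g. raised by a debug boot option */

#define mminit_dprintk(level, prefix, fmt, ...)                             \
do {                                                                        \
	if ((level) < mminit_loglevel)                                      \
		fprintf(stderr, "%smminit::" prefix " " fmt,                \
			(level) <= MMINIT_WARNING ? "WARN " : "DEBUG ",     \
			##__VA_ARGS__);                                     \
} while (0)

int main(void)
{
	/* Printed: MMINIT_WARNING (0) is below the configured level (1). */
	mminit_dprintk(MMINIT_WARNING, "pageflags", "layout check failed on node %d\n", 0);
	/* Suppressed: MMINIT_TRACE (2) is not below the configured level. */
	mminit_dprintk(MMINIT_TRACE, "zonelist", "verification trace\n");
	return 0;
}
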
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e46451e1d9b7..fba566c51322 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,9 +35,9 @@
35 35
36#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37 37
38struct cgroup_subsys mem_cgroup_subsys; 38struct cgroup_subsys mem_cgroup_subsys __read_mostly;
39static const int MEM_CGROUP_RECLAIM_RETRIES = 5; 39static struct kmem_cache *page_cgroup_cache __read_mostly;
40static struct kmem_cache *page_cgroup_cache; 40#define MEM_CGROUP_RECLAIM_RETRIES 5
41 41
42/* 42/*
43 * Statistics for memory cgroup. 43 * Statistics for memory cgroup.
@@ -166,7 +166,6 @@ struct page_cgroup {
166 struct list_head lru; /* per cgroup LRU list */ 166 struct list_head lru; /* per cgroup LRU list */
167 struct page *page; 167 struct page *page;
168 struct mem_cgroup *mem_cgroup; 168 struct mem_cgroup *mem_cgroup;
169 int ref_cnt; /* cached, mapped, migrating */
170 int flags; 169 int flags;
171}; 170};
172#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ 171#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
@@ -185,6 +184,7 @@ static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
185enum charge_type { 184enum charge_type {
186 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 185 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
187 MEM_CGROUP_CHARGE_TYPE_MAPPED, 186 MEM_CGROUP_CHARGE_TYPE_MAPPED,
187 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
188}; 188};
189 189
190/* 190/*
@@ -296,7 +296,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; 296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
297 297
298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); 298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
299 list_del_init(&pc->lru); 299 list_del(&pc->lru);
300} 300}
301 301
302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
@@ -354,6 +354,9 @@ void mem_cgroup_move_lists(struct page *page, bool active)
354 struct mem_cgroup_per_zone *mz; 354 struct mem_cgroup_per_zone *mz;
355 unsigned long flags; 355 unsigned long flags;
356 356
357 if (mem_cgroup_subsys.disabled)
358 return;
359
357 /* 360 /*
358 * We cannot lock_page_cgroup while holding zone's lru_lock, 361 * We cannot lock_page_cgroup while holding zone's lru_lock,
359 * because other holders of lock_page_cgroup can be interrupted 362 * because other holders of lock_page_cgroup can be interrupted
@@ -524,7 +527,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
524 * < 0 if the cgroup is over its limit 527 * < 0 if the cgroup is over its limit
525 */ 528 */
526static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 529static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
527 gfp_t gfp_mask, enum charge_type ctype) 530 gfp_t gfp_mask, enum charge_type ctype,
531 struct mem_cgroup *memcg)
528{ 532{
529 struct mem_cgroup *mem; 533 struct mem_cgroup *mem;
530 struct page_cgroup *pc; 534 struct page_cgroup *pc;
@@ -532,35 +536,8 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
532 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 536 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
533 struct mem_cgroup_per_zone *mz; 537 struct mem_cgroup_per_zone *mz;
534 538
535 if (mem_cgroup_subsys.disabled) 539 pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
536 return 0; 540 if (unlikely(pc == NULL))
537
538 /*
539 * Should page_cgroup's go to their own slab?
540 * One could optimize the performance of the charging routine
541 * by saving a bit in the page_flags and using it as a lock
542 * to see if the cgroup page already has a page_cgroup associated
543 * with it
544 */
545retry:
546 lock_page_cgroup(page);
547 pc = page_get_page_cgroup(page);
548 /*
549 * The page_cgroup exists and
550 * the page has already been accounted.
551 */
552 if (pc) {
553 VM_BUG_ON(pc->page != page);
554 VM_BUG_ON(pc->ref_cnt <= 0);
555
556 pc->ref_cnt++;
557 unlock_page_cgroup(page);
558 goto done;
559 }
560 unlock_page_cgroup(page);
561
562 pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask);
563 if (pc == NULL)
564 goto err; 541 goto err;
565 542
566 /* 543 /*
@@ -569,16 +546,18 @@ retry:
569 * thread group leader migrates. It's possible that mm is not 546 * thread group leader migrates. It's possible that mm is not
570 * set, if so charge the init_mm (happens for pagecache usage). 547 * set, if so charge the init_mm (happens for pagecache usage).
571 */ 548 */
572 if (!mm) 549 if (likely(!memcg)) {
573 mm = &init_mm; 550 rcu_read_lock();
574 551 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
575 rcu_read_lock(); 552 /*
576 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 553 * For every charge from the cgroup, increment reference count
577 /* 554 */
578 * For every charge from the cgroup, increment reference count 555 css_get(&mem->css);
579 */ 556 rcu_read_unlock();
580 css_get(&mem->css); 557 } else {
581 rcu_read_unlock(); 558 mem = memcg;
559 css_get(&memcg->css);
560 }
582 561
583 while (res_counter_charge(&mem->res, PAGE_SIZE)) { 562 while (res_counter_charge(&mem->res, PAGE_SIZE)) {
584 if (!(gfp_mask & __GFP_WAIT)) 563 if (!(gfp_mask & __GFP_WAIT))
@@ -603,25 +582,24 @@ retry:
603 } 582 }
604 } 583 }
605 584
606 pc->ref_cnt = 1;
607 pc->mem_cgroup = mem; 585 pc->mem_cgroup = mem;
608 pc->page = page; 586 pc->page = page;
609 pc->flags = PAGE_CGROUP_FLAG_ACTIVE; 587 /*
588 * If a page is accounted as a page cache, insert to inactive list.
589 * If anon, insert to active list.
590 */
610 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) 591 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
611 pc->flags = PAGE_CGROUP_FLAG_CACHE; 592 pc->flags = PAGE_CGROUP_FLAG_CACHE;
593 else
594 pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
612 595
613 lock_page_cgroup(page); 596 lock_page_cgroup(page);
614 if (page_get_page_cgroup(page)) { 597 if (unlikely(page_get_page_cgroup(page))) {
615 unlock_page_cgroup(page); 598 unlock_page_cgroup(page);
616 /*
617 * Another charge has been added to this page already.
618 * We take lock_page_cgroup(page) again and read
619 * page->cgroup, increment refcnt.... just retry is OK.
620 */
621 res_counter_uncharge(&mem->res, PAGE_SIZE); 599 res_counter_uncharge(&mem->res, PAGE_SIZE);
622 css_put(&mem->css); 600 css_put(&mem->css);
623 kmem_cache_free(page_cgroup_cache, pc); 601 kmem_cache_free(page_cgroup_cache, pc);
624 goto retry; 602 goto done;
625 } 603 }
626 page_assign_page_cgroup(page, pc); 604 page_assign_page_cgroup(page, pc);
627 605
@@ -642,24 +620,65 @@ err:
642 620
643int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 621int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
644{ 622{
623 if (mem_cgroup_subsys.disabled)
624 return 0;
625
626 /*
627 * If already mapped, we don't have to account.
628 * If page cache, page->mapping has address_space.
629 * But page->mapping may have out-of-use anon_vma pointer,
 630	 * detect it by the PageAnon() check. A newly-mapped anon page's page->mapping
631 * is NULL.
632 */
633 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
634 return 0;
635 if (unlikely(!mm))
636 mm = &init_mm;
645 return mem_cgroup_charge_common(page, mm, gfp_mask, 637 return mem_cgroup_charge_common(page, mm, gfp_mask,
646 MEM_CGROUP_CHARGE_TYPE_MAPPED); 638 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
647} 639}
648 640
649int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 641int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
650 gfp_t gfp_mask) 642 gfp_t gfp_mask)
651{ 643{
652 if (!mm) 644 if (mem_cgroup_subsys.disabled)
645 return 0;
646
647 /*
648 * Corner case handling. This is called from add_to_page_cache()
649 * in usual. But some FS (shmem) precharges this page before calling it
650 * and call add_to_page_cache() with GFP_NOWAIT.
651 *
652 * For GFP_NOWAIT case, the page may be pre-charged before calling
653 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
654 * charge twice. (It works but has to pay a bit larger cost.)
655 */
656 if (!(gfp_mask & __GFP_WAIT)) {
657 struct page_cgroup *pc;
658
659 lock_page_cgroup(page);
660 pc = page_get_page_cgroup(page);
661 if (pc) {
662 VM_BUG_ON(pc->page != page);
663 VM_BUG_ON(!pc->mem_cgroup);
664 unlock_page_cgroup(page);
665 return 0;
666 }
667 unlock_page_cgroup(page);
668 }
669
670 if (unlikely(!mm))
653 mm = &init_mm; 671 mm = &init_mm;
672
654 return mem_cgroup_charge_common(page, mm, gfp_mask, 673 return mem_cgroup_charge_common(page, mm, gfp_mask,
655 MEM_CGROUP_CHARGE_TYPE_CACHE); 674 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
656} 675}
657 676
658/* 677/*
659 * Uncharging is always a welcome operation, we never complain, simply 678 * uncharge if !page_mapped(page)
660 * uncharge.
661 */ 679 */
662void mem_cgroup_uncharge_page(struct page *page) 680static void
681__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
663{ 682{
664 struct page_cgroup *pc; 683 struct page_cgroup *pc;
665 struct mem_cgroup *mem; 684 struct mem_cgroup *mem;
@@ -674,98 +693,151 @@ void mem_cgroup_uncharge_page(struct page *page)
674 */ 693 */
675 lock_page_cgroup(page); 694 lock_page_cgroup(page);
676 pc = page_get_page_cgroup(page); 695 pc = page_get_page_cgroup(page);
677 if (!pc) 696 if (unlikely(!pc))
678 goto unlock; 697 goto unlock;
679 698
680 VM_BUG_ON(pc->page != page); 699 VM_BUG_ON(pc->page != page);
681 VM_BUG_ON(pc->ref_cnt <= 0);
682 700
683 if (--(pc->ref_cnt) == 0) { 701 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
684 mz = page_cgroup_zoneinfo(pc); 702 && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
685 spin_lock_irqsave(&mz->lru_lock, flags); 703 || page_mapped(page)))
686 __mem_cgroup_remove_list(mz, pc); 704 goto unlock;
687 spin_unlock_irqrestore(&mz->lru_lock, flags);
688 705
689 page_assign_page_cgroup(page, NULL); 706 mz = page_cgroup_zoneinfo(pc);
690 unlock_page_cgroup(page); 707 spin_lock_irqsave(&mz->lru_lock, flags);
708 __mem_cgroup_remove_list(mz, pc);
709 spin_unlock_irqrestore(&mz->lru_lock, flags);
691 710
692 mem = pc->mem_cgroup; 711 page_assign_page_cgroup(page, NULL);
693 res_counter_uncharge(&mem->res, PAGE_SIZE); 712 unlock_page_cgroup(page);
694 css_put(&mem->css);
695 713
696 kmem_cache_free(page_cgroup_cache, pc); 714 mem = pc->mem_cgroup;
697 return; 715 res_counter_uncharge(&mem->res, PAGE_SIZE);
698 } 716 css_put(&mem->css);
699 717
718 kmem_cache_free(page_cgroup_cache, pc);
719 return;
700unlock: 720unlock:
701 unlock_page_cgroup(page); 721 unlock_page_cgroup(page);
702} 722}
703 723
724void mem_cgroup_uncharge_page(struct page *page)
725{
726 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
727}
728
729void mem_cgroup_uncharge_cache_page(struct page *page)
730{
731 VM_BUG_ON(page_mapped(page));
732 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
733}
734
704/* 735/*
705 * Returns non-zero if a page (under migration) has valid page_cgroup member. 736 * Before starting migration, account against new page.
706 * Refcnt of page_cgroup is incremented.
707 */ 737 */
708int mem_cgroup_prepare_migration(struct page *page) 738int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
709{ 739{
710 struct page_cgroup *pc; 740 struct page_cgroup *pc;
741 struct mem_cgroup *mem = NULL;
742 enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
743 int ret = 0;
711 744
712 if (mem_cgroup_subsys.disabled) 745 if (mem_cgroup_subsys.disabled)
713 return 0; 746 return 0;
714 747
715 lock_page_cgroup(page); 748 lock_page_cgroup(page);
716 pc = page_get_page_cgroup(page); 749 pc = page_get_page_cgroup(page);
717 if (pc) 750 if (pc) {
718 pc->ref_cnt++; 751 mem = pc->mem_cgroup;
752 css_get(&mem->css);
753 if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
754 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
755 }
719 unlock_page_cgroup(page); 756 unlock_page_cgroup(page);
720 return pc != NULL; 757 if (mem) {
758 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
759 ctype, mem);
760 css_put(&mem->css);
761 }
762 return ret;
721} 763}
722 764
723void mem_cgroup_end_migration(struct page *page) 765/* remove redundant charge if migration failed*/
766void mem_cgroup_end_migration(struct page *newpage)
724{ 767{
725 mem_cgroup_uncharge_page(page); 768 /*
 769	 * On success, page->mapping is not NULL.
 770	 * Special rollback care is necessary when:
771 * 1. at migration failure. (newpage->mapping is cleared in this case)
772 * 2. the newpage was moved but not remapped again because the task
773 * exits and the newpage is obsolete. In this case, the new page
 774	 * may be a swapcache. So, we always call mem_cgroup_uncharge_page()
 775	 * to avoid a mess. The page_cgroup will be removed if it is
 776	 * unnecessary. File cache pages are still on the radix tree; don't
 777	 * worry about them.
778 */
779 if (!newpage->mapping)
780 __mem_cgroup_uncharge_common(newpage,
781 MEM_CGROUP_CHARGE_TYPE_FORCE);
782 else if (PageAnon(newpage))
783 mem_cgroup_uncharge_page(newpage);
726} 784}
727 785
728/* 786/*
729 * We know both *page* and *newpage* are now not-on-LRU and PG_locked. 787 * A call to try to shrink memory usage under specified resource controller.
730 * And no race with uncharge() routines because page_cgroup for *page* 788 * This is typically used for page reclaiming for shmem for reducing side
731 * has extra one reference by mem_cgroup_prepare_migration. 789 * effect of page allocation from shmem, which is used by some mem_cgroup.
732 */ 790 */
733void mem_cgroup_page_migration(struct page *page, struct page *newpage) 791int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
734{ 792{
735 struct page_cgroup *pc; 793 struct mem_cgroup *mem;
736 struct mem_cgroup_per_zone *mz; 794 int progress = 0;
737 unsigned long flags; 795 int retry = MEM_CGROUP_RECLAIM_RETRIES;
738 796
739 lock_page_cgroup(page); 797 if (mem_cgroup_subsys.disabled)
740 pc = page_get_page_cgroup(page); 798 return 0;
741 if (!pc) {
742 unlock_page_cgroup(page);
743 return;
744 }
745 799
746 mz = page_cgroup_zoneinfo(pc); 800 rcu_read_lock();
747 spin_lock_irqsave(&mz->lru_lock, flags); 801 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
748 __mem_cgroup_remove_list(mz, pc); 802 css_get(&mem->css);
749 spin_unlock_irqrestore(&mz->lru_lock, flags); 803 rcu_read_unlock();
750 804
751 page_assign_page_cgroup(page, NULL); 805 do {
752 unlock_page_cgroup(page); 806 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
807 } while (!progress && --retry);
753 808
754 pc->page = newpage; 809 css_put(&mem->css);
755 lock_page_cgroup(newpage); 810 if (!retry)
756 page_assign_page_cgroup(newpage, pc); 811 return -ENOMEM;
812 return 0;
813}
757 814
758 mz = page_cgroup_zoneinfo(pc); 815int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
759 spin_lock_irqsave(&mz->lru_lock, flags); 816{
760 __mem_cgroup_add_list(mz, pc); 817
761 spin_unlock_irqrestore(&mz->lru_lock, flags); 818 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
819 int progress;
820 int ret = 0;
762 821
763 unlock_page_cgroup(newpage); 822 while (res_counter_set_limit(&memcg->res, val)) {
823 if (signal_pending(current)) {
824 ret = -EINTR;
825 break;
826 }
827 if (!retry_count) {
828 ret = -EBUSY;
829 break;
830 }
831 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
832 if (!progress)
833 retry_count--;
834 }
835 return ret;
764} 836}
765 837
838
766/* 839/*
767 * This routine traverse page_cgroup in given list and drop them all. 840 * This routine traverse page_cgroup in given list and drop them all.
768 * This routine ignores page_cgroup->ref_cnt.
769 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 841 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
770 */ 842 */
771#define FORCE_UNCHARGE_BATCH (128) 843#define FORCE_UNCHARGE_BATCH (128)
@@ -790,12 +862,20 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
790 page = pc->page; 862 page = pc->page;
791 get_page(page); 863 get_page(page);
792 spin_unlock_irqrestore(&mz->lru_lock, flags); 864 spin_unlock_irqrestore(&mz->lru_lock, flags);
793 mem_cgroup_uncharge_page(page); 865 /*
794 put_page(page); 866 * Check if this page is on LRU. !LRU page can be found
795 if (--count <= 0) { 867 * if it's under page migration.
796 count = FORCE_UNCHARGE_BATCH; 868 */
869 if (PageLRU(page)) {
870 __mem_cgroup_uncharge_common(page,
871 MEM_CGROUP_CHARGE_TYPE_FORCE);
872 put_page(page);
873 if (--count <= 0) {
874 count = FORCE_UNCHARGE_BATCH;
875 cond_resched();
876 }
877 } else
797 cond_resched(); 878 cond_resched();
798 }
799 spin_lock_irqsave(&mz->lru_lock, flags); 879 spin_lock_irqsave(&mz->lru_lock, flags);
800 } 880 }
801 spin_unlock_irqrestore(&mz->lru_lock, flags); 881 spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -810,9 +890,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
810 int ret = -EBUSY; 890 int ret = -EBUSY;
811 int node, zid; 891 int node, zid;
812 892
813 if (mem_cgroup_subsys.disabled)
814 return 0;
815
816 css_get(&mem->css); 893 css_get(&mem->css);
817 /* 894 /*
818 * page reclaim code (kswapd etc..) will move pages between 895 * page reclaim code (kswapd etc..) will move pages between
@@ -838,32 +915,34 @@ out:
838 return ret; 915 return ret;
839} 916}
840 917
841static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
842{
843 *tmp = memparse(buf, &buf);
844 if (*buf != '\0')
845 return -EINVAL;
846
847 /*
848 * Round up the value to the closest page size
849 */
850 *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
851 return 0;
852}
853
854static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 918static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
855{ 919{
856 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, 920 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
857 cft->private); 921 cft->private);
858} 922}
859 923/*
860static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 924 * The user of this function is...
861 struct file *file, const char __user *userbuf, 925 * RES_LIMIT.
862 size_t nbytes, loff_t *ppos) 926 */
927static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
928 const char *buffer)
863{ 929{
864 return res_counter_write(&mem_cgroup_from_cont(cont)->res, 930 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
865 cft->private, userbuf, nbytes, ppos, 931 unsigned long long val;
866 mem_cgroup_write_strategy); 932 int ret;
933
934 switch (cft->private) {
935 case RES_LIMIT:
936 /* This function does all necessary parse...reuse it */
937 ret = res_counter_memparse_write_strategy(buffer, &val);
938 if (!ret)
939 ret = mem_cgroup_resize_limit(memcg, val);
940 break;
941 default:
942 ret = -EINVAL; /* should be BUG() ? */
943 break;
944 }
945 return ret;
867} 946}
868 947
869static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 948static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -940,7 +1019,7 @@ static struct cftype mem_cgroup_files[] = {
940 { 1019 {
941 .name = "limit_in_bytes", 1020 .name = "limit_in_bytes",
942 .private = RES_LIMIT, 1021 .private = RES_LIMIT,
943 .write = mem_cgroup_write, 1022 .write_string = mem_cgroup_write,
944 .read_u64 = mem_cgroup_read, 1023 .read_u64 = mem_cgroup_read,
945 }, 1024 },
946 { 1025 {
@@ -1070,8 +1149,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1070static int mem_cgroup_populate(struct cgroup_subsys *ss, 1149static int mem_cgroup_populate(struct cgroup_subsys *ss,
1071 struct cgroup *cont) 1150 struct cgroup *cont)
1072{ 1151{
1073 if (mem_cgroup_subsys.disabled)
1074 return 0;
1075 return cgroup_add_files(cont, ss, mem_cgroup_files, 1152 return cgroup_add_files(cont, ss, mem_cgroup_files,
1076 ARRAY_SIZE(mem_cgroup_files)); 1153 ARRAY_SIZE(mem_cgroup_files));
1077} 1154}
@@ -1084,9 +1161,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1084 struct mm_struct *mm; 1161 struct mm_struct *mm;
1085 struct mem_cgroup *mem, *old_mem; 1162 struct mem_cgroup *mem, *old_mem;
1086 1163
1087 if (mem_cgroup_subsys.disabled)
1088 return;
1089
1090 mm = get_task_mm(p); 1164 mm = get_task_mm(p);
1091 if (mm == NULL) 1165 if (mm == NULL)
1092 return; 1166 return;
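
Moving limit_in_bytes to a write_string handler backed by res_counter_memparse_write_strategy() means the limit accepts memparse suffixes such as K, M and G, and mem_cgroup_resize_limit() retries reclaim before giving up with -EBUSY. A hedged user-space sketch, assuming a cgroup v1 memory controller mounted at /cgroups and a pre-created group named demo (both illustrative):

#include <stdio.h>

static int cg_write(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* "64M" is parsed by res_counter_memparse_write_strategy(); the write
	 * may block briefly while the kernel reclaims pages from the group. */
	cg_write("/cgroups/demo/memory.limit_in_bytes", "64M");

	/* Hypothetical PID; attach a task so the limit has something to bound. */
	cg_write("/cgroups/demo/tasks", "1234");
	return 0;
}
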
diff --git a/mm/memory.c b/mm/memory.c
index 2302d228fe04..a8ca04faaea6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,6 +61,8 @@
61#include <linux/swapops.h> 61#include <linux/swapops.h>
62#include <linux/elf.h> 62#include <linux/elf.h>
63 63
64#include "internal.h"
65
64#ifndef CONFIG_NEED_MULTIPLE_NODES 66#ifndef CONFIG_NEED_MULTIPLE_NODES
65/* use the per-pgdat data instead for discontigmem - mbligh */ 67/* use the per-pgdat data instead for discontigmem - mbligh */
66unsigned long max_mapnr; 68unsigned long max_mapnr;
@@ -211,7 +213,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
211 * 213 *
212 * Must be called with pagetable lock held. 214 * Must be called with pagetable lock held.
213 */ 215 */
214void free_pgd_range(struct mmu_gather **tlb, 216void free_pgd_range(struct mmu_gather *tlb,
215 unsigned long addr, unsigned long end, 217 unsigned long addr, unsigned long end,
216 unsigned long floor, unsigned long ceiling) 218 unsigned long floor, unsigned long ceiling)
217{ 219{
@@ -262,16 +264,16 @@ void free_pgd_range(struct mmu_gather **tlb,
262 return; 264 return;
263 265
264 start = addr; 266 start = addr;
265 pgd = pgd_offset((*tlb)->mm, addr); 267 pgd = pgd_offset(tlb->mm, addr);
266 do { 268 do {
267 next = pgd_addr_end(addr, end); 269 next = pgd_addr_end(addr, end);
268 if (pgd_none_or_clear_bad(pgd)) 270 if (pgd_none_or_clear_bad(pgd))
269 continue; 271 continue;
270 free_pud_range(*tlb, pgd, addr, next, floor, ceiling); 272 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
271 } while (pgd++, addr = next, addr != end); 273 } while (pgd++, addr = next, addr != end);
272} 274}
273 275
274void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, 276void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
275 unsigned long floor, unsigned long ceiling) 277 unsigned long floor, unsigned long ceiling)
276{ 278{
277 while (vma) { 279 while (vma) {
@@ -372,7 +374,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
372 * 374 *
373 * The calling function must still handle the error. 375 * The calling function must still handle the error.
374 */ 376 */
375void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) 377static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
378 unsigned long vaddr)
376{ 379{
377 printk(KERN_ERR "Bad pte = %08llx, process = %s, " 380 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
378 "vm_flags = %lx, vaddr = %lx\n", 381 "vm_flags = %lx, vaddr = %lx\n",
@@ -899,9 +902,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
899 } 902 }
900 903
901 if (unlikely(is_vm_hugetlb_page(vma))) { 904 if (unlikely(is_vm_hugetlb_page(vma))) {
902 unmap_hugepage_range(vma, start, end); 905 /*
903 zap_work -= (end - start) / 906 * It is undesirable to test vma->vm_file as it
904 (HPAGE_SIZE / PAGE_SIZE); 907 * should be non-null for valid hugetlb area.
908 * However, vm_file will be NULL in the error
909 * cleanup path of do_mmap_pgoff. When
910 * hugetlbfs ->mmap method fails,
911 * do_mmap_pgoff() nullifies vma->vm_file
912 * before calling this function to clean up.
913 * Since no pte has actually been setup, it is
914 * safe to do nothing in this case.
915 */
916 if (vma->vm_file) {
917 unmap_hugepage_range(vma, start, end, NULL);
918 zap_work -= (end - start) /
919 pages_per_huge_page(hstate_vma(vma));
920 }
921
905 start = end; 922 start = end;
906 } else 923 } else
907 start = unmap_page_range(*tlbp, vma, 924 start = unmap_page_range(*tlbp, vma,
@@ -982,19 +999,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
982 goto no_page_table; 999 goto no_page_table;
983 1000
984 pud = pud_offset(pgd, address); 1001 pud = pud_offset(pgd, address);
985 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 1002 if (pud_none(*pud))
986 goto no_page_table; 1003 goto no_page_table;
987 1004 if (pud_huge(*pud)) {
1005 BUG_ON(flags & FOLL_GET);
1006 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1007 goto out;
1008 }
1009 if (unlikely(pud_bad(*pud)))
1010 goto no_page_table;
1011
988 pmd = pmd_offset(pud, address); 1012 pmd = pmd_offset(pud, address);
989 if (pmd_none(*pmd)) 1013 if (pmd_none(*pmd))
990 goto no_page_table; 1014 goto no_page_table;
991
992 if (pmd_huge(*pmd)) { 1015 if (pmd_huge(*pmd)) {
993 BUG_ON(flags & FOLL_GET); 1016 BUG_ON(flags & FOLL_GET);
994 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1017 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
995 goto out; 1018 goto out;
996 } 1019 }
997
998 if (unlikely(pmd_bad(*pmd))) 1020 if (unlikely(pmd_bad(*pmd)))
999 goto no_page_table; 1021 goto no_page_table;
1000 1022
@@ -1058,11 +1080,9 @@ static inline int use_zero_page(struct vm_area_struct *vma)
1058 if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) 1080 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1059 return 0; 1081 return 0;
1060 /* 1082 /*
1061 * And if we have a fault or a nopfn routine, it's not an 1083 * And if we have a fault routine, it's not an anonymous region.
1062 * anonymous region.
1063 */ 1084 */
1064 return !vma->vm_ops || 1085 return !vma->vm_ops || !vma->vm_ops->fault;
1065 (!vma->vm_ops->fault && !vma->vm_ops->nopfn);
1066} 1086}
1067 1087
1068int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1088int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -1338,6 +1358,11 @@ out:
1338 * 1358 *
1339 * This function should only be called from a vm_ops->fault handler, and 1359 * This function should only be called from a vm_ops->fault handler, and
1340 * in that case the handler should return NULL. 1360 * in that case the handler should return NULL.
1361 *
1362 * vma cannot be a COW mapping.
1363 *
1364 * As this is called only for pages that do not currently exist, we
1365 * do not need to flush old virtual caches or the TLB.
1341 */ 1366 */
1342int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1367int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1343 unsigned long pfn) 1368 unsigned long pfn)
@@ -1548,6 +1573,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1548 unsigned long next; 1573 unsigned long next;
1549 int err; 1574 int err;
1550 1575
1576 BUG_ON(pud_huge(*pud));
1577
1551 pmd = pmd_alloc(mm, pud, addr); 1578 pmd = pmd_alloc(mm, pud, addr);
1552 if (!pmd) 1579 if (!pmd)
1553 return -ENOMEM; 1580 return -ENOMEM;
@@ -2501,59 +2528,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2501 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2528 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2502} 2529}
2503 2530
2504
2505/*
2506 * do_no_pfn() tries to create a new page mapping for a page without
2507 * a struct_page backing it
2508 *
2509 * As this is called only for pages that do not currently exist, we
2510 * do not need to flush old virtual caches or the TLB.
2511 *
2512 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2513 * but allow concurrent faults), and pte mapped but not yet locked.
2514 * We return with mmap_sem still held, but pte unmapped and unlocked.
2515 *
2516 * It is expected that the ->nopfn handler always returns the same pfn
2517 * for a given virtual mapping.
2518 *
2519 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2520 */
2521static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2522 unsigned long address, pte_t *page_table, pmd_t *pmd,
2523 int write_access)
2524{
2525 spinlock_t *ptl;
2526 pte_t entry;
2527 unsigned long pfn;
2528
2529 pte_unmap(page_table);
2530 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2531 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2532
2533 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2534
2535 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2536
2537 if (unlikely(pfn == NOPFN_OOM))
2538 return VM_FAULT_OOM;
2539 else if (unlikely(pfn == NOPFN_SIGBUS))
2540 return VM_FAULT_SIGBUS;
2541 else if (unlikely(pfn == NOPFN_REFAULT))
2542 return 0;
2543
2544 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2545
2546 /* Only go through if we didn't race with anybody else... */
2547 if (pte_none(*page_table)) {
2548 entry = pfn_pte(pfn, vma->vm_page_prot);
2549 if (write_access)
2550 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2551 set_pte_at(mm, address, page_table, entry);
2552 }
2553 pte_unmap_unlock(page_table, ptl);
2554 return 0;
2555}
2556
2557/* 2531/*
2558 * Fault of a previously existing named mapping. Repopulate the pte 2532 * Fault of a previously existing named mapping. Repopulate the pte
2559 * from the encoded file_pte if possible. This enables swappable 2533 * from the encoded file_pte if possible. This enables swappable
@@ -2614,9 +2588,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2614 if (likely(vma->vm_ops->fault)) 2588 if (likely(vma->vm_ops->fault))
2615 return do_linear_fault(mm, vma, address, 2589 return do_linear_fault(mm, vma, address,
2616 pte, pmd, write_access, entry); 2590 pte, pmd, write_access, entry);
2617 if (unlikely(vma->vm_ops->nopfn))
2618 return do_no_pfn(mm, vma, address, pte,
2619 pmd, write_access);
2620 } 2591 }
2621 return do_anonymous_page(mm, vma, address, 2592 return do_anonymous_page(mm, vma, address,
2622 pte, pmd, write_access); 2593 pte, pmd, write_access);
@@ -2804,6 +2775,86 @@ int in_gate_area_no_task(unsigned long addr)
2804 2775
2805#endif /* __HAVE_ARCH_GATE_AREA */ 2776#endif /* __HAVE_ARCH_GATE_AREA */
2806 2777
2778#ifdef CONFIG_HAVE_IOREMAP_PROT
2779static resource_size_t follow_phys(struct vm_area_struct *vma,
2780 unsigned long address, unsigned int flags,
2781 unsigned long *prot)
2782{
2783 pgd_t *pgd;
2784 pud_t *pud;
2785 pmd_t *pmd;
2786 pte_t *ptep, pte;
2787 spinlock_t *ptl;
2788 resource_size_t phys_addr = 0;
2789 struct mm_struct *mm = vma->vm_mm;
2790
2791 VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
2792
2793 pgd = pgd_offset(mm, address);
2794 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
2795 goto no_page_table;
2796
2797 pud = pud_offset(pgd, address);
2798 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
2799 goto no_page_table;
2800
2801 pmd = pmd_offset(pud, address);
2802 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
2803 goto no_page_table;
2804
2805 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
2806 if (pmd_huge(*pmd))
2807 goto no_page_table;
2808
2809 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
2810 if (!ptep)
2811 goto out;
2812
2813 pte = *ptep;
2814 if (!pte_present(pte))
2815 goto unlock;
2816 if ((flags & FOLL_WRITE) && !pte_write(pte))
2817 goto unlock;
2818 phys_addr = pte_pfn(pte);
2819 phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
2820
2821 *prot = pgprot_val(pte_pgprot(pte));
2822
2823unlock:
2824 pte_unmap_unlock(ptep, ptl);
2825out:
2826 return phys_addr;
2827no_page_table:
2828 return 0;
2829}
2830
2831int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2832 void *buf, int len, int write)
2833{
2834 resource_size_t phys_addr;
2835 unsigned long prot = 0;
2836 void *maddr;
2837 int offset = addr & (PAGE_SIZE-1);
2838
2839 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
2840 return -EINVAL;
2841
2842 phys_addr = follow_phys(vma, addr, write, &prot);
2843
2844 if (!phys_addr)
2845 return -EINVAL;
2846
2847 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
2848 if (write)
2849 memcpy_toio(maddr + offset, buf, len);
2850 else
2851 memcpy_fromio(buf, maddr + offset, len);
2852 iounmap(maddr);
2853
2854 return len;
2855}
2856#endif
2857
2807/* 2858/*
2808 * Access another process' address space. 2859 * Access another process' address space.
2809 * Source/target buffer must be kernel space, 2860 * Source/target buffer must be kernel space,
@@ -2813,7 +2864,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2813{ 2864{
2814 struct mm_struct *mm; 2865 struct mm_struct *mm;
2815 struct vm_area_struct *vma; 2866 struct vm_area_struct *vma;
2816 struct page *page;
2817 void *old_buf = buf; 2867 void *old_buf = buf;
2818 2868
2819 mm = get_task_mm(tsk); 2869 mm = get_task_mm(tsk);
@@ -2825,28 +2875,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2825 while (len) { 2875 while (len) {
2826 int bytes, ret, offset; 2876 int bytes, ret, offset;
2827 void *maddr; 2877 void *maddr;
2878 struct page *page = NULL;
2828 2879
2829 ret = get_user_pages(tsk, mm, addr, 1, 2880 ret = get_user_pages(tsk, mm, addr, 1,
2830 write, 1, &page, &vma); 2881 write, 1, &page, &vma);
2831 if (ret <= 0) 2882 if (ret <= 0) {
2832 break; 2883 /*
2833 2884 * Check if this is a VM_IO | VM_PFNMAP VMA, which
2834 bytes = len; 2885 * we can access using slightly different code.
2835 offset = addr & (PAGE_SIZE-1); 2886 */
2836 if (bytes > PAGE_SIZE-offset) 2887#ifdef CONFIG_HAVE_IOREMAP_PROT
2837 bytes = PAGE_SIZE-offset; 2888 vma = find_vma(mm, addr);
2838 2889 if (!vma)
2839 maddr = kmap(page); 2890 break;
2840 if (write) { 2891 if (vma->vm_ops && vma->vm_ops->access)
2841 copy_to_user_page(vma, page, addr, 2892 ret = vma->vm_ops->access(vma, addr, buf,
2842 maddr + offset, buf, bytes); 2893 len, write);
2843 set_page_dirty_lock(page); 2894 if (ret <= 0)
2895#endif
2896 break;
2897 bytes = ret;
2844 } else { 2898 } else {
2845 copy_from_user_page(vma, page, addr, 2899 bytes = len;
2846 buf, maddr + offset, bytes); 2900 offset = addr & (PAGE_SIZE-1);
2901 if (bytes > PAGE_SIZE-offset)
2902 bytes = PAGE_SIZE-offset;
2903
2904 maddr = kmap(page);
2905 if (write) {
2906 copy_to_user_page(vma, page, addr,
2907 maddr + offset, buf, bytes);
2908 set_page_dirty_lock(page);
2909 } else {
2910 copy_from_user_page(vma, page, addr,
2911 buf, maddr + offset, bytes);
2912 }
2913 kunmap(page);
2914 page_cache_release(page);
2847 } 2915 }
2848 kunmap(page);
2849 page_cache_release(page);
2850 len -= bytes; 2916 len -= bytes;
2851 buf += bytes; 2917 buf += bytes;
2852 addr += bytes; 2918 addr += bytes;
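
With the fallback above, a debugger poking a VM_IO | VM_PFNMAP mapping through access_process_vm() ends up in vma->vm_ops->access, and generic_access_phys() is offered as a stock implementation where CONFIG_HAVE_IOREMAP_PROT is available. A hedged sketch of how a driver might wire it up (the driver name and mmap handler are illustrative assumptions, not part of this patch):

#include <linux/fs.h>
#include <linux/mm.h>

/* Sketch: let ptrace/gdb read and write this driver's MMIO mapping via
 * access_process_vm() by delegating to the new generic helper. */
static struct vm_operations_struct mydrv_vm_ops = {
	.access = generic_access_phys,
};

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &mydrv_vm_ops;
	/* remap_pfn_range() marks the VMA VM_IO | VM_PFNMAP, which
	 * generic_access_phys() checks before translating the address. */
	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot);
}
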
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 833f854eabe5..89fee2dcb039 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -62,9 +62,9 @@ static void release_memory_resource(struct resource *res)
62 62
63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
64#ifndef CONFIG_SPARSEMEM_VMEMMAP 64#ifndef CONFIG_SPARSEMEM_VMEMMAP
65static void get_page_bootmem(unsigned long info, struct page *page, int magic) 65static void get_page_bootmem(unsigned long info, struct page *page, int type)
66{ 66{
67 atomic_set(&page->_mapcount, magic); 67 atomic_set(&page->_mapcount, type);
68 SetPagePrivate(page); 68 SetPagePrivate(page);
69 set_page_private(page, info); 69 set_page_private(page, info);
70 atomic_inc(&page->_count); 70 atomic_inc(&page->_count);
@@ -72,10 +72,10 @@ static void get_page_bootmem(unsigned long info, struct page *page, int magic)
72 72
73void put_page_bootmem(struct page *page) 73void put_page_bootmem(struct page *page)
74{ 74{
75 int magic; 75 int type;
76 76
77 magic = atomic_read(&page->_mapcount); 77 type = atomic_read(&page->_mapcount);
78 BUG_ON(magic >= -1); 78 BUG_ON(type >= -1);
79 79
80 if (atomic_dec_return(&page->_count) == 1) { 80 if (atomic_dec_return(&page->_count) == 1) {
81 ClearPagePrivate(page); 81 ClearPagePrivate(page);
@@ -86,7 +86,7 @@ void put_page_bootmem(struct page *page)
86 86
87} 87}
88 88
89void register_page_bootmem_info_section(unsigned long start_pfn) 89static void register_page_bootmem_info_section(unsigned long start_pfn)
90{ 90{
91 unsigned long *usemap, mapsize, section_nr, i; 91 unsigned long *usemap, mapsize, section_nr, i;
92 struct mem_section *ms; 92 struct mem_section *ms;
@@ -119,7 +119,7 @@ void register_page_bootmem_info_section(unsigned long start_pfn)
119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
120 120
121 for (i = 0; i < mapsize; i++, page++) 121 for (i = 0; i < mapsize; i++, page++)
122 get_page_bootmem(section_nr, page, MIX_INFO); 122 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
123 123
124} 124}
125 125
@@ -429,7 +429,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
429 429
430 if (need_zonelists_rebuild) 430 if (need_zonelists_rebuild)
431 build_all_zonelists(); 431 build_all_zonelists();
432 vm_total_pages = nr_free_pagecache_pages(); 432 else
433 vm_total_pages = nr_free_pagecache_pages();
434
433 writeback_set_ratelimit(); 435 writeback_set_ratelimit();
434 436
435 if (onlined_pages) 437 if (onlined_pages)
@@ -455,7 +457,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
455 /* we can use NODE_DATA(nid) from here */ 457 /* we can use NODE_DATA(nid) from here */
456 458
457 /* init node's zones as empty zones, we don't have any present pages.*/ 459 /* init node's zones as empty zones, we don't have any present pages.*/
458 free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); 460 free_area_init_node(nid, zones_size, start_pfn, zholes_size);
459 461
460 return pgdat; 462 return pgdat;
461} 463}
@@ -521,6 +523,66 @@ EXPORT_SYMBOL_GPL(add_memory);
521 523
522#ifdef CONFIG_MEMORY_HOTREMOVE 524#ifdef CONFIG_MEMORY_HOTREMOVE
523/* 525/*
526 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
527 * set and the size of the free page is given by page_order(). Using this,
528 * the function determines if the pageblock contains only free pages.
529 * Due to buddy constraints, a free page at least the size of a pageblock will
530 * be located at the start of the pageblock.
531 */
532static inline int pageblock_free(struct page *page)
533{
534 return PageBuddy(page) && page_order(page) >= pageblock_order;
535}
536
537/* Return the start of the next active pageblock after a given page */
538static struct page *next_active_pageblock(struct page *page)
539{
540 int pageblocks_stride;
541
542 /* Ensure the starting page is pageblock-aligned */
543 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
544
545 /* Move forward by at least 1 * pageblock_nr_pages */
546 pageblocks_stride = 1;
547
548 /* If the entire pageblock is free, move to the end of free page */
549 if (pageblock_free(page))
550 pageblocks_stride += page_order(page) - pageblock_order;
551
552 return page + (pageblocks_stride * pageblock_nr_pages);
553}
554
555/* Checks if this range of memory is likely to be hot-removable. */
556int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
557{
558 int type;
559 struct page *page = pfn_to_page(start_pfn);
560 struct page *end_page = page + nr_pages;
561
562 /* Check the starting page of each pageblock within the range */
563 for (; page < end_page; page = next_active_pageblock(page)) {
564 type = get_pageblock_migratetype(page);
565
566 /*
567 * A pageblock containing MOVABLE or free pages is considered
568 * removable
569 */
570 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
571 return 0;
572
573 /*
574 * A pageblock starting with a PageReserved page is not
575 * considered removable.
576 */
577 if (PageReserved(page))
578 return 0;
579 }
580
581 /* All pageblocks in the memory block are likely to be hot-removable */
582 return 1;
583}
584
585/*
524 * Confirm all pages in a range [start, end) is belongs to the same zone. 586 * Confirm all pages in a range [start, end) is belongs to the same zone.
525 */ 587 */
526static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 588static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
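
is_mem_section_removable() only gives a hint: it walks the pageblocks and reports whether everything is MIGRATE_MOVABLE or free, so the answer can go stale before an offline is actually attempted. A minimal caller sketch, assuming a SPARSEMEM section-sized check (the helper name below is made up for illustration):

#include <linux/memory_hotplug.h>
#include <linux/mmzone.h>

/* Sketch: ask whether the pages of one memory section look offlinable right
 * now. The result is advisory only; pages may become pinned afterwards. */
static int section_looks_removable(unsigned long section_nr)
{
	unsigned long start_pfn = section_nr_to_pfn(section_nr);

	return is_mem_section_removable(start_pfn, PAGES_PER_SECTION);
}
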
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c94e58b192c3..e550bec20582 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1481,7 +1481,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1481 1481
1482 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1482 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1483 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1483 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1484 HPAGE_SHIFT), gfp_flags); 1484 huge_page_shift(hstate_vma(vma))), gfp_flags);
1485 } else { 1485 } else {
1486 zl = policy_zonelist(gfp_flags, *mpol); 1486 zl = policy_zonelist(gfp_flags, *mpol);
1487 if ((*mpol)->mode == MPOL_BIND) 1487 if ((*mpol)->mode == MPOL_BIND)
@@ -2220,9 +2220,12 @@ static void check_huge_range(struct vm_area_struct *vma,
2220{ 2220{
2221 unsigned long addr; 2221 unsigned long addr;
2222 struct page *page; 2222 struct page *page;
2223 struct hstate *h = hstate_vma(vma);
2224 unsigned long sz = huge_page_size(h);
2223 2225
2224 for (addr = start; addr < end; addr += HPAGE_SIZE) { 2226 for (addr = start; addr < end; addr += sz) {
2225 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); 2227 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2228 addr & huge_page_mask(h));
2226 pte_t pte; 2229 pte_t pte;
2227 2230
2228 if (!ptep) 2231 if (!ptep)
diff --git a/mm/migrate.c b/mm/migrate.c
index 55bd355d170d..153572fb60b8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -30,6 +30,7 @@
30#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/syscalls.h>
33 34
34#include "internal.h" 35#include "internal.h"
35 36
@@ -284,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
284 285
285 page = migration_entry_to_page(entry); 286 page = migration_entry_to_page(entry);
286 287
287 get_page(page); 288 /*
289 * Once radix-tree replacement of page migration started, page_count
290 * *must* be zero. And, we don't want to call wait_on_page_locked()
291 * against a page without get_page().
292 * So, we use get_page_unless_zero(), here. Even failed, page fault
293 * will occur again.
294 */
295 if (!get_page_unless_zero(page))
296 goto out;
288 pte_unmap_unlock(ptep, ptl); 297 pte_unmap_unlock(ptep, ptl);
289 wait_on_page_locked(page); 298 wait_on_page_locked(page);
290 put_page(page); 299 put_page(page);
@@ -304,6 +313,7 @@ out:
304static int migrate_page_move_mapping(struct address_space *mapping, 313static int migrate_page_move_mapping(struct address_space *mapping,
305 struct page *newpage, struct page *page) 314 struct page *newpage, struct page *page)
306{ 315{
316 int expected_count;
307 void **pslot; 317 void **pslot;
308 318
309 if (!mapping) { 319 if (!mapping) {
@@ -313,14 +323,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
313 return 0; 323 return 0;
314 } 324 }
315 325
316 write_lock_irq(&mapping->tree_lock); 326 spin_lock_irq(&mapping->tree_lock);
317 327
318 pslot = radix_tree_lookup_slot(&mapping->page_tree, 328 pslot = radix_tree_lookup_slot(&mapping->page_tree,
319 page_index(page)); 329 page_index(page));
320 330
321 if (page_count(page) != 2 + !!PagePrivate(page) || 331 expected_count = 2 + !!PagePrivate(page);
332 if (page_count(page) != expected_count ||
322 (struct page *)radix_tree_deref_slot(pslot) != page) { 333 (struct page *)radix_tree_deref_slot(pslot) != page) {
323 write_unlock_irq(&mapping->tree_lock); 334 spin_unlock_irq(&mapping->tree_lock);
335 return -EAGAIN;
336 }
337
338 if (!page_freeze_refs(page, expected_count)) {
339 spin_unlock_irq(&mapping->tree_lock);
324 return -EAGAIN; 340 return -EAGAIN;
325 } 341 }
326 342
@@ -337,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
337 353
338 radix_tree_replace_slot(pslot, newpage); 354 radix_tree_replace_slot(pslot, newpage);
339 355
356 page_unfreeze_refs(page, expected_count);
340 /* 357 /*
341 * Drop cache reference from old page. 358 * Drop cache reference from old page.
342 * We know this isn't the last reference. 359 * We know this isn't the last reference.
@@ -356,7 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping,
356 __dec_zone_page_state(page, NR_FILE_PAGES); 373 __dec_zone_page_state(page, NR_FILE_PAGES);
357 __inc_zone_page_state(newpage, NR_FILE_PAGES); 374 __inc_zone_page_state(newpage, NR_FILE_PAGES);
358 375
359 write_unlock_irq(&mapping->tree_lock); 376 spin_unlock_irq(&mapping->tree_lock);
377 if (!PageSwapCache(newpage))
378 mem_cgroup_uncharge_cache_page(page);
360 379
361 return 0; 380 return 0;
362} 381}
@@ -610,7 +629,6 @@ static int move_to_new_page(struct page *newpage, struct page *page)
610 rc = fallback_migrate_page(mapping, newpage, page); 629 rc = fallback_migrate_page(mapping, newpage, page);
611 630
612 if (!rc) { 631 if (!rc) {
613 mem_cgroup_page_migration(page, newpage);
614 remove_migration_ptes(page, newpage); 632 remove_migration_ptes(page, newpage);
615 } else 633 } else
616 newpage->mapping = NULL; 634 newpage->mapping = NULL;
@@ -640,6 +658,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
640 /* page was freed from under us. So we are done. */ 658 /* page was freed from under us. So we are done. */
641 goto move_newpage; 659 goto move_newpage;
642 660
661 charge = mem_cgroup_prepare_migration(page, newpage);
662 if (charge == -ENOMEM) {
663 rc = -ENOMEM;
664 goto move_newpage;
665 }
666 /* prepare cgroup just returns 0 or -ENOMEM */
667 BUG_ON(charge);
668
643 rc = -EAGAIN; 669 rc = -EAGAIN;
644 if (TestSetPageLocked(page)) { 670 if (TestSetPageLocked(page)) {
645 if (!force) 671 if (!force)
@@ -691,19 +717,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
691 goto rcu_unlock; 717 goto rcu_unlock;
692 } 718 }
693 719
694 charge = mem_cgroup_prepare_migration(page);
695 /* Establish migration ptes or remove ptes */ 720 /* Establish migration ptes or remove ptes */
696 try_to_unmap(page, 1); 721 try_to_unmap(page, 1);
697 722
698 if (!page_mapped(page)) 723 if (!page_mapped(page))
699 rc = move_to_new_page(newpage, page); 724 rc = move_to_new_page(newpage, page);
700 725
701 if (rc) { 726 if (rc)
702 remove_migration_ptes(page, page); 727 remove_migration_ptes(page, page);
703 if (charge)
704 mem_cgroup_end_migration(page);
705 } else if (charge)
706 mem_cgroup_end_migration(newpage);
707rcu_unlock: 728rcu_unlock:
708 if (rcu_locked) 729 if (rcu_locked)
709 rcu_read_unlock(); 730 rcu_read_unlock();
@@ -724,6 +745,8 @@ unlock:
724 } 745 }
725 746
726move_newpage: 747move_newpage:
748 if (!charge)
749 mem_cgroup_end_migration(newpage);
727 /* 750 /*
728 * Move the new page to the LRU. If migration was not successful 751 * Move the new page to the LRU. If migration was not successful
729 * then this will free the page. 752 * then this will free the page.
@@ -1070,7 +1093,6 @@ out2:
1070 mmput(mm); 1093 mmput(mm);
1071 return err; 1094 return err;
1072} 1095}
1073#endif
1074 1096
1075/* 1097/*
1076 * Call migration functions in the vma_ops that may prepare 1098 * Call migration functions in the vma_ops that may prepare
@@ -1092,3 +1114,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1092 } 1114 }
1093 return err; 1115 return err;
1094} 1116}
1117#endif
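
The migration_entry_wait() hunk above depends on get_page_unless_zero(): while migrate_page_move_mapping() has the page's refcount frozen, a bare get_page() could take a reference on a page that is about to be replaced. The speculative-reference pattern it relies on looks roughly like this sketch:

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Sketch: sleep until a page that is being migrated gets unlocked, but only
 * if a reference can still be taken. If the count has already been frozen to
 * zero, do nothing; retrying (refaulting) is the correct fallback. */
static void wait_for_migrating_page(struct page *page)
{
	if (!get_page_unless_zero(page))
		return;
	wait_on_page_locked(page);	/* safe to sleep: we hold a reference */
	put_page(page);
}
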
diff --git a/mm/mm_init.c b/mm/mm_init.c
new file mode 100644
index 000000000000..c6af41ea9994
--- /dev/null
+++ b/mm/mm_init.c
@@ -0,0 +1,152 @@
1/*
2 * mm_init.c - Memory initialisation verification and debugging
3 *
4 * Copyright 2008 IBM Corporation, 2008
5 * Author Mel Gorman <mel@csn.ul.ie>
6 *
7 */
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/kobject.h>
11#include <linux/module.h>
12#include "internal.h"
13
14#ifdef CONFIG_DEBUG_MEMORY_INIT
15int __meminitdata mminit_loglevel;
16
17/* The zonelists are simply reported, validation is manual. */
18void mminit_verify_zonelist(void)
19{
20 int nid;
21
22 if (mminit_loglevel < MMINIT_VERIFY)
23 return;
24
25 for_each_online_node(nid) {
26 pg_data_t *pgdat = NODE_DATA(nid);
27 struct zone *zone;
28 struct zoneref *z;
29 struct zonelist *zonelist;
30 int i, listid, zoneid;
31
32 BUG_ON(MAX_ZONELISTS > 2);
33 for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
34
35 /* Identify the zone and nodelist */
36 zoneid = i % MAX_NR_ZONES;
37 listid = i / MAX_NR_ZONES;
38 zonelist = &pgdat->node_zonelists[listid];
39 zone = &pgdat->node_zones[zoneid];
40 if (!populated_zone(zone))
41 continue;
42
43 /* Print information about the zonelist */
44 printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
45 listid > 0 ? "thisnode" : "general", nid,
46 zone->name);
47
48 /* Iterate the zonelist */
49 for_each_zone_zonelist(zone, z, zonelist, zoneid) {
50#ifdef CONFIG_NUMA
51 printk(KERN_CONT "%d:%s ",
52 zone->node, zone->name);
53#else
54 printk(KERN_CONT "0:%s ", zone->name);
55#endif /* CONFIG_NUMA */
56 }
57 printk(KERN_CONT "\n");
58 }
59 }
60}
61
62void __init mminit_verify_pageflags_layout(void)
63{
64 int shift, width;
65 unsigned long or_mask, add_mask;
66
67 shift = 8 * sizeof(unsigned long);
68 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
69 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
70 "Section %d Node %d Zone %d Flags %d\n",
71 SECTIONS_WIDTH,
72 NODES_WIDTH,
73 ZONES_WIDTH,
74 NR_PAGEFLAGS);
75 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
76 "Section %d Node %d Zone %d\n",
77#ifdef SECTIONS_SHIFT
78 SECTIONS_SHIFT,
79#else
80 0,
81#endif
82 NODES_SHIFT,
83 ZONES_SHIFT);
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
85 "Section %lu Node %lu Zone %lu\n",
86 (unsigned long)SECTIONS_PGSHIFT,
87 (unsigned long)NODES_PGSHIFT,
88 (unsigned long)ZONES_PGSHIFT);
89 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
90 "Zone ID: %lu -> %lu\n",
91 (unsigned long)ZONEID_PGOFF,
92 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
93 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
94 "location: %d -> %d unused %d -> %d flags %d -> %d\n",
95 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
96#ifdef NODE_NOT_IN_PAGE_FLAGS
97 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
98 "Node not in page flags");
99#endif
100
101 if (SECTIONS_WIDTH) {
102 shift -= SECTIONS_WIDTH;
103 BUG_ON(shift != SECTIONS_PGSHIFT);
104 }
105 if (NODES_WIDTH) {
106 shift -= NODES_WIDTH;
107 BUG_ON(shift != NODES_PGSHIFT);
108 }
109 if (ZONES_WIDTH) {
110 shift -= ZONES_WIDTH;
111 BUG_ON(shift != ZONES_PGSHIFT);
112 }
113
114 /* Check for bitmask overlaps */
115 or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
116 (NODES_MASK << NODES_PGSHIFT) |
117 (SECTIONS_MASK << SECTIONS_PGSHIFT);
118 add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
119 (NODES_MASK << NODES_PGSHIFT) +
120 (SECTIONS_MASK << SECTIONS_PGSHIFT);
121 BUG_ON(or_mask != add_mask);
122}
123
124void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
125 unsigned long nid, unsigned long pfn)
126{
127 BUG_ON(page_to_nid(page) != nid);
128 BUG_ON(page_zonenum(page) != zone);
129 BUG_ON(page_to_pfn(page) != pfn);
130}
131
132static __init int set_mminit_loglevel(char *str)
133{
134 get_option(&str, &mminit_loglevel);
135 return 0;
136}
137early_param("mminit_loglevel", set_mminit_loglevel);
138#endif /* CONFIG_DEBUG_MEMORY_INIT */
139
140struct kobject *mm_kobj;
141EXPORT_SYMBOL_GPL(mm_kobj);
142
143static int __init mm_sysfs_init(void)
144{
145 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
146 if (!mm_kobj)
147 return -ENOMEM;
148
149 return 0;
150}
151
152__initcall(mm_sysfs_init);
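
The verification code is gated on a boot parameter, so passing something like mminit_loglevel=4 on the kernel command line enables the extra output; nothing is configurable at runtime. The mminit_dprintk() helper itself lives in mm/internal.h and is not part of this file; a plausible shape for it, offered purely as an assumption, is:

/* Assumed sketch of the mm/internal.h helper used above: emit a tagged
 * "mminit::<prefix>" line only when the message level is below the
 * boot-selected mminit_loglevel threshold. */
#define mminit_dprintk(level, prefix, fmt, arg...)			\
do {									\
	if ((level) < mminit_loglevel)					\
		printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg);	\
} while (0)
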
diff --git a/mm/mmap.c b/mm/mmap.c
index 1d102b956fd8..5e0cc99e9cd5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -32,6 +32,8 @@
32#include <asm/tlb.h> 32#include <asm/tlb.h>
33#include <asm/mmu_context.h> 33#include <asm/mmu_context.h>
34 34
35#include "internal.h"
36
35#ifndef arch_mmap_check 37#ifndef arch_mmap_check
36#define arch_mmap_check(addr, len, flags) (0) 38#define arch_mmap_check(addr, len, flags) (0)
37#endif 39#endif
@@ -1108,6 +1110,9 @@ munmap_back:
1108 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 1110 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1109 return -ENOMEM; 1111 return -ENOMEM;
1110 1112
1113 if (flags & MAP_NORESERVE)
1114 vm_flags |= VM_NORESERVE;
1115
1111 if (accountable && (!(flags & MAP_NORESERVE) || 1116 if (accountable && (!(flags & MAP_NORESERVE) ||
1112 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { 1117 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
1113 if (vm_flags & VM_SHARED) { 1118 if (vm_flags & VM_SHARED) {
@@ -1763,7 +1768,7 @@ static void unmap_region(struct mm_struct *mm,
1763 update_hiwater_rss(mm); 1768 update_hiwater_rss(mm);
1764 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1769 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1765 vm_unacct_memory(nr_accounted); 1770 vm_unacct_memory(nr_accounted);
1766 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1771 free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1767 next? next->vm_start: 0); 1772 next? next->vm_start: 0);
1768 tlb_finish_mmu(tlb, start, end); 1773 tlb_finish_mmu(tlb, start, end);
1769} 1774}
@@ -1807,7 +1812,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1807 struct mempolicy *pol; 1812 struct mempolicy *pol;
1808 struct vm_area_struct *new; 1813 struct vm_area_struct *new;
1809 1814
1810 if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) 1815 if (is_vm_hugetlb_page(vma) && (addr &
1816 ~(huge_page_mask(hstate_vma(vma)))))
1811 return -EINVAL; 1817 return -EINVAL;
1812 1818
1813 if (mm->map_count >= sysctl_max_map_count) 1819 if (mm->map_count >= sysctl_max_map_count)
@@ -2063,7 +2069,7 @@ void exit_mmap(struct mm_struct *mm)
2063 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2069 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2064 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2070 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2065 vm_unacct_memory(nr_accounted); 2071 vm_unacct_memory(nr_accounted);
2066 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2072 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
2067 tlb_finish_mmu(tlb, 0, end); 2073 tlb_finish_mmu(tlb, 0, end);
2068 2074
2069 /* 2075 /*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 360d9cc8b38c..abd645a3b0a0 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -153,12 +153,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
153 * If we make a private mapping writable we increase our commit; 153 * If we make a private mapping writable we increase our commit;
154 * but (without finer accounting) cannot reduce our commit if we 154 * but (without finer accounting) cannot reduce our commit if we
155 * make it unwritable again. 155 * make it unwritable again.
156 *
157 * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
158 * a MAP_NORESERVE private mapping to writable will now reserve.
159 */ 156 */
160 if (newflags & VM_WRITE) { 157 if (newflags & VM_WRITE) {
161 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { 158 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
159 VM_SHARED|VM_NORESERVE))) {
162 charged = nrpages; 160 charged = nrpages;
163 if (security_vm_enough_memory(charged)) 161 if (security_vm_enough_memory(charged))
164 return -ENOMEM; 162 return -ENOMEM;
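
Taken together, the mmap.c and mprotect.c hunks make MAP_NORESERVE stick to the VMA as VM_NORESERVE, so the old FIXME case, mprotect()ing a MAP_NORESERVE private mapping to writable, no longer starts reserving commit. A small userspace illustration (the 64 MiB size is arbitrary):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64 << 20;	/* 64 MiB, arbitrary */
	void *p = mmap(NULL, len, PROT_READ,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}
	/* With VM_NORESERVE recorded on the VMA, making the mapping writable
	 * no longer charges commit under strict overcommit accounting. */
	if (mprotect(p, len, PROT_READ | PROT_WRITE) != 0)
		perror("mprotect");
	munmap(p, len);
	return EXIT_SUCCESS;
}
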
diff --git a/mm/nommu.c b/mm/nommu.c
index 4462b6a3fcb9..5edccd9c9218 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,7 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/ptrace.h> 25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
@@ -745,7 +745,7 @@ static unsigned long determine_vm_flags(struct file *file,
745 * it's being traced - otherwise breakpoints set in it may interfere 745 * it's being traced - otherwise breakpoints set in it may interfere
746 * with another untraced process 746 * with another untraced process
747 */ 747 */
748 if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) 748 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
749 vm_flags &= ~VM_MAYSHARE; 749 vm_flags &= ~VM_MAYSHARE;
750 750
751 return vm_flags; 751 return vm_flags;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 94c6d8988ab3..24de8b65fdbd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1088,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1088 if (!mapping) 1088 if (!mapping)
1089 return 1; 1089 return 1;
1090 1090
1091 write_lock_irq(&mapping->tree_lock); 1091 spin_lock_irq(&mapping->tree_lock);
1092 mapping2 = page_mapping(page); 1092 mapping2 = page_mapping(page);
1093 if (mapping2) { /* Race with truncate? */ 1093 if (mapping2) { /* Race with truncate? */
1094 BUG_ON(mapping2 != mapping); 1094 BUG_ON(mapping2 != mapping);
@@ -1102,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1102 radix_tree_tag_set(&mapping->page_tree, 1102 radix_tree_tag_set(&mapping->page_tree,
1103 page_index(page), PAGECACHE_TAG_DIRTY); 1103 page_index(page), PAGECACHE_TAG_DIRTY);
1104 } 1104 }
1105 write_unlock_irq(&mapping->tree_lock); 1105 spin_unlock_irq(&mapping->tree_lock);
1106 if (mapping->host) { 1106 if (mapping->host) {
1107 /* !PageAnon && !swapper_space */ 1107 /* !PageAnon && !swapper_space */
1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1258,7 +1258,7 @@ int test_clear_page_writeback(struct page *page)
1258 struct backing_dev_info *bdi = mapping->backing_dev_info; 1258 struct backing_dev_info *bdi = mapping->backing_dev_info;
1259 unsigned long flags; 1259 unsigned long flags;
1260 1260
1261 write_lock_irqsave(&mapping->tree_lock, flags); 1261 spin_lock_irqsave(&mapping->tree_lock, flags);
1262 ret = TestClearPageWriteback(page); 1262 ret = TestClearPageWriteback(page);
1263 if (ret) { 1263 if (ret) {
1264 radix_tree_tag_clear(&mapping->page_tree, 1264 radix_tree_tag_clear(&mapping->page_tree,
@@ -1269,7 +1269,7 @@ int test_clear_page_writeback(struct page *page)
1269 __bdi_writeout_inc(bdi); 1269 __bdi_writeout_inc(bdi);
1270 } 1270 }
1271 } 1271 }
1272 write_unlock_irqrestore(&mapping->tree_lock, flags); 1272 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1273 } else { 1273 } else {
1274 ret = TestClearPageWriteback(page); 1274 ret = TestClearPageWriteback(page);
1275 } 1275 }
@@ -1287,7 +1287,7 @@ int test_set_page_writeback(struct page *page)
1287 struct backing_dev_info *bdi = mapping->backing_dev_info; 1287 struct backing_dev_info *bdi = mapping->backing_dev_info;
1288 unsigned long flags; 1288 unsigned long flags;
1289 1289
1290 write_lock_irqsave(&mapping->tree_lock, flags); 1290 spin_lock_irqsave(&mapping->tree_lock, flags);
1291 ret = TestSetPageWriteback(page); 1291 ret = TestSetPageWriteback(page);
1292 if (!ret) { 1292 if (!ret) {
1293 radix_tree_tag_set(&mapping->page_tree, 1293 radix_tree_tag_set(&mapping->page_tree,
@@ -1300,7 +1300,7 @@ int test_set_page_writeback(struct page *page)
1300 radix_tree_tag_clear(&mapping->page_tree, 1300 radix_tree_tag_clear(&mapping->page_tree,
1301 page_index(page), 1301 page_index(page),
1302 PAGECACHE_TAG_DIRTY); 1302 PAGECACHE_TAG_DIRTY);
1303 write_unlock_irqrestore(&mapping->tree_lock, flags); 1303 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1304 } else { 1304 } else {
1305 ret = TestSetPageWriteback(page); 1305 ret = TestSetPageWriteback(page);
1306 } 1306 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 79ac4afc908c..6da667274df5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -153,9 +153,9 @@ static unsigned long __meminitdata dma_reserve;
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; 153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; 154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 unsigned long __initdata required_kernelcore; 156 static unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 157 static unsigned long __initdata required_movablecore;
158 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 158 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
159 159
160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
161 int movable_zone; 161 int movable_zone;
@@ -264,7 +264,7 @@ static void free_compound_page(struct page *page)
264 __free_pages_ok(page, compound_order(page)); 264 __free_pages_ok(page, compound_order(page));
265} 265}
266 266
267static void prep_compound_page(struct page *page, unsigned long order) 267void prep_compound_page(struct page *page, unsigned long order)
268{ 268{
269 int i; 269 int i;
270 int nr_pages = 1 << order; 270 int nr_pages = 1 << order;
@@ -432,8 +432,9 @@ static inline void __free_one_page(struct page *page,
432 432
433 buddy = __page_find_buddy(page, page_idx, order); 433 buddy = __page_find_buddy(page, page_idx, order);
434 if (!page_is_buddy(page, buddy, order)) 434 if (!page_is_buddy(page, buddy, order))
435 break; /* Move the buddy up one level. */ 435 break;
436 436
437 /* Our buddy is free, merge with it and move up one order. */
437 list_del(&buddy->lru); 438 list_del(&buddy->lru);
438 zone->free_area[order].nr_free--; 439 zone->free_area[order].nr_free--;
439 rmv_page_order(buddy); 440 rmv_page_order(buddy);
@@ -532,7 +533,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
532/* 533/*
533 * permit the bootmem allocator to evade page validation on high-order frees 534 * permit the bootmem allocator to evade page validation on high-order frees
534 */ 535 */
535void __free_pages_bootmem(struct page *page, unsigned int order) 536void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
536{ 537{
537 if (order == 0) { 538 if (order == 0) {
538 __ClearPageReserved(page); 539 __ClearPageReserved(page);
@@ -673,9 +674,9 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
673 * Note that start_page and end_pages are not aligned on a pageblock 674 * Note that start_page and end_pages are not aligned on a pageblock
674 * boundary. If alignment is required, use move_freepages_block() 675 * boundary. If alignment is required, use move_freepages_block()
675 */ 676 */
676int move_freepages(struct zone *zone, 677static int move_freepages(struct zone *zone,
677 struct page *start_page, struct page *end_page, 678 struct page *start_page, struct page *end_page,
678 int migratetype) 679 int migratetype)
679{ 680{
680 struct page *page; 681 struct page *page;
681 unsigned long order; 682 unsigned long order;
@@ -714,7 +715,8 @@ int move_freepages(struct zone *zone,
714 return pages_moved; 715 return pages_moved;
715} 716}
716 717
717int move_freepages_block(struct zone *zone, struct page *page, int migratetype) 718static int move_freepages_block(struct zone *zone, struct page *page,
719 int migratetype)
718{ 720{
719 unsigned long start_pfn, end_pfn; 721 unsigned long start_pfn, end_pfn;
720 struct page *start_page, *end_page; 722 struct page *start_page, *end_page;
@@ -1429,7 +1431,7 @@ try_next_zone:
1429/* 1431/*
1430 * This is the 'heart' of the zoned buddy allocator. 1432 * This is the 'heart' of the zoned buddy allocator.
1431 */ 1433 */
1432static struct page * 1434struct page *
1433__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1435__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1434 struct zonelist *zonelist, nodemask_t *nodemask) 1436 struct zonelist *zonelist, nodemask_t *nodemask)
1435{ 1437{
@@ -1632,22 +1634,7 @@ nopage:
1632got_pg: 1634got_pg:
1633 return page; 1635 return page;
1634} 1636}
1635 1637EXPORT_SYMBOL(__alloc_pages_internal);
1636struct page *
1637__alloc_pages(gfp_t gfp_mask, unsigned int order,
1638 struct zonelist *zonelist)
1639{
1640 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
1641}
1642
1643struct page *
1644__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1645 struct zonelist *zonelist, nodemask_t *nodemask)
1646{
1647 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
1648}
1649
1650EXPORT_SYMBOL(__alloc_pages);
1651 1638
1652/* 1639/*
1653 * Common helper functions. 1640 * Common helper functions.
@@ -1711,6 +1698,59 @@ void free_pages(unsigned long addr, unsigned int order)
1711 1698
1712EXPORT_SYMBOL(free_pages); 1699EXPORT_SYMBOL(free_pages);
1713 1700
1701/**
1702 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
1703 * @size: the number of bytes to allocate
1704 * @gfp_mask: GFP flags for the allocation
1705 *
1706 * This function is similar to alloc_pages(), except that it allocates the
1707 * minimum number of pages to satisfy the request. alloc_pages() can only
1708 * allocate memory in power-of-two pages.
1709 *
1710 * This function is also limited by MAX_ORDER.
1711 *
1712 * Memory allocated by this function must be released by free_pages_exact().
1713 */
1714void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
1715{
1716 unsigned int order = get_order(size);
1717 unsigned long addr;
1718
1719 addr = __get_free_pages(gfp_mask, order);
1720 if (addr) {
1721 unsigned long alloc_end = addr + (PAGE_SIZE << order);
1722 unsigned long used = addr + PAGE_ALIGN(size);
1723
1724 split_page(virt_to_page(addr), order);
1725 while (used < alloc_end) {
1726 free_page(used);
1727 used += PAGE_SIZE;
1728 }
1729 }
1730
1731 return (void *)addr;
1732}
1733EXPORT_SYMBOL(alloc_pages_exact);
1734
1735/**
1736 * free_pages_exact - release memory allocated via alloc_pages_exact()
1737 * @virt: the value returned by alloc_pages_exact.
1738 * @size: size of allocation, same value as passed to alloc_pages_exact().
1739 *
1740 * Release the memory allocated by a previous call to alloc_pages_exact.
1741 */
1742void free_pages_exact(void *virt, size_t size)
1743{
1744 unsigned long addr = (unsigned long)virt;
1745 unsigned long end = addr + PAGE_ALIGN(size);
1746
1747 while (addr < end) {
1748 free_page(addr);
1749 addr += PAGE_SIZE;
1750 }
1751}
1752EXPORT_SYMBOL(free_pages_exact);
1753
1714static unsigned int nr_free_zone_pages(int offset) 1754static unsigned int nr_free_zone_pages(int offset)
1715{ 1755{
1716 struct zoneref *z; 1756 struct zoneref *z;
@@ -2352,6 +2392,7 @@ void build_all_zonelists(void)
2352 2392
2353 if (system_state == SYSTEM_BOOTING) { 2393 if (system_state == SYSTEM_BOOTING) {
2354 __build_all_zonelists(NULL); 2394 __build_all_zonelists(NULL);
2395 mminit_verify_zonelist();
2355 cpuset_init_current_mems_allowed(); 2396 cpuset_init_current_mems_allowed();
2356 } else { 2397 } else {
2357 /* we have to stop all cpus to guarantee there is no user 2398 /* we have to stop all cpus to guarantee there is no user
@@ -2534,6 +2575,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2534 } 2575 }
2535 page = pfn_to_page(pfn); 2576 page = pfn_to_page(pfn);
2536 set_page_links(page, zone, nid, pfn); 2577 set_page_links(page, zone, nid, pfn);
2578 mminit_verify_page_links(page, zone, nid, pfn);
2537 init_page_count(page); 2579 init_page_count(page);
2538 reset_page_mapcount(page); 2580 reset_page_mapcount(page);
2539 SetPageReserved(page); 2581 SetPageReserved(page);
@@ -2611,7 +2653,7 @@ static int zone_batchsize(struct zone *zone)
2611 return batch; 2653 return batch;
2612} 2654}
2613 2655
2614inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2656static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2615{ 2657{
2616 struct per_cpu_pages *pcp; 2658 struct per_cpu_pages *pcp;
2617 2659
@@ -2836,6 +2878,12 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2836 2878
2837 zone->zone_start_pfn = zone_start_pfn; 2879 zone->zone_start_pfn = zone_start_pfn;
2838 2880
2881 mminit_dprintk(MMINIT_TRACE, "memmap_init",
2882 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
2883 pgdat->node_id,
2884 (unsigned long)zone_idx(zone),
2885 zone_start_pfn, (zone_start_pfn + size));
2886
2839 zone_init_free_lists(zone); 2887 zone_init_free_lists(zone);
2840 2888
2841 return 0; 2889 return 0;
@@ -2975,7 +3023,8 @@ void __init sparse_memory_present_with_active_regions(int nid)
2975void __init push_node_boundaries(unsigned int nid, 3023void __init push_node_boundaries(unsigned int nid,
2976 unsigned long start_pfn, unsigned long end_pfn) 3024 unsigned long start_pfn, unsigned long end_pfn)
2977{ 3025{
2978 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 3026 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3027 "Entering push_node_boundaries(%u, %lu, %lu)\n",
2979 nid, start_pfn, end_pfn); 3028 nid, start_pfn, end_pfn);
2980 3029
2981 /* Initialise the boundary for this node if necessary */ 3030 /* Initialise the boundary for this node if necessary */
@@ -2993,7 +3042,8 @@ void __init push_node_boundaries(unsigned int nid,
2993static void __meminit account_node_boundary(unsigned int nid, 3042static void __meminit account_node_boundary(unsigned int nid,
2994 unsigned long *start_pfn, unsigned long *end_pfn) 3043 unsigned long *start_pfn, unsigned long *end_pfn)
2995{ 3044{
2996 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 3045 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3046 "Entering account_node_boundary(%u, %lu, %lu)\n",
2997 nid, *start_pfn, *end_pfn); 3047 nid, *start_pfn, *end_pfn);
2998 3048
2999 /* Return if boundary information has not been provided */ 3049 /* Return if boundary information has not been provided */
@@ -3050,7 +3100,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
3050 * assumption is made that zones within a node are ordered in monotonic 3100 * assumption is made that zones within a node are ordered in monotonic
3051 * increasing memory addresses so that the "highest" populated zone is used 3101 * increasing memory addresses so that the "highest" populated zone is used
3052 */ 3102 */
3053void __init find_usable_zone_for_movable(void) 3103static void __init find_usable_zone_for_movable(void)
3054{ 3104{
3055 int zone_index; 3105 int zone_index;
3056 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 3106 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
@@ -3076,7 +3126,7 @@ void __init find_usable_zone_for_movable(void)
3076 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 3126 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
3077 * zones within a node are in order of monotonic increases memory addresses 3127 * zones within a node are in order of monotonic increases memory addresses
3078 */ 3128 */
3079void __meminit adjust_zone_range_for_zone_movable(int nid, 3129static void __meminit adjust_zone_range_for_zone_movable(int nid,
3080 unsigned long zone_type, 3130 unsigned long zone_type,
3081 unsigned long node_start_pfn, 3131 unsigned long node_start_pfn,
3082 unsigned long node_end_pfn, 3132 unsigned long node_end_pfn,
@@ -3137,7 +3187,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3137 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3187 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3138 * then all holes in the requested range will be accounted for. 3188 * then all holes in the requested range will be accounted for.
3139 */ 3189 */
3140unsigned long __meminit __absent_pages_in_range(int nid, 3190static unsigned long __meminit __absent_pages_in_range(int nid,
3141 unsigned long range_start_pfn, 3191 unsigned long range_start_pfn,
3142 unsigned long range_end_pfn) 3192 unsigned long range_end_pfn)
3143{ 3193{
@@ -3368,8 +3418,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3368 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3418 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3369 if (realsize >= memmap_pages) { 3419 if (realsize >= memmap_pages) {
3370 realsize -= memmap_pages; 3420 realsize -= memmap_pages;
3371 printk(KERN_DEBUG 3421 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3372 " %s zone: %lu pages used for memmap\n", 3422 "%s zone: %lu pages used for memmap\n",
3373 zone_names[j], memmap_pages); 3423 zone_names[j], memmap_pages);
3374 } else 3424 } else
3375 printk(KERN_WARNING 3425 printk(KERN_WARNING
@@ -3379,7 +3429,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3379 /* Account for reserved pages */ 3429 /* Account for reserved pages */
3380 if (j == 0 && realsize > dma_reserve) { 3430 if (j == 0 && realsize > dma_reserve) {
3381 realsize -= dma_reserve; 3431 realsize -= dma_reserve;
3382 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 3432 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3433 "%s zone: %lu pages reserved\n",
3383 zone_names[0], dma_reserve); 3434 zone_names[0], dma_reserve);
3384 } 3435 }
3385 3436
@@ -3464,10 +3515,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
3464#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 3515#endif /* CONFIG_FLAT_NODE_MEM_MAP */
3465} 3516}
3466 3517
3467void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, 3518void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
3468 unsigned long *zones_size, unsigned long node_start_pfn, 3519 unsigned long node_start_pfn, unsigned long *zholes_size)
3469 unsigned long *zholes_size)
3470{ 3520{
3521 pg_data_t *pgdat = NODE_DATA(nid);
3522
3471 pgdat->node_id = nid; 3523 pgdat->node_id = nid;
3472 pgdat->node_start_pfn = node_start_pfn; 3524 pgdat->node_start_pfn = node_start_pfn;
3473 calculate_node_totalpages(pgdat, zones_size, zholes_size); 3525 calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -3520,10 +3572,13 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3520{ 3572{
3521 int i; 3573 int i;
3522 3574
3523 printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " 3575 mminit_dprintk(MMINIT_TRACE, "memory_register",
3524 "%d entries of %d used\n", 3576 "Entering add_active_range(%d, %#lx, %#lx) "
3525 nid, start_pfn, end_pfn, 3577 "%d entries of %d used\n",
3526 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 3578 nid, start_pfn, end_pfn,
3579 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
3580
3581 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
3527 3582
3528 /* Merge with existing active regions if possible */ 3583 /* Merge with existing active regions if possible */
3529 for (i = 0; i < nr_nodemap_entries; i++) { 3584 for (i = 0; i < nr_nodemap_entries; i++) {
@@ -3669,7 +3724,7 @@ static void __init sort_node_map(void)
3669} 3724}
3670 3725
3671/* Find the lowest pfn for a node */ 3726/* Find the lowest pfn for a node */
3672unsigned long __init find_min_pfn_for_node(int nid) 3727static unsigned long __init find_min_pfn_for_node(int nid)
3673{ 3728{
3674 int i; 3729 int i;
3675 unsigned long min_pfn = ULONG_MAX; 3730 unsigned long min_pfn = ULONG_MAX;
@@ -3741,7 +3796,7 @@ static unsigned long __init early_calculate_totalpages(void)
3741 * memory. When they don't, some nodes will have more kernelcore than 3796 * memory. When they don't, some nodes will have more kernelcore than
3742 * others 3797 * others
3743 */ 3798 */
3744void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 3799static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3745{ 3800{
3746 int i, nid; 3801 int i, nid;
3747 unsigned long usable_startpfn; 3802 unsigned long usable_startpfn;
@@ -3957,10 +4012,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3957 early_node_map[i].end_pfn); 4012 early_node_map[i].end_pfn);
3958 4013
3959 /* Initialise every node */ 4014 /* Initialise every node */
4015 mminit_verify_pageflags_layout();
3960 setup_nr_node_ids(); 4016 setup_nr_node_ids();
3961 for_each_online_node(nid) { 4017 for_each_online_node(nid) {
3962 pg_data_t *pgdat = NODE_DATA(nid); 4018 pg_data_t *pgdat = NODE_DATA(nid);
3963 free_area_init_node(nid, pgdat, NULL, 4019 free_area_init_node(nid, NULL,
3964 find_min_pfn_for_node(nid), NULL); 4020 find_min_pfn_for_node(nid), NULL);
3965 4021
3966 /* Any memory on that node */ 4022 /* Any memory on that node */
@@ -4025,15 +4081,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4025} 4081}
4026 4082
4027#ifndef CONFIG_NEED_MULTIPLE_NODES 4083#ifndef CONFIG_NEED_MULTIPLE_NODES
4028static bootmem_data_t contig_bootmem_data; 4084struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] };
4029struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
4030
4031EXPORT_SYMBOL(contig_page_data); 4085EXPORT_SYMBOL(contig_page_data);
4032#endif 4086#endif
4033 4087
4034void __init free_area_init(unsigned long *zones_size) 4088void __init free_area_init(unsigned long *zones_size)
4035{ 4089{
4036 free_area_init_node(0, NODE_DATA(0), zones_size, 4090 free_area_init_node(0, zones_size,
4037 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 4091 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4038} 4092}
4039 4093
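
alloc_pages_exact() rounds the request up to whole pages, performs a power-of-two allocation, then splits it and gives back the unused tail, so a caller who needs five pages no longer holds on to eight. A minimal in-kernel sketch of the pairing (the 20 KB size is arbitrary):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static void *exact_buf;
static const size_t exact_buf_size = 20 * 1024;	/* arbitrary, not a power of two */

static int exact_buf_alloc(void)
{
	/* With 4 KB pages this takes an order-3 (8 page) allocation and frees
	 * the three pages beyond PAGE_ALIGN(20 KB) straight back. */
	exact_buf = alloc_pages_exact(exact_buf_size, GFP_KERNEL);
	return exact_buf ? 0 : -ENOMEM;
}

static void exact_buf_free(void)
{
	/* Pass back the same size so the right number of pages is released. */
	free_pages_exact(exact_buf, exact_buf_size);
	exact_buf = NULL;
}
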
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 9d834aa4b979..0cbe0c60c6bf 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work)
130 * Thread creation: For how long have there been zero 130 * Thread creation: For how long have there been zero
131 * available threads? 131 * available threads?
132 */ 132 */
133 if (jiffies - last_empty_jifs > 1 * HZ) { 133 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
134 /* unlocked list_empty() test is OK here */ 134 /* unlocked list_empty() test is OK here */
135 if (list_empty(&pdflush_list)) { 135 if (list_empty(&pdflush_list)) {
136 /* unlocked test is OK here */ 136 /* unlocked test is OK here */
@@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work)
151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) 151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
152 continue; 152 continue;
153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); 153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
154 if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { 154 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
155 /* Limit exit rate */ 155 /* Limit exit rate */
156 pdf->when_i_went_to_sleep = jiffies; 156 pdf->when_i_went_to_sleep = jiffies;
157 break; /* exeunt */ 157 break; /* exeunt */
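
The pdflush hunks swap the open-coded jiffies arithmetic for time_after(), the standard helper for comparing jiffies timestamps across wraparound. The idiom, roughly:

#include <linux/jiffies.h>

/* Sketch: true once at least one second has elapsed since 'stamp'.
 * time_after(a, b) is the canonical wraparound-aware way to ask
 * "is a later than b?" for jiffies values. */
static inline int one_second_elapsed(unsigned long stamp)
{
	return time_after(jiffies, stamp + 1 * HZ);
}
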
diff --git a/mm/readahead.c b/mm/readahead.c
index d8723a5f6496..77e8ddf945e9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping,
382 if (hit_readahead_marker) { 382 if (hit_readahead_marker) {
383 pgoff_t start; 383 pgoff_t start;
384 384
385 read_lock_irq(&mapping->tree_lock); 385 rcu_read_lock();
386 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); 386 start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
387 read_unlock_irq(&mapping->tree_lock); 387 rcu_read_unlock();
388 388
389 if (!start || start - offset > max) 389 if (!start || start - offset > max)
390 return 0; 390 return 0;
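
This readahead hunk is the first appearance in this section of a conversion repeated in the swap_state.c, truncate.c and vmscan.c hunks below: mapping->tree_lock becomes a spinlock taken only on the update side, while lookups that take no page reference run under rcu_read_lock() alone. A rough sketch of the update-side pattern those hunks share (the wrapper function is illustrative; the calls are the ones the hunks themselves use):

	#include <linux/fs.h>
	#include <linux/pagemap.h>
	#include <linux/radix-tree.h>

	/* Tree modifications still serialize on mapping->tree_lock,
	 * now a spinlock rather than an rwlock. */
	static int example_add_page(struct address_space *mapping,
				    struct page *page, pgoff_t index)
	{
		int error;

		error = radix_tree_preload(GFP_KERNEL);
		if (error)
			return error;

		spin_lock_irq(&mapping->tree_lock);
		error = radix_tree_insert(&mapping->page_tree, index, page);
		spin_unlock_irq(&mapping->tree_lock);
		radix_tree_preload_end();

		return error;
	}

A bare rcu_read_lock(), as in the radix_tree_next_hole() probe above, is only sufficient when no page reference is taken; lookups that need a stable reference additionally go through the speculative get-and-recheck scheme introduced alongside this conversion.
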
diff --git a/mm/rmap.c b/mm/rmap.c
index bf0a5b7cfb8e..39ae5a9bf382 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -138,7 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
138 anon_vma_free(anon_vma); 138 anon_vma_free(anon_vma);
139} 139}
140 140
141static void anon_vma_ctor(struct kmem_cache *cachep, void *data) 141static void anon_vma_ctor(void *data)
142{ 142{
143 struct anon_vma *anon_vma = data; 143 struct anon_vma *anon_vma = data;
144 144
@@ -576,14 +576,8 @@ void page_add_anon_rmap(struct page *page,
576 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 576 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
577 if (atomic_inc_and_test(&page->_mapcount)) 577 if (atomic_inc_and_test(&page->_mapcount))
578 __page_set_anon_rmap(page, vma, address); 578 __page_set_anon_rmap(page, vma, address);
579 else { 579 else
580 __page_check_anon_rmap(page, vma, address); 580 __page_check_anon_rmap(page, vma, address);
581 /*
582 * We unconditionally charged during prepare, we uncharge here
583 * This takes care of balancing the reference counts
584 */
585 mem_cgroup_uncharge_page(page);
586 }
587} 581}
588 582
589/** 583/**
@@ -614,12 +608,6 @@ void page_add_file_rmap(struct page *page)
614{ 608{
615 if (atomic_inc_and_test(&page->_mapcount)) 609 if (atomic_inc_and_test(&page->_mapcount))
616 __inc_zone_page_state(page, NR_FILE_MAPPED); 610 __inc_zone_page_state(page, NR_FILE_MAPPED);
617 else
618 /*
619 * We unconditionally charged during prepare, we uncharge here
620 * This takes care of balancing the reference counts
621 */
622 mem_cgroup_uncharge_page(page);
623} 611}
624 612
625#ifdef CONFIG_DEBUG_VM 613#ifdef CONFIG_DEBUG_VM
diff --git a/mm/shmem.c b/mm/shmem.c
index e2a6ae1a44e9..952d361774bb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -922,20 +922,26 @@ found:
922 error = 1; 922 error = 1;
923 if (!inode) 923 if (!inode)
924 goto out; 924 goto out;
925 /* Precharge page while we can wait, compensate afterwards */ 925 /* Precharge page using GFP_KERNEL while we can wait */
926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
927 if (error) 927 if (error)
928 goto out; 928 goto out;
929 error = radix_tree_preload(GFP_KERNEL); 929 error = radix_tree_preload(GFP_KERNEL);
930 if (error) 930 if (error) {
931 goto uncharge; 931 mem_cgroup_uncharge_cache_page(page);
932 goto out;
933 }
932 error = 1; 934 error = 1;
933 935
934 spin_lock(&info->lock); 936 spin_lock(&info->lock);
935 ptr = shmem_swp_entry(info, idx, NULL); 937 ptr = shmem_swp_entry(info, idx, NULL);
936 if (ptr && ptr->val == entry.val) 938 if (ptr && ptr->val == entry.val) {
937 error = add_to_page_cache(page, inode->i_mapping, 939 error = add_to_page_cache_locked(page, inode->i_mapping,
938 idx, GFP_NOWAIT); 940 idx, GFP_NOWAIT);
941 /* does mem_cgroup_uncharge_cache_page on error */
942 } else /* we must compensate for our precharge above */
943 mem_cgroup_uncharge_cache_page(page);
944
939 if (error == -EEXIST) { 945 if (error == -EEXIST) {
940 struct page *filepage = find_get_page(inode->i_mapping, idx); 946 struct page *filepage = find_get_page(inode->i_mapping, idx);
941 error = 1; 947 error = 1;
@@ -961,8 +967,6 @@ found:
961 shmem_swp_unmap(ptr); 967 shmem_swp_unmap(ptr);
962 spin_unlock(&info->lock); 968 spin_unlock(&info->lock);
963 radix_tree_preload_end(); 969 radix_tree_preload_end();
964uncharge:
965 mem_cgroup_uncharge_page(page);
966out: 970out:
967 unlock_page(page); 971 unlock_page(page);
968 page_cache_release(page); 972 page_cache_release(page);
@@ -1297,8 +1301,8 @@ repeat:
1297 SetPageUptodate(filepage); 1301 SetPageUptodate(filepage);
1298 set_page_dirty(filepage); 1302 set_page_dirty(filepage);
1299 swap_free(swap); 1303 swap_free(swap);
1300 } else if (!(error = add_to_page_cache( 1304 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1301 swappage, mapping, idx, GFP_NOWAIT))) { 1305 idx, GFP_NOWAIT))) {
1302 info->flags |= SHMEM_PAGEIN; 1306 info->flags |= SHMEM_PAGEIN;
1303 shmem_swp_set(info, entry, 0); 1307 shmem_swp_set(info, entry, 0);
1304 shmem_swp_unmap(entry); 1308 shmem_swp_unmap(entry);
@@ -1311,17 +1315,14 @@ repeat:
1311 shmem_swp_unmap(entry); 1315 shmem_swp_unmap(entry);
1312 spin_unlock(&info->lock); 1316 spin_unlock(&info->lock);
1313 unlock_page(swappage); 1317 unlock_page(swappage);
1318 page_cache_release(swappage);
1314 if (error == -ENOMEM) { 1319 if (error == -ENOMEM) {
1315 /* allow reclaim from this memory cgroup */ 1320 /* allow reclaim from this memory cgroup */
1316 error = mem_cgroup_cache_charge(swappage, 1321 error = mem_cgroup_shrink_usage(current->mm,
1317 current->mm, gfp & ~__GFP_HIGHMEM); 1322 gfp);
1318 if (error) { 1323 if (error)
1319 page_cache_release(swappage);
1320 goto failed; 1324 goto failed;
1321 }
1322 mem_cgroup_uncharge_page(swappage);
1323 } 1325 }
1324 page_cache_release(swappage);
1325 goto repeat; 1326 goto repeat;
1326 } 1327 }
1327 } else if (sgp == SGP_READ && !filepage) { 1328 } else if (sgp == SGP_READ && !filepage) {
@@ -1358,6 +1359,8 @@ repeat:
1358 } 1359 }
1359 1360
1360 if (!filepage) { 1361 if (!filepage) {
1362 int ret;
1363
1361 spin_unlock(&info->lock); 1364 spin_unlock(&info->lock);
1362 filepage = shmem_alloc_page(gfp, info, idx); 1365 filepage = shmem_alloc_page(gfp, info, idx);
1363 if (!filepage) { 1366 if (!filepage) {
@@ -1386,10 +1389,18 @@ repeat:
1386 swap = *entry; 1389 swap = *entry;
1387 shmem_swp_unmap(entry); 1390 shmem_swp_unmap(entry);
1388 } 1391 }
1389 if (error || swap.val || 0 != add_to_page_cache_lru( 1392 ret = error || swap.val;
1390 filepage, mapping, idx, GFP_NOWAIT)) { 1393 if (ret)
1394 mem_cgroup_uncharge_cache_page(filepage);
1395 else
1396 ret = add_to_page_cache_lru(filepage, mapping,
1397 idx, GFP_NOWAIT);
1398 /*
1399 * At add_to_page_cache_lru() failure, uncharge will
1400 * be done automatically.
1401 */
1402 if (ret) {
1391 spin_unlock(&info->lock); 1403 spin_unlock(&info->lock);
1392 mem_cgroup_uncharge_page(filepage);
1393 page_cache_release(filepage); 1404 page_cache_release(filepage);
1394 shmem_unacct_blocks(info->flags, 1); 1405 shmem_unacct_blocks(info->flags, 1);
1395 shmem_free_blocks(inode, 1); 1406 shmem_free_blocks(inode, 1);
@@ -1398,7 +1409,6 @@ repeat:
1398 goto failed; 1409 goto failed;
1399 goto repeat; 1410 goto repeat;
1400 } 1411 }
1401 mem_cgroup_uncharge_page(filepage);
1402 info->flags |= SHMEM_PAGEIN; 1412 info->flags |= SHMEM_PAGEIN;
1403 } 1413 }
1404 1414
@@ -1690,26 +1700,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1690 file_accessed(filp); 1700 file_accessed(filp);
1691} 1701}
1692 1702
1693static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) 1703static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1704 const struct iovec *iov, unsigned long nr_segs, loff_t pos)
1694{ 1705{
1695 read_descriptor_t desc; 1706 struct file *filp = iocb->ki_filp;
1707 ssize_t retval;
1708 unsigned long seg;
1709 size_t count;
1710 loff_t *ppos = &iocb->ki_pos;
1696 1711
1697 if ((ssize_t) count < 0) 1712 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1698 return -EINVAL; 1713 if (retval)
1699 if (!access_ok(VERIFY_WRITE, buf, count)) 1714 return retval;
1700 return -EFAULT;
1701 if (!count)
1702 return 0;
1703 1715
1704 desc.written = 0; 1716 for (seg = 0; seg < nr_segs; seg++) {
1705 desc.count = count; 1717 read_descriptor_t desc;
1706 desc.arg.buf = buf;
1707 desc.error = 0;
1708 1718
1709 do_shmem_file_read(filp, ppos, &desc, file_read_actor); 1719 desc.written = 0;
1710 if (desc.written) 1720 desc.arg.buf = iov[seg].iov_base;
1711 return desc.written; 1721 desc.count = iov[seg].iov_len;
1712 return desc.error; 1722 if (desc.count == 0)
1723 continue;
1724 desc.error = 0;
1725 do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1726 retval += desc.written;
1727 if (desc.error) {
1728 retval = retval ?: desc.error;
1729 break;
1730 }
1731 if (desc.count > 0)
1732 break;
1733 }
1734 return retval;
1713} 1735}
1714 1736
1715static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1737static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -2330,7 +2352,7 @@ static void shmem_destroy_inode(struct inode *inode)
2330 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2352 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2331} 2353}
2332 2354
2333static void init_once(struct kmem_cache *cachep, void *foo) 2355static void init_once(void *foo)
2334{ 2356{
2335 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2357 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2336 2358
@@ -2369,8 +2391,9 @@ static const struct file_operations shmem_file_operations = {
2369 .mmap = shmem_mmap, 2391 .mmap = shmem_mmap,
2370#ifdef CONFIG_TMPFS 2392#ifdef CONFIG_TMPFS
2371 .llseek = generic_file_llseek, 2393 .llseek = generic_file_llseek,
2372 .read = shmem_file_read, 2394 .read = do_sync_read,
2373 .write = do_sync_write, 2395 .write = do_sync_write,
2396 .aio_read = shmem_file_aio_read,
2374 .aio_write = generic_file_aio_write, 2397 .aio_write = generic_file_aio_write,
2375 .fsync = simple_sync_file, 2398 .fsync = simple_sync_file,
2376 .splice_read = generic_file_splice_read, 2399 .splice_read = generic_file_splice_read,
diff --git a/mm/slab.c b/mm/slab.c
index 052e7d64537e..918f04f7fef1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,7 +406,7 @@ struct kmem_cache {
406 unsigned int dflags; /* dynamic flags */ 406 unsigned int dflags; /* dynamic flags */
407 407
408 /* constructor func */ 408 /* constructor func */
409 void (*ctor)(struct kmem_cache *, void *); 409 void (*ctor)(void *obj);
410 410
411/* 5) cache creation/removal */ 411/* 5) cache creation/removal */
412 const char *name; 412 const char *name;
@@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2137 */ 2137 */
2138struct kmem_cache * 2138struct kmem_cache *
2139kmem_cache_create (const char *name, size_t size, size_t align, 2139kmem_cache_create (const char *name, size_t size, size_t align,
2140 unsigned long flags, 2140 unsigned long flags, void (*ctor)(void *))
2141 void (*ctor)(struct kmem_cache *, void *))
2142{ 2141{
2143 size_t left_over, slab_size, ralign; 2142 size_t left_over, slab_size, ralign;
2144 struct kmem_cache *cachep = NULL, *pc; 2143 struct kmem_cache *cachep = NULL, *pc;
@@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2653 * They must also be threaded. 2652 * They must also be threaded.
2654 */ 2653 */
2655 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2654 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2656 cachep->ctor(cachep, objp + obj_offset(cachep)); 2655 cachep->ctor(objp + obj_offset(cachep));
2657 2656
2658 if (cachep->flags & SLAB_RED_ZONE) { 2657 if (cachep->flags & SLAB_RED_ZONE) {
2659 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2658 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2669 cachep->buffer_size / PAGE_SIZE, 0); 2668 cachep->buffer_size / PAGE_SIZE, 0);
2670#else 2669#else
2671 if (cachep->ctor) 2670 if (cachep->ctor)
2672 cachep->ctor(cachep, objp); 2671 cachep->ctor(objp);
2673#endif 2672#endif
2674 slab_bufctl(slabp)[i] = i + 1; 2673 slab_bufctl(slabp)[i] = i + 1;
2675 } 2674 }
@@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3093#endif 3092#endif
3094 objp += obj_offset(cachep); 3093 objp += obj_offset(cachep);
3095 if (cachep->ctor && cachep->flags & SLAB_POISON) 3094 if (cachep->ctor && cachep->flags & SLAB_POISON)
3096 cachep->ctor(cachep, objp); 3095 cachep->ctor(objp);
3097#if ARCH_SLAB_MINALIGN 3096#if ARCH_SLAB_MINALIGN
3098 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3097 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3099 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3098 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
diff --git a/mm/slob.c b/mm/slob.c
index a3ad6671adf1..d8fbd4d1bfa7 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -130,17 +130,17 @@ static LIST_HEAD(free_slob_large);
130 */ 130 */
131static inline int slob_page(struct slob_page *sp) 131static inline int slob_page(struct slob_page *sp)
132{ 132{
133 return test_bit(PG_active, &sp->flags); 133 return PageSlobPage((struct page *)sp);
134} 134}
135 135
136static inline void set_slob_page(struct slob_page *sp) 136static inline void set_slob_page(struct slob_page *sp)
137{ 137{
138 __set_bit(PG_active, &sp->flags); 138 __SetPageSlobPage((struct page *)sp);
139} 139}
140 140
141static inline void clear_slob_page(struct slob_page *sp) 141static inline void clear_slob_page(struct slob_page *sp)
142{ 142{
143 __clear_bit(PG_active, &sp->flags); 143 __ClearPageSlobPage((struct page *)sp);
144} 144}
145 145
146/* 146/*
@@ -148,19 +148,19 @@ static inline void clear_slob_page(struct slob_page *sp)
148 */ 148 */
149static inline int slob_page_free(struct slob_page *sp) 149static inline int slob_page_free(struct slob_page *sp)
150{ 150{
151 return test_bit(PG_private, &sp->flags); 151 return PageSlobFree((struct page *)sp);
152} 152}
153 153
154static void set_slob_page_free(struct slob_page *sp, struct list_head *list) 154static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
155{ 155{
156 list_add(&sp->list, list); 156 list_add(&sp->list, list);
157 __set_bit(PG_private, &sp->flags); 157 __SetPageSlobFree((struct page *)sp);
158} 158}
159 159
160static inline void clear_slob_page_free(struct slob_page *sp) 160static inline void clear_slob_page_free(struct slob_page *sp)
161{ 161{
162 list_del(&sp->list); 162 list_del(&sp->list);
163 __clear_bit(PG_private, &sp->flags); 163 __ClearPageSlobFree((struct page *)sp);
164} 164}
165 165
166#define SLOB_UNIT sizeof(slob_t) 166#define SLOB_UNIT sizeof(slob_t)
@@ -525,12 +525,11 @@ struct kmem_cache {
525 unsigned int size, align; 525 unsigned int size, align;
526 unsigned long flags; 526 unsigned long flags;
527 const char *name; 527 const char *name;
528 void (*ctor)(struct kmem_cache *, void *); 528 void (*ctor)(void *);
529}; 529};
530 530
531struct kmem_cache *kmem_cache_create(const char *name, size_t size, 531struct kmem_cache *kmem_cache_create(const char *name, size_t size,
532 size_t align, unsigned long flags, 532 size_t align, unsigned long flags, void (*ctor)(void *))
533 void (*ctor)(struct kmem_cache *, void *))
534{ 533{
535 struct kmem_cache *c; 534 struct kmem_cache *c;
536 535
@@ -575,7 +574,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
575 b = slob_new_page(flags, get_order(c->size), node); 574 b = slob_new_page(flags, get_order(c->size), node);
576 575
577 if (c->ctor) 576 if (c->ctor)
578 c->ctor(c, b); 577 c->ctor(b);
579 578
580 return b; 579 return b;
581} 580}
diff --git a/mm/slub.c b/mm/slub.c
index 6d4a49c1ff2f..b7e2cd5d82db 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -102,44 +102,12 @@
102 * the fast path and disables lockless freelists. 102 * the fast path and disables lockless freelists.
103 */ 103 */
104 104
105#define FROZEN (1 << PG_active)
106
107#ifdef CONFIG_SLUB_DEBUG 105#ifdef CONFIG_SLUB_DEBUG
108#define SLABDEBUG (1 << PG_error) 106#define SLABDEBUG 1
109#else 107#else
110#define SLABDEBUG 0 108#define SLABDEBUG 0
111#endif 109#endif
112 110
113static inline int SlabFrozen(struct page *page)
114{
115 return page->flags & FROZEN;
116}
117
118static inline void SetSlabFrozen(struct page *page)
119{
120 page->flags |= FROZEN;
121}
122
123static inline void ClearSlabFrozen(struct page *page)
124{
125 page->flags &= ~FROZEN;
126}
127
128static inline int SlabDebug(struct page *page)
129{
130 return page->flags & SLABDEBUG;
131}
132
133static inline void SetSlabDebug(struct page *page)
134{
135 page->flags |= SLABDEBUG;
136}
137
138static inline void ClearSlabDebug(struct page *page)
139{
140 page->flags &= ~SLABDEBUG;
141}
142
143/* 111/*
144 * Issues still to be resolved: 112 * Issues still to be resolved:
145 * 113 *
@@ -971,7 +939,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
971 } 939 }
972 940
973 /* Special debug activities for freeing objects */ 941 /* Special debug activities for freeing objects */
974 if (!SlabFrozen(page) && !page->freelist) 942 if (!PageSlubFrozen(page) && !page->freelist)
975 remove_full(s, page); 943 remove_full(s, page);
976 if (s->flags & SLAB_STORE_USER) 944 if (s->flags & SLAB_STORE_USER)
977 set_track(s, object, TRACK_FREE, addr); 945 set_track(s, object, TRACK_FREE, addr);
@@ -1044,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug);
1044 1012
1045static unsigned long kmem_cache_flags(unsigned long objsize, 1013static unsigned long kmem_cache_flags(unsigned long objsize,
1046 unsigned long flags, const char *name, 1014 unsigned long flags, const char *name,
1047 void (*ctor)(struct kmem_cache *, void *)) 1015 void (*ctor)(void *))
1048{ 1016{
1049 /* 1017 /*
1050 * Enable debugging if selected on the kernel commandline. 1018 * Enable debugging if selected on the kernel commandline.
@@ -1072,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1072static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1040static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1073static inline unsigned long kmem_cache_flags(unsigned long objsize, 1041static inline unsigned long kmem_cache_flags(unsigned long objsize,
1074 unsigned long flags, const char *name, 1042 unsigned long flags, const char *name,
1075 void (*ctor)(struct kmem_cache *, void *)) 1043 void (*ctor)(void *))
1076{ 1044{
1077 return flags; 1045 return flags;
1078} 1046}
@@ -1135,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
1135{ 1103{
1136 setup_object_debug(s, page, object); 1104 setup_object_debug(s, page, object);
1137 if (unlikely(s->ctor)) 1105 if (unlikely(s->ctor))
1138 s->ctor(s, object); 1106 s->ctor(object);
1139} 1107}
1140 1108
1141static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1109static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1157,7 +1125,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1157 page->flags |= 1 << PG_slab; 1125 page->flags |= 1 << PG_slab;
1158 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1126 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
1159 SLAB_STORE_USER | SLAB_TRACE)) 1127 SLAB_STORE_USER | SLAB_TRACE))
1160 SetSlabDebug(page); 1128 __SetPageSlubDebug(page);
1161 1129
1162 start = page_address(page); 1130 start = page_address(page);
1163 1131
@@ -1184,14 +1152,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1184 int order = compound_order(page); 1152 int order = compound_order(page);
1185 int pages = 1 << order; 1153 int pages = 1 << order;
1186 1154
1187 if (unlikely(SlabDebug(page))) { 1155 if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
1188 void *p; 1156 void *p;
1189 1157
1190 slab_pad_check(s, page); 1158 slab_pad_check(s, page);
1191 for_each_object(p, s, page_address(page), 1159 for_each_object(p, s, page_address(page),
1192 page->objects) 1160 page->objects)
1193 check_object(s, page, p, 0); 1161 check_object(s, page, p, 0);
1194 ClearSlabDebug(page); 1162 __ClearPageSlubDebug(page);
1195 } 1163 }
1196 1164
1197 mod_zone_page_state(page_zone(page), 1165 mod_zone_page_state(page_zone(page),
@@ -1288,7 +1256,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1288 if (slab_trylock(page)) { 1256 if (slab_trylock(page)) {
1289 list_del(&page->lru); 1257 list_del(&page->lru);
1290 n->nr_partial--; 1258 n->nr_partial--;
1291 SetSlabFrozen(page); 1259 __SetPageSlubFrozen(page);
1292 return 1; 1260 return 1;
1293 } 1261 }
1294 return 0; 1262 return 0;
@@ -1398,7 +1366,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1398 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1366 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1399 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); 1367 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1400 1368
1401 ClearSlabFrozen(page); 1369 __ClearPageSlubFrozen(page);
1402 if (page->inuse) { 1370 if (page->inuse) {
1403 1371
1404 if (page->freelist) { 1372 if (page->freelist) {
@@ -1406,7 +1374,8 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1406 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1374 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1407 } else { 1375 } else {
1408 stat(c, DEACTIVATE_FULL); 1376 stat(c, DEACTIVATE_FULL);
1409 if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) 1377 if (SLABDEBUG && PageSlubDebug(page) &&
1378 (s->flags & SLAB_STORE_USER))
1410 add_full(n, page); 1379 add_full(n, page);
1411 } 1380 }
1412 slab_unlock(page); 1381 slab_unlock(page);
@@ -1551,7 +1520,7 @@ load_freelist:
1551 object = c->page->freelist; 1520 object = c->page->freelist;
1552 if (unlikely(!object)) 1521 if (unlikely(!object))
1553 goto another_slab; 1522 goto another_slab;
1554 if (unlikely(SlabDebug(c->page))) 1523 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1555 goto debug; 1524 goto debug;
1556 1525
1557 c->freelist = object[c->offset]; 1526 c->freelist = object[c->offset];
@@ -1588,7 +1557,7 @@ new_slab:
1588 if (c->page) 1557 if (c->page)
1589 flush_slab(s, c); 1558 flush_slab(s, c);
1590 slab_lock(new); 1559 slab_lock(new);
1591 SetSlabFrozen(new); 1560 __SetPageSlubFrozen(new);
1592 c->page = new; 1561 c->page = new;
1593 goto load_freelist; 1562 goto load_freelist;
1594 } 1563 }
@@ -1674,7 +1643,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1674 stat(c, FREE_SLOWPATH); 1643 stat(c, FREE_SLOWPATH);
1675 slab_lock(page); 1644 slab_lock(page);
1676 1645
1677 if (unlikely(SlabDebug(page))) 1646 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1678 goto debug; 1647 goto debug;
1679 1648
1680checks_ok: 1649checks_ok:
@@ -1682,7 +1651,7 @@ checks_ok:
1682 page->freelist = object; 1651 page->freelist = object;
1683 page->inuse--; 1652 page->inuse--;
1684 1653
1685 if (unlikely(SlabFrozen(page))) { 1654 if (unlikely(PageSlubFrozen(page))) {
1686 stat(c, FREE_FROZEN); 1655 stat(c, FREE_FROZEN);
1687 goto out_unlock; 1656 goto out_unlock;
1688 } 1657 }
@@ -2317,7 +2286,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2317static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2286static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2318 const char *name, size_t size, 2287 const char *name, size_t size,
2319 size_t align, unsigned long flags, 2288 size_t align, unsigned long flags,
2320 void (*ctor)(struct kmem_cache *, void *)) 2289 void (*ctor)(void *))
2321{ 2290{
2322 memset(s, 0, kmem_size); 2291 memset(s, 0, kmem_size);
2323 s->name = name; 2292 s->name = name;
@@ -3073,7 +3042,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3073 3042
3074static struct kmem_cache *find_mergeable(size_t size, 3043static struct kmem_cache *find_mergeable(size_t size,
3075 size_t align, unsigned long flags, const char *name, 3044 size_t align, unsigned long flags, const char *name,
3076 void (*ctor)(struct kmem_cache *, void *)) 3045 void (*ctor)(void *))
3077{ 3046{
3078 struct kmem_cache *s; 3047 struct kmem_cache *s;
3079 3048
@@ -3113,8 +3082,7 @@ static struct kmem_cache *find_mergeable(size_t size,
3113} 3082}
3114 3083
3115struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3084struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3116 size_t align, unsigned long flags, 3085 size_t align, unsigned long flags, void (*ctor)(void *))
3117 void (*ctor)(struct kmem_cache *, void *))
3118{ 3086{
3119 struct kmem_cache *s; 3087 struct kmem_cache *s;
3120 3088
@@ -3317,12 +3285,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3317 s->name, page); 3285 s->name, page);
3318 3286
3319 if (s->flags & DEBUG_DEFAULT_FLAGS) { 3287 if (s->flags & DEBUG_DEFAULT_FLAGS) {
3320 if (!SlabDebug(page)) 3288 if (!PageSlubDebug(page))
3321 printk(KERN_ERR "SLUB %s: SlabDebug not set " 3289 printk(KERN_ERR "SLUB %s: SlubDebug not set "
3322 "on slab 0x%p\n", s->name, page); 3290 "on slab 0x%p\n", s->name, page);
3323 } else { 3291 } else {
3324 if (SlabDebug(page)) 3292 if (PageSlubDebug(page))
3325 printk(KERN_ERR "SLUB %s: SlabDebug set on " 3293 printk(KERN_ERR "SLUB %s: SlubDebug set on "
3326 "slab 0x%p\n", s->name, page); 3294 "slab 0x%p\n", s->name, page);
3327 } 3295 }
3328} 3296}
diff --git a/mm/sparse.c b/mm/sparse.c
index 36511c7b5e2c..5d9dbbb9d39e 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -12,6 +12,7 @@
12#include <asm/dma.h> 12#include <asm/dma.h>
13#include <asm/pgalloc.h> 13#include <asm/pgalloc.h>
14#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include "internal.h"
15 16
16/* 17/*
17 * Permanent SPARSEMEM data: 18 * Permanent SPARSEMEM data:
@@ -147,22 +148,41 @@ static inline int sparse_early_nid(struct mem_section *section)
147 return (section->section_mem_map >> SECTION_NID_SHIFT); 148 return (section->section_mem_map >> SECTION_NID_SHIFT);
148} 149}
149 150
150/* Record a memory area against a node. */ 151/* Validate the physical addressing limitations of the model */
151void __init memory_present(int nid, unsigned long start, unsigned long end) 152void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
153 unsigned long *end_pfn)
152{ 154{
153 unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); 155 unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
154 unsigned long pfn;
155 156
156 /* 157 /*
157 * Sanity checks - do not allow an architecture to pass 158 * Sanity checks - do not allow an architecture to pass
158 * in larger pfns than the maximum scope of sparsemem: 159 * in larger pfns than the maximum scope of sparsemem:
159 */ 160 */
160 if (start >= max_arch_pfn) 161 if (*start_pfn > max_sparsemem_pfn) {
161 return; 162 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
162 if (end >= max_arch_pfn) 163 "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
163 end = max_arch_pfn; 164 *start_pfn, *end_pfn, max_sparsemem_pfn);
165 WARN_ON_ONCE(1);
166 *start_pfn = max_sparsemem_pfn;
167 *end_pfn = max_sparsemem_pfn;
168 }
169
170 if (*end_pfn > max_sparsemem_pfn) {
171 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
172 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
173 *start_pfn, *end_pfn, max_sparsemem_pfn);
174 WARN_ON_ONCE(1);
175 *end_pfn = max_sparsemem_pfn;
176 }
177}
178
179/* Record a memory area against a node. */
180void __init memory_present(int nid, unsigned long start, unsigned long end)
181{
182 unsigned long pfn;
164 183
165 start &= PAGE_SECTION_MASK; 184 start &= PAGE_SECTION_MASK;
185 mminit_validate_memmodel_limits(&start, &end);
166 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { 186 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
167 unsigned long section = pfn_to_section_nr(pfn); 187 unsigned long section = pfn_to_section_nr(pfn);
168 struct mem_section *ms; 188 struct mem_section *ms;
@@ -187,6 +207,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
187 unsigned long pfn; 207 unsigned long pfn;
188 unsigned long nr_pages = 0; 208 unsigned long nr_pages = 0;
189 209
210 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
190 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 211 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
191 if (nid != early_pfn_to_nid(pfn)) 212 if (nid != early_pfn_to_nid(pfn))
192 continue; 213 continue;
@@ -248,16 +269,92 @@ static unsigned long *__kmalloc_section_usemap(void)
248} 269}
249#endif /* CONFIG_MEMORY_HOTPLUG */ 270#endif /* CONFIG_MEMORY_HOTPLUG */
250 271
272#ifdef CONFIG_MEMORY_HOTREMOVE
273static unsigned long * __init
274sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
275{
276 unsigned long section_nr;
277
278 /*
279 * A page may contain usemaps for other sections preventing the
280 * page being freed and making a section unremovable while
281	 * other sections referencing the usemap remain active. Similarly,
282 * a pgdat can prevent a section being removed. If section A
283 * contains a pgdat and section B contains the usemap, both
284 * sections become inter-dependent. This allocates usemaps
285 * from the same section as the pgdat where possible to avoid
286 * this problem.
287 */
288 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
289 return alloc_bootmem_section(usemap_size(), section_nr);
290}
291
292static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
293{
294 unsigned long usemap_snr, pgdat_snr;
295 static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
296 static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
297 struct pglist_data *pgdat = NODE_DATA(nid);
298 int usemap_nid;
299
300 usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
301 pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
302 if (usemap_snr == pgdat_snr)
303 return;
304
305 if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
306 /* skip redundant message */
307 return;
308
309 old_usemap_snr = usemap_snr;
310 old_pgdat_snr = pgdat_snr;
311
312 usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
313 if (usemap_nid != nid) {
314 printk(KERN_INFO
315 "node %d must be removed before remove section %ld\n",
316 nid, usemap_snr);
317 return;
318 }
319 /*
320 * There is a circular dependency.
321	 * Some platforms allow an un-removable section because they will just
322	 * gather other removable sections for dynamic partitioning.
323	 * Just report the un-removable section's number here.
324 */
325 printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
326 pgdat_snr, nid);
327 printk(KERN_CONT
328 " have a circular dependency on usemap and pgdat allocations\n");
329}
330#else
331static unsigned long * __init
332sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
333{
334 return NULL;
335}
336
337static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
338{
339}
340#endif /* CONFIG_MEMORY_HOTREMOVE */
341
251static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 342static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
252{ 343{
253 unsigned long *usemap; 344 unsigned long *usemap;
254 struct mem_section *ms = __nr_to_section(pnum); 345 struct mem_section *ms = __nr_to_section(pnum);
255 int nid = sparse_early_nid(ms); 346 int nid = sparse_early_nid(ms);
256 347
257 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 348 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
258 if (usemap) 349 if (usemap)
259 return usemap; 350 return usemap;
260 351
352 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
353 if (usemap) {
354 check_usemap_section_nr(nid, usemap);
355 return usemap;
356 }
357
261 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 358 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
262 nid = 0; 359 nid = 0;
263 360
@@ -280,7 +377,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
280} 377}
281#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 378#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
282 379
283struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 380static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
284{ 381{
285 struct page *map; 382 struct page *map;
286 struct mem_section *ms = __nr_to_section(pnum); 383 struct mem_section *ms = __nr_to_section(pnum);
diff --git a/mm/swap.c b/mm/swap.c
index 45c9f25a8a3b..dd89234ee51f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,9 +34,9 @@
34/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 35int page_cluster;
36 36
37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; 37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs);
38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; 38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs);
39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; 39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
40 40
41/* 41/*
42 * This path almost never happens for VM activity - pages are normally 42 * This path almost never happens for VM activity - pages are normally
@@ -493,7 +493,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
493 */ 493 */
494#define ACCT_THRESHOLD max(16, NR_CPUS * 2) 494#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
495 495
496static DEFINE_PER_CPU(long, committed_space) = 0; 496static DEFINE_PER_CPU(long, committed_space);
497 497
498void vm_acct_memory(long pages) 498void vm_acct_memory(long pages)
499{ 499{
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..b8035b055129 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = {
39 39
40struct address_space swapper_space = { 40struct address_space swapper_space = {
41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), 42 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
43 .a_ops = &swap_aops, 43 .a_ops = &swap_aops,
44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
45 .backing_dev_info = &swap_backing_dev_info, 45 .backing_dev_info = &swap_backing_dev_info,
@@ -56,7 +56,8 @@ static struct {
56 56
57void show_swap_cache_info(void) 57void show_swap_cache_info(void)
58{ 58{
59 printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", 59 printk("%lu pages in swap cache\n", total_swapcache_pages);
60 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swap_cache_info.del_total, 61 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, swap_cache_info.find_total); 62 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 63 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
@@ -64,7 +65,7 @@ void show_swap_cache_info(void)
64} 65}
65 66
66/* 67/*
67 * add_to_swap_cache resembles add_to_page_cache on swapper_space, 68 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
68 * but sets SwapCache flag and private instead of mapping and index. 69 * but sets SwapCache flag and private instead of mapping and index.
69 */ 70 */
70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 71int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -76,19 +77,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
76 BUG_ON(PagePrivate(page)); 77 BUG_ON(PagePrivate(page));
77 error = radix_tree_preload(gfp_mask); 78 error = radix_tree_preload(gfp_mask);
78 if (!error) { 79 if (!error) {
79 write_lock_irq(&swapper_space.tree_lock); 80 page_cache_get(page);
81 SetPageSwapCache(page);
82 set_page_private(page, entry.val);
83
84 spin_lock_irq(&swapper_space.tree_lock);
80 error = radix_tree_insert(&swapper_space.page_tree, 85 error = radix_tree_insert(&swapper_space.page_tree,
81 entry.val, page); 86 entry.val, page);
82 if (!error) { 87 if (likely(!error)) {
83 page_cache_get(page);
84 SetPageSwapCache(page);
85 set_page_private(page, entry.val);
86 total_swapcache_pages++; 88 total_swapcache_pages++;
87 __inc_zone_page_state(page, NR_FILE_PAGES); 89 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_total); 90 INC_CACHE_INFO(add_total);
89 } 91 }
90 write_unlock_irq(&swapper_space.tree_lock); 92 spin_unlock_irq(&swapper_space.tree_lock);
91 radix_tree_preload_end(); 93 radix_tree_preload_end();
94
95 if (unlikely(error)) {
96 set_page_private(page, 0UL);
97 ClearPageSwapCache(page);
98 page_cache_release(page);
99 }
92 } 100 }
93 return error; 101 return error;
94} 102}
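
The reordering in add_to_swap_cache() above is a consequence of the lockless lookup side: once the tree can be probed without tree_lock, a page has to look like a fully formed swapcache page (elevated refcount, PageSwapCache, page_private set) at the instant radix_tree_insert() makes it visible, and that setup has to be undone if the insert fails. Condensed from the hunk, the ordering is:

	page_cache_get(page);			/* the swap cache's reference */
	SetPageSwapCache(page);
	set_page_private(page, entry.val);

	spin_lock_irq(&swapper_space.tree_lock);
	error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
	spin_unlock_irq(&swapper_space.tree_lock);

	if (unlikely(error)) {
		/* the insert failed, so no lockless lookup ever saw the
		 * page; roll back the setup done above */
		set_page_private(page, 0UL);
		ClearPageSwapCache(page);
		page_cache_release(page);
	}
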
@@ -175,9 +183,9 @@ void delete_from_swap_cache(struct page *page)
175 183
176 entry.val = page_private(page); 184 entry.val = page_private(page);
177 185
178 write_lock_irq(&swapper_space.tree_lock); 186 spin_lock_irq(&swapper_space.tree_lock);
179 __delete_from_swap_cache(page); 187 __delete_from_swap_cache(page);
180 write_unlock_irq(&swapper_space.tree_lock); 188 spin_unlock_irq(&swapper_space.tree_lock);
181 189
182 swap_free(entry); 190 swap_free(entry);
183 page_cache_release(page); 191 page_cache_release(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bd1bb5920306..6beb6251e99d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,17 +33,18 @@
33#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
34#include <linux/swapops.h> 34#include <linux/swapops.h>
35 35
36DEFINE_SPINLOCK(swap_lock); 36static DEFINE_SPINLOCK(swap_lock);
37unsigned int nr_swapfiles; 37static unsigned int nr_swapfiles;
38long total_swap_pages; 38long total_swap_pages;
39static int swap_overflow; 39static int swap_overflow;
40static int least_priority;
40 41
41static const char Bad_file[] = "Bad swap file entry "; 42static const char Bad_file[] = "Bad swap file entry ";
42static const char Unused_file[] = "Unused swap file entry "; 43static const char Unused_file[] = "Unused swap file entry ";
43static const char Bad_offset[] = "Bad swap offset entry "; 44static const char Bad_offset[] = "Bad swap offset entry ";
44static const char Unused_offset[] = "Unused swap offset entry "; 45static const char Unused_offset[] = "Unused swap offset entry ";
45 46
46struct swap_list_t swap_list = {-1, -1}; 47static struct swap_list_t swap_list = {-1, -1};
47 48
48static struct swap_info_struct swap_info[MAX_SWAPFILES]; 49static struct swap_info_struct swap_info[MAX_SWAPFILES];
49 50
@@ -368,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page)
368 retval = 0; 369 retval = 0;
369 if (p->swap_map[swp_offset(entry)] == 1) { 370 if (p->swap_map[swp_offset(entry)] == 1) {
370 /* Recheck the page count with the swapcache lock held.. */ 371 /* Recheck the page count with the swapcache lock held.. */
371 write_lock_irq(&swapper_space.tree_lock); 372 spin_lock_irq(&swapper_space.tree_lock);
372 if ((page_count(page) == 2) && !PageWriteback(page)) { 373 if ((page_count(page) == 2) && !PageWriteback(page)) {
373 __delete_from_swap_cache(page); 374 __delete_from_swap_cache(page);
374 SetPageDirty(page); 375 SetPageDirty(page);
375 retval = 1; 376 retval = 1;
376 } 377 }
377 write_unlock_irq(&swapper_space.tree_lock); 378 spin_unlock_irq(&swapper_space.tree_lock);
378 } 379 }
379 spin_unlock(&swap_lock); 380 spin_unlock(&swap_lock);
380 381
@@ -1260,6 +1261,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1260 /* just pick something that's safe... */ 1261 /* just pick something that's safe... */
1261 swap_list.next = swap_list.head; 1262 swap_list.next = swap_list.head;
1262 } 1263 }
1264 if (p->prio < 0) {
1265 for (i = p->next; i >= 0; i = swap_info[i].next)
1266 swap_info[i].prio = p->prio--;
1267 least_priority++;
1268 }
1263 nr_swap_pages -= p->pages; 1269 nr_swap_pages -= p->pages;
1264 total_swap_pages -= p->pages; 1270 total_swap_pages -= p->pages;
1265 p->flags &= ~SWP_WRITEOK; 1271 p->flags &= ~SWP_WRITEOK;
@@ -1272,9 +1278,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1272 if (err) { 1278 if (err) {
1273 /* re-insert swap space back into swap_list */ 1279 /* re-insert swap space back into swap_list */
1274 spin_lock(&swap_lock); 1280 spin_lock(&swap_lock);
1275 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) 1281 if (p->prio < 0)
1282 p->prio = --least_priority;
1283 prev = -1;
1284 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1276 if (p->prio >= swap_info[i].prio) 1285 if (p->prio >= swap_info[i].prio)
1277 break; 1286 break;
1287 prev = i;
1288 }
1278 p->next = i; 1289 p->next = i;
1279 if (prev < 0) 1290 if (prev < 0)
1280 swap_list.head = swap_list.next = p - swap_info; 1291 swap_list.head = swap_list.next = p - swap_info;
@@ -1447,7 +1458,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1447 unsigned int type; 1458 unsigned int type;
1448 int i, prev; 1459 int i, prev;
1449 int error; 1460 int error;
1450 static int least_priority;
1451 union swap_header *swap_header = NULL; 1461 union swap_header *swap_header = NULL;
1452 int swap_header_version; 1462 int swap_header_version;
1453 unsigned int nr_good_pages = 0; 1463 unsigned int nr_good_pages = 0;
@@ -1455,7 +1465,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1455 sector_t span; 1465 sector_t span;
1456 unsigned long maxpages = 1; 1466 unsigned long maxpages = 1;
1457 int swapfilesize; 1467 int swapfilesize;
1458 unsigned short *swap_map; 1468 unsigned short *swap_map = NULL;
1459 struct page *page = NULL; 1469 struct page *page = NULL;
1460 struct inode *inode = NULL; 1470 struct inode *inode = NULL;
1461 int did_down = 0; 1471 int did_down = 0;
@@ -1474,22 +1484,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1474 } 1484 }
1475 if (type >= nr_swapfiles) 1485 if (type >= nr_swapfiles)
1476 nr_swapfiles = type+1; 1486 nr_swapfiles = type+1;
1487 memset(p, 0, sizeof(*p));
1477 INIT_LIST_HEAD(&p->extent_list); 1488 INIT_LIST_HEAD(&p->extent_list);
1478 p->flags = SWP_USED; 1489 p->flags = SWP_USED;
1479 p->swap_file = NULL;
1480 p->old_block_size = 0;
1481 p->swap_map = NULL;
1482 p->lowest_bit = 0;
1483 p->highest_bit = 0;
1484 p->cluster_nr = 0;
1485 p->inuse_pages = 0;
1486 p->next = -1; 1490 p->next = -1;
1487 if (swap_flags & SWAP_FLAG_PREFER) {
1488 p->prio =
1489 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
1490 } else {
1491 p->prio = --least_priority;
1492 }
1493 spin_unlock(&swap_lock); 1491 spin_unlock(&swap_lock);
1494 name = getname(specialfile); 1492 name = getname(specialfile);
1495 error = PTR_ERR(name); 1493 error = PTR_ERR(name);
@@ -1632,19 +1630,20 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1632 goto bad_swap; 1630 goto bad_swap;
1633 1631
1634 /* OK, set up the swap map and apply the bad block list */ 1632 /* OK, set up the swap map and apply the bad block list */
1635 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1633 swap_map = vmalloc(maxpages * sizeof(short));
1634 if (!swap_map) {
1636 error = -ENOMEM; 1635 error = -ENOMEM;
1637 goto bad_swap; 1636 goto bad_swap;
1638 } 1637 }
1639 1638
1640 error = 0; 1639 error = 0;
1641 memset(p->swap_map, 0, maxpages * sizeof(short)); 1640 memset(swap_map, 0, maxpages * sizeof(short));
1642 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1641 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1643 int page_nr = swap_header->info.badpages[i]; 1642 int page_nr = swap_header->info.badpages[i];
1644 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1643 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1645 error = -EINVAL; 1644 error = -EINVAL;
1646 else 1645 else
1647 p->swap_map[page_nr] = SWAP_MAP_BAD; 1646 swap_map[page_nr] = SWAP_MAP_BAD;
1648 } 1647 }
1649 nr_good_pages = swap_header->info.last_page - 1648 nr_good_pages = swap_header->info.last_page -
1650 swap_header->info.nr_badpages - 1649 swap_header->info.nr_badpages -
@@ -1654,7 +1653,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1654 } 1653 }
1655 1654
1656 if (nr_good_pages) { 1655 if (nr_good_pages) {
1657 p->swap_map[0] = SWAP_MAP_BAD; 1656 swap_map[0] = SWAP_MAP_BAD;
1658 p->max = maxpages; 1657 p->max = maxpages;
1659 p->pages = nr_good_pages; 1658 p->pages = nr_good_pages;
1660 nr_extents = setup_swap_extents(p, &span); 1659 nr_extents = setup_swap_extents(p, &span);
@@ -1672,6 +1671,12 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1672 1671
1673 mutex_lock(&swapon_mutex); 1672 mutex_lock(&swapon_mutex);
1674 spin_lock(&swap_lock); 1673 spin_lock(&swap_lock);
1674 if (swap_flags & SWAP_FLAG_PREFER)
1675 p->prio =
1676 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
1677 else
1678 p->prio = --least_priority;
1679 p->swap_map = swap_map;
1675 p->flags = SWP_ACTIVE; 1680 p->flags = SWP_ACTIVE;
1676 nr_swap_pages += nr_good_pages; 1681 nr_swap_pages += nr_good_pages;
1677 total_swap_pages += nr_good_pages; 1682 total_swap_pages += nr_good_pages;
@@ -1707,12 +1712,8 @@ bad_swap:
1707 destroy_swap_extents(p); 1712 destroy_swap_extents(p);
1708bad_swap_2: 1713bad_swap_2:
1709 spin_lock(&swap_lock); 1714 spin_lock(&swap_lock);
1710 swap_map = p->swap_map;
1711 p->swap_file = NULL; 1715 p->swap_file = NULL;
1712 p->swap_map = NULL;
1713 p->flags = 0; 1716 p->flags = 0;
1714 if (!(swap_flags & SWAP_FLAG_PREFER))
1715 ++least_priority;
1716 spin_unlock(&swap_lock); 1717 spin_unlock(&swap_lock);
1717 vfree(swap_map); 1718 vfree(swap_map);
1718 if (swap_file) 1719 if (swap_file)
diff --git a/mm/truncate.c b/mm/truncate.c
index b8961cb63414..e68443d74567 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -349,18 +349,18 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
349 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 349 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
350 return 0; 350 return 0;
351 351
352 write_lock_irq(&mapping->tree_lock); 352 spin_lock_irq(&mapping->tree_lock);
353 if (PageDirty(page)) 353 if (PageDirty(page))
354 goto failed; 354 goto failed;
355 355
356 BUG_ON(PagePrivate(page)); 356 BUG_ON(PagePrivate(page));
357 __remove_from_page_cache(page); 357 __remove_from_page_cache(page);
358 write_unlock_irq(&mapping->tree_lock); 358 spin_unlock_irq(&mapping->tree_lock);
359 ClearPageUptodate(page); 359 ClearPageUptodate(page);
360 page_cache_release(page); /* pagecache ref */ 360 page_cache_release(page); /* pagecache ref */
361 return 1; 361 return 1;
362failed: 362failed:
363 write_unlock_irq(&mapping->tree_lock); 363 spin_unlock_irq(&mapping->tree_lock);
364 return 0; 364 return 0;
365} 365}
366 366
diff --git a/mm/util.c b/mm/util.c
index 6ef9e9943f62..9341ca77bd88 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,9 @@
1#include <linux/mm.h>
1#include <linux/slab.h> 2#include <linux/slab.h>
2#include <linux/string.h> 3#include <linux/string.h>
3#include <linux/module.h> 4#include <linux/module.h>
4#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h>
5#include <asm/uaccess.h> 7#include <asm/uaccess.h>
6 8
7/** 9/**
@@ -160,3 +162,12 @@ char *strndup_user(const char __user *s, long n)
160 return p; 162 return p;
161} 163}
162EXPORT_SYMBOL(strndup_user); 164EXPORT_SYMBOL(strndup_user);
165
166#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
167void arch_pick_mmap_layout(struct mm_struct *mm)
168{
169 mm->mmap_base = TASK_UNMAPPED_BASE;
170 mm->get_unmapped_area = arch_get_unmapped_area;
171 mm->unmap_area = arch_unmap_area;
172}
173#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6e45b0f3d125..85b9a0d2c877 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -381,16 +381,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
381 return; 381 return;
382 382
383 if ((PAGE_SIZE-1) & (unsigned long)addr) { 383 if ((PAGE_SIZE-1) & (unsigned long)addr) {
384 printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 384 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
385 WARN_ON(1);
386 return; 385 return;
387 } 386 }
388 387
389 area = remove_vm_area(addr); 388 area = remove_vm_area(addr);
390 if (unlikely(!area)) { 389 if (unlikely(!area)) {
391 printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 390 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
392 addr); 391 addr);
393 WARN_ON(1);
394 return; 392 return;
395 } 393 }
396 394
@@ -931,6 +929,25 @@ static void s_stop(struct seq_file *m, void *p)
931 read_unlock(&vmlist_lock); 929 read_unlock(&vmlist_lock);
932} 930}
933 931
932static void show_numa_info(struct seq_file *m, struct vm_struct *v)
933{
934 if (NUMA_BUILD) {
935 unsigned int nr, *counters = m->private;
936
937 if (!counters)
938 return;
939
940 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
941
942 for (nr = 0; nr < v->nr_pages; nr++)
943 counters[page_to_nid(v->pages[nr])]++;
944
945 for_each_node_state(nr, N_HIGH_MEMORY)
946 if (counters[nr])
947 seq_printf(m, " N%u=%u", nr, counters[nr]);
948 }
949}
950
934static int s_show(struct seq_file *m, void *p) 951static int s_show(struct seq_file *m, void *p)
935{ 952{
936 struct vm_struct *v = p; 953 struct vm_struct *v = p;
@@ -967,6 +984,7 @@ static int s_show(struct seq_file *m, void *p)
967 if (v->flags & VM_VPAGES) 984 if (v->flags & VM_VPAGES)
968 seq_printf(m, " vpages"); 985 seq_printf(m, " vpages");
969 986
987 show_numa_info(m, v);
970 seq_putc(m, '\n'); 988 seq_putc(m, '\n');
971 return 0; 989 return 0;
972} 990}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 967d30ccd92b..8f71761bc4b7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -38,6 +38,7 @@
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/delayacct.h>
41 42
42#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
43#include <asm/div64.h> 44#include <asm/div64.h>
@@ -390,17 +391,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
390} 391}
391 392
392/* 393/*
393 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 394 * Same as remove_mapping, but if the page is removed from the mapping, it
394 * someone else has a ref on the page, abort and return 0. If it was 395 * gets returned with a refcount of 0.
395 * successfully detached, return 1. Assumes the caller has a single ref on
396 * this page.
397 */ 396 */
398int remove_mapping(struct address_space *mapping, struct page *page) 397static int __remove_mapping(struct address_space *mapping, struct page *page)
399{ 398{
400 BUG_ON(!PageLocked(page)); 399 BUG_ON(!PageLocked(page));
401 BUG_ON(mapping != page_mapping(page)); 400 BUG_ON(mapping != page_mapping(page));
402 401
403 write_lock_irq(&mapping->tree_lock); 402 spin_lock_irq(&mapping->tree_lock);
404 /* 403 /*
405 * The non racy check for a busy page. 404 * The non racy check for a busy page.
406 * 405 *
@@ -426,28 +425,48 @@ int remove_mapping(struct address_space *mapping, struct page *page)
426 * Note that if SetPageDirty is always performed via set_page_dirty, 425 * Note that if SetPageDirty is always performed via set_page_dirty,
427 * and thus under tree_lock, then this ordering is not required. 426 * and thus under tree_lock, then this ordering is not required.
428 */ 427 */
429 if (unlikely(page_count(page) != 2)) 428 if (!page_freeze_refs(page, 2))
430 goto cannot_free; 429 goto cannot_free;
431 smp_rmb(); 430 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
432 if (unlikely(PageDirty(page))) 431 if (unlikely(PageDirty(page))) {
432 page_unfreeze_refs(page, 2);
433 goto cannot_free; 433 goto cannot_free;
434 }
434 435
435 if (PageSwapCache(page)) { 436 if (PageSwapCache(page)) {
436 swp_entry_t swap = { .val = page_private(page) }; 437 swp_entry_t swap = { .val = page_private(page) };
437 __delete_from_swap_cache(page); 438 __delete_from_swap_cache(page);
438 write_unlock_irq(&mapping->tree_lock); 439 spin_unlock_irq(&mapping->tree_lock);
439 swap_free(swap); 440 swap_free(swap);
440 __put_page(page); /* The pagecache ref */ 441 } else {
441 return 1; 442 __remove_from_page_cache(page);
443 spin_unlock_irq(&mapping->tree_lock);
442 } 444 }
443 445
444 __remove_from_page_cache(page);
445 write_unlock_irq(&mapping->tree_lock);
446 __put_page(page);
447 return 1; 446 return 1;
448 447
449cannot_free: 448cannot_free:
450 write_unlock_irq(&mapping->tree_lock); 449 spin_unlock_irq(&mapping->tree_lock);
450 return 0;
451}
452
453/*
454 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
455 * someone else has a ref on the page, abort and return 0. If it was
456 * successfully detached, return 1. Assumes the caller has a single ref on
457 * this page.
458 */
459int remove_mapping(struct address_space *mapping, struct page *page)
460{
461 if (__remove_mapping(mapping, page)) {
462 /*
463 * Unfreezing the refcount with 1 rather than 2 effectively
464 * drops the pagecache ref for us without requiring another
465 * atomic operation.
466 */
467 page_unfreeze_refs(page, 1);
468 return 1;
469 }
451 return 0; 470 return 0;
452} 471}
453 472
@@ -597,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list,
597 if (PagePrivate(page)) { 616 if (PagePrivate(page)) {
598 if (!try_to_release_page(page, sc->gfp_mask)) 617 if (!try_to_release_page(page, sc->gfp_mask))
599 goto activate_locked; 618 goto activate_locked;
600 if (!mapping && page_count(page) == 1) 619 if (!mapping && page_count(page) == 1) {
601 goto free_it; 620 unlock_page(page);
621 if (put_page_testzero(page))
622 goto free_it;
623 else {
624 /*
625 * rare race with speculative reference.
626 * the speculative reference will free
627 * this page shortly, so we may
628 * increment nr_reclaimed here (and
629 * leave it off the LRU).
630 */
631 nr_reclaimed++;
632 continue;
633 }
634 }
602 } 635 }
603 636
604 if (!mapping || !remove_mapping(mapping, page)) 637 if (!mapping || !__remove_mapping(mapping, page))
605 goto keep_locked; 638 goto keep_locked;
606 639
607free_it:
608 unlock_page(page); 640 unlock_page(page);
641free_it:
609 nr_reclaimed++; 642 nr_reclaimed++;
610 if (!pagevec_add(&freed_pvec, page)) 643 if (!pagevec_add(&freed_pvec, page)) {
611 __pagevec_release_nonlru(&freed_pvec); 644 __pagevec_free(&freed_pvec);
645 pagevec_reinit(&freed_pvec);
646 }
612 continue; 647 continue;
613 648
614activate_locked: 649activate_locked:
@@ -622,7 +657,7 @@ keep:
622 } 657 }
623 list_splice(&ret_pages, page_list); 658 list_splice(&ret_pages, page_list);
624 if (pagevec_count(&freed_pvec)) 659 if (pagevec_count(&freed_pvec))
625 __pagevec_release_nonlru(&freed_pvec); 660 __pagevec_free(&freed_pvec);
626 count_vm_events(PGACTIVATE, pgactivate); 661 count_vm_events(PGACTIVATE, pgactivate);
627 return nr_reclaimed; 662 return nr_reclaimed;
628} 663}
@@ -1316,6 +1351,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1316 struct zone *zone; 1351 struct zone *zone;
1317 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1352 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1318 1353
1354 delayacct_freepages_start();
1355
1319 if (scan_global_lru(sc)) 1356 if (scan_global_lru(sc))
1320 count_vm_event(ALLOCSTALL); 1357 count_vm_event(ALLOCSTALL);
1321 /* 1358 /*
@@ -1396,6 +1433,8 @@ out:
1396 } else 1433 } else
1397 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); 1434 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1398 1435
1436 delayacct_freepages_end();
1437
1399 return ret; 1438 return ret;
1400} 1439}
1401 1440
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db9eabb2c5b3..b0d08e667ece 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -13,6 +13,7 @@
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/vmstat.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17 18
18#ifdef CONFIG_VM_EVENT_COUNTERS 19#ifdef CONFIG_VM_EVENT_COUNTERS
@@ -26,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
26 27
27 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 28 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
28 29
29 for_each_cpu_mask(cpu, *cpumask) { 30 for_each_cpu_mask_nr(cpu, *cpumask) {
30 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 31 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
31 32
32 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 33 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)