Diffstat (limited to 'mm')
 mm/Makefile          |    3
 mm/allocpercpu.c     |    2
 mm/bootmem.c         |   27
 mm/dmapool.c         |  500
 mm/fadvise.c         |   16
 mm/filemap.c         |   27
 mm/filemap_xip.c     |    2
 mm/fremap.c          |    5
 mm/highmem.c         |    4
 mm/hugetlb.c         |    2
 mm/internal.h        |    4
 mm/memcontrol.c      | 1192
 mm/memory.c          |  256
 mm/memory_hotplug.c  |    6
 mm/migrate.c         |   54
 mm/mmap.c            |   10
 mm/nommu.c           |   53
 mm/oom_kill.c        |   90
 mm/page-writeback.c  |   24
 mm/page_alloc.c      |  162
 mm/page_io.c         |    2
 mm/pagewalk.c        |  131
 mm/rmap.c            |   53
 mm/shmem.c           |  517
 mm/slob.c            |   51
 mm/slub.c            |  182
 mm/sparse.c          |   12
 mm/swap.c            |   12
 mm/swap_state.c      |  153
 mm/swapfile.c        |  150
 mm/tiny-shmem.c      |   12
 mm/truncate.c        |   10
 mm/vmalloc.c         |   74
 mm/vmscan.c          |  495
 mm/vmstat.c          |   61
 35 files changed, 3308 insertions(+), 1046 deletions(-)
diff --git a/mm/Makefile b/mm/Makefile
index 5c0b0ea7572d..9f117bab5322 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -13,8 +13,10 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 	prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 	page_isolation.o $(mmu-y)
 
+obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE) += bounce.o
 obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_HAS_DMA) += dmapool.o
 obj-$(CONFIG_HUGETLBFS) += hugetlb.o
 obj-$(CONFIG_NUMA) += mempolicy.o
 obj-$(CONFIG_SPARSEMEM) += sparse.o
@@ -30,4 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
 
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 00b02623f008..7e58322b7134 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL_GPL(__percpu_populate_mask);
  */
 void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
 {
-        void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
+        void *pdata = kzalloc(nr_cpu_ids * sizeof(void *), gfp);
         void *__pdata = __percpu_disguise(pdata);
 
         if (unlikely(!pdata))
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 00a96970b237..f6ff4337b424 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -111,11 +111,12 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
  * might be used for boot-time allocations - or it might get added
  * to the free page pool later on.
  */
-static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
-                                        unsigned long size)
+static int __init reserve_bootmem_core(bootmem_data_t *bdata,
+                        unsigned long addr, unsigned long size, int flags)
 {
         unsigned long sidx, eidx;
         unsigned long i;
+        int ret;
 
         /*
          * round up, partially reserved pages are considered
@@ -133,7 +134,20 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long add
 #ifdef CONFIG_DEBUG_BOOTMEM
                         printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
 #endif
+                        if (flags & BOOTMEM_EXCLUSIVE) {
+                                ret = -EBUSY;
+                                goto err;
+                        }
                 }
+
+        return 0;
+
+err:
+        /* unreserve memory we accidentally reserved */
+        for (i--; i >= sidx; i--)
+                clear_bit(i, bdata->node_bootmem_map);
+
+        return ret;
 }
 
 static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
@@ -374,9 +388,9 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
 }
 
 void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
-                                 unsigned long size)
+                                 unsigned long size, int flags)
 {
-        reserve_bootmem_core(pgdat->bdata, physaddr, size);
+        reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
 }
 
 void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
@@ -398,9 +412,10 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 }
 
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-void __init reserve_bootmem(unsigned long addr, unsigned long size)
+int __init reserve_bootmem(unsigned long addr, unsigned long size,
+                           int flags)
 {
-        reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
+        return reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size, flags);
 }
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
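[Editor's note, not part of the patch] With this change reserve_bootmem() can fail, so a caller that needs an exclusive reservation is expected to check the return value. A minimal, hedged sketch of such a caller is below; the function and variable names (example_reserve_crashkernel, crash_base, crash_size) are invented for illustration, only reserve_bootmem() and BOOTMEM_EXCLUSIVE come from this patch.

/* Hedged sketch of using the new flags argument; names are illustrative. */
static void __init example_reserve_crashkernel(unsigned long crash_base,
                                               unsigned long crash_size)
{
        /* BOOTMEM_EXCLUSIVE: fail with -EBUSY instead of silently sharing */
        if (reserve_bootmem(crash_base, crash_size, BOOTMEM_EXCLUSIVE) < 0) {
                printk(KERN_INFO "crashkernel reservation failed - "
                                "memory is in use\n");
                return;
        }
        /* reservation succeeded; the range will not be handed out again */
}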
diff --git a/mm/dmapool.c b/mm/dmapool.c
new file mode 100644
index 000000000000..34aaac451a96
--- /dev/null
+++ b/mm/dmapool.c
@@ -0,0 +1,500 @@
+/*
+ * DMA Pool allocator
+ *
+ * Copyright 2001 David Brownell
+ * Copyright 2007 Intel Corporation
+ * Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 as published by the
+ * Free Software Foundation.
+ *
+ * This allocator returns small blocks of a given size which are DMA-able by
+ * the given device. It uses the dma_alloc_coherent page allocator to get
+ * new pages, then splits them up into blocks of the required size.
+ * Many older drivers still have their own code to do this.
+ *
+ * The current design of this allocator is fairly simple. The pool is
+ * represented by the 'struct dma_pool' which keeps a doubly-linked list of
+ * allocated pages. Each page in the page_list is split into blocks of at
+ * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked
+ * list of free blocks within the page. Used blocks aren't tracked, but we
+ * keep a count of how many are currently allocated from each page.
+ */
+
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/poison.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+struct dma_pool {       /* the pool */
+        struct list_head page_list;
+        spinlock_t lock;
+        size_t size;
+        struct device *dev;
+        size_t allocation;
+        size_t boundary;
+        char name[32];
+        wait_queue_head_t waitq;
+        struct list_head pools;
+};
+
+struct dma_page {       /* cacheable header for 'allocation' bytes */
+        struct list_head page_list;
+        void *vaddr;
+        dma_addr_t dma;
+        unsigned int in_use;
+        unsigned int offset;
+};
+
+#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
+
+static DEFINE_MUTEX(pools_lock);
+
+static ssize_t
+show_pools(struct device *dev, struct device_attribute *attr, char *buf)
+{
+        unsigned temp;
+        unsigned size;
+        char *next;
+        struct dma_page *page;
+        struct dma_pool *pool;
+
+        next = buf;
+        size = PAGE_SIZE;
+
+        temp = scnprintf(next, size, "poolinfo - 0.1\n");
+        size -= temp;
+        next += temp;
+
+        mutex_lock(&pools_lock);
+        list_for_each_entry(pool, &dev->dma_pools, pools) {
+                unsigned pages = 0;
+                unsigned blocks = 0;
+
+                list_for_each_entry(page, &pool->page_list, page_list) {
+                        pages++;
+                        blocks += page->in_use;
+                }
+
+                /* per-pool info, no real statistics yet */
+                temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
+                                 pool->name, blocks,
+                                 pages * (pool->allocation / pool->size),
+                                 pool->size, pages);
+                size -= temp;
+                next += temp;
+        }
+        mutex_unlock(&pools_lock);
+
+        return PAGE_SIZE - size;
+}
+
+static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL);
+
+/**
+ * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @boundary: returned blocks won't cross this power of two boundary
+ * Context: !in_interrupt()
+ *
+ * Returns a dma allocation pool with the requested characteristics, or
+ * null if one can't be created. Given one of these pools, dma_pool_alloc()
+ * may be used to allocate memory. Such memory will all have "consistent"
+ * DMA mappings, accessible by the device and its driver without using
+ * cache flushing primitives. The actual size of blocks allocated may be
+ * larger than requested because of alignment.
+ *
+ * If @boundary is nonzero, objects returned from dma_pool_alloc() won't
+ * cross that size boundary. This is useful for devices which have
+ * addressing restrictions on individual DMA transfers, such as not crossing
+ * boundaries of 4KBytes.
+ */
+struct dma_pool *dma_pool_create(const char *name, struct device *dev,
+                                 size_t size, size_t align, size_t boundary)
+{
+        struct dma_pool *retval;
+        size_t allocation;
+
+        if (align == 0) {
+                align = 1;
+        } else if (align & (align - 1)) {
+                return NULL;
+        }
+
+        if (size == 0) {
+                return NULL;
+        } else if (size < 4) {
+                size = 4;
+        }
+
+        if ((size % align) != 0)
+                size = ALIGN(size, align);
+
+        allocation = max_t(size_t, size, PAGE_SIZE);
+
+        if (!boundary) {
+                boundary = allocation;
+        } else if ((boundary < size) || (boundary & (boundary - 1))) {
+                return NULL;
+        }
+
+        retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
+        if (!retval)
+                return retval;
+
+        strlcpy(retval->name, name, sizeof(retval->name));
+
+        retval->dev = dev;
+
+        INIT_LIST_HEAD(&retval->page_list);
+        spin_lock_init(&retval->lock);
+        retval->size = size;
+        retval->boundary = boundary;
+        retval->allocation = allocation;
+        init_waitqueue_head(&retval->waitq);
+
+        if (dev) {
+                int ret;
+
+                mutex_lock(&pools_lock);
+                if (list_empty(&dev->dma_pools))
+                        ret = device_create_file(dev, &dev_attr_pools);
+                else
+                        ret = 0;
+                /* note: not currently insisting "name" be unique */
+                if (!ret)
+                        list_add(&retval->pools, &dev->dma_pools);
+                else {
+                        kfree(retval);
+                        retval = NULL;
+                }
+                mutex_unlock(&pools_lock);
+        } else
+                INIT_LIST_HEAD(&retval->pools);
+
+        return retval;
+}
+EXPORT_SYMBOL(dma_pool_create);
+
+static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
+{
+        unsigned int offset = 0;
+        unsigned int next_boundary = pool->boundary;
+
+        do {
+                unsigned int next = offset + pool->size;
+                if (unlikely((next + pool->size) >= next_boundary)) {
+                        next = next_boundary;
+                        next_boundary += pool->boundary;
+                }
+                *(int *)(page->vaddr + offset) = next;
+                offset = next;
+        } while (offset < pool->allocation);
+}
+
+static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
+{
+        struct dma_page *page;
+
+        page = kmalloc(sizeof(*page), mem_flags);
+        if (!page)
+                return NULL;
+        page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
+                                         &page->dma, mem_flags);
+        if (page->vaddr) {
+#ifdef CONFIG_DEBUG_SLAB
+                memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+                pool_initialise_page(pool, page);
+                list_add(&page->page_list, &pool->page_list);
+                page->in_use = 0;
+                page->offset = 0;
+        } else {
+                kfree(page);
+                page = NULL;
+        }
+        return page;
+}
+
+static inline int is_page_busy(struct dma_page *page)
+{
+        return page->in_use != 0;
+}
+
+static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
+{
+        dma_addr_t dma = page->dma;
+
+#ifdef CONFIG_DEBUG_SLAB
+        memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+        dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
+        list_del(&page->page_list);
+        kfree(page);
+}
+
+/**
+ * dma_pool_destroy - destroys a pool of dma memory blocks.
+ * @pool: dma pool that will be destroyed
+ * Context: !in_interrupt()
+ *
+ * Caller guarantees that no more memory from the pool is in use,
+ * and that nothing will try to use the pool after this call.
+ */
+void dma_pool_destroy(struct dma_pool *pool)
+{
+        mutex_lock(&pools_lock);
+        list_del(&pool->pools);
+        if (pool->dev && list_empty(&pool->dev->dma_pools))
+                device_remove_file(pool->dev, &dev_attr_pools);
+        mutex_unlock(&pools_lock);
+
+        while (!list_empty(&pool->page_list)) {
+                struct dma_page *page;
+                page = list_entry(pool->page_list.next,
+                                  struct dma_page, page_list);
+                if (is_page_busy(page)) {
+                        if (pool->dev)
+                                dev_err(pool->dev,
+                                        "dma_pool_destroy %s, %p busy\n",
+                                        pool->name, page->vaddr);
+                        else
+                                printk(KERN_ERR
+                                       "dma_pool_destroy %s, %p busy\n",
+                                       pool->name, page->vaddr);
+                        /* leak the still-in-use consistent memory */
+                        list_del(&page->page_list);
+                        kfree(page);
+                } else
+                        pool_free_page(pool, page);
+        }
+
+        kfree(pool);
+}
+EXPORT_SYMBOL(dma_pool_destroy);
+
+/**
+ * dma_pool_alloc - get a block of consistent memory
+ * @pool: dma pool that will produce the block
+ * @mem_flags: GFP_* bitmask
+ * @handle: pointer to dma address of block
+ *
+ * This returns the kernel virtual address of a currently unused block,
+ * and reports its dma address through the handle.
+ * If such a memory block can't be allocated, %NULL is returned.
+ */
+void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
+                     dma_addr_t *handle)
+{
+        unsigned long flags;
+        struct dma_page *page;
+        size_t offset;
+        void *retval;
+
+        spin_lock_irqsave(&pool->lock, flags);
+restart:
+        list_for_each_entry(page, &pool->page_list, page_list) {
+                if (page->offset < pool->allocation)
+                        goto ready;
+        }
+        page = pool_alloc_page(pool, GFP_ATOMIC);
+        if (!page) {
+                if (mem_flags & __GFP_WAIT) {
+                        DECLARE_WAITQUEUE(wait, current);
+
+                        __set_current_state(TASK_INTERRUPTIBLE);
+                        __add_wait_queue(&pool->waitq, &wait);
+                        spin_unlock_irqrestore(&pool->lock, flags);
+
+                        schedule_timeout(POOL_TIMEOUT_JIFFIES);
+
+                        spin_lock_irqsave(&pool->lock, flags);
+                        __remove_wait_queue(&pool->waitq, &wait);
+                        goto restart;
+                }
+                retval = NULL;
+                goto done;
+        }
+
+ready:
+        page->in_use++;
+        offset = page->offset;
+        page->offset = *(int *)(page->vaddr + offset);
+        retval = offset + page->vaddr;
+        *handle = offset + page->dma;
+#ifdef CONFIG_DEBUG_SLAB
+        memset(retval, POOL_POISON_ALLOCATED, pool->size);
+#endif
+done:
+        spin_unlock_irqrestore(&pool->lock, flags);
+        return retval;
+}
+EXPORT_SYMBOL(dma_pool_alloc);
+
+static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
+{
+        unsigned long flags;
+        struct dma_page *page;
+
+        spin_lock_irqsave(&pool->lock, flags);
+        list_for_each_entry(page, &pool->page_list, page_list) {
+                if (dma < page->dma)
+                        continue;
+                if (dma < (page->dma + pool->allocation))
+                        goto done;
+        }
+        page = NULL;
+done:
+        spin_unlock_irqrestore(&pool->lock, flags);
+        return page;
+}
+
+/**
+ * dma_pool_free - put block back into dma pool
+ * @pool: the dma pool holding the block
+ * @vaddr: virtual address of block
+ * @dma: dma address of block
+ *
+ * Caller promises neither device nor driver will again touch this block
+ * unless it is first re-allocated.
+ */
+void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
+{
+        struct dma_page *page;
+        unsigned long flags;
+        unsigned int offset;
+
+        page = pool_find_page(pool, dma);
+        if (!page) {
+                if (pool->dev)
+                        dev_err(pool->dev,
+                                "dma_pool_free %s, %p/%lx (bad dma)\n",
+                                pool->name, vaddr, (unsigned long)dma);
+                else
+                        printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
+                               pool->name, vaddr, (unsigned long)dma);
+                return;
+        }
+
+        offset = vaddr - page->vaddr;
+#ifdef CONFIG_DEBUG_SLAB
+        if ((dma - page->dma) != offset) {
+                if (pool->dev)
+                        dev_err(pool->dev,
+                                "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+                                pool->name, vaddr, (unsigned long long)dma);
+                else
+                        printk(KERN_ERR
+                               "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+                               pool->name, vaddr, (unsigned long long)dma);
+                return;
+        }
+        {
+                unsigned int chain = page->offset;
+                while (chain < pool->allocation) {
+                        if (chain != offset) {
+                                chain = *(int *)(page->vaddr + chain);
+                                continue;
+                        }
+                        if (pool->dev)
+                                dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
+                                        "already free\n", pool->name,
+                                        (unsigned long long)dma);
+                        else
+                                printk(KERN_ERR "dma_pool_free %s, dma %Lx "
+                                        "already free\n", pool->name,
+                                        (unsigned long long)dma);
+                        return;
+                }
+        }
+        memset(vaddr, POOL_POISON_FREED, pool->size);
+#endif
+
+        spin_lock_irqsave(&pool->lock, flags);
+        page->in_use--;
+        *(int *)vaddr = page->offset;
+        page->offset = offset;
+        if (waitqueue_active(&pool->waitq))
+                wake_up_locked(&pool->waitq);
+        /*
+         * Resist a temptation to do
+         *    if (!is_page_busy(page)) pool_free_page(pool, page);
+         * Better have a few empty pages hang around.
+         */
+        spin_unlock_irqrestore(&pool->lock, flags);
+}
+EXPORT_SYMBOL(dma_pool_free);
+
+/*
+ * Managed DMA pool
+ */
+static void dmam_pool_release(struct device *dev, void *res)
+{
+        struct dma_pool *pool = *(struct dma_pool **)res;
+
+        dma_pool_destroy(pool);
+}
+
+static int dmam_pool_match(struct device *dev, void *res, void *match_data)
+{
+        return *(struct dma_pool **)res == match_data;
+}
+
+/**
+ * dmam_pool_create - Managed dma_pool_create()
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @allocation: returned blocks won't cross this boundary (or zero)
+ *
+ * Managed dma_pool_create(). DMA pool created with this function is
+ * automatically destroyed on driver detach.
+ */
+struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
+                                  size_t size, size_t align, size_t allocation)
+{
+        struct dma_pool **ptr, *pool;
+
+        ptr = devres_alloc(dmam_pool_release, sizeof(*ptr), GFP_KERNEL);
+        if (!ptr)
+                return NULL;
+
+        pool = *ptr = dma_pool_create(name, dev, size, align, allocation);
+        if (pool)
+                devres_add(dev, ptr);
+        else
+                devres_free(ptr);
+
+        return pool;
+}
+EXPORT_SYMBOL(dmam_pool_create);
+
+/**
+ * dmam_pool_destroy - Managed dma_pool_destroy()
+ * @pool: dma pool that will be destroyed
+ *
+ * Managed dma_pool_destroy().
+ */
+void dmam_pool_destroy(struct dma_pool *pool)
+{
+        struct device *dev = pool->dev;
+
+        dma_pool_destroy(pool);
+        WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
+}
+EXPORT_SYMBOL(dmam_pool_destroy);
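[Editor's note, not part of the patch] The kernel-doc comments above describe the dma_pool API added by this file; a minimal driver-side usage sketch follows. The struct my_desc layout and the alignment/boundary values are invented for illustration, only the dma_pool_* calls come from this patch.

/* Hedged usage sketch; "struct my_desc" and the sizes are illustrative only. */
struct my_desc {
        __le32 status;
        __le32 buf;
};

static int example_use_dma_pool(struct device *dev)
{
        struct dma_pool *pool;
        struct my_desc *desc;
        dma_addr_t dma;

        /* 8-byte blocks, 8-byte aligned, never crossing a 4KB boundary */
        pool = dma_pool_create("example", dev, sizeof(struct my_desc), 8, 4096);
        if (!pool)
                return -ENOMEM;

        desc = dma_pool_alloc(pool, GFP_KERNEL, &dma);
        if (!desc) {
                dma_pool_destroy(pool);
                return -ENOMEM;
        }

        /* ... hand "dma" to the hardware, touch "desc" from the CPU ... */

        dma_pool_free(pool, desc, dma);
        dma_pool_destroy(pool);
        return 0;
}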
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 0df4c899e979..3c0f1e99f5e4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -49,9 +49,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
                 goto out;
         }
 
-        if (mapping->a_ops->get_xip_page)
-                /* no bad return value, but ignore advice */
+        if (mapping->a_ops->get_xip_page) {
+                switch (advice) {
+                case POSIX_FADV_NORMAL:
+                case POSIX_FADV_RANDOM:
+                case POSIX_FADV_SEQUENTIAL:
+                case POSIX_FADV_WILLNEED:
+                case POSIX_FADV_NOREUSE:
+                case POSIX_FADV_DONTNEED:
+                        /* no bad return value, but ignore advice */
+                        break;
+                default:
+                        ret = -EINVAL;
+                }
                 goto out;
+        }
 
         /* Careful about overflows. Len == 0 means "as much as possible" */
         endbyte = offset + len;
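[Editor's note, not part of the patch] The user-visible effect of the hunk above: on a file backed by an XIP mapping, recognised advice values are still accepted (and ignored), while an unrecognised value now fails with EINVAL. A hedged userspace illustration, assuming "fd" is open on a filesystem that provides ->get_xip_page:

/* Hedged userspace sketch; filesystem choice and fd setup are assumed. */
#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>

static void show_fadvise_behaviour(int fd)
{
        /* valid advice: still returns 0, the advice is simply ignored */
        int ok = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);

        /* bogus advice: previously also "succeeded", now returns EINVAL */
        int bad = posix_fadvise(fd, 0, 0, 12345);

        printf("valid=%d bogus=%d\n", ok, bad);
}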
diff --git a/mm/filemap.c b/mm/filemap.c
index 76bea88cbebc..5357fcc4643b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -65,7 +66,6 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *  ->private_lock        (__free_pte->__set_page_dirty_buffers)
  *    ->swap_lock         (exclusive_swap_page, others)
  *      ->mapping->tree_lock
- *        ->zone.lock
  *
  *  ->i_mutex
  *    ->i_mmap_lock       (truncate->unmap_mapping_range)
@@ -119,6 +119,7 @@ void __remove_from_page_cache(struct page *page)
 {
         struct address_space *mapping = page->mapping;
 
+        mem_cgroup_uncharge_page(page);
         radix_tree_delete(&mapping->page_tree, page->index);
         page->mapping = NULL;
         mapping->nrpages--;
@@ -459,8 +460,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 int add_to_page_cache(struct page *page, struct address_space *mapping,
                 pgoff_t offset, gfp_t gfp_mask)
 {
-        int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+        int error = mem_cgroup_cache_charge(page, current->mm,
+                                        gfp_mask & ~__GFP_HIGHMEM);
+        if (error)
+                goto out;
 
+        error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
         if (error == 0) {
                 write_lock_irq(&mapping->tree_lock);
                 error = radix_tree_insert(&mapping->page_tree, offset, page);
@@ -471,10 +476,14 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
                         page->index = offset;
                         mapping->nrpages++;
                         __inc_zone_page_state(page, NR_FILE_PAGES);
-                }
+                } else
+                        mem_cgroup_uncharge_page(page);
+
                 write_unlock_irq(&mapping->tree_lock);
                 radix_tree_preload_end();
-        }
+        } else
+                mem_cgroup_uncharge_page(page);
+out:
         return error;
 }
 EXPORT_SYMBOL(add_to_page_cache);
@@ -528,7 +537,7 @@ static inline void wake_up_page(struct page *page, int bit)
         __wake_up_bit(page_waitqueue(page), &page->flags, bit);
 }
 
-void fastcall wait_on_page_bit(struct page *page, int bit_nr)
+void wait_on_page_bit(struct page *page, int bit_nr)
 {
         DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 
@@ -552,7 +561,7 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * the clear_bit and the read of the waitqueue (to avoid SMP races with a
  * parallel wait_on_page_locked()).
  */
-void fastcall unlock_page(struct page *page)
+void unlock_page(struct page *page)
 {
         smp_mb__before_clear_bit();
         if (!TestClearPageLocked(page))
@@ -586,7 +595,7 @@ EXPORT_SYMBOL(end_page_writeback);
  * chances are that on the second loop, the block layer's plug list is empty,
 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
  */
-void fastcall __lock_page(struct page *page)
+void __lock_page(struct page *page)
 {
         DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
@@ -607,7 +616,7 @@ int fastcall __lock_page_killable(struct page *page)
  * Variant of lock_page that does not require the caller to hold a reference
 * on the page's mapping.
  */
-void fastcall __lock_page_nosync(struct page *page)
+void __lock_page_nosync(struct page *page)
 {
         DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
         __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
@@ -1277,7 +1286,7 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
  * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
  */
-static int fastcall page_cache_read(struct file * file, pgoff_t offset)
+static int page_cache_read(struct file *file, pgoff_t offset)
 {
         struct address_space *mapping = file->f_mapping;
         struct page *page;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index f874ae818ad3..0420a0292b03 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -431,7 +431,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
                 else
                         return PTR_ERR(page);
         }
-        zero_user_page(page, offset, length, KM_USER0);
+        zero_user(page, offset, length);
         return 0;
 }
 EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/fremap.c b/mm/fremap.c
index 14bd3bf7826e..69a37c2bdf81 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -190,10 +190,13 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
          */
         if (mapping_cap_account_dirty(mapping)) {
                 unsigned long addr;
+                struct file *file = vma->vm_file;
 
                 flags &= MAP_NONBLOCK;
-                addr = mmap_region(vma->vm_file, start, size,
+                get_file(file);
+                addr = mmap_region(file, start, size,
                                 flags, vma->vm_flags, pgoff, 1);
+                fput(file);
                 if (IS_ERR_VALUE(addr)) {
                         err = addr;
                 } else {
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a967bc35152..35d47733cde4 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -163,7 +163,7 @@ start:
         return vaddr;
 }
 
-void fastcall *kmap_high(struct page *page)
+void *kmap_high(struct page *page)
 {
         unsigned long vaddr;
 
@@ -185,7 +185,7 @@ void fastcall *kmap_high(struct page *page)
 
 EXPORT_SYMBOL(kmap_high);
 
-void fastcall kunmap_high(struct page *page)
+void kunmap_high(struct page *page)
 {
         unsigned long vaddr;
         unsigned long nr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index db861d8b6c28..1a5642074e34 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -813,6 +813,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 
         spin_unlock(&mm->page_table_lock);
         copy_huge_page(new_page, old_page, address, vma);
+        __SetPageUptodate(new_page);
         spin_lock(&mm->page_table_lock);
 
         ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -858,6 +859,7 @@ retry:
                 goto out;
         }
         clear_huge_page(page, address);
+        __SetPageUptodate(page);
 
         if (vma->vm_flags & VM_SHARED) {
                 int err;
diff --git a/mm/internal.h b/mm/internal.h
index 953f941ea867..5a9a6200e034 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,7 +24,7 @@ static inline void set_page_count(struct page *page, int v)
  */
 static inline void set_page_refcounted(struct page *page)
 {
-        VM_BUG_ON(PageCompound(page) && PageTail(page));
+        VM_BUG_ON(PageTail(page));
         VM_BUG_ON(atomic_read(&page->_count));
         set_page_count(page, 1);
 }
@@ -34,7 +34,7 @@ static inline void __put_page(struct page *page)
         atomic_dec(&page->_count);
 }
 
-extern void fastcall __init __free_pages_bootmem(struct page *page,
+extern void __init __free_pages_bootmem(struct page *page,
                                                 unsigned int order);
 
 /*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c new file mode 100644 index 000000000000..5c2c702af617 --- /dev/null +++ b/mm/memcontrol.c | |||
@@ -0,0 +1,1192 @@ | |||
1 | /* memcontrol.c - Memory Controller | ||
2 | * | ||
3 | * Copyright IBM Corporation, 2007 | ||
4 | * Author Balbir Singh <balbir@linux.vnet.ibm.com> | ||
5 | * | ||
6 | * Copyright 2007 OpenVZ SWsoft Inc | ||
7 | * Author: Pavel Emelianov <xemul@openvz.org> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | * GNU General Public License for more details. | ||
18 | */ | ||
19 | |||
20 | #include <linux/res_counter.h> | ||
21 | #include <linux/memcontrol.h> | ||
22 | #include <linux/cgroup.h> | ||
23 | #include <linux/mm.h> | ||
24 | #include <linux/smp.h> | ||
25 | #include <linux/page-flags.h> | ||
26 | #include <linux/backing-dev.h> | ||
27 | #include <linux/bit_spinlock.h> | ||
28 | #include <linux/rcupdate.h> | ||
29 | #include <linux/swap.h> | ||
30 | #include <linux/spinlock.h> | ||
31 | #include <linux/fs.h> | ||
32 | #include <linux/seq_file.h> | ||
33 | |||
34 | #include <asm/uaccess.h> | ||
35 | |||
36 | struct cgroup_subsys mem_cgroup_subsys; | ||
37 | static const int MEM_CGROUP_RECLAIM_RETRIES = 5; | ||
38 | |||
39 | /* | ||
40 | * Statistics for memory cgroup. | ||
41 | */ | ||
42 | enum mem_cgroup_stat_index { | ||
43 | /* | ||
44 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | ||
45 | */ | ||
46 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | ||
47 | MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ | ||
48 | |||
49 | MEM_CGROUP_STAT_NSTATS, | ||
50 | }; | ||
51 | |||
52 | struct mem_cgroup_stat_cpu { | ||
53 | s64 count[MEM_CGROUP_STAT_NSTATS]; | ||
54 | } ____cacheline_aligned_in_smp; | ||
55 | |||
56 | struct mem_cgroup_stat { | ||
57 | struct mem_cgroup_stat_cpu cpustat[NR_CPUS]; | ||
58 | }; | ||
59 | |||
60 | /* | ||
61 | * For accounting under irq disable, no need for increment preempt count. | ||
62 | */ | ||
63 | static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, | ||
64 | enum mem_cgroup_stat_index idx, int val) | ||
65 | { | ||
66 | int cpu = smp_processor_id(); | ||
67 | stat->cpustat[cpu].count[idx] += val; | ||
68 | } | ||
69 | |||
70 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | ||
71 | enum mem_cgroup_stat_index idx) | ||
72 | { | ||
73 | int cpu; | ||
74 | s64 ret = 0; | ||
75 | for_each_possible_cpu(cpu) | ||
76 | ret += stat->cpustat[cpu].count[idx]; | ||
77 | return ret; | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * per-zone information in memory controller. | ||
82 | */ | ||
83 | |||
84 | enum mem_cgroup_zstat_index { | ||
85 | MEM_CGROUP_ZSTAT_ACTIVE, | ||
86 | MEM_CGROUP_ZSTAT_INACTIVE, | ||
87 | |||
88 | NR_MEM_CGROUP_ZSTAT, | ||
89 | }; | ||
90 | |||
91 | struct mem_cgroup_per_zone { | ||
92 | /* | ||
93 | * spin_lock to protect the per cgroup LRU | ||
94 | */ | ||
95 | spinlock_t lru_lock; | ||
96 | struct list_head active_list; | ||
97 | struct list_head inactive_list; | ||
98 | unsigned long count[NR_MEM_CGROUP_ZSTAT]; | ||
99 | }; | ||
100 | /* Macro for accessing counter */ | ||
101 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | ||
102 | |||
103 | struct mem_cgroup_per_node { | ||
104 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | ||
105 | }; | ||
106 | |||
107 | struct mem_cgroup_lru_info { | ||
108 | struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; | ||
109 | }; | ||
110 | |||
111 | /* | ||
112 | * The memory controller data structure. The memory controller controls both | ||
113 | * page cache and RSS per cgroup. We would eventually like to provide | ||
114 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | ||
115 | * to help the administrator determine what knobs to tune. | ||
116 | * | ||
117 | * TODO: Add a water mark for the memory controller. Reclaim will begin when | ||
118 | * we hit the water mark. May be even add a low water mark, such that | ||
119 | * no reclaim occurs from a cgroup at it's low water mark, this is | ||
120 | * a feature that will be implemented much later in the future. | ||
121 | */ | ||
122 | struct mem_cgroup { | ||
123 | struct cgroup_subsys_state css; | ||
124 | /* | ||
125 | * the counter to account for memory usage | ||
126 | */ | ||
127 | struct res_counter res; | ||
128 | /* | ||
129 | * Per cgroup active and inactive list, similar to the | ||
130 | * per zone LRU lists. | ||
131 | */ | ||
132 | struct mem_cgroup_lru_info info; | ||
133 | |||
134 | int prev_priority; /* for recording reclaim priority */ | ||
135 | /* | ||
136 | * statistics. | ||
137 | */ | ||
138 | struct mem_cgroup_stat stat; | ||
139 | }; | ||
140 | |||
141 | /* | ||
142 | * We use the lower bit of the page->page_cgroup pointer as a bit spin | ||
143 | * lock. We need to ensure that page->page_cgroup is atleast two | ||
144 | * byte aligned (based on comments from Nick Piggin) | ||
145 | */ | ||
146 | #define PAGE_CGROUP_LOCK_BIT 0x0 | ||
147 | #define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) | ||
148 | |||
149 | /* | ||
150 | * A page_cgroup page is associated with every page descriptor. The | ||
151 | * page_cgroup helps us identify information about the cgroup | ||
152 | */ | ||
153 | struct page_cgroup { | ||
154 | struct list_head lru; /* per cgroup LRU list */ | ||
155 | struct page *page; | ||
156 | struct mem_cgroup *mem_cgroup; | ||
157 | atomic_t ref_cnt; /* Helpful when pages move b/w */ | ||
158 | /* mapped and cached states */ | ||
159 | int flags; | ||
160 | }; | ||
161 | #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ | ||
162 | #define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ | ||
163 | |||
164 | static inline int page_cgroup_nid(struct page_cgroup *pc) | ||
165 | { | ||
166 | return page_to_nid(pc->page); | ||
167 | } | ||
168 | |||
169 | static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) | ||
170 | { | ||
171 | return page_zonenum(pc->page); | ||
172 | } | ||
173 | |||
174 | enum { | ||
175 | MEM_CGROUP_TYPE_UNSPEC = 0, | ||
176 | MEM_CGROUP_TYPE_MAPPED, | ||
177 | MEM_CGROUP_TYPE_CACHED, | ||
178 | MEM_CGROUP_TYPE_ALL, | ||
179 | MEM_CGROUP_TYPE_MAX, | ||
180 | }; | ||
181 | |||
182 | enum charge_type { | ||
183 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | ||
184 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | ||
185 | }; | ||
186 | |||
187 | |||
188 | /* | ||
189 | * Always modified under lru lock. Then, not necessary to preempt_disable() | ||
190 | */ | ||
191 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, | ||
192 | bool charge) | ||
193 | { | ||
194 | int val = (charge)? 1 : -1; | ||
195 | struct mem_cgroup_stat *stat = &mem->stat; | ||
196 | VM_BUG_ON(!irqs_disabled()); | ||
197 | |||
198 | if (flags & PAGE_CGROUP_FLAG_CACHE) | ||
199 | __mem_cgroup_stat_add_safe(stat, | ||
200 | MEM_CGROUP_STAT_CACHE, val); | ||
201 | else | ||
202 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); | ||
203 | } | ||
204 | |||
205 | static inline struct mem_cgroup_per_zone * | ||
206 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | ||
207 | { | ||
208 | BUG_ON(!mem->info.nodeinfo[nid]); | ||
209 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | ||
210 | } | ||
211 | |||
212 | static inline struct mem_cgroup_per_zone * | ||
213 | page_cgroup_zoneinfo(struct page_cgroup *pc) | ||
214 | { | ||
215 | struct mem_cgroup *mem = pc->mem_cgroup; | ||
216 | int nid = page_cgroup_nid(pc); | ||
217 | int zid = page_cgroup_zid(pc); | ||
218 | |||
219 | return mem_cgroup_zoneinfo(mem, nid, zid); | ||
220 | } | ||
221 | |||
222 | static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, | ||
223 | enum mem_cgroup_zstat_index idx) | ||
224 | { | ||
225 | int nid, zid; | ||
226 | struct mem_cgroup_per_zone *mz; | ||
227 | u64 total = 0; | ||
228 | |||
229 | for_each_online_node(nid) | ||
230 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | ||
231 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
232 | total += MEM_CGROUP_ZSTAT(mz, idx); | ||
233 | } | ||
234 | return total; | ||
235 | } | ||
236 | |||
237 | static struct mem_cgroup init_mem_cgroup; | ||
238 | |||
239 | static inline | ||
240 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | ||
241 | { | ||
242 | return container_of(cgroup_subsys_state(cont, | ||
243 | mem_cgroup_subsys_id), struct mem_cgroup, | ||
244 | css); | ||
245 | } | ||
246 | |||
247 | static inline | ||
248 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | ||
249 | { | ||
250 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | ||
251 | struct mem_cgroup, css); | ||
252 | } | ||
253 | |||
254 | void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p) | ||
255 | { | ||
256 | struct mem_cgroup *mem; | ||
257 | |||
258 | mem = mem_cgroup_from_task(p); | ||
259 | css_get(&mem->css); | ||
260 | mm->mem_cgroup = mem; | ||
261 | } | ||
262 | |||
263 | void mm_free_cgroup(struct mm_struct *mm) | ||
264 | { | ||
265 | css_put(&mm->mem_cgroup->css); | ||
266 | } | ||
267 | |||
268 | static inline int page_cgroup_locked(struct page *page) | ||
269 | { | ||
270 | return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, | ||
271 | &page->page_cgroup); | ||
272 | } | ||
273 | |||
274 | void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) | ||
275 | { | ||
276 | int locked; | ||
277 | |||
278 | /* | ||
279 | * While resetting the page_cgroup we might not hold the | ||
280 | * page_cgroup lock. free_hot_cold_page() is an example | ||
281 | * of such a scenario | ||
282 | */ | ||
283 | if (pc) | ||
284 | VM_BUG_ON(!page_cgroup_locked(page)); | ||
285 | locked = (page->page_cgroup & PAGE_CGROUP_LOCK); | ||
286 | page->page_cgroup = ((unsigned long)pc | locked); | ||
287 | } | ||
288 | |||
289 | struct page_cgroup *page_get_page_cgroup(struct page *page) | ||
290 | { | ||
291 | return (struct page_cgroup *) | ||
292 | (page->page_cgroup & ~PAGE_CGROUP_LOCK); | ||
293 | } | ||
294 | |||
295 | static void __always_inline lock_page_cgroup(struct page *page) | ||
296 | { | ||
297 | bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
298 | VM_BUG_ON(!page_cgroup_locked(page)); | ||
299 | } | ||
300 | |||
301 | static void __always_inline unlock_page_cgroup(struct page *page) | ||
302 | { | ||
303 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
304 | } | ||
305 | |||
306 | /* | ||
307 | * Tie new page_cgroup to struct page under lock_page_cgroup() | ||
308 | * This can fail if the page has been tied to a page_cgroup. | ||
309 | * If success, returns 0. | ||
310 | */ | ||
311 | static int page_cgroup_assign_new_page_cgroup(struct page *page, | ||
312 | struct page_cgroup *pc) | ||
313 | { | ||
314 | int ret = 0; | ||
315 | |||
316 | lock_page_cgroup(page); | ||
317 | if (!page_get_page_cgroup(page)) | ||
318 | page_assign_page_cgroup(page, pc); | ||
319 | else /* A page is tied to other pc. */ | ||
320 | ret = 1; | ||
321 | unlock_page_cgroup(page); | ||
322 | return ret; | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * Clear page->page_cgroup member under lock_page_cgroup(). | ||
327 | * If given "pc" value is different from one page->page_cgroup, | ||
328 | * page->cgroup is not cleared. | ||
329 | * Returns a value of page->page_cgroup at lock taken. | ||
330 | * A can can detect failure of clearing by following | ||
331 | * clear_page_cgroup(page, pc) == pc | ||
332 | */ | ||
333 | |||
334 | static struct page_cgroup *clear_page_cgroup(struct page *page, | ||
335 | struct page_cgroup *pc) | ||
336 | { | ||
337 | struct page_cgroup *ret; | ||
338 | /* lock and clear */ | ||
339 | lock_page_cgroup(page); | ||
340 | ret = page_get_page_cgroup(page); | ||
341 | if (likely(ret == pc)) | ||
342 | page_assign_page_cgroup(page, NULL); | ||
343 | unlock_page_cgroup(page); | ||
344 | return ret; | ||
345 | } | ||
346 | |||
347 | static void __mem_cgroup_remove_list(struct page_cgroup *pc) | ||
348 | { | ||
349 | int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | ||
350 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | ||
351 | |||
352 | if (from) | ||
353 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; | ||
354 | else | ||
355 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; | ||
356 | |||
357 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); | ||
358 | list_del_init(&pc->lru); | ||
359 | } | ||
360 | |||
361 | static void __mem_cgroup_add_list(struct page_cgroup *pc) | ||
362 | { | ||
363 | int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | ||
364 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | ||
365 | |||
366 | if (!to) { | ||
367 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; | ||
368 | list_add(&pc->lru, &mz->inactive_list); | ||
369 | } else { | ||
370 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; | ||
371 | list_add(&pc->lru, &mz->active_list); | ||
372 | } | ||
373 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); | ||
374 | } | ||
375 | |||
376 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) | ||
377 | { | ||
378 | int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | ||
379 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | ||
380 | |||
381 | if (from) | ||
382 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; | ||
383 | else | ||
384 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; | ||
385 | |||
386 | if (active) { | ||
387 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; | ||
388 | pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; | ||
389 | list_move(&pc->lru, &mz->active_list); | ||
390 | } else { | ||
391 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; | ||
392 | pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; | ||
393 | list_move(&pc->lru, &mz->inactive_list); | ||
394 | } | ||
395 | } | ||
396 | |||
397 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | ||
398 | { | ||
399 | int ret; | ||
400 | |||
401 | task_lock(task); | ||
402 | ret = task->mm && mm_cgroup(task->mm) == mem; | ||
403 | task_unlock(task); | ||
404 | return ret; | ||
405 | } | ||
406 | |||
407 | /* | ||
408 | * This routine assumes that the appropriate zone's lru lock is already held | ||
409 | */ | ||
410 | void mem_cgroup_move_lists(struct page_cgroup *pc, bool active) | ||
411 | { | ||
412 | struct mem_cgroup_per_zone *mz; | ||
413 | unsigned long flags; | ||
414 | |||
415 | if (!pc) | ||
416 | return; | ||
417 | |||
418 | mz = page_cgroup_zoneinfo(pc); | ||
419 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
420 | __mem_cgroup_move_lists(pc, active); | ||
421 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
422 | } | ||
423 | |||
424 | /* | ||
425 | * Calculate mapped_ratio under memory controller. This will be used in | ||
426 | * vmscan.c for deteremining we have to reclaim mapped pages. | ||
427 | */ | ||
428 | int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) | ||
429 | { | ||
430 | long total, rss; | ||
431 | |||
432 | /* | ||
433 | * usage is recorded in bytes. But, here, we assume the number of | ||
434 | * physical pages can be represented by "long" on any arch. | ||
435 | */ | ||
436 | total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; | ||
437 | rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | ||
438 | return (int)((rss * 100L) / total); | ||
439 | } | ||
440 | /* | ||
441 | * This function is called from vmscan.c. In page reclaiming loop. balance | ||
442 | * between active and inactive list is calculated. For memory controller | ||
443 | * page reclaiming, we should use using mem_cgroup's imbalance rather than | ||
444 | * zone's global lru imbalance. | ||
445 | */ | ||
446 | long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) | ||
447 | { | ||
448 | unsigned long active, inactive; | ||
449 | /* active and inactive are the number of pages. 'long' is ok.*/ | ||
450 | active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE); | ||
451 | inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE); | ||
452 | return (long) (active / (inactive + 1)); | ||
453 | } | ||
454 | |||
455 | /* | ||
456 | * prev_priority control...this will be used in memory reclaim path. | ||
457 | */ | ||
458 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) | ||
459 | { | ||
460 | return mem->prev_priority; | ||
461 | } | ||
462 | |||
463 | void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) | ||
464 | { | ||
465 | if (priority < mem->prev_priority) | ||
466 | mem->prev_priority = priority; | ||
467 | } | ||
468 | |||
469 | void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) | ||
470 | { | ||
471 | mem->prev_priority = priority; | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * Calculate # of pages to be scanned in this priority/zone. | ||
476 | * See also vmscan.c | ||
477 | * | ||
478 | * priority starts from "DEF_PRIORITY" and decremented in each loop. | ||
479 | * (see include/linux/mmzone.h) | ||
480 | */ | ||
481 | |||
482 | long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, | ||
483 | struct zone *zone, int priority) | ||
484 | { | ||
485 | long nr_active; | ||
486 | int nid = zone->zone_pgdat->node_id; | ||
487 | int zid = zone_idx(zone); | ||
488 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
489 | |||
490 | nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); | ||
491 | return (nr_active >> priority); | ||
492 | } | ||
493 | |||
494 | long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, | ||
495 | struct zone *zone, int priority) | ||
496 | { | ||
497 | long nr_inactive; | ||
498 | int nid = zone->zone_pgdat->node_id; | ||
499 | int zid = zone_idx(zone); | ||
500 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
501 | |||
502 | nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); | ||
503 | |||
504 | return (nr_inactive >> priority); | ||
505 | } | ||
506 | |||
507 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | ||
508 | struct list_head *dst, | ||
509 | unsigned long *scanned, int order, | ||
510 | int mode, struct zone *z, | ||
511 | struct mem_cgroup *mem_cont, | ||
512 | int active) | ||
513 | { | ||
514 | unsigned long nr_taken = 0; | ||
515 | struct page *page; | ||
516 | unsigned long scan; | ||
517 | LIST_HEAD(pc_list); | ||
518 | struct list_head *src; | ||
519 | struct page_cgroup *pc, *tmp; | ||
520 | int nid = z->zone_pgdat->node_id; | ||
521 | int zid = zone_idx(z); | ||
522 | struct mem_cgroup_per_zone *mz; | ||
523 | |||
524 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | ||
525 | if (active) | ||
526 | src = &mz->active_list; | ||
527 | else | ||
528 | src = &mz->inactive_list; | ||
529 | |||
530 | |||
531 | spin_lock(&mz->lru_lock); | ||
532 | scan = 0; | ||
533 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | ||
534 | if (scan >= nr_to_scan) | ||
535 | break; | ||
536 | page = pc->page; | ||
537 | VM_BUG_ON(!pc); | ||
538 | |||
539 | if (unlikely(!PageLRU(page))) | ||
540 | continue; | ||
541 | |||
542 | if (PageActive(page) && !active) { | ||
543 | __mem_cgroup_move_lists(pc, true); | ||
544 | continue; | ||
545 | } | ||
546 | if (!PageActive(page) && active) { | ||
547 | __mem_cgroup_move_lists(pc, false); | ||
548 | continue; | ||
549 | } | ||
550 | |||
551 | scan++; | ||
552 | list_move(&pc->lru, &pc_list); | ||
553 | |||
554 | if (__isolate_lru_page(page, mode) == 0) { | ||
555 | list_move(&page->lru, dst); | ||
556 | nr_taken++; | ||
557 | } | ||
558 | } | ||
559 | |||
560 | list_splice(&pc_list, src); | ||
561 | spin_unlock(&mz->lru_lock); | ||
562 | |||
563 | *scanned = scan; | ||
564 | return nr_taken; | ||
565 | } | ||
566 | |||
567 | /* | ||
568 | * Charge the memory controller for page usage. | ||
569 | * Return | ||
570 | * 0 if the charge was successful | ||
571 | * < 0 if the cgroup is over its limit | ||
572 | */ | ||
573 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | ||
574 | gfp_t gfp_mask, enum charge_type ctype) | ||
575 | { | ||
576 | struct mem_cgroup *mem; | ||
577 | struct page_cgroup *pc; | ||
578 | unsigned long flags; | ||
579 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
580 | struct mem_cgroup_per_zone *mz; | ||
581 | |||
582 | /* | ||
583 | * Should page_cgroups go to their own slab? | ||
584 | * One could optimize the performance of the charging routine | ||
585 | * by saving a bit in the page_flags and using it as a lock | ||
586 | * to see if the cgroup page already has a page_cgroup associated | ||
587 | * with it. | ||
588 | */ | ||
589 | retry: | ||
590 | if (page) { | ||
591 | lock_page_cgroup(page); | ||
592 | pc = page_get_page_cgroup(page); | ||
593 | /* | ||
594 | * The page_cgroup exists and | ||
595 | * the page has already been accounted. | ||
596 | */ | ||
597 | if (pc) { | ||
598 | if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) { | ||
599 | /* is this page being uncharged? */ | ||
600 | unlock_page_cgroup(page); | ||
601 | cpu_relax(); | ||
602 | goto retry; | ||
603 | } else { | ||
604 | unlock_page_cgroup(page); | ||
605 | goto done; | ||
606 | } | ||
607 | } | ||
608 | unlock_page_cgroup(page); | ||
609 | } | ||
610 | |||
611 | pc = kzalloc(sizeof(struct page_cgroup), gfp_mask); | ||
612 | if (pc == NULL) | ||
613 | goto err; | ||
614 | |||
615 | /* | ||
616 | * We always charge the cgroup the mm_struct belongs to. | ||
617 | * The mm_struct's mem_cgroup changes on task migration if the | ||
618 | * thread group leader migrates. It's possible that mm is not | ||
619 | * set, if so charge the init_mm (happens for pagecache usage). | ||
620 | */ | ||
621 | if (!mm) | ||
622 | mm = &init_mm; | ||
623 | |||
624 | rcu_read_lock(); | ||
625 | mem = rcu_dereference(mm->mem_cgroup); | ||
626 | /* | ||
627 | * For every charge from the cgroup, increment reference | ||
628 | * count | ||
629 | */ | ||
630 | css_get(&mem->css); | ||
631 | rcu_read_unlock(); | ||
632 | |||
633 | /* | ||
634 | * If we created the page_cgroup, we should free it on exceeding | ||
635 | * the cgroup limit. | ||
636 | */ | ||
637 | while (res_counter_charge(&mem->res, PAGE_SIZE)) { | ||
638 | if (!(gfp_mask & __GFP_WAIT)) | ||
639 | goto out; | ||
640 | |||
641 | if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) | ||
642 | continue; | ||
643 | |||
644 | /* | ||
645 | * try_to_free_mem_cgroup_pages() might not give us a full | ||
646 | * picture of reclaim. Some pages are reclaimed and might be | ||
647 | * moved to swap cache or just unmapped from the cgroup. | ||
648 | * Check the limit again to see if the reclaim reduced the | ||
649 | * current usage of the cgroup before giving up | ||
650 | */ | ||
651 | if (res_counter_check_under_limit(&mem->res)) | ||
652 | continue; | ||
653 | |||
654 | if (!nr_retries--) { | ||
655 | mem_cgroup_out_of_memory(mem, gfp_mask); | ||
656 | goto out; | ||
657 | } | ||
658 | congestion_wait(WRITE, HZ/10); | ||
659 | } | ||
660 | |||
661 | atomic_set(&pc->ref_cnt, 1); | ||
662 | pc->mem_cgroup = mem; | ||
663 | pc->page = page; | ||
664 | pc->flags = PAGE_CGROUP_FLAG_ACTIVE; | ||
665 | if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) | ||
666 | pc->flags |= PAGE_CGROUP_FLAG_CACHE; | ||
667 | |||
668 | if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) { | ||
669 | /* | ||
670 | * Another charge has been added to this page already. | ||
671 | * We take lock_page_cgroup(page) again and read | ||
672 | * page->cgroup, increment refcnt.... just retry is OK. | ||
673 | */ | ||
674 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
675 | css_put(&mem->css); | ||
676 | kfree(pc); | ||
677 | if (!page) | ||
678 | goto done; | ||
679 | goto retry; | ||
680 | } | ||
681 | |||
682 | mz = page_cgroup_zoneinfo(pc); | ||
683 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
684 | /* Update statistics vector */ | ||
685 | __mem_cgroup_add_list(pc); | ||
686 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
687 | |||
688 | done: | ||
689 | return 0; | ||
690 | out: | ||
691 | css_put(&mem->css); | ||
692 | kfree(pc); | ||
693 | err: | ||
694 | return -ENOMEM; | ||
695 | } | ||
696 | |||
697 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm, | ||
698 | gfp_t gfp_mask) | ||
699 | { | ||
700 | return mem_cgroup_charge_common(page, mm, gfp_mask, | ||
701 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | ||
702 | } | ||
703 | |||
704 | /* | ||
705 | * See if the cached pages should be charged at all? | ||
706 | */ | ||
707 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | ||
708 | gfp_t gfp_mask) | ||
709 | { | ||
710 | int ret = 0; | ||
711 | if (!mm) | ||
712 | mm = &init_mm; | ||
713 | |||
714 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, | ||
715 | MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
716 | return ret; | ||
717 | } | ||
718 | |||
719 | /* | ||
720 | * Uncharging is always a welcome operation; we never complain, we simply | ||
721 | * uncharge. This routine should be called with lock_page_cgroup held. | ||
722 | */ | ||
723 | void mem_cgroup_uncharge(struct page_cgroup *pc) | ||
724 | { | ||
725 | struct mem_cgroup *mem; | ||
726 | struct mem_cgroup_per_zone *mz; | ||
727 | struct page *page; | ||
728 | unsigned long flags; | ||
729 | |||
730 | /* | ||
731 | * Check if our page_cgroup is valid | ||
732 | */ | ||
733 | if (!pc) | ||
734 | return; | ||
735 | |||
736 | if (atomic_dec_and_test(&pc->ref_cnt)) { | ||
737 | page = pc->page; | ||
738 | mz = page_cgroup_zoneinfo(pc); | ||
739 | /* | ||
740 | * get page->cgroup and clear it under lock. | ||
741 | * force_empty can drop page->cgroup without checking refcnt. | ||
742 | */ | ||
743 | unlock_page_cgroup(page); | ||
744 | if (clear_page_cgroup(page, pc) == pc) { | ||
745 | mem = pc->mem_cgroup; | ||
746 | css_put(&mem->css); | ||
747 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
748 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
749 | __mem_cgroup_remove_list(pc); | ||
750 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
751 | kfree(pc); | ||
752 | } | ||
753 | lock_page_cgroup(page); | ||
754 | } | ||
755 | } | ||
756 | |||
757 | void mem_cgroup_uncharge_page(struct page *page) | ||
758 | { | ||
759 | lock_page_cgroup(page); | ||
760 | mem_cgroup_uncharge(page_get_page_cgroup(page)); | ||
761 | unlock_page_cgroup(page); | ||
762 | } | ||
763 | |||
764 | /* | ||
765 | * Returns non-zero if a page (under migration) has a valid page_cgroup member. | ||
766 | * Refcnt of page_cgroup is incremented. | ||
767 | */ | ||
768 | |||
769 | int mem_cgroup_prepare_migration(struct page *page) | ||
770 | { | ||
771 | struct page_cgroup *pc; | ||
772 | int ret = 0; | ||
773 | lock_page_cgroup(page); | ||
774 | pc = page_get_page_cgroup(page); | ||
775 | if (pc && atomic_inc_not_zero(&pc->ref_cnt)) | ||
776 | ret = 1; | ||
777 | unlock_page_cgroup(page); | ||
778 | return ret; | ||
779 | } | ||
780 | |||
781 | void mem_cgroup_end_migration(struct page *page) | ||
782 | { | ||
783 | struct page_cgroup *pc; | ||
784 | |||
785 | lock_page_cgroup(page); | ||
786 | pc = page_get_page_cgroup(page); | ||
787 | mem_cgroup_uncharge(pc); | ||
788 | unlock_page_cgroup(page); | ||
789 | } | ||
790 | /* | ||
791 | * We know both *page* and *newpage* are now not-on-LRU and PG_locked. | ||
792 | * And there is no race with the uncharge() routines because the page_cgroup | ||
793 | * for *page* holds one extra reference taken by mem_cgroup_prepare_migration. | ||
794 | */ | ||
795 | |||
796 | void mem_cgroup_page_migration(struct page *page, struct page *newpage) | ||
797 | { | ||
798 | struct page_cgroup *pc; | ||
799 | struct mem_cgroup *mem; | ||
800 | unsigned long flags; | ||
801 | struct mem_cgroup_per_zone *mz; | ||
802 | retry: | ||
803 | pc = page_get_page_cgroup(page); | ||
804 | if (!pc) | ||
805 | return; | ||
806 | mem = pc->mem_cgroup; | ||
807 | mz = page_cgroup_zoneinfo(pc); | ||
808 | if (clear_page_cgroup(page, pc) != pc) | ||
809 | goto retry; | ||
810 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
811 | |||
812 | __mem_cgroup_remove_list(pc); | ||
813 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
814 | |||
815 | pc->page = newpage; | ||
816 | lock_page_cgroup(newpage); | ||
817 | page_assign_page_cgroup(newpage, pc); | ||
818 | unlock_page_cgroup(newpage); | ||
819 | |||
820 | mz = page_cgroup_zoneinfo(pc); | ||
821 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
822 | __mem_cgroup_add_list(pc); | ||
823 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
824 | return; | ||
825 | } | ||
826 | |||
827 | /* | ||
828 | * This routine traverses the page_cgroups in the given list and drops them all. | ||
829 | * This routine ignores page_cgroup->ref_cnt. | ||
830 | * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. | ||
831 | */ | ||
832 | #define FORCE_UNCHARGE_BATCH (128) | ||
833 | static void | ||
834 | mem_cgroup_force_empty_list(struct mem_cgroup *mem, | ||
835 | struct mem_cgroup_per_zone *mz, | ||
836 | int active) | ||
837 | { | ||
838 | struct page_cgroup *pc; | ||
839 | struct page *page; | ||
840 | int count; | ||
841 | unsigned long flags; | ||
842 | struct list_head *list; | ||
843 | |||
844 | if (active) | ||
845 | list = &mz->active_list; | ||
846 | else | ||
847 | list = &mz->inactive_list; | ||
848 | |||
849 | if (list_empty(list)) | ||
850 | return; | ||
851 | retry: | ||
852 | count = FORCE_UNCHARGE_BATCH; | ||
853 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
854 | |||
855 | while (--count && !list_empty(list)) { | ||
856 | pc = list_entry(list->prev, struct page_cgroup, lru); | ||
857 | page = pc->page; | ||
858 | /* Avoid race with charge */ | ||
859 | atomic_set(&pc->ref_cnt, 0); | ||
860 | if (clear_page_cgroup(page, pc) == pc) { | ||
861 | css_put(&mem->css); | ||
862 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
863 | __mem_cgroup_remove_list(pc); | ||
864 | kfree(pc); | ||
865 | } else /* being uncharged? ...relax and retry */ | ||
866 | break; | ||
867 | } | ||
868 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
869 | if (!list_empty(list)) { | ||
870 | cond_resched(); | ||
871 | goto retry; | ||
872 | } | ||
873 | return; | ||
874 | } | ||
875 | |||
876 | /* | ||
877 | * Make the mem_cgroup's charge 0 if there is no task. | ||
878 | * This enables deleting this mem_cgroup. | ||
879 | */ | ||
880 | |||
881 | int mem_cgroup_force_empty(struct mem_cgroup *mem) | ||
882 | { | ||
883 | int ret = -EBUSY; | ||
884 | int node, zid; | ||
885 | css_get(&mem->css); | ||
886 | /* | ||
887 | * page reclaim code (kswapd etc.) will move pages between | ||
888 | * active_list <-> inactive_list while we don't take a lock. | ||
889 | * So, we have to loop here until all the lists are empty. | ||
890 | */ | ||
891 | while (mem->res.usage > 0) { | ||
892 | if (atomic_read(&mem->css.cgroup->count) > 0) | ||
893 | goto out; | ||
894 | for_each_node_state(node, N_POSSIBLE) | ||
895 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | ||
896 | struct mem_cgroup_per_zone *mz; | ||
897 | mz = mem_cgroup_zoneinfo(mem, node, zid); | ||
898 | /* drop all page_cgroup in active_list */ | ||
899 | mem_cgroup_force_empty_list(mem, mz, 1); | ||
900 | /* drop all page_cgroup in inactive_list */ | ||
901 | mem_cgroup_force_empty_list(mem, mz, 0); | ||
902 | } | ||
903 | } | ||
904 | ret = 0; | ||
905 | out: | ||
906 | css_put(&mem->css); | ||
907 | return ret; | ||
908 | } | ||
909 | |||
910 | |||
911 | |||
912 | int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) | ||
913 | { | ||
914 | *tmp = memparse(buf, &buf); | ||
915 | if (*buf != '\0') | ||
916 | return -EINVAL; | ||
917 | |||
918 | /* | ||
919 | * Round the value up to a multiple of PAGE_SIZE | ||
920 | */ | ||
921 | *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT; | ||
922 | return 0; | ||
923 | } | ||
924 | |||
925 | static ssize_t mem_cgroup_read(struct cgroup *cont, | ||
926 | struct cftype *cft, struct file *file, | ||
927 | char __user *userbuf, size_t nbytes, loff_t *ppos) | ||
928 | { | ||
929 | return res_counter_read(&mem_cgroup_from_cont(cont)->res, | ||
930 | cft->private, userbuf, nbytes, ppos, | ||
931 | NULL); | ||
932 | } | ||
933 | |||
934 | static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | ||
935 | struct file *file, const char __user *userbuf, | ||
936 | size_t nbytes, loff_t *ppos) | ||
937 | { | ||
938 | return res_counter_write(&mem_cgroup_from_cont(cont)->res, | ||
939 | cft->private, userbuf, nbytes, ppos, | ||
940 | mem_cgroup_write_strategy); | ||
941 | } | ||
942 | |||
943 | static ssize_t mem_force_empty_write(struct cgroup *cont, | ||
944 | struct cftype *cft, struct file *file, | ||
945 | const char __user *userbuf, | ||
946 | size_t nbytes, loff_t *ppos) | ||
947 | { | ||
948 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | ||
949 | int ret; | ||
950 | ret = mem_cgroup_force_empty(mem); | ||
951 | if (!ret) | ||
952 | ret = nbytes; | ||
953 | return ret; | ||
954 | } | ||
955 | |||
956 | /* | ||
957 | * Note: This should be removed if cgroup supports write-only files. | ||
958 | */ | ||
959 | |||
960 | static ssize_t mem_force_empty_read(struct cgroup *cont, | ||
961 | struct cftype *cft, | ||
962 | struct file *file, char __user *userbuf, | ||
963 | size_t nbytes, loff_t *ppos) | ||
964 | { | ||
965 | return -EINVAL; | ||
966 | } | ||
967 | |||
968 | |||
969 | static const struct mem_cgroup_stat_desc { | ||
970 | const char *msg; | ||
971 | u64 unit; | ||
972 | } mem_cgroup_stat_desc[] = { | ||
973 | [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, | ||
974 | [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, | ||
975 | }; | ||
976 | |||
977 | static int mem_control_stat_show(struct seq_file *m, void *arg) | ||
978 | { | ||
979 | struct cgroup *cont = m->private; | ||
980 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | ||
981 | struct mem_cgroup_stat *stat = &mem_cont->stat; | ||
982 | int i; | ||
983 | |||
984 | for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { | ||
985 | s64 val; | ||
986 | |||
987 | val = mem_cgroup_read_stat(stat, i); | ||
988 | val *= mem_cgroup_stat_desc[i].unit; | ||
989 | seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg, | ||
990 | (long long)val); | ||
991 | } | ||
992 | /* showing # of active pages */ | ||
993 | { | ||
994 | unsigned long active, inactive; | ||
995 | |||
996 | inactive = mem_cgroup_get_all_zonestat(mem_cont, | ||
997 | MEM_CGROUP_ZSTAT_INACTIVE); | ||
998 | active = mem_cgroup_get_all_zonestat(mem_cont, | ||
999 | MEM_CGROUP_ZSTAT_ACTIVE); | ||
1000 | seq_printf(m, "active %ld\n", (active) * PAGE_SIZE); | ||
1001 | seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE); | ||
1002 | } | ||
1003 | return 0; | ||
1004 | } | ||
1005 | |||
1006 | static const struct file_operations mem_control_stat_file_operations = { | ||
1007 | .read = seq_read, | ||
1008 | .llseek = seq_lseek, | ||
1009 | .release = single_release, | ||
1010 | }; | ||
1011 | |||
1012 | static int mem_control_stat_open(struct inode *unused, struct file *file) | ||
1013 | { | ||
1014 | /* XXX __d_cont */ | ||
1015 | struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; | ||
1016 | |||
1017 | file->f_op = &mem_control_stat_file_operations; | ||
1018 | return single_open(file, mem_control_stat_show, cont); | ||
1019 | } | ||
1020 | |||
1021 | |||
1022 | |||
1023 | static struct cftype mem_cgroup_files[] = { | ||
1024 | { | ||
1025 | .name = "usage_in_bytes", | ||
1026 | .private = RES_USAGE, | ||
1027 | .read = mem_cgroup_read, | ||
1028 | }, | ||
1029 | { | ||
1030 | .name = "limit_in_bytes", | ||
1031 | .private = RES_LIMIT, | ||
1032 | .write = mem_cgroup_write, | ||
1033 | .read = mem_cgroup_read, | ||
1034 | }, | ||
1035 | { | ||
1036 | .name = "failcnt", | ||
1037 | .private = RES_FAILCNT, | ||
1038 | .read = mem_cgroup_read, | ||
1039 | }, | ||
1040 | { | ||
1041 | .name = "force_empty", | ||
1042 | .write = mem_force_empty_write, | ||
1043 | .read = mem_force_empty_read, | ||
1044 | }, | ||
1045 | { | ||
1046 | .name = "stat", | ||
1047 | .open = mem_control_stat_open, | ||
1048 | }, | ||
1049 | }; | ||
1050 | |||
1051 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | ||
1052 | { | ||
1053 | struct mem_cgroup_per_node *pn; | ||
1054 | struct mem_cgroup_per_zone *mz; | ||
1055 | int zone; | ||
1056 | /* | ||
1057 | * This routine is called against possible nodes. | ||
1058 | * But it's a BUG to call kmalloc() against an offline node. | ||
1059 | * | ||
1060 | * TODO: this routine can waste a lot of memory for nodes which will | ||
1061 | * never be onlined. It's better to use a memory hotplug callback | ||
1062 | * function. | ||
1063 | */ | ||
1064 | if (node_state(node, N_HIGH_MEMORY)) | ||
1065 | pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node); | ||
1066 | else | ||
1067 | pn = kmalloc(sizeof(*pn), GFP_KERNEL); | ||
1068 | if (!pn) | ||
1069 | return 1; | ||
1070 | |||
1071 | mem->info.nodeinfo[node] = pn; | ||
1072 | memset(pn, 0, sizeof(*pn)); | ||
1073 | |||
1074 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
1075 | mz = &pn->zoneinfo[zone]; | ||
1076 | INIT_LIST_HEAD(&mz->active_list); | ||
1077 | INIT_LIST_HEAD(&mz->inactive_list); | ||
1078 | spin_lock_init(&mz->lru_lock); | ||
1079 | } | ||
1080 | return 0; | ||
1081 | } | ||
1082 | |||
1083 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | ||
1084 | { | ||
1085 | kfree(mem->info.nodeinfo[node]); | ||
1086 | } | ||
1087 | |||
1088 | |||
1089 | static struct mem_cgroup init_mem_cgroup; | ||
1090 | |||
1091 | static struct cgroup_subsys_state * | ||
1092 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | ||
1093 | { | ||
1094 | struct mem_cgroup *mem; | ||
1095 | int node; | ||
1096 | |||
1097 | if (unlikely((cont->parent) == NULL)) { | ||
1098 | mem = &init_mem_cgroup; | ||
1099 | init_mm.mem_cgroup = mem; | ||
1100 | } else | ||
1101 | mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL); | ||
1102 | |||
1103 | if (mem == NULL) | ||
1104 | return NULL; | ||
1105 | |||
1106 | res_counter_init(&mem->res); | ||
1107 | |||
1108 | memset(&mem->info, 0, sizeof(mem->info)); | ||
1109 | |||
1110 | for_each_node_state(node, N_POSSIBLE) | ||
1111 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | ||
1112 | goto free_out; | ||
1113 | |||
1114 | return &mem->css; | ||
1115 | free_out: | ||
1116 | for_each_node_state(node, N_POSSIBLE) | ||
1117 | free_mem_cgroup_per_zone_info(mem, node); | ||
1118 | if (cont->parent != NULL) | ||
1119 | kfree(mem); | ||
1120 | return NULL; | ||
1121 | } | ||
1122 | |||
1123 | static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | ||
1124 | struct cgroup *cont) | ||
1125 | { | ||
1126 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | ||
1127 | mem_cgroup_force_empty(mem); | ||
1128 | } | ||
1129 | |||
1130 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, | ||
1131 | struct cgroup *cont) | ||
1132 | { | ||
1133 | int node; | ||
1134 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | ||
1135 | |||
1136 | for_each_node_state(node, N_POSSIBLE) | ||
1137 | free_mem_cgroup_per_zone_info(mem, node); | ||
1138 | |||
1139 | kfree(mem_cgroup_from_cont(cont)); | ||
1140 | } | ||
1141 | |||
1142 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | ||
1143 | struct cgroup *cont) | ||
1144 | { | ||
1145 | return cgroup_add_files(cont, ss, mem_cgroup_files, | ||
1146 | ARRAY_SIZE(mem_cgroup_files)); | ||
1147 | } | ||
1148 | |||
1149 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | ||
1150 | struct cgroup *cont, | ||
1151 | struct cgroup *old_cont, | ||
1152 | struct task_struct *p) | ||
1153 | { | ||
1154 | struct mm_struct *mm; | ||
1155 | struct mem_cgroup *mem, *old_mem; | ||
1156 | |||
1157 | mm = get_task_mm(p); | ||
1158 | if (mm == NULL) | ||
1159 | return; | ||
1160 | |||
1161 | mem = mem_cgroup_from_cont(cont); | ||
1162 | old_mem = mem_cgroup_from_cont(old_cont); | ||
1163 | |||
1164 | if (mem == old_mem) | ||
1165 | goto out; | ||
1166 | |||
1167 | /* | ||
1168 | * Only thread group leaders are allowed to migrate; the mm_struct is | ||
1169 | * in effect owned by the leader. | ||
1170 | */ | ||
1171 | if (p->tgid != p->pid) | ||
1172 | goto out; | ||
1173 | |||
1174 | css_get(&mem->css); | ||
1175 | rcu_assign_pointer(mm->mem_cgroup, mem); | ||
1176 | css_put(&old_mem->css); | ||
1177 | |||
1178 | out: | ||
1179 | mmput(mm); | ||
1180 | return; | ||
1181 | } | ||
1182 | |||
1183 | struct cgroup_subsys mem_cgroup_subsys = { | ||
1184 | .name = "memory", | ||
1185 | .subsys_id = mem_cgroup_subsys_id, | ||
1186 | .create = mem_cgroup_create, | ||
1187 | .pre_destroy = mem_cgroup_pre_destroy, | ||
1188 | .destroy = mem_cgroup_destroy, | ||
1189 | .populate = mem_cgroup_populate, | ||
1190 | .attach = mem_cgroup_move_task, | ||
1191 | .early_init = 0, | ||
1192 | }; | ||
diff --git a/mm/memory.c b/mm/memory.c index d902d0e25edc..153a54b2013c 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -50,6 +50,7 @@ | |||
50 | #include <linux/delayacct.h> | 50 | #include <linux/delayacct.h> |
51 | #include <linux/init.h> | 51 | #include <linux/init.h> |
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | #include <linux/memcontrol.h> | ||
53 | 54 | ||
54 | #include <asm/pgalloc.h> | 55 | #include <asm/pgalloc.h> |
55 | #include <asm/uaccess.h> | 56 | #include <asm/uaccess.h> |
@@ -82,7 +83,18 @@ void * high_memory; | |||
82 | EXPORT_SYMBOL(num_physpages); | 83 | EXPORT_SYMBOL(num_physpages); |
83 | EXPORT_SYMBOL(high_memory); | 84 | EXPORT_SYMBOL(high_memory); |
84 | 85 | ||
85 | int randomize_va_space __read_mostly = 1; | 86 | /* |
87 | * Randomize the address space (stacks, mmaps, brk, etc.). | ||
88 | * | ||
89 | * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, | ||
90 | * as ancient (libc5 based) binaries can segfault. ) | ||
91 | */ | ||
92 | int randomize_va_space __read_mostly = | ||
93 | #ifdef CONFIG_COMPAT_BRK | ||
94 | 1; | ||
95 | #else | ||
96 | 2; | ||
97 | #endif | ||
86 | 98 | ||
87 | static int __init disable_randmaps(char *s) | 99 | static int __init disable_randmaps(char *s) |
88 | { | 100 | { |
@@ -305,7 +317,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | |||
305 | spin_lock(&mm->page_table_lock); | 317 | spin_lock(&mm->page_table_lock); |
306 | if (pmd_present(*pmd)) { /* Another has populated it */ | 318 | if (pmd_present(*pmd)) { /* Another has populated it */ |
307 | pte_lock_deinit(new); | 319 | pte_lock_deinit(new); |
308 | pte_free(new); | 320 | pte_free(mm, new); |
309 | } else { | 321 | } else { |
310 | mm->nr_ptes++; | 322 | mm->nr_ptes++; |
311 | inc_zone_page_state(new, NR_PAGETABLE); | 323 | inc_zone_page_state(new, NR_PAGETABLE); |
@@ -323,7 +335,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) | |||
323 | 335 | ||
324 | spin_lock(&init_mm.page_table_lock); | 336 | spin_lock(&init_mm.page_table_lock); |
325 | if (pmd_present(*pmd)) /* Another has populated it */ | 337 | if (pmd_present(*pmd)) /* Another has populated it */ |
326 | pte_free_kernel(new); | 338 | pte_free_kernel(&init_mm, new); |
327 | else | 339 | else |
328 | pmd_populate_kernel(&init_mm, pmd, new); | 340 | pmd_populate_kernel(&init_mm, pmd, new); |
329 | spin_unlock(&init_mm.page_table_lock); | 341 | spin_unlock(&init_mm.page_table_lock); |
@@ -1109,7 +1121,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1109 | } | 1121 | } |
1110 | EXPORT_SYMBOL(get_user_pages); | 1122 | EXPORT_SYMBOL(get_user_pages); |
1111 | 1123 | ||
1112 | pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) | 1124 | pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, |
1125 | spinlock_t **ptl) | ||
1113 | { | 1126 | { |
1114 | pgd_t * pgd = pgd_offset(mm, addr); | 1127 | pgd_t * pgd = pgd_offset(mm, addr); |
1115 | pud_t * pud = pud_alloc(mm, pgd, addr); | 1128 | pud_t * pud = pud_alloc(mm, pgd, addr); |
@@ -1132,16 +1145,20 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa | |||
1132 | { | 1145 | { |
1133 | int retval; | 1146 | int retval; |
1134 | pte_t *pte; | 1147 | pte_t *pte; |
1135 | spinlock_t *ptl; | 1148 | spinlock_t *ptl; |
1149 | |||
1150 | retval = mem_cgroup_charge(page, mm, GFP_KERNEL); | ||
1151 | if (retval) | ||
1152 | goto out; | ||
1136 | 1153 | ||
1137 | retval = -EINVAL; | 1154 | retval = -EINVAL; |
1138 | if (PageAnon(page)) | 1155 | if (PageAnon(page)) |
1139 | goto out; | 1156 | goto out_uncharge; |
1140 | retval = -ENOMEM; | 1157 | retval = -ENOMEM; |
1141 | flush_dcache_page(page); | 1158 | flush_dcache_page(page); |
1142 | pte = get_locked_pte(mm, addr, &ptl); | 1159 | pte = get_locked_pte(mm, addr, &ptl); |
1143 | if (!pte) | 1160 | if (!pte) |
1144 | goto out; | 1161 | goto out_uncharge; |
1145 | retval = -EBUSY; | 1162 | retval = -EBUSY; |
1146 | if (!pte_none(*pte)) | 1163 | if (!pte_none(*pte)) |
1147 | goto out_unlock; | 1164 | goto out_unlock; |
@@ -1153,8 +1170,12 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa | |||
1153 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 1170 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); |
1154 | 1171 | ||
1155 | retval = 0; | 1172 | retval = 0; |
1173 | pte_unmap_unlock(pte, ptl); | ||
1174 | return retval; | ||
1156 | out_unlock: | 1175 | out_unlock: |
1157 | pte_unmap_unlock(pte, ptl); | 1176 | pte_unmap_unlock(pte, ptl); |
1177 | out_uncharge: | ||
1178 | mem_cgroup_uncharge_page(page); | ||
1158 | out: | 1179 | out: |
1159 | return retval; | 1180 | return retval; |
1160 | } | 1181 | } |
@@ -1517,10 +1538,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo | |||
1517 | memset(kaddr, 0, PAGE_SIZE); | 1538 | memset(kaddr, 0, PAGE_SIZE); |
1518 | kunmap_atomic(kaddr, KM_USER0); | 1539 | kunmap_atomic(kaddr, KM_USER0); |
1519 | flush_dcache_page(dst); | 1540 | flush_dcache_page(dst); |
1520 | return; | 1541 | } else |
1521 | 1542 | copy_user_highpage(dst, src, va, vma); | |
1522 | } | ||
1523 | copy_user_highpage(dst, src, va, vma); | ||
1524 | } | 1543 | } |
1525 | 1544 | ||
1526 | /* | 1545 | /* |
@@ -1629,6 +1648,10 @@ gotten: | |||
1629 | if (!new_page) | 1648 | if (!new_page) |
1630 | goto oom; | 1649 | goto oom; |
1631 | cow_user_page(new_page, old_page, address, vma); | 1650 | cow_user_page(new_page, old_page, address, vma); |
1651 | __SetPageUptodate(new_page); | ||
1652 | |||
1653 | if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) | ||
1654 | goto oom_free_new; | ||
1632 | 1655 | ||
1633 | /* | 1656 | /* |
1634 | * Re-check the pte - we dropped the lock | 1657 | * Re-check the pte - we dropped the lock |
@@ -1661,7 +1684,9 @@ gotten: | |||
1661 | /* Free the old page.. */ | 1684 | /* Free the old page.. */ |
1662 | new_page = old_page; | 1685 | new_page = old_page; |
1663 | ret |= VM_FAULT_WRITE; | 1686 | ret |= VM_FAULT_WRITE; |
1664 | } | 1687 | } else |
1688 | mem_cgroup_uncharge_page(new_page); | ||
1689 | |||
1665 | if (new_page) | 1690 | if (new_page) |
1666 | page_cache_release(new_page); | 1691 | page_cache_release(new_page); |
1667 | if (old_page) | 1692 | if (old_page) |
@@ -1685,6 +1710,8 @@ unlock: | |||
1685 | put_page(dirty_page); | 1710 | put_page(dirty_page); |
1686 | } | 1711 | } |
1687 | return ret; | 1712 | return ret; |
1713 | oom_free_new: | ||
1714 | __free_page(new_page); | ||
1688 | oom: | 1715 | oom: |
1689 | if (old_page) | 1716 | if (old_page) |
1690 | page_cache_release(old_page); | 1717 | page_cache_release(old_page); |
@@ -1909,50 +1936,49 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
1909 | */ | 1936 | */ |
1910 | int vmtruncate(struct inode * inode, loff_t offset) | 1937 | int vmtruncate(struct inode * inode, loff_t offset) |
1911 | { | 1938 | { |
1912 | struct address_space *mapping = inode->i_mapping; | 1939 | if (inode->i_size < offset) { |
1913 | unsigned long limit; | 1940 | unsigned long limit; |
1914 | 1941 | ||
1915 | if (inode->i_size < offset) | 1942 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; |
1916 | goto do_expand; | 1943 | if (limit != RLIM_INFINITY && offset > limit) |
1917 | /* | 1944 | goto out_sig; |
1918 | * truncation of in-use swapfiles is disallowed - it would cause | 1945 | if (offset > inode->i_sb->s_maxbytes) |
1919 | * subsequent swapout to scribble on the now-freed blocks. | 1946 | goto out_big; |
1920 | */ | 1947 | i_size_write(inode, offset); |
1921 | if (IS_SWAPFILE(inode)) | 1948 | } else { |
1922 | goto out_busy; | 1949 | struct address_space *mapping = inode->i_mapping; |
1923 | i_size_write(inode, offset); | 1950 | |
1951 | /* | ||
1952 | * truncation of in-use swapfiles is disallowed - it would | ||
1953 | * cause subsequent swapout to scribble on the now-freed | ||
1954 | * blocks. | ||
1955 | */ | ||
1956 | if (IS_SWAPFILE(inode)) | ||
1957 | return -ETXTBSY; | ||
1958 | i_size_write(inode, offset); | ||
1959 | |||
1960 | /* | ||
1961 | * unmap_mapping_range is called twice, first simply for | ||
1962 | * efficiency so that truncate_inode_pages does fewer | ||
1963 | * single-page unmaps. However after this first call, and | ||
1964 | * before truncate_inode_pages finishes, it is possible for | ||
1965 | * private pages to be COWed, which remain after | ||
1966 | * truncate_inode_pages finishes, hence the second | ||
1967 | * unmap_mapping_range call must be made for correctness. | ||
1968 | */ | ||
1969 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | ||
1970 | truncate_inode_pages(mapping, offset); | ||
1971 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | ||
1972 | } | ||
1924 | 1973 | ||
1925 | /* | ||
1926 | * unmap_mapping_range is called twice, first simply for efficiency | ||
1927 | * so that truncate_inode_pages does fewer single-page unmaps. However | ||
1928 | * after this first call, and before truncate_inode_pages finishes, | ||
1929 | * it is possible for private pages to be COWed, which remain after | ||
1930 | * truncate_inode_pages finishes, hence the second unmap_mapping_range | ||
1931 | * call must be made for correctness. | ||
1932 | */ | ||
1933 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | ||
1934 | truncate_inode_pages(mapping, offset); | ||
1935 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | ||
1936 | goto out_truncate; | ||
1937 | |||
1938 | do_expand: | ||
1939 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | ||
1940 | if (limit != RLIM_INFINITY && offset > limit) | ||
1941 | goto out_sig; | ||
1942 | if (offset > inode->i_sb->s_maxbytes) | ||
1943 | goto out_big; | ||
1944 | i_size_write(inode, offset); | ||
1945 | |||
1946 | out_truncate: | ||
1947 | if (inode->i_op && inode->i_op->truncate) | 1974 | if (inode->i_op && inode->i_op->truncate) |
1948 | inode->i_op->truncate(inode); | 1975 | inode->i_op->truncate(inode); |
1949 | return 0; | 1976 | return 0; |
1977 | |||
1950 | out_sig: | 1978 | out_sig: |
1951 | send_sig(SIGXFSZ, current, 0); | 1979 | send_sig(SIGXFSZ, current, 0); |
1952 | out_big: | 1980 | out_big: |
1953 | return -EFBIG; | 1981 | return -EFBIG; |
1954 | out_busy: | ||
1955 | return -ETXTBSY; | ||
1956 | } | 1982 | } |
1957 | EXPORT_SYMBOL(vmtruncate); | 1983 | EXPORT_SYMBOL(vmtruncate); |
1958 | 1984 | ||
@@ -1980,67 +2006,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | |||
1980 | return 0; | 2006 | return 0; |
1981 | } | 2007 | } |
1982 | 2008 | ||
1983 | /** | ||
1984 | * swapin_readahead - swap in pages in hope we need them soon | ||
1985 | * @entry: swap entry of this memory | ||
1986 | * @addr: address to start | ||
1987 | * @vma: user vma this addresses belong to | ||
1988 | * | ||
1989 | * Primitive swap readahead code. We simply read an aligned block of | ||
1990 | * (1 << page_cluster) entries in the swap area. This method is chosen | ||
1991 | * because it doesn't cost us any seek time. We also make sure to queue | ||
1992 | * the 'original' request together with the readahead ones... | ||
1993 | * | ||
1994 | * This has been extended to use the NUMA policies from the mm triggering | ||
1995 | * the readahead. | ||
1996 | * | ||
1997 | * Caller must hold down_read on the vma->vm_mm if vma is not NULL. | ||
1998 | */ | ||
1999 | void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) | ||
2000 | { | ||
2001 | #ifdef CONFIG_NUMA | ||
2002 | struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL; | ||
2003 | #endif | ||
2004 | int i, num; | ||
2005 | struct page *new_page; | ||
2006 | unsigned long offset; | ||
2007 | |||
2008 | /* | ||
2009 | * Get the number of handles we should do readahead io to. | ||
2010 | */ | ||
2011 | num = valid_swaphandles(entry, &offset); | ||
2012 | for (i = 0; i < num; offset++, i++) { | ||
2013 | /* Ok, do the async read-ahead now */ | ||
2014 | new_page = read_swap_cache_async(swp_entry(swp_type(entry), | ||
2015 | offset), vma, addr); | ||
2016 | if (!new_page) | ||
2017 | break; | ||
2018 | page_cache_release(new_page); | ||
2019 | #ifdef CONFIG_NUMA | ||
2020 | /* | ||
2021 | * Find the next applicable VMA for the NUMA policy. | ||
2022 | */ | ||
2023 | addr += PAGE_SIZE; | ||
2024 | if (addr == 0) | ||
2025 | vma = NULL; | ||
2026 | if (vma) { | ||
2027 | if (addr >= vma->vm_end) { | ||
2028 | vma = next_vma; | ||
2029 | next_vma = vma ? vma->vm_next : NULL; | ||
2030 | } | ||
2031 | if (vma && addr < vma->vm_start) | ||
2032 | vma = NULL; | ||
2033 | } else { | ||
2034 | if (next_vma && addr >= next_vma->vm_start) { | ||
2035 | vma = next_vma; | ||
2036 | next_vma = vma->vm_next; | ||
2037 | } | ||
2038 | } | ||
2039 | #endif | ||
2040 | } | ||
2041 | lru_add_drain(); /* Push any new pages onto the LRU now */ | ||
2042 | } | ||
2043 | |||
2044 | /* | 2009 | /* |
2045 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2010 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2046 | * but allow concurrent faults), and pte mapped but not yet locked. | 2011 | * but allow concurrent faults), and pte mapped but not yet locked. |
@@ -2068,8 +2033,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2068 | page = lookup_swap_cache(entry); | 2033 | page = lookup_swap_cache(entry); |
2069 | if (!page) { | 2034 | if (!page) { |
2070 | grab_swap_token(); /* Contend for token _before_ read-in */ | 2035 | grab_swap_token(); /* Contend for token _before_ read-in */ |
2071 | swapin_readahead(entry, address, vma); | 2036 | page = swapin_readahead(entry, |
2072 | page = read_swap_cache_async(entry, vma, address); | 2037 | GFP_HIGHUSER_MOVABLE, vma, address); |
2073 | if (!page) { | 2038 | if (!page) { |
2074 | /* | 2039 | /* |
2075 | * Back out if somebody else faulted in this pte | 2040 | * Back out if somebody else faulted in this pte |
@@ -2087,6 +2052,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2087 | count_vm_event(PGMAJFAULT); | 2052 | count_vm_event(PGMAJFAULT); |
2088 | } | 2053 | } |
2089 | 2054 | ||
2055 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | ||
2056 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | ||
2057 | ret = VM_FAULT_OOM; | ||
2058 | goto out; | ||
2059 | } | ||
2060 | |||
2090 | mark_page_accessed(page); | 2061 | mark_page_accessed(page); |
2091 | lock_page(page); | 2062 | lock_page(page); |
2092 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2063 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
@@ -2124,8 +2095,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2124 | if (write_access) { | 2095 | if (write_access) { |
2125 | /* XXX: We could OR the do_wp_page code with this one? */ | 2096 | /* XXX: We could OR the do_wp_page code with this one? */ |
2126 | if (do_wp_page(mm, vma, address, | 2097 | if (do_wp_page(mm, vma, address, |
2127 | page_table, pmd, ptl, pte) & VM_FAULT_OOM) | 2098 | page_table, pmd, ptl, pte) & VM_FAULT_OOM) { |
2099 | mem_cgroup_uncharge_page(page); | ||
2128 | ret = VM_FAULT_OOM; | 2100 | ret = VM_FAULT_OOM; |
2101 | } | ||
2129 | goto out; | 2102 | goto out; |
2130 | } | 2103 | } |
2131 | 2104 | ||
@@ -2136,6 +2109,7 @@ unlock: | |||
2136 | out: | 2109 | out: |
2137 | return ret; | 2110 | return ret; |
2138 | out_nomap: | 2111 | out_nomap: |
2112 | mem_cgroup_uncharge_page(page); | ||
2139 | pte_unmap_unlock(page_table, ptl); | 2113 | pte_unmap_unlock(page_table, ptl); |
2140 | unlock_page(page); | 2114 | unlock_page(page); |
2141 | page_cache_release(page); | 2115 | page_cache_release(page); |
@@ -2163,6 +2137,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2163 | page = alloc_zeroed_user_highpage_movable(vma, address); | 2137 | page = alloc_zeroed_user_highpage_movable(vma, address); |
2164 | if (!page) | 2138 | if (!page) |
2165 | goto oom; | 2139 | goto oom; |
2140 | __SetPageUptodate(page); | ||
2141 | |||
2142 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) | ||
2143 | goto oom_free_page; | ||
2166 | 2144 | ||
2167 | entry = mk_pte(page, vma->vm_page_prot); | 2145 | entry = mk_pte(page, vma->vm_page_prot); |
2168 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2146 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
@@ -2181,8 +2159,11 @@ unlock: | |||
2181 | pte_unmap_unlock(page_table, ptl); | 2159 | pte_unmap_unlock(page_table, ptl); |
2182 | return 0; | 2160 | return 0; |
2183 | release: | 2161 | release: |
2162 | mem_cgroup_uncharge_page(page); | ||
2184 | page_cache_release(page); | 2163 | page_cache_release(page); |
2185 | goto unlock; | 2164 | goto unlock; |
2165 | oom_free_page: | ||
2166 | __free_page(page); | ||
2186 | oom: | 2167 | oom: |
2187 | return VM_FAULT_OOM; | 2168 | return VM_FAULT_OOM; |
2188 | } | 2169 | } |
@@ -2263,6 +2244,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2263 | goto out; | 2244 | goto out; |
2264 | } | 2245 | } |
2265 | copy_user_highpage(page, vmf.page, address, vma); | 2246 | copy_user_highpage(page, vmf.page, address, vma); |
2247 | __SetPageUptodate(page); | ||
2266 | } else { | 2248 | } else { |
2267 | /* | 2249 | /* |
2268 | * If the page will be shareable, see if the backing | 2250 | * If the page will be shareable, see if the backing |
@@ -2295,6 +2277,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2295 | 2277 | ||
2296 | } | 2278 | } |
2297 | 2279 | ||
2280 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | ||
2281 | ret = VM_FAULT_OOM; | ||
2282 | goto out; | ||
2283 | } | ||
2284 | |||
2298 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2285 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
2299 | 2286 | ||
2300 | /* | 2287 | /* |
@@ -2330,6 +2317,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2330 | /* no need to invalidate: a not-present page won't be cached */ | 2317 | /* no need to invalidate: a not-present page won't be cached */ |
2331 | update_mmu_cache(vma, address, entry); | 2318 | update_mmu_cache(vma, address, entry); |
2332 | } else { | 2319 | } else { |
2320 | mem_cgroup_uncharge_page(page); | ||
2333 | if (anon) | 2321 | if (anon) |
2334 | page_cache_release(page); | 2322 | page_cache_release(page); |
2335 | else | 2323 | else |
@@ -2563,7 +2551,7 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) | |||
2563 | 2551 | ||
2564 | spin_lock(&mm->page_table_lock); | 2552 | spin_lock(&mm->page_table_lock); |
2565 | if (pgd_present(*pgd)) /* Another has populated it */ | 2553 | if (pgd_present(*pgd)) /* Another has populated it */ |
2566 | pud_free(new); | 2554 | pud_free(mm, new); |
2567 | else | 2555 | else |
2568 | pgd_populate(mm, pgd, new); | 2556 | pgd_populate(mm, pgd, new); |
2569 | spin_unlock(&mm->page_table_lock); | 2557 | spin_unlock(&mm->page_table_lock); |
@@ -2585,12 +2573,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | |||
2585 | spin_lock(&mm->page_table_lock); | 2573 | spin_lock(&mm->page_table_lock); |
2586 | #ifndef __ARCH_HAS_4LEVEL_HACK | 2574 | #ifndef __ARCH_HAS_4LEVEL_HACK |
2587 | if (pud_present(*pud)) /* Another has populated it */ | 2575 | if (pud_present(*pud)) /* Another has populated it */ |
2588 | pmd_free(new); | 2576 | pmd_free(mm, new); |
2589 | else | 2577 | else |
2590 | pud_populate(mm, pud, new); | 2578 | pud_populate(mm, pud, new); |
2591 | #else | 2579 | #else |
2592 | if (pgd_present(*pud)) /* Another has populated it */ | 2580 | if (pgd_present(*pud)) /* Another has populated it */ |
2593 | pmd_free(new); | 2581 | pmd_free(mm, new); |
2594 | else | 2582 | else |
2595 | pgd_populate(mm, pud, new); | 2583 | pgd_populate(mm, pud, new); |
2596 | #endif /* __ARCH_HAS_4LEVEL_HACK */ | 2584 | #endif /* __ARCH_HAS_4LEVEL_HACK */ |
@@ -2618,46 +2606,6 @@ int make_pages_present(unsigned long addr, unsigned long end) | |||
2618 | return ret == len ? 0 : -1; | 2606 | return ret == len ? 0 : -1; |
2619 | } | 2607 | } |
2620 | 2608 | ||
2621 | /* | ||
2622 | * Map a vmalloc()-space virtual address to the physical page. | ||
2623 | */ | ||
2624 | struct page * vmalloc_to_page(void * vmalloc_addr) | ||
2625 | { | ||
2626 | unsigned long addr = (unsigned long) vmalloc_addr; | ||
2627 | struct page *page = NULL; | ||
2628 | pgd_t *pgd = pgd_offset_k(addr); | ||
2629 | pud_t *pud; | ||
2630 | pmd_t *pmd; | ||
2631 | pte_t *ptep, pte; | ||
2632 | |||
2633 | if (!pgd_none(*pgd)) { | ||
2634 | pud = pud_offset(pgd, addr); | ||
2635 | if (!pud_none(*pud)) { | ||
2636 | pmd = pmd_offset(pud, addr); | ||
2637 | if (!pmd_none(*pmd)) { | ||
2638 | ptep = pte_offset_map(pmd, addr); | ||
2639 | pte = *ptep; | ||
2640 | if (pte_present(pte)) | ||
2641 | page = pte_page(pte); | ||
2642 | pte_unmap(ptep); | ||
2643 | } | ||
2644 | } | ||
2645 | } | ||
2646 | return page; | ||
2647 | } | ||
2648 | |||
2649 | EXPORT_SYMBOL(vmalloc_to_page); | ||
2650 | |||
2651 | /* | ||
2652 | * Map a vmalloc()-space virtual address to the physical page frame number. | ||
2653 | */ | ||
2654 | unsigned long vmalloc_to_pfn(void * vmalloc_addr) | ||
2655 | { | ||
2656 | return page_to_pfn(vmalloc_to_page(vmalloc_addr)); | ||
2657 | } | ||
2658 | |||
2659 | EXPORT_SYMBOL(vmalloc_to_pfn); | ||
2660 | |||
2661 | #if !defined(__HAVE_ARCH_GATE_AREA) | 2609 | #if !defined(__HAVE_ARCH_GATE_AREA) |
2662 | 2610 | ||
2663 | #if defined(AT_SYSINFO_EHDR) | 2611 | #if defined(AT_SYSINFO_EHDR) |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9512a544d044..7469c503580d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -481,8 +481,6 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
481 | return offlined; | 481 | return offlined; |
482 | } | 482 | } |
483 | 483 | ||
484 | extern void drain_all_local_pages(void); | ||
485 | |||
486 | int offline_pages(unsigned long start_pfn, | 484 | int offline_pages(unsigned long start_pfn, |
487 | unsigned long end_pfn, unsigned long timeout) | 485 | unsigned long end_pfn, unsigned long timeout) |
488 | { | 486 | { |
@@ -540,7 +538,7 @@ repeat: | |||
540 | lru_add_drain_all(); | 538 | lru_add_drain_all(); |
541 | flush_scheduled_work(); | 539 | flush_scheduled_work(); |
542 | cond_resched(); | 540 | cond_resched(); |
543 | drain_all_local_pages(); | 541 | drain_all_pages(); |
544 | } | 542 | } |
545 | 543 | ||
546 | pfn = scan_lru_pages(start_pfn, end_pfn); | 544 | pfn = scan_lru_pages(start_pfn, end_pfn); |
@@ -563,7 +561,7 @@ repeat: | |||
563 | flush_scheduled_work(); | 561 | flush_scheduled_work(); |
564 | yield(); | 562 | yield(); |
565 | /* drain pcp pages, this is synchronous. */ | 563 | /* drain pcp pages, this is synchronous. */ |
566 | drain_all_local_pages(); | 564 | drain_all_pages(); |
567 | /* check again */ | 565 | /* check again */ |
568 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); | 566 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); |
569 | if (offlined_pages < 0) { | 567 | if (offlined_pages < 0) { |
diff --git a/mm/migrate.c b/mm/migrate.c index 6a207e8d17ea..a73504ff5ab9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/mempolicy.h> | 29 | #include <linux/mempolicy.h> |
30 | #include <linux/vmalloc.h> | 30 | #include <linux/vmalloc.h> |
31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
32 | #include <linux/memcontrol.h> | ||
32 | 33 | ||
33 | #include "internal.h" | 34 | #include "internal.h" |
34 | 35 | ||
@@ -115,11 +116,6 @@ int putback_lru_pages(struct list_head *l) | |||
115 | return count; | 116 | return count; |
116 | } | 117 | } |
117 | 118 | ||
118 | static inline int is_swap_pte(pte_t pte) | ||
119 | { | ||
120 | return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); | ||
121 | } | ||
122 | |||
123 | /* | 119 | /* |
124 | * Restore a potential migration pte to a working pte entry | 120 | * Restore a potential migration pte to a working pte entry |
125 | */ | 121 | */ |
@@ -157,6 +153,11 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
157 | return; | 153 | return; |
158 | } | 154 | } |
159 | 155 | ||
156 | if (mem_cgroup_charge(new, mm, GFP_KERNEL)) { | ||
157 | pte_unmap(ptep); | ||
158 | return; | ||
159 | } | ||
160 | |||
160 | ptl = pte_lockptr(mm, pmd); | 161 | ptl = pte_lockptr(mm, pmd); |
161 | spin_lock(ptl); | 162 | spin_lock(ptl); |
162 | pte = *ptep; | 163 | pte = *ptep; |
@@ -592,9 +593,10 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
592 | else | 593 | else |
593 | rc = fallback_migrate_page(mapping, newpage, page); | 594 | rc = fallback_migrate_page(mapping, newpage, page); |
594 | 595 | ||
595 | if (!rc) | 596 | if (!rc) { |
597 | mem_cgroup_page_migration(page, newpage); | ||
596 | remove_migration_ptes(page, newpage); | 598 | remove_migration_ptes(page, newpage); |
597 | else | 599 | } else |
598 | newpage->mapping = NULL; | 600 | newpage->mapping = NULL; |
599 | 601 | ||
600 | unlock_page(newpage); | 602 | unlock_page(newpage); |
@@ -613,6 +615,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
613 | int *result = NULL; | 615 | int *result = NULL; |
614 | struct page *newpage = get_new_page(page, private, &result); | 616 | struct page *newpage = get_new_page(page, private, &result); |
615 | int rcu_locked = 0; | 617 | int rcu_locked = 0; |
618 | int charge = 0; | ||
616 | 619 | ||
617 | if (!newpage) | 620 | if (!newpage) |
618 | return -ENOMEM; | 621 | return -ENOMEM; |
@@ -645,23 +648,46 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
645 | rcu_read_lock(); | 648 | rcu_read_lock(); |
646 | rcu_locked = 1; | 649 | rcu_locked = 1; |
647 | } | 650 | } |
651 | |||
648 | /* | 652 | /* |
649 | * This is a corner case handling. | 653 | * Corner case handling: |
650 | * When a new swap-cache is read into, it is linked to LRU | 654 | * 1. When a new swap-cache page is read into, it is added to the LRU |
651 | * and treated as swapcache but has no rmap yet. | 655 | * and treated as swapcache but it has no rmap yet. |
652 | * Calling try_to_unmap() against a page->mapping==NULL page is | 656 | * Calling try_to_unmap() against a page->mapping==NULL page will |
653 | * BUG. So handle it here. | 657 | * trigger a BUG. So handle it here. |
658 | * 2. An orphaned page (see truncate_complete_page) might have | ||
659 | * fs-private metadata. The page can be picked up due to memory | ||
660 | * offlining. Everywhere else except page reclaim, the page is | ||
661 | * invisible to the vm, so the page can not be migrated. So try to | ||
662 | * free the metadata, so the page can be freed. | ||
654 | */ | 663 | */ |
655 | if (!page->mapping) | 664 | if (!page->mapping) { |
665 | if (!PageAnon(page) && PagePrivate(page)) { | ||
666 | /* | ||
667 | * Go direct to try_to_free_buffers() here because | ||
668 | * a) that's what try_to_release_page() would do anyway | ||
669 | * b) we may be under rcu_read_lock() here, so we can't | ||
670 | * use GFP_KERNEL which is what try_to_release_page() | ||
671 | * needs to be effective. | ||
672 | */ | ||
673 | try_to_free_buffers(page); | ||
674 | } | ||
656 | goto rcu_unlock; | 675 | goto rcu_unlock; |
676 | } | ||
677 | |||
678 | charge = mem_cgroup_prepare_migration(page); | ||
657 | /* Establish migration ptes or remove ptes */ | 679 | /* Establish migration ptes or remove ptes */ |
658 | try_to_unmap(page, 1); | 680 | try_to_unmap(page, 1); |
659 | 681 | ||
660 | if (!page_mapped(page)) | 682 | if (!page_mapped(page)) |
661 | rc = move_to_new_page(newpage, page); | 683 | rc = move_to_new_page(newpage, page); |
662 | 684 | ||
663 | if (rc) | 685 | if (rc) { |
664 | remove_migration_ptes(page, page); | 686 | remove_migration_ptes(page, page); |
687 | if (charge) | ||
688 | mem_cgroup_end_migration(page); | ||
689 | } else if (charge) | ||
690 | mem_cgroup_end_migration(newpage); | ||
665 | rcu_unlock: | 691 | rcu_unlock: |
666 | if (rcu_locked) | 692 | if (rcu_locked) |
667 | rcu_read_unlock(); | 693 | rcu_read_unlock(); |
@@ -36,6 +36,10 @@ | |||
36 | #define arch_mmap_check(addr, len, flags) (0) | 36 | #define arch_mmap_check(addr, len, flags) (0) |
37 | #endif | 37 | #endif |
38 | 38 | ||
39 | #ifndef arch_rebalance_pgtables | ||
40 | #define arch_rebalance_pgtables(addr, len) (addr) | ||
41 | #endif | ||
42 | |||
39 | static void unmap_region(struct mm_struct *mm, | 43 | static void unmap_region(struct mm_struct *mm, |
40 | struct vm_area_struct *vma, struct vm_area_struct *prev, | 44 | struct vm_area_struct *vma, struct vm_area_struct *prev, |
41 | unsigned long start, unsigned long end); | 45 | unsigned long start, unsigned long end); |
@@ -241,7 +245,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk) | |||
241 | 245 | ||
242 | down_write(&mm->mmap_sem); | 246 | down_write(&mm->mmap_sem); |
243 | 247 | ||
244 | if (brk < mm->end_code) | 248 | if (brk < mm->start_brk) |
245 | goto out; | 249 | goto out; |
246 | 250 | ||
247 | /* | 251 | /* |
@@ -1424,7 +1428,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
1424 | if (addr & ~PAGE_MASK) | 1428 | if (addr & ~PAGE_MASK) |
1425 | return -EINVAL; | 1429 | return -EINVAL; |
1426 | 1430 | ||
1427 | return addr; | 1431 | return arch_rebalance_pgtables(addr, len); |
1428 | } | 1432 | } |
1429 | 1433 | ||
1430 | EXPORT_SYMBOL(get_unmapped_area); | 1434 | EXPORT_SYMBOL(get_unmapped_area); |
@@ -2216,7 +2220,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
2216 | vma->vm_start = addr; | 2220 | vma->vm_start = addr; |
2217 | vma->vm_end = addr + len; | 2221 | vma->vm_end = addr + len; |
2218 | 2222 | ||
2219 | vma->vm_flags = vm_flags | mm->def_flags; | 2223 | vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; |
2220 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); | 2224 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); |
2221 | 2225 | ||
2222 | vma->vm_ops = &special_mapping_vmops; | 2226 | vma->vm_ops = &special_mapping_vmops; |
diff --git a/mm/nommu.c b/mm/nommu.c index b989cb928a7c..5d8ae086f74e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -10,6 +10,7 @@ | |||
10 | * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> | 10 | * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> |
11 | * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> | 11 | * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> |
12 | * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> | 12 | * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> |
13 | * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org> | ||
13 | */ | 14 | */ |
14 | 15 | ||
15 | #include <linux/module.h> | 16 | #include <linux/module.h> |
@@ -167,7 +168,7 @@ EXPORT_SYMBOL(get_user_pages); | |||
167 | DEFINE_RWLOCK(vmlist_lock); | 168 | DEFINE_RWLOCK(vmlist_lock); |
168 | struct vm_struct *vmlist; | 169 | struct vm_struct *vmlist; |
169 | 170 | ||
170 | void vfree(void *addr) | 171 | void vfree(const void *addr) |
171 | { | 172 | { |
172 | kfree(addr); | 173 | kfree(addr); |
173 | } | 174 | } |
@@ -183,13 +184,33 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | |||
183 | } | 184 | } |
184 | EXPORT_SYMBOL(__vmalloc); | 185 | EXPORT_SYMBOL(__vmalloc); |
185 | 186 | ||
186 | struct page * vmalloc_to_page(void *addr) | 187 | void *vmalloc_user(unsigned long size) |
188 | { | ||
189 | void *ret; | ||
190 | |||
191 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | ||
192 | PAGE_KERNEL); | ||
193 | if (ret) { | ||
194 | struct vm_area_struct *vma; | ||
195 | |||
196 | down_write(¤t->mm->mmap_sem); | ||
197 | vma = find_vma(current->mm, (unsigned long)ret); | ||
198 | if (vma) | ||
199 | vma->vm_flags |= VM_USERMAP; | ||
200 | up_write(¤t->mm->mmap_sem); | ||
201 | } | ||
202 | |||
203 | return ret; | ||
204 | } | ||
205 | EXPORT_SYMBOL(vmalloc_user); | ||
206 | |||
207 | struct page *vmalloc_to_page(const void *addr) | ||
187 | { | 208 | { |
188 | return virt_to_page(addr); | 209 | return virt_to_page(addr); |
189 | } | 210 | } |
190 | EXPORT_SYMBOL(vmalloc_to_page); | 211 | EXPORT_SYMBOL(vmalloc_to_page); |
191 | 212 | ||
192 | unsigned long vmalloc_to_pfn(void *addr) | 213 | unsigned long vmalloc_to_pfn(const void *addr) |
193 | { | 214 | { |
194 | return page_to_pfn(virt_to_page(addr)); | 215 | return page_to_pfn(virt_to_page(addr)); |
195 | } | 216 | } |
@@ -253,10 +274,17 @@ EXPORT_SYMBOL(vmalloc_32); | |||
253 | * | 274 | * |
254 | * The resulting memory area is 32bit addressable and zeroed so it can be | 275 | * The resulting memory area is 32bit addressable and zeroed so it can be |
255 | * mapped to userspace without leaking data. | 276 | * mapped to userspace without leaking data. |
277 | * | ||
278 | * VM_USERMAP is set on the corresponding VMA so that subsequent calls to | ||
279 | * remap_vmalloc_range() are permissible. | ||
256 | */ | 280 | */ |
257 | void *vmalloc_32_user(unsigned long size) | 281 | void *vmalloc_32_user(unsigned long size) |
258 | { | 282 | { |
259 | return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); | 283 | /* |
284 | * We'll have to sort out the ZONE_DMA bits for 64-bit, | ||
285 | * but for now this can simply use vmalloc_user() directly. | ||
286 | */ | ||
287 | return vmalloc_user(size); | ||
260 | } | 288 | } |
261 | EXPORT_SYMBOL(vmalloc_32_user); | 289 | EXPORT_SYMBOL(vmalloc_32_user); |
262 | 290 | ||
@@ -267,7 +295,7 @@ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_ | |||
267 | } | 295 | } |
268 | EXPORT_SYMBOL(vmap); | 296 | EXPORT_SYMBOL(vmap); |
269 | 297 | ||
270 | void vunmap(void *addr) | 298 | void vunmap(const void *addr) |
271 | { | 299 | { |
272 | BUG(); | 300 | BUG(); |
273 | } | 301 | } |
@@ -1216,6 +1244,21 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, | |||
1216 | } | 1244 | } |
1217 | EXPORT_SYMBOL(remap_pfn_range); | 1245 | EXPORT_SYMBOL(remap_pfn_range); |
1218 | 1246 | ||
1247 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | ||
1248 | unsigned long pgoff) | ||
1249 | { | ||
1250 | unsigned int size = vma->vm_end - vma->vm_start; | ||
1251 | |||
1252 | if (!(vma->vm_flags & VM_USERMAP)) | ||
1253 | return -EINVAL; | ||
1254 | |||
1255 | vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT)); | ||
1256 | vma->vm_end = vma->vm_start + size; | ||
1257 | |||
1258 | return 0; | ||
1259 | } | ||
1260 | EXPORT_SYMBOL(remap_vmalloc_range); | ||
1261 | |||
1219 | void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | 1262 | void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) |
1220 | { | 1263 | { |
1221 | } | 1264 | } |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 96473b482099..4194b9db0104 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -25,9 +25,11 @@ | |||
25 | #include <linux/cpuset.h> | 25 | #include <linux/cpuset.h> |
26 | #include <linux/module.h> | 26 | #include <linux/module.h> |
27 | #include <linux/notifier.h> | 27 | #include <linux/notifier.h> |
28 | #include <linux/memcontrol.h> | ||
28 | 29 | ||
29 | int sysctl_panic_on_oom; | 30 | int sysctl_panic_on_oom; |
30 | int sysctl_oom_kill_allocating_task; | 31 | int sysctl_oom_kill_allocating_task; |
32 | int sysctl_oom_dump_tasks; | ||
31 | static DEFINE_SPINLOCK(zone_scan_mutex); | 33 | static DEFINE_SPINLOCK(zone_scan_mutex); |
32 | /* #define DEBUG */ | 34 | /* #define DEBUG */ |
33 | 35 | ||
@@ -50,7 +52,8 @@ static DEFINE_SPINLOCK(zone_scan_mutex); | |||
50 | * of least surprise ... (be careful when you change it) | 52 | * of least surprise ... (be careful when you change it) |
51 | */ | 53 | */ |
52 | 54 | ||
53 | unsigned long badness(struct task_struct *p, unsigned long uptime) | 55 | unsigned long badness(struct task_struct *p, unsigned long uptime, |
56 | struct mem_cgroup *mem) | ||
54 | { | 57 | { |
55 | unsigned long points, cpu_time, run_time, s; | 58 | unsigned long points, cpu_time, run_time, s; |
56 | struct mm_struct *mm; | 59 | struct mm_struct *mm; |
@@ -125,8 +128,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
125 | * Superuser processes are usually more important, so we make it | 128 | * Superuser processes are usually more important, so we make it |
126 | * less likely that we kill those. | 129 | * less likely that we kill those. |
127 | */ | 130 | */ |
128 | if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) || | 131 | if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) |
129 | p->uid == 0 || p->euid == 0) | ||
130 | points /= 4; | 132 | points /= 4; |
131 | 133 | ||
132 | /* | 134 | /* |
@@ -135,7 +137,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
135 | * tend to only have this flag set on applications they think | 137 | * tend to only have this flag set on applications they think |
136 | * of as important. | 138 | * of as important. |
137 | */ | 139 | */ |
138 | if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) | 140 | if (__capable(p, CAP_SYS_RAWIO)) |
139 | points /= 4; | 141 | points /= 4; |
140 | 142 | ||
141 | /* | 143 | /* |
@@ -194,7 +196,8 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
194 | * | 196 | * |
195 | * (not docbooked, we don't want this one cluttering up the manual) | 197 | * (not docbooked, we don't want this one cluttering up the manual) |
196 | */ | 198 | */ |
197 | static struct task_struct *select_bad_process(unsigned long *ppoints) | 199 | static struct task_struct *select_bad_process(unsigned long *ppoints, |
200 | struct mem_cgroup *mem) | ||
198 | { | 201 | { |
199 | struct task_struct *g, *p; | 202 | struct task_struct *g, *p; |
200 | struct task_struct *chosen = NULL; | 203 | struct task_struct *chosen = NULL; |
@@ -214,6 +217,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) | |||
214 | /* skip the init task */ | 217 | /* skip the init task */ |
215 | if (is_global_init(p)) | 218 | if (is_global_init(p)) |
216 | continue; | 219 | continue; |
220 | if (mem && !task_in_mem_cgroup(p, mem)) | ||
221 | continue; | ||
217 | 222 | ||
218 | /* | 223 | /* |
219 | * This task already has access to memory reserves and is | 224 | * This task already has access to memory reserves and is |
@@ -248,7 +253,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) | |||
248 | if (p->oomkilladj == OOM_DISABLE) | 253 | if (p->oomkilladj == OOM_DISABLE) |
249 | continue; | 254 | continue; |
250 | 255 | ||
251 | points = badness(p, uptime.tv_sec); | 256 | points = badness(p, uptime.tv_sec, mem); |
252 | if (points > *ppoints || !chosen) { | 257 | if (points > *ppoints || !chosen) { |
253 | chosen = p; | 258 | chosen = p; |
254 | *ppoints = points; | 259 | *ppoints = points; |
@@ -259,6 +264,41 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) | |||
259 | } | 264 | } |
260 | 265 | ||
261 | /** | 266 | /** |
267 | * Dumps the current memory state of all system tasks, excluding kernel threads. | ||
268 | * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj | ||
269 | * score, and name. | ||
270 | * | ||
271 | * If @mem is non-NULL, only tasks that are members of that mem_cgroup are | ||
272 | * shown. | ||
273 | * | ||
274 | * Call with tasklist_lock read-locked. | ||
275 | */ | ||
276 | static void dump_tasks(const struct mem_cgroup *mem) | ||
277 | { | ||
278 | struct task_struct *g, *p; | ||
279 | |||
280 | printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " | ||
281 | "name\n"); | ||
282 | do_each_thread(g, p) { | ||
283 | /* | ||
284 | * total_vm and rss sizes do not exist for tasks with a | ||
285 | * detached mm so there's no need to report them. | ||
286 | */ | ||
287 | if (!p->mm) | ||
288 | continue; | ||
289 | if (mem && !task_in_mem_cgroup(p, mem)) | ||
290 | continue; | ||
291 | |||
292 | task_lock(p); | ||
293 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", | ||
294 | p->pid, p->uid, p->tgid, p->mm->total_vm, | ||
295 | get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj, | ||
296 | p->comm); | ||
297 | task_unlock(p); | ||
298 | } while_each_thread(g, p); | ||
299 | } | ||
300 | |||
301 | /** | ||
262 | * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO | 302 | * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO |
263 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO | 303 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO |
264 | * set. | 304 | * set. |
@@ -335,7 +375,8 @@ static int oom_kill_task(struct task_struct *p) | |||
335 | } | 375 | } |
336 | 376 | ||
337 | static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | 377 | static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, |
338 | unsigned long points, const char *message) | 378 | unsigned long points, struct mem_cgroup *mem, |
379 | const char *message) | ||
339 | { | 380 | { |
340 | struct task_struct *c; | 381 | struct task_struct *c; |
341 | 382 | ||
@@ -345,6 +386,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
345 | current->comm, gfp_mask, order, current->oomkilladj); | 386 | current->comm, gfp_mask, order, current->oomkilladj); |
346 | dump_stack(); | 387 | dump_stack(); |
347 | show_mem(); | 388 | show_mem(); |
389 | if (sysctl_oom_dump_tasks) | ||
390 | dump_tasks(mem); | ||
348 | } | 391 | } |
349 | 392 | ||
350 | /* | 393 | /* |
@@ -369,6 +412,31 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
369 | return oom_kill_task(p); | 412 | return oom_kill_task(p); |
370 | } | 413 | } |
371 | 414 | ||
415 | #ifdef CONFIG_CGROUP_MEM_CONT | ||
416 | void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | ||
417 | { | ||
418 | unsigned long points = 0; | ||
419 | struct task_struct *p; | ||
420 | |||
421 | cgroup_lock(); | ||
422 | rcu_read_lock(); | ||
423 | retry: | ||
424 | p = select_bad_process(&points, mem); | ||
425 | if (PTR_ERR(p) == -1UL) | ||
426 | goto out; | ||
427 | |||
428 | if (!p) | ||
429 | p = current; | ||
430 | |||
431 | if (oom_kill_process(p, gfp_mask, 0, points, mem, | ||
432 | "Memory cgroup out of memory")) | ||
433 | goto retry; | ||
434 | out: | ||
435 | rcu_read_unlock(); | ||
436 | cgroup_unlock(); | ||
437 | } | ||
438 | #endif | ||
439 | |||
372 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); | 440 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); |
373 | 441 | ||
374 | int register_oom_notifier(struct notifier_block *nb) | 442 | int register_oom_notifier(struct notifier_block *nb) |
@@ -466,7 +534,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
466 | 534 | ||
467 | switch (constraint) { | 535 | switch (constraint) { |
468 | case CONSTRAINT_MEMORY_POLICY: | 536 | case CONSTRAINT_MEMORY_POLICY: |
469 | oom_kill_process(current, gfp_mask, order, points, | 537 | oom_kill_process(current, gfp_mask, order, points, NULL, |
470 | "No available memory (MPOL_BIND)"); | 538 | "No available memory (MPOL_BIND)"); |
471 | break; | 539 | break; |
472 | 540 | ||
@@ -476,7 +544,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
476 | /* Fall-through */ | 544 | /* Fall-through */ |
477 | case CONSTRAINT_CPUSET: | 545 | case CONSTRAINT_CPUSET: |
478 | if (sysctl_oom_kill_allocating_task) { | 546 | if (sysctl_oom_kill_allocating_task) { |
479 | oom_kill_process(current, gfp_mask, order, points, | 547 | oom_kill_process(current, gfp_mask, order, points, NULL, |
480 | "Out of memory (oom_kill_allocating_task)"); | 548 | "Out of memory (oom_kill_allocating_task)"); |
481 | break; | 549 | break; |
482 | } | 550 | } |
@@ -485,7 +553,7 @@ retry: | |||
485 | * Rambo mode: Shoot down a process and hope it solves whatever | 553 | * Rambo mode: Shoot down a process and hope it solves whatever |
486 | * issues we may have. | 554 | * issues we may have. |
487 | */ | 555 | */ |
488 | p = select_bad_process(&points); | 556 | p = select_bad_process(&points, NULL); |
489 | 557 | ||
490 | if (PTR_ERR(p) == -1UL) | 558 | if (PTR_ERR(p) == -1UL) |
491 | goto out; | 559 | goto out; |
@@ -496,7 +564,7 @@ retry: | |||
496 | panic("Out of memory and no killable processes...\n"); | 564 | panic("Out of memory and no killable processes...\n"); |
497 | } | 565 | } |
498 | 566 | ||
499 | if (oom_kill_process(p, gfp_mask, order, points, | 567 | if (oom_kill_process(p, gfp_mask, order, points, NULL, |
500 | "Out of memory")) | 568 | "Out of memory")) |
501 | goto retry; | 569 | goto retry; |
502 | 570 | ||
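
The oom_kill.c hunks above thread a struct mem_cgroup through badness(), select_bad_process() and oom_kill_process(), and add sysctl_oom_dump_tasks to gate the new dump_tasks() report. A small userspace sketch that turns the knob on, assuming it is exposed as /proc/sys/vm/oom_dump_tasks by the usual kernel/sysctl.c wiring, which sits outside this mm/-only diff.

#include <stdio.h>

/* Enable the per-task dump on OOM (same effect as sysctl vm.oom_dump_tasks=1). */
int main(void)
{
        FILE *f = fopen("/proc/sys/vm/oom_dump_tasks", "w");

        if (!f) {
                perror("oom_dump_tasks");
                return 1;
        }
        fputs("1\n", f);        /* dump_tasks() will run on subsequent OOM kills */
        fclose(f);
        return 0;
}

With the knob set, the kernel prints one line per user task (pid, uid, tgid, total_vm, rss, cpu, oom_adj, name) before a victim is killed, for tasks in the offending cgroup only when the OOM is cgroup-constrained.
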
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3d3848fa6324..5e00f1772c20 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void) | |||
69 | int dirty_background_ratio = 5; | 69 | int dirty_background_ratio = 5; |
70 | 70 | ||
71 | /* | 71 | /* |
72 | * free highmem will not be subtracted from the total free memory | ||
73 | * for calculating free ratios if vm_highmem_is_dirtyable is true | ||
74 | */ | ||
75 | int vm_highmem_is_dirtyable; | ||
76 | |||
77 | /* | ||
72 | * The generator of dirty data starts writeback at this percentage | 78 | * The generator of dirty data starts writeback at this percentage |
73 | */ | 79 | */ |
74 | int vm_dirty_ratio = 10; | 80 | int vm_dirty_ratio = 10; |
@@ -219,7 +225,7 @@ static inline void task_dirties_fraction(struct task_struct *tsk, | |||
219 | * | 225 | * |
220 | * dirty -= (dirty/8) * p_{t} | 226 | * dirty -= (dirty/8) * p_{t} |
221 | */ | 227 | */ |
222 | void task_dirty_limit(struct task_struct *tsk, long *pdirty) | 228 | static void task_dirty_limit(struct task_struct *tsk, long *pdirty) |
223 | { | 229 | { |
224 | long numerator, denominator; | 230 | long numerator, denominator; |
225 | long dirty = *pdirty; | 231 | long dirty = *pdirty; |
@@ -287,7 +293,10 @@ static unsigned long determine_dirtyable_memory(void) | |||
287 | x = global_page_state(NR_FREE_PAGES) | 293 | x = global_page_state(NR_FREE_PAGES) |
288 | + global_page_state(NR_INACTIVE) | 294 | + global_page_state(NR_INACTIVE) |
289 | + global_page_state(NR_ACTIVE); | 295 | + global_page_state(NR_ACTIVE); |
290 | x -= highmem_dirtyable_memory(x); | 296 | |
297 | if (!vm_highmem_is_dirtyable) | ||
298 | x -= highmem_dirtyable_memory(x); | ||
299 | |||
291 | return x + 1; /* Ensure that we never return 0 */ | 300 | return x + 1; /* Ensure that we never return 0 */ |
292 | } | 301 | } |
293 | 302 | ||
@@ -558,6 +567,7 @@ static void background_writeout(unsigned long _min_pages) | |||
558 | global_page_state(NR_UNSTABLE_NFS) < background_thresh | 567 | global_page_state(NR_UNSTABLE_NFS) < background_thresh |
559 | && min_pages <= 0) | 568 | && min_pages <= 0) |
560 | break; | 569 | break; |
570 | wbc.more_io = 0; | ||
561 | wbc.encountered_congestion = 0; | 571 | wbc.encountered_congestion = 0; |
562 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | 572 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; |
563 | wbc.pages_skipped = 0; | 573 | wbc.pages_skipped = 0; |
@@ -565,8 +575,9 @@ static void background_writeout(unsigned long _min_pages) | |||
565 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | 575 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; |
566 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { | 576 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { |
567 | /* Wrote less than expected */ | 577 | /* Wrote less than expected */ |
568 | congestion_wait(WRITE, HZ/10); | 578 | if (wbc.encountered_congestion || wbc.more_io) |
569 | if (!wbc.encountered_congestion) | 579 | congestion_wait(WRITE, HZ/10); |
580 | else | ||
570 | break; | 581 | break; |
571 | } | 582 | } |
572 | } | 583 | } |
@@ -631,11 +642,12 @@ static void wb_kupdate(unsigned long arg) | |||
631 | global_page_state(NR_UNSTABLE_NFS) + | 642 | global_page_state(NR_UNSTABLE_NFS) + |
632 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | 643 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); |
633 | while (nr_to_write > 0) { | 644 | while (nr_to_write > 0) { |
645 | wbc.more_io = 0; | ||
634 | wbc.encountered_congestion = 0; | 646 | wbc.encountered_congestion = 0; |
635 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | 647 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; |
636 | writeback_inodes(&wbc); | 648 | writeback_inodes(&wbc); |
637 | if (wbc.nr_to_write > 0) { | 649 | if (wbc.nr_to_write > 0) { |
638 | if (wbc.encountered_congestion) | 650 | if (wbc.encountered_congestion || wbc.more_io) |
639 | congestion_wait(WRITE, HZ/10); | 651 | congestion_wait(WRITE, HZ/10); |
640 | else | 652 | else |
641 | break; /* All the old data is written */ | 653 | break; /* All the old data is written */ |
@@ -1064,7 +1076,7 @@ static int __set_page_dirty(struct page *page) | |||
1064 | return 0; | 1076 | return 0; |
1065 | } | 1077 | } |
1066 | 1078 | ||
1067 | int fastcall set_page_dirty(struct page *page) | 1079 | int set_page_dirty(struct page *page) |
1068 | { | 1080 | { |
1069 | int ret = __set_page_dirty(page); | 1081 | int ret = __set_page_dirty(page); |
1070 | if (ret) | 1082 | if (ret) |
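
With vm_highmem_is_dirtyable added above, highmem is subtracted from the dirtyable total only while the knob stays clear. A simplified model of that calculation for reference; dirtyable_memory() and its parameters are illustrative stand-ins for the global_page_state() and highmem_dirtyable_memory() calls in the real function.

/*
 * Simplified model of determine_dirtyable_memory() after this patch.
 * The counters are parameters here instead of global_page_state()
 * lookups, and highmem_dirtyable_memory() is reduced to a plain value.
 */
static unsigned long dirtyable_memory(unsigned long nr_free,
                                      unsigned long nr_inactive,
                                      unsigned long nr_active,
                                      unsigned long highmem_dirtyable,
                                      int highmem_is_dirtyable)
{
        unsigned long x = nr_free + nr_inactive + nr_active;

        /* Only exclude highmem while the new knob is left at 0. */
        if (!highmem_is_dirtyable)
                x -= highmem_dirtyable;

        return x + 1;           /* mirrors the "never return 0" guard above */
}
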
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b2838c24e582..26a54a17dc9f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -43,6 +43,7 @@ | |||
43 | #include <linux/backing-dev.h> | 43 | #include <linux/backing-dev.h> |
44 | #include <linux/fault-inject.h> | 44 | #include <linux/fault-inject.h> |
45 | #include <linux/page-isolation.h> | 45 | #include <linux/page-isolation.h> |
46 | #include <linux/memcontrol.h> | ||
46 | 47 | ||
47 | #include <asm/tlbflush.h> | 48 | #include <asm/tlbflush.h> |
48 | #include <asm/div64.h> | 49 | #include <asm/div64.h> |
@@ -537,7 +538,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
537 | /* | 538 | /* |
538 | * permit the bootmem allocator to evade page validation on high-order frees | 539 | * permit the bootmem allocator to evade page validation on high-order frees |
539 | */ | 540 | */ |
540 | void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | 541 | void __init __free_pages_bootmem(struct page *page, unsigned int order) |
541 | { | 542 | { |
542 | if (order == 0) { | 543 | if (order == 0) { |
543 | __ClearPageReserved(page); | 544 | __ClearPageReserved(page); |
@@ -890,31 +891,51 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
890 | } | 891 | } |
891 | #endif | 892 | #endif |
892 | 893 | ||
893 | static void __drain_pages(unsigned int cpu) | 894 | /* |
895 | * Drain pages of the indicated processor. | ||
896 | * | ||
897 | * The processor must either be the current processor and the | ||
898 | * thread pinned to the current processor or a processor that | ||
899 | * is not online. | ||
900 | */ | ||
901 | static void drain_pages(unsigned int cpu) | ||
894 | { | 902 | { |
895 | unsigned long flags; | 903 | unsigned long flags; |
896 | struct zone *zone; | 904 | struct zone *zone; |
897 | int i; | ||
898 | 905 | ||
899 | for_each_zone(zone) { | 906 | for_each_zone(zone) { |
900 | struct per_cpu_pageset *pset; | 907 | struct per_cpu_pageset *pset; |
908 | struct per_cpu_pages *pcp; | ||
901 | 909 | ||
902 | if (!populated_zone(zone)) | 910 | if (!populated_zone(zone)) |
903 | continue; | 911 | continue; |
904 | 912 | ||
905 | pset = zone_pcp(zone, cpu); | 913 | pset = zone_pcp(zone, cpu); |
906 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 914 | |
907 | struct per_cpu_pages *pcp; | 915 | pcp = &pset->pcp; |
908 | 916 | local_irq_save(flags); | |
909 | pcp = &pset->pcp[i]; | 917 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); |
910 | local_irq_save(flags); | 918 | pcp->count = 0; |
911 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 919 | local_irq_restore(flags); |
912 | pcp->count = 0; | ||
913 | local_irq_restore(flags); | ||
914 | } | ||
915 | } | 920 | } |
916 | } | 921 | } |
917 | 922 | ||
923 | /* | ||
924 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | ||
925 | */ | ||
926 | void drain_local_pages(void *arg) | ||
927 | { | ||
928 | drain_pages(smp_processor_id()); | ||
929 | } | ||
930 | |||
931 | /* | ||
932 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator | ||
933 | */ | ||
934 | void drain_all_pages(void) | ||
935 | { | ||
936 | on_each_cpu(drain_local_pages, NULL, 0, 1); | ||
937 | } | ||
938 | |||
918 | #ifdef CONFIG_HIBERNATION | 939 | #ifdef CONFIG_HIBERNATION |
919 | 940 | ||
920 | void mark_free_pages(struct zone *zone) | 941 | void mark_free_pages(struct zone *zone) |
@@ -952,40 +973,9 @@ void mark_free_pages(struct zone *zone) | |||
952 | #endif /* CONFIG_PM */ | 973 | #endif /* CONFIG_PM */ |
953 | 974 | ||
954 | /* | 975 | /* |
955 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | ||
956 | */ | ||
957 | void drain_local_pages(void) | ||
958 | { | ||
959 | unsigned long flags; | ||
960 | |||
961 | local_irq_save(flags); | ||
962 | __drain_pages(smp_processor_id()); | ||
963 | local_irq_restore(flags); | ||
964 | } | ||
965 | |||
966 | void smp_drain_local_pages(void *arg) | ||
967 | { | ||
968 | drain_local_pages(); | ||
969 | } | ||
970 | |||
971 | /* | ||
972 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator | ||
973 | */ | ||
974 | void drain_all_local_pages(void) | ||
975 | { | ||
976 | unsigned long flags; | ||
977 | |||
978 | local_irq_save(flags); | ||
979 | __drain_pages(smp_processor_id()); | ||
980 | local_irq_restore(flags); | ||
981 | |||
982 | smp_call_function(smp_drain_local_pages, NULL, 0, 1); | ||
983 | } | ||
984 | |||
985 | /* | ||
986 | * Free a 0-order page | 976 | * Free a 0-order page |
987 | */ | 977 | */ |
988 | static void fastcall free_hot_cold_page(struct page *page, int cold) | 978 | static void free_hot_cold_page(struct page *page, int cold) |
989 | { | 979 | { |
990 | struct zone *zone = page_zone(page); | 980 | struct zone *zone = page_zone(page); |
991 | struct per_cpu_pages *pcp; | 981 | struct per_cpu_pages *pcp; |
@@ -998,13 +988,17 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
998 | 988 | ||
999 | if (!PageHighMem(page)) | 989 | if (!PageHighMem(page)) |
1000 | debug_check_no_locks_freed(page_address(page), PAGE_SIZE); | 990 | debug_check_no_locks_freed(page_address(page), PAGE_SIZE); |
991 | VM_BUG_ON(page_get_page_cgroup(page)); | ||
1001 | arch_free_page(page, 0); | 992 | arch_free_page(page, 0); |
1002 | kernel_map_pages(page, 1, 0); | 993 | kernel_map_pages(page, 1, 0); |
1003 | 994 | ||
1004 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 995 | pcp = &zone_pcp(zone, get_cpu())->pcp; |
1005 | local_irq_save(flags); | 996 | local_irq_save(flags); |
1006 | __count_vm_event(PGFREE); | 997 | __count_vm_event(PGFREE); |
1007 | list_add(&page->lru, &pcp->list); | 998 | if (cold) |
999 | list_add_tail(&page->lru, &pcp->list); | ||
1000 | else | ||
1001 | list_add(&page->lru, &pcp->list); | ||
1008 | set_page_private(page, get_pageblock_migratetype(page)); | 1002 | set_page_private(page, get_pageblock_migratetype(page)); |
1009 | pcp->count++; | 1003 | pcp->count++; |
1010 | if (pcp->count >= pcp->high) { | 1004 | if (pcp->count >= pcp->high) { |
@@ -1015,12 +1009,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
1015 | put_cpu(); | 1009 | put_cpu(); |
1016 | } | 1010 | } |
1017 | 1011 | ||
1018 | void fastcall free_hot_page(struct page *page) | 1012 | void free_hot_page(struct page *page) |
1019 | { | 1013 | { |
1020 | free_hot_cold_page(page, 0); | 1014 | free_hot_cold_page(page, 0); |
1021 | } | 1015 | } |
1022 | 1016 | ||
1023 | void fastcall free_cold_page(struct page *page) | 1017 | void free_cold_page(struct page *page) |
1024 | { | 1018 | { |
1025 | free_hot_cold_page(page, 1); | 1019 | free_hot_cold_page(page, 1); |
1026 | } | 1020 | } |
@@ -1062,7 +1056,7 @@ again: | |||
1062 | if (likely(order == 0)) { | 1056 | if (likely(order == 0)) { |
1063 | struct per_cpu_pages *pcp; | 1057 | struct per_cpu_pages *pcp; |
1064 | 1058 | ||
1065 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; | 1059 | pcp = &zone_pcp(zone, cpu)->pcp; |
1066 | local_irq_save(flags); | 1060 | local_irq_save(flags); |
1067 | if (!pcp->count) { | 1061 | if (!pcp->count) { |
1068 | pcp->count = rmqueue_bulk(zone, 0, | 1062 | pcp->count = rmqueue_bulk(zone, 0, |
@@ -1072,9 +1066,15 @@ again: | |||
1072 | } | 1066 | } |
1073 | 1067 | ||
1074 | /* Find a page of the appropriate migrate type */ | 1068 | /* Find a page of the appropriate migrate type */ |
1075 | list_for_each_entry(page, &pcp->list, lru) | 1069 | if (cold) { |
1076 | if (page_private(page) == migratetype) | 1070 | list_for_each_entry_reverse(page, &pcp->list, lru) |
1077 | break; | 1071 | if (page_private(page) == migratetype) |
1072 | break; | ||
1073 | } else { | ||
1074 | list_for_each_entry(page, &pcp->list, lru) | ||
1075 | if (page_private(page) == migratetype) | ||
1076 | break; | ||
1077 | } | ||
1078 | 1078 | ||
1079 | /* Allocate more to the pcp list if necessary */ | 1079 | /* Allocate more to the pcp list if necessary */ |
1080 | if (unlikely(&page->lru == &pcp->list)) { | 1080 | if (unlikely(&page->lru == &pcp->list)) { |
@@ -1569,7 +1569,7 @@ nofail_alloc: | |||
1569 | cond_resched(); | 1569 | cond_resched(); |
1570 | 1570 | ||
1571 | if (order != 0) | 1571 | if (order != 0) |
1572 | drain_all_local_pages(); | 1572 | drain_all_pages(); |
1573 | 1573 | ||
1574 | if (likely(did_some_progress)) { | 1574 | if (likely(did_some_progress)) { |
1575 | page = get_page_from_freelist(gfp_mask, order, | 1575 | page = get_page_from_freelist(gfp_mask, order, |
@@ -1643,7 +1643,7 @@ EXPORT_SYMBOL(__alloc_pages); | |||
1643 | /* | 1643 | /* |
1644 | * Common helper functions. | 1644 | * Common helper functions. |
1645 | */ | 1645 | */ |
1646 | fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) | 1646 | unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) |
1647 | { | 1647 | { |
1648 | struct page * page; | 1648 | struct page * page; |
1649 | page = alloc_pages(gfp_mask, order); | 1649 | page = alloc_pages(gfp_mask, order); |
@@ -1654,7 +1654,7 @@ fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) | |||
1654 | 1654 | ||
1655 | EXPORT_SYMBOL(__get_free_pages); | 1655 | EXPORT_SYMBOL(__get_free_pages); |
1656 | 1656 | ||
1657 | fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) | 1657 | unsigned long get_zeroed_page(gfp_t gfp_mask) |
1658 | { | 1658 | { |
1659 | struct page * page; | 1659 | struct page * page; |
1660 | 1660 | ||
@@ -1680,7 +1680,7 @@ void __pagevec_free(struct pagevec *pvec) | |||
1680 | free_hot_cold_page(pvec->pages[i], pvec->cold); | 1680 | free_hot_cold_page(pvec->pages[i], pvec->cold); |
1681 | } | 1681 | } |
1682 | 1682 | ||
1683 | fastcall void __free_pages(struct page *page, unsigned int order) | 1683 | void __free_pages(struct page *page, unsigned int order) |
1684 | { | 1684 | { |
1685 | if (put_page_testzero(page)) { | 1685 | if (put_page_testzero(page)) { |
1686 | if (order == 0) | 1686 | if (order == 0) |
@@ -1692,7 +1692,7 @@ fastcall void __free_pages(struct page *page, unsigned int order) | |||
1692 | 1692 | ||
1693 | EXPORT_SYMBOL(__free_pages); | 1693 | EXPORT_SYMBOL(__free_pages); |
1694 | 1694 | ||
1695 | fastcall void free_pages(unsigned long addr, unsigned int order) | 1695 | void free_pages(unsigned long addr, unsigned int order) |
1696 | { | 1696 | { |
1697 | if (addr != 0) { | 1697 | if (addr != 0) { |
1698 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | 1698 | VM_BUG_ON(!virt_addr_valid((void *)addr)); |
@@ -1801,12 +1801,9 @@ void show_free_areas(void) | |||
1801 | 1801 | ||
1802 | pageset = zone_pcp(zone, cpu); | 1802 | pageset = zone_pcp(zone, cpu); |
1803 | 1803 | ||
1804 | printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " | 1804 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", |
1805 | "Cold: hi:%5d, btch:%4d usd:%4d\n", | 1805 | cpu, pageset->pcp.high, |
1806 | cpu, pageset->pcp[0].high, | 1806 | pageset->pcp.batch, pageset->pcp.count); |
1807 | pageset->pcp[0].batch, pageset->pcp[0].count, | ||
1808 | pageset->pcp[1].high, pageset->pcp[1].batch, | ||
1809 | pageset->pcp[1].count); | ||
1810 | } | 1807 | } |
1811 | } | 1808 | } |
1812 | 1809 | ||
@@ -1879,6 +1876,8 @@ void show_free_areas(void) | |||
1879 | printk("= %lukB\n", K(total)); | 1876 | printk("= %lukB\n", K(total)); |
1880 | } | 1877 | } |
1881 | 1878 | ||
1879 | printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); | ||
1880 | |||
1882 | show_swap_cache_info(); | 1881 | show_swap_cache_info(); |
1883 | } | 1882 | } |
1884 | 1883 | ||
@@ -2528,6 +2527,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
2528 | set_page_links(page, zone, nid, pfn); | 2527 | set_page_links(page, zone, nid, pfn); |
2529 | init_page_count(page); | 2528 | init_page_count(page); |
2530 | reset_page_mapcount(page); | 2529 | reset_page_mapcount(page); |
2530 | page_assign_page_cgroup(page, NULL); | ||
2531 | SetPageReserved(page); | 2531 | SetPageReserved(page); |
2532 | 2532 | ||
2533 | /* | 2533 | /* |
@@ -2551,8 +2551,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
2551 | } | 2551 | } |
2552 | } | 2552 | } |
2553 | 2553 | ||
2554 | static void __meminit zone_init_free_lists(struct pglist_data *pgdat, | 2554 | static void __meminit zone_init_free_lists(struct zone *zone) |
2555 | struct zone *zone, unsigned long size) | ||
2556 | { | 2555 | { |
2557 | int order, t; | 2556 | int order, t; |
2558 | for_each_migratetype_order(order, t) { | 2557 | for_each_migratetype_order(order, t) { |
@@ -2604,17 +2603,11 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
2604 | 2603 | ||
2605 | memset(p, 0, sizeof(*p)); | 2604 | memset(p, 0, sizeof(*p)); |
2606 | 2605 | ||
2607 | pcp = &p->pcp[0]; /* hot */ | 2606 | pcp = &p->pcp; |
2608 | pcp->count = 0; | 2607 | pcp->count = 0; |
2609 | pcp->high = 6 * batch; | 2608 | pcp->high = 6 * batch; |
2610 | pcp->batch = max(1UL, 1 * batch); | 2609 | pcp->batch = max(1UL, 1 * batch); |
2611 | INIT_LIST_HEAD(&pcp->list); | 2610 | INIT_LIST_HEAD(&pcp->list); |
2612 | |||
2613 | pcp = &p->pcp[1]; /* cold*/ | ||
2614 | pcp->count = 0; | ||
2615 | pcp->high = 2 * batch; | ||
2616 | pcp->batch = max(1UL, batch/2); | ||
2617 | INIT_LIST_HEAD(&pcp->list); | ||
2618 | } | 2611 | } |
2619 | 2612 | ||
2620 | /* | 2613 | /* |
@@ -2627,7 +2620,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
2627 | { | 2620 | { |
2628 | struct per_cpu_pages *pcp; | 2621 | struct per_cpu_pages *pcp; |
2629 | 2622 | ||
2630 | pcp = &p->pcp[0]; /* hot list */ | 2623 | pcp = &p->pcp; |
2631 | pcp->high = high; | 2624 | pcp->high = high; |
2632 | pcp->batch = max(1UL, high/4); | 2625 | pcp->batch = max(1UL, high/4); |
2633 | if ((high/4) > (PAGE_SHIFT * 8)) | 2626 | if ((high/4) > (PAGE_SHIFT * 8)) |
@@ -2831,7 +2824,7 @@ __meminit int init_currently_empty_zone(struct zone *zone, | |||
2831 | 2824 | ||
2832 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); | 2825 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); |
2833 | 2826 | ||
2834 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); | 2827 | zone_init_free_lists(zone); |
2835 | 2828 | ||
2836 | return 0; | 2829 | return 0; |
2837 | } | 2830 | } |
@@ -3978,10 +3971,23 @@ static int page_alloc_cpu_notify(struct notifier_block *self, | |||
3978 | int cpu = (unsigned long)hcpu; | 3971 | int cpu = (unsigned long)hcpu; |
3979 | 3972 | ||
3980 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | 3973 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
3981 | local_irq_disable(); | 3974 | drain_pages(cpu); |
3982 | __drain_pages(cpu); | 3975 | |
3976 | /* | ||
3977 | * Spill the event counters of the dead processor | ||
3978 | * into the current processors event counters. | ||
3979 | * This artificially elevates the count of the current | ||
3980 | * processor. | ||
3981 | */ | ||
3983 | vm_events_fold_cpu(cpu); | 3982 | vm_events_fold_cpu(cpu); |
3984 | local_irq_enable(); | 3983 | |
3984 | /* | ||
3985 | * Zero the differential counters of the dead processor | ||
3986 | * so that the vm statistics are consistent. | ||
3987 | * | ||
3988 | * This is only okay since the processor is dead and cannot | ||
3989 | * race with what we are doing. | ||
3990 | */ | ||
3985 | refresh_cpu_vm_stats(cpu); | 3991 | refresh_cpu_vm_stats(cpu); |
3986 | } | 3992 | } |
3987 | return NOTIFY_OK; | 3993 | return NOTIFY_OK; |
@@ -4480,7 +4486,7 @@ int set_migratetype_isolate(struct page *page) | |||
4480 | out: | 4486 | out: |
4481 | spin_unlock_irqrestore(&zone->lock, flags); | 4487 | spin_unlock_irqrestore(&zone->lock, flags); |
4482 | if (!ret) | 4488 | if (!ret) |
4483 | drain_all_local_pages(); | 4489 | drain_all_pages(); |
4484 | return ret; | 4490 | return ret; |
4485 | } | 4491 | } |
4486 | 4492 | ||
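
The page_alloc.c changes above collapse the hot and cold per-cpu lists into a single pcp list: hot frees use list_add(), cold frees use list_add_tail(), and cold allocations search from the tail with list_for_each_entry_reverse(). A sketch of that discipline on made-up types; demo_pcp and demo_page stand in for struct per_cpu_pages and struct page and carry only the fields the example needs.

#include <linux/list.h>

struct demo_pcp {
        struct list_head list;  /* hot pages at the head, cold at the tail */
        int count;
};

struct demo_page {
        struct list_head lru;
        int migratetype;
};

static void demo_free(struct demo_pcp *pcp, struct demo_page *page, int cold)
{
        if (cold)
                list_add_tail(&page->lru, &pcp->list);  /* keep cold pages cold */
        else
                list_add(&page->lru, &pcp->list);       /* reuse hot pages first */
        pcp->count++;
}

static struct demo_page *demo_alloc(struct demo_pcp *pcp, int migratetype, int cold)
{
        struct demo_page *page;

        /* Cold requests search from the tail, hot requests from the head. */
        if (cold) {
                list_for_each_entry_reverse(page, &pcp->list, lru)
                        if (page->migratetype == migratetype)
                                goto found;
        } else {
                list_for_each_entry(page, &pcp->list, lru)
                        if (page->migratetype == migratetype)
                                goto found;
        }
        return NULL;
found:
        list_del(&page->lru);   /* unlink, mirroring the pcp fast path */
        pcp->count--;
        return page;
}

Keeping one list halves the pcp bookkeeping while still letting cache-cold requests (readahead, for instance) avoid stealing cache-hot pages from the head.
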
diff --git a/mm/page_io.c b/mm/page_io.c index 3b97f6850273..065c4480eaf0 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -126,7 +126,7 @@ int swap_readpage(struct file *file, struct page *page) | |||
126 | int ret = 0; | 126 | int ret = 0; |
127 | 127 | ||
128 | BUG_ON(!PageLocked(page)); | 128 | BUG_ON(!PageLocked(page)); |
129 | ClearPageUptodate(page); | 129 | BUG_ON(PageUptodate(page)); |
130 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, | 130 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, |
131 | end_swap_bio_read); | 131 | end_swap_bio_read); |
132 | if (bio == NULL) { | 132 | if (bio == NULL) { |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c new file mode 100644 index 000000000000..b4f27d22da91 --- /dev/null +++ b/mm/pagewalk.c | |||
@@ -0,0 +1,131 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/highmem.h> | ||
3 | #include <linux/sched.h> | ||
4 | |||
5 | static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | ||
6 | const struct mm_walk *walk, void *private) | ||
7 | { | ||
8 | pte_t *pte; | ||
9 | int err = 0; | ||
10 | |||
11 | pte = pte_offset_map(pmd, addr); | ||
12 | do { | ||
13 | err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private); | ||
14 | if (err) | ||
15 | break; | ||
16 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
17 | |||
18 | pte_unmap(pte); | ||
19 | return err; | ||
20 | } | ||
21 | |||
22 | static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, | ||
23 | const struct mm_walk *walk, void *private) | ||
24 | { | ||
25 | pmd_t *pmd; | ||
26 | unsigned long next; | ||
27 | int err = 0; | ||
28 | |||
29 | pmd = pmd_offset(pud, addr); | ||
30 | do { | ||
31 | next = pmd_addr_end(addr, end); | ||
32 | if (pmd_none_or_clear_bad(pmd)) { | ||
33 | if (walk->pte_hole) | ||
34 | err = walk->pte_hole(addr, next, private); | ||
35 | if (err) | ||
36 | break; | ||
37 | continue; | ||
38 | } | ||
39 | if (walk->pmd_entry) | ||
40 | err = walk->pmd_entry(pmd, addr, next, private); | ||
41 | if (!err && walk->pte_entry) | ||
42 | err = walk_pte_range(pmd, addr, next, walk, private); | ||
43 | if (err) | ||
44 | break; | ||
45 | } while (pmd++, addr = next, addr != end); | ||
46 | |||
47 | return err; | ||
48 | } | ||
49 | |||
50 | static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, | ||
51 | const struct mm_walk *walk, void *private) | ||
52 | { | ||
53 | pud_t *pud; | ||
54 | unsigned long next; | ||
55 | int err = 0; | ||
56 | |||
57 | pud = pud_offset(pgd, addr); | ||
58 | do { | ||
59 | next = pud_addr_end(addr, end); | ||
60 | if (pud_none_or_clear_bad(pud)) { | ||
61 | if (walk->pte_hole) | ||
62 | err = walk->pte_hole(addr, next, private); | ||
63 | if (err) | ||
64 | break; | ||
65 | continue; | ||
66 | } | ||
67 | if (walk->pud_entry) | ||
68 | err = walk->pud_entry(pud, addr, next, private); | ||
69 | if (!err && (walk->pmd_entry || walk->pte_entry)) | ||
70 | err = walk_pmd_range(pud, addr, next, walk, private); | ||
71 | if (err) | ||
72 | break; | ||
73 | } while (pud++, addr = next, addr != end); | ||
74 | |||
75 | return err; | ||
76 | } | ||
77 | |||
78 | /** | ||
79 | * walk_page_range - walk a memory map's page tables with a callback | ||
80 | * @mm - memory map to walk | ||
81 | * @addr - starting address | ||
82 | * @end - ending address | ||
83 | * @walk - set of callbacks to invoke for each level of the tree | ||
84 | * @private - private data passed to the callback function | ||
85 | * | ||
86 | * Recursively walk the page table for the memory area in a VMA, | ||
87 | * calling supplied callbacks. Callbacks are called in-order (first | ||
88 | * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, | ||
89 | * etc.). If lower-level callbacks are omitted, walking depth is reduced. | ||
90 | * | ||
91 | * Each callback receives an entry pointer, the start and end of the | ||
92 | * associated range, and a caller-supplied private data pointer. | ||
93 | * | ||
94 | * No locks are taken, but the bottom level iterator will map PTE | ||
95 | * directories from highmem if necessary. | ||
96 | * | ||
97 | * If any callback returns a non-zero value, the walk is aborted and | ||
98 | * the return value is propagated back to the caller. Otherwise 0 is returned. | ||
99 | */ | ||
100 | int walk_page_range(const struct mm_struct *mm, | ||
101 | unsigned long addr, unsigned long end, | ||
102 | const struct mm_walk *walk, void *private) | ||
103 | { | ||
104 | pgd_t *pgd; | ||
105 | unsigned long next; | ||
106 | int err = 0; | ||
107 | |||
108 | if (addr >= end) | ||
109 | return err; | ||
110 | |||
111 | pgd = pgd_offset(mm, addr); | ||
112 | do { | ||
113 | next = pgd_addr_end(addr, end); | ||
114 | if (pgd_none_or_clear_bad(pgd)) { | ||
115 | if (walk->pte_hole) | ||
116 | err = walk->pte_hole(addr, next, private); | ||
117 | if (err) | ||
118 | break; | ||
119 | continue; | ||
120 | } | ||
121 | if (walk->pgd_entry) | ||
122 | err = walk->pgd_entry(pgd, addr, next, private); | ||
123 | if (!err && | ||
124 | (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) | ||
125 | err = walk_pud_range(pgd, addr, next, walk, private); | ||
126 | if (err) | ||
127 | break; | ||
128 | } while (pgd++, addr = next, addr != end); | ||
129 | |||
130 | return err; | ||
131 | } | ||
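
A sketch of how a caller might use the new walk_page_range() to count present PTEs in one VMA. The struct mm_walk definition and the walk_page_range() prototype come from the matching include/linux/mm.h change, which this mm/-only diffstat does not show; count_pte() and count_present() are illustrative names, and locking (mmap_sem, page table locks) is left to the caller, as the comment above notes.

#include <linux/mm.h>

/* Count present PTEs in a range; the counter travels through *private. */
static int count_pte(pte_t *pte, unsigned long addr, unsigned long next,
                     void *private)
{
        unsigned long *present = private;

        if (pte_present(*pte))
                (*present)++;
        return 0;       /* a non-zero return would abort the walk */
}

static const struct mm_walk count_walk = {
        .pte_entry = count_pte, /* other levels left NULL: default descent */
};

static unsigned long count_present(struct vm_area_struct *vma)
{
        unsigned long present = 0;

        /* Caller is assumed to hold mmap_sem for the address space. */
        walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
                        &count_walk, &present);
        return present;
}
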
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -36,7 +36,6 @@ | |||
36 | * mapping->tree_lock (widely used, in set_page_dirty, | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within inode_lock in __sync_single_inode) | 38 | * within inode_lock in __sync_single_inode) |
39 | * zone->lock (within radix tree node alloc) | ||
40 | */ | 39 | */ |
41 | 40 | ||
42 | #include <linux/mm.h> | 41 | #include <linux/mm.h> |
@@ -49,6 +48,7 @@ | |||
49 | #include <linux/rcupdate.h> | 48 | #include <linux/rcupdate.h> |
50 | #include <linux/module.h> | 49 | #include <linux/module.h> |
51 | #include <linux/kallsyms.h> | 50 | #include <linux/kallsyms.h> |
51 | #include <linux/memcontrol.h> | ||
52 | 52 | ||
53 | #include <asm/tlbflush.h> | 53 | #include <asm/tlbflush.h> |
54 | 54 | ||
@@ -284,7 +284,10 @@ static int page_referenced_one(struct page *page, | |||
284 | if (!pte) | 284 | if (!pte) |
285 | goto out; | 285 | goto out; |
286 | 286 | ||
287 | if (ptep_clear_flush_young(vma, address, pte)) | 287 | if (vma->vm_flags & VM_LOCKED) { |
288 | referenced++; | ||
289 | *mapcount = 1; /* break early from loop */ | ||
290 | } else if (ptep_clear_flush_young(vma, address, pte)) | ||
288 | referenced++; | 291 | referenced++; |
289 | 292 | ||
290 | /* Pretend the page is referenced if the task has the | 293 | /* Pretend the page is referenced if the task has the |
@@ -299,7 +302,8 @@ out: | |||
299 | return referenced; | 302 | return referenced; |
300 | } | 303 | } |
301 | 304 | ||
302 | static int page_referenced_anon(struct page *page) | 305 | static int page_referenced_anon(struct page *page, |
306 | struct mem_cgroup *mem_cont) | ||
303 | { | 307 | { |
304 | unsigned int mapcount; | 308 | unsigned int mapcount; |
305 | struct anon_vma *anon_vma; | 309 | struct anon_vma *anon_vma; |
@@ -312,6 +316,13 @@ static int page_referenced_anon(struct page *page) | |||
312 | 316 | ||
313 | mapcount = page_mapcount(page); | 317 | mapcount = page_mapcount(page); |
314 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 318 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
319 | /* | ||
320 | * If we are reclaiming on behalf of a cgroup, skip | ||
321 | * counting on behalf of references from different | ||
322 | * cgroups | ||
323 | */ | ||
324 | if (mem_cont && (mm_cgroup(vma->vm_mm) != mem_cont)) | ||
325 | continue; | ||
315 | referenced += page_referenced_one(page, vma, &mapcount); | 326 | referenced += page_referenced_one(page, vma, &mapcount); |
316 | if (!mapcount) | 327 | if (!mapcount) |
317 | break; | 328 | break; |
@@ -332,7 +343,8 @@ static int page_referenced_anon(struct page *page) | |||
332 | * | 343 | * |
333 | * This function is only called from page_referenced for object-based pages. | 344 | * This function is only called from page_referenced for object-based pages. |
334 | */ | 345 | */ |
335 | static int page_referenced_file(struct page *page) | 346 | static int page_referenced_file(struct page *page, |
347 | struct mem_cgroup *mem_cont) | ||
336 | { | 348 | { |
337 | unsigned int mapcount; | 349 | unsigned int mapcount; |
338 | struct address_space *mapping = page->mapping; | 350 | struct address_space *mapping = page->mapping; |
@@ -365,6 +377,13 @@ static int page_referenced_file(struct page *page) | |||
365 | mapcount = page_mapcount(page); | 377 | mapcount = page_mapcount(page); |
366 | 378 | ||
367 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 379 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
380 | /* | ||
381 | * If we are reclaiming on behalf of a cgroup, skip | ||
382 | * counting on behalf of references from different | ||
383 | * cgroups | ||
384 | */ | ||
385 | if (mem_cont && (mm_cgroup(vma->vm_mm) != mem_cont)) | ||
386 | continue; | ||
368 | if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) | 387 | if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) |
369 | == (VM_LOCKED|VM_MAYSHARE)) { | 388 | == (VM_LOCKED|VM_MAYSHARE)) { |
370 | referenced++; | 389 | referenced++; |
@@ -387,7 +406,8 @@ static int page_referenced_file(struct page *page) | |||
387 | * Quick test_and_clear_referenced for all mappings to a page, | 406 | * Quick test_and_clear_referenced for all mappings to a page, |
388 | * returns the number of ptes which referenced the page. | 407 | * returns the number of ptes which referenced the page. |
389 | */ | 408 | */ |
390 | int page_referenced(struct page *page, int is_locked) | 409 | int page_referenced(struct page *page, int is_locked, |
410 | struct mem_cgroup *mem_cont) | ||
391 | { | 411 | { |
392 | int referenced = 0; | 412 | int referenced = 0; |
393 | 413 | ||
@@ -399,14 +419,15 @@ int page_referenced(struct page *page, int is_locked) | |||
399 | 419 | ||
400 | if (page_mapped(page) && page->mapping) { | 420 | if (page_mapped(page) && page->mapping) { |
401 | if (PageAnon(page)) | 421 | if (PageAnon(page)) |
402 | referenced += page_referenced_anon(page); | 422 | referenced += page_referenced_anon(page, mem_cont); |
403 | else if (is_locked) | 423 | else if (is_locked) |
404 | referenced += page_referenced_file(page); | 424 | referenced += page_referenced_file(page, mem_cont); |
405 | else if (TestSetPageLocked(page)) | 425 | else if (TestSetPageLocked(page)) |
406 | referenced++; | 426 | referenced++; |
407 | else { | 427 | else { |
408 | if (page->mapping) | 428 | if (page->mapping) |
409 | referenced += page_referenced_file(page); | 429 | referenced += |
430 | page_referenced_file(page, mem_cont); | ||
410 | unlock_page(page); | 431 | unlock_page(page); |
411 | } | 432 | } |
412 | } | 433 | } |
@@ -552,8 +573,14 @@ void page_add_anon_rmap(struct page *page, | |||
552 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 573 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
553 | if (atomic_inc_and_test(&page->_mapcount)) | 574 | if (atomic_inc_and_test(&page->_mapcount)) |
554 | __page_set_anon_rmap(page, vma, address); | 575 | __page_set_anon_rmap(page, vma, address); |
555 | else | 576 | else { |
556 | __page_check_anon_rmap(page, vma, address); | 577 | __page_check_anon_rmap(page, vma, address); |
578 | /* | ||
579 | * We unconditionally charged during prepare; we uncharge here. | ||
580 | * This takes care of balancing the reference counts. | ||
581 | */ | ||
582 | mem_cgroup_uncharge_page(page); | ||
583 | } | ||
557 | } | 584 | } |
558 | 585 | ||
559 | /* | 586 | /* |
@@ -584,6 +611,12 @@ void page_add_file_rmap(struct page *page) | |||
584 | { | 611 | { |
585 | if (atomic_inc_and_test(&page->_mapcount)) | 612 | if (atomic_inc_and_test(&page->_mapcount)) |
586 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 613 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
614 | else | ||
615 | /* | ||
616 | * We unconditionally charged during prepare, we uncharge here | ||
617 | * This takes care of balancing the reference counts | ||
618 | */ | ||
619 | mem_cgroup_uncharge_page(page); | ||
587 | } | 620 | } |
588 | 621 | ||
589 | #ifdef CONFIG_DEBUG_VM | 622 | #ifdef CONFIG_DEBUG_VM |
@@ -644,6 +677,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) | |||
644 | page_clear_dirty(page); | 677 | page_clear_dirty(page); |
645 | set_page_dirty(page); | 678 | set_page_dirty(page); |
646 | } | 679 | } |
680 | mem_cgroup_uncharge_page(page); | ||
681 | |||
647 | __dec_zone_page_state(page, | 682 | __dec_zone_page_state(page, |
648 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); | 683 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); |
649 | } | 684 | } |
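
page_referenced() above now takes the mem_cgroup under reclaim, or NULL for global reclaim, so referenced bits in other cgroups' VMAs are ignored. A minimal sketch of a caller; page_recently_used() is an illustrative wrapper rather than anything in this patch, the rmap.h prototype change is outside this diffstat, and the real callers are in the mm/vmscan.c changes listed in the diffstat.

#include <linux/rmap.h>
#include <linux/memcontrol.h>

/*
 * Illustrative helper: ask whether any PTE mapping the page was
 * referenced.  @mem is the cgroup under reclaim, or NULL for global
 * reclaim so that mappings from every cgroup are counted.
 */
static int page_recently_used(struct page *page, struct mem_cgroup *mem)
{
        /* page_referenced() may lock the page itself when is_locked is 0 */
        return page_referenced(page, 0, mem) != 0;
}
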
diff --git a/mm/shmem.c b/mm/shmem.c index 51b3d6ccddab..85bed948fafc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -78,11 +78,10 @@ | |||
78 | 78 | ||
79 | /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ | 79 | /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ |
80 | enum sgp_type { | 80 | enum sgp_type { |
81 | SGP_QUICK, /* don't try more than file page cache lookup */ | ||
82 | SGP_READ, /* don't exceed i_size, don't allocate page */ | 81 | SGP_READ, /* don't exceed i_size, don't allocate page */ |
83 | SGP_CACHE, /* don't exceed i_size, may allocate page */ | 82 | SGP_CACHE, /* don't exceed i_size, may allocate page */ |
83 | SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ | ||
84 | SGP_WRITE, /* may exceed i_size, may allocate page */ | 84 | SGP_WRITE, /* may exceed i_size, may allocate page */ |
85 | SGP_FAULT, /* same as SGP_CACHE, return with page locked */ | ||
86 | }; | 85 | }; |
87 | 86 | ||
88 | static int shmem_getpage(struct inode *inode, unsigned long idx, | 87 | static int shmem_getpage(struct inode *inode, unsigned long idx, |
@@ -194,7 +193,7 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | |||
194 | }; | 193 | }; |
195 | 194 | ||
196 | static LIST_HEAD(shmem_swaplist); | 195 | static LIST_HEAD(shmem_swaplist); |
197 | static DEFINE_SPINLOCK(shmem_swaplist_lock); | 196 | static DEFINE_MUTEX(shmem_swaplist_mutex); |
198 | 197 | ||
199 | static void shmem_free_blocks(struct inode *inode, long pages) | 198 | static void shmem_free_blocks(struct inode *inode, long pages) |
200 | { | 199 | { |
@@ -207,6 +206,31 @@ static void shmem_free_blocks(struct inode *inode, long pages) | |||
207 | } | 206 | } |
208 | } | 207 | } |
209 | 208 | ||
209 | static int shmem_reserve_inode(struct super_block *sb) | ||
210 | { | ||
211 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | ||
212 | if (sbinfo->max_inodes) { | ||
213 | spin_lock(&sbinfo->stat_lock); | ||
214 | if (!sbinfo->free_inodes) { | ||
215 | spin_unlock(&sbinfo->stat_lock); | ||
216 | return -ENOSPC; | ||
217 | } | ||
218 | sbinfo->free_inodes--; | ||
219 | spin_unlock(&sbinfo->stat_lock); | ||
220 | } | ||
221 | return 0; | ||
222 | } | ||
223 | |||
224 | static void shmem_free_inode(struct super_block *sb) | ||
225 | { | ||
226 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | ||
227 | if (sbinfo->max_inodes) { | ||
228 | spin_lock(&sbinfo->stat_lock); | ||
229 | sbinfo->free_inodes++; | ||
230 | spin_unlock(&sbinfo->stat_lock); | ||
231 | } | ||
232 | } | ||
233 | |||
210 | /* | 234 | /* |
211 | * shmem_recalc_inode - recalculate the size of an inode | 235 | * shmem_recalc_inode - recalculate the size of an inode |
212 | * | 236 | * |
@@ -731,6 +755,8 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
731 | (void) shmem_getpage(inode, | 755 | (void) shmem_getpage(inode, |
732 | attr->ia_size>>PAGE_CACHE_SHIFT, | 756 | attr->ia_size>>PAGE_CACHE_SHIFT, |
733 | &page, SGP_READ, NULL); | 757 | &page, SGP_READ, NULL); |
758 | if (page) | ||
759 | unlock_page(page); | ||
734 | } | 760 | } |
735 | /* | 761 | /* |
736 | * Reset SHMEM_PAGEIN flag so that shmem_truncate can | 762 | * Reset SHMEM_PAGEIN flag so that shmem_truncate can |
@@ -762,7 +788,6 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
762 | 788 | ||
763 | static void shmem_delete_inode(struct inode *inode) | 789 | static void shmem_delete_inode(struct inode *inode) |
764 | { | 790 | { |
765 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
766 | struct shmem_inode_info *info = SHMEM_I(inode); | 791 | struct shmem_inode_info *info = SHMEM_I(inode); |
767 | 792 | ||
768 | if (inode->i_op->truncate == shmem_truncate) { | 793 | if (inode->i_op->truncate == shmem_truncate) { |
@@ -771,17 +796,13 @@ static void shmem_delete_inode(struct inode *inode) | |||
771 | inode->i_size = 0; | 796 | inode->i_size = 0; |
772 | shmem_truncate(inode); | 797 | shmem_truncate(inode); |
773 | if (!list_empty(&info->swaplist)) { | 798 | if (!list_empty(&info->swaplist)) { |
774 | spin_lock(&shmem_swaplist_lock); | 799 | mutex_lock(&shmem_swaplist_mutex); |
775 | list_del_init(&info->swaplist); | 800 | list_del_init(&info->swaplist); |
776 | spin_unlock(&shmem_swaplist_lock); | 801 | mutex_unlock(&shmem_swaplist_mutex); |
777 | } | 802 | } |
778 | } | 803 | } |
779 | BUG_ON(inode->i_blocks); | 804 | BUG_ON(inode->i_blocks); |
780 | if (sbinfo->max_inodes) { | 805 | shmem_free_inode(inode->i_sb); |
781 | spin_lock(&sbinfo->stat_lock); | ||
782 | sbinfo->free_inodes++; | ||
783 | spin_unlock(&sbinfo->stat_lock); | ||
784 | } | ||
785 | clear_inode(inode); | 806 | clear_inode(inode); |
786 | } | 807 | } |
787 | 808 | ||
@@ -807,19 +828,22 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s | |||
807 | struct page *subdir; | 828 | struct page *subdir; |
808 | swp_entry_t *ptr; | 829 | swp_entry_t *ptr; |
809 | int offset; | 830 | int offset; |
831 | int error; | ||
810 | 832 | ||
811 | idx = 0; | 833 | idx = 0; |
812 | ptr = info->i_direct; | 834 | ptr = info->i_direct; |
813 | spin_lock(&info->lock); | 835 | spin_lock(&info->lock); |
836 | if (!info->swapped) { | ||
837 | list_del_init(&info->swaplist); | ||
838 | goto lost2; | ||
839 | } | ||
814 | limit = info->next_index; | 840 | limit = info->next_index; |
815 | size = limit; | 841 | size = limit; |
816 | if (size > SHMEM_NR_DIRECT) | 842 | if (size > SHMEM_NR_DIRECT) |
817 | size = SHMEM_NR_DIRECT; | 843 | size = SHMEM_NR_DIRECT; |
818 | offset = shmem_find_swp(entry, ptr, ptr+size); | 844 | offset = shmem_find_swp(entry, ptr, ptr+size); |
819 | if (offset >= 0) { | 845 | if (offset >= 0) |
820 | shmem_swp_balance_unmap(); | ||
821 | goto found; | 846 | goto found; |
822 | } | ||
823 | if (!info->i_indirect) | 847 | if (!info->i_indirect) |
824 | goto lost2; | 848 | goto lost2; |
825 | 849 | ||
@@ -829,6 +853,14 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s | |||
829 | for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { | 853 | for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { |
830 | if (unlikely(idx == stage)) { | 854 | if (unlikely(idx == stage)) { |
831 | shmem_dir_unmap(dir-1); | 855 | shmem_dir_unmap(dir-1); |
856 | if (cond_resched_lock(&info->lock)) { | ||
857 | /* check it has not been truncated */ | ||
858 | if (limit > info->next_index) { | ||
859 | limit = info->next_index; | ||
860 | if (idx >= limit) | ||
861 | goto lost2; | ||
862 | } | ||
863 | } | ||
832 | dir = shmem_dir_map(info->i_indirect) + | 864 | dir = shmem_dir_map(info->i_indirect) + |
833 | ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; | 865 | ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; |
834 | while (!*dir) { | 866 | while (!*dir) { |
@@ -849,11 +881,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s | |||
849 | if (size > ENTRIES_PER_PAGE) | 881 | if (size > ENTRIES_PER_PAGE) |
850 | size = ENTRIES_PER_PAGE; | 882 | size = ENTRIES_PER_PAGE; |
851 | offset = shmem_find_swp(entry, ptr, ptr+size); | 883 | offset = shmem_find_swp(entry, ptr, ptr+size); |
884 | shmem_swp_unmap(ptr); | ||
852 | if (offset >= 0) { | 885 | if (offset >= 0) { |
853 | shmem_dir_unmap(dir); | 886 | shmem_dir_unmap(dir); |
854 | goto found; | 887 | goto found; |
855 | } | 888 | } |
856 | shmem_swp_unmap(ptr); | ||
857 | } | 889 | } |
858 | } | 890 | } |
859 | lost1: | 891 | lost1: |
@@ -863,19 +895,69 @@ lost2: | |||
863 | return 0; | 895 | return 0; |
864 | found: | 896 | found: |
865 | idx += offset; | 897 | idx += offset; |
866 | inode = &info->vfs_inode; | 898 | inode = igrab(&info->vfs_inode); |
867 | if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) { | ||
868 | info->flags |= SHMEM_PAGEIN; | ||
869 | shmem_swp_set(info, ptr + offset, 0); | ||
870 | } | ||
871 | shmem_swp_unmap(ptr); | ||
872 | spin_unlock(&info->lock); | 899 | spin_unlock(&info->lock); |
900 | |||
873 | /* | 901 | /* |
874 | * Decrement swap count even when the entry is left behind: | 902 | * Move _head_ to start search for next from here. |
875 | * try_to_unuse will skip over mms, then reincrement count. | 903 | * But be careful: shmem_delete_inode checks list_empty without taking |
904 | * mutex, and there's an instant in list_move_tail when info->swaplist | ||
905 | * would appear empty, if it were the only one on shmem_swaplist. We | ||
906 | * could avoid doing it if inode NULL; or use this minor optimization. | ||
876 | */ | 907 | */ |
877 | swap_free(entry); | 908 | if (shmem_swaplist.next != &info->swaplist) |
878 | return 1; | 909 | list_move_tail(&shmem_swaplist, &info->swaplist); |
910 | mutex_unlock(&shmem_swaplist_mutex); | ||
911 | |||
912 | error = 1; | ||
913 | if (!inode) | ||
914 | goto out; | ||
915 | /* Precharge page while we can wait, compensate afterwards */ | ||
916 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | ||
917 | if (error) | ||
918 | goto out; | ||
919 | error = radix_tree_preload(GFP_KERNEL); | ||
920 | if (error) | ||
921 | goto uncharge; | ||
922 | error = 1; | ||
923 | |||
924 | spin_lock(&info->lock); | ||
925 | ptr = shmem_swp_entry(info, idx, NULL); | ||
926 | if (ptr && ptr->val == entry.val) | ||
927 | error = add_to_page_cache(page, inode->i_mapping, | ||
928 | idx, GFP_NOWAIT); | ||
929 | if (error == -EEXIST) { | ||
930 | struct page *filepage = find_get_page(inode->i_mapping, idx); | ||
931 | error = 1; | ||
932 | if (filepage) { | ||
933 | /* | ||
934 | * There might be a more uptodate page coming down | ||
935 | * from a stacked writepage: forget our swappage if so. | ||
936 | */ | ||
937 | if (PageUptodate(filepage)) | ||
938 | error = 0; | ||
939 | page_cache_release(filepage); | ||
940 | } | ||
941 | } | ||
942 | if (!error) { | ||
943 | delete_from_swap_cache(page); | ||
944 | set_page_dirty(page); | ||
945 | info->flags |= SHMEM_PAGEIN; | ||
946 | shmem_swp_set(info, ptr, 0); | ||
947 | swap_free(entry); | ||
948 | error = 1; /* not an error, but entry was found */ | ||
949 | } | ||
950 | if (ptr) | ||
951 | shmem_swp_unmap(ptr); | ||
952 | spin_unlock(&info->lock); | ||
953 | radix_tree_preload_end(); | ||
954 | uncharge: | ||
955 | mem_cgroup_uncharge_page(page); | ||
956 | out: | ||
957 | unlock_page(page); | ||
958 | page_cache_release(page); | ||
959 | iput(inode); /* allows for NULL */ | ||
960 | return error; | ||
879 | } | 961 | } |
880 | 962 | ||
881 | /* | 963 | /* |
@@ -887,20 +969,16 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
887 | struct shmem_inode_info *info; | 969 | struct shmem_inode_info *info; |
888 | int found = 0; | 970 | int found = 0; |
889 | 971 | ||
890 | spin_lock(&shmem_swaplist_lock); | 972 | mutex_lock(&shmem_swaplist_mutex); |
891 | list_for_each_safe(p, next, &shmem_swaplist) { | 973 | list_for_each_safe(p, next, &shmem_swaplist) { |
892 | info = list_entry(p, struct shmem_inode_info, swaplist); | 974 | info = list_entry(p, struct shmem_inode_info, swaplist); |
893 | if (!info->swapped) | 975 | found = shmem_unuse_inode(info, entry, page); |
894 | list_del_init(&info->swaplist); | 976 | cond_resched(); |
895 | else if (shmem_unuse_inode(info, entry, page)) { | 977 | if (found) |
896 | /* move head to start search for next from here */ | 978 | goto out; |
897 | list_move_tail(&shmem_swaplist, &info->swaplist); | ||
898 | found = 1; | ||
899 | break; | ||
900 | } | ||
901 | } | 979 | } |
902 | spin_unlock(&shmem_swaplist_lock); | 980 | mutex_unlock(&shmem_swaplist_mutex); |
903 | return found; | 981 | out: return found; /* 0 or 1 or -ENOMEM */ |
904 | } | 982 | } |
905 | 983 | ||
906 | /* | 984 | /* |
@@ -915,54 +993,65 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
915 | struct inode *inode; | 993 | struct inode *inode; |
916 | 994 | ||
917 | BUG_ON(!PageLocked(page)); | 995 | BUG_ON(!PageLocked(page)); |
918 | /* | ||
919 | * shmem_backing_dev_info's capabilities prevent regular writeback or | ||
920 | * sync from ever calling shmem_writepage; but a stacking filesystem | ||
921 | * may use the ->writepage of its underlying filesystem, in which case | ||
922 | * we want to do nothing when that underlying filesystem is tmpfs | ||
923 | * (writing out to swap is useful as a response to memory pressure, but | ||
924 | * of no use to stabilize the data) - just redirty the page, unlock it | ||
925 | * and claim success in this case. AOP_WRITEPAGE_ACTIVATE, and the | ||
926 | * page_mapped check below, must be avoided unless we're in reclaim. | ||
927 | */ | ||
928 | if (!wbc->for_reclaim) { | ||
929 | set_page_dirty(page); | ||
930 | unlock_page(page); | ||
931 | return 0; | ||
932 | } | ||
933 | BUG_ON(page_mapped(page)); | ||
934 | |||
935 | mapping = page->mapping; | 996 | mapping = page->mapping; |
936 | index = page->index; | 997 | index = page->index; |
937 | inode = mapping->host; | 998 | inode = mapping->host; |
938 | info = SHMEM_I(inode); | 999 | info = SHMEM_I(inode); |
939 | if (info->flags & VM_LOCKED) | 1000 | if (info->flags & VM_LOCKED) |
940 | goto redirty; | 1001 | goto redirty; |
941 | swap = get_swap_page(); | 1002 | if (!total_swap_pages) |
942 | if (!swap.val) | ||
943 | goto redirty; | 1003 | goto redirty; |
944 | 1004 | ||
1005 | /* | ||
1006 | * shmem_backing_dev_info's capabilities prevent regular writeback or | ||
1007 | * sync from ever calling shmem_writepage; but a stacking filesystem | ||
1008 | * may use the ->writepage of its underlying filesystem, in which case | ||
1009 | * tmpfs should write out to swap only in response to memory pressure, | ||
1010 | * and not for pdflush or sync. However, in those cases, we do still | ||
1011 | * want to check if there's a redundant swappage to be discarded. | ||
1012 | */ | ||
1013 | if (wbc->for_reclaim) | ||
1014 | swap = get_swap_page(); | ||
1015 | else | ||
1016 | swap.val = 0; | ||
1017 | |||
945 | spin_lock(&info->lock); | 1018 | spin_lock(&info->lock); |
946 | shmem_recalc_inode(inode); | ||
947 | if (index >= info->next_index) { | 1019 | if (index >= info->next_index) { |
948 | BUG_ON(!(info->flags & SHMEM_TRUNCATE)); | 1020 | BUG_ON(!(info->flags & SHMEM_TRUNCATE)); |
949 | goto unlock; | 1021 | goto unlock; |
950 | } | 1022 | } |
951 | entry = shmem_swp_entry(info, index, NULL); | 1023 | entry = shmem_swp_entry(info, index, NULL); |
952 | BUG_ON(!entry); | 1024 | if (entry->val) { |
953 | BUG_ON(entry->val); | 1025 | /* |
1026 | * The more uptodate page coming down from a stacked | ||
1027 | * writepage should replace our old swappage. | ||
1028 | */ | ||
1029 | free_swap_and_cache(*entry); | ||
1030 | shmem_swp_set(info, entry, 0); | ||
1031 | } | ||
1032 | shmem_recalc_inode(inode); | ||
954 | 1033 | ||
955 | if (move_to_swap_cache(page, swap) == 0) { | 1034 | if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { |
1035 | remove_from_page_cache(page); | ||
956 | shmem_swp_set(info, entry, swap.val); | 1036 | shmem_swp_set(info, entry, swap.val); |
957 | shmem_swp_unmap(entry); | 1037 | shmem_swp_unmap(entry); |
1038 | if (list_empty(&info->swaplist)) | ||
1039 | inode = igrab(inode); | ||
1040 | else | ||
1041 | inode = NULL; | ||
958 | spin_unlock(&info->lock); | 1042 | spin_unlock(&info->lock); |
959 | if (list_empty(&info->swaplist)) { | 1043 | swap_duplicate(swap); |
960 | spin_lock(&shmem_swaplist_lock); | 1044 | BUG_ON(page_mapped(page)); |
1045 | page_cache_release(page); /* pagecache ref */ | ||
1046 | set_page_dirty(page); | ||
1047 | unlock_page(page); | ||
1048 | if (inode) { | ||
1049 | mutex_lock(&shmem_swaplist_mutex); | ||
961 | /* move instead of add in case we're racing */ | 1050 | /* move instead of add in case we're racing */ |
962 | list_move_tail(&info->swaplist, &shmem_swaplist); | 1051 | list_move_tail(&info->swaplist, &shmem_swaplist); |
963 | spin_unlock(&shmem_swaplist_lock); | 1052 | mutex_unlock(&shmem_swaplist_mutex); |
1053 | iput(inode); | ||
964 | } | 1054 | } |
965 | unlock_page(page); | ||
966 | return 0; | 1055 | return 0; |
967 | } | 1056 | } |
968 | 1057 | ||
@@ -972,7 +1061,10 @@ unlock: | |||
972 | swap_free(swap); | 1061 | swap_free(swap); |
973 | redirty: | 1062 | redirty: |
974 | set_page_dirty(page); | 1063 | set_page_dirty(page); |
975 | return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */ | 1064 | if (wbc->for_reclaim) |
1065 | return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ | ||
1066 | unlock_page(page); | ||
1067 | return 0; | ||
976 | } | 1068 | } |
977 | 1069 | ||
978 | #ifdef CONFIG_NUMA | 1070 | #ifdef CONFIG_NUMA |
@@ -1025,53 +1117,33 @@ out: | |||
1025 | return err; | 1117 | return err; |
1026 | } | 1118 | } |
1027 | 1119 | ||
1028 | static struct page *shmem_swapin_async(struct shared_policy *p, | 1120 | static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, |
1029 | swp_entry_t entry, unsigned long idx) | 1121 | struct shmem_inode_info *info, unsigned long idx) |
1030 | { | 1122 | { |
1031 | struct page *page; | ||
1032 | struct vm_area_struct pvma; | 1123 | struct vm_area_struct pvma; |
1124 | struct page *page; | ||
1033 | 1125 | ||
1034 | /* Create a pseudo vma that just contains the policy */ | 1126 | /* Create a pseudo vma that just contains the policy */ |
1035 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | 1127 | pvma.vm_start = 0; |
1036 | pvma.vm_end = PAGE_SIZE; | ||
1037 | pvma.vm_pgoff = idx; | 1128 | pvma.vm_pgoff = idx; |
1038 | pvma.vm_policy = mpol_shared_policy_lookup(p, idx); | 1129 | pvma.vm_ops = NULL; |
1039 | page = read_swap_cache_async(entry, &pvma, 0); | 1130 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); |
1131 | page = swapin_readahead(entry, gfp, &pvma, 0); | ||
1040 | mpol_free(pvma.vm_policy); | 1132 | mpol_free(pvma.vm_policy); |
1041 | return page; | 1133 | return page; |
1042 | } | 1134 | } |
1043 | 1135 | ||
1044 | static struct page *shmem_swapin(struct shmem_inode_info *info, | 1136 | static struct page *shmem_alloc_page(gfp_t gfp, |
1045 | swp_entry_t entry, unsigned long idx) | 1137 | struct shmem_inode_info *info, unsigned long idx) |
1046 | { | ||
1047 | struct shared_policy *p = &info->policy; | ||
1048 | int i, num; | ||
1049 | struct page *page; | ||
1050 | unsigned long offset; | ||
1051 | |||
1052 | num = valid_swaphandles(entry, &offset); | ||
1053 | for (i = 0; i < num; offset++, i++) { | ||
1054 | page = shmem_swapin_async(p, | ||
1055 | swp_entry(swp_type(entry), offset), idx); | ||
1056 | if (!page) | ||
1057 | break; | ||
1058 | page_cache_release(page); | ||
1059 | } | ||
1060 | lru_add_drain(); /* Push any new pages onto the LRU now */ | ||
1061 | return shmem_swapin_async(p, entry, idx); | ||
1062 | } | ||
1063 | |||
1064 | static struct page * | ||
1065 | shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, | ||
1066 | unsigned long idx) | ||
1067 | { | 1138 | { |
1068 | struct vm_area_struct pvma; | 1139 | struct vm_area_struct pvma; |
1069 | struct page *page; | 1140 | struct page *page; |
1070 | 1141 | ||
1071 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | 1142 | /* Create a pseudo vma that just contains the policy */ |
1072 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); | 1143 | pvma.vm_start = 0; |
1073 | pvma.vm_pgoff = idx; | 1144 | pvma.vm_pgoff = idx; |
1074 | pvma.vm_end = PAGE_SIZE; | 1145 | pvma.vm_ops = NULL; |
1146 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); | ||
1075 | page = alloc_page_vma(gfp, &pvma, 0); | 1147 | page = alloc_page_vma(gfp, &pvma, 0); |
1076 | mpol_free(pvma.vm_policy); | 1148 | mpol_free(pvma.vm_policy); |
1077 | return page; | 1149 | return page; |
@@ -1083,15 +1155,14 @@ static inline int shmem_parse_mpol(char *value, int *policy, | |||
1083 | return 1; | 1155 | return 1; |
1084 | } | 1156 | } |
1085 | 1157 | ||
1086 | static inline struct page * | 1158 | static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, |
1087 | shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx) | 1159 | struct shmem_inode_info *info, unsigned long idx) |
1088 | { | 1160 | { |
1089 | swapin_readahead(entry, 0, NULL); | 1161 | return swapin_readahead(entry, gfp, NULL, 0); |
1090 | return read_swap_cache_async(entry, NULL, 0); | ||
1091 | } | 1162 | } |
1092 | 1163 | ||
1093 | static inline struct page * | 1164 | static inline struct page *shmem_alloc_page(gfp_t gfp, |
1094 | shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx) | 1165 | struct shmem_inode_info *info, unsigned long idx) |
1095 | { | 1166 | { |
1096 | return alloc_page(gfp); | 1167 | return alloc_page(gfp); |
1097 | } | 1168 | } |
@@ -1114,6 +1185,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, | |||
1114 | struct page *swappage; | 1185 | struct page *swappage; |
1115 | swp_entry_t *entry; | 1186 | swp_entry_t *entry; |
1116 | swp_entry_t swap; | 1187 | swp_entry_t swap; |
1188 | gfp_t gfp; | ||
1117 | int error; | 1189 | int error; |
1118 | 1190 | ||
1119 | if (idx >= SHMEM_MAX_INDEX) | 1191 | if (idx >= SHMEM_MAX_INDEX) |
@@ -1126,7 +1198,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, | |||
1126 | * Normally, filepage is NULL on entry, and either found | 1198 | * Normally, filepage is NULL on entry, and either found |
1127 | * uptodate immediately, or allocated and zeroed, or read | 1199 | * uptodate immediately, or allocated and zeroed, or read |
1128 | * in under swappage, which is then assigned to filepage. | 1200 | * in under swappage, which is then assigned to filepage. |
1129 | * But shmem_readpage and shmem_write_begin pass in a locked | 1201 | * But shmem_readpage (required for splice) passes in a locked |
1130 | * filepage, which may be found not uptodate by other callers | 1202 | * filepage, which may be found not uptodate by other callers |
1131 | * too, and may need to be copied from the swappage read in. | 1203 | * too, and may need to be copied from the swappage read in. |
1132 | */ | 1204 | */ |
@@ -1136,8 +1208,17 @@ repeat: | |||
1136 | if (filepage && PageUptodate(filepage)) | 1208 | if (filepage && PageUptodate(filepage)) |
1137 | goto done; | 1209 | goto done; |
1138 | error = 0; | 1210 | error = 0; |
1139 | if (sgp == SGP_QUICK) | 1211 | gfp = mapping_gfp_mask(mapping); |
1140 | goto failed; | 1212 | if (!filepage) { |
1213 | /* | ||
1214 | * Try to preload while we can wait, to not make a habit of | ||
1215 | * draining atomic reserves; but don't latch on to this cpu. | ||
1216 | */ | ||
1217 | error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); | ||
1218 | if (error) | ||
1219 | goto failed; | ||
1220 | radix_tree_preload_end(); | ||
1221 | } | ||
1141 | 1222 | ||
1142 | spin_lock(&info->lock); | 1223 | spin_lock(&info->lock); |
1143 | shmem_recalc_inode(inode); | 1224 | shmem_recalc_inode(inode); |
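shmem_getpage() now calls radix_tree_preload() purely to top up the per-cpu radix-tree node pool while it is still allowed to sleep, then releases the preload immediately so it is not pinned to one cpu. That differs from the usual preload pattern, where preemption stays disabled across the insertion. Sketches of both, with the actual locking and insertion elided:

/* (a) Common pattern: stay preempt-disabled across the insert. */
error = radix_tree_preload(GFP_KERNEL);
if (error)
        return error;
/* ... lock the tree and radix_tree_insert() here ... */
radix_tree_preload_end();               /* re-enables preemption */

/*
 * (b) Pattern used above: fill the per-cpu node pool while sleeping is
 * still allowed, then release right away so we don't latch on to this
 * cpu; later GFP_NOWAIT insertions can draw on the reserves.
 */
error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
if (error)
        goto failed;
radix_tree_preload_end();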
@@ -1160,7 +1241,7 @@ repeat: | |||
1160 | *type |= VM_FAULT_MAJOR; | 1241 | *type |= VM_FAULT_MAJOR; |
1161 | } | 1242 | } |
1162 | spin_unlock(&info->lock); | 1243 | spin_unlock(&info->lock); |
1163 | swappage = shmem_swapin(info, swap, idx); | 1244 | swappage = shmem_swapin(swap, gfp, info, idx); |
1164 | if (!swappage) { | 1245 | if (!swappage) { |
1165 | spin_lock(&info->lock); | 1246 | spin_lock(&info->lock); |
1166 | entry = shmem_swp_alloc(info, idx, sgp); | 1247 | entry = shmem_swp_alloc(info, idx, sgp); |
@@ -1218,13 +1299,15 @@ repeat: | |||
1218 | SetPageUptodate(filepage); | 1299 | SetPageUptodate(filepage); |
1219 | set_page_dirty(filepage); | 1300 | set_page_dirty(filepage); |
1220 | swap_free(swap); | 1301 | swap_free(swap); |
1221 | } else if (!(error = move_from_swap_cache( | 1302 | } else if (!(error = add_to_page_cache( |
1222 | swappage, idx, mapping))) { | 1303 | swappage, mapping, idx, GFP_NOWAIT))) { |
1223 | info->flags |= SHMEM_PAGEIN; | 1304 | info->flags |= SHMEM_PAGEIN; |
1224 | shmem_swp_set(info, entry, 0); | 1305 | shmem_swp_set(info, entry, 0); |
1225 | shmem_swp_unmap(entry); | 1306 | shmem_swp_unmap(entry); |
1307 | delete_from_swap_cache(swappage); | ||
1226 | spin_unlock(&info->lock); | 1308 | spin_unlock(&info->lock); |
1227 | filepage = swappage; | 1309 | filepage = swappage; |
1310 | set_page_dirty(filepage); | ||
1228 | swap_free(swap); | 1311 | swap_free(swap); |
1229 | } else { | 1312 | } else { |
1230 | shmem_swp_unmap(entry); | 1313 | shmem_swp_unmap(entry); |
@@ -1232,8 +1315,11 @@ repeat: | |||
1232 | unlock_page(swappage); | 1315 | unlock_page(swappage); |
1233 | page_cache_release(swappage); | 1316 | page_cache_release(swappage); |
1234 | if (error == -ENOMEM) { | 1317 | if (error == -ENOMEM) { |
1235 | /* let kswapd refresh zone for GFP_ATOMICs */ | 1318 | /* allow reclaim from this memory cgroup */ |
1236 | congestion_wait(WRITE, HZ/50); | 1319 | error = mem_cgroup_cache_charge(NULL, |
1320 | current->mm, gfp & ~__GFP_HIGHMEM); | ||
1321 | if (error) | ||
1322 | goto failed; | ||
1237 | } | 1323 | } |
1238 | goto repeat; | 1324 | goto repeat; |
1239 | } | 1325 | } |
@@ -1272,9 +1358,7 @@ repeat: | |||
1272 | 1358 | ||
1273 | if (!filepage) { | 1359 | if (!filepage) { |
1274 | spin_unlock(&info->lock); | 1360 | spin_unlock(&info->lock); |
1275 | filepage = shmem_alloc_page(mapping_gfp_mask(mapping), | 1361 | filepage = shmem_alloc_page(gfp, info, idx); |
1276 | info, | ||
1277 | idx); | ||
1278 | if (!filepage) { | 1362 | if (!filepage) { |
1279 | shmem_unacct_blocks(info->flags, 1); | 1363 | shmem_unacct_blocks(info->flags, 1); |
1280 | shmem_free_blocks(inode, 1); | 1364 | shmem_free_blocks(inode, 1); |
@@ -1282,6 +1366,17 @@ repeat: | |||
1282 | goto failed; | 1366 | goto failed; |
1283 | } | 1367 | } |
1284 | 1368 | ||
1369 | /* Precharge page while we can wait, compensate after */ | ||
1370 | error = mem_cgroup_cache_charge(filepage, current->mm, | ||
1371 | gfp & ~__GFP_HIGHMEM); | ||
1372 | if (error) { | ||
1373 | page_cache_release(filepage); | ||
1374 | shmem_unacct_blocks(info->flags, 1); | ||
1375 | shmem_free_blocks(inode, 1); | ||
1376 | filepage = NULL; | ||
1377 | goto failed; | ||
1378 | } | ||
1379 | |||
1285 | spin_lock(&info->lock); | 1380 | spin_lock(&info->lock); |
1286 | entry = shmem_swp_alloc(info, idx, sgp); | 1381 | entry = shmem_swp_alloc(info, idx, sgp); |
1287 | if (IS_ERR(entry)) | 1382 | if (IS_ERR(entry)) |
@@ -1291,8 +1386,9 @@ repeat: | |||
1291 | shmem_swp_unmap(entry); | 1386 | shmem_swp_unmap(entry); |
1292 | } | 1387 | } |
1293 | if (error || swap.val || 0 != add_to_page_cache_lru( | 1388 | if (error || swap.val || 0 != add_to_page_cache_lru( |
1294 | filepage, mapping, idx, GFP_ATOMIC)) { | 1389 | filepage, mapping, idx, GFP_NOWAIT)) { |
1295 | spin_unlock(&info->lock); | 1390 | spin_unlock(&info->lock); |
1391 | mem_cgroup_uncharge_page(filepage); | ||
1296 | page_cache_release(filepage); | 1392 | page_cache_release(filepage); |
1297 | shmem_unacct_blocks(info->flags, 1); | 1393 | shmem_unacct_blocks(info->flags, 1); |
1298 | shmem_free_blocks(inode, 1); | 1394 | shmem_free_blocks(inode, 1); |
@@ -1301,6 +1397,7 @@ repeat: | |||
1301 | goto failed; | 1397 | goto failed; |
1302 | goto repeat; | 1398 | goto repeat; |
1303 | } | 1399 | } |
1400 | mem_cgroup_uncharge_page(filepage); | ||
1304 | info->flags |= SHMEM_PAGEIN; | 1401 | info->flags |= SHMEM_PAGEIN; |
1305 | } | 1402 | } |
1306 | 1403 | ||
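The memory-cgroup handling here follows a precharge/compensate pattern: the charge that may need to reclaim is taken before info->lock, and the extra charge is handed back with mem_cgroup_uncharge_page() once the GFP_NOWAIT insertion has (or has not) succeeded. A sketch of the shape of that pattern, on the assumption that add_to_page_cache_lru() takes its own cgroup charge in this series; shmem's block accounting and swap checks are omitted:

error = mem_cgroup_cache_charge(filepage, current->mm,
                                gfp & ~__GFP_HIGHMEM);  /* may reclaim */
if (error)
        goto failed;

spin_lock(&info->lock);
error = add_to_page_cache_lru(filepage, mapping, idx, GFP_NOWAIT);
spin_unlock(&info->lock);

mem_cgroup_uncharge_page(filepage);     /* drop the precharge either way */
if (error)
        goto failed;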
@@ -1309,14 +1406,11 @@ repeat: | |||
1309 | clear_highpage(filepage); | 1406 | clear_highpage(filepage); |
1310 | flush_dcache_page(filepage); | 1407 | flush_dcache_page(filepage); |
1311 | SetPageUptodate(filepage); | 1408 | SetPageUptodate(filepage); |
1409 | if (sgp == SGP_DIRTY) | ||
1410 | set_page_dirty(filepage); | ||
1312 | } | 1411 | } |
1313 | done: | 1412 | done: |
1314 | if (*pagep != filepage) { | 1413 | *pagep = filepage; |
1315 | *pagep = filepage; | ||
1316 | if (sgp != SGP_FAULT) | ||
1317 | unlock_page(filepage); | ||
1318 | |||
1319 | } | ||
1320 | return 0; | 1414 | return 0; |
1321 | 1415 | ||
1322 | failed: | 1416 | failed: |
@@ -1336,7 +1430,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1336 | if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | 1430 | if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) |
1337 | return VM_FAULT_SIGBUS; | 1431 | return VM_FAULT_SIGBUS; |
1338 | 1432 | ||
1339 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret); | 1433 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); |
1340 | if (error) | 1434 | if (error) |
1341 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1435 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
1342 | 1436 | ||
@@ -1399,15 +1493,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1399 | struct shmem_inode_info *info; | 1493 | struct shmem_inode_info *info; |
1400 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | 1494 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
1401 | 1495 | ||
1402 | if (sbinfo->max_inodes) { | 1496 | if (shmem_reserve_inode(sb)) |
1403 | spin_lock(&sbinfo->stat_lock); | 1497 | return NULL; |
1404 | if (!sbinfo->free_inodes) { | ||
1405 | spin_unlock(&sbinfo->stat_lock); | ||
1406 | return NULL; | ||
1407 | } | ||
1408 | sbinfo->free_inodes--; | ||
1409 | spin_unlock(&sbinfo->stat_lock); | ||
1410 | } | ||
1411 | 1498 | ||
1412 | inode = new_inode(sb); | 1499 | inode = new_inode(sb); |
1413 | if (inode) { | 1500 | if (inode) { |
@@ -1451,11 +1538,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1451 | NULL); | 1538 | NULL); |
1452 | break; | 1539 | break; |
1453 | } | 1540 | } |
1454 | } else if (sbinfo->max_inodes) { | 1541 | } else |
1455 | spin_lock(&sbinfo->stat_lock); | 1542 | shmem_free_inode(sb); |
1456 | sbinfo->free_inodes++; | ||
1457 | spin_unlock(&sbinfo->stat_lock); | ||
1458 | } | ||
1459 | return inode; | 1543 | return inode; |
1460 | } | 1544 | } |
1461 | 1545 | ||
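The open-coded free_inodes accounting removed from shmem_get_inode() here, and from shmem_link()/shmem_unlink() below, is evidently factored into shmem_reserve_inode()/shmem_free_inode() helpers introduced elsewhere in this patch. A sketch reconstructed from the removed code, not the helpers' actual text:

static int shmem_reserve_inode(struct super_block *sb)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

        if (sbinfo->max_inodes) {
                spin_lock(&sbinfo->stat_lock);
                if (!sbinfo->free_inodes) {
                        spin_unlock(&sbinfo->stat_lock);
                        return -ENOSPC;
                }
                sbinfo->free_inodes--;
                spin_unlock(&sbinfo->stat_lock);
        }
        return 0;
}

static void shmem_free_inode(struct super_block *sb)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

        if (sbinfo->max_inodes) {
                spin_lock(&sbinfo->stat_lock);
                sbinfo->free_inodes++;
                spin_unlock(&sbinfo->stat_lock);
        }
}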
@@ -1494,123 +1578,30 @@ shmem_write_end(struct file *file, struct address_space *mapping, | |||
1494 | { | 1578 | { |
1495 | struct inode *inode = mapping->host; | 1579 | struct inode *inode = mapping->host; |
1496 | 1580 | ||
1581 | if (pos + copied > inode->i_size) | ||
1582 | i_size_write(inode, pos + copied); | ||
1583 | |||
1584 | unlock_page(page); | ||
1497 | set_page_dirty(page); | 1585 | set_page_dirty(page); |
1498 | page_cache_release(page); | 1586 | page_cache_release(page); |
1499 | 1587 | ||
1500 | if (pos+copied > inode->i_size) | ||
1501 | i_size_write(inode, pos+copied); | ||
1502 | |||
1503 | return copied; | 1588 | return copied; |
1504 | } | 1589 | } |
1505 | 1590 | ||
1506 | static ssize_t | ||
1507 | shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) | ||
1508 | { | ||
1509 | struct inode *inode = file->f_path.dentry->d_inode; | ||
1510 | loff_t pos; | ||
1511 | unsigned long written; | ||
1512 | ssize_t err; | ||
1513 | |||
1514 | if ((ssize_t) count < 0) | ||
1515 | return -EINVAL; | ||
1516 | |||
1517 | if (!access_ok(VERIFY_READ, buf, count)) | ||
1518 | return -EFAULT; | ||
1519 | |||
1520 | mutex_lock(&inode->i_mutex); | ||
1521 | |||
1522 | pos = *ppos; | ||
1523 | written = 0; | ||
1524 | |||
1525 | err = generic_write_checks(file, &pos, &count, 0); | ||
1526 | if (err || !count) | ||
1527 | goto out; | ||
1528 | |||
1529 | err = remove_suid(file->f_path.dentry); | ||
1530 | if (err) | ||
1531 | goto out; | ||
1532 | |||
1533 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | ||
1534 | |||
1535 | do { | ||
1536 | struct page *page = NULL; | ||
1537 | unsigned long bytes, index, offset; | ||
1538 | char *kaddr; | ||
1539 | int left; | ||
1540 | |||
1541 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | ||
1542 | index = pos >> PAGE_CACHE_SHIFT; | ||
1543 | bytes = PAGE_CACHE_SIZE - offset; | ||
1544 | if (bytes > count) | ||
1545 | bytes = count; | ||
1546 | |||
1547 | /* | ||
1548 | * We don't hold page lock across copy from user - | ||
1549 | * what would it guard against? - so no deadlock here. | ||
1550 | * But it still may be a good idea to prefault below. | ||
1551 | */ | ||
1552 | |||
1553 | err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL); | ||
1554 | if (err) | ||
1555 | break; | ||
1556 | |||
1557 | left = bytes; | ||
1558 | if (PageHighMem(page)) { | ||
1559 | volatile unsigned char dummy; | ||
1560 | __get_user(dummy, buf); | ||
1561 | __get_user(dummy, buf + bytes - 1); | ||
1562 | |||
1563 | kaddr = kmap_atomic(page, KM_USER0); | ||
1564 | left = __copy_from_user_inatomic(kaddr + offset, | ||
1565 | buf, bytes); | ||
1566 | kunmap_atomic(kaddr, KM_USER0); | ||
1567 | } | ||
1568 | if (left) { | ||
1569 | kaddr = kmap(page); | ||
1570 | left = __copy_from_user(kaddr + offset, buf, bytes); | ||
1571 | kunmap(page); | ||
1572 | } | ||
1573 | |||
1574 | written += bytes; | ||
1575 | count -= bytes; | ||
1576 | pos += bytes; | ||
1577 | buf += bytes; | ||
1578 | if (pos > inode->i_size) | ||
1579 | i_size_write(inode, pos); | ||
1580 | |||
1581 | flush_dcache_page(page); | ||
1582 | set_page_dirty(page); | ||
1583 | mark_page_accessed(page); | ||
1584 | page_cache_release(page); | ||
1585 | |||
1586 | if (left) { | ||
1587 | pos -= left; | ||
1588 | written -= left; | ||
1589 | err = -EFAULT; | ||
1590 | break; | ||
1591 | } | ||
1592 | |||
1593 | /* | ||
1594 | * Our dirty pages are not counted in nr_dirty, | ||
1595 | * and we do not attempt to balance dirty pages. | ||
1596 | */ | ||
1597 | |||
1598 | cond_resched(); | ||
1599 | } while (count); | ||
1600 | |||
1601 | *ppos = pos; | ||
1602 | if (written) | ||
1603 | err = written; | ||
1604 | out: | ||
1605 | mutex_unlock(&inode->i_mutex); | ||
1606 | return err; | ||
1607 | } | ||
1608 | |||
1609 | static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) | 1591 | static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) |
1610 | { | 1592 | { |
1611 | struct inode *inode = filp->f_path.dentry->d_inode; | 1593 | struct inode *inode = filp->f_path.dentry->d_inode; |
1612 | struct address_space *mapping = inode->i_mapping; | 1594 | struct address_space *mapping = inode->i_mapping; |
1613 | unsigned long index, offset; | 1595 | unsigned long index, offset; |
1596 | enum sgp_type sgp = SGP_READ; | ||
1597 | |||
1598 | /* | ||
1599 | * Might this read be for a stacking filesystem? Then when reading | ||
1600 | * holes of a sparse file, we actually need to allocate those pages, | ||
1601 | * and even mark them dirty, so it cannot exceed the max_blocks limit. | ||
1602 | */ | ||
1603 | if (segment_eq(get_fs(), KERNEL_DS)) | ||
1604 | sgp = SGP_DIRTY; | ||
1614 | 1605 | ||
1615 | index = *ppos >> PAGE_CACHE_SHIFT; | 1606 | index = *ppos >> PAGE_CACHE_SHIFT; |
1616 | offset = *ppos & ~PAGE_CACHE_MASK; | 1607 | offset = *ppos & ~PAGE_CACHE_MASK; |
@@ -1629,12 +1620,14 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
1629 | break; | 1620 | break; |
1630 | } | 1621 | } |
1631 | 1622 | ||
1632 | desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL); | 1623 | desc->error = shmem_getpage(inode, index, &page, sgp, NULL); |
1633 | if (desc->error) { | 1624 | if (desc->error) { |
1634 | if (desc->error == -EINVAL) | 1625 | if (desc->error == -EINVAL) |
1635 | desc->error = 0; | 1626 | desc->error = 0; |
1636 | break; | 1627 | break; |
1637 | } | 1628 | } |
1629 | if (page) | ||
1630 | unlock_page(page); | ||
1638 | 1631 | ||
1639 | /* | 1632 | /* |
1640 | * We must evaluate after, since reads (unlike writes) | 1633 | * We must evaluate after, since reads (unlike writes) |
@@ -1798,22 +1791,16 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, int mode, | |||
1798 | static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) | 1791 | static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) |
1799 | { | 1792 | { |
1800 | struct inode *inode = old_dentry->d_inode; | 1793 | struct inode *inode = old_dentry->d_inode; |
1801 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | 1794 | int ret; |
1802 | 1795 | ||
1803 | /* | 1796 | /* |
1804 | * No ordinary (disk based) filesystem counts links as inodes; | 1797 | * No ordinary (disk based) filesystem counts links as inodes; |
1805 | * but each new link needs a new dentry, pinning lowmem, and | 1798 | * but each new link needs a new dentry, pinning lowmem, and |
1806 | * tmpfs dentries cannot be pruned until they are unlinked. | 1799 | * tmpfs dentries cannot be pruned until they are unlinked. |
1807 | */ | 1800 | */ |
1808 | if (sbinfo->max_inodes) { | 1801 | ret = shmem_reserve_inode(inode->i_sb); |
1809 | spin_lock(&sbinfo->stat_lock); | 1802 | if (ret) |
1810 | if (!sbinfo->free_inodes) { | 1803 | goto out; |
1811 | spin_unlock(&sbinfo->stat_lock); | ||
1812 | return -ENOSPC; | ||
1813 | } | ||
1814 | sbinfo->free_inodes--; | ||
1815 | spin_unlock(&sbinfo->stat_lock); | ||
1816 | } | ||
1817 | 1804 | ||
1818 | dir->i_size += BOGO_DIRENT_SIZE; | 1805 | dir->i_size += BOGO_DIRENT_SIZE; |
1819 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; | 1806 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; |
@@ -1821,21 +1808,16 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr | |||
1821 | atomic_inc(&inode->i_count); /* New dentry reference */ | 1808 | atomic_inc(&inode->i_count); /* New dentry reference */ |
1822 | dget(dentry); /* Extra pinning count for the created dentry */ | 1809 | dget(dentry); /* Extra pinning count for the created dentry */ |
1823 | d_instantiate(dentry, inode); | 1810 | d_instantiate(dentry, inode); |
1824 | return 0; | 1811 | out: |
1812 | return ret; | ||
1825 | } | 1813 | } |
1826 | 1814 | ||
1827 | static int shmem_unlink(struct inode *dir, struct dentry *dentry) | 1815 | static int shmem_unlink(struct inode *dir, struct dentry *dentry) |
1828 | { | 1816 | { |
1829 | struct inode *inode = dentry->d_inode; | 1817 | struct inode *inode = dentry->d_inode; |
1830 | 1818 | ||
1831 | if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) { | 1819 | if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) |
1832 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | 1820 | shmem_free_inode(inode->i_sb); |
1833 | if (sbinfo->max_inodes) { | ||
1834 | spin_lock(&sbinfo->stat_lock); | ||
1835 | sbinfo->free_inodes++; | ||
1836 | spin_unlock(&sbinfo->stat_lock); | ||
1837 | } | ||
1838 | } | ||
1839 | 1821 | ||
1840 | dir->i_size -= BOGO_DIRENT_SIZE; | 1822 | dir->i_size -= BOGO_DIRENT_SIZE; |
1841 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; | 1823 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; |
@@ -1924,6 +1906,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1924 | iput(inode); | 1906 | iput(inode); |
1925 | return error; | 1907 | return error; |
1926 | } | 1908 | } |
1909 | unlock_page(page); | ||
1927 | inode->i_op = &shmem_symlink_inode_operations; | 1910 | inode->i_op = &shmem_symlink_inode_operations; |
1928 | kaddr = kmap_atomic(page, KM_USER0); | 1911 | kaddr = kmap_atomic(page, KM_USER0); |
1929 | memcpy(kaddr, symname, len); | 1912 | memcpy(kaddr, symname, len); |
@@ -1951,6 +1934,8 @@ static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) | |||
1951 | struct page *page = NULL; | 1934 | struct page *page = NULL; |
1952 | int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); | 1935 | int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); |
1953 | nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); | 1936 | nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); |
1937 | if (page) | ||
1938 | unlock_page(page); | ||
1954 | return page; | 1939 | return page; |
1955 | } | 1940 | } |
1956 | 1941 | ||
@@ -1996,8 +1981,7 @@ static int shmem_xattr_security_get(struct inode *inode, const char *name, | |||
1996 | { | 1981 | { |
1997 | if (strcmp(name, "") == 0) | 1982 | if (strcmp(name, "") == 0) |
1998 | return -EINVAL; | 1983 | return -EINVAL; |
1999 | return security_inode_getsecurity(inode, name, buffer, size, | 1984 | return xattr_getsecurity(inode, name, buffer, size); |
2000 | -EOPNOTSUPP); | ||
2001 | } | 1985 | } |
2002 | 1986 | ||
2003 | static int shmem_xattr_security_set(struct inode *inode, const char *name, | 1987 | static int shmem_xattr_security_set(struct inode *inode, const char *name, |
@@ -2138,7 +2122,7 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, | |||
2138 | } | 2122 | } |
2139 | if (*rest) | 2123 | if (*rest) |
2140 | goto bad_val; | 2124 | goto bad_val; |
2141 | *blocks = size >> PAGE_CACHE_SHIFT; | 2125 | *blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE); |
2142 | } else if (!strcmp(this_char,"nr_blocks")) { | 2126 | } else if (!strcmp(this_char,"nr_blocks")) { |
2143 | *blocks = memparse(value,&rest); | 2127 | *blocks = memparse(value,&rest); |
2144 | if (*rest) | 2128 | if (*rest) |
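The size= option is now converted to a block count with DIV_ROUND_UP() rather than a right shift, so a size that is not a multiple of the page size rounds up to enough pages to hold it instead of being truncated. A tiny userspace illustration, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)      /* assume 4 KiB pages */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long size = 1050 * 1024;               /* "size=1050k" */

        printf("truncating:  %lu blocks\n", size >> PAGE_CACHE_SHIFT);          /* 262 */
        printf("rounding up: %lu blocks\n", DIV_ROUND_UP(size, PAGE_CACHE_SIZE)); /* 263 */
        return 0;
}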
@@ -2375,7 +2359,8 @@ static const struct file_operations shmem_file_operations = { | |||
2375 | #ifdef CONFIG_TMPFS | 2359 | #ifdef CONFIG_TMPFS |
2376 | .llseek = generic_file_llseek, | 2360 | .llseek = generic_file_llseek, |
2377 | .read = shmem_file_read, | 2361 | .read = shmem_file_read, |
2378 | .write = shmem_file_write, | 2362 | .write = do_sync_write, |
2363 | .aio_write = generic_file_aio_write, | ||
2379 | .fsync = simple_sync_file, | 2364 | .fsync = simple_sync_file, |
2380 | .splice_read = generic_file_splice_read, | 2365 | .splice_read = generic_file_splice_read, |
2381 | .splice_write = generic_file_splice_write, | 2366 | .splice_write = generic_file_splice_write, |
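The hand-rolled shmem_file_write() removed above is replaced by the generic write path (do_sync_write / generic_file_aio_write), which funnels through the shmem_write_begin/shmem_write_end address_space operations; write_end is shown earlier in this patch, and write_begin presumably reduces to a shmem_getpage(SGP_WRITE) call along these lines (a sketch, not taken from this hunk):

static int
shmem_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
{
        struct inode *inode = mapping->host;
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;

        *pagep = NULL;
        return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
}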
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -12,10 +12,17 @@ | |||
12 | * allocator is as little as 2 bytes, however typically most architectures | 12 | * allocator is as little as 2 bytes, however typically most architectures |
13 | * will require 4 bytes on 32-bit and 8 bytes on 64-bit. | 13 | * will require 4 bytes on 32-bit and 8 bytes on 64-bit. |
14 | * | 14 | * |
15 | * The slob heap is a linked list of pages from alloc_pages(), and | 15 | * The slob heap is a set of linked list of pages from alloc_pages(), |
16 | * within each page, there is a singly-linked list of free blocks (slob_t). | 16 | * and within each page, there is a singly-linked list of free blocks |
17 | * The heap is grown on demand and allocation from the heap is currently | 17 | * (slob_t). The heap is grown on demand. To reduce fragmentation, |
18 | * first-fit. | 18 | * heap pages are segregated into three lists, with objects less than |
19 | * 256 bytes, objects less than 1024 bytes, and all other objects. | ||
20 | * | ||
21 | * Allocation from heap involves first searching for a page with | ||
22 | * sufficient free blocks (using a next-fit-like approach) followed by | ||
23 | * a first-fit scan of the page. Deallocation inserts objects back | ||
24 | * into the free list in address order, so this is effectively an | ||
25 | * address-ordered first fit. | ||
19 | * | 26 | * |
20 | * Above this is an implementation of kmalloc/kfree. Blocks returned | 27 | * Above this is an implementation of kmalloc/kfree. Blocks returned |
21 | * from kmalloc are prepended with a 4-byte header with the kmalloc size. | 28 | * from kmalloc are prepended with a 4-byte header with the kmalloc size. |
@@ -110,9 +117,13 @@ static inline void free_slob_page(struct slob_page *sp) | |||
110 | } | 117 | } |
111 | 118 | ||
112 | /* | 119 | /* |
113 | * All (partially) free slob pages go on this list. | 120 | * All partially free slob pages go on these lists. |
114 | */ | 121 | */ |
115 | static LIST_HEAD(free_slob_pages); | 122 | #define SLOB_BREAK1 256 |
123 | #define SLOB_BREAK2 1024 | ||
124 | static LIST_HEAD(free_slob_small); | ||
125 | static LIST_HEAD(free_slob_medium); | ||
126 | static LIST_HEAD(free_slob_large); | ||
116 | 127 | ||
117 | /* | 128 | /* |
118 | * slob_page: True for all slob pages (false for bigblock pages) | 129 | * slob_page: True for all slob pages (false for bigblock pages) |
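The reworked header comment and the SLOB_BREAK1/SLOB_BREAK2 cut-offs describe a simple size-segregated design: each allocation is served from one of three page lists depending on its size. A small userspace sketch of the list selection (list names taken from the patch; the demo itself is illustrative only):

#include <stdio.h>

#define SLOB_BREAK1 256
#define SLOB_BREAK2 1024

/* Mirrors the slob_list selection at the top of slob_alloc(). */
static const char *slob_list_for(size_t size)
{
        if (size < SLOB_BREAK1)
                return "free_slob_small";       /* < 256 bytes   */
        else if (size < SLOB_BREAK2)
                return "free_slob_medium";      /* 256..1023     */
        else
                return "free_slob_large";       /* >= 1024 bytes */
}

int main(void)
{
        size_t sizes[] = { 32, 256, 600, 1024, 4000 };

        for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("%4zu -> %s\n", sizes[i], slob_list_for(sizes[i]));
        return 0;
}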
@@ -140,9 +151,9 @@ static inline int slob_page_free(struct slob_page *sp) | |||
140 | return test_bit(PG_private, &sp->flags); | 151 | return test_bit(PG_private, &sp->flags); |
141 | } | 152 | } |
142 | 153 | ||
143 | static inline void set_slob_page_free(struct slob_page *sp) | 154 | static void set_slob_page_free(struct slob_page *sp, struct list_head *list) |
144 | { | 155 | { |
145 | list_add(&sp->list, &free_slob_pages); | 156 | list_add(&sp->list, list); |
146 | __set_bit(PG_private, &sp->flags); | 157 | __set_bit(PG_private, &sp->flags); |
147 | } | 158 | } |
148 | 159 | ||
@@ -294,12 +305,20 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
294 | { | 305 | { |
295 | struct slob_page *sp; | 306 | struct slob_page *sp; |
296 | struct list_head *prev; | 307 | struct list_head *prev; |
308 | struct list_head *slob_list; | ||
297 | slob_t *b = NULL; | 309 | slob_t *b = NULL; |
298 | unsigned long flags; | 310 | unsigned long flags; |
299 | 311 | ||
312 | if (size < SLOB_BREAK1) | ||
313 | slob_list = &free_slob_small; | ||
314 | else if (size < SLOB_BREAK2) | ||
315 | slob_list = &free_slob_medium; | ||
316 | else | ||
317 | slob_list = &free_slob_large; | ||
318 | |||
300 | spin_lock_irqsave(&slob_lock, flags); | 319 | spin_lock_irqsave(&slob_lock, flags); |
301 | /* Iterate through each partially free page, try to find room */ | 320 | /* Iterate through each partially free page, try to find room */ |
302 | list_for_each_entry(sp, &free_slob_pages, list) { | 321 | list_for_each_entry(sp, slob_list, list) { |
303 | #ifdef CONFIG_NUMA | 322 | #ifdef CONFIG_NUMA |
304 | /* | 323 | /* |
305 | * If there's a node specification, search for a partial | 324 | * If there's a node specification, search for a partial |
@@ -321,9 +340,9 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
321 | /* Improve fragment distribution and reduce our average | 340 | /* Improve fragment distribution and reduce our average |
322 | * search time by starting our next search here. (see | 341 | * search time by starting our next search here. (see |
323 | * Knuth vol 1, sec 2.5, pg 449) */ | 342 | * Knuth vol 1, sec 2.5, pg 449) */ |
324 | if (prev != free_slob_pages.prev && | 343 | if (prev != slob_list->prev && |
325 | free_slob_pages.next != prev->next) | 344 | slob_list->next != prev->next) |
326 | list_move_tail(&free_slob_pages, prev->next); | 345 | list_move_tail(slob_list, prev->next); |
327 | break; | 346 | break; |
328 | } | 347 | } |
329 | spin_unlock_irqrestore(&slob_lock, flags); | 348 | spin_unlock_irqrestore(&slob_lock, flags); |
@@ -341,7 +360,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
341 | sp->free = b; | 360 | sp->free = b; |
342 | INIT_LIST_HEAD(&sp->list); | 361 | INIT_LIST_HEAD(&sp->list); |
343 | set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); | 362 | set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); |
344 | set_slob_page_free(sp); | 363 | set_slob_page_free(sp, slob_list); |
345 | b = slob_page_alloc(sp, size, align); | 364 | b = slob_page_alloc(sp, size, align); |
346 | BUG_ON(!b); | 365 | BUG_ON(!b); |
347 | spin_unlock_irqrestore(&slob_lock, flags); | 366 | spin_unlock_irqrestore(&slob_lock, flags); |
@@ -387,7 +406,7 @@ static void slob_free(void *block, int size) | |||
387 | set_slob(b, units, | 406 | set_slob(b, units, |
388 | (void *)((unsigned long)(b + | 407 | (void *)((unsigned long)(b + |
389 | SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); | 408 | SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); |
390 | set_slob_page_free(sp); | 409 | set_slob_page_free(sp, &free_slob_small); |
391 | goto out; | 410 | goto out; |
392 | } | 411 | } |
393 | 412 | ||
@@ -398,6 +417,10 @@ static void slob_free(void *block, int size) | |||
398 | sp->units += units; | 417 | sp->units += units; |
399 | 418 | ||
400 | if (b < sp->free) { | 419 | if (b < sp->free) { |
420 | if (b + units == sp->free) { | ||
421 | units += slob_units(sp->free); | ||
422 | sp->free = slob_next(sp->free); | ||
423 | } | ||
401 | set_slob(b, units, sp->free); | 424 | set_slob(b, units, sp->free); |
402 | sp->free = b; | 425 | sp->free = b; |
403 | } else { | 426 | } else { |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -247,7 +247,10 @@ static void sysfs_slab_remove(struct kmem_cache *); | |||
247 | static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } | 247 | static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } |
248 | static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) | 248 | static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) |
249 | { return 0; } | 249 | { return 0; } |
250 | static inline void sysfs_slab_remove(struct kmem_cache *s) {} | 250 | static inline void sysfs_slab_remove(struct kmem_cache *s) |
251 | { | ||
252 | kfree(s); | ||
253 | } | ||
251 | #endif | 254 | #endif |
252 | 255 | ||
253 | /******************************************************************** | 256 | /******************************************************************** |
@@ -354,22 +357,22 @@ static void print_section(char *text, u8 *addr, unsigned int length) | |||
354 | printk(KERN_ERR "%8s 0x%p: ", text, addr + i); | 357 | printk(KERN_ERR "%8s 0x%p: ", text, addr + i); |
355 | newline = 0; | 358 | newline = 0; |
356 | } | 359 | } |
357 | printk(" %02x", addr[i]); | 360 | printk(KERN_CONT " %02x", addr[i]); |
358 | offset = i % 16; | 361 | offset = i % 16; |
359 | ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; | 362 | ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; |
360 | if (offset == 15) { | 363 | if (offset == 15) { |
361 | printk(" %s\n",ascii); | 364 | printk(KERN_CONT " %s\n", ascii); |
362 | newline = 1; | 365 | newline = 1; |
363 | } | 366 | } |
364 | } | 367 | } |
365 | if (!newline) { | 368 | if (!newline) { |
366 | i %= 16; | 369 | i %= 16; |
367 | while (i < 16) { | 370 | while (i < 16) { |
368 | printk(" "); | 371 | printk(KERN_CONT " "); |
369 | ascii[i] = ' '; | 372 | ascii[i] = ' '; |
370 | i++; | 373 | i++; |
371 | } | 374 | } |
372 | printk(" %s\n", ascii); | 375 | printk(KERN_CONT " %s\n", ascii); |
373 | } | 376 | } |
374 | } | 377 | } |
375 | 378 | ||
@@ -529,7 +532,7 @@ static void init_object(struct kmem_cache *s, void *object, int active) | |||
529 | 532 | ||
530 | if (s->flags & __OBJECT_POISON) { | 533 | if (s->flags & __OBJECT_POISON) { |
531 | memset(p, POISON_FREE, s->objsize - 1); | 534 | memset(p, POISON_FREE, s->objsize - 1); |
532 | p[s->objsize -1] = POISON_END; | 535 | p[s->objsize - 1] = POISON_END; |
533 | } | 536 | } |
534 | 537 | ||
535 | if (s->flags & SLAB_RED_ZONE) | 538 | if (s->flags & SLAB_RED_ZONE) |
@@ -558,7 +561,7 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data, | |||
558 | 561 | ||
559 | static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | 562 | static int check_bytes_and_report(struct kmem_cache *s, struct page *page, |
560 | u8 *object, char *what, | 563 | u8 *object, char *what, |
561 | u8* start, unsigned int value, unsigned int bytes) | 564 | u8 *start, unsigned int value, unsigned int bytes) |
562 | { | 565 | { |
563 | u8 *fault; | 566 | u8 *fault; |
564 | u8 *end; | 567 | u8 *end; |
@@ -692,7 +695,7 @@ static int check_object(struct kmem_cache *s, struct page *page, | |||
692 | (!check_bytes_and_report(s, page, p, "Poison", p, | 695 | (!check_bytes_and_report(s, page, p, "Poison", p, |
693 | POISON_FREE, s->objsize - 1) || | 696 | POISON_FREE, s->objsize - 1) || |
694 | !check_bytes_and_report(s, page, p, "Poison", | 697 | !check_bytes_and_report(s, page, p, "Poison", |
695 | p + s->objsize -1, POISON_END, 1))) | 698 | p + s->objsize - 1, POISON_END, 1))) |
696 | return 0; | 699 | return 0; |
697 | /* | 700 | /* |
698 | * check_pad_bytes cleans up on its own. | 701 | * check_pad_bytes cleans up on its own. |
@@ -900,8 +903,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, | |||
900 | "SLUB <none>: no slab for object 0x%p.\n", | 903 | "SLUB <none>: no slab for object 0x%p.\n", |
901 | object); | 904 | object); |
902 | dump_stack(); | 905 | dump_stack(); |
903 | } | 906 | } else |
904 | else | ||
905 | object_err(s, page, object, | 907 | object_err(s, page, object, |
906 | "page slab pointer corrupt."); | 908 | "page slab pointer corrupt."); |
907 | goto fail; | 909 | goto fail; |
@@ -947,7 +949,7 @@ static int __init setup_slub_debug(char *str) | |||
947 | /* | 949 | /* |
948 | * Determine which debug features should be switched on | 950 | * Determine which debug features should be switched on |
949 | */ | 951 | */ |
950 | for ( ;*str && *str != ','; str++) { | 952 | for (; *str && *str != ','; str++) { |
951 | switch (tolower(*str)) { | 953 | switch (tolower(*str)) { |
952 | case 'f': | 954 | case 'f': |
953 | slub_debug |= SLAB_DEBUG_FREE; | 955 | slub_debug |= SLAB_DEBUG_FREE; |
@@ -966,7 +968,7 @@ static int __init setup_slub_debug(char *str) | |||
966 | break; | 968 | break; |
967 | default: | 969 | default: |
968 | printk(KERN_ERR "slub_debug option '%c' " | 970 | printk(KERN_ERR "slub_debug option '%c' " |
969 | "unknown. skipped\n",*str); | 971 | "unknown. skipped\n", *str); |
970 | } | 972 | } |
971 | } | 973 | } |
972 | 974 | ||
@@ -1039,7 +1041,7 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize, | |||
1039 | */ | 1041 | */ |
1040 | static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | 1042 | static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) |
1041 | { | 1043 | { |
1042 | struct page * page; | 1044 | struct page *page; |
1043 | int pages = 1 << s->order; | 1045 | int pages = 1 << s->order; |
1044 | 1046 | ||
1045 | if (s->order) | 1047 | if (s->order) |
@@ -1135,7 +1137,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1135 | mod_zone_page_state(page_zone(page), | 1137 | mod_zone_page_state(page_zone(page), |
1136 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1138 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
1137 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1139 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
1138 | - pages); | 1140 | -pages); |
1139 | 1141 | ||
1140 | __free_pages(page, s->order); | 1142 | __free_pages(page, s->order); |
1141 | } | 1143 | } |
@@ -1195,19 +1197,15 @@ static __always_inline int slab_trylock(struct page *page) | |||
1195 | /* | 1197 | /* |
1196 | * Management of partially allocated slabs | 1198 | * Management of partially allocated slabs |
1197 | */ | 1199 | */ |
1198 | static void add_partial_tail(struct kmem_cache_node *n, struct page *page) | 1200 | static void add_partial(struct kmem_cache_node *n, |
1199 | { | 1201 | struct page *page, int tail) |
1200 | spin_lock(&n->list_lock); | ||
1201 | n->nr_partial++; | ||
1202 | list_add_tail(&page->lru, &n->partial); | ||
1203 | spin_unlock(&n->list_lock); | ||
1204 | } | ||
1205 | |||
1206 | static void add_partial(struct kmem_cache_node *n, struct page *page) | ||
1207 | { | 1202 | { |
1208 | spin_lock(&n->list_lock); | 1203 | spin_lock(&n->list_lock); |
1209 | n->nr_partial++; | 1204 | n->nr_partial++; |
1210 | list_add(&page->lru, &n->partial); | 1205 | if (tail) |
1206 | list_add_tail(&page->lru, &n->partial); | ||
1207 | else | ||
1208 | list_add(&page->lru, &n->partial); | ||
1211 | spin_unlock(&n->list_lock); | 1209 | spin_unlock(&n->list_lock); |
1212 | } | 1210 | } |
1213 | 1211 | ||
@@ -1292,7 +1290,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1292 | * expensive if we do it every time we are trying to find a slab | 1290 | * expensive if we do it every time we are trying to find a slab |
1293 | * with available objects. | 1291 | * with available objects. |
1294 | */ | 1292 | */ |
1295 | if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) | 1293 | if (!s->remote_node_defrag_ratio || |
1294 | get_cycles() % 1024 > s->remote_node_defrag_ratio) | ||
1296 | return NULL; | 1295 | return NULL; |
1297 | 1296 | ||
1298 | zonelist = &NODE_DATA(slab_node(current->mempolicy)) | 1297 | zonelist = &NODE_DATA(slab_node(current->mempolicy)) |
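remote_node_defrag_ratio is the old defrag_ratio under a clearer name: it throttles how often an allocation that misses on the local node will go scan other nodes' partial lists. The store routine keeps ten times the sysfs value and get_any_partial() compares it against get_cycles() % 1024, so the default stored value of 100 allows a remote scan on roughly 10% of such misses. A small userspace illustration of that mapping, assuming get_cycles() % 1024 behaves roughly uniformly:

#include <stdio.h>

/*
 * Sketch: remote_node_defrag_ratio_store() keeps sysfs_val * 10, and
 * get_any_partial() proceeds only when get_cycles() % 1024 does not
 * exceed the stored value.
 */
int main(void)
{
        for (int sysfs_val = 0; sysfs_val <= 90; sysfs_val += 30) {
                int stored = sysfs_val * 10;

                printf("echo %2d -> stored %3d -> remote scan ~%4.1f%% of misses\n",
                       sysfs_val, stored, 100.0 * stored / 1024);
        }
        return 0;
}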
@@ -1335,7 +1334,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | |||
1335 | * | 1334 | * |
1336 | * On exit the slab lock will have been dropped. | 1335 | * On exit the slab lock will have been dropped. |
1337 | */ | 1336 | */ |
1338 | static void unfreeze_slab(struct kmem_cache *s, struct page *page) | 1337 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) |
1339 | { | 1338 | { |
1340 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | 1339 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1341 | 1340 | ||
@@ -1343,7 +1342,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page) | |||
1343 | if (page->inuse) { | 1342 | if (page->inuse) { |
1344 | 1343 | ||
1345 | if (page->freelist) | 1344 | if (page->freelist) |
1346 | add_partial(n, page); | 1345 | add_partial(n, page, tail); |
1347 | else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) | 1346 | else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) |
1348 | add_full(n, page); | 1347 | add_full(n, page); |
1349 | slab_unlock(page); | 1348 | slab_unlock(page); |
@@ -1358,7 +1357,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page) | |||
1358 | * partial list stays small. kmem_cache_shrink can | 1357 | * partial list stays small. kmem_cache_shrink can |
1359 | * reclaim empty slabs from the partial list. | 1358 | * reclaim empty slabs from the partial list. |
1360 | */ | 1359 | */ |
1361 | add_partial_tail(n, page); | 1360 | add_partial(n, page, 1); |
1362 | slab_unlock(page); | 1361 | slab_unlock(page); |
1363 | } else { | 1362 | } else { |
1364 | slab_unlock(page); | 1363 | slab_unlock(page); |
@@ -1373,6 +1372,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page) | |||
1373 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1372 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1374 | { | 1373 | { |
1375 | struct page *page = c->page; | 1374 | struct page *page = c->page; |
1375 | int tail = 1; | ||
1376 | /* | 1376 | /* |
1377 | * Merge cpu freelist into freelist. Typically we get here | 1377 | * Merge cpu freelist into freelist. Typically we get here |
1378 | * because both freelists are empty. So this is unlikely | 1378 | * because both freelists are empty. So this is unlikely |
@@ -1381,6 +1381,8 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1381 | while (unlikely(c->freelist)) { | 1381 | while (unlikely(c->freelist)) { |
1382 | void **object; | 1382 | void **object; |
1383 | 1383 | ||
1384 | tail = 0; /* Hot objects. Put the slab first */ | ||
1385 | |||
1384 | /* Retrieve object from cpu_freelist */ | 1386 | /* Retrieve object from cpu_freelist */ |
1385 | object = c->freelist; | 1387 | object = c->freelist; |
1386 | c->freelist = c->freelist[c->offset]; | 1388 | c->freelist = c->freelist[c->offset]; |
@@ -1391,7 +1393,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1391 | page->inuse--; | 1393 | page->inuse--; |
1392 | } | 1394 | } |
1393 | c->page = NULL; | 1395 | c->page = NULL; |
1394 | unfreeze_slab(s, page); | 1396 | unfreeze_slab(s, page, tail); |
1395 | } | 1397 | } |
1396 | 1398 | ||
1397 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1399 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
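add_partial_tail() is folded into add_partial() with a tail argument, so callers state explicitly whether a slab's remaining objects are expected to be cache-hot (head of the partial list) or cold (tail). A condensed view of how deactivate_slab() makes that call, assuming the details shown in the hunks above:

int tail = 1;                           /* default: treat the slab as cold   */

while (unlikely(c->freelist)) {
        tail = 0;                       /* cpu freelist held hot objects     */
        /* ... move one object back onto the page freelist ... */
}
c->page = NULL;
unfreeze_slab(s, page, tail);           /* -> add_partial(n, page, tail)     */
                                        /* tail=0: list_add (head of list)   */
                                        /* tail=1: list_add_tail (cold end)  */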
@@ -1539,7 +1541,7 @@ debug: | |||
1539 | * | 1541 | * |
1540 | * Otherwise we can simply pick the next object from the lockless free list. | 1542 | * Otherwise we can simply pick the next object from the lockless free list. |
1541 | */ | 1543 | */ |
1542 | static void __always_inline *slab_alloc(struct kmem_cache *s, | 1544 | static __always_inline void *slab_alloc(struct kmem_cache *s, |
1543 | gfp_t gfpflags, int node, void *addr) | 1545 | gfp_t gfpflags, int node, void *addr) |
1544 | { | 1546 | { |
1545 | void **object; | 1547 | void **object; |
@@ -1613,7 +1615,7 @@ checks_ok: | |||
1613 | * then add it. | 1615 | * then add it. |
1614 | */ | 1616 | */ |
1615 | if (unlikely(!prior)) | 1617 | if (unlikely(!prior)) |
1616 | add_partial_tail(get_node(s, page_to_nid(page)), page); | 1618 | add_partial(get_node(s, page_to_nid(page)), page, 1); |
1617 | 1619 | ||
1618 | out_unlock: | 1620 | out_unlock: |
1619 | slab_unlock(page); | 1621 | slab_unlock(page); |
@@ -1647,7 +1649,7 @@ debug: | |||
1647 | * If fastpath is not possible then fall back to __slab_free where we deal | 1649 | * If fastpath is not possible then fall back to __slab_free where we deal |
1648 | * with all sorts of special processing. | 1650 | * with all sorts of special processing. |
1649 | */ | 1651 | */ |
1650 | static void __always_inline slab_free(struct kmem_cache *s, | 1652 | static __always_inline void slab_free(struct kmem_cache *s, |
1651 | struct page *page, void *x, void *addr) | 1653 | struct page *page, void *x, void *addr) |
1652 | { | 1654 | { |
1653 | void **object = (void *)x; | 1655 | void **object = (void *)x; |
@@ -1997,6 +1999,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, | |||
1997 | { | 1999 | { |
1998 | struct page *page; | 2000 | struct page *page; |
1999 | struct kmem_cache_node *n; | 2001 | struct kmem_cache_node *n; |
2002 | unsigned long flags; | ||
2000 | 2003 | ||
2001 | BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); | 2004 | BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); |
2002 | 2005 | ||
@@ -2021,7 +2024,14 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, | |||
2021 | #endif | 2024 | #endif |
2022 | init_kmem_cache_node(n); | 2025 | init_kmem_cache_node(n); |
2023 | atomic_long_inc(&n->nr_slabs); | 2026 | atomic_long_inc(&n->nr_slabs); |
2024 | add_partial(n, page); | 2027 | /* |
2028 | * lockdep requires consistent irq usage for each lock | ||
2029 | * so even though there cannot be a race this early in | ||
2030 | * the boot sequence, we still disable irqs. | ||
2031 | */ | ||
2032 | local_irq_save(flags); | ||
2033 | add_partial(n, page, 0); | ||
2034 | local_irq_restore(flags); | ||
2025 | return n; | 2035 | return n; |
2026 | } | 2036 | } |
2027 | 2037 | ||
@@ -2206,7 +2216,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
2206 | 2216 | ||
2207 | s->refcount = 1; | 2217 | s->refcount = 1; |
2208 | #ifdef CONFIG_NUMA | 2218 | #ifdef CONFIG_NUMA |
2209 | s->defrag_ratio = 100; | 2219 | s->remote_node_defrag_ratio = 100; |
2210 | #endif | 2220 | #endif |
2211 | if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) | 2221 | if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) |
2212 | goto error; | 2222 | goto error; |
@@ -2228,7 +2238,7 @@ error: | |||
2228 | */ | 2238 | */ |
2229 | int kmem_ptr_validate(struct kmem_cache *s, const void *object) | 2239 | int kmem_ptr_validate(struct kmem_cache *s, const void *object) |
2230 | { | 2240 | { |
2231 | struct page * page; | 2241 | struct page *page; |
2232 | 2242 | ||
2233 | page = get_object_page(object); | 2243 | page = get_object_page(object); |
2234 | 2244 | ||
@@ -2322,7 +2332,6 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
2322 | if (kmem_cache_close(s)) | 2332 | if (kmem_cache_close(s)) |
2323 | WARN_ON(1); | 2333 | WARN_ON(1); |
2324 | sysfs_slab_remove(s); | 2334 | sysfs_slab_remove(s); |
2325 | kfree(s); | ||
2326 | } else | 2335 | } else |
2327 | up_write(&slub_lock); | 2336 | up_write(&slub_lock); |
2328 | } | 2337 | } |
@@ -2341,7 +2350,7 @@ static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; | |||
2341 | 2350 | ||
2342 | static int __init setup_slub_min_order(char *str) | 2351 | static int __init setup_slub_min_order(char *str) |
2343 | { | 2352 | { |
2344 | get_option (&str, &slub_min_order); | 2353 | get_option(&str, &slub_min_order); |
2345 | 2354 | ||
2346 | return 1; | 2355 | return 1; |
2347 | } | 2356 | } |
@@ -2350,7 +2359,7 @@ __setup("slub_min_order=", setup_slub_min_order); | |||
2350 | 2359 | ||
2351 | static int __init setup_slub_max_order(char *str) | 2360 | static int __init setup_slub_max_order(char *str) |
2352 | { | 2361 | { |
2353 | get_option (&str, &slub_max_order); | 2362 | get_option(&str, &slub_max_order); |
2354 | 2363 | ||
2355 | return 1; | 2364 | return 1; |
2356 | } | 2365 | } |
@@ -2359,7 +2368,7 @@ __setup("slub_max_order=", setup_slub_max_order); | |||
2359 | 2368 | ||
2360 | static int __init setup_slub_min_objects(char *str) | 2369 | static int __init setup_slub_min_objects(char *str) |
2361 | { | 2370 | { |
2362 | get_option (&str, &slub_min_objects); | 2371 | get_option(&str, &slub_min_objects); |
2363 | 2372 | ||
2364 | return 1; | 2373 | return 1; |
2365 | } | 2374 | } |
@@ -2605,6 +2614,19 @@ void kfree(const void *x) | |||
2605 | } | 2614 | } |
2606 | EXPORT_SYMBOL(kfree); | 2615 | EXPORT_SYMBOL(kfree); |
2607 | 2616 | ||
2617 | static unsigned long count_partial(struct kmem_cache_node *n) | ||
2618 | { | ||
2619 | unsigned long flags; | ||
2620 | unsigned long x = 0; | ||
2621 | struct page *page; | ||
2622 | |||
2623 | spin_lock_irqsave(&n->list_lock, flags); | ||
2624 | list_for_each_entry(page, &n->partial, lru) | ||
2625 | x += page->inuse; | ||
2626 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2627 | return x; | ||
2628 | } | ||
2629 | |||
2608 | /* | 2630 | /* |
2609 | * kmem_cache_shrink removes empty slabs from the partial lists and sorts | 2631 | * kmem_cache_shrink removes empty slabs from the partial lists and sorts |
2610 | * the remaining slabs by the number of items in use. The slabs with the | 2632 | * the remaining slabs by the number of items in use. The slabs with the |
@@ -2931,7 +2953,7 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
2931 | * Check if alignment is compatible. | 2953 | * Check if alignment is compatible. |
2932 | * Courtesy of Adrian Drzewiecki | 2954 | * Courtesy of Adrian Drzewiecki |
2933 | */ | 2955 | */ |
2934 | if ((s->size & ~(align -1)) != s->size) | 2956 | if ((s->size & ~(align - 1)) != s->size) |
2935 | continue; | 2957 | continue; |
2936 | 2958 | ||
2937 | if (s->size - size >= sizeof(void *)) | 2959 | if (s->size - size >= sizeof(void *)) |
@@ -3040,8 +3062,9 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
3040 | return NOTIFY_OK; | 3062 | return NOTIFY_OK; |
3041 | } | 3063 | } |
3042 | 3064 | ||
3043 | static struct notifier_block __cpuinitdata slab_notifier = | 3065 | static struct notifier_block __cpuinitdata slab_notifier = { |
3044 | { &slab_cpuup_callback, NULL, 0 }; | 3066 | &slab_cpuup_callback, NULL, 0 |
3067 | }; | ||
3045 | 3068 | ||
3046 | #endif | 3069 | #endif |
3047 | 3070 | ||
@@ -3076,19 +3099,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | |||
3076 | return slab_alloc(s, gfpflags, node, caller); | 3099 | return slab_alloc(s, gfpflags, node, caller); |
3077 | } | 3100 | } |
3078 | 3101 | ||
3079 | static unsigned long count_partial(struct kmem_cache_node *n) | ||
3080 | { | ||
3081 | unsigned long flags; | ||
3082 | unsigned long x = 0; | ||
3083 | struct page *page; | ||
3084 | |||
3085 | spin_lock_irqsave(&n->list_lock, flags); | ||
3086 | list_for_each_entry(page, &n->partial, lru) | ||
3087 | x += page->inuse; | ||
3088 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
3089 | return x; | ||
3090 | } | ||
3091 | |||
3092 | #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) | 3102 | #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) |
3093 | static int validate_slab(struct kmem_cache *s, struct page *page, | 3103 | static int validate_slab(struct kmem_cache *s, struct page *page, |
3094 | unsigned long *map) | 3104 | unsigned long *map) |
@@ -3390,7 +3400,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, | |||
3390 | static int list_locations(struct kmem_cache *s, char *buf, | 3400 | static int list_locations(struct kmem_cache *s, char *buf, |
3391 | enum track_item alloc) | 3401 | enum track_item alloc) |
3392 | { | 3402 | { |
3393 | int n = 0; | 3403 | int len = 0; |
3394 | unsigned long i; | 3404 | unsigned long i; |
3395 | struct loc_track t = { 0, 0, NULL }; | 3405 | struct loc_track t = { 0, 0, NULL }; |
3396 | int node; | 3406 | int node; |
@@ -3421,54 +3431,54 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
3421 | for (i = 0; i < t.count; i++) { | 3431 | for (i = 0; i < t.count; i++) { |
3422 | struct location *l = &t.loc[i]; | 3432 | struct location *l = &t.loc[i]; |
3423 | 3433 | ||
3424 | if (n > PAGE_SIZE - 100) | 3434 | if (len > PAGE_SIZE - 100) |
3425 | break; | 3435 | break; |
3426 | n += sprintf(buf + n, "%7ld ", l->count); | 3436 | len += sprintf(buf + len, "%7ld ", l->count); |
3427 | 3437 | ||
3428 | if (l->addr) | 3438 | if (l->addr) |
3429 | n += sprint_symbol(buf + n, (unsigned long)l->addr); | 3439 | len += sprint_symbol(buf + len, (unsigned long)l->addr); |
3430 | else | 3440 | else |
3431 | n += sprintf(buf + n, "<not-available>"); | 3441 | len += sprintf(buf + len, "<not-available>"); |
3432 | 3442 | ||
3433 | if (l->sum_time != l->min_time) { | 3443 | if (l->sum_time != l->min_time) { |
3434 | unsigned long remainder; | 3444 | unsigned long remainder; |
3435 | 3445 | ||
3436 | n += sprintf(buf + n, " age=%ld/%ld/%ld", | 3446 | len += sprintf(buf + len, " age=%ld/%ld/%ld", |
3437 | l->min_time, | 3447 | l->min_time, |
3438 | div_long_long_rem(l->sum_time, l->count, &remainder), | 3448 | div_long_long_rem(l->sum_time, l->count, &remainder), |
3439 | l->max_time); | 3449 | l->max_time); |
3440 | } else | 3450 | } else |
3441 | n += sprintf(buf + n, " age=%ld", | 3451 | len += sprintf(buf + len, " age=%ld", |
3442 | l->min_time); | 3452 | l->min_time); |
3443 | 3453 | ||
3444 | if (l->min_pid != l->max_pid) | 3454 | if (l->min_pid != l->max_pid) |
3445 | n += sprintf(buf + n, " pid=%ld-%ld", | 3455 | len += sprintf(buf + len, " pid=%ld-%ld", |
3446 | l->min_pid, l->max_pid); | 3456 | l->min_pid, l->max_pid); |
3447 | else | 3457 | else |
3448 | n += sprintf(buf + n, " pid=%ld", | 3458 | len += sprintf(buf + len, " pid=%ld", |
3449 | l->min_pid); | 3459 | l->min_pid); |
3450 | 3460 | ||
3451 | if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && | 3461 | if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && |
3452 | n < PAGE_SIZE - 60) { | 3462 | len < PAGE_SIZE - 60) { |
3453 | n += sprintf(buf + n, " cpus="); | 3463 | len += sprintf(buf + len, " cpus="); |
3454 | n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, | 3464 | len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, |
3455 | l->cpus); | 3465 | l->cpus); |
3456 | } | 3466 | } |
3457 | 3467 | ||
3458 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && | 3468 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && |
3459 | n < PAGE_SIZE - 60) { | 3469 | len < PAGE_SIZE - 60) { |
3460 | n += sprintf(buf + n, " nodes="); | 3470 | len += sprintf(buf + len, " nodes="); |
3461 | n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, | 3471 | len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, |
3462 | l->nodes); | 3472 | l->nodes); |
3463 | } | 3473 | } |
3464 | 3474 | ||
3465 | n += sprintf(buf + n, "\n"); | 3475 | len += sprintf(buf + len, "\n"); |
3466 | } | 3476 | } |
3467 | 3477 | ||
3468 | free_loc_track(&t); | 3478 | free_loc_track(&t); |
3469 | if (!t.count) | 3479 | if (!t.count) |
3470 | n += sprintf(buf, "No data\n"); | 3480 | len += sprintf(buf, "No data\n"); |
3471 | return n; | 3481 | return len; |
3472 | } | 3482 | } |
3473 | 3483 | ||
3474 | enum slab_stat_type { | 3484 | enum slab_stat_type { |
@@ -3498,7 +3508,6 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
3498 | 3508 | ||
3499 | for_each_possible_cpu(cpu) { | 3509 | for_each_possible_cpu(cpu) { |
3500 | struct page *page; | 3510 | struct page *page; |
3501 | int node; | ||
3502 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | 3511 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); |
3503 | 3512 | ||
3504 | if (!c) | 3513 | if (!c) |
@@ -3510,8 +3519,6 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
3510 | continue; | 3519 | continue; |
3511 | if (page) { | 3520 | if (page) { |
3512 | if (flags & SO_CPU) { | 3521 | if (flags & SO_CPU) { |
3513 | int x = 0; | ||
3514 | |||
3515 | if (flags & SO_OBJECTS) | 3522 | if (flags & SO_OBJECTS) |
3516 | x = page->inuse; | 3523 | x = page->inuse; |
3517 | else | 3524 | else |
@@ -3848,24 +3855,24 @@ static ssize_t free_calls_show(struct kmem_cache *s, char *buf) | |||
3848 | SLAB_ATTR_RO(free_calls); | 3855 | SLAB_ATTR_RO(free_calls); |
3849 | 3856 | ||
3850 | #ifdef CONFIG_NUMA | 3857 | #ifdef CONFIG_NUMA |
3851 | static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) | 3858 | static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) |
3852 | { | 3859 | { |
3853 | return sprintf(buf, "%d\n", s->defrag_ratio / 10); | 3860 | return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); |
3854 | } | 3861 | } |
3855 | 3862 | ||
3856 | static ssize_t defrag_ratio_store(struct kmem_cache *s, | 3863 | static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, |
3857 | const char *buf, size_t length) | 3864 | const char *buf, size_t length) |
3858 | { | 3865 | { |
3859 | int n = simple_strtoul(buf, NULL, 10); | 3866 | int n = simple_strtoul(buf, NULL, 10); |
3860 | 3867 | ||
3861 | if (n < 100) | 3868 | if (n < 100) |
3862 | s->defrag_ratio = n * 10; | 3869 | s->remote_node_defrag_ratio = n * 10; |
3863 | return length; | 3870 | return length; |
3864 | } | 3871 | } |
3865 | SLAB_ATTR(defrag_ratio); | 3872 | SLAB_ATTR(remote_node_defrag_ratio); |
3866 | #endif | 3873 | #endif |
3867 | 3874 | ||
3868 | static struct attribute * slab_attrs[] = { | 3875 | static struct attribute *slab_attrs[] = { |
3869 | &slab_size_attr.attr, | 3876 | &slab_size_attr.attr, |
3870 | &object_size_attr.attr, | 3877 | &object_size_attr.attr, |
3871 | &objs_per_slab_attr.attr, | 3878 | &objs_per_slab_attr.attr, |
@@ -3893,7 +3900,7 @@ static struct attribute * slab_attrs[] = { | |||
3893 | &cache_dma_attr.attr, | 3900 | &cache_dma_attr.attr, |
3894 | #endif | 3901 | #endif |
3895 | #ifdef CONFIG_NUMA | 3902 | #ifdef CONFIG_NUMA |
3896 | &defrag_ratio_attr.attr, | 3903 | &remote_node_defrag_ratio_attr.attr, |
3897 | #endif | 3904 | #endif |
3898 | NULL | 3905 | NULL |
3899 | }; | 3906 | }; |
@@ -3940,6 +3947,13 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
3940 | return err; | 3947 | return err; |
3941 | } | 3948 | } |
3942 | 3949 | ||
3950 | static void kmem_cache_release(struct kobject *kobj) | ||
3951 | { | ||
3952 | struct kmem_cache *s = to_slab(kobj); | ||
3953 | |||
3954 | kfree(s); | ||
3955 | } | ||
3956 | |||
3943 | static struct sysfs_ops slab_sysfs_ops = { | 3957 | static struct sysfs_ops slab_sysfs_ops = { |
3944 | .show = slab_attr_show, | 3958 | .show = slab_attr_show, |
3945 | .store = slab_attr_store, | 3959 | .store = slab_attr_store, |
@@ -3947,6 +3961,7 @@ static struct sysfs_ops slab_sysfs_ops = { | |||
3947 | 3961 | ||
3948 | static struct kobj_type slab_ktype = { | 3962 | static struct kobj_type slab_ktype = { |
3949 | .sysfs_ops = &slab_sysfs_ops, | 3963 | .sysfs_ops = &slab_sysfs_ops, |
3964 | .release = kmem_cache_release | ||
3950 | }; | 3965 | }; |
3951 | 3966 | ||
3952 | static int uevent_filter(struct kset *kset, struct kobject *kobj) | 3967 | static int uevent_filter(struct kset *kset, struct kobject *kobj) |
@@ -4048,6 +4063,7 @@ static void sysfs_slab_remove(struct kmem_cache *s) | |||
4048 | { | 4063 | { |
4049 | kobject_uevent(&s->kobj, KOBJ_REMOVE); | 4064 | kobject_uevent(&s->kobj, KOBJ_REMOVE); |
4050 | kobject_del(&s->kobj); | 4065 | kobject_del(&s->kobj); |
4066 | kobject_put(&s->kobj); | ||
4051 | } | 4067 | } |
4052 | 4068 | ||
4053 | /* | 4069 | /* |
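The slub.c hunks above give slab_ktype a release callback and add a kobject_put() to sysfs_slab_remove(), so the struct kmem_cache is freed only once its last sysfs reference is dropped. A hedged illustration of that kobject lifetime pattern follows; the my_cache names are invented for this sketch, and kobject_init_and_add() with my_cache_ktype is assumed to have been done elsewhere.

	#include <linux/kobject.h>
	#include <linux/slab.h>

	struct my_cache {			/* stand-in for struct kmem_cache */
		struct kobject kobj;
		/* ... cache-specific fields ... */
	};

	static void my_cache_release(struct kobject *kobj)
	{
		/* runs only when the kobject refcount drops to zero */
		kfree(container_of(kobj, struct my_cache, kobj));
	}

	static struct kobj_type my_cache_ktype = {
		.release = my_cache_release,
	};

	static void my_cache_remove(struct my_cache *c)
	{
		kobject_uevent(&c->kobj, KOBJ_REMOVE);
		kobject_del(&c->kobj);		/* unlink from sysfs */
		kobject_put(&c->kobj);		/* last ref gone -> ->release() runs */
	}

Without that final kobject_put() the release callback never fires and the cache structure leaks, which appears to be what the sysfs_slab_remove() change addresses.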
diff --git a/mm/sparse.c b/mm/sparse.c index a2183cb5d524..f6a43c09c322 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -237,7 +237,7 @@ static unsigned long *__kmalloc_section_usemap(void) | |||
237 | } | 237 | } |
238 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 238 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
239 | 239 | ||
240 | static unsigned long *sparse_early_usemap_alloc(unsigned long pnum) | 240 | static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) |
241 | { | 241 | { |
242 | unsigned long *usemap; | 242 | unsigned long *usemap; |
243 | struct mem_section *ms = __nr_to_section(pnum); | 243 | struct mem_section *ms = __nr_to_section(pnum); |
@@ -353,17 +353,9 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | |||
353 | return __kmalloc_section_memmap(nr_pages); | 353 | return __kmalloc_section_memmap(nr_pages); |
354 | } | 354 | } |
355 | 355 | ||
356 | static int vaddr_in_vmalloc_area(void *addr) | ||
357 | { | ||
358 | if (addr >= (void *)VMALLOC_START && | ||
359 | addr < (void *)VMALLOC_END) | ||
360 | return 1; | ||
361 | return 0; | ||
362 | } | ||
363 | |||
364 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | 356 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) |
365 | { | 357 | { |
366 | if (vaddr_in_vmalloc_area(memmap)) | 358 | if (is_vmalloc_addr(memmap)) |
367 | vfree(memmap); | 359 | vfree(memmap); |
368 | else | 360 | else |
369 | free_pages((unsigned long)memmap, | 361 | free_pages((unsigned long)memmap, |
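The sparse.c hunk above drops the file-local vaddr_in_vmalloc_area() in favour of the generic is_vmalloc_addr() helper. As a hedged sketch (the real helper lives in include/linux/mm.h and also covers the !CONFIG_MMU case), it reduces to the same range check that was just deleted:

	static inline int is_vmalloc_addr_sketch(const void *x)
	{
		unsigned long addr = (unsigned long)x;

		return addr >= VMALLOC_START && addr < VMALLOC_END;
	}

so __kfree_section_memmap() keeps its behaviour while losing the duplicated check.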
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/cpu.h> | 29 | #include <linux/cpu.h> |
30 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
31 | #include <linux/backing-dev.h> | 31 | #include <linux/backing-dev.h> |
32 | #include <linux/memcontrol.h> | ||
32 | 33 | ||
33 | /* How many pages do we try to swap or page in/out together? */ | 34 | /* How many pages do we try to swap or page in/out together? */ |
34 | int page_cluster; | 35 | int page_cluster; |
@@ -41,7 +42,7 @@ static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; | |||
41 | * This path almost never happens for VM activity - pages are normally | 42 | * This path almost never happens for VM activity - pages are normally |
42 | * freed via pagevecs. But it gets used by networking. | 43 | * freed via pagevecs. But it gets used by networking. |
43 | */ | 44 | */ |
44 | static void fastcall __page_cache_release(struct page *page) | 45 | static void __page_cache_release(struct page *page) |
45 | { | 46 | { |
46 | if (PageLRU(page)) { | 47 | if (PageLRU(page)) { |
47 | unsigned long flags; | 48 | unsigned long flags; |
@@ -165,7 +166,7 @@ int rotate_reclaimable_page(struct page *page) | |||
165 | /* | 166 | /* |
166 | * FIXME: speed this up? | 167 | * FIXME: speed this up? |
167 | */ | 168 | */ |
168 | void fastcall activate_page(struct page *page) | 169 | void activate_page(struct page *page) |
169 | { | 170 | { |
170 | struct zone *zone = page_zone(page); | 171 | struct zone *zone = page_zone(page); |
171 | 172 | ||
@@ -175,6 +176,7 @@ void fastcall activate_page(struct page *page) | |||
175 | SetPageActive(page); | 176 | SetPageActive(page); |
176 | add_page_to_active_list(zone, page); | 177 | add_page_to_active_list(zone, page); |
177 | __count_vm_event(PGACTIVATE); | 178 | __count_vm_event(PGACTIVATE); |
179 | mem_cgroup_move_lists(page_get_page_cgroup(page), true); | ||
178 | } | 180 | } |
179 | spin_unlock_irq(&zone->lru_lock); | 181 | spin_unlock_irq(&zone->lru_lock); |
180 | } | 182 | } |
@@ -186,7 +188,7 @@ void fastcall activate_page(struct page *page) | |||
186 | * inactive,referenced -> active,unreferenced | 188 | * inactive,referenced -> active,unreferenced |
187 | * active,unreferenced -> active,referenced | 189 | * active,unreferenced -> active,referenced |
188 | */ | 190 | */ |
189 | void fastcall mark_page_accessed(struct page *page) | 191 | void mark_page_accessed(struct page *page) |
190 | { | 192 | { |
191 | if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { | 193 | if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { |
192 | activate_page(page); | 194 | activate_page(page); |
@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mark_page_accessed); | |||
202 | * lru_cache_add: add a page to the page lists | 204 | * lru_cache_add: add a page to the page lists |
203 | * @page: the page to add | 205 | * @page: the page to add |
204 | */ | 206 | */ |
205 | void fastcall lru_cache_add(struct page *page) | 207 | void lru_cache_add(struct page *page) |
206 | { | 208 | { |
207 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); | 209 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); |
208 | 210 | ||
@@ -212,7 +214,7 @@ void fastcall lru_cache_add(struct page *page) | |||
212 | put_cpu_var(lru_add_pvecs); | 214 | put_cpu_var(lru_add_pvecs); |
213 | } | 215 | } |
214 | 216 | ||
215 | void fastcall lru_cache_add_active(struct page *page) | 217 | void lru_cache_add_active(struct page *page) |
216 | { | 218 | { |
217 | struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); | 219 | struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); |
218 | 220 | ||
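To make the mark_page_accessed() transitions documented above concrete (the body of the function is unchanged by this diff): a page on the inactive list that is touched once only gets PG_referenced set; a second touch finds it inactive, referenced and on the LRU, so activate_page() moves it to the active list, now also notifying the memory controller via mem_cgroup_move_lists(), and clears PG_referenced; a third touch merely sets PG_referenced again on the now-active page.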
diff --git a/mm/swap_state.c b/mm/swap_state.c index b52635601dfe..ec42f01a8d02 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
11 | #include <linux/kernel_stat.h> | 11 | #include <linux/kernel_stat.h> |
12 | #include <linux/swap.h> | 12 | #include <linux/swap.h> |
13 | #include <linux/swapops.h> | ||
13 | #include <linux/init.h> | 14 | #include <linux/init.h> |
14 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
15 | #include <linux/buffer_head.h> | 16 | #include <linux/buffer_head.h> |
@@ -51,26 +52,22 @@ static struct { | |||
51 | unsigned long del_total; | 52 | unsigned long del_total; |
52 | unsigned long find_success; | 53 | unsigned long find_success; |
53 | unsigned long find_total; | 54 | unsigned long find_total; |
54 | unsigned long noent_race; | ||
55 | unsigned long exist_race; | ||
56 | } swap_cache_info; | 55 | } swap_cache_info; |
57 | 56 | ||
58 | void show_swap_cache_info(void) | 57 | void show_swap_cache_info(void) |
59 | { | 58 | { |
60 | printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n", | 59 | printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", |
61 | swap_cache_info.add_total, swap_cache_info.del_total, | 60 | swap_cache_info.add_total, swap_cache_info.del_total, |
62 | swap_cache_info.find_success, swap_cache_info.find_total, | 61 | swap_cache_info.find_success, swap_cache_info.find_total); |
63 | swap_cache_info.noent_race, swap_cache_info.exist_race); | ||
64 | printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); | 62 | printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); |
65 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); | 63 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); |
66 | } | 64 | } |
67 | 65 | ||
68 | /* | 66 | /* |
69 | * __add_to_swap_cache resembles add_to_page_cache on swapper_space, | 67 | * add_to_swap_cache resembles add_to_page_cache on swapper_space, |
70 | * but sets SwapCache flag and private instead of mapping and index. | 68 | * but sets SwapCache flag and private instead of mapping and index. |
71 | */ | 69 | */ |
72 | static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | 70 | int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) |
73 | gfp_t gfp_mask) | ||
74 | { | 71 | { |
75 | int error; | 72 | int error; |
76 | 73 | ||
@@ -88,6 +85,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
88 | set_page_private(page, entry.val); | 85 | set_page_private(page, entry.val); |
89 | total_swapcache_pages++; | 86 | total_swapcache_pages++; |
90 | __inc_zone_page_state(page, NR_FILE_PAGES); | 87 | __inc_zone_page_state(page, NR_FILE_PAGES); |
88 | INC_CACHE_INFO(add_total); | ||
91 | } | 89 | } |
92 | write_unlock_irq(&swapper_space.tree_lock); | 90 | write_unlock_irq(&swapper_space.tree_lock); |
93 | radix_tree_preload_end(); | 91 | radix_tree_preload_end(); |
@@ -95,31 +93,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
95 | return error; | 93 | return error; |
96 | } | 94 | } |
97 | 95 | ||
98 | static int add_to_swap_cache(struct page *page, swp_entry_t entry) | ||
99 | { | ||
100 | int error; | ||
101 | |||
102 | BUG_ON(PageLocked(page)); | ||
103 | if (!swap_duplicate(entry)) { | ||
104 | INC_CACHE_INFO(noent_race); | ||
105 | return -ENOENT; | ||
106 | } | ||
107 | SetPageLocked(page); | ||
108 | error = __add_to_swap_cache(page, entry, GFP_KERNEL); | ||
109 | /* | ||
110 | * Anon pages are already on the LRU, we don't run lru_cache_add here. | ||
111 | */ | ||
112 | if (error) { | ||
113 | ClearPageLocked(page); | ||
114 | swap_free(entry); | ||
115 | if (error == -EEXIST) | ||
116 | INC_CACHE_INFO(exist_race); | ||
117 | return error; | ||
118 | } | ||
119 | INC_CACHE_INFO(add_total); | ||
120 | return 0; | ||
121 | } | ||
122 | |||
123 | /* | 96 | /* |
124 | * This must be called only on pages that have | 97 | * This must be called only on pages that have |
125 | * been verified to be in the swap cache. | 98 | * been verified to be in the swap cache. |
@@ -152,6 +125,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask) | |||
152 | int err; | 125 | int err; |
153 | 126 | ||
154 | BUG_ON(!PageLocked(page)); | 127 | BUG_ON(!PageLocked(page)); |
128 | BUG_ON(!PageUptodate(page)); | ||
155 | 129 | ||
156 | for (;;) { | 130 | for (;;) { |
157 | entry = get_swap_page(); | 131 | entry = get_swap_page(); |
@@ -169,18 +143,15 @@ int add_to_swap(struct page * page, gfp_t gfp_mask) | |||
169 | /* | 143 | /* |
170 | * Add it to the swap cache and mark it dirty | 144 | * Add it to the swap cache and mark it dirty |
171 | */ | 145 | */ |
172 | err = __add_to_swap_cache(page, entry, | 146 | err = add_to_swap_cache(page, entry, |
173 | gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); | 147 | gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); |
174 | 148 | ||
175 | switch (err) { | 149 | switch (err) { |
176 | case 0: /* Success */ | 150 | case 0: /* Success */ |
177 | SetPageUptodate(page); | ||
178 | SetPageDirty(page); | 151 | SetPageDirty(page); |
179 | INC_CACHE_INFO(add_total); | ||
180 | return 1; | 152 | return 1; |
181 | case -EEXIST: | 153 | case -EEXIST: |
182 | /* Raced with "speculative" read_swap_cache_async */ | 154 | /* Raced with "speculative" read_swap_cache_async */ |
183 | INC_CACHE_INFO(exist_race); | ||
184 | swap_free(entry); | 155 | swap_free(entry); |
185 | continue; | 156 | continue; |
186 | default: | 157 | default: |
@@ -211,40 +182,6 @@ void delete_from_swap_cache(struct page *page) | |||
211 | page_cache_release(page); | 182 | page_cache_release(page); |
212 | } | 183 | } |
213 | 184 | ||
214 | /* | ||
215 | * Strange swizzling function only for use by shmem_writepage | ||
216 | */ | ||
217 | int move_to_swap_cache(struct page *page, swp_entry_t entry) | ||
218 | { | ||
219 | int err = __add_to_swap_cache(page, entry, GFP_ATOMIC); | ||
220 | if (!err) { | ||
221 | remove_from_page_cache(page); | ||
222 | page_cache_release(page); /* pagecache ref */ | ||
223 | if (!swap_duplicate(entry)) | ||
224 | BUG(); | ||
225 | SetPageDirty(page); | ||
226 | INC_CACHE_INFO(add_total); | ||
227 | } else if (err == -EEXIST) | ||
228 | INC_CACHE_INFO(exist_race); | ||
229 | return err; | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * Strange swizzling function for shmem_getpage (and shmem_unuse) | ||
234 | */ | ||
235 | int move_from_swap_cache(struct page *page, unsigned long index, | ||
236 | struct address_space *mapping) | ||
237 | { | ||
238 | int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC); | ||
239 | if (!err) { | ||
240 | delete_from_swap_cache(page); | ||
241 | /* shift page from clean_pages to dirty_pages list */ | ||
242 | ClearPageDirty(page); | ||
243 | set_page_dirty(page); | ||
244 | } | ||
245 | return err; | ||
246 | } | ||
247 | |||
248 | /* | 185 | /* |
249 | * If we are the only user, then try to free up the swap cache. | 186 | * If we are the only user, then try to free up the swap cache. |
250 | * | 187 | * |
@@ -317,7 +254,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) | |||
317 | * A failure return means that either the page allocation failed or that | 254 | * A failure return means that either the page allocation failed or that |
318 | * the swap entry is no longer in use. | 255 | * the swap entry is no longer in use. |
319 | */ | 256 | */ |
320 | struct page *read_swap_cache_async(swp_entry_t entry, | 257 | struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, |
321 | struct vm_area_struct *vma, unsigned long addr) | 258 | struct vm_area_struct *vma, unsigned long addr) |
322 | { | 259 | { |
323 | struct page *found_page, *new_page = NULL; | 260 | struct page *found_page, *new_page = NULL; |
@@ -337,23 +274,27 @@ struct page *read_swap_cache_async(swp_entry_t entry, | |||
337 | * Get a new page to read into from swap. | 274 | * Get a new page to read into from swap. |
338 | */ | 275 | */ |
339 | if (!new_page) { | 276 | if (!new_page) { |
340 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, | 277 | new_page = alloc_page_vma(gfp_mask, vma, addr); |
341 | vma, addr); | ||
342 | if (!new_page) | 278 | if (!new_page) |
343 | break; /* Out of memory */ | 279 | break; /* Out of memory */ |
344 | } | 280 | } |
345 | 281 | ||
346 | /* | 282 | /* |
283 | * Swap entry may have been freed since our caller observed it. | ||
284 | */ | ||
285 | if (!swap_duplicate(entry)) | ||
286 | break; | ||
287 | |||
288 | /* | ||
347 | * Associate the page with swap entry in the swap cache. | 289 | * Associate the page with swap entry in the swap cache. |
348 | * May fail (-ENOENT) if swap entry has been freed since | 290 | * May fail (-EEXIST) if there is already a page associated |
349 | * our caller observed it. May fail (-EEXIST) if there | 291 | * with this entry in the swap cache: added by a racing |
350 | * is already a page associated with this entry in the | 292 | * read_swap_cache_async, or add_to_swap or shmem_writepage |
351 | * swap cache: added by a racing read_swap_cache_async, | 293 | * re-using the just freed swap entry for an existing page. |
352 | * or by try_to_swap_out (or shmem_writepage) re-using | ||
353 | * the just freed swap entry for an existing page. | ||
354 | * May fail (-ENOMEM) if radix-tree node allocation failed. | 294 | * May fail (-ENOMEM) if radix-tree node allocation failed. |
355 | */ | 295 | */ |
356 | err = add_to_swap_cache(new_page, entry); | 296 | SetPageLocked(new_page); |
297 | err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); | ||
357 | if (!err) { | 298 | if (!err) { |
358 | /* | 299 | /* |
359 | * Initiate read into locked page and return. | 300 | * Initiate read into locked page and return. |
@@ -362,9 +303,57 @@ struct page *read_swap_cache_async(swp_entry_t entry, | |||
362 | swap_readpage(NULL, new_page); | 303 | swap_readpage(NULL, new_page); |
363 | return new_page; | 304 | return new_page; |
364 | } | 305 | } |
365 | } while (err != -ENOENT && err != -ENOMEM); | 306 | ClearPageLocked(new_page); |
307 | swap_free(entry); | ||
308 | } while (err != -ENOMEM); | ||
366 | 309 | ||
367 | if (new_page) | 310 | if (new_page) |
368 | page_cache_release(new_page); | 311 | page_cache_release(new_page); |
369 | return found_page; | 312 | return found_page; |
370 | } | 313 | } |
314 | |||
315 | /** | ||
316 | * swapin_readahead - swap in pages in hope we need them soon | ||
317 | * @entry: swap entry of this memory | ||
318 | * @vma: user vma this address belongs to | ||
319 | * @addr: target address for mempolicy | ||
320 | * | ||
321 | * Returns the struct page for entry and addr, after queueing swapin. | ||
322 | * | ||
323 | * Primitive swap readahead code. We simply read an aligned block of | ||
324 | * (1 << page_cluster) entries in the swap area. This method is chosen | ||
325 | * because it doesn't cost us any seek time. We also make sure to queue | ||
326 | * the 'original' request together with the readahead ones... | ||
327 | * | ||
328 | * This has been extended to use the NUMA policies from the mm triggering | ||
329 | * the readahead. | ||
330 | * | ||
331 | * Caller must hold down_read on the vma->vm_mm if vma is not NULL. | ||
332 | */ | ||
333 | struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | ||
334 | struct vm_area_struct *vma, unsigned long addr) | ||
335 | { | ||
336 | int nr_pages; | ||
337 | struct page *page; | ||
338 | unsigned long offset; | ||
339 | unsigned long end_offset; | ||
340 | |||
341 | /* | ||
342 | * Get starting offset for readaround, and number of pages to read. | ||
343 | * Adjust starting address by readbehind (for NUMA interleave case)? | ||
344 | * No, it's very unlikely that swap layout would follow vma layout, | ||
345 | * more likely that neighbouring swap pages came from the same node: | ||
346 | * so use the same "addr" to choose the same node for each swap read. | ||
347 | */ | ||
348 | nr_pages = valid_swaphandles(entry, &offset); | ||
349 | for (end_offset = offset + nr_pages; offset < end_offset; offset++) { | ||
350 | /* Ok, do the async read-ahead now */ | ||
351 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), | ||
352 | gfp_mask, vma, addr); | ||
353 | if (!page) | ||
354 | break; | ||
355 | page_cache_release(page); | ||
356 | } | ||
357 | lru_add_drain(); /* Push any new pages onto the LRU now */ | ||
358 | return read_swap_cache_async(entry, gfp_mask, vma, addr); | ||
359 | } | ||
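A hedged sketch of how a caller is expected to use the reworked read_swap_cache_async()/swapin_readahead() pair above, loosely modelled on the page-fault path; the swapin_sketch name is invented, and locking, error handling and memory-controller charging are omitted:

	static struct page *swapin_sketch(swp_entry_t entry,
					  struct vm_area_struct *vma,
					  unsigned long address)
	{
		struct page *page;

		page = lookup_swap_cache(entry);
		if (!page)
			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
						vma, address);
		return page;	/* NULL: allocation failed or entry already freed */
	}

The new gfp_mask argument lets each caller choose its own allocation flags in place of the GFP_HIGHUSER_MOVABLE that was hard-coded above; the swapfile.c hunk below, for instance, passes GFP_HIGHUSER_MOVABLE explicitly from try_to_unuse().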
diff --git a/mm/swapfile.c b/mm/swapfile.c index f071648e1360..02ccab5ad9d9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/mutex.h> | 27 | #include <linux/mutex.h> |
28 | #include <linux/capability.h> | 28 | #include <linux/capability.h> |
29 | #include <linux/syscalls.h> | 29 | #include <linux/syscalls.h> |
30 | #include <linux/memcontrol.h> | ||
30 | 31 | ||
31 | #include <asm/pgtable.h> | 32 | #include <asm/pgtable.h> |
32 | #include <asm/tlbflush.h> | 33 | #include <asm/tlbflush.h> |
@@ -506,9 +507,24 @@ unsigned int count_swap_pages(int type, int free) | |||
506 | * just let do_wp_page work it out if a write is requested later - to | 507 | * just let do_wp_page work it out if a write is requested later - to |
507 | * force COW, vm_page_prot omits write permission from any private vma. | 508 | * force COW, vm_page_prot omits write permission from any private vma. |
508 | */ | 509 | */ |
509 | static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, | 510 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, |
510 | unsigned long addr, swp_entry_t entry, struct page *page) | 511 | unsigned long addr, swp_entry_t entry, struct page *page) |
511 | { | 512 | { |
513 | spinlock_t *ptl; | ||
514 | pte_t *pte; | ||
515 | int ret = 1; | ||
516 | |||
517 | if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) | ||
518 | ret = -ENOMEM; | ||
519 | |||
520 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
521 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { | ||
522 | if (ret > 0) | ||
523 | mem_cgroup_uncharge_page(page); | ||
524 | ret = 0; | ||
525 | goto out; | ||
526 | } | ||
527 | |||
512 | inc_mm_counter(vma->vm_mm, anon_rss); | 528 | inc_mm_counter(vma->vm_mm, anon_rss); |
513 | get_page(page); | 529 | get_page(page); |
514 | set_pte_at(vma->vm_mm, addr, pte, | 530 | set_pte_at(vma->vm_mm, addr, pte, |
@@ -520,6 +536,9 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, | |||
520 | * immediately swapped out again after swapon. | 536 | * immediately swapped out again after swapon. |
521 | */ | 537 | */ |
522 | activate_page(page); | 538 | activate_page(page); |
539 | out: | ||
540 | pte_unmap_unlock(pte, ptl); | ||
541 | return ret; | ||
523 | } | 542 | } |
524 | 543 | ||
525 | static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 544 | static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
@@ -528,23 +547,34 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
528 | { | 547 | { |
529 | pte_t swp_pte = swp_entry_to_pte(entry); | 548 | pte_t swp_pte = swp_entry_to_pte(entry); |
530 | pte_t *pte; | 549 | pte_t *pte; |
531 | spinlock_t *ptl; | 550 | int ret = 0; |
532 | int found = 0; | ||
533 | 551 | ||
534 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 552 | /* |
553 | * We don't actually need pte lock while scanning for swp_pte: since | ||
554 | * we hold page lock and mmap_sem, swp_pte cannot be inserted into the | ||
555 | * page table while we're scanning; though it could get zapped, and on | ||
556 | * some architectures (e.g. x86_32 with PAE) we might catch a glimpse | ||
557 | * of unmatched parts which look like swp_pte, so unuse_pte must | ||
558 | * recheck under pte lock. Scanning without pte lock lets it be | ||
559 | * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. | ||
560 | */ | ||
561 | pte = pte_offset_map(pmd, addr); | ||
535 | do { | 562 | do { |
536 | /* | 563 | /* |
537 | * swapoff spends a _lot_ of time in this loop! | 564 | * swapoff spends a _lot_ of time in this loop! |
538 | * Test inline before going to call unuse_pte. | 565 | * Test inline before going to call unuse_pte. |
539 | */ | 566 | */ |
540 | if (unlikely(pte_same(*pte, swp_pte))) { | 567 | if (unlikely(pte_same(*pte, swp_pte))) { |
541 | unuse_pte(vma, pte++, addr, entry, page); | 568 | pte_unmap(pte); |
542 | found = 1; | 569 | ret = unuse_pte(vma, pmd, addr, entry, page); |
543 | break; | 570 | if (ret) |
571 | goto out; | ||
572 | pte = pte_offset_map(pmd, addr); | ||
544 | } | 573 | } |
545 | } while (pte++, addr += PAGE_SIZE, addr != end); | 574 | } while (pte++, addr += PAGE_SIZE, addr != end); |
546 | pte_unmap_unlock(pte - 1, ptl); | 575 | pte_unmap(pte - 1); |
547 | return found; | 576 | out: |
577 | return ret; | ||
548 | } | 578 | } |
549 | 579 | ||
550 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 580 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
@@ -553,14 +583,16 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
553 | { | 583 | { |
554 | pmd_t *pmd; | 584 | pmd_t *pmd; |
555 | unsigned long next; | 585 | unsigned long next; |
586 | int ret; | ||
556 | 587 | ||
557 | pmd = pmd_offset(pud, addr); | 588 | pmd = pmd_offset(pud, addr); |
558 | do { | 589 | do { |
559 | next = pmd_addr_end(addr, end); | 590 | next = pmd_addr_end(addr, end); |
560 | if (pmd_none_or_clear_bad(pmd)) | 591 | if (pmd_none_or_clear_bad(pmd)) |
561 | continue; | 592 | continue; |
562 | if (unuse_pte_range(vma, pmd, addr, next, entry, page)) | 593 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); |
563 | return 1; | 594 | if (ret) |
595 | return ret; | ||
564 | } while (pmd++, addr = next, addr != end); | 596 | } while (pmd++, addr = next, addr != end); |
565 | return 0; | 597 | return 0; |
566 | } | 598 | } |
@@ -571,14 +603,16 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | |||
571 | { | 603 | { |
572 | pud_t *pud; | 604 | pud_t *pud; |
573 | unsigned long next; | 605 | unsigned long next; |
606 | int ret; | ||
574 | 607 | ||
575 | pud = pud_offset(pgd, addr); | 608 | pud = pud_offset(pgd, addr); |
576 | do { | 609 | do { |
577 | next = pud_addr_end(addr, end); | 610 | next = pud_addr_end(addr, end); |
578 | if (pud_none_or_clear_bad(pud)) | 611 | if (pud_none_or_clear_bad(pud)) |
579 | continue; | 612 | continue; |
580 | if (unuse_pmd_range(vma, pud, addr, next, entry, page)) | 613 | ret = unuse_pmd_range(vma, pud, addr, next, entry, page); |
581 | return 1; | 614 | if (ret) |
615 | return ret; | ||
582 | } while (pud++, addr = next, addr != end); | 616 | } while (pud++, addr = next, addr != end); |
583 | return 0; | 617 | return 0; |
584 | } | 618 | } |
@@ -588,6 +622,7 @@ static int unuse_vma(struct vm_area_struct *vma, | |||
588 | { | 622 | { |
589 | pgd_t *pgd; | 623 | pgd_t *pgd; |
590 | unsigned long addr, end, next; | 624 | unsigned long addr, end, next; |
625 | int ret; | ||
591 | 626 | ||
592 | if (page->mapping) { | 627 | if (page->mapping) { |
593 | addr = page_address_in_vma(page, vma); | 628 | addr = page_address_in_vma(page, vma); |
@@ -605,8 +640,9 @@ static int unuse_vma(struct vm_area_struct *vma, | |||
605 | next = pgd_addr_end(addr, end); | 640 | next = pgd_addr_end(addr, end); |
606 | if (pgd_none_or_clear_bad(pgd)) | 641 | if (pgd_none_or_clear_bad(pgd)) |
607 | continue; | 642 | continue; |
608 | if (unuse_pud_range(vma, pgd, addr, next, entry, page)) | 643 | ret = unuse_pud_range(vma, pgd, addr, next, entry, page); |
609 | return 1; | 644 | if (ret) |
645 | return ret; | ||
610 | } while (pgd++, addr = next, addr != end); | 646 | } while (pgd++, addr = next, addr != end); |
611 | return 0; | 647 | return 0; |
612 | } | 648 | } |
@@ -615,6 +651,7 @@ static int unuse_mm(struct mm_struct *mm, | |||
615 | swp_entry_t entry, struct page *page) | 651 | swp_entry_t entry, struct page *page) |
616 | { | 652 | { |
617 | struct vm_area_struct *vma; | 653 | struct vm_area_struct *vma; |
654 | int ret = 0; | ||
618 | 655 | ||
619 | if (!down_read_trylock(&mm->mmap_sem)) { | 656 | if (!down_read_trylock(&mm->mmap_sem)) { |
620 | /* | 657 | /* |
@@ -627,15 +664,11 @@ static int unuse_mm(struct mm_struct *mm, | |||
627 | lock_page(page); | 664 | lock_page(page); |
628 | } | 665 | } |
629 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 666 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
630 | if (vma->anon_vma && unuse_vma(vma, entry, page)) | 667 | if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) |
631 | break; | 668 | break; |
632 | } | 669 | } |
633 | up_read(&mm->mmap_sem); | 670 | up_read(&mm->mmap_sem); |
634 | /* | 671 | return (ret < 0)? ret: 0; |
635 | * Currently unuse_mm cannot fail, but leave error handling | ||
636 | * at call sites for now, since we change it from time to time. | ||
637 | */ | ||
638 | return 0; | ||
639 | } | 672 | } |
640 | 673 | ||
641 | /* | 674 | /* |
@@ -730,7 +763,8 @@ static int try_to_unuse(unsigned int type) | |||
730 | */ | 763 | */ |
731 | swap_map = &si->swap_map[i]; | 764 | swap_map = &si->swap_map[i]; |
732 | entry = swp_entry(type, i); | 765 | entry = swp_entry(type, i); |
733 | page = read_swap_cache_async(entry, NULL, 0); | 766 | page = read_swap_cache_async(entry, |
767 | GFP_HIGHUSER_MOVABLE, NULL, 0); | ||
734 | if (!page) { | 768 | if (!page) { |
735 | /* | 769 | /* |
736 | * Either swap_duplicate() failed because entry | 770 | * Either swap_duplicate() failed because entry |
@@ -789,7 +823,7 @@ static int try_to_unuse(unsigned int type) | |||
789 | atomic_inc(&new_start_mm->mm_users); | 823 | atomic_inc(&new_start_mm->mm_users); |
790 | atomic_inc(&prev_mm->mm_users); | 824 | atomic_inc(&prev_mm->mm_users); |
791 | spin_lock(&mmlist_lock); | 825 | spin_lock(&mmlist_lock); |
792 | while (*swap_map > 1 && !retval && | 826 | while (*swap_map > 1 && !retval && !shmem && |
793 | (p = p->next) != &start_mm->mmlist) { | 827 | (p = p->next) != &start_mm->mmlist) { |
794 | mm = list_entry(p, struct mm_struct, mmlist); | 828 | mm = list_entry(p, struct mm_struct, mmlist); |
795 | if (!atomic_inc_not_zero(&mm->mm_users)) | 829 | if (!atomic_inc_not_zero(&mm->mm_users)) |
@@ -821,6 +855,13 @@ static int try_to_unuse(unsigned int type) | |||
821 | mmput(start_mm); | 855 | mmput(start_mm); |
822 | start_mm = new_start_mm; | 856 | start_mm = new_start_mm; |
823 | } | 857 | } |
858 | if (shmem) { | ||
859 | /* page has already been unlocked and released */ | ||
860 | if (shmem > 0) | ||
861 | continue; | ||
862 | retval = shmem; | ||
863 | break; | ||
864 | } | ||
824 | if (retval) { | 865 | if (retval) { |
825 | unlock_page(page); | 866 | unlock_page(page); |
826 | page_cache_release(page); | 867 | page_cache_release(page); |
@@ -859,12 +900,6 @@ static int try_to_unuse(unsigned int type) | |||
859 | * read from disk into another page. Splitting into two | 900 | * read from disk into another page. Splitting into two |
860 | * pages would be incorrect if swap supported "shared | 901 | * pages would be incorrect if swap supported "shared |
861 | * private" pages, but they are handled by tmpfs files. | 902 | * private" pages, but they are handled by tmpfs files. |
862 | * | ||
863 | * Note shmem_unuse already deleted a swappage from | ||
864 | * the swap cache, unless the move to filepage failed: | ||
865 | * in which case it left swappage in cache, lowered its | ||
866 | * swap count to pass quickly through the loops above, | ||
867 | * and now we must reincrement count to try again later. | ||
868 | */ | 903 | */ |
869 | if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { | 904 | if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { |
870 | struct writeback_control wbc = { | 905 | struct writeback_control wbc = { |
@@ -875,12 +910,8 @@ static int try_to_unuse(unsigned int type) | |||
875 | lock_page(page); | 910 | lock_page(page); |
876 | wait_on_page_writeback(page); | 911 | wait_on_page_writeback(page); |
877 | } | 912 | } |
878 | if (PageSwapCache(page)) { | 913 | if (PageSwapCache(page)) |
879 | if (shmem) | 914 | delete_from_swap_cache(page); |
880 | swap_duplicate(entry); | ||
881 | else | ||
882 | delete_from_swap_cache(page); | ||
883 | } | ||
884 | 915 | ||
885 | /* | 916 | /* |
886 | * So we could skip searching mms once swap count went | 917 | * So we could skip searching mms once swap count went |
@@ -1768,31 +1799,48 @@ get_swap_info_struct(unsigned type) | |||
1768 | */ | 1799 | */ |
1769 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | 1800 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) |
1770 | { | 1801 | { |
1802 | struct swap_info_struct *si; | ||
1771 | int our_page_cluster = page_cluster; | 1803 | int our_page_cluster = page_cluster; |
1772 | int ret = 0, i = 1 << our_page_cluster; | 1804 | pgoff_t target, toff; |
1773 | unsigned long toff; | 1805 | pgoff_t base, end; |
1774 | struct swap_info_struct *swapdev = swp_type(entry) + swap_info; | 1806 | int nr_pages = 0; |
1775 | 1807 | ||
1776 | if (!our_page_cluster) /* no readahead */ | 1808 | if (!our_page_cluster) /* no readahead */ |
1777 | return 0; | 1809 | return 0; |
1778 | toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster; | 1810 | |
1779 | if (!toff) /* first page is swap header */ | 1811 | si = &swap_info[swp_type(entry)]; |
1780 | toff++, i--; | 1812 | target = swp_offset(entry); |
1781 | *offset = toff; | 1813 | base = (target >> our_page_cluster) << our_page_cluster; |
1814 | end = base + (1 << our_page_cluster); | ||
1815 | if (!base) /* first page is swap header */ | ||
1816 | base++; | ||
1782 | 1817 | ||
1783 | spin_lock(&swap_lock); | 1818 | spin_lock(&swap_lock); |
1784 | do { | 1819 | if (end > si->max) /* don't go beyond end of map */ |
1785 | /* Don't read-ahead past the end of the swap area */ | 1820 | end = si->max; |
1786 | if (toff >= swapdev->max) | 1821 | |
1822 | /* Count contiguous allocated slots above our target */ | ||
1823 | for (toff = target; ++toff < end; nr_pages++) { | ||
1824 | /* Don't read in free or bad pages */ | ||
1825 | if (!si->swap_map[toff]) | ||
1787 | break; | 1826 | break; |
1827 | if (si->swap_map[toff] == SWAP_MAP_BAD) | ||
1828 | break; | ||
1829 | } | ||
1830 | /* Count contiguous allocated slots below our target */ | ||
1831 | for (toff = target; --toff >= base; nr_pages++) { | ||
1788 | /* Don't read in free or bad pages */ | 1832 | /* Don't read in free or bad pages */ |
1789 | if (!swapdev->swap_map[toff]) | 1833 | if (!si->swap_map[toff]) |
1790 | break; | 1834 | break; |
1791 | if (swapdev->swap_map[toff] == SWAP_MAP_BAD) | 1835 | if (si->swap_map[toff] == SWAP_MAP_BAD) |
1792 | break; | 1836 | break; |
1793 | toff++; | 1837 | } |
1794 | ret++; | ||
1795 | } while (--i); | ||
1796 | spin_unlock(&swap_lock); | 1838 | spin_unlock(&swap_lock); |
1797 | return ret; | 1839 | |
1840 | /* | ||
1841 | * Indicate starting offset, and return number of pages to get: | ||
1842 | * if only 1, say 0, since there's then no readahead to be done. | ||
1843 | */ | ||
1844 | *offset = ++toff; | ||
1845 | return nr_pages? ++nr_pages: 0; | ||
1798 | } | 1846 | } |
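To make the reworked valid_swaphandles() concrete with illustrative numbers: with page_cluster = 3 the readaround cluster is 1 << 3 = 8 slots, so a hypothetical target at swap offset 21 gives base = (21 >> 3) << 3 = 16 and end = 24 (clamped to si->max). The first loop counts in-use slots at offsets 22 and 23, the second counts downwards from 20 towards 16, each stopping at the first free or SWAP_MAP_BAD slot; *offset is then set to the lowest in-use slot reached, and the returned count includes the target itself (the final ++nr_pages), or is 0 when there is nothing worth reading ahead.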
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index d436a9c82db7..702083638c16 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
@@ -121,18 +121,6 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
121 | return 0; | 121 | return 0; |
122 | } | 122 | } |
123 | 123 | ||
124 | #if 0 | ||
125 | int shmem_mmap(struct file *file, struct vm_area_struct *vma) | ||
126 | { | ||
127 | file_accessed(file); | ||
128 | #ifndef CONFIG_MMU | ||
129 | return ramfs_nommu_mmap(file, vma); | ||
130 | #else | ||
131 | return 0; | ||
132 | #endif | ||
133 | } | ||
134 | #endif /* 0 */ | ||
135 | |||
136 | #ifndef CONFIG_MMU | 124 | #ifndef CONFIG_MMU |
137 | unsigned long shmem_get_unmapped_area(struct file *file, | 125 | unsigned long shmem_get_unmapped_area(struct file *file, |
138 | unsigned long addr, | 126 | unsigned long addr, |
diff --git a/mm/truncate.c b/mm/truncate.c index cadc15653dde..c35c49e54fb6 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -21,7 +21,7 @@ | |||
21 | 21 | ||
22 | 22 | ||
23 | /** | 23 | /** |
24 | * do_invalidatepage - invalidate part of all of a page | 24 | * do_invalidatepage - invalidate part or all of a page |
25 | * @page: the page which is affected | 25 | * @page: the page which is affected |
26 | * @offset: the index of the truncation point | 26 | * @offset: the index of the truncation point |
27 | * | 27 | * |
@@ -48,7 +48,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) | |||
48 | 48 | ||
49 | static inline void truncate_partial_page(struct page *page, unsigned partial) | 49 | static inline void truncate_partial_page(struct page *page, unsigned partial) |
50 | { | 50 | { |
51 | zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0); | 51 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); |
52 | if (PagePrivate(page)) | 52 | if (PagePrivate(page)) |
53 | do_invalidatepage(page, partial); | 53 | do_invalidatepage(page, partial); |
54 | } | 54 | } |
@@ -84,7 +84,7 @@ EXPORT_SYMBOL(cancel_dirty_page); | |||
84 | 84 | ||
85 | /* | 85 | /* |
86 | * If truncate cannot remove the fs-private metadata from the page, the page | 86 | * If truncate cannot remove the fs-private metadata from the page, the page |
87 | * becomes anonymous. It will be left on the LRU and may even be mapped into | 87 | * becomes orphaned. It will be left on the LRU and may even be mapped into |
88 | * user pagetables if we're racing with filemap_fault(). | 88 | * user pagetables if we're racing with filemap_fault(). |
89 | * | 89 | * |
90 | * We need to bale out if page->mapping is no longer equal to the original | 90 | * We need to bale out if page->mapping is no longer equal to the original |
@@ -98,11 +98,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
98 | if (page->mapping != mapping) | 98 | if (page->mapping != mapping) |
99 | return; | 99 | return; |
100 | 100 | ||
101 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | ||
102 | |||
103 | if (PagePrivate(page)) | 101 | if (PagePrivate(page)) |
104 | do_invalidatepage(page, 0); | 102 | do_invalidatepage(page, 0); |
105 | 103 | ||
104 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | ||
105 | |||
106 | remove_from_page_cache(page); | 106 | remove_from_page_cache(page); |
107 | ClearPageUptodate(page); | 107 | ClearPageUptodate(page); |
108 | ClearPageMappedToDisk(page); | 108 | ClearPageMappedToDisk(page); |
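truncate_partial_page() above now zeroes the tail of the page with zero_user_segment(page, partial, PAGE_CACHE_SIZE) in place of the old zero_user_page() call. A hedged sketch of what that helper amounts to follows; the real one in linux/highmem.h wraps a two-segment variant, so the details differ:

	static inline void zero_user_segment_sketch(struct page *page,
						    unsigned start, unsigned end)
	{
		void *kaddr = kmap_atomic(page, KM_USER0);

		memset(kaddr + start, 0, end - start);	/* zero [start, end) */
		kunmap_atomic(kaddr, KM_USER0);
		flush_dcache_page(page);
	}

The visible difference for callers is the argument convention: a [start, end) range instead of the old (offset, length, km_type) triple.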
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af77e171e339..0536dde139d1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -166,6 +166,44 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | |||
166 | } | 166 | } |
167 | EXPORT_SYMBOL_GPL(map_vm_area); | 167 | EXPORT_SYMBOL_GPL(map_vm_area); |
168 | 168 | ||
169 | /* | ||
170 | * Map a vmalloc()-space virtual address to the physical page. | ||
171 | */ | ||
172 | struct page *vmalloc_to_page(const void *vmalloc_addr) | ||
173 | { | ||
174 | unsigned long addr = (unsigned long) vmalloc_addr; | ||
175 | struct page *page = NULL; | ||
176 | pgd_t *pgd = pgd_offset_k(addr); | ||
177 | pud_t *pud; | ||
178 | pmd_t *pmd; | ||
179 | pte_t *ptep, pte; | ||
180 | |||
181 | if (!pgd_none(*pgd)) { | ||
182 | pud = pud_offset(pgd, addr); | ||
183 | if (!pud_none(*pud)) { | ||
184 | pmd = pmd_offset(pud, addr); | ||
185 | if (!pmd_none(*pmd)) { | ||
186 | ptep = pte_offset_map(pmd, addr); | ||
187 | pte = *ptep; | ||
188 | if (pte_present(pte)) | ||
189 | page = pte_page(pte); | ||
190 | pte_unmap(ptep); | ||
191 | } | ||
192 | } | ||
193 | } | ||
194 | return page; | ||
195 | } | ||
196 | EXPORT_SYMBOL(vmalloc_to_page); | ||
197 | |||
198 | /* | ||
199 | * Map a vmalloc()-space virtual address to the physical page frame number. | ||
200 | */ | ||
201 | unsigned long vmalloc_to_pfn(const void *vmalloc_addr) | ||
202 | { | ||
203 | return page_to_pfn(vmalloc_to_page(vmalloc_addr)); | ||
204 | } | ||
205 | EXPORT_SYMBOL(vmalloc_to_pfn); | ||
206 | |||
169 | static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, | 207 | static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, |
170 | unsigned long start, unsigned long end, | 208 | unsigned long start, unsigned long end, |
171 | int node, gfp_t gfp_mask) | 209 | int node, gfp_t gfp_mask) |
@@ -216,6 +254,10 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl | |||
216 | if (addr > end - size) | 254 | if (addr > end - size) |
217 | goto out; | 255 | goto out; |
218 | } | 256 | } |
257 | if ((size + addr) < addr) | ||
258 | goto out; | ||
259 | if (addr > end - size) | ||
260 | goto out; | ||
219 | 261 | ||
220 | found: | 262 | found: |
221 | area->next = *p; | 263 | area->next = *p; |
@@ -268,7 +310,7 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, | |||
268 | } | 310 | } |
269 | 311 | ||
270 | /* Caller must hold vmlist_lock */ | 312 | /* Caller must hold vmlist_lock */ |
271 | static struct vm_struct *__find_vm_area(void *addr) | 313 | static struct vm_struct *__find_vm_area(const void *addr) |
272 | { | 314 | { |
273 | struct vm_struct *tmp; | 315 | struct vm_struct *tmp; |
274 | 316 | ||
@@ -281,7 +323,7 @@ static struct vm_struct *__find_vm_area(void *addr) | |||
281 | } | 323 | } |
282 | 324 | ||
283 | /* Caller must hold vmlist_lock */ | 325 | /* Caller must hold vmlist_lock */ |
284 | static struct vm_struct *__remove_vm_area(void *addr) | 326 | static struct vm_struct *__remove_vm_area(const void *addr) |
285 | { | 327 | { |
286 | struct vm_struct **p, *tmp; | 328 | struct vm_struct **p, *tmp; |
287 | 329 | ||
@@ -310,7 +352,7 @@ found: | |||
310 | * This function returns the found VM area, but using it is NOT safe | 352 | * This function returns the found VM area, but using it is NOT safe |
311 | * on SMP machines, except for its size or flags. | 353 | * on SMP machines, except for its size or flags. |
312 | */ | 354 | */ |
313 | struct vm_struct *remove_vm_area(void *addr) | 355 | struct vm_struct *remove_vm_area(const void *addr) |
314 | { | 356 | { |
315 | struct vm_struct *v; | 357 | struct vm_struct *v; |
316 | write_lock(&vmlist_lock); | 358 | write_lock(&vmlist_lock); |
@@ -319,7 +361,7 @@ struct vm_struct *remove_vm_area(void *addr) | |||
319 | return v; | 361 | return v; |
320 | } | 362 | } |
321 | 363 | ||
322 | static void __vunmap(void *addr, int deallocate_pages) | 364 | static void __vunmap(const void *addr, int deallocate_pages) |
323 | { | 365 | { |
324 | struct vm_struct *area; | 366 | struct vm_struct *area; |
325 | 367 | ||
@@ -346,8 +388,10 @@ static void __vunmap(void *addr, int deallocate_pages) | |||
346 | int i; | 388 | int i; |
347 | 389 | ||
348 | for (i = 0; i < area->nr_pages; i++) { | 390 | for (i = 0; i < area->nr_pages; i++) { |
349 | BUG_ON(!area->pages[i]); | 391 | struct page *page = area->pages[i]; |
350 | __free_page(area->pages[i]); | 392 | |
393 | BUG_ON(!page); | ||
394 | __free_page(page); | ||
351 | } | 395 | } |
352 | 396 | ||
353 | if (area->flags & VM_VPAGES) | 397 | if (area->flags & VM_VPAGES) |
@@ -370,7 +414,7 @@ static void __vunmap(void *addr, int deallocate_pages) | |||
370 | * | 414 | * |
371 | * Must not be called in interrupt context. | 415 | * Must not be called in interrupt context. |
372 | */ | 416 | */ |
373 | void vfree(void *addr) | 417 | void vfree(const void *addr) |
374 | { | 418 | { |
375 | BUG_ON(in_interrupt()); | 419 | BUG_ON(in_interrupt()); |
376 | __vunmap(addr, 1); | 420 | __vunmap(addr, 1); |
@@ -386,7 +430,7 @@ EXPORT_SYMBOL(vfree); | |||
386 | * | 430 | * |
387 | * Must not be called in interrupt context. | 431 | * Must not be called in interrupt context. |
388 | */ | 432 | */ |
389 | void vunmap(void *addr) | 433 | void vunmap(const void *addr) |
390 | { | 434 | { |
391 | BUG_ON(in_interrupt()); | 435 | BUG_ON(in_interrupt()); |
392 | __vunmap(addr, 0); | 436 | __vunmap(addr, 0); |
@@ -423,8 +467,8 @@ void *vmap(struct page **pages, unsigned int count, | |||
423 | } | 467 | } |
424 | EXPORT_SYMBOL(vmap); | 468 | EXPORT_SYMBOL(vmap); |
425 | 469 | ||
426 | void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 470 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
427 | pgprot_t prot, int node) | 471 | pgprot_t prot, int node) |
428 | { | 472 | { |
429 | struct page **pages; | 473 | struct page **pages; |
430 | unsigned int nr_pages, array_size, i; | 474 | unsigned int nr_pages, array_size, i; |
@@ -451,15 +495,19 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
451 | } | 495 | } |
452 | 496 | ||
453 | for (i = 0; i < area->nr_pages; i++) { | 497 | for (i = 0; i < area->nr_pages; i++) { |
498 | struct page *page; | ||
499 | |||
454 | if (node < 0) | 500 | if (node < 0) |
455 | area->pages[i] = alloc_page(gfp_mask); | 501 | page = alloc_page(gfp_mask); |
456 | else | 502 | else |
457 | area->pages[i] = alloc_pages_node(node, gfp_mask, 0); | 503 | page = alloc_pages_node(node, gfp_mask, 0); |
458 | if (unlikely(!area->pages[i])) { | 504 | |
505 | if (unlikely(!page)) { | ||
459 | /* Successfully allocated i pages, free them in __vunmap() */ | 506 | /* Successfully allocated i pages, free them in __vunmap() */ |
460 | area->nr_pages = i; | 507 | area->nr_pages = i; |
461 | goto fail; | 508 | goto fail; |
462 | } | 509 | } |
510 | area->pages[i] = page; | ||
463 | } | 511 | } |
464 | 512 | ||
465 | if (map_vm_area(area, prot, &pages)) | 513 | if (map_vm_area(area, prot, &pages)) |
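The vmalloc_to_page() and vmalloc_to_pfn() helpers added above perform the canonical page-table walk for vmalloc space. A hedged usage sketch, with the function name and loop body purely illustrative, of how a driver might translate a vmalloc()ed buffer chunk by chunk, e.g. to build a scatter-gather list or back an mmap:

	static void walk_vmalloc_buffer(void *buf, unsigned long size)
	{
		unsigned long off;

		for (off = 0; off < size; off += PAGE_SIZE) {
			struct page *page = vmalloc_to_page(buf + off);
			unsigned long pfn = vmalloc_to_pfn(buf + off);

			/* page/pfn identify the physical page backing this
			 * PAGE_SIZE chunk of the vmalloc area */
			(void)page;
			(void)pfn;
		}
	}

The const void * prototypes introduced in this hunk also mean such helpers can be handed const buffers without casts.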
diff --git a/mm/vmscan.c b/mm/vmscan.c index e5a9597e3bbc..a26dabd62fed 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/delay.h> | 37 | #include <linux/delay.h> |
38 | #include <linux/kthread.h> | 38 | #include <linux/kthread.h> |
39 | #include <linux/freezer.h> | 39 | #include <linux/freezer.h> |
40 | #include <linux/memcontrol.h> | ||
40 | 41 | ||
41 | #include <asm/tlbflush.h> | 42 | #include <asm/tlbflush.h> |
42 | #include <asm/div64.h> | 43 | #include <asm/div64.h> |
@@ -68,6 +69,22 @@ struct scan_control { | |||
68 | int all_unreclaimable; | 69 | int all_unreclaimable; |
69 | 70 | ||
70 | int order; | 71 | int order; |
72 | |||
73 | /* | ||
74 | * Pages that have (or should have) IO pending. If we run into | ||
75 | * a lot of these, we're better off waiting a little for IO to | ||
76 | * finish rather than scanning more pages in the VM. | ||
77 | */ | ||
78 | int nr_io_pages; | ||
79 | |||
80 | /* Which cgroup do we reclaim from */ | ||
81 | struct mem_cgroup *mem_cgroup; | ||
82 | |||
83 | /* Pluggable isolate pages callback */ | ||
84 | unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, | ||
85 | unsigned long *scanned, int order, int mode, | ||
86 | struct zone *z, struct mem_cgroup *mem_cont, | ||
87 | int active); | ||
71 | }; | 88 | }; |
72 | 89 | ||
73 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 90 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
@@ -109,6 +126,12 @@ long vm_total_pages; /* The total number of pages which the VM controls */ | |||
109 | static LIST_HEAD(shrinker_list); | 126 | static LIST_HEAD(shrinker_list); |
110 | static DECLARE_RWSEM(shrinker_rwsem); | 127 | static DECLARE_RWSEM(shrinker_rwsem); |
111 | 128 | ||
129 | #ifdef CONFIG_CGROUP_MEM_CONT | ||
130 | #define scan_global_lru(sc) (!(sc)->mem_cgroup) | ||
131 | #else | ||
132 | #define scan_global_lru(sc) (1) | ||
133 | #endif | ||
134 | |||
112 | /* | 135 | /* |
113 | * Add a shrinker callback to be called from the vm | 136 | * Add a shrinker callback to be called from the vm |
114 | */ | 137 | */ |
@@ -489,11 +512,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
489 | */ | 512 | */ |
490 | if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) | 513 | if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) |
491 | wait_on_page_writeback(page); | 514 | wait_on_page_writeback(page); |
492 | else | 515 | else { |
516 | sc->nr_io_pages++; | ||
493 | goto keep_locked; | 517 | goto keep_locked; |
518 | } | ||
494 | } | 519 | } |
495 | 520 | ||
496 | referenced = page_referenced(page, 1); | 521 | referenced = page_referenced(page, 1, sc->mem_cgroup); |
497 | /* In active use or really unfreeable? Activate it. */ | 522 | /* In active use or really unfreeable? Activate it. */ |
498 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && | 523 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && |
499 | referenced && page_mapping_inuse(page)) | 524 | referenced && page_mapping_inuse(page)) |
@@ -529,8 +554,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
529 | if (PageDirty(page)) { | 554 | if (PageDirty(page)) { |
530 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) | 555 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) |
531 | goto keep_locked; | 556 | goto keep_locked; |
532 | if (!may_enter_fs) | 557 | if (!may_enter_fs) { |
558 | sc->nr_io_pages++; | ||
533 | goto keep_locked; | 559 | goto keep_locked; |
560 | } | ||
534 | if (!sc->may_writepage) | 561 | if (!sc->may_writepage) |
535 | goto keep_locked; | 562 | goto keep_locked; |
536 | 563 | ||
@@ -541,8 +568,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
541 | case PAGE_ACTIVATE: | 568 | case PAGE_ACTIVATE: |
542 | goto activate_locked; | 569 | goto activate_locked; |
543 | case PAGE_SUCCESS: | 570 | case PAGE_SUCCESS: |
544 | if (PageWriteback(page) || PageDirty(page)) | 571 | if (PageWriteback(page) || PageDirty(page)) { |
572 | sc->nr_io_pages++; | ||
545 | goto keep; | 573 | goto keep; |
574 | } | ||
546 | /* | 575 | /* |
547 | * A synchronous write - probably a ramdisk. Go | 576 | * A synchronous write - probably a ramdisk. Go |
548 | * ahead and try to reclaim the page. | 577 | * ahead and try to reclaim the page. |
@@ -626,7 +655,7 @@ keep: | |||
626 | * | 655 | * |
627 | * returns 0 on success, -ve errno on failure. | 656 | * returns 0 on success, -ve errno on failure. |
628 | */ | 657 | */ |
629 | static int __isolate_lru_page(struct page *page, int mode) | 658 | int __isolate_lru_page(struct page *page, int mode) |
630 | { | 659 | { |
631 | int ret = -EINVAL; | 660 | int ret = -EINVAL; |
632 | 661 | ||
@@ -760,6 +789,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
760 | return nr_taken; | 789 | return nr_taken; |
761 | } | 790 | } |
762 | 791 | ||
792 | static unsigned long isolate_pages_global(unsigned long nr, | ||
793 | struct list_head *dst, | ||
794 | unsigned long *scanned, int order, | ||
795 | int mode, struct zone *z, | ||
796 | struct mem_cgroup *mem_cont, | ||
797 | int active) | ||
798 | { | ||
799 | if (active) | ||
800 | return isolate_lru_pages(nr, &z->active_list, dst, | ||
801 | scanned, order, mode); | ||
802 | else | ||
803 | return isolate_lru_pages(nr, &z->inactive_list, dst, | ||
804 | scanned, order, mode); | ||
805 | } | ||
806 | |||
763 | /* | 807 | /* |
764 | * clear_active_flags() is a helper for shrink_active_list(), clearing | 808 | * clear_active_flags() is a helper for shrink_active_list(), clearing |
765 | * any active bits from the pages in the list. | 809 | * any active bits from the pages in the list. |
@@ -801,18 +845,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
801 | unsigned long nr_freed; | 845 | unsigned long nr_freed; |
802 | unsigned long nr_active; | 846 | unsigned long nr_active; |
803 | 847 | ||
804 | nr_taken = isolate_lru_pages(sc->swap_cluster_max, | 848 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, |
805 | &zone->inactive_list, | ||
806 | &page_list, &nr_scan, sc->order, | 849 | &page_list, &nr_scan, sc->order, |
807 | (sc->order > PAGE_ALLOC_COSTLY_ORDER)? | 850 | (sc->order > PAGE_ALLOC_COSTLY_ORDER)? |
808 | ISOLATE_BOTH : ISOLATE_INACTIVE); | 851 | ISOLATE_BOTH : ISOLATE_INACTIVE, |
852 | zone, sc->mem_cgroup, 0); | ||
809 | nr_active = clear_active_flags(&page_list); | 853 | nr_active = clear_active_flags(&page_list); |
810 | __count_vm_events(PGDEACTIVATE, nr_active); | 854 | __count_vm_events(PGDEACTIVATE, nr_active); |
811 | 855 | ||
812 | __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); | 856 | __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); |
813 | __mod_zone_page_state(zone, NR_INACTIVE, | 857 | __mod_zone_page_state(zone, NR_INACTIVE, |
814 | -(nr_taken - nr_active)); | 858 | -(nr_taken - nr_active)); |
815 | zone->pages_scanned += nr_scan; | 859 | if (scan_global_lru(sc)) |
860 | zone->pages_scanned += nr_scan; | ||
816 | spin_unlock_irq(&zone->lru_lock); | 861 | spin_unlock_irq(&zone->lru_lock); |
817 | 862 | ||
818 | nr_scanned += nr_scan; | 863 | nr_scanned += nr_scan; |
@@ -844,8 +889,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
844 | if (current_is_kswapd()) { | 889 | if (current_is_kswapd()) { |
845 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); | 890 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); |
846 | __count_vm_events(KSWAPD_STEAL, nr_freed); | 891 | __count_vm_events(KSWAPD_STEAL, nr_freed); |
847 | } else | 892 | } else if (scan_global_lru(sc)) |
848 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); | 893 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); |
894 | |||
849 | __count_zone_vm_events(PGSTEAL, zone, nr_freed); | 895 | __count_zone_vm_events(PGSTEAL, zone, nr_freed); |
850 | 896 | ||
851 | if (nr_taken == 0) | 897 | if (nr_taken == 0) |
@@ -899,6 +945,113 @@ static inline int zone_is_near_oom(struct zone *zone) | |||
899 | } | 945 | } |
900 | 946 | ||
901 | /* | 947 | /* |
948 | * Determine whether we should try to reclaim mapped pages. | ||
949 | * This is called only when sc->mem_cgroup is NULL. | ||
950 | */ | ||
951 | static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, | ||
952 | int priority) | ||
953 | { | ||
954 | long mapped_ratio; | ||
955 | long distress; | ||
956 | long swap_tendency; | ||
957 | long imbalance; | ||
958 | int reclaim_mapped = 0; | ||
959 | int prev_priority; | ||
960 | |||
961 | if (scan_global_lru(sc) && zone_is_near_oom(zone)) | ||
962 | return 1; | ||
963 | /* | ||
964 | * `distress' is a measure of how much trouble we're having | ||
965 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. | ||
966 | */ | ||
967 | if (scan_global_lru(sc)) | ||
968 | prev_priority = zone->prev_priority; | ||
969 | else | ||
970 | prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup); | ||
971 | |||
972 | distress = 100 >> min(prev_priority, priority); | ||
973 | |||
974 | /* | ||
975 | * The point of this algorithm is to decide when to start | ||
976 | * reclaiming mapped memory instead of just pagecache. Work out | ||
977 | * how much memory | ||
978 | * is mapped. | ||
979 | */ | ||
980 | if (scan_global_lru(sc)) | ||
981 | mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + | ||
982 | global_page_state(NR_ANON_PAGES)) * 100) / | ||
983 | vm_total_pages; | ||
984 | else | ||
985 | mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup); | ||
986 | |||
987 | /* | ||
988 | * Now decide how much we really want to unmap some pages. The | ||
989 | * mapped ratio is downgraded - just because there's a lot of | ||
990 | * mapped memory doesn't necessarily mean that page reclaim | ||
991 | * isn't succeeding. | ||
992 | * | ||
993 | * The distress ratio is important - we don't want to start | ||
994 | * going oom. | ||
995 | * | ||
996 | * A 100% value of vm_swappiness overrides this algorithm | ||
997 | * altogether. | ||
998 | */ | ||
999 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; | ||
1000 | |||
1001 | /* | ||
1002 | * If there's huge imbalance between active and inactive | ||
1003 | * (think active 100 times larger than inactive) we should | ||
1004 | * become more permissive, or the system will take too much | ||
1005 | * cpu before it start swapping during memory pressure. | ||
1006 | * Distress is about avoiding early-oom, this is about | ||
1007 | * making swappiness graceful despite setting it to low | ||
1008 | * values. | ||
1009 | * | ||
1010 | * Avoid div by zero with nr_inactive+1, and max resulting | ||
1011 | * value is vm_total_pages. | ||
1012 | */ | ||
1013 | if (scan_global_lru(sc)) { | ||
1014 | imbalance = zone_page_state(zone, NR_ACTIVE); | ||
1015 | imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; | ||
1016 | } else | ||
1017 | imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup); | ||
1018 | |||
1019 | /* | ||
1020 | * Reduce the effect of imbalance if swappiness is low, | ||
1021 | * this means for a swappiness very low, the imbalance | ||
1022 | * must be much higher than 100 for this logic to make | ||
1023 | * the difference. | ||
1024 | * | ||
1025 | * Max temporary value is vm_total_pages*100. | ||
1026 | */ | ||
1027 | imbalance *= (vm_swappiness + 1); | ||
1028 | imbalance /= 100; | ||
1029 | |||
1030 | /* | ||
1031 | * If not much of the ram is mapped, makes the imbalance | ||
1032 | * less relevant, it's high priority we refill the inactive | ||
1033 | * list with mapped pages only in presence of high ratio of | ||
1034 | * mapped pages. | ||
1035 | * | ||
1036 | * Max temporary value is vm_total_pages*100. | ||
1037 | */ | ||
1038 | imbalance *= mapped_ratio; | ||
1039 | imbalance /= 100; | ||
1040 | |||
1041 | /* apply imbalance feedback to swap_tendency */ | ||
1042 | swap_tendency += imbalance; | ||
1043 | |||
1044 | /* | ||
1045 | * Now use this metric to decide whether to start moving mapped | ||
1046 | * memory onto the inactive list. | ||
1047 | */ | ||
1048 | if (swap_tendency >= 100) | ||
1049 | reclaim_mapped = 1; | ||
1050 | |||
1051 | return reclaim_mapped; | ||
1052 | } | ||
1053 | |||
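The new helper only consolidates the heuristic that used to be inlined in shrink_active_list(); the arithmetic is unchanged. A self-contained worked model of that arithmetic follows, with invented inputs, treating sc->swappiness and vm_swappiness as equal (which holds for global reclaim, where try_to_free_pages() sets .swappiness = vm_swappiness).

#include <stdio.h>

/* Worked model of calc_reclaim_mapped(); every input below is invented.
 * In the kernel, mapped_ratio, priorities and LRU sizes come from zone or
 * mem_cgroup counters. */
int main(void)
{
	long prev_priority = 10, priority = 12;	/* near DEF_PRIORITY (12) */
	long mapped_ratio  = 40;		/* % of RAM that is mapped */
	long swappiness    = 60;		/* sc->swappiness == vm_swappiness here */
	long nr_active = 120000, nr_inactive = 3000;

	long distress = 100 >> (prev_priority < priority ? prev_priority : priority);
	long swap_tendency = mapped_ratio / 2 + distress + swappiness;

	/* Imbalance feedback: a huge active/inactive ratio pushes us toward
	 * reclaiming mapped pages, damped by swappiness and mapped_ratio. */
	long imbalance = nr_active / (nr_inactive + 1);
	imbalance = imbalance * (swappiness + 1) / 100;
	imbalance = imbalance * mapped_ratio / 100;
	swap_tendency += imbalance;

	printf("distress=%ld imbalance=%ld swap_tendency=%ld reclaim_mapped=%d\n",
	       distress, imbalance, swap_tendency, swap_tendency >= 100);
	return 0;
}

With those inputs swap_tendency stays below 100, so mapped pages are left alone; raising swappiness or letting prev_priority drop toward 0 pushes distress up and flips the decision.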
1054 | /* | ||
902 | * This moves pages from the active list to the inactive list. | 1055 | * This moves pages from the active list to the inactive list. |
903 | * | 1056 | * |
904 | * We move them the other way if the page is referenced by one or more | 1057 | * We move them the other way if the page is referenced by one or more |
@@ -915,6 +1068,8 @@ static inline int zone_is_near_oom(struct zone *zone) | |||
915 | * The downside is that we have to touch page->_count against each page. | 1068 | * The downside is that we have to touch page->_count against each page. |
916 | * But we had to alter page->flags anyway. | 1069 | * But we had to alter page->flags anyway. |
917 | */ | 1070 | */ |
1071 | |||
1072 | |||
918 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | 1073 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
919 | struct scan_control *sc, int priority) | 1074 | struct scan_control *sc, int priority) |
920 | { | 1075 | { |
@@ -928,99 +1083,21 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
928 | struct pagevec pvec; | 1083 | struct pagevec pvec; |
929 | int reclaim_mapped = 0; | 1084 | int reclaim_mapped = 0; |
930 | 1085 | ||
931 | if (sc->may_swap) { | 1086 | if (sc->may_swap) |
932 | long mapped_ratio; | 1087 | reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); |
933 | long distress; | ||
934 | long swap_tendency; | ||
935 | long imbalance; | ||
936 | |||
937 | if (zone_is_near_oom(zone)) | ||
938 | goto force_reclaim_mapped; | ||
939 | |||
940 | /* | ||
941 | * `distress' is a measure of how much trouble we're having | ||
942 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. | ||
943 | */ | ||
944 | distress = 100 >> min(zone->prev_priority, priority); | ||
945 | |||
946 | /* | ||
947 | * The point of this algorithm is to decide when to start | ||
948 | * reclaiming mapped memory instead of just pagecache. Work out | ||
949 | * how much memory | ||
950 | * is mapped. | ||
951 | */ | ||
952 | mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + | ||
953 | global_page_state(NR_ANON_PAGES)) * 100) / | ||
954 | vm_total_pages; | ||
955 | |||
956 | /* | ||
957 | * Now decide how much we really want to unmap some pages. The | ||
958 | * mapped ratio is downgraded - just because there's a lot of | ||
959 | * mapped memory doesn't necessarily mean that page reclaim | ||
960 | * isn't succeeding. | ||
961 | * | ||
962 | * The distress ratio is important - we don't want to start | ||
963 | * going oom. | ||
964 | * | ||
965 | * A 100% value of vm_swappiness overrides this algorithm | ||
966 | * altogether. | ||
967 | */ | ||
968 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; | ||
969 | |||
970 | /* | ||
971 | * If there's huge imbalance between active and inactive | ||
972 | * (think active 100 times larger than inactive) we should | ||
973 | * become more permissive, or the system will take too much | ||
974 | * cpu before it start swapping during memory pressure. | ||
975 | * Distress is about avoiding early-oom, this is about | ||
976 | * making swappiness graceful despite setting it to low | ||
977 | * values. | ||
978 | * | ||
979 | * Avoid div by zero with nr_inactive+1, and max resulting | ||
980 | * value is vm_total_pages. | ||
981 | */ | ||
982 | imbalance = zone_page_state(zone, NR_ACTIVE); | ||
983 | imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; | ||
984 | |||
985 | /* | ||
986 | * Reduce the effect of imbalance if swappiness is low, | ||
987 | * this means for a swappiness very low, the imbalance | ||
988 | * must be much higher than 100 for this logic to make | ||
989 | * the difference. | ||
990 | * | ||
991 | * Max temporary value is vm_total_pages*100. | ||
992 | */ | ||
993 | imbalance *= (vm_swappiness + 1); | ||
994 | imbalance /= 100; | ||
995 | |||
996 | /* | ||
997 | * If not much of the ram is mapped, makes the imbalance | ||
998 | * less relevant, it's high priority we refill the inactive | ||
999 | * list with mapped pages only in presence of high ratio of | ||
1000 | * mapped pages. | ||
1001 | * | ||
1002 | * Max temporary value is vm_total_pages*100. | ||
1003 | */ | ||
1004 | imbalance *= mapped_ratio; | ||
1005 | imbalance /= 100; | ||
1006 | |||
1007 | /* apply imbalance feedback to swap_tendency */ | ||
1008 | swap_tendency += imbalance; | ||
1009 | |||
1010 | /* | ||
1011 | * Now use this metric to decide whether to start moving mapped | ||
1012 | * memory onto the inactive list. | ||
1013 | */ | ||
1014 | if (swap_tendency >= 100) | ||
1015 | force_reclaim_mapped: | ||
1016 | reclaim_mapped = 1; | ||
1017 | } | ||
1018 | 1088 | ||
1019 | lru_add_drain(); | 1089 | lru_add_drain(); |
1020 | spin_lock_irq(&zone->lru_lock); | 1090 | spin_lock_irq(&zone->lru_lock); |
1021 | pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, | 1091 | pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, |
1022 | &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE); | 1092 | ISOLATE_ACTIVE, zone, |
1023 | zone->pages_scanned += pgscanned; | 1093 | sc->mem_cgroup, 1); |
1094 | /* | ||
1095 | * zone->pages_scanned is used to detect a zone's OOM state; | ||
1096 | * mem_cgroup remembers nr_scan by itself. | ||
1097 | */ | ||
1098 | if (scan_global_lru(sc)) | ||
1099 | zone->pages_scanned += pgscanned; | ||
1100 | |||
1024 | __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); | 1101 | __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); |
1025 | spin_unlock_irq(&zone->lru_lock); | 1102 | spin_unlock_irq(&zone->lru_lock); |
1026 | 1103 | ||
@@ -1031,7 +1108,7 @@ force_reclaim_mapped: | |||
1031 | if (page_mapped(page)) { | 1108 | if (page_mapped(page)) { |
1032 | if (!reclaim_mapped || | 1109 | if (!reclaim_mapped || |
1033 | (total_swap_pages == 0 && PageAnon(page)) || | 1110 | (total_swap_pages == 0 && PageAnon(page)) || |
1034 | page_referenced(page, 0)) { | 1111 | page_referenced(page, 0, sc->mem_cgroup)) { |
1035 | list_add(&page->lru, &l_active); | 1112 | list_add(&page->lru, &l_active); |
1036 | continue; | 1113 | continue; |
1037 | } | 1114 | } |
@@ -1051,6 +1128,7 @@ force_reclaim_mapped: | |||
1051 | ClearPageActive(page); | 1128 | ClearPageActive(page); |
1052 | 1129 | ||
1053 | list_move(&page->lru, &zone->inactive_list); | 1130 | list_move(&page->lru, &zone->inactive_list); |
1131 | mem_cgroup_move_lists(page_get_page_cgroup(page), false); | ||
1054 | pgmoved++; | 1132 | pgmoved++; |
1055 | if (!pagevec_add(&pvec, page)) { | 1133 | if (!pagevec_add(&pvec, page)) { |
1056 | __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); | 1134 | __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); |
@@ -1079,6 +1157,7 @@ force_reclaim_mapped: | |||
1079 | SetPageLRU(page); | 1157 | SetPageLRU(page); |
1080 | VM_BUG_ON(!PageActive(page)); | 1158 | VM_BUG_ON(!PageActive(page)); |
1081 | list_move(&page->lru, &zone->active_list); | 1159 | list_move(&page->lru, &zone->active_list); |
1160 | mem_cgroup_move_lists(page_get_page_cgroup(page), true); | ||
1082 | pgmoved++; | 1161 | pgmoved++; |
1083 | if (!pagevec_add(&pvec, page)) { | 1162 | if (!pagevec_add(&pvec, page)) { |
1084 | __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); | 1163 | __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); |
@@ -1108,25 +1187,39 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1108 | unsigned long nr_to_scan; | 1187 | unsigned long nr_to_scan; |
1109 | unsigned long nr_reclaimed = 0; | 1188 | unsigned long nr_reclaimed = 0; |
1110 | 1189 | ||
1111 | /* | 1190 | if (scan_global_lru(sc)) { |
1112 | * Add one to `nr_to_scan' just to make sure that the kernel will | 1191 | /* |
1113 | * slowly sift through the active list. | 1192 | * Add one to nr_to_scan just to make sure that the kernel |
1114 | */ | 1193 | * will slowly sift through the active list. |
1115 | zone->nr_scan_active += | 1194 | */ |
1116 | (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; | 1195 | zone->nr_scan_active += |
1117 | nr_active = zone->nr_scan_active; | 1196 | (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; |
1118 | if (nr_active >= sc->swap_cluster_max) | 1197 | nr_active = zone->nr_scan_active; |
1119 | zone->nr_scan_active = 0; | 1198 | zone->nr_scan_inactive += |
1120 | else | 1199 | (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; |
1121 | nr_active = 0; | 1200 | nr_inactive = zone->nr_scan_inactive; |
1201 | if (nr_inactive >= sc->swap_cluster_max) | ||
1202 | zone->nr_scan_inactive = 0; | ||
1203 | else | ||
1204 | nr_inactive = 0; | ||
1205 | |||
1206 | if (nr_active >= sc->swap_cluster_max) | ||
1207 | zone->nr_scan_active = 0; | ||
1208 | else | ||
1209 | nr_active = 0; | ||
1210 | } else { | ||
1211 | /* | ||
1212 | * This reclaim occurs not because of a zone memory shortage | ||
1213 | * but because the memory controller has hit its limit, so | ||
1214 | * don't modify the zone's reclaim-related data. | ||
1215 | */ | ||
1216 | nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup, | ||
1217 | zone, priority); | ||
1218 | |||
1219 | nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup, | ||
1220 | zone, priority); | ||
1221 | } | ||
1122 | 1222 | ||
1123 | zone->nr_scan_inactive += | ||
1124 | (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; | ||
1125 | nr_inactive = zone->nr_scan_inactive; | ||
1126 | if (nr_inactive >= sc->swap_cluster_max) | ||
1127 | zone->nr_scan_inactive = 0; | ||
1128 | else | ||
1129 | nr_inactive = 0; | ||
1130 | 1223 | ||
1131 | while (nr_active || nr_inactive) { | 1224 | while (nr_active || nr_inactive) { |
1132 | if (nr_active) { | 1225 | if (nr_active) { |
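For the global-LRU branch above, the scan target is accumulated rather than computed fresh each pass: every call adds (lru_size >> priority) + 1 to the per-zone counter, and a scan is only issued once that counter reaches sc->swap_cluster_max. A small arithmetic model with invented numbers:

#include <stdio.h>

/* Model of the nr_scan accumulation in shrink_zone(); numbers are invented. */
int main(void)
{
	unsigned long nr_inactive_lru = 100000;	/* pages on the inactive list */
	unsigned long swap_cluster_max = 32;
	unsigned long nr_scan_inactive = 0;	/* plays zone->nr_scan_inactive */
	int priority;

	for (priority = 12; priority >= 10; priority--) {
		nr_scan_inactive += (nr_inactive_lru >> priority) + 1;
		if (nr_scan_inactive >= swap_cluster_max) {
			printf("priority %d: scan %lu pages, reset accumulator\n",
			       priority, nr_scan_inactive);
			nr_scan_inactive = 0;
		} else {
			printf("priority %d: only %lu queued, skip scanning\n",
			       priority, nr_scan_inactive);
		}
	}
	return 0;
}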
@@ -1171,25 +1264,39 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
1171 | unsigned long nr_reclaimed = 0; | 1264 | unsigned long nr_reclaimed = 0; |
1172 | int i; | 1265 | int i; |
1173 | 1266 | ||
1267 | |||
1174 | sc->all_unreclaimable = 1; | 1268 | sc->all_unreclaimable = 1; |
1175 | for (i = 0; zones[i] != NULL; i++) { | 1269 | for (i = 0; zones[i] != NULL; i++) { |
1176 | struct zone *zone = zones[i]; | 1270 | struct zone *zone = zones[i]; |
1177 | 1271 | ||
1178 | if (!populated_zone(zone)) | 1272 | if (!populated_zone(zone)) |
1179 | continue; | 1273 | continue; |
1274 | /* | ||
1275 | * Make sure memory controller reclaiming has only a small | ||
1276 | * influence on the global LRU. | ||
1277 | */ | ||
1278 | if (scan_global_lru(sc)) { | ||
1279 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | ||
1280 | continue; | ||
1281 | note_zone_scanning_priority(zone, priority); | ||
1180 | 1282 | ||
1181 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1283 | if (zone_is_all_unreclaimable(zone) && |
1182 | continue; | 1284 | priority != DEF_PRIORITY) |
1183 | 1285 | continue; /* Let kswapd poll it */ | |
1184 | note_zone_scanning_priority(zone, priority); | 1286 | sc->all_unreclaimable = 0; |
1185 | 1287 | } else { | |
1186 | if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY) | 1288 | /* |
1187 | continue; /* Let kswapd poll it */ | 1289 | * Ignore cpuset limitation here. We just want to reduce |
1188 | 1290 | * # of used pages by us regardless of memory shortage. | |
1189 | sc->all_unreclaimable = 0; | 1291 | */ |
1292 | sc->all_unreclaimable = 0; | ||
1293 | mem_cgroup_note_reclaim_priority(sc->mem_cgroup, | ||
1294 | priority); | ||
1295 | } | ||
1190 | 1296 | ||
1191 | nr_reclaimed += shrink_zone(priority, zone, sc); | 1297 | nr_reclaimed += shrink_zone(priority, zone, sc); |
1192 | } | 1298 | } |
1299 | |||
1193 | return nr_reclaimed; | 1300 | return nr_reclaimed; |
1194 | } | 1301 | } |
1195 | 1302 | ||
@@ -1206,7 +1313,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
1206 | * holds filesystem locks which prevent writeout this might not work, and the | 1313 | * holds filesystem locks which prevent writeout this might not work, and the |
1207 | * allocation attempt will fail. | 1314 | * allocation attempt will fail. |
1208 | */ | 1315 | */ |
1209 | unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) | 1316 | static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, |
1317 | struct scan_control *sc) | ||
1210 | { | 1318 | { |
1211 | int priority; | 1319 | int priority; |
1212 | int ret = 0; | 1320 | int ret = 0; |
@@ -1215,39 +1323,43 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) | |||
1215 | struct reclaim_state *reclaim_state = current->reclaim_state; | 1323 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1216 | unsigned long lru_pages = 0; | 1324 | unsigned long lru_pages = 0; |
1217 | int i; | 1325 | int i; |
1218 | struct scan_control sc = { | ||
1219 | .gfp_mask = gfp_mask, | ||
1220 | .may_writepage = !laptop_mode, | ||
1221 | .swap_cluster_max = SWAP_CLUSTER_MAX, | ||
1222 | .may_swap = 1, | ||
1223 | .swappiness = vm_swappiness, | ||
1224 | .order = order, | ||
1225 | }; | ||
1226 | |||
1227 | count_vm_event(ALLOCSTALL); | ||
1228 | 1326 | ||
1229 | for (i = 0; zones[i] != NULL; i++) { | 1327 | if (scan_global_lru(sc)) |
1230 | struct zone *zone = zones[i]; | 1328 | count_vm_event(ALLOCSTALL); |
1329 | /* | ||
1330 | * mem_cgroup will not do shrink_slab. | ||
1331 | */ | ||
1332 | if (scan_global_lru(sc)) { | ||
1333 | for (i = 0; zones[i] != NULL; i++) { | ||
1334 | struct zone *zone = zones[i]; | ||
1231 | 1335 | ||
1232 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1336 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1233 | continue; | 1337 | continue; |
1234 | 1338 | ||
1235 | lru_pages += zone_page_state(zone, NR_ACTIVE) | 1339 | lru_pages += zone_page_state(zone, NR_ACTIVE) |
1236 | + zone_page_state(zone, NR_INACTIVE); | 1340 | + zone_page_state(zone, NR_INACTIVE); |
1341 | } | ||
1237 | } | 1342 | } |
1238 | 1343 | ||
1239 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 1344 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
1240 | sc.nr_scanned = 0; | 1345 | sc->nr_scanned = 0; |
1346 | sc->nr_io_pages = 0; | ||
1241 | if (!priority) | 1347 | if (!priority) |
1242 | disable_swap_token(); | 1348 | disable_swap_token(); |
1243 | nr_reclaimed += shrink_zones(priority, zones, &sc); | 1349 | nr_reclaimed += shrink_zones(priority, zones, sc); |
1244 | shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); | 1350 | /* |
1245 | if (reclaim_state) { | 1351 | * Don't shrink slabs when reclaiming memory from |
1246 | nr_reclaimed += reclaim_state->reclaimed_slab; | 1352 | * over limit cgroups |
1247 | reclaim_state->reclaimed_slab = 0; | 1353 | */ |
1354 | if (scan_global_lru(sc)) { | ||
1355 | shrink_slab(sc->nr_scanned, gfp_mask, lru_pages); | ||
1356 | if (reclaim_state) { | ||
1357 | nr_reclaimed += reclaim_state->reclaimed_slab; | ||
1358 | reclaim_state->reclaimed_slab = 0; | ||
1359 | } | ||
1248 | } | 1360 | } |
1249 | total_scanned += sc.nr_scanned; | 1361 | total_scanned += sc->nr_scanned; |
1250 | if (nr_reclaimed >= sc.swap_cluster_max) { | 1362 | if (nr_reclaimed >= sc->swap_cluster_max) { |
1251 | ret = 1; | 1363 | ret = 1; |
1252 | goto out; | 1364 | goto out; |
1253 | } | 1365 | } |
@@ -1259,18 +1371,19 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) | |||
1259 | * that's undesirable in laptop mode, where we *want* lumpy | 1371 | * that's undesirable in laptop mode, where we *want* lumpy |
1260 | * writeout. So in laptop mode, write out the whole world. | 1372 | * writeout. So in laptop mode, write out the whole world. |
1261 | */ | 1373 | */ |
1262 | if (total_scanned > sc.swap_cluster_max + | 1374 | if (total_scanned > sc->swap_cluster_max + |
1263 | sc.swap_cluster_max / 2) { | 1375 | sc->swap_cluster_max / 2) { |
1264 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); | 1376 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); |
1265 | sc.may_writepage = 1; | 1377 | sc->may_writepage = 1; |
1266 | } | 1378 | } |
1267 | 1379 | ||
1268 | /* Take a nap, wait for some writeback to complete */ | 1380 | /* Take a nap, wait for some writeback to complete */ |
1269 | if (sc.nr_scanned && priority < DEF_PRIORITY - 2) | 1381 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2 && |
1382 | sc->nr_io_pages > sc->swap_cluster_max) | ||
1270 | congestion_wait(WRITE, HZ/10); | 1383 | congestion_wait(WRITE, HZ/10); |
1271 | } | 1384 | } |
1272 | /* top priority shrink_caches still had more to do? don't OOM, then */ | 1385 | /* top priority shrink_caches still had more to do? don't OOM, then */ |
1273 | if (!sc.all_unreclaimable) | 1386 | if (!sc->all_unreclaimable && scan_global_lru(sc)) |
1274 | ret = 1; | 1387 | ret = 1; |
1275 | out: | 1388 | out: |
1276 | /* | 1389 | /* |
@@ -1282,17 +1395,63 @@ out: | |||
1282 | */ | 1395 | */ |
1283 | if (priority < 0) | 1396 | if (priority < 0) |
1284 | priority = 0; | 1397 | priority = 0; |
1285 | for (i = 0; zones[i] != NULL; i++) { | ||
1286 | struct zone *zone = zones[i]; | ||
1287 | 1398 | ||
1288 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1399 | if (scan_global_lru(sc)) { |
1289 | continue; | 1400 | for (i = 0; zones[i] != NULL; i++) { |
1401 | struct zone *zone = zones[i]; | ||
1402 | |||
1403 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | ||
1404 | continue; | ||
1405 | |||
1406 | zone->prev_priority = priority; | ||
1407 | } | ||
1408 | } else | ||
1409 | mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); | ||
1290 | 1410 | ||
1291 | zone->prev_priority = priority; | ||
1292 | } | ||
1293 | return ret; | 1411 | return ret; |
1294 | } | 1412 | } |
1295 | 1413 | ||
1414 | unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) | ||
1415 | { | ||
1416 | struct scan_control sc = { | ||
1417 | .gfp_mask = gfp_mask, | ||
1418 | .may_writepage = !laptop_mode, | ||
1419 | .swap_cluster_max = SWAP_CLUSTER_MAX, | ||
1420 | .may_swap = 1, | ||
1421 | .swappiness = vm_swappiness, | ||
1422 | .order = order, | ||
1423 | .mem_cgroup = NULL, | ||
1424 | .isolate_pages = isolate_pages_global, | ||
1425 | }; | ||
1426 | |||
1427 | return do_try_to_free_pages(zones, gfp_mask, &sc); | ||
1428 | } | ||
1429 | |||
1430 | #ifdef CONFIG_CGROUP_MEM_CONT | ||
1431 | |||
1432 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | ||
1433 | gfp_t gfp_mask) | ||
1434 | { | ||
1435 | struct scan_control sc = { | ||
1436 | .gfp_mask = gfp_mask, | ||
1437 | .may_writepage = !laptop_mode, | ||
1438 | .may_swap = 1, | ||
1439 | .swap_cluster_max = SWAP_CLUSTER_MAX, | ||
1440 | .swappiness = vm_swappiness, | ||
1441 | .order = 0, | ||
1442 | .mem_cgroup = mem_cont, | ||
1443 | .isolate_pages = mem_cgroup_isolate_pages, | ||
1444 | }; | ||
1445 | struct zone **zones; | ||
1446 | int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE); | ||
1447 | |||
1448 | zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones; | ||
1449 | if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) | ||
1450 | return 1; | ||
1451 | return 0; | ||
1452 | } | ||
1453 | #endif | ||
1454 | |||
1296 | /* | 1455 | /* |
1297 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1456 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1298 | * they are all at pages_high. | 1457 | * they are all at pages_high. |
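try_to_free_mem_cgroup_pages() gives the memory controller its own entry point into the shared do_try_to_free_pages() machinery; the controller side (in mm/memcontrol.c, not shown here) is expected to invoke it when a charge would push a group over its limit. The sketch below is only a guess at the shape of such a retry loop, with invented names and numbers, not the actual memcontrol.c code.

#include <stdbool.h>
#include <stdio.h>

/* Guessed retry policy only; invented names and numbers.  The real charge
 * path lives in mm/memcontrol.c and calls try_to_free_mem_cgroup_pages(). */
#define MODEL_RETRIES 3

static bool model_over_limit(long usage, long limit) { return usage > limit; }

/* Stand-in for a reclaim pass: pretend each call frees 32 "pages". */
static long model_reclaim(long usage) { return usage - 32; }

int main(void)
{
	long limit = 100, usage = 160;
	int tries = MODEL_RETRIES;

	while (model_over_limit(usage, limit) && tries--) {
		usage = model_reclaim(usage);
		printf("reclaimed, usage now %ld/%ld\n", usage, limit);
	}
	if (model_over_limit(usage, limit))
		printf("still over limit: fail the charge\n");
	else
		printf("charge fits under the limit\n");
	return 0;
}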
@@ -1328,6 +1487,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1328 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1487 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
1329 | .swappiness = vm_swappiness, | 1488 | .swappiness = vm_swappiness, |
1330 | .order = order, | 1489 | .order = order, |
1490 | .mem_cgroup = NULL, | ||
1491 | .isolate_pages = isolate_pages_global, | ||
1331 | }; | 1492 | }; |
1332 | /* | 1493 | /* |
1333 | * temp_priority is used to remember the scanning priority at which | 1494 | * temp_priority is used to remember the scanning priority at which |
@@ -1352,6 +1513,7 @@ loop_again: | |||
1352 | if (!priority) | 1513 | if (!priority) |
1353 | disable_swap_token(); | 1514 | disable_swap_token(); |
1354 | 1515 | ||
1516 | sc.nr_io_pages = 0; | ||
1355 | all_zones_ok = 1; | 1517 | all_zones_ok = 1; |
1356 | 1518 | ||
1357 | /* | 1519 | /* |
@@ -1444,7 +1606,8 @@ loop_again: | |||
1444 | * OK, kswapd is getting into trouble. Take a nap, then take | 1606 | * OK, kswapd is getting into trouble. Take a nap, then take |
1445 | * another pass across the zones. | 1607 | * another pass across the zones. |
1446 | */ | 1608 | */ |
1447 | if (total_scanned && priority < DEF_PRIORITY - 2) | 1609 | if (total_scanned && priority < DEF_PRIORITY - 2 && |
1610 | sc.nr_io_pages > sc.swap_cluster_max) | ||
1448 | congestion_wait(WRITE, HZ/10); | 1611 | congestion_wait(WRITE, HZ/10); |
1449 | 1612 | ||
1450 | /* | 1613 | /* |
@@ -1649,6 +1812,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1649 | .swap_cluster_max = nr_pages, | 1812 | .swap_cluster_max = nr_pages, |
1650 | .may_writepage = 1, | 1813 | .may_writepage = 1, |
1651 | .swappiness = vm_swappiness, | 1814 | .swappiness = vm_swappiness, |
1815 | .isolate_pages = isolate_pages_global, | ||
1652 | }; | 1816 | }; |
1653 | 1817 | ||
1654 | current->reclaim_state = &reclaim_state; | 1818 | current->reclaim_state = &reclaim_state; |
@@ -1834,6 +1998,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1834 | SWAP_CLUSTER_MAX), | 1998 | SWAP_CLUSTER_MAX), |
1835 | .gfp_mask = gfp_mask, | 1999 | .gfp_mask = gfp_mask, |
1836 | .swappiness = vm_swappiness, | 2000 | .swappiness = vm_swappiness, |
2001 | .isolate_pages = isolate_pages_global, | ||
1837 | }; | 2002 | }; |
1838 | unsigned long slab_reclaimable; | 2003 | unsigned long slab_reclaimable; |
1839 | 2004 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index e8d846f57774..422d960ffcd8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -21,21 +21,14 @@ EXPORT_PER_CPU_SYMBOL(vm_event_states); | |||
21 | 21 | ||
22 | static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) | 22 | static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) |
23 | { | 23 | { |
24 | int cpu = 0; | 24 | int cpu; |
25 | int i; | 25 | int i; |
26 | 26 | ||
27 | memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); | 27 | memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); |
28 | 28 | ||
29 | cpu = first_cpu(*cpumask); | 29 | for_each_cpu_mask(cpu, *cpumask) { |
30 | while (cpu < NR_CPUS) { | ||
31 | struct vm_event_state *this = &per_cpu(vm_event_states, cpu); | 30 | struct vm_event_state *this = &per_cpu(vm_event_states, cpu); |
32 | 31 | ||
33 | cpu = next_cpu(cpu, *cpumask); | ||
34 | |||
35 | if (cpu < NR_CPUS) | ||
36 | prefetch(&per_cpu(vm_event_states, cpu)); | ||
37 | |||
38 | |||
39 | for (i = 0; i < NR_VM_EVENT_ITEMS; i++) | 32 | for (i = 0; i < NR_VM_EVENT_ITEMS; i++) |
40 | ret[i] += this->event[i]; | 33 | ret[i] += this->event[i]; |
41 | } | 34 | } |
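The rewritten sum_vm_events() simply walks the online-CPU mask with for_each_cpu_mask() and accumulates each CPU's event array, dropping the manual first_cpu()/next_cpu() bookkeeping and the prefetch. A userspace model of the same accumulation over a bitmask (sizes and values invented):

#include <stdio.h>

#define MODEL_NR_CPUS 4
#define MODEL_NR_EVENTS 2

/* Per-"cpu" event counters summed over an online-cpu bitmask, mirroring the
 * simplified loop above; everything here is made up for illustration. */
static unsigned long events[MODEL_NR_CPUS][MODEL_NR_EVENTS] = {
	{10, 1}, {20, 2}, {0, 0}, {5, 7}
};

static void sum_events(unsigned long *ret, unsigned int online_mask)
{
	int cpu, i;

	for (i = 0; i < MODEL_NR_EVENTS; i++)
		ret[i] = 0;

	for (cpu = 0; cpu < MODEL_NR_CPUS; cpu++) {
		if (!(online_mask & (1u << cpu)))
			continue;	/* stands in for for_each_cpu_mask() */
		for (i = 0; i < MODEL_NR_EVENTS; i++)
			ret[i] += events[cpu][i];
	}
}

int main(void)
{
	unsigned long ret[MODEL_NR_EVENTS];

	sum_events(ret, 0xb);			/* cpus 0, 1 and 3 online */
	printf("%lu %lu\n", ret[0], ret[1]);	/* prints: 35 10 */
	return 0;
}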
@@ -284,6 +277,10 @@ EXPORT_SYMBOL(dec_zone_page_state); | |||
284 | /* | 277 | /* |
285 | * Update the zone counters for one cpu. | 278 | * Update the zone counters for one cpu. |
286 | * | 279 | * |
280 | * The cpu specified must be either the current cpu or a processor that | ||
281 | * is not online. If it is the current cpu then the execution thread must | ||
282 | * be pinned to the current cpu. | ||
283 | * | ||
287 | * Note that refresh_cpu_vm_stats strives to only access | 284 | * Note that refresh_cpu_vm_stats strives to only access |
288 | * node local memory. The per cpu pagesets on remote zones are placed | 285 | * node local memory. The per cpu pagesets on remote zones are placed |
289 | * in the memory local to the processor using that pageset. So the | 286 | * in the memory local to the processor using that pageset. So the |
@@ -299,7 +296,7 @@ void refresh_cpu_vm_stats(int cpu) | |||
299 | { | 296 | { |
300 | struct zone *zone; | 297 | struct zone *zone; |
301 | int i; | 298 | int i; |
302 | unsigned long flags; | 299 | int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; |
303 | 300 | ||
304 | for_each_zone(zone) { | 301 | for_each_zone(zone) { |
305 | struct per_cpu_pageset *p; | 302 | struct per_cpu_pageset *p; |
@@ -311,15 +308,19 @@ void refresh_cpu_vm_stats(int cpu) | |||
311 | 308 | ||
312 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 309 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
313 | if (p->vm_stat_diff[i]) { | 310 | if (p->vm_stat_diff[i]) { |
311 | unsigned long flags; | ||
312 | int v; | ||
313 | |||
314 | local_irq_save(flags); | 314 | local_irq_save(flags); |
315 | zone_page_state_add(p->vm_stat_diff[i], | 315 | v = p->vm_stat_diff[i]; |
316 | zone, i); | ||
317 | p->vm_stat_diff[i] = 0; | 316 | p->vm_stat_diff[i] = 0; |
317 | local_irq_restore(flags); | ||
318 | atomic_long_add(v, &zone->vm_stat[i]); | ||
319 | global_diff[i] += v; | ||
318 | #ifdef CONFIG_NUMA | 320 | #ifdef CONFIG_NUMA |
319 | /* 3 seconds idle till flush */ | 321 | /* 3 seconds idle till flush */ |
320 | p->expire = 3; | 322 | p->expire = 3; |
321 | #endif | 323 | #endif |
322 | local_irq_restore(flags); | ||
323 | } | 324 | } |
324 | #ifdef CONFIG_NUMA | 325 | #ifdef CONFIG_NUMA |
325 | /* | 326 | /* |
@@ -329,7 +330,7 @@ void refresh_cpu_vm_stats(int cpu) | |||
329 | * Check if there are pages remaining in this pageset | 330 | * Check if there are pages remaining in this pageset |
330 | * if not then there is nothing to expire. | 331 | * if not then there is nothing to expire. |
331 | */ | 332 | */ |
332 | if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count)) | 333 | if (!p->expire || !p->pcp.count) |
333 | continue; | 334 | continue; |
334 | 335 | ||
335 | /* | 336 | /* |
@@ -344,13 +345,14 @@ void refresh_cpu_vm_stats(int cpu) | |||
344 | if (p->expire) | 345 | if (p->expire) |
345 | continue; | 346 | continue; |
346 | 347 | ||
347 | if (p->pcp[0].count) | 348 | if (p->pcp.count) |
348 | drain_zone_pages(zone, p->pcp + 0); | 349 | drain_zone_pages(zone, &p->pcp); |
349 | |||
350 | if (p->pcp[1].count) | ||
351 | drain_zone_pages(zone, p->pcp + 1); | ||
352 | #endif | 350 | #endif |
353 | } | 351 | } |
352 | |||
353 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
354 | if (global_diff[i]) | ||
355 | atomic_long_add(global_diff[i], &vm_stat[i]); | ||
354 | } | 356 | } |
355 | 357 | ||
356 | #endif | 358 | #endif |
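The refresh_cpu_vm_stats() change narrows the irq-off window to snapshotting and clearing each per-CPU delta, applies the delta to the zone counter right away, and defers the global counters to a single pass over global_diff[] at the end. A self-contained model of that collect-then-fold pattern (plain arrays instead of per-CPU data and atomics; values invented):

#include <stdio.h>

#define NR_ITEMS 3
#define NR_ZONES 2

/* Per-"cpu", per-zone deltas and the shared counters they feed.  Purely
 * illustrative; the kernel uses per_cpu_pageset and atomic_long_t. */
static long vm_stat_diff[NR_ZONES][NR_ITEMS] = { {5, -2, 1}, {0, 7, -3} };
static long zone_vm_stat[NR_ZONES][NR_ITEMS];
static long global_vm_stat[NR_ITEMS];

static void refresh_stats(void)
{
	long global_diff[NR_ITEMS] = { 0 };
	int zone, i;

	for (zone = 0; zone < NR_ZONES; zone++) {
		for (i = 0; i < NR_ITEMS; i++) {
			long v = vm_stat_diff[zone][i];	/* snapshot... */
			if (!v)
				continue;
			vm_stat_diff[zone][i] = 0;	/* ...and clear (the irq-off window) */
			zone_vm_stat[zone][i] += v;	/* fold into the zone counter now */
			global_diff[i] += v;		/* remember for the global fold */
		}
	}

	/* One pass over the global counters instead of one update per delta. */
	for (i = 0; i < NR_ITEMS; i++)
		if (global_diff[i])
			global_vm_stat[i] += global_diff[i];
}

int main(void)
{
	refresh_stats();
	printf("global: %ld %ld %ld\n",
	       global_vm_stat[0], global_vm_stat[1], global_vm_stat[2]);
	return 0;
}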
@@ -681,20 +683,17 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
681 | "\n pagesets"); | 683 | "\n pagesets"); |
682 | for_each_online_cpu(i) { | 684 | for_each_online_cpu(i) { |
683 | struct per_cpu_pageset *pageset; | 685 | struct per_cpu_pageset *pageset; |
684 | int j; | ||
685 | 686 | ||
686 | pageset = zone_pcp(zone, i); | 687 | pageset = zone_pcp(zone, i); |
687 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | 688 | seq_printf(m, |
688 | seq_printf(m, | 689 | "\n cpu: %i" |
689 | "\n cpu: %i pcp: %i" | 690 | "\n count: %i" |
690 | "\n count: %i" | 691 | "\n high: %i" |
691 | "\n high: %i" | 692 | "\n batch: %i", |
692 | "\n batch: %i", | 693 | i, |
693 | i, j, | 694 | pageset->pcp.count, |
694 | pageset->pcp[j].count, | 695 | pageset->pcp.high, |
695 | pageset->pcp[j].high, | 696 | pageset->pcp.batch); |
696 | pageset->pcp[j].batch); | ||
697 | } | ||
698 | #ifdef CONFIG_SMP | 697 | #ifdef CONFIG_SMP |
699 | seq_printf(m, "\n vm stats threshold: %d", | 698 | seq_printf(m, "\n vm stats threshold: %d", |
700 | pageset->stat_threshold); | 699 | pageset->stat_threshold); |