aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 18:20:36 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 18:20:36 -0400
commit1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree0bba044c4ce775e45a88a51686b5d9f90697ea9d /mm
Linux-2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'mm')
-rw-r--r--mm/Makefile20
-rw-r--r--mm/bootmem.c400
-rw-r--r--mm/fadvise.c111
-rw-r--r--mm/filemap.c2306
-rw-r--r--mm/fremap.c256
-rw-r--r--mm/highmem.c607
-rw-r--r--mm/hugetlb.c260
-rw-r--r--mm/internal.h13
-rw-r--r--mm/madvise.c242
-rw-r--r--mm/memory.c2165
-rw-r--r--mm/mempolicy.c1138
-rw-r--r--mm/mempool.c290
-rw-r--r--mm/mincore.c191
-rw-r--r--mm/mlock.c253
-rw-r--r--mm/mmap.c2082
-rw-r--r--mm/mprotect.c282
-rw-r--r--mm/mremap.c426
-rw-r--r--mm/msync.c236
-rw-r--r--mm/nommu.c1180
-rw-r--r--mm/oom_kill.c292
-rw-r--r--mm/page-writeback.c819
-rw-r--r--mm/page_alloc.c2220
-rw-r--r--mm/page_io.c160
-rw-r--r--mm/pdflush.c228
-rw-r--r--mm/prio_tree.c207
-rw-r--r--mm/readahead.c557
-rw-r--r--mm/rmap.c862
-rw-r--r--mm/shmem.c2326
-rw-r--r--mm/slab.c3060
-rw-r--r--mm/swap.c485
-rw-r--r--mm/swap_state.c382
-rw-r--r--mm/swapfile.c1672
-rw-r--r--mm/thrash.c102
-rw-r--r--mm/tiny-shmem.c122
-rw-r--r--mm/truncate.c336
-rw-r--r--mm/vmalloc.c588
-rw-r--r--mm/vmscan.c1311
37 files changed, 28187 insertions, 0 deletions
diff --git a/mm/Makefile b/mm/Makefile
new file mode 100644
index 00000000000..097408064f6
--- /dev/null
+++ b/mm/Makefile
@@ -0,0 +1,20 @@
1#
2# Makefile for the linux memory manager.
3#
4
# mmu-y starts out as the no-MMU implementation.  When CONFIG_MMU expands
# to "y" the second assignment targets the same variable (mmu-y) and
# replaces nommu.o with the MMU-only objects.
5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o
9
# Objects that are always built; $(mmu-y) pulls in whichever set was
# selected above.
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 page_alloc.o page-writeback.o pdflush.o \
12 readahead.o slab.o swap.o truncate.o vmscan.o \
13 prio_tree.o $(mmu-y)
14
# Optional objects, each added only when its config symbol selects it.
15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
16obj-$(CONFIG_HUGETLBFS) += hugetlb.o
17obj-$(CONFIG_NUMA) += mempolicy.o
18obj-$(CONFIG_SHMEM) += shmem.o
19obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
20
diff --git a/mm/bootmem.c b/mm/bootmem.c
new file mode 100644
index 00000000000..260e703850d
--- /dev/null
+++ b/mm/bootmem.c
@@ -0,0 +1,400 @@
1/*
2 * linux/mm/bootmem.c
3 *
4 * Copyright (C) 1999 Ingo Molnar
5 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
6 *
7 * simple boot-time physical memory area allocator and
8 * free memory collector. It's used to deal with reserved
9 * system memory and memory holes as well.
10 */
11
12#include <linux/mm.h>
13#include <linux/kernel_stat.h>
14#include <linux/swap.h>
15#include <linux/interrupt.h>
16#include <linux/init.h>
17#include <linux/bootmem.h>
18#include <linux/mmzone.h>
19#include <linux/module.h>
20#include <asm/dma.h>
21#include <asm/io.h>
22#include "internal.h"
23
24/*
25 * Access to this subsystem has to be serialized externally. (this is
26 * true for the boot process anyway)
27 */
28unsigned long max_low_pfn;
29unsigned long min_low_pfn;
30unsigned long max_pfn;
31
32EXPORT_SYMBOL(max_pfn); /* This is exported so
33 * dma_get_required_mask(), which uses
34 * it, can be an inline function */
35
36/* return the number of _pages_ that will be allocated for the boot bitmap */
37unsigned long __init bootmem_bootmap_pages (unsigned long pages)
38{
39 unsigned long mapsize;
40
41 mapsize = (pages+7)/8;
42 mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK;
43 mapsize >>= PAGE_SHIFT;
44
45 return mapsize;
46}
47
48/*
49 * Called once to set up the allocator itself.
50 */
51static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
52 unsigned long mapstart, unsigned long start, unsigned long end)
53{
54 bootmem_data_t *bdata = pgdat->bdata;
55 unsigned long mapsize = ((end - start)+7)/8;
56
57 pgdat->pgdat_next = pgdat_list;
58 pgdat_list = pgdat;
59
60 mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);
61 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
62 bdata->node_boot_start = (start << PAGE_SHIFT);
63 bdata->node_low_pfn = end;
64
65 /*
66 * Initially all pages are reserved - setup_arch() has to
67 * register free RAM areas explicitly.
68 */
69 memset(bdata->node_bootmem_map, 0xff, mapsize);
70
71 return mapsize;
72}
73
74/*
75 * Marks a particular physical memory range as unallocatable. Usable RAM
76 * might be used for boot-time allocations - or it might get added
77 * to the free page pool later on.
78 */
79static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
80{
81 unsigned long i;
82 /*
83 * round up, partially reserved pages are considered
84 * fully reserved.
85 */
86 unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE;
87 unsigned long eidx = (addr + size - bdata->node_boot_start +
88 PAGE_SIZE-1)/PAGE_SIZE;
89 unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE;
90
91 BUG_ON(!size);
92 BUG_ON(sidx >= eidx);
93 BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn);
94 BUG_ON(end > bdata->node_low_pfn);
95
96 for (i = sidx; i < eidx; i++)
97 if (test_and_set_bit(i, bdata->node_bootmem_map)) {
98#ifdef CONFIG_DEBUG_BOOTMEM
99 printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
100#endif
101 }
102}
103
104static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
105{
106 unsigned long i;
107 unsigned long start;
108 /*
109 * round down end of usable mem, partially free pages are
110 * considered reserved.
111 */
112 unsigned long sidx;
113 unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE;
114 unsigned long end = (addr + size)/PAGE_SIZE;
115
116 BUG_ON(!size);
117 BUG_ON(end > bdata->node_low_pfn);
118
119 if (addr < bdata->last_success)
120 bdata->last_success = addr;
121
122 /*
123 * Round up the beginning of the address.
124 */
125 start = (addr + PAGE_SIZE-1) / PAGE_SIZE;
126 sidx = start - (bdata->node_boot_start/PAGE_SIZE);
127
128 for (i = sidx; i < eidx; i++) {
129 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
130 BUG();
131 }
132}
133
134/*
135 * We 'merge' subsequent allocations to save space. We might 'lose'
136 * some fraction of a page if allocations cannot be satisfied due to
137 * size constraints on boxes where there is physical RAM space
138 * fragmentation - in these cases (mostly large memory boxes) this
139 * is not a problem.
140 *
141 * On low memory boxes we get it right in 100% of the cases.
142 *
143 * alignment has to be a power of 2 value.
144 *
145 * NOTE: This function is _not_ reentrant.
146 */
/*
 * Try to carve 'size' bytes out of the node described by 'bdata'.
 *
 * bdata: per-node boot allocator state (bitmap, start address, hints)
 * size:  number of bytes wanted; zero is a BUG
 * align: required alignment in bytes; must satisfy align & (align-1) == 0
 * goal:  preferred lowest physical address; only honoured when it lies
 *        inside this node
 *
 * Returns the virtual address of zeroed memory, or NULL when no run of
 * free pages large enough was found in this node.
 */
147static void * __init
148__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
149 unsigned long align, unsigned long goal)
150{
151 unsigned long offset, remaining_size, areasize, preferred;
152 unsigned long i, start = 0, incr, eidx;
153 void *ret;
154
155 if(!size) {
156 printk("__alloc_bootmem_core(): zero-sized request\n");
157 BUG();
158 }
159 BUG_ON(align & (align-1));
160
 /* eidx: number of bitmap bits (page frames) this node covers */
161 eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
 /*
  * offset: distance from node_boot_start up to the next 'align'
  * boundary, computed in bytes and converted to pages below.
  */
162 offset = 0;
163 if (align &&
164 (bdata->node_boot_start & (align - 1UL)) != 0)
165 offset = (align - (bdata->node_boot_start & (align - 1UL)));
166 offset >>= PAGE_SHIFT;
167
168 /*
169 * We try to allocate bootmem pages above 'goal'
170 * first, then we try to allocate lower pages.
171 */
172 if (goal && (goal >= bdata->node_boot_start) &&
173 ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {
174 preferred = goal - bdata->node_boot_start;
175
 /* start no lower than the place the previous allocation succeeded */
176 if (bdata->last_success >= preferred)
177 preferred = bdata->last_success;
178 } else
179 preferred = 0;
180
 /* align the starting point, then turn it into a bitmap index */
181 preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT;
182 preferred += offset;
 /* areasize: request size in whole pages, rounded up */
183 areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
 /* scan step in pages; the GNU "?:" form yields 1 when align is below a page */
184 incr = align >> PAGE_SHIFT ? : 1;
185
186restart_scan:
187 for (i = preferred; i < eidx; i += incr) {
188 unsigned long j;
 /* jump to the next clear bit, realign, and re-test the aligned bit */
189 i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
190 i = ALIGN(i, incr);
 /*
  * NOTE(review): i is not re-checked against eidx before this
  * test_bit(); this appears to rely on bits at/after eidx reading
  * as set - confirm.
  */
191 if (test_bit(i, bdata->node_bootmem_map))
192 continue;
 /* the remaining areasize-1 pages must be free and inside the node */
193 for (j = i + 1; j < i + areasize; ++j) {
194 if (j >= eidx)
195 goto fail_block;
196 if (test_bit (j, bdata->node_bootmem_map))
197 goto fail_block;
198 }
199 start = i;
200 goto found;
201 fail_block:
 /* continue from the page that blocked us; the loop adds incr on top */
202 i = ALIGN(j, incr);
203 }
204
 /* nothing at or above 'preferred': retry once from the lowest aligned page */
205 if (preferred > offset) {
206 preferred = offset;
207 goto restart_scan;
208 }
209 return NULL;
210
211found:
212 bdata->last_success = start << PAGE_SHIFT;
213 BUG_ON(start >= eidx);
214
215 /*
216 * Is the next page of the previous allocation-end the start
217 * of this allocation's buffer? If yes then we can 'merge'
218 * the previous partial page with this allocation.
219 */
220 if (align < PAGE_SIZE &&
221 bdata->last_offset && bdata->last_pos+1 == start) {
 /* first aligned byte still unused in the previous allocation's last page */
222 offset = (bdata->last_offset+align-1) & ~(align-1);
223 BUG_ON(offset > PAGE_SIZE);
224 remaining_size = PAGE_SIZE-offset;
225 if (size < remaining_size) {
 /* fits entirely in the already-reserved page: no new bits to set */
226 areasize = 0;
227 /* last_pos unchanged */
228 bdata->last_offset = offset+size;
229 ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
230 bdata->node_boot_start);
231 } else {
 /* use the tail of the old page and reserve pages for the rest */
232 remaining_size = size - remaining_size;
233 areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE;
234 ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
235 bdata->node_boot_start);
236 bdata->last_pos = start+areasize-1;
237 bdata->last_offset = remaining_size;
238 }
 /* keep only the offset within the page */
239 bdata->last_offset &= ~PAGE_MASK;
240 } else {
 /* no merge: allocation starts on a fresh page */
241 bdata->last_pos = start + areasize - 1;
242 bdata->last_offset = size & ~PAGE_MASK;
243 ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
244 }
245
246 /*
247 * Reserve the area now:
248 */
249 for (i = start; i < start+areasize; i++)
250 if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
251 BUG();
 /* callers always get zeroed memory */
252 memset(ret, 0, size);
253 return ret;
254}
255
256static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
257{
258 struct page *page;
259 bootmem_data_t *bdata = pgdat->bdata;
260 unsigned long i, count, total = 0;
261 unsigned long idx;
262 unsigned long *map;
263 int gofast = 0;
264
265 BUG_ON(!bdata->node_bootmem_map);
266
267 count = 0;
268 /* first extant page of the node */
269 page = virt_to_page(phys_to_virt(bdata->node_boot_start));
270 idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
271 map = bdata->node_bootmem_map;
272 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
273 if (bdata->node_boot_start == 0 ||
274 ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
275 gofast = 1;
276 for (i = 0; i < idx; ) {
277 unsigned long v = ~map[i / BITS_PER_LONG];
278 if (gofast && v == ~0UL) {
279 int j, order;
280
281 count += BITS_PER_LONG;
282 __ClearPageReserved(page);
283 order = ffs(BITS_PER_LONG) - 1;
284 set_page_refs(page, order);
285 for (j = 1; j < BITS_PER_LONG; j++) {
286 if (j + 16 < BITS_PER_LONG)
287 prefetchw(page + j + 16);
288 __ClearPageReserved(page + j);
289 }
290 __free_pages(page, order);
291 i += BITS_PER_LONG;
292 page += BITS_PER_LONG;
293 } else if (v) {
294 unsigned long m;
295 for (m = 1; m && i < idx; m<<=1, page++, i++) {
296 if (v & m) {
297 count++;
298 __ClearPageReserved(page);
299 set_page_refs(page, 0);
300 __free_page(page);
301 }
302 }
303 } else {
304 i+=BITS_PER_LONG;
305 page += BITS_PER_LONG;
306 }
307 }
308 total += count;
309
310 /*
311 * Now free the allocator bitmap itself, it's not
312 * needed anymore:
313 */
314 page = virt_to_page(bdata->node_bootmem_map);
315 count = 0;
316 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
317 count++;
318 __ClearPageReserved(page);
319 set_page_count(page, 1);
320 __free_page(page);
321 }
322 total += count;
323 bdata->node_bootmem_map = NULL;
324
325 return total;
326}
327
328unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn)
329{
330 return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn));
331}
332
333void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
334{
335 reserve_bootmem_core(pgdat->bdata, physaddr, size);
336}
337
338void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
339{
340 free_bootmem_core(pgdat->bdata, physaddr, size);
341}
342
343unsigned long __init free_all_bootmem_node (pg_data_t *pgdat)
344{
345 return(free_all_bootmem_core(pgdat));
346}
347
348unsigned long __init init_bootmem (unsigned long start, unsigned long pages)
349{
350 max_low_pfn = pages;
351 min_low_pfn = start;
352 return(init_bootmem_core(NODE_DATA(0), start, 0, pages));
353}
354
355#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
356void __init reserve_bootmem (unsigned long addr, unsigned long size)
357{
358 reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
359}
360#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
361
362void __init free_bootmem (unsigned long addr, unsigned long size)
363{
364 free_bootmem_core(NODE_DATA(0)->bdata, addr, size);
365}
366
367unsigned long __init free_all_bootmem (void)
368{
369 return(free_all_bootmem_core(NODE_DATA(0)));
370}
371
372void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
373{
374 pg_data_t *pgdat = pgdat_list;
375 void *ptr;
376
377 for_each_pgdat(pgdat)
378 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
379 align, goal)))
380 return(ptr);
381
382 /*
383 * Whoops, we cannot satisfy the allocation request.
384 */
385 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
386 panic("Out of memory");
387 return NULL;
388}
389
390void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal)
391{
392 void *ptr;
393
394 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal);
395 if (ptr)
396 return (ptr);
397
398 return __alloc_bootmem(size, align, goal);
399}
400
diff --git a/mm/fadvise.c b/mm/fadvise.c
new file mode 100644
index 00000000000..57264d74b8b
--- /dev/null
+++ b/mm/fadvise.c
@@ -0,0 +1,111 @@
1/*
2 * mm/fadvise.c
3 *
4 * Copyright (C) 2002, Linus Torvalds
5 *
6 * 11Jan2003 akpm@digeo.com
7 * Initial version.
8 */
9
10#include <linux/kernel.h>
11#include <linux/file.h>
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/pagemap.h>
15#include <linux/backing-dev.h>
16#include <linux/pagevec.h>
17#include <linux/fadvise.h>
18#include <linux/syscalls.h>
19
20#include <asm/unistd.h>
21
22/*
23 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
24 * deactivate the pages and clear PG_Referenced.
25 */
26asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
27{
28 struct file *file = fget(fd);
29 struct address_space *mapping;
30 struct backing_dev_info *bdi;
31 loff_t endbyte;
32 pgoff_t start_index;
33 pgoff_t end_index;
34 unsigned long nrpages;
35 int ret = 0;
36
37 if (!file)
38 return -EBADF;
39
40 mapping = file->f_mapping;
41 if (!mapping || len < 0) {
42 ret = -EINVAL;
43 goto out;
44 }
45
46 /* Careful about overflows. Len == 0 means "as much as possible" */
47 endbyte = offset + len;
48 if (!len || endbyte < len)
49 endbyte = -1;
50
51 bdi = mapping->backing_dev_info;
52
53 switch (advice) {
54 case POSIX_FADV_NORMAL:
55 file->f_ra.ra_pages = bdi->ra_pages;
56 break;
57 case POSIX_FADV_RANDOM:
58 file->f_ra.ra_pages = 0;
59 break;
60 case POSIX_FADV_SEQUENTIAL:
61 file->f_ra.ra_pages = bdi->ra_pages * 2;
62 break;
63 case POSIX_FADV_WILLNEED:
64 case POSIX_FADV_NOREUSE:
65 if (!mapping->a_ops->readpage) {
66 ret = -EINVAL;
67 break;
68 }
69
70 /* First and last PARTIAL page! */
71 start_index = offset >> PAGE_CACHE_SHIFT;
72 end_index = (endbyte-1) >> PAGE_CACHE_SHIFT;
73
74 /* Careful about overflow on the "+1" */
75 nrpages = end_index - start_index + 1;
76 if (!nrpages)
77 nrpages = ~0UL;
78
79 ret = force_page_cache_readahead(mapping, file,
80 start_index,
81 max_sane_readahead(nrpages));
82 if (ret > 0)
83 ret = 0;
84 break;
85 case POSIX_FADV_DONTNEED:
86 if (!bdi_write_congested(mapping->backing_dev_info))
87 filemap_flush(mapping);
88
89 /* First and last FULL page! */
90 start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
91 end_index = (endbyte >> PAGE_CACHE_SHIFT);
92
93 if (end_index > start_index)
94 invalidate_mapping_pages(mapping, start_index, end_index-1);
95 break;
96 default:
97 ret = -EINVAL;
98 }
99out:
100 fput(file);
101 return ret;
102}
103
104#ifdef __ARCH_WANT_SYS_FADVISE64
105
106asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
107{
108 return sys_fadvise64_64(fd, offset, len, advice);
109}
110
111#endif
diff --git a/mm/filemap.c b/mm/filemap.c
new file mode 100644
index 00000000000..439b2bea8e3
--- /dev/null
+++ b/mm/filemap.c
@@ -0,0 +1,2306 @@
1/*
2 * linux/mm/filemap.c
3 *
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
6
7/*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
11 */
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/compiler.h>
16#include <linux/fs.h>
17#include <linux/aio.h>
18#include <linux/kernel_stat.h>
19#include <linux/mm.h>
20#include <linux/swap.h>
21#include <linux/mman.h>
22#include <linux/pagemap.h>
23#include <linux/file.h>
24#include <linux/uio.h>
25#include <linux/hash.h>
26#include <linux/writeback.h>
27#include <linux/pagevec.h>
28#include <linux/blkdev.h>
29#include <linux/security.h>
30#include <linux/syscalls.h>
31/*
32 * This is needed for the following functions:
33 * - try_to_release_page
34 * - block_invalidatepage
35 * - generic_osync_inode
36 *
37 * FIXME: remove all knowledge of the buffer layer from the core VM
38 */
39#include <linux/buffer_head.h> /* for generic_osync_inode */
40
41#include <asm/uaccess.h>
42#include <asm/mman.h>
43
44/*
45 * Shared mappings implemented 30.11.1994. It's not fully working yet,
46 * though.
47 *
48 * Shared mappings now work. 15.8.1995 Bruno.
49 *
50 * finished 'unifying' the page and buffer cache and SMP-threaded the
51 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
52 *
53 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
54 */
55
56/*
57 * Lock ordering:
58 *
59 * ->i_mmap_lock (vmtruncate)
60 * ->private_lock (__free_pte->__set_page_dirty_buffers)
61 * ->swap_list_lock
62 * ->swap_device_lock (exclusive_swap_page, others)
63 * ->mapping->tree_lock
64 *
65 * ->i_sem
66 * ->i_mmap_lock (truncate->unmap_mapping_range)
67 *
68 * ->mmap_sem
69 * ->i_mmap_lock
70 * ->page_table_lock (various places, mainly in mmap.c)
71 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
72 *
73 * ->mmap_sem
74 * ->lock_page (access_process_vm)
75 *
76 * ->mmap_sem
77 * ->i_sem (msync)
78 *
79 * ->i_sem
80 * ->i_alloc_sem (various)
81 *
82 * ->inode_lock
83 * ->sb_lock (fs/fs-writeback.c)
84 * ->mapping->tree_lock (__sync_single_inode)
85 *
86 * ->i_mmap_lock
87 * ->anon_vma.lock (vma_adjust)
88 *
89 * ->anon_vma.lock
90 * ->page_table_lock (anon_vma_prepare and various)
91 *
92 * ->page_table_lock
93 * ->swap_device_lock (try_to_unmap_one)
94 * ->private_lock (try_to_unmap_one)
95 * ->tree_lock (try_to_unmap_one)
96 * ->zone.lru_lock (follow_page->mark_page_accessed)
97 * ->private_lock (page_remove_rmap->set_page_dirty)
98 * ->tree_lock (page_remove_rmap->set_page_dirty)
99 * ->inode_lock (page_remove_rmap->set_page_dirty)
100 * ->inode_lock (zap_pte_range->set_page_dirty)
101 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
102 *
103 * ->task->proc_lock
104 * ->dcache_lock (proc_pid_lookup)
105 */
106
107/*
108 * Remove a page from the page cache and free it. Caller has to make
109 * sure the page is locked and that nobody else uses it - or that usage
110 * is safe. The caller must hold a write_lock on the mapping's tree_lock.
111 */
112void __remove_from_page_cache(struct page *page)
113{
114 struct address_space *mapping = page->mapping;
115
116 radix_tree_delete(&mapping->page_tree, page->index);
117 page->mapping = NULL;
118 mapping->nrpages--;
119 pagecache_acct(-1);
120}
121
122void remove_from_page_cache(struct page *page)
123{
124 struct address_space *mapping = page->mapping;
125
126 if (unlikely(!PageLocked(page)))
127 PAGE_BUG(page);
128
129 write_lock_irq(&mapping->tree_lock);
130 __remove_from_page_cache(page);
131 write_unlock_irq(&mapping->tree_lock);
132}
133
134static int sync_page(void *word)
135{
136 struct address_space *mapping;
137 struct page *page;
138
139 page = container_of((page_flags_t *)word, struct page, flags);
140
141 /*
142 * FIXME, fercrissake. What is this barrier here for?
143 */
144 smp_mb();
145 mapping = page_mapping(page);
146 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
147 mapping->a_ops->sync_page(page);
148 io_schedule();
149 return 0;
150}
151
152/**
153 * __filemap_fdatawrite_range - start writeback against all of a mapping's
154 * dirty pages that lie within the byte offsets <start, end>
155 * @mapping: address space structure to write
156 * @start: offset in bytes where the range starts
157 * @end : offset in bytes where the range ends
158 *
159 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
160 * opposed to a regular memory cleansing writeback. The difference between
161 * these two operations is that if a dirty page/buffer is encountered, it must
162 * be waited upon, and not just skipped over.
163 */
164static int __filemap_fdatawrite_range(struct address_space *mapping,
165 loff_t start, loff_t end, int sync_mode)
166{
167 int ret;
168 struct writeback_control wbc = {
169 .sync_mode = sync_mode,
170 .nr_to_write = mapping->nrpages * 2,
171 .start = start,
172 .end = end,
173 };
174
175 if (!mapping_cap_writeback_dirty(mapping))
176 return 0;
177
178 ret = do_writepages(mapping, &wbc);
179 return ret;
180}
181
182static inline int __filemap_fdatawrite(struct address_space *mapping,
183 int sync_mode)
184{
185 return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode);
186}
187
188int filemap_fdatawrite(struct address_space *mapping)
189{
190 return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
191}
192EXPORT_SYMBOL(filemap_fdatawrite);
193
194static int filemap_fdatawrite_range(struct address_space *mapping,
195 loff_t start, loff_t end)
196{
197 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
198}
199
200/*
201 * This is a mostly non-blocking flush. Not suitable for data-integrity
202 * purposes - I/O may not be started against all dirty pages.
203 */
204int filemap_flush(struct address_space *mapping)
205{
206 return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
207}
208EXPORT_SYMBOL(filemap_flush);
209
210/*
211 * Wait for writeback to complete against pages indexed by start->end
212 * inclusive
213 */
214static int wait_on_page_writeback_range(struct address_space *mapping,
215 pgoff_t start, pgoff_t end)
216{
217 struct pagevec pvec;
218 int nr_pages;
219 int ret = 0;
220 pgoff_t index;
221
222 if (end < start)
223 return 0;
224
225 pagevec_init(&pvec, 0);
226 index = start;
227 while ((index <= end) &&
228 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
229 PAGECACHE_TAG_WRITEBACK,
230 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
231 unsigned i;
232
233 for (i = 0; i < nr_pages; i++) {
234 struct page *page = pvec.pages[i];
235
236 /* until radix tree lookup accepts end_index */
237 if (page->index > end)
238 continue;
239
240 wait_on_page_writeback(page);
241 if (PageError(page))
242 ret = -EIO;
243 }
244 pagevec_release(&pvec);
245 cond_resched();
246 }
247
248 /* Check for outstanding write errors */
249 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
250 ret = -ENOSPC;
251 if (test_and_clear_bit(AS_EIO, &mapping->flags))
252 ret = -EIO;
253
254 return ret;
255}
256
257/*
258 * Write and wait upon all the pages in the passed range. This is a "data
259 * integrity" operation. It waits upon in-flight writeout before starting and
260 * waiting upon new writeout. If there was an IO error, return it.
261 *
262 * We need to re-take i_sem during the generic_osync_inode list walk because
263 * it is otherwise livelockable.
264 */
265int sync_page_range(struct inode *inode, struct address_space *mapping,
266 loff_t pos, size_t count)
267{
268 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
269 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
270 int ret;
271
272 if (!mapping_cap_writeback_dirty(mapping) || !count)
273 return 0;
274 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
275 if (ret == 0) {
276 down(&inode->i_sem);
277 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
278 up(&inode->i_sem);
279 }
280 if (ret == 0)
281 ret = wait_on_page_writeback_range(mapping, start, end);
282 return ret;
283}
284EXPORT_SYMBOL(sync_page_range);
285
286/*
287 * Note: Holding i_sem across sync_page_range_nolock is not a good idea
288 * as it forces O_SYNC writers to different parts of the same file
289 * to be serialised right until io completion.
290 */
291int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
292 loff_t pos, size_t count)
293{
294 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
295 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
296 int ret;
297
298 if (!mapping_cap_writeback_dirty(mapping) || !count)
299 return 0;
300 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
301 if (ret == 0)
302 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
303 if (ret == 0)
304 ret = wait_on_page_writeback_range(mapping, start, end);
305 return ret;
306}
307EXPORT_SYMBOL(sync_page_range_nolock);
308
309/**
310 * filemap_fdatawait - walk the list of under-writeback pages of the given
311 * address space and wait for all of them.
312 *
313 * @mapping: address space structure to wait for
314 */
315int filemap_fdatawait(struct address_space *mapping)
316{
317 loff_t i_size = i_size_read(mapping->host);
318
319 if (i_size == 0)
320 return 0;
321
322 return wait_on_page_writeback_range(mapping, 0,
323 (i_size - 1) >> PAGE_CACHE_SHIFT);
324}
325EXPORT_SYMBOL(filemap_fdatawait);
326
327int filemap_write_and_wait(struct address_space *mapping)
328{
329 int retval = 0;
330
331 if (mapping->nrpages) {
332 retval = filemap_fdatawrite(mapping);
333 if (retval == 0)
334 retval = filemap_fdatawait(mapping);
335 }
336 return retval;
337}
338
339int filemap_write_and_wait_range(struct address_space *mapping,
340 loff_t lstart, loff_t lend)
341{
342 int retval = 0;
343
344 if (mapping->nrpages) {
345 retval = __filemap_fdatawrite_range(mapping, lstart, lend,
346 WB_SYNC_ALL);
347 if (retval == 0)
348 retval = wait_on_page_writeback_range(mapping,
349 lstart >> PAGE_CACHE_SHIFT,
350 lend >> PAGE_CACHE_SHIFT);
351 }
352 return retval;
353}
354
355/*
356 * This function is used to add newly allocated pagecache pages:
357 * the page is new, so we can just run SetPageLocked() against it.
358 * The other page state flags were set by rmqueue().
359 *
360 * This function does not add the page to the LRU. The caller must do that.
361 */
362int add_to_page_cache(struct page *page, struct address_space *mapping,
363 pgoff_t offset, int gfp_mask)
364{
365 int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
366
367 if (error == 0) {
368 write_lock_irq(&mapping->tree_lock);
369 error = radix_tree_insert(&mapping->page_tree, offset, page);
370 if (!error) {
371 page_cache_get(page);
372 SetPageLocked(page);
373 page->mapping = mapping;
374 page->index = offset;
375 mapping->nrpages++;
376 pagecache_acct(1);
377 }
378 write_unlock_irq(&mapping->tree_lock);
379 radix_tree_preload_end();
380 }
381 return error;
382}
383
384EXPORT_SYMBOL(add_to_page_cache);
385
386int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
387 pgoff_t offset, int gfp_mask)
388{
389 int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
390 if (ret == 0)
391 lru_cache_add(page);
392 return ret;
393}
394
395/*
396 * In order to wait for pages to become available there must be
397 * waitqueues associated with pages. By using a hash table of
398 * waitqueues where the bucket discipline is to maintain all
399 * waiters on the same queue and wake all when any of the pages
400 * become available, and for the woken contexts to check to be
401 * sure the appropriate page became available, this saves space
402 * at a cost of "thundering herd" phenomena during rare hash
403 * collisions.
404 */
405static wait_queue_head_t *page_waitqueue(struct page *page)
406{
407 const struct zone *zone = page_zone(page);
408
409 return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
410}
411
412static inline void wake_up_page(struct page *page, int bit)
413{
414 __wake_up_bit(page_waitqueue(page), &page->flags, bit);
415}
416
417void fastcall wait_on_page_bit(struct page *page, int bit_nr)
418{
419 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
420
421 if (test_bit(bit_nr, &page->flags))
422 __wait_on_bit(page_waitqueue(page), &wait, sync_page,
423 TASK_UNINTERRUPTIBLE);
424}
425EXPORT_SYMBOL(wait_on_page_bit);
426
427/**
428 * unlock_page() - unlock a locked page
429 *
430 * @page: the page
431 *
432 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
433 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
434 * mechanism between PageLocked pages and PageWriteback pages is shared.
435 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
436 *
437 * The first mb is necessary to safely close the critical section opened by the
438 * TestSetPageLocked(), the second mb is necessary to enforce ordering between
439 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
440 * parallel wait_on_page_locked()).
441 */
442void fastcall unlock_page(struct page *page)
443{
444 smp_mb__before_clear_bit();
445 if (!TestClearPageLocked(page))
446 BUG();
447 smp_mb__after_clear_bit();
448 wake_up_page(page, PG_locked);
449}
450EXPORT_SYMBOL(unlock_page);
451
452/*
453 * End writeback against a page.
454 */
455void end_page_writeback(struct page *page)
456{
457 if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
458 if (!test_clear_page_writeback(page))
459 BUG();
460 }
461 smp_mb__after_clear_bit();
462 wake_up_page(page, PG_writeback);
463}
464EXPORT_SYMBOL(end_page_writeback);
465
466/*
467 * Get a lock on the page, assuming we need to sleep to get it.
468 *
469 * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
470 * random driver's requestfn sets TASK_RUNNING, we could busywait. However
471 * chances are that on the second loop, the block layer's plug list is empty,
472 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
473 */
474void fastcall __lock_page(struct page *page)
475{
476 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
477
478 __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
479 TASK_UNINTERRUPTIBLE);
480}
481EXPORT_SYMBOL(__lock_page);
482
483/*
484 * a rather lightweight function, finding and getting a reference to a
485 * hashed page atomically.
486 */
487struct page * find_get_page(struct address_space *mapping, unsigned long offset)
488{
489 struct page *page;
490
491 read_lock_irq(&mapping->tree_lock);
492 page = radix_tree_lookup(&mapping->page_tree, offset);
493 if (page)
494 page_cache_get(page);
495 read_unlock_irq(&mapping->tree_lock);
496 return page;
497}
498
499EXPORT_SYMBOL(find_get_page);
500
501/*
502 * Same as above, but trylock it instead of incrementing the count.
503 */
504struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
505{
506 struct page *page;
507
508 read_lock_irq(&mapping->tree_lock);
509 page = radix_tree_lookup(&mapping->page_tree, offset);
510 if (page && TestSetPageLocked(page))
511 page = NULL;
512 read_unlock_irq(&mapping->tree_lock);
513 return page;
514}
515
516EXPORT_SYMBOL(find_trylock_page);
517
518/**
519 * find_lock_page - locate, pin and lock a pagecache page
520 *
521 * @mapping - the address_space to search
522 * @offset - the page index
523 *
524 * Locates the desired pagecache page, locks it, increments its reference
525 * count and returns its address.
526 *
527 * Returns zero if the page was not present. find_lock_page() may sleep.
528 */
529struct page *find_lock_page(struct address_space *mapping,
530 unsigned long offset)
531{
532 struct page *page;
533
534 read_lock_irq(&mapping->tree_lock);
535repeat:
536 page = radix_tree_lookup(&mapping->page_tree, offset);
537 if (page) {
538 page_cache_get(page);
539 if (TestSetPageLocked(page)) {
540 read_unlock_irq(&mapping->tree_lock);
541 lock_page(page);
542 read_lock_irq(&mapping->tree_lock);
543
544 /* Has the page been truncated while we slept? */
545 if (page->mapping != mapping || page->index != offset) {
546 unlock_page(page);
547 page_cache_release(page);
548 goto repeat;
549 }
550 }
551 }
552 read_unlock_irq(&mapping->tree_lock);
553 return page;
554}
555
556EXPORT_SYMBOL(find_lock_page);
557
558/**
559 * find_or_create_page - locate or add a pagecache page
560 *
561 * @mapping - the page's address_space
562 * @index - the page's index into the mapping
563 * @gfp_mask - page allocation mode
564 *
565 * Locates a page in the pagecache. If the page is not present, a new page
566 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
567 * LRU list. The returned page is locked and has its reference count
568 * incremented.
569 *
570 * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
571 * allocation!
572 *
573 * find_or_create_page() returns the desired page's address, or zero on
574 * memory exhaustion.
575 */
576struct page *find_or_create_page(struct address_space *mapping,
577 unsigned long index, unsigned int gfp_mask)
578{
579 struct page *page, *cached_page = NULL;
580 int err;
581repeat:
582 page = find_lock_page(mapping, index);
583 if (!page) {
584 if (!cached_page) {
585 cached_page = alloc_page(gfp_mask);
586 if (!cached_page)
587 return NULL;
588 }
589 err = add_to_page_cache_lru(cached_page, mapping,
590 index, gfp_mask);
591 if (!err) {
592 page = cached_page;
593 cached_page = NULL;
594 } else if (err == -EEXIST)
595 goto repeat;
596 }
597 if (cached_page)
598 page_cache_release(cached_page);
599 return page;
600}
601
602EXPORT_SYMBOL(find_or_create_page);
603
604/**
605 * find_get_pages - gang pagecache lookup
606 * @mapping: The address_space to search
607 * @start: The starting page index
608 * @nr_pages: The maximum number of pages
609 * @pages: Where the resulting pages are placed
610 *
611 * find_get_pages() will search for and return a group of up to
612 * @nr_pages pages in the mapping. The pages are placed at @pages.
613 * find_get_pages() takes a reference against the returned pages.
614 *
615 * The search returns a group of mapping-contiguous pages with ascending
616 * indexes. There may be holes in the indices due to not-present pages.
617 *
618 * find_get_pages() returns the number of pages which were found.
619 */
620unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
621 unsigned int nr_pages, struct page **pages)
622{
623 unsigned int i;
624 unsigned int ret;
625
626 read_lock_irq(&mapping->tree_lock);
627 ret = radix_tree_gang_lookup(&mapping->page_tree,
628 (void **)pages, start, nr_pages);
629 for (i = 0; i < ret; i++)
630 page_cache_get(pages[i]);
631 read_unlock_irq(&mapping->tree_lock);
632 return ret;
633}
634
635/*
636 * Like find_get_pages, except we only return pages which are tagged with
637 * `tag'. We update *index to index the next page for the traversal.
638 */
639unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
640 int tag, unsigned int nr_pages, struct page **pages)
641{
642 unsigned int i;
643 unsigned int ret;
644
645 read_lock_irq(&mapping->tree_lock);
646 ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
647 (void **)pages, *index, nr_pages, tag);
648 for (i = 0; i < ret; i++)
649 page_cache_get(pages[i]);
650 if (ret)
651 *index = pages[ret - 1]->index + 1;
652 read_unlock_irq(&mapping->tree_lock);
653 return ret;
654}
655
656/*
657 * Same as grab_cache_page, but do not wait if the page is unavailable.
658 * This is intended for speculative data generators, where the data can
659 * be regenerated if the page couldn't be grabbed. This routine should
660 * be safe to call while holding the lock for another page.
661 *
662 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
663 * and deadlock against the caller's locked page.
664 */
665struct page *
666grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
667{
668 struct page *page = find_get_page(mapping, index);
669 unsigned int gfp_mask;
670
671 if (page) {
672 if (!TestSetPageLocked(page))
673 return page;
674 page_cache_release(page);
675 return NULL;
676 }
677 gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
678 page = alloc_pages(gfp_mask, 0);
679 if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
680 page_cache_release(page);
681 page = NULL;
682 }
683 return page;
684}
685
686EXPORT_SYMBOL(grab_cache_page_nowait);
687
688/*
689 * This is a generic file read routine, and uses the
690 * mapping->a_ops->readpage() function for the actual low-level
691 * stuff.
692 *
693 * This is really ugly. But the goto's actually try to clarify some
694 * of the logic when it comes to error handling etc.
695 *
696 * Note the struct file* is only passed for the use of readpage. It may be
697 * NULL.
698 */
699void do_generic_mapping_read(struct address_space *mapping,
700 struct file_ra_state *_ra,
701 struct file *filp,
702 loff_t *ppos,
703 read_descriptor_t *desc,
704 read_actor_t actor)
705{
706 struct inode *inode = mapping->host;
707 unsigned long index;
708 unsigned long end_index;
709 unsigned long offset;
710 unsigned long last_index;
711 unsigned long next_index;
712 unsigned long prev_index;
713 loff_t isize;
714 struct page *cached_page;
715 int error;
	/*
	 * Work on a private copy of the readahead state; it is written back
	 * to *_ra at the out: label.
	 */
716 struct file_ra_state ra = *_ra;
717
718 cached_page = NULL;
	/*
	 * Split *ppos into a page index and a byte offset within that page.
	 * last_index is one past the last page covered by the request
	 * (*ppos + desc->count rounded up to a page boundary).
	 */
719 index = *ppos >> PAGE_CACHE_SHIFT;
720 next_index = index;
721 prev_index = ra.prev_page;
722 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
723 offset = *ppos & ~PAGE_CACHE_MASK;
724
	/* Nothing to read from an empty file. */
725 isize = i_size_read(inode);
726 if (!isize)
727 goto out;
728
	/* end_index is the index of the page holding the file's last byte. */
729 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
	/* Each pass of this loop handles (part of) the page at 'index'. */
730 for (;;) {
731 struct page *page;
732 unsigned long nr, ret;
733
734 /* nr is the maximum number of bytes to copy from this page */
735 nr = PAGE_CACHE_SIZE;
736 if (index >= end_index) {
737 if (index > end_index)
738 goto out;
		/* Last page of the file: only the bytes up to i_size are valid. */
739 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
740 if (nr <= offset) {
741 goto out;
742 }
743 }
744 nr = nr - offset;
745
746 cond_resched();
	/*
	 * Readahead is only invoked once we reach the index returned by the
	 * previous page_cache_readahead() call (initially the start index).
	 */
747 if (index == next_index)
748 next_index = page_cache_readahead(mapping, &ra, filp,
749 index, last_index - index);
750
751find_page:
752 page = find_get_page(mapping, index);
753 if (unlikely(page == NULL)) {
754 handle_ra_miss(mapping, &ra, index);
755 goto no_cached_page;
756 }
757 if (!PageUptodate(page))
758 goto page_not_up_to_date;
759page_ok:
760
761 /* If users can be writing to this page using arbitrary
762 * virtual addresses, take care about potential aliasing
763 * before reading the page on the kernel side.
764 */
765 if (mapping_writably_mapped(mapping))
766 flush_dcache_page(page);
767
768 /*
769 * When (part of) the same page is read multiple times
770 * in succession, only mark it as accessed the first time.
771 */
772 if (prev_index != index)
773 mark_page_accessed(page);
774 prev_index = index;
775
776 /*
777 * Ok, we have the page, and it's up-to-date, so
778 * now we can copy it to user space...
779 *
780 * The actor routine returns how many bytes were actually used..
781 * NOTE! This may not be the same as how much of a user buffer
782 * we filled up (we may be padding etc), so we can only update
783 * "pos" here (the actor routine has to update the user buffer
784 * pointers and the remaining count).
785 */
786 ret = actor(desc, page, offset, nr);
	/* Advance the position, carrying any overflow of offset into index. */
787 offset += ret;
788 index += offset >> PAGE_CACHE_SHIFT;
789 offset &= ~PAGE_CACHE_MASK;
790
791 page_cache_release(page);
	/*
	 * Keep going only if the actor consumed the whole chunk and the
	 * descriptor still has bytes outstanding.
	 */
792 if (ret == nr && desc->count)
793 continue;
794 goto out;
795
796page_not_up_to_date:
797 /* Get exclusive access to the page ... */
798 lock_page(page);
799
800 /* Did it get unhashed before we got the lock? */
801 if (!page->mapping) {
802 unlock_page(page);
803 page_cache_release(page);
		/* Restart this index from the top of the loop. */
804 continue;
805 }
806
807 /* Did somebody else fill it already? */
808 if (PageUptodate(page)) {
809 unlock_page(page);
810 goto page_ok;
811 }
812
813readpage:
814 /* Start the actual read. The read will unlock the page. */
815 error = mapping->a_ops->readpage(filp, page);
816
817 if (unlikely(error))
818 goto readpage_error;
819
	/*
	 * Not up to date yet: take the page lock and re-check.  If it is
	 * still not up to date the page was either removed from the mapping
	 * (look it up again) or the read is reported as -EIO.
	 */
820 if (!PageUptodate(page)) {
821 lock_page(page);
822 if (!PageUptodate(page)) {
823 if (page->mapping == NULL) {
824 /*
825 * invalidate_inode_pages got it
826 */
827 unlock_page(page);
828 page_cache_release(page);
829 goto find_page;
830 }
831 unlock_page(page);
832 error = -EIO;
833 goto readpage_error;
834 }
835 unlock_page(page);
836 }
837
838 /*
839 * i_size must be checked after we have done ->readpage.
840 *
841 * Checking i_size after the readpage allows us to calculate
842 * the correct value for "nr", which means the zero-filled
843 * part of the page is not copied back to userspace (unless
844 * another truncate extends the file - this is desired though).
845 */
846 isize = i_size_read(inode);
847 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
848 if (unlikely(!isize || index > end_index)) {
849 page_cache_release(page);
850 goto out;
851 }
852
853 /* nr is the maximum number of bytes to copy from this page */
854 nr = PAGE_CACHE_SIZE;
855 if (index == end_index) {
856 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
857 if (nr <= offset) {
858 page_cache_release(page);
859 goto out;
860 }
861 }
862 nr = nr - offset;
863 goto page_ok;
864
865readpage_error:
866 /* UHHUH! A synchronous read error occurred. Report it */
867 desc->error = error;
868 page_cache_release(page);
869 goto out;
870
871no_cached_page:
872 /*
873 * Ok, it wasn't cached, so we need to create a new
874 * page..
875 */
	/* cached_page is kept across retries and released at out: if unused. */
876 if (!cached_page) {
877 cached_page = page_cache_alloc_cold(mapping);
878 if (!cached_page) {
879 desc->error = -ENOMEM;
880 goto out;
881 }
882 }
883 error = add_to_page_cache_lru(cached_page, mapping,
884 index, GFP_KERNEL);
885 if (error) {
		/* -EEXIST: presumably a page appeared at this index meanwhile; look it up. */
886 if (error == -EEXIST)
887 goto find_page;
888 desc->error = error;
889 goto out;
890 }
891 page = cached_page;
892 cached_page = NULL;
893 goto readpage;
894 }
895
896out:
	/* Publish the updated readahead state and the new file position. */
897 *_ra = ra;
898
899 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
900 if (cached_page)
901 page_cache_release(cached_page);
	/* filp may be NULL (see the comment above this function). */
902 if (filp)
903 file_accessed(filp);
904}
905
906EXPORT_SYMBOL(do_generic_mapping_read);
907
908int file_read_actor(read_descriptor_t *desc, struct page *page,
909 unsigned long offset, unsigned long size)
910{
911 char *kaddr;
912 unsigned long left, count = desc->count;
913
914 if (size > count)
915 size = count;
916
917 /*
918 * Faults on the destination of a read are common, so do it before
919 * taking the kmap.
920 */
921 if (!fault_in_pages_writeable(desc->arg.buf, size)) {
922 kaddr = kmap_atomic(page, KM_USER0);
923 left = __copy_to_user_inatomic(desc->arg.buf,
924 kaddr + offset, size);
925 kunmap_atomic(kaddr, KM_USER0);
926 if (left == 0)
927 goto success;
928 }
929
930 /* Do it the slow way */
931 kaddr = kmap(page);
932 left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
933 kunmap(page);
934
935 if (left) {
936 size -= left;
937 desc->error = -EFAULT;
938 }
939success:
940 desc->count = count - size;
941 desc->written += size;
942 desc->arg.buf += size;
943 return size;
944}
945
946/*
947 * This is the "read()" routine for all filesystems
948 * that can use the page cache directly.
949 */
950ssize_t
951__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
952 unsigned long nr_segs, loff_t *ppos)
953{
954 struct file *filp = iocb->ki_filp;
955 ssize_t retval;
956 unsigned long seg;
957 size_t count;
958
959 count = 0;
960 for (seg = 0; seg < nr_segs; seg++) {
961 const struct iovec *iv = &iov[seg];
962
963 /*
964 * If any segment has a negative length, or the cumulative
965 * length ever wraps negative then return -EINVAL.
966 */
967 count += iv->iov_len;
968 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
969 return -EINVAL;
970 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
971 continue;
972 if (seg == 0)
973 return -EFAULT;
974 nr_segs = seg;
975 count -= iv->iov_len; /* This segment is no good */
976 break;
977 }
978
979 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
980 if (filp->f_flags & O_DIRECT) {
981 loff_t pos = *ppos, size;
982 struct address_space *mapping;
983 struct inode *inode;
984
985 mapping = filp->f_mapping;
986 inode = mapping->host;
987 retval = 0;
988 if (!count)
989 goto out; /* skip atime */
990 size = i_size_read(inode);
991 if (pos < size) {
992 retval = generic_file_direct_IO(READ, iocb,
993 iov, pos, nr_segs);
994 if (retval >= 0 && !is_sync_kiocb(iocb))
995 retval = -EIOCBQUEUED;
996 if (retval > 0)
997 *ppos = pos + retval;
998 }
999 file_accessed(filp);
1000 goto out;
1001 }
1002
1003 retval = 0;
1004 if (count) {
1005 for (seg = 0; seg < nr_segs; seg++) {
1006 read_descriptor_t desc;
1007
1008 desc.written = 0;
1009 desc.arg.buf = iov[seg].iov_base;
1010 desc.count = iov[seg].iov_len;
1011 if (desc.count == 0)
1012 continue;
1013 desc.error = 0;
1014 do_generic_file_read(filp,ppos,&desc,file_read_actor);
1015 retval += desc.written;
1016 if (!retval) {
1017 retval = desc.error;
1018 break;
1019 }
1020 }
1021 }
1022out:
1023 return retval;
1024}
1025
1026EXPORT_SYMBOL(__generic_file_aio_read);
1027
1028ssize_t
1029generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
1030{
1031 struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1032
1033 BUG_ON(iocb->ki_pos != pos);
1034 return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
1035}
1036
1037EXPORT_SYMBOL(generic_file_aio_read);
1038
1039ssize_t
1040generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1041{
1042 struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1043 struct kiocb kiocb;
1044 ssize_t ret;
1045
1046 init_sync_kiocb(&kiocb, filp);
1047 ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
1048 if (-EIOCBQUEUED == ret)
1049 ret = wait_on_sync_kiocb(&kiocb);
1050 return ret;
1051}
1052
1053EXPORT_SYMBOL(generic_file_read);
1054
1055int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1056{
1057 ssize_t written;
1058 unsigned long count = desc->count;
1059 struct file *file = desc->arg.data;
1060
1061 if (size > count)
1062 size = count;
1063
1064 written = file->f_op->sendpage(file, page, offset,
1065 size, &file->f_pos, size<count);
1066 if (written < 0) {
1067 desc->error = written;
1068 written = 0;
1069 }
1070 desc->count = count - written;
1071 desc->written += written;
1072 return written;
1073}
1074
1075ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
1076 size_t count, read_actor_t actor, void *target)
1077{
1078 read_descriptor_t desc;
1079
1080 if (!count)
1081 return 0;
1082
1083 desc.written = 0;
1084 desc.count = count;
1085 desc.arg.data = target;
1086 desc.error = 0;
1087
1088 do_generic_file_read(in_file, ppos, &desc, actor);
1089 if (desc.written)
1090 return desc.written;
1091 return desc.error;
1092}
1093
1094EXPORT_SYMBOL(generic_file_sendfile);
1095
1096static ssize_t
1097do_readahead(struct address_space *mapping, struct file *filp,
1098 unsigned long index, unsigned long nr)
1099{
1100 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1101 return -EINVAL;
1102
1103 force_page_cache_readahead(mapping, filp, index,
1104 max_sane_readahead(nr));
1105 return 0;
1106}
1107
1108asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1109{
1110 ssize_t ret;
1111 struct file *file;
1112
1113 ret = -EBADF;
1114 file = fget(fd);
1115 if (file) {
1116 if (file->f_mode & FMODE_READ) {
1117 struct address_space *mapping = file->f_mapping;
1118 unsigned long start = offset >> PAGE_CACHE_SHIFT;
1119 unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1120 unsigned long len = end - start + 1;
1121 ret = do_readahead(mapping, file, start, len);
1122 }
1123 fput(file);
1124 }
1125 return ret;
1126}
1127
1128#ifdef CONFIG_MMU
1129/*
1130 * This adds the requested page to the page cache if it isn't already there,
1131 * and schedules an I/O to read in its contents from disk.
1132 */
1133static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1134static int fastcall page_cache_read(struct file * file, unsigned long offset)
1135{
1136 struct address_space *mapping = file->f_mapping;
1137 struct page *page;
1138 int error;
1139
1140 page = page_cache_alloc_cold(mapping);
1141 if (!page)
1142 return -ENOMEM;
1143
1144 error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1145 if (!error) {
1146 error = mapping->a_ops->readpage(file, page);
1147 page_cache_release(page);
1148 return error;
1149 }
1150
1151 /*
1152 * We arrive here in the unlikely event that someone
1153 * raced with us and added our page to the cache first
1154 * or we are out of memory for radix-tree nodes.
1155 */
1156 page_cache_release(page);
1157 return error == -EEXIST ? 0 : error;
1158}
1159
/*
 * Margin by which ra->mmap_miss may exceed ra->mmap_hit before
 * filemap_nopage() stops doing read-around for the file.
 */
1160#define MMAP_LOTSAMISS (100)
1161
1162/*
1163 * filemap_nopage() is invoked via the vma operations vector for a
1164 * mapped memory region to read in file data during a page fault.
1165 *
1166 * The goto's are kind of ugly, but this streamlines the normal case of having
1167 * it in the page cache, and handles the special cases reasonably without
1168 * having a lot of duplicated code.
1169 */
1170struct page *filemap_nopage(struct vm_area_struct *area,
1171 unsigned long address, int *type)
1172{
1173 int error;
1174 struct file *file = area->vm_file;
1175 struct address_space *mapping = file->f_mapping;
1176 struct file_ra_state *ra = &file->f_ra;
1177 struct inode *inode = mapping->host;
1178 struct page *page;
1179 unsigned long size, pgoff;
	/*
	 * did_readaround: set once read-around has been issued for this fault.
	 * majmin: value stored through *type on success; starts out as minor.
	 */
1180 int did_readaround = 0, majmin = VM_FAULT_MINOR;
1181
	/* Translate the faulting address into a page offset within the file. */
1182 pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1183
1184retry_all:
	/* size is the file length in pages, rounded up. */
1185 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1186 if (pgoff >= size)
1187 goto outside_data_content;
1188
1189 /* If we don't want any read-ahead, don't bother */
1190 if (VM_RandomReadHint(area))
1191 goto no_cached_page;
1192
1193 /*
1194 * The readahead code wants to be told about each and every page
1195 * so it can build and shrink its windows appropriately
1196 *
1197 * For sequential accesses, we use the generic readahead logic.
1198 */
1199 if (VM_SequentialReadHint(area))
1200 page_cache_readahead(mapping, ra, file, pgoff, 1);
1201
1202 /*
1203 * Do we have something in the page cache already?
1204 */
1205retry_find:
1206 page = find_get_page(mapping, pgoff);
1207 if (!page) {
1208 unsigned long ra_pages;
1209
1210 if (VM_SequentialReadHint(area)) {
1211 handle_ra_miss(mapping, ra, pgoff);
1212 goto no_cached_page;
1213 }
1214 ra->mmap_miss++;
1215
1216 /*
1217 * Do we miss much more than hit in this file? If so,
1218 * stop bothering with read-ahead. It will only hurt.
1219 */
1220 if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
1221 goto no_cached_page;
1222
1223 /*
1224 * To keep the pgmajfault counter straight, we need to
1225 * check did_readaround, as this is an inner loop.
1226 */
1227 if (!did_readaround) {
1228 majmin = VM_FAULT_MAJOR;
1229 inc_page_state(pgmajfault);
1230 }
1231 did_readaround = 1;
1232 ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1233 if (ra_pages) {
1234 pgoff_t start = 0;
1235
		/*
		 * Read around the fault: a window of ra_pages pages that
		 * starts ra_pages/2 before pgoff, clamped at page 0.
		 */
1236 if (pgoff > ra_pages / 2)
1237 start = pgoff - ra_pages / 2;
1238 do_page_cache_readahead(mapping, file, start, ra_pages);
1239 }
1240 page = find_get_page(mapping, pgoff);
1241 if (!page)
1242 goto no_cached_page;
1243 }
1244
	/* Only a lookup satisfied without any read-around counts as a hit. */
1245 if (!did_readaround)
1246 ra->mmap_hit++;
1247
1248 /*
1249 * Ok, found a page in the page cache, now we need to check
1250 * that it's up-to-date.
1251 */
1252 if (!PageUptodate(page))
1253 goto page_not_uptodate;
1254
1255success:
1256 /*
1257 * Found the page and have a reference on it.
1258 */
1259 mark_page_accessed(page);
	/* type is optional; when given it receives VM_FAULT_MINOR or _MAJOR. */
1260 if (type)
1261 *type = majmin;
1262 return page;
1263
1264outside_data_content:
1265 /*
1266 * An external ptracer can access pages that normally aren't
1267 * accessible..
1268 */
	/* A fault beyond EOF from the mm's own task fails with NULL. */
1269 if (area->vm_mm == current->mm)
1270 return NULL;
1271 /* Fall through to the non-read-ahead case */
1272no_cached_page:
1273 /*
1274 * We're only likely to ever get here if MADV_RANDOM is in
1275 * effect.
1276 */
1277 error = page_cache_read(file, pgoff);
1278 grab_swap_token();
1279
1280 /*
1281 * The page we want has now been added to the page cache.
1282 * In the unlikely event that someone removed it in the
1283 * meantime, we'll just come back here and read it again.
1284 */
1285 if (error >= 0)
1286 goto retry_find;
1287
1288 /*
1289 * An error return from page_cache_read can result if the
1290 * system is low on memory, or a problem occurs while trying
1291 * to schedule I/O.
1292 */
	/* Only -ENOMEM is reported as OOM; any other error returns NULL. */
1293 if (error == -ENOMEM)
1294 return NOPAGE_OOM;
1295 return NULL;
1296
1297page_not_uptodate:
	/* Having to wait for the page makes this a major fault (counted once). */
1298 if (!did_readaround) {
1299 majmin = VM_FAULT_MAJOR;
1300 inc_page_state(pgmajfault);
1301 }
1302 lock_page(page);
1303
1304 /* Did it get unhashed while we waited for it? */
1305 if (!page->mapping) {
1306 unlock_page(page);
1307 page_cache_release(page);
1308 goto retry_all;
1309 }
1310
1311 /* Did somebody else get it up-to-date? */
1312 if (PageUptodate(page)) {
1313 unlock_page(page);
1314 goto success;
1315 }
1316
	/*
	 * First read attempt.  A zero return from ->readpage() means the
	 * read was started; wait for the page lock to be released, then
	 * check the result.
	 */
1317 if (!mapping->a_ops->readpage(file, page)) {
1318 wait_on_page_locked(page);
1319 if (PageUptodate(page))
1320 goto success;
1321 }
1322
1323 /*
1324 * Umm, take care of errors if the page isn't up-to-date.
1325 * Try to re-read it _once_. We do this synchronously,
1326 * because there really aren't any performance issues here
1327 * and we need to check for errors.
1328 */
1329 lock_page(page);
1330
1331 /* Somebody truncated the page on us? */
1332 if (!page->mapping) {
1333 unlock_page(page);
1334 page_cache_release(page);
1335 goto retry_all;
1336 }
1337
1338 /* Somebody else successfully read it in? */
1339 if (PageUptodate(page)) {
1340 unlock_page(page);
1341 goto success;
1342 }
	/* Second and last read attempt, with the error flag cleared first. */
1343 ClearPageError(page);
1344 if (!mapping->a_ops->readpage(file, page)) {
1345 wait_on_page_locked(page);
1346 if (PageUptodate(page))
1347 goto success;
1348 }
1349
1350 /*
1351 * Things didn't work out. Return zero to tell the
1352 * mm layer so, possibly freeing the page cache page first.
1353 */
1354 page_cache_release(page);
1355 return NULL;
1356}
1357
1358EXPORT_SYMBOL(filemap_nopage);
1359
/*
 * Return the up-to-date pagecache page at @pgoff of @file with a reference
 * held, or NULL on failure.
 *
 * When @nonblock is set and the page is not in the page cache, NULL is
 * returned without any attempt to read the page in.
 * NOTE(review): @nonblock is only consulted for the not-cached case; a page
 * that is cached but not up to date still goes through lock_page() and
 * ->readpage() below.
 */
1360static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
1361 int nonblock)
1362{
1363 struct address_space *mapping = file->f_mapping;
1364 struct page *page;
1365 int error;
1366
1367 /*
1368 * Do we have something in the page cache already?
1369 */
1370retry_find:
1371 page = find_get_page(mapping, pgoff);
1372 if (!page) {
1373 if (nonblock)
1374 return NULL;
1375 goto no_cached_page;
1376 }
1377
1378 /*
1379 * Ok, found a page in the page cache, now we need to check
1380 * that it's up-to-date.
1381 */
1382 if (!PageUptodate(page))
1383 goto page_not_uptodate;
1384
1385success:
1386 /*
1387 * Found the page and have a reference on it.
1388 */
1389 mark_page_accessed(page);
1390 return page;
1391
1392no_cached_page:
1393 error = page_cache_read(file, pgoff);
1394
1395 /*
1396 * The page we want has now been added to the page cache.
1397 * In the unlikely event that someone removed it in the
1398 * meantime, we'll just come back here and read it again.
1399 */
1400 if (error >= 0)
1401 goto retry_find;
1402
1403 /*
1404 * An error return from page_cache_read can result if the
1405 * system is low on memory, or a problem occurs while trying
1406 * to schedule I/O.
1407 */
	/* Every error from page_cache_read(), -ENOMEM included, becomes NULL. */
1408 return NULL;
1409
1410page_not_uptodate:
1411 lock_page(page);
1412
1413 /* Did it get unhashed while we waited for it? */
	/* No retry here: the reference is dropped at err: and NULL returned. */
1414 if (!page->mapping) {
1415 unlock_page(page);
1416 goto err;
1417 }
1418
1419 /* Did somebody else get it up-to-date? */
1420 if (PageUptodate(page)) {
1421 unlock_page(page);
1422 goto success;
1423 }
1424
	/*
	 * First read attempt.  A zero return from ->readpage() means the
	 * read was started; wait for the page lock to be released, then
	 * check the result.
	 */
1425 if (!mapping->a_ops->readpage(file, page)) {
1426 wait_on_page_locked(page);
1427 if (PageUptodate(page))
1428 goto success;
1429 }
1430
1431 /*
1432 * Umm, take care of errors if the page isn't up-to-date.
1433 * Try to re-read it _once_. We do this synchronously,
1434 * because there really aren't any performance issues here
1435 * and we need to check for errors.
1436 */
1437 lock_page(page);
1438
1439 /* Somebody truncated the page on us? */
1440 if (!page->mapping) {
1441 unlock_page(page);
1442 goto err;
1443 }
1444 /* Somebody else successfully read it in? */
1445 if (PageUptodate(page)) {
1446 unlock_page(page);
1447 goto success;
1448 }
1449
	/* Second and last read attempt, with the error flag cleared first. */
1450 ClearPageError(page);
1451 if (!mapping->a_ops->readpage(file, page)) {
1452 wait_on_page_locked(page);
1453 if (PageUptodate(page))
1454 goto success;
1455 }
1456
1457 /*
1458 * Things didn't work out. Return zero to tell the
1459 * mm layer so, possibly freeing the page cache page first.
1460 */
1461err:
1462 page_cache_release(page);
1463
1464 return NULL;
1465}
1466
1467int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
1468 unsigned long len, pgprot_t prot, unsigned long pgoff,
1469 int nonblock)
1470{
1471 struct file *file = vma->vm_file;
1472 struct address_space *mapping = file->f_mapping;
1473 struct inode *inode = mapping->host;
1474 unsigned long size;
1475 struct mm_struct *mm = vma->vm_mm;
1476 struct page *page;
1477 int err;
1478
1479 if (!nonblock)
1480 force_page_cache_readahead(mapping, vma->vm_file,
1481 pgoff, len >> PAGE_CACHE_SHIFT);
1482
1483repeat:
1484 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1485 if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
1486 return -EINVAL;
1487
1488 page = filemap_getpage(file, pgoff, nonblock);
1489 if (!page && !nonblock)
1490 return -ENOMEM;
1491 if (page) {
1492 err = install_page(mm, vma, addr, page, prot);
1493 if (err) {
1494 page_cache_release(page);
1495 return err;
1496 }
1497 } else {
1498 err = install_file_pte(mm, vma, addr, pgoff, prot);
1499 if (err)
1500 return err;
1501 }
1502
1503 len -= PAGE_SIZE;
1504 addr += PAGE_SIZE;
1505 pgoff++;
1506 if (len)
1507 goto repeat;
1508
1509 return 0;
1510}
1511
1512struct vm_operations_struct generic_file_vm_ops = {
1513 .nopage = filemap_nopage,
1514 .populate = filemap_populate,
1515};
1516
1517/* This is used for a general mmap of a disk file */
1518
1519int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1520{
1521 struct address_space *mapping = file->f_mapping;
1522
1523 if (!mapping->a_ops->readpage)
1524 return -ENOEXEC;
1525 file_accessed(file);
1526 vma->vm_ops = &generic_file_vm_ops;
1527 return 0;
1528}
1529EXPORT_SYMBOL(filemap_populate);
1530
1531/*
1532 * This is for filesystems which do not implement ->writepage.
1533 */
1534int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1535{
1536 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1537 return -EINVAL;
1538 return generic_file_mmap(file, vma);
1539}
1540#else
/* Stub for kernels built without CONFIG_MMU: always fails with -ENOSYS. */
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
/* Stub for kernels built without CONFIG_MMU: always fails with -ENOSYS. */
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
1549#endif /* CONFIG_MMU */
1550
/*
 * These exports sit outside the CONFIG_MMU conditional, so they cover both
 * the real implementations and the -ENOSYS stubs.
 */
1551EXPORT_SYMBOL(generic_file_mmap);
1552EXPORT_SYMBOL(generic_file_readonly_mmap);
1553
1554static inline struct page *__read_cache_page(struct address_space *mapping,
1555 unsigned long index,
1556 int (*filler)(void *,struct page*),
1557 void *data)
1558{
1559 struct page *page, *cached_page = NULL;
1560 int err;
1561repeat:
1562 page = find_get_page(mapping, index);
1563 if (!page) {
1564 if (!cached_page) {
1565 cached_page = page_cache_alloc_cold(mapping);
1566 if (!cached_page)
1567 return ERR_PTR(-ENOMEM);
1568 }
1569 err = add_to_page_cache_lru(cached_page, mapping,
1570 index, GFP_KERNEL);
1571 if (err == -EEXIST)
1572 goto repeat;
1573 if (err < 0) {
1574 /* Presumably ENOMEM for radix tree node */
1575 page_cache_release(cached_page);
1576 return ERR_PTR(err);
1577 }
1578 page = cached_page;
1579 cached_page = NULL;
1580 err = filler(data, page);
1581 if (err < 0) {
1582 page_cache_release(page);
1583 page = ERR_PTR(err);
1584 }
1585 }
1586 if (cached_page)
1587 page_cache_release(cached_page);
1588 return page;
1589}
1590
1591/*
1592 * Read into the page cache. If a page already exists,
1593 * and PageUptodate() is not set, try to fill the page.
1594 */
1595struct page *read_cache_page(struct address_space *mapping,
1596 unsigned long index,
1597 int (*filler)(void *,struct page*),
1598 void *data)
1599{
1600 struct page *page;
1601 int err;
1602
1603retry:
1604 page = __read_cache_page(mapping, index, filler, data);
1605 if (IS_ERR(page))
1606 goto out;
1607 mark_page_accessed(page);
1608 if (PageUptodate(page))
1609 goto out;
1610
1611 lock_page(page);
1612 if (!page->mapping) {
1613 unlock_page(page);
1614 page_cache_release(page);
1615 goto retry;
1616 }
1617 if (PageUptodate(page)) {
1618 unlock_page(page);
1619 goto out;
1620 }
1621 err = filler(data, page);
1622 if (err < 0) {
1623 page_cache_release(page);
1624 page = ERR_PTR(err);
1625 }
1626 out:
1627 return page;
1628}
1629
1630EXPORT_SYMBOL(read_cache_page);
1631
1632/*
1633 * If the page was newly created, increment its refcount and add it to the
1634 * caller's lru-buffering pagevec. This function is specifically for
1635 * generic_file_write().
1636 */
1637static inline struct page *
1638__grab_cache_page(struct address_space *mapping, unsigned long index,
1639 struct page **cached_page, struct pagevec *lru_pvec)
1640{
1641 int err;
1642 struct page *page;
1643repeat:
1644 page = find_lock_page(mapping, index);
1645 if (!page) {
1646 if (!*cached_page) {
1647 *cached_page = page_cache_alloc(mapping);
1648 if (!*cached_page)
1649 return NULL;
1650 }
1651 err = add_to_page_cache(*cached_page, mapping,
1652 index, GFP_KERNEL);
1653 if (err == -EEXIST)
1654 goto repeat;
1655 if (err == 0) {
1656 page = *cached_page;
1657 page_cache_get(page);
1658 if (!pagevec_add(lru_pvec, page))
1659 __pagevec_lru_add(lru_pvec);
1660 *cached_page = NULL;
1661 }
1662 }
1663 return page;
1664}
1665
1666/*
1667 * The logic we want is
1668 *
1669 * if suid or (sgid and xgrp)
1670 * remove privs
1671 */
1672int remove_suid(struct dentry *dentry)
1673{
1674 mode_t mode = dentry->d_inode->i_mode;
1675 int kill = 0;
1676 int result = 0;
1677
1678 /* suid always must be killed */
1679 if (unlikely(mode & S_ISUID))
1680 kill = ATTR_KILL_SUID;
1681
1682 /*
1683 * sgid without any exec bits is just a mandatory locking mark; leave
1684 * it alone. If some exec bits are set, it's a real sgid; kill it.
1685 */
1686 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1687 kill |= ATTR_KILL_SGID;
1688
1689 if (unlikely(kill && !capable(CAP_FSETID))) {
1690 struct iattr newattrs;
1691
1692 newattrs.ia_valid = ATTR_FORCE | kill;
1693 result = notify_change(dentry, &newattrs);
1694 }
1695 return result;
1696}
1697EXPORT_SYMBOL(remove_suid);
1698
1699/*
1700 * Copy as much as we can into the page and return the number of bytes which
1701 * were sucessfully copied. If a fault is encountered then clear the page
1702 * out to (offset+bytes) and return the number of bytes which were copied.
1703 */
1704static inline size_t
1705filemap_copy_from_user(struct page *page, unsigned long offset,
1706 const char __user *buf, unsigned bytes)
1707{
1708 char *kaddr;
1709 int left;
1710
1711 kaddr = kmap_atomic(page, KM_USER0);
1712 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
1713 kunmap_atomic(kaddr, KM_USER0);
1714
1715 if (left != 0) {
1716 /* Do it the slow way */
1717 kaddr = kmap(page);
1718 left = __copy_from_user(kaddr + offset, buf, bytes);
1719 kunmap(page);
1720 }
1721 return bytes - left;
1722}
1723
1724static size_t
1725__filemap_copy_from_user_iovec(char *vaddr,
1726 const struct iovec *iov, size_t base, size_t bytes)
1727{
1728 size_t copied = 0, left = 0;
1729
1730 while (bytes) {
1731 char __user *buf = iov->iov_base + base;
1732 int copy = min(bytes, iov->iov_len - base);
1733
1734 base = 0;
1735 left = __copy_from_user_inatomic(vaddr, buf, copy);
1736 copied += copy;
1737 bytes -= copy;
1738 vaddr += copy;
1739 iov++;
1740
1741 if (unlikely(left)) {
1742 /* zero the rest of the target like __copy_from_user */
1743 if (bytes)
1744 memset(vaddr, 0, bytes);
1745 break;
1746 }
1747 }
1748 return copied - left;
1749}
1750
1751/*
1752 * This has the same sideeffects and return value as filemap_copy_from_user().
1753 * The difference is that on a fault we need to memset the remainder of the
1754 * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
1755 * single-segment behaviour.
1756 */
1757static inline size_t
1758filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
1759 const struct iovec *iov, size_t base, size_t bytes)
1760{
1761 char *kaddr;
1762 size_t copied;
1763
1764 kaddr = kmap_atomic(page, KM_USER0);
1765 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
1766 base, bytes);
1767 kunmap_atomic(kaddr, KM_USER0);
1768 if (copied != bytes) {
1769 kaddr = kmap(page);
1770 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
1771 base, bytes);
1772 kunmap(page);
1773 }
1774 return copied;
1775}
1776
1777static inline void
1778filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1779{
1780 const struct iovec *iov = *iovp;
1781 size_t base = *basep;
1782
1783 while (bytes) {
1784 int copy = min(bytes, iov->iov_len - base);
1785
1786 bytes -= copy;
1787 base += copy;
1788 if (iov->iov_len == base) {
1789 iov++;
1790 base = 0;
1791 }
1792 }
1793 *iovp = iov;
1794 *basep = base;
1795}
1796
1797/*
1798 * Performs necessary checks before doing a write
1799 *
1800 * Can adjust writing position or amount of bytes to write.
1801 * Returns appropriate error code that caller should return or
1802 * zero in case that write should be allowed.
 *
 * @file:  file being written
 * @pos:   in/out write position; moved to i_size for O_APPEND on non-block
 *         files
 * @count: in/out byte count; may be shortened to fit the applicable limit
 * @isblk: non-zero when the target is a block device
 *
 * Errors returned: -EINVAL for a negative position, a pending file->f_error
 * (which is cleared), -EFBIG (with SIGXFSZ sent to current) when a size
 * limit is already reached, -EPERM for a read-only block device and -ENOSPC
 * when writing at or past the end of a block device.
1803 */
1804inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1805{
1806 struct inode *inode = file->f_mapping->host;
1807 unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1808
1809 if (unlikely(*pos < 0))
1810 return -EINVAL;
1811
 /* Report (and consume) an error recorded earlier on this file */
1812 if (unlikely(file->f_error)) {
1813 int err = file->f_error;
1814 file->f_error = 0;
1815 return err;
1816 }
1817
1818 if (!isblk) {
1819 /* FIXME: this is for backwards compatibility with 2.4 */
1820 if (file->f_flags & O_APPEND)
1821 *pos = i_size_read(inode);
1822
 /* RLIMIT_FSIZE: fail at the limit, otherwise shorten the write */
1823 if (limit != RLIM_INFINITY) {
1824 if (*pos >= limit) {
1825 send_sig(SIGXFSZ, current, 0);
1826 return -EFBIG;
1827 }
1828 if (*count > limit - (typeof(limit))*pos) {
1829 *count = limit - (typeof(limit))*pos;
1830 }
1831 }
1832 }
1833
1834 /*
1835 * LFS rule: without O_LARGEFILE the write may not go past MAX_NON_LFS
1836 */
1837 if (unlikely(*pos + *count > MAX_NON_LFS &&
1838 !(file->f_flags & O_LARGEFILE))) {
1839 if (*pos >= MAX_NON_LFS) {
1840 send_sig(SIGXFSZ, current, 0);
1841 return -EFBIG;
1842 }
1843 if (*count > MAX_NON_LFS - (unsigned long)*pos) {
1844 *count = MAX_NON_LFS - (unsigned long)*pos;
1845 }
1846 }
1847
1848 /*
1849 * Are we about to exceed the fs block limit ?
1850 *
1851 * If we have written data it becomes a short write. If we have
1852 * exceeded without writing data we send a signal and return EFBIG.
1853 * Linus frestrict idea will clean these up nicely..
1854 */
1855 if (likely(!isblk)) {
1856 if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
1857 if (*count || *pos > inode->i_sb->s_maxbytes) {
1858 send_sig(SIGXFSZ, current, 0);
1859 return -EFBIG;
1860 }
1861 /* zero-length writes at ->s_maxbytes are OK */
1862 }
1863
1864 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
1865 *count = inode->i_sb->s_maxbytes - *pos;
1866 } else {
 /* Block device: the device size (i_size) is the hard limit */
1867 loff_t isize;
1868 if (bdev_read_only(I_BDEV(inode)))
1869 return -EPERM;
1870 isize = i_size_read(inode);
1871 if (*pos >= isize) {
 /* only a zero-length write exactly at the end is allowed */
1872 if (*count || *pos > isize)
1873 return -ENOSPC;
1874 }
1875
1876 if (*pos + *count > isize)
1877 *count = isize - *pos;
1878 }
1879 return 0;
1880}
1881EXPORT_SYMBOL(generic_write_checks);
1882
1883ssize_t
1884generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1885 unsigned long *nr_segs, loff_t pos, loff_t *ppos,
1886 size_t count, size_t ocount)
1887{
1888 struct file *file = iocb->ki_filp;
1889 struct address_space *mapping = file->f_mapping;
1890 struct inode *inode = mapping->host;
1891 ssize_t written;
1892
1893 if (count != ocount)
1894 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
1895
1896 written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
1897 if (written > 0) {
1898 loff_t end = pos + written;
1899 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
1900 i_size_write(inode, end);
1901 mark_inode_dirty(inode);
1902 }
1903 *ppos = end;
1904 }
1905
1906 /*
1907 * Sync the fs metadata but not the minor inode changes and
1908 * of course not the data as we did direct DMA for the IO.
1909 * i_sem is held, which protects generic_osync_inode() from
1910 * livelocking.
1911 */
1912 if (written >= 0 && file->f_flags & O_SYNC)
1913 generic_osync_inode(inode, mapping, OSYNC_METADATA);
1914 if (written == count && !is_sync_kiocb(iocb))
1915 written = -EIOCBQUEUED;
1916 return written;
1917}
1918EXPORT_SYMBOL(generic_file_direct_write);
1919
1920ssize_t
1921generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1922 unsigned long nr_segs, loff_t pos, loff_t *ppos,
1923 size_t count, ssize_t written)
1924{
1925 struct file *file = iocb->ki_filp;
1926 struct address_space * mapping = file->f_mapping;
1927 struct address_space_operations *a_ops = mapping->a_ops;
1928 struct inode *inode = mapping->host;
1929 long status = 0;
1930 struct page *page;
1931 struct page *cached_page = NULL;
1932 size_t bytes;
1933 struct pagevec lru_pvec;
1934 const struct iovec *cur_iov = iov; /* current iovec */
1935 size_t iov_base = 0; /* offset in the current iovec */
1936 char __user *buf;
1937
1938 pagevec_init(&lru_pvec, 0);
1939
1940 /*
1941 * handle partial DIO write. Adjust cur_iov if needed.
1942 */
1943 if (likely(nr_segs == 1))
1944 buf = iov->iov_base + written;
1945 else {
1946 filemap_set_next_iovec(&cur_iov, &iov_base, written);
1947 buf = iov->iov_base + iov_base;
1948 }
1949
1950 do {
1951 unsigned long index;
1952 unsigned long offset;
1953 size_t copied;
1954
1955 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1956 index = pos >> PAGE_CACHE_SHIFT;
1957 bytes = PAGE_CACHE_SIZE - offset;
1958 if (bytes > count)
1959 bytes = count;
1960
1961 /*
1962 * Bring in the user page that we will copy from _first_.
1963 * Otherwise there's a nasty deadlock on copying from the
1964 * same page as we're writing to, without it being marked
1965 * up-to-date.
1966 */
1967 fault_in_pages_readable(buf, bytes);
1968
1969 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
1970 if (!page) {
1971 status = -ENOMEM;
1972 break;
1973 }
1974
1975 status = a_ops->prepare_write(file, page, offset, offset+bytes);
1976 if (unlikely(status)) {
1977 loff_t isize = i_size_read(inode);
1978 /*
1979 * prepare_write() may have instantiated a few blocks
1980 * outside i_size. Trim these off again.
1981 */
1982 unlock_page(page);
1983 page_cache_release(page);
1984 if (pos + bytes > isize)
1985 vmtruncate(inode, isize);
1986 break;
1987 }
1988 if (likely(nr_segs == 1))
1989 copied = filemap_copy_from_user(page, offset,
1990 buf, bytes);
1991 else
1992 copied = filemap_copy_from_user_iovec(page, offset,
1993 cur_iov, iov_base, bytes);
1994 flush_dcache_page(page);
1995 status = a_ops->commit_write(file, page, offset, offset+bytes);
1996 if (likely(copied > 0)) {
1997 if (!status)
1998 status = copied;
1999
2000 if (status >= 0) {
2001 written += status;
2002 count -= status;
2003 pos += status;
2004 buf += status;
2005 if (unlikely(nr_segs > 1))
2006 filemap_set_next_iovec(&cur_iov,
2007 &iov_base, status);
2008 }
2009 }
2010 if (unlikely(copied != bytes))
2011 if (status >= 0)
2012 status = -EFAULT;
2013 unlock_page(page);
2014 mark_page_accessed(page);
2015 page_cache_release(page);
2016 if (status < 0)
2017 break;
2018 balance_dirty_pages_ratelimited(mapping);
2019 cond_resched();
2020 } while (count);
2021 *ppos = pos;
2022
2023 if (cached_page)
2024 page_cache_release(cached_page);
2025
2026 /*
2027 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
2028 */
2029 if (likely(status >= 0)) {
2030 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2031 if (!a_ops->writepage || !is_sync_kiocb(iocb))
2032 status = generic_osync_inode(inode, mapping,
2033 OSYNC_METADATA|OSYNC_DATA);
2034 }
2035 }
2036
2037 /*
2038 * If we get here for O_DIRECT writes then we must have fallen through
2039 * to buffered writes (block instantiation inside i_size). So we sync
2040 * the file data here, to try to honour O_DIRECT expectations.
2041 */
2042 if (unlikely(file->f_flags & O_DIRECT) && written)
2043 status = filemap_write_and_wait(mapping);
2044
2045 pagevec_lru_add(&lru_pvec);
2046 return written ? written : status;
2047}
2048EXPORT_SYMBOL(generic_file_buffered_write);
2049
/*
 * Core of the generic write path: validate the iovec, run the generic
 * pre-write checks and then write through direct I/O and/or the page cache.
 *
 * @iocb:    kiocb describing the request; the target file is iocb->ki_filp
 * @iov:     user segments to write
 * @nr_segs: number of entries in @iov
 * @ppos:    in/out file position
 *
 * Returns the number of bytes written if non-zero (or the negative result of
 * the direct write), otherwise the error from the checks (0 if none).
 * The "_nolock" name suggests the caller provides the locking; the callers
 * visible in this file take inode->i_sem around the call.
 */
2050ssize_t
2051__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2052 unsigned long nr_segs, loff_t *ppos)
2053{
2054 struct file *file = iocb->ki_filp;
2055 struct address_space * mapping = file->f_mapping;
2056 size_t ocount; /* original count */
2057 size_t count; /* after file limit checks */
2058 struct inode *inode = mapping->host;
2059 unsigned long seg;
2060 loff_t pos;
2061 ssize_t written;
2062 ssize_t err;
2063
 /* Add up the segment lengths and verify each segment is readable */
2064 ocount = 0;
2065 for (seg = 0; seg < nr_segs; seg++) {
2066 const struct iovec *iv = &iov[seg];
2067
2068 /*
2069 * If any segment has a negative length, or the cumulative
2070 * length ever wraps negative then return -EINVAL.
2071 */
2072 ocount += iv->iov_len;
2073 if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
2074 return -EINVAL;
2075 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
2076 continue;
 /* A bad first segment fails the write; a later one truncates it */
2077 if (seg == 0)
2078 return -EFAULT;
2079 nr_segs = seg;
2080 ocount -= iv->iov_len; /* This segment is no good */
2081 break;
2082 }
2083
2084 count = ocount;
2085 pos = *ppos;
2086
2087 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2088
2089 /* We can write back this queue in page reclaim */
2090 current->backing_dev_info = mapping->backing_dev_info;
2091 written = 0;
2092
 /* May adjust pos and shorten count */
2093 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2094 if (err)
2095 goto out;
2096
2097 if (count == 0)
2098 goto out;
2099
2100 err = remove_suid(file->f_dentry);
2101 if (err)
2102 goto out;
2103
2104 inode_update_time(inode, 1);
2105
2106 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2107 if (unlikely(file->f_flags & O_DIRECT)) {
2108 written = generic_file_direct_write(iocb, iov,
2109 &nr_segs, pos, ppos, count, ocount);
2110 if (written < 0 || written == count)
2111 goto out;
2112 /*
2113 * direct-io write to a hole: fall through to buffered I/O
2114 * for completing the rest of the request.
2115 */
2116 pos += written;
2117 count -= written;
2118 }
2119
 /* 'written' tells the buffered path how much of iov is already done */
2120 written = generic_file_buffered_write(iocb, iov, nr_segs,
2121 pos, ppos, count, written);
2122out:
2123 current->backing_dev_info = NULL;
2124 return written ? written : err;
2125}
2126EXPORT_SYMBOL(generic_file_aio_write_nolock);
2127
2128ssize_t
2129generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2130 unsigned long nr_segs, loff_t *ppos)
2131{
2132 struct file *file = iocb->ki_filp;
2133 struct address_space *mapping = file->f_mapping;
2134 struct inode *inode = mapping->host;
2135 ssize_t ret;
2136 loff_t pos = *ppos;
2137
2138 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
2139
2140 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2141 int err;
2142
2143 err = sync_page_range_nolock(inode, mapping, pos, ret);
2144 if (err < 0)
2145 ret = err;
2146 }
2147 return ret;
2148}
2149
2150ssize_t
2151__generic_file_write_nolock(struct file *file, const struct iovec *iov,
2152 unsigned long nr_segs, loff_t *ppos)
2153{
2154 struct kiocb kiocb;
2155 ssize_t ret;
2156
2157 init_sync_kiocb(&kiocb, file);
2158 ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2159 if (ret == -EIOCBQUEUED)
2160 ret = wait_on_sync_kiocb(&kiocb);
2161 return ret;
2162}
2163
2164ssize_t
2165generic_file_write_nolock(struct file *file, const struct iovec *iov,
2166 unsigned long nr_segs, loff_t *ppos)
2167{
2168 struct kiocb kiocb;
2169 ssize_t ret;
2170
2171 init_sync_kiocb(&kiocb, file);
2172 ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2173 if (-EIOCBQUEUED == ret)
2174 ret = wait_on_sync_kiocb(&kiocb);
2175 return ret;
2176}
2177EXPORT_SYMBOL(generic_file_write_nolock);
2178
2179ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2180 size_t count, loff_t pos)
2181{
2182 struct file *file = iocb->ki_filp;
2183 struct address_space *mapping = file->f_mapping;
2184 struct inode *inode = mapping->host;
2185 ssize_t ret;
2186 struct iovec local_iov = { .iov_base = (void __user *)buf,
2187 .iov_len = count };
2188
2189 BUG_ON(iocb->ki_pos != pos);
2190
2191 down(&inode->i_sem);
2192 ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
2193 &iocb->ki_pos);
2194 up(&inode->i_sem);
2195
2196 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2197 ssize_t err;
2198
2199 err = sync_page_range(inode, mapping, pos, ret);
2200 if (err < 0)
2201 ret = err;
2202 }
2203 return ret;
2204}
2205EXPORT_SYMBOL(generic_file_aio_write);
2206
2207ssize_t generic_file_write(struct file *file, const char __user *buf,
2208 size_t count, loff_t *ppos)
2209{
2210 struct address_space *mapping = file->f_mapping;
2211 struct inode *inode = mapping->host;
2212 ssize_t ret;
2213 struct iovec local_iov = { .iov_base = (void __user *)buf,
2214 .iov_len = count };
2215
2216 down(&inode->i_sem);
2217 ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
2218 up(&inode->i_sem);
2219
2220 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2221 ssize_t err;
2222
2223 err = sync_page_range(inode, mapping, *ppos - ret, ret);
2224 if (err < 0)
2225 ret = err;
2226 }
2227 return ret;
2228}
2229EXPORT_SYMBOL(generic_file_write);
2230
2231ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
2232 unsigned long nr_segs, loff_t *ppos)
2233{
2234 struct kiocb kiocb;
2235 ssize_t ret;
2236
2237 init_sync_kiocb(&kiocb, filp);
2238 ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
2239 if (-EIOCBQUEUED == ret)
2240 ret = wait_on_sync_kiocb(&kiocb);
2241 return ret;
2242}
2243EXPORT_SYMBOL(generic_file_readv);
2244
2245ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2246 unsigned long nr_segs, loff_t *ppos)
2247{
2248 struct address_space *mapping = file->f_mapping;
2249 struct inode *inode = mapping->host;
2250 ssize_t ret;
2251
2252 down(&inode->i_sem);
2253 ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
2254 up(&inode->i_sem);
2255
2256 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2257 int err;
2258
2259 err = sync_page_range(inode, mapping, *ppos - ret, ret);
2260 if (err < 0)
2261 ret = err;
2262 }
2263 return ret;
2264}
2265EXPORT_SYMBOL(generic_file_writev);
2266
2267/*
2268 * Called under i_sem for writes to S_ISREG files. Returns -EIO if something
2269 * went wrong during pagecache shootdown.
2270 */
2271ssize_t
2272generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2273 loff_t offset, unsigned long nr_segs)
2274{
2275 struct file *file = iocb->ki_filp;
2276 struct address_space *mapping = file->f_mapping;
2277 ssize_t retval;
2278 size_t write_len = 0;
2279
2280 /*
2281 * If it's a write, unmap all mmappings of the file up-front. This
2282 * will cause any pte dirty bits to be propagated into the pageframes
2283 * for the subsequent filemap_write_and_wait().
2284 */
2285 if (rw == WRITE) {
2286 write_len = iov_length(iov, nr_segs);
2287 if (mapping_mapped(mapping))
2288 unmap_mapping_range(mapping, offset, write_len, 0);
2289 }
2290
2291 retval = filemap_write_and_wait(mapping);
2292 if (retval == 0) {
2293 retval = mapping->a_ops->direct_IO(rw, iocb, iov,
2294 offset, nr_segs);
2295 if (rw == WRITE && mapping->nrpages) {
2296 pgoff_t end = (offset + write_len - 1)
2297 >> PAGE_CACHE_SHIFT;
2298 int err = invalidate_inode_pages2_range(mapping,
2299 offset >> PAGE_CACHE_SHIFT, end);
2300 if (err)
2301 retval = err;
2302 }
2303 }
2304 return retval;
2305}
2306EXPORT_SYMBOL_GPL(generic_file_direct_IO);
diff --git a/mm/fremap.c b/mm/fremap.c
new file mode 100644
index 00000000000..3235fb77c13
--- /dev/null
+++ b/mm/fremap.c
@@ -0,0 +1,256 @@
1/*
2 * linux/mm/fremap.c
3 *
4 * Explicit pagetable population and nonlinear (random) mappings support.
5 *
6 * started by Ingo Molnar, Copyright (C) 2002, 2003
7 */
8
9#include <linux/mm.h>
10#include <linux/swap.h>
11#include <linux/file.h>
12#include <linux/mman.h>
13#include <linux/pagemap.h>
14#include <linux/swapops.h>
15#include <linux/rmap.h>
16#include <linux/module.h>
17#include <linux/syscalls.h>
18
19#include <asm/mmu_context.h>
20#include <asm/cacheflush.h>
21#include <asm/tlbflush.h>
22
23static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
24 unsigned long addr, pte_t *ptep)
25{
26 pte_t pte = *ptep;
27
28 if (pte_none(pte))
29 return;
30 if (pte_present(pte)) {
31 unsigned long pfn = pte_pfn(pte);
32
33 flush_cache_page(vma, addr, pfn);
34 pte = ptep_clear_flush(vma, addr, ptep);
35 if (pfn_valid(pfn)) {
36 struct page *page = pfn_to_page(pfn);
37 if (!PageReserved(page)) {
38 if (pte_dirty(pte))
39 set_page_dirty(page);
40 page_remove_rmap(page);
41 page_cache_release(page);
42 dec_mm_counter(mm, rss);
43 }
44 }
45 } else {
46 if (!pte_file(pte))
47 free_swap_and_cache(pte_to_swp_entry(pte));
48 pte_clear(mm, addr, ptep);
49 }
50}
51
52/*
53 * Install a file page to a given virtual memory address, release any
54 * previously existing mapping.
55 */
56int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
57 unsigned long addr, struct page *page, pgprot_t prot)
58{
59 struct inode *inode;
60 pgoff_t size;
61 int err = -ENOMEM;
62 pte_t *pte;
63 pmd_t *pmd;
64 pud_t *pud;
65 pgd_t *pgd;
66 pte_t pte_val;
67
68 pgd = pgd_offset(mm, addr);
69 spin_lock(&mm->page_table_lock);
70
71 pud = pud_alloc(mm, pgd, addr);
72 if (!pud)
73 goto err_unlock;
74
75 pmd = pmd_alloc(mm, pud, addr);
76 if (!pmd)
77 goto err_unlock;
78
79 pte = pte_alloc_map(mm, pmd, addr);
80 if (!pte)
81 goto err_unlock;
82
83 /*
84 * This page may have been truncated. Tell the
85 * caller about it.
86 */
87 err = -EINVAL;
88 inode = vma->vm_file->f_mapping->host;
89 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
90 if (!page->mapping || page->index >= size)
91 goto err_unlock;
92
93 zap_pte(mm, vma, addr, pte);
94
95 inc_mm_counter(mm,rss);
96 flush_icache_page(vma, page);
97 set_pte_at(mm, addr, pte, mk_pte(page, prot));
98 page_add_file_rmap(page);
99 pte_val = *pte;
100 pte_unmap(pte);
101 update_mmu_cache(vma, addr, pte_val);
102
103 err = 0;
104err_unlock:
105 spin_unlock(&mm->page_table_lock);
106 return err;
107}
108EXPORT_SYMBOL(install_page);
109
110
111/*
112 * Install a file pte to a given virtual memory address, release any
113 * previously existing mapping.
114 */
115int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
116 unsigned long addr, unsigned long pgoff, pgprot_t prot)
117{
118 int err = -ENOMEM;
119 pte_t *pte;
120 pmd_t *pmd;
121 pud_t *pud;
122 pgd_t *pgd;
123 pte_t pte_val;
124
125 pgd = pgd_offset(mm, addr);
126 spin_lock(&mm->page_table_lock);
127
128 pud = pud_alloc(mm, pgd, addr);
129 if (!pud)
130 goto err_unlock;
131
132 pmd = pmd_alloc(mm, pud, addr);
133 if (!pmd)
134 goto err_unlock;
135
136 pte = pte_alloc_map(mm, pmd, addr);
137 if (!pte)
138 goto err_unlock;
139
140 zap_pte(mm, vma, addr, pte);
141
142 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
143 pte_val = *pte;
144 pte_unmap(pte);
145 update_mmu_cache(vma, addr, pte_val);
146 spin_unlock(&mm->page_table_lock);
147 return 0;
148
149err_unlock:
150 spin_unlock(&mm->page_table_lock);
151 return err;
152}
153
154
155/***
156 * sys_remap_file_pages - remap arbitrary pages of a shared backing store
157 * file within an existing vma.
158 * @start: start of the remapped virtual memory range
159 * @size: size of the remapped virtual memory range
160 * @prot: new protection bits of the range
161 * @pgoff: to be mapped page of the backing store file
162 * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
163 *
164 * this syscall works purely via pagetables, so it's the most efficient
165 * way to map the same (large) file into a given virtual window. Unlike
166 * mmap()/mremap() it does not create any new vmas. The new mappings are
167 * also safe across swapout.
168 *
169 * NOTE: the 'prot' parameter right now is ignored, and the vma's default
170 * protection is used. Arbitrary protections might be implemented in the
171 * future.
172 */
173asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
174 unsigned long __prot, unsigned long pgoff, unsigned long flags)
175{
176 struct mm_struct *mm = current->mm;
177 struct address_space *mapping;
178 unsigned long end = start + size;
179 struct vm_area_struct *vma;
180 int err = -EINVAL;
181 int has_write_lock = 0;
182
183 if (__prot)
184 return err;
185 /*
186 * Sanitize the syscall parameters:
187 */
188 start = start & PAGE_MASK;
189 size = size & PAGE_MASK;
190
191 /* Does the address range wrap, or is the span zero-sized? */
192 if (start + size <= start)
193 return err;
194
195 /* Can we represent this offset inside this architecture's pte's? */
196#if PTE_FILE_MAX_BITS < BITS_PER_LONG
197 if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
198 return err;
199#endif
200
201 /* We need down_write() to change vma->vm_flags. */
202 down_read(&mm->mmap_sem);
203 retry:
204 vma = find_vma(mm, start);
205
206 /*
207 * Make sure the vma is shared, that it supports prefaulting,
208 * and that the remapped range is valid and fully within
209 * the single existing vma. vm_private_data is used as a
210 * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED
211 * or VM_LOCKED, but VM_LOCKED could be revoked later on).
212 */
213 if (vma && (vma->vm_flags & VM_SHARED) &&
214 (!vma->vm_private_data ||
215 (vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) &&
216 vma->vm_ops && vma->vm_ops->populate &&
217 end > start && start >= vma->vm_start &&
218 end <= vma->vm_end) {
219
220 /* Must set VM_NONLINEAR before any pages are populated. */
221 if (pgoff != linear_page_index(vma, start) &&
222 !(vma->vm_flags & VM_NONLINEAR)) {
223 if (!has_write_lock) {
224 up_read(&mm->mmap_sem);
225 down_write(&mm->mmap_sem);
226 has_write_lock = 1;
227 goto retry;
228 }
229 mapping = vma->vm_file->f_mapping;
230 spin_lock(&mapping->i_mmap_lock);
231 flush_dcache_mmap_lock(mapping);
232 vma->vm_flags |= VM_NONLINEAR;
233 vma_prio_tree_remove(vma, &mapping->i_mmap);
234 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
235 flush_dcache_mmap_unlock(mapping);
236 spin_unlock(&mapping->i_mmap_lock);
237 }
238
239 err = vma->vm_ops->populate(vma, start, size,
240 vma->vm_page_prot,
241 pgoff, flags & MAP_NONBLOCK);
242
243 /*
244 * We can't clear VM_NONLINEAR because we'd have to do
245 * it after ->populate completes, and that would prevent
246 * downgrading the lock. (Locks can't be upgraded).
247 */
248 }
249 if (likely(!has_write_lock))
250 up_read(&mm->mmap_sem);
251 else
252 up_write(&mm->mmap_sem);
253
254 return err;
255}
256
diff --git a/mm/highmem.c b/mm/highmem.c
new file mode 100644
index 00000000000..d01276506b0
--- /dev/null
+++ b/mm/highmem.c
@@ -0,0 +1,607 @@
1/*
2 * High memory handling common code and variables.
3 *
4 * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
5 * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
6 *
7 *
8 * Redesigned the x86 32-bit VM architecture to deal with
9 * 64-bit physical space. With current x86 CPUs this
10 * means up to 64 Gigabytes physical RAM.
11 *
12 * Rewrote high memory support to move the page cache into
13 * high memory. Implemented permanent (schedulable) kmaps
14 * based on Linus' idea.
15 *
16 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
17 */
18
19#include <linux/mm.h>
20#include <linux/module.h>
21#include <linux/swap.h>
22#include <linux/bio.h>
23#include <linux/pagemap.h>
24#include <linux/mempool.h>
25#include <linux/blkdev.h>
26#include <linux/init.h>
27#include <linux/hash.h>
28#include <linux/highmem.h>
29#include <asm/tlbflush.h>
30
31static mempool_t *page_pool, *isa_page_pool;
32
33static void *page_pool_alloc(unsigned int __nocast gfp_mask, void *data)
34{
35 unsigned int gfp = gfp_mask | (unsigned int) (long) data;
36
37 return alloc_page(gfp);
38}
39
/*
 * mempool free callback: give the page back with __free_page().
 * @data is unused.
 */
static void page_pool_free(void *page, void *data)
{
	struct page *victim = page;

	__free_page(victim);
}
44
45/*
46 * Virtual_count is not a pure "count".
47 * 0 means that it is not mapped, and has not been mapped
48 * since a TLB flush - it is usable.
49 * 1 means that there are no users, but it has been mapped
50 * since the last TLB flush - so we can't use it.
51 * n means that there are (n-1) current users of it.
52 */
53#ifdef CONFIG_HIGHMEM
54static int pkmap_count[LAST_PKMAP];
55static unsigned int last_pkmap_nr;
56static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
57
58pte_t * pkmap_page_table;
59
60static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
61
62static void flush_all_zero_pkmaps(void)
63{
64 int i;
65
66 flush_cache_kmaps();
67
68 for (i = 0; i < LAST_PKMAP; i++) {
69 struct page *page;
70
71 /*
72 * zero means we don't have anything to do,
73 * >1 means that it is still in use. Only
74 * a count of 1 means that it is free but
75 * needs to be unmapped
76 */
77 if (pkmap_count[i] != 1)
78 continue;
79 pkmap_count[i] = 0;
80
81 /* sanity check */
82 if (pte_none(pkmap_page_table[i]))
83 BUG();
84
85 /*
86 * Don't need an atomic fetch-and-clear op here;
87 * no-one has the page mapped, and cannot get at
88 * its virtual address (and hence PTE) without first
89 * getting the kmap_lock (which is held here).
90 * So no dangers, even with speculative execution.
91 */
92 page = pte_page(pkmap_page_table[i]);
93 pte_clear(&init_mm, (unsigned long)page_address(page),
94 &pkmap_page_table[i]);
95
96 set_page_address(page, NULL);
97 }
98 flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
99}
100
/*
 * Map @page into a free pkmap slot and return its new virtual address.
 *
 * Entered with kmap_lock held (it is dropped and re-taken around the sleep
 * below).  Slots are scanned round-robin from last_pkmap_nr; each time the
 * scan wraps to slot 0, flush_all_zero_pkmaps() recycles the unused-but-
 * mapped slots.  If a full pass finds nothing, the task sleeps on
 * pkmap_map_wait and then starts over, unless the page got mapped by
 * somebody else in the meantime.
 *
 * The chosen slot's count is set to 1 here.
 */
101static inline unsigned long map_new_virtual(struct page *page)
102{
103 unsigned long vaddr;
104 int count;
105
106start:
107 count = LAST_PKMAP;
108 /* Find an empty entry */
109 for (;;) {
110 last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
 /* Wrapped around: recycle slots and restart the pass budget */
111 if (!last_pkmap_nr) {
112 flush_all_zero_pkmaps();
113 count = LAST_PKMAP;
114 }
115 if (!pkmap_count[last_pkmap_nr])
116 break; /* Found a usable entry */
117 if (--count)
118 continue;
119
120 /*
121 * Sleep for somebody else to unmap their entries
122 */
123 {
124 DECLARE_WAITQUEUE(wait, current);
125
 /* Queue ourselves before dropping the lock, then sleep */
126 __set_current_state(TASK_UNINTERRUPTIBLE);
127 add_wait_queue(&pkmap_map_wait, &wait);
128 spin_unlock(&kmap_lock);
129 schedule();
130 remove_wait_queue(&pkmap_map_wait, &wait);
131 spin_lock(&kmap_lock);
132
133 /* Somebody else might have mapped it while we slept */
134 if (page_address(page))
135 return (unsigned long)page_address(page);
136
137 /* Re-start */
138 goto start;
139 }
140 }
 /* Install the mapping in the free slot and record the page's address */
141 vaddr = PKMAP_ADDR(last_pkmap_nr);
142 set_pte_at(&init_mm, vaddr,
143 &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
144
145 pkmap_count[last_pkmap_nr] = 1;
146 set_page_address(page, (void *)vaddr);
147
148 return vaddr;
149}
150
151void fastcall *kmap_high(struct page *page)
152{
153 unsigned long vaddr;
154
155 /*
156 * For highmem pages, we can't trust "virtual" until
157 * after we have the lock.
158 *
159 * We cannot call this from interrupts, as it may block
160 */
161 spin_lock(&kmap_lock);
162 vaddr = (unsigned long)page_address(page);
163 if (!vaddr)
164 vaddr = map_new_virtual(page);
165 pkmap_count[PKMAP_NR(vaddr)]++;
166 if (pkmap_count[PKMAP_NR(vaddr)] < 2)
167 BUG();
168 spin_unlock(&kmap_lock);
169 return (void*) vaddr;
170}
171
172EXPORT_SYMBOL(kmap_high);
173
/*
 * Drop one user reference on the pkmap slot that maps @page.
 *
 * The page must currently have a virtual address (BUG otherwise).  When the
 * slot's count drops to 1 - no users left, but still mapped - tasks waiting
 * in map_new_virtual() for a free slot are woken up.
 */
174void fastcall kunmap_high(struct page *page)
175{
176 unsigned long vaddr;
177 unsigned long nr;
178 int need_wakeup;
179
180 spin_lock(&kmap_lock);
181 vaddr = (unsigned long)page_address(page);
182 if (!vaddr)
183 BUG();
184 nr = PKMAP_NR(vaddr);
185
186 /*
187 * A count must never go down to zero
188 * without a TLB flush!
189 */
190 need_wakeup = 0;
191 switch (--pkmap_count[nr]) {
192 case 0:
193 BUG();
 /* no break: BUG() is not expected to return */
194 case 1:
195 /*
196 * Avoid an unnecessary wake_up() function call.
197 * The common case is pkmap_count[] == 1, but
198 * no waiters.
199 * The tasks queued in the wait-queue are guarded
200 * by both the lock in the wait-queue-head and by
201 * the kmap_lock. As the kmap_lock is held here,
202 * no need for the wait-queue-head's lock. Simply
203 * test if the queue is empty.
204 */
205 need_wakeup = waitqueue_active(&pkmap_map_wait);
206 }
207 spin_unlock(&kmap_lock);
208
209 /* do wake-up, if needed, race-free outside of the spin lock */
210 if (need_wakeup)
211 wake_up(&pkmap_map_wait);
212}
213
214EXPORT_SYMBOL(kunmap_high);
215
216#define POOL_SIZE 64
217
218static __init int init_emergency_pool(void)
219{
220 struct sysinfo i;
221 si_meminfo(&i);
222 si_swapinfo(&i);
223
224 if (!i.totalhigh)
225 return 0;
226
227 page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL);
228 if (!page_pool)
229 BUG();
230 printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
231
232 return 0;
233}
234
235__initcall(init_emergency_pool);
236
237/*
238 * highmem version, map in to vec
239 */
240static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
241{
242 unsigned long flags;
243 unsigned char *vto;
244
245 local_irq_save(flags);
246 vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
247 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
248 kunmap_atomic(vto, KM_BOUNCE_READ);
249 local_irq_restore(flags);
250}
251
252#else /* CONFIG_HIGHMEM */
253
254#define bounce_copy_vec(to, vfrom) \
255 memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
256
257#endif
258
259#define ISA_POOL_SIZE 16
260
261/*
262 * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
263 * as the max address, so check if the pool has already been created.
264 */
265int init_emergency_isa_pool(void)
266{
267 if (isa_page_pool)
268 return 0;
269
270 isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA);
271 if (!isa_page_pool)
272 BUG();
273
274 printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
275 return 0;
276}
277
278/*
279 * Simple bounce buffer support for highmem pages. Depending on the
280 * queue gfp mask set, *to may or may not be a highmem page. kmap it
281 * always, it will do the Right Thing
282 */
283static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
284{
285 unsigned char *vfrom;
286 struct bio_vec *tovec, *fromvec;
287 int i;
288
289 __bio_for_each_segment(tovec, to, i, 0) {
290 fromvec = from->bi_io_vec + i;
291
292 /*
293 * not bounced
294 */
295 if (tovec->bv_page == fromvec->bv_page)
296 continue;
297
298 /*
299 * fromvec->bv_offset and fromvec->bv_len might have been
300 * modified by the block layer, so use the original copy,
301 * bounce_copy_vec already uses tovec->bv_len
302 */
303 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
304
305 flush_dcache_page(tovec->bv_page);
306 bounce_copy_vec(tovec, vfrom);
307 }
308}
309
310static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
311{
312 struct bio *bio_orig = bio->bi_private;
313 struct bio_vec *bvec, *org_vec;
314 int i;
315
316 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
317 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
318
319 /*
320 * free up bounce indirect pages used
321 */
322 __bio_for_each_segment(bvec, bio, i, 0) {
323 org_vec = bio_orig->bi_io_vec + i;
324 if (bvec->bv_page == org_vec->bv_page)
325 continue;
326
327 mempool_free(bvec->bv_page, pool);
328 }
329
330 bio_endio(bio_orig, bio_orig->bi_size, err);
331 bio_put(bio);
332}
333
334static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err)
335{
336 if (bio->bi_size)
337 return 1;
338
339 bounce_end_io(bio, page_pool, err);
340 return 0;
341}
342
343static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
344{
345 if (bio->bi_size)
346 return 1;
347
348 bounce_end_io(bio, isa_page_pool, err);
349 return 0;
350}
351
352static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
353{
354 struct bio *bio_orig = bio->bi_private;
355
356 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
357 copy_to_high_bio_irq(bio_orig, bio);
358
359 bounce_end_io(bio, pool, err);
360}
361
362static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
363{
364 if (bio->bi_size)
365 return 1;
366
367 __bounce_end_io_read(bio, page_pool, err);
368 return 0;
369}
370
371static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
372{
373 if (bio->bi_size)
374 return 1;
375
376 __bounce_end_io_read(bio, isa_page_pool, err);
377 return 0;
378}
379
/*
 * Slow path of blk_queue_bounce().
 *
 * Walks the segments of *bio_orig and, for every page whose pfn is not
 * below q->bounce_pfn, substitutes a page taken from @pool in a newly
 * allocated bio.  For writes the data is copied into the substitute
 * page right away.  If at least one segment was substituted, the new
 * bio inherits the device, sector, flags and counters of the original,
 * gets the end_io handler matching @pool and the data direction, keeps
 * the original in bi_private, and replaces *bio_orig.
 *
 * If no segment needed a substitute page, *bio_orig is left untouched.
 */
380static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
381 mempool_t *pool)
382{
383 struct page *page;
384 struct bio *bio = NULL;
385 int i, rw = bio_data_dir(*bio_orig);
386 struct bio_vec *to, *from;
387
388 bio_for_each_segment(from, *bio_orig, i) {
389 page = from->bv_page;
390
391 /*
392 * is destination page below bounce pfn?
393 */
394 if (page_to_pfn(page) < q->bounce_pfn)
395 continue;
396
397 /*
398 * irk, bounce it
399 */
/* the new bio is allocated lazily, on the first segment that needs it */
400 if (!bio)
401 bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
402
/* same segment index in the new bio as in the original */
403 to = bio->bi_io_vec + i;
404
405 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
406 to->bv_len = from->bv_len;
407 to->bv_offset = from->bv_offset;
408
409 if (rw == WRITE) {
410 char *vto, *vfrom;
411
/* writes: fill the substitute page with the caller's data now */
412 flush_dcache_page(from->bv_page);
413 vto = page_address(to->bv_page) + to->bv_offset;
414 vfrom = kmap(from->bv_page) + from->bv_offset;
415 memcpy(vto, vfrom, to->bv_len);
416 kunmap(from->bv_page);
417 }
418 }
419
420 /*
421 * no pages bounced
422 */
423 if (!bio)
424 return;
425
426 /*
427 * at least one page was bounced, fill in possible non-highmem
428 * pages
429 */
/*
 * NOTE(review): the !to->bv_page test assumes bio_alloc() returns
 * zeroed bio_vec entries - confirm against bio_alloc().
 */
430 __bio_for_each_segment(from, *bio_orig, i, 0) {
431 to = bio_iovec_idx(bio, i);
432 if (!to->bv_page) {
433 to->bv_page = from->bv_page;
434 to->bv_len = from->bv_len;
435 to->bv_offset = from->bv_offset;
436 }
437 }
438
439 bio->bi_bdev = (*bio_orig)->bi_bdev;
440 bio->bi_flags |= (1 << BIO_BOUNCED);
441 bio->bi_sector = (*bio_orig)->bi_sector;
442 bio->bi_rw = (*bio_orig)->bi_rw;
443
444 bio->bi_vcnt = (*bio_orig)->bi_vcnt;
445 bio->bi_idx = (*bio_orig)->bi_idx;
446 bio->bi_size = (*bio_orig)->bi_size;
447
/* completion handler depends on which pool the pages came from and on rw */
448 if (pool == page_pool) {
449 bio->bi_end_io = bounce_end_io_write;
450 if (rw == READ)
451 bio->bi_end_io = bounce_end_io_read;
452 } else {
453 bio->bi_end_io = bounce_end_io_write_isa;
454 if (rw == READ)
455 bio->bi_end_io = bounce_end_io_read_isa;
456 }
457
/* the end_io handlers find the original bio through bi_private */
458 bio->bi_private = *bio_orig;
459 *bio_orig = bio;
460}
461
462void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
463{
464 mempool_t *pool;
465
466 /*
467 * for non-isa bounce case, just check if the bounce pfn is equal
468 * to or bigger than the highest pfn in the system -- in that case,
469 * don't waste time iterating over bio segments
470 */
471 if (!(q->bounce_gfp & GFP_DMA)) {
472 if (q->bounce_pfn >= blk_max_pfn)
473 return;
474 pool = page_pool;
475 } else {
476 BUG_ON(!isa_page_pool);
477 pool = isa_page_pool;
478 }
479
480 /*
481 * slow path
482 */
483 __blk_queue_bounce(q, bio_orig, pool);
484}
485
486EXPORT_SYMBOL(blk_queue_bounce);
487
488#if defined(HASHED_PAGE_VIRTUAL)
489
490#define PA_HASH_ORDER 7
491
492/*
493 * Describes one page->virtual association
494 */
495struct page_address_map {
496 struct page *page; /* the highmem page this entry describes */
497 void *virtual; /* address recorded for it by set_page_address() */
498 struct list_head list; /* links into a hash bucket or page_address_pool */
499};
500
501/*
502 * page_address_map freelist, allocated from page_address_maps.
503 */
504static struct list_head page_address_pool; /* freelist */
505static spinlock_t pool_lock; /* protects page_address_pool */
506
507/*
508 * Hash table bucket
509 */
/* One bucket per hash value; page_slot() selects the bucket for a page. */
510static struct page_address_slot {
511 struct list_head lh; /* List of page_address_maps */
512 spinlock_t lock; /* Protect this bucket's list */
513} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
514
515static struct page_address_slot *page_slot(struct page *page)
516{
517 return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
518}
519
520void *page_address(struct page *page)
521{
522 unsigned long flags;
523 void *ret;
524 struct page_address_slot *pas;
525
526 if (!PageHighMem(page))
527 return lowmem_page_address(page);
528
529 pas = page_slot(page);
530 ret = NULL;
531 spin_lock_irqsave(&pas->lock, flags);
532 if (!list_empty(&pas->lh)) {
533 struct page_address_map *pam;
534
535 list_for_each_entry(pam, &pas->lh, list) {
536 if (pam->page == page) {
537 ret = pam->virtual;
538 goto done;
539 }
540 }
541 }
542done:
543 spin_unlock_irqrestore(&pas->lock, flags);
544 return ret;
545}
546
547EXPORT_SYMBOL(page_address);
548
549void set_page_address(struct page *page, void *virtual)
550{
551 unsigned long flags;
552 struct page_address_slot *pas;
553 struct page_address_map *pam;
554
555 BUG_ON(!PageHighMem(page));
556
557 pas = page_slot(page);
558 if (virtual) { /* Add */
559 BUG_ON(list_empty(&page_address_pool));
560
561 spin_lock_irqsave(&pool_lock, flags);
562 pam = list_entry(page_address_pool.next,
563 struct page_address_map, list);
564 list_del(&pam->list);
565 spin_unlock_irqrestore(&pool_lock, flags);
566
567 pam->page = page;
568 pam->virtual = virtual;
569
570 spin_lock_irqsave(&pas->lock, flags);
571 list_add_tail(&pam->list, &pas->lh);
572 spin_unlock_irqrestore(&pas->lock, flags);
573 } else { /* Remove */
574 spin_lock_irqsave(&pas->lock, flags);
575 list_for_each_entry(pam, &pas->lh, list) {
576 if (pam->page == page) {
577 list_del(&pam->list);
578 spin_unlock_irqrestore(&pas->lock, flags);
579 spin_lock_irqsave(&pool_lock, flags);
580 list_add_tail(&pam->list, &page_address_pool);
581 spin_unlock_irqrestore(&pool_lock, flags);
582 goto done;
583 }
584 }
585 spin_unlock_irqrestore(&pas->lock, flags);
586 }
587done:
588 return;
589}
590
591static struct page_address_map page_address_maps[LAST_PKMAP];
592
593void __init page_address_init(void)
594{
595 int i;
596
597 INIT_LIST_HEAD(&page_address_pool);
598 for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
599 list_add(&page_address_maps[i].list, &page_address_pool);
600 for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
601 INIT_LIST_HEAD(&page_address_htable[i].lh);
602 spin_lock_init(&page_address_htable[i].lock);
603 }
604 spin_lock_init(&pool_lock);
605}
606
607#endif /* defined(HASHED_PAGE_VIRTUAL) */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
new file mode 100644
index 00000000000..4eb5ae3fbe1
--- /dev/null
+++ b/mm/hugetlb.c
@@ -0,0 +1,260 @@
1/*
2 * Generic hugetlb support.
3 * (C) William Irwin, April 2004
4 */
5#include <linux/gfp.h>
6#include <linux/list.h>
7#include <linux/init.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/hugetlb.h>
11#include <linux/sysctl.h>
12#include <linux/highmem.h>
13#include <linux/nodemask.h>
14
/* NOTE(review): not referenced in this file; presumably sysctl bounds - confirm */
15const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
/* pages obtained by alloc_fresh_huge_page() / pages currently on the freelists */
16static unsigned long nr_huge_pages, free_huge_pages;
/* requested pool size: set by "hugepages=" and the sysctl handler */
17unsigned long max_huge_pages;
/* per-node freelists and the per-node equivalents of the counters above */
18static struct list_head hugepage_freelists[MAX_NUMNODES];
19static unsigned int nr_huge_pages_node[MAX_NUMNODES];
20static unsigned int free_huge_pages_node[MAX_NUMNODES];
/* taken around enqueue_huge_page()/dequeue_huge_page() by the callers in this file */
21static DEFINE_SPINLOCK(hugetlb_lock);
22
23static void enqueue_huge_page(struct page *page)
24{
25 int nid = page_to_nid(page);
26 list_add(&page->lru, &hugepage_freelists[nid]);
27 free_huge_pages++;
28 free_huge_pages_node[nid]++;
29}
30
31static struct page *dequeue_huge_page(void)
32{
33 int nid = numa_node_id();
34 struct page *page = NULL;
35
36 if (list_empty(&hugepage_freelists[nid])) {
37 for (nid = 0; nid < MAX_NUMNODES; ++nid)
38 if (!list_empty(&hugepage_freelists[nid]))
39 break;
40 }
41 if (nid >= 0 && nid < MAX_NUMNODES &&
42 !list_empty(&hugepage_freelists[nid])) {
43 page = list_entry(hugepage_freelists[nid].next,
44 struct page, lru);
45 list_del(&page->lru);
46 free_huge_pages--;
47 free_huge_pages_node[nid]--;
48 }
49 return page;
50}
51
52static struct page *alloc_fresh_huge_page(void)
53{
54 static int nid = 0;
55 struct page *page;
56 page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
57 HUGETLB_PAGE_ORDER);
58 nid = (nid + 1) % num_online_nodes();
59 if (page) {
60 nr_huge_pages++;
61 nr_huge_pages_node[page_to_nid(page)]++;
62 }
63 return page;
64}
65
66void free_huge_page(struct page *page)
67{
68 BUG_ON(page_count(page));
69
70 INIT_LIST_HEAD(&page->lru);
71 page[1].mapping = NULL;
72
73 spin_lock(&hugetlb_lock);
74 enqueue_huge_page(page);
75 spin_unlock(&hugetlb_lock);
76}
77
78struct page *alloc_huge_page(void)
79{
80 struct page *page;
81 int i;
82
83 spin_lock(&hugetlb_lock);
84 page = dequeue_huge_page();
85 if (!page) {
86 spin_unlock(&hugetlb_lock);
87 return NULL;
88 }
89 spin_unlock(&hugetlb_lock);
90 set_page_count(page, 1);
91 page[1].mapping = (void *)free_huge_page;
92 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
93 clear_highpage(&page[i]);
94 return page;
95}
96
97static int __init hugetlb_init(void)
98{
99 unsigned long i;
100 struct page *page;
101
102 for (i = 0; i < MAX_NUMNODES; ++i)
103 INIT_LIST_HEAD(&hugepage_freelists[i]);
104
105 for (i = 0; i < max_huge_pages; ++i) {
106 page = alloc_fresh_huge_page();
107 if (!page)
108 break;
109 spin_lock(&hugetlb_lock);
110 enqueue_huge_page(page);
111 spin_unlock(&hugetlb_lock);
112 }
113 max_huge_pages = free_huge_pages = nr_huge_pages = i;
114 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
115 return 0;
116}
117module_init(hugetlb_init);
118
119static int __init hugetlb_setup(char *s)
120{
121 if (sscanf(s, "%lu", &max_huge_pages) <= 0)
122 max_huge_pages = 0;
123 return 1;
124}
125__setup("hugepages=", hugetlb_setup);
126
127#ifdef CONFIG_SYSCTL
128static void update_and_free_page(struct page *page)
129{
130 int i;
131 nr_huge_pages--;
132 nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
133 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
134 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
135 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
136 1 << PG_private | 1<< PG_writeback);
137 set_page_count(&page[i], 0);
138 }
139 set_page_count(page, 1);
140 __free_pages(page, HUGETLB_PAGE_ORDER);
141}
142
143#ifdef CONFIG_HIGHMEM
144static void try_to_free_low(unsigned long count)
145{
146 int i, nid;
147 for (i = 0; i < MAX_NUMNODES; ++i) {
148 struct page *page, *next;
149 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
150 if (PageHighMem(page))
151 continue;
152 list_del(&page->lru);
153 update_and_free_page(page);
154 nid = page_zone(page)->zone_pgdat->node_id;
155 free_huge_pages--;
156 free_huge_pages_node[nid]--;
157 if (count >= nr_huge_pages)
158 return;
159 }
160 }
161}
162#else
/* !CONFIG_HIGHMEM: nothing to do */
static inline void try_to_free_low(unsigned long count) { }
166#endif
167
168static unsigned long set_max_huge_pages(unsigned long count)
169{
170 while (count > nr_huge_pages) {
171 struct page *page = alloc_fresh_huge_page();
172 if (!page)
173 return nr_huge_pages;
174 spin_lock(&hugetlb_lock);
175 enqueue_huge_page(page);
176 spin_unlock(&hugetlb_lock);
177 }
178 if (count >= nr_huge_pages)
179 return nr_huge_pages;
180
181 spin_lock(&hugetlb_lock);
182 try_to_free_low(count);
183 while (count < nr_huge_pages) {
184 struct page *page = dequeue_huge_page();
185 if (!page)
186 break;
187 update_and_free_page(page);
188 }
189 spin_unlock(&hugetlb_lock);
190 return nr_huge_pages;
191}
192
193int hugetlb_sysctl_handler(struct ctl_table *table, int write,
194 struct file *file, void __user *buffer,
195 size_t *length, loff_t *ppos)
196{
197 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
198 max_huge_pages = set_max_huge_pages(max_huge_pages);
199 return 0;
200}
201#endif /* CONFIG_SYSCTL */
202
203int hugetlb_report_meminfo(char *buf)
204{
205 return sprintf(buf,
206 "HugePages_Total: %5lu\n"
207 "HugePages_Free: %5lu\n"
208 "Hugepagesize: %5lu kB\n",
209 nr_huge_pages,
210 free_huge_pages,
211 HPAGE_SIZE/1024);
212}
213
214int hugetlb_report_node_meminfo(int nid, char *buf)
215{
216 return sprintf(buf,
217 "Node %d HugePages_Total: %5u\n"
218 "Node %d HugePages_Free: %5u\n",
219 nid, nr_huge_pages_node[nid],
220 nid, free_huge_pages_node[nid]);
221}
222
223int is_hugepage_mem_enough(size_t size)
224{
225 return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
226}
227
228/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
229unsigned long hugetlb_total_pages(void)
230{
231 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
232}
233EXPORT_SYMBOL(hugetlb_total_pages);
234
235/*
236 * We cannot handle pagefaults against hugetlb pages at all. They cause
237 * handle_mm_fault() to try to instantiate regular-sized pages in the
238 * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
239 * this far.
240 */
241static struct page *hugetlb_nopage(struct vm_area_struct *vma,
242 unsigned long address, int *unused)
243{
244 BUG();
245 return NULL;
246}
247
/* vm operations for hugetlb mappings: the only hook, ->nopage, BUGs. */
248struct vm_operations_struct hugetlb_vm_ops = {
249 .nopage = hugetlb_nopage,
250};
251
252void zap_hugepage_range(struct vm_area_struct *vma,
253 unsigned long start, unsigned long length)
254{
255 struct mm_struct *mm = vma->vm_mm;
256
257 spin_lock(&mm->page_table_lock);
258 unmap_hugepage_range(vma, start, start + length);
259 spin_unlock(&mm->page_table_lock);
260}
diff --git a/mm/internal.h b/mm/internal.h
new file mode 100644
index 00000000000..6bf134e8fb3
--- /dev/null
+++ b/mm/internal.h
@@ -0,0 +1,13 @@
1/* internal.h: mm/ internal definitions
2 *
3 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12/* page_alloc.c */
13extern void set_page_refs(struct page *page, int order);
diff --git a/mm/madvise.c b/mm/madvise.c
new file mode 100644
index 00000000000..944b5e52d81
--- /dev/null
+++ b/mm/madvise.c
@@ -0,0 +1,242 @@
1/*
2 * linux/mm/madvise.c
3 *
4 * Copyright (C) 1999 Linus Torvalds
5 * Copyright (C) 2002 Christoph Hellwig
6 */
7
8#include <linux/mman.h>
9#include <linux/pagemap.h>
10#include <linux/syscalls.h>
11#include <linux/hugetlb.h>
12
13/*
14 * We can potentially split a vm area into separate
15 * areas, each area with its own behavior.
16 */
17static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
18 unsigned long end, int behavior)
19{
20 struct mm_struct * mm = vma->vm_mm;
21 int error = 0;
22
23 if (start != vma->vm_start) {
24 error = split_vma(mm, vma, start, 1);
25 if (error)
26 goto out;
27 }
28
29 if (end != vma->vm_end) {
30 error = split_vma(mm, vma, end, 0);
31 if (error)
32 goto out;
33 }
34
35 /*
36 * vm_flags is protected by the mmap_sem held in write mode.
37 */
38 VM_ClearReadHint(vma);
39
40 switch (behavior) {
41 case MADV_SEQUENTIAL:
42 vma->vm_flags |= VM_SEQ_READ;
43 break;
44 case MADV_RANDOM:
45 vma->vm_flags |= VM_RAND_READ;
46 break;
47 default:
48 break;
49 }
50
51out:
52 if (error == -ENOMEM)
53 error = -EAGAIN;
54 return error;
55}
56
57/*
58 * Schedule all required I/O operations. Do not wait for completion.
59 */
60static long madvise_willneed(struct vm_area_struct * vma,
61 unsigned long start, unsigned long end)
62{
63 struct file *file = vma->vm_file;
64
65 if (!file)
66 return -EBADF;
67
68 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
69 if (end > vma->vm_end)
70 end = vma->vm_end;
71 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
72
73 force_page_cache_readahead(file->f_mapping,
74 file, start, max_sane_readahead(end - start));
75 return 0;
76}
77
78/*
79 * Application no longer needs these pages. If the pages are dirty,
80 * it's OK to just throw them away. The app will be more careful about
81 * data it wants to keep. Be sure to free swap resources too. The
82 * zap_page_range call sets things up for refill_inactive to actually free
83 * these pages later if no one else has touched them in the meantime,
84 * although we could add these pages to a global reuse list for
85 * refill_inactive to pick up before reclaiming other pages.
86 *
87 * NB: This interface discards data rather than pushes it out to swap,
88 * as some implementations do. This has performance implications for
89 * applications like large transactional databases which want to discard
90 * pages in anonymous maps after committing to backing store the data
91 * that was kept in them. There is no reason to write this data out to
92 * the swap area if the application is discarding it.
93 *
94 * An interface that causes the system to free clean pages and flush
95 * dirty pages is already available as msync(MS_INVALIDATE).
96 */
97static long madvise_dontneed(struct vm_area_struct * vma,
98 unsigned long start, unsigned long end)
99{
100 if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
101 return -EINVAL;
102
103 if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
104 struct zap_details details = {
105 .nonlinear_vma = vma,
106 .last_index = ULONG_MAX,
107 };
108 zap_page_range(vma, start, end - start, &details);
109 } else
110 zap_page_range(vma, start, end - start, NULL);
111 return 0;
112}
113
114static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
115 unsigned long end, int behavior)
116{
117 long error = -EBADF;
118
119 switch (behavior) {
120 case MADV_NORMAL:
121 case MADV_SEQUENTIAL:
122 case MADV_RANDOM:
123 error = madvise_behavior(vma, start, end, behavior);
124 break;
125
126 case MADV_WILLNEED:
127 error = madvise_willneed(vma, start, end);
128 break;
129
130 case MADV_DONTNEED:
131 error = madvise_dontneed(vma, start, end);
132 break;
133
134 default:
135 error = -EINVAL;
136 break;
137 }
138
139 return error;
140}
141
142/*
143 * The madvise(2) system call.
144 *
145 * Applications can use madvise() to advise the kernel how it should
146 * handle paging I/O in this VM area. The idea is to help the kernel
147 * use appropriate read-ahead and caching techniques. The information
148 * provided is advisory only, and can be safely disregarded by the
149 * kernel without affecting the correct operation of the application.
150 *
151 * behavior values:
152 * MADV_NORMAL - the default behavior is to read clusters. This
153 * results in some read-ahead and read-behind.
154 * MADV_RANDOM - the system should read the minimum amount of data
155 * on any access, since it is unlikely that the appli-
156 * cation will need more than what it asks for.
157 * MADV_SEQUENTIAL - pages in the given range will probably be accessed
158 * once, so they can be aggressively read ahead, and
159 * can be freed soon after they are accessed.
160 * MADV_WILLNEED - the application is notifying the system to read
161 * some pages ahead.
162 * MADV_DONTNEED - the application is finished with the given range,
163 * so the kernel can free resources associated with it.
164 *
165 * return values:
166 * zero - success
167 * -EINVAL - start + len < 0, start is not page-aligned,
168 * "behavior" is not a valid value, or application
169 * is attempting to release locked or shared pages.
170 * -ENOMEM - addresses in the specified range are not currently
171 * mapped, or are outside the AS of the process.
172 * -EIO - an I/O error occurred while paging in data.
173 * -EBADF - map exists, but area maps something that isn't a file.
174 * -EAGAIN - a kernel resource was temporarily unavailable.
175 */
176asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
177{
178 unsigned long end;
179 struct vm_area_struct * vma;
180 int unmapped_error = 0;
181 int error = -EINVAL;
182 size_t len;
183
/*
 * mmap_sem is taken for writing before any argument is checked;
 * every exit below goes through "out", which releases it.
 */
184 down_write(&current->mm->mmap_sem);
185
186 if (start & ~PAGE_MASK)
187 goto out;
/* round the length up to a whole number of pages */
188 len = (len_in + ~PAGE_MASK) & PAGE_MASK;
189
190 /* Check to see whether len was rounded up from small -ve to zero */
191 if (len_in && !len)
192 goto out;
193
194 end = start + len;
/* start + len wrapped around */
195 if (end < start)
196 goto out;
197
/* a zero-length request succeeds without touching any vma */
198 error = 0;
199 if (end == start)
200 goto out;
201
202 /*
203 * If the interval [start,end) covers some unmapped address
204 * ranges, just ignore them, but return -ENOMEM at the end.
205 */
206 vma = find_vma(current->mm, start);
207 for (;;) {
208 /* Still start < end. */
209 error = -ENOMEM;
210 if (!vma)
211 goto out;
212
213 /* Here start < vma->vm_end. */
214 if (start < vma->vm_start) {
/* hole before this vma: remember it, continue at the vma start */
215 unmapped_error = -ENOMEM;
216 start = vma->vm_start;
217 }
218
219 /* Here vma->vm_start <= start < vma->vm_end. */
220 if (end <= vma->vm_end) {
/* last vma of the range; start >= end means only a hole was left */
221 if (start < end) {
222 error = madvise_vma(vma, start, end,
223 behavior);
224 if (error)
225 goto out;
226 }
227 error = unmapped_error;
228 goto out;
229 }
230
231 /* Here vma->vm_start <= start < vma->vm_end < end. */
232 error = madvise_vma(vma, start, vma->vm_end, behavior);
233 if (error)
234 goto out;
235 start = vma->vm_end;
236 vma = vma->vm_next;
237 }
238
239out:
240 up_write(&current->mm->mmap_sem);
241 return error;
242}
diff --git a/mm/memory.c b/mm/memory.c
new file mode 100644
index 00000000000..fb6e5deb873
--- /dev/null
+++ b/mm/memory.c
@@ -0,0 +1,2165 @@
1/*
2 * linux/mm/memory.c
3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 */
6
7/*
8 * demand-loading started 01.12.91 - seems it is high on the list of
9 * things wanted, and it should be easy to implement. - Linus
10 */
11
12/*
13 * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
14 * pages started 02.12.91, seems to work. - Linus.
15 *
16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
17 * would have taken more than the 6M I have free, but it worked well as
18 * far as I could see.
19 *
20 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
21 */
22
23/*
24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
25 * thought has to go into this. Oh, well..
26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
27 * Found it. Everything seems to work now.
28 * 20.12.91 - Ok, making the swap-device changeable like the root.
29 */
30
31/*
32 * 05.04.94 - Multi-page memory management added for v1.1.
33 * Idea by Alex Bligh (alex@cconcepts.co.uk)
34 *
35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
36 * (Gerhard.Wichert@pdb.siemens.de)
37 *
38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
39 */
40
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/rmap.h>
49#include <linux/module.h>
50#include <linux/init.h>
51
52#include <asm/pgalloc.h>
53#include <asm/uaccess.h>
54#include <asm/tlb.h>
55#include <asm/tlbflush.h>
56#include <asm/pgtable.h>
57
58#include <linux/swapops.h>
59#include <linux/elf.h>
60
61#ifndef CONFIG_DISCONTIGMEM
62/* use the per-pgdat data instead for discontigmem - mbligh */
63unsigned long max_mapnr;
64struct page *mem_map;
65
66EXPORT_SYMBOL(max_mapnr);
67EXPORT_SYMBOL(mem_map);
68#endif
69
70unsigned long num_physpages;
71/*
72 * A number of key systems in x86 including ioremap() rely on the assumption
73 * that high_memory defines the upper bound on direct map memory, then end
74 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
75 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
76 * and ZONE_HIGHMEM.
77 */
78void * high_memory;
79unsigned long vmalloc_earlyreserve;
80
81EXPORT_SYMBOL(num_physpages);
82EXPORT_SYMBOL(high_memory);
83EXPORT_SYMBOL(vmalloc_earlyreserve);
84
85/*
86 * If a p?d_bad entry is found while walking page tables, report
87 * the error, before resetting entry to p?d_none. Usually (but
88 * very seldom) called out from the p?d_none_or_clear_bad macros.
89 */
90
/* Report a bad pgd entry through pgd_ERROR(), then clear it. */
91void pgd_clear_bad(pgd_t *pgd)
92{
93 pgd_ERROR(*pgd);
94 pgd_clear(pgd);
95}
96
/* Report a bad pud entry through pud_ERROR(), then clear it. */
97void pud_clear_bad(pud_t *pud)
98{
99 pud_ERROR(*pud);
100 pud_clear(pud);
101}
102
/* Report a bad pmd entry through pmd_ERROR(), then clear it. */
103void pmd_clear_bad(pmd_t *pmd)
104{
105 pmd_ERROR(*pmd);
106 pmd_clear(pmd);
107}
108
109/*
110 * Note: this doesn't free the actual pages themselves. That
111 * has been handled earlier when unmapping all the memory regions.
112 */
113static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
114 unsigned long addr, unsigned long end)
115{
116 if (!((addr | end) & ~PMD_MASK)) {
117 /* Only free fully aligned ranges */
118 struct page *page = pmd_page(*pmd);
119 pmd_clear(pmd);
120 dec_page_state(nr_page_table_pages);
121 tlb->mm->nr_ptes--;
122 pte_free_tlb(tlb, page);
123 }
124}
125
126static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud,
127 unsigned long addr, unsigned long end)
128{
129 pmd_t *pmd;
130 unsigned long next;
131 pmd_t *empty_pmd = NULL;
132
133 pmd = pmd_offset(pud, addr);
134
135 /* Only free fully aligned ranges */
136 if (!((addr | end) & ~PUD_MASK))
137 empty_pmd = pmd;
138 do {
139 next = pmd_addr_end(addr, end);
140 if (pmd_none_or_clear_bad(pmd))
141 continue;
142 clear_pte_range(tlb, pmd, addr, next);
143 } while (pmd++, addr = next, addr != end);
144
145 if (empty_pmd) {
146 pud_clear(pud);
147 pmd_free_tlb(tlb, empty_pmd);
148 }
149}
150
151static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
152 unsigned long addr, unsigned long end)
153{
154 pud_t *pud;
155 unsigned long next;
156 pud_t *empty_pud = NULL;
157
158 pud = pud_offset(pgd, addr);
159
160 /* Only free fully aligned ranges */
161 if (!((addr | end) & ~PGDIR_MASK))
162 empty_pud = pud;
163 do {
164 next = pud_addr_end(addr, end);
165 if (pud_none_or_clear_bad(pud))
166 continue;
167 clear_pmd_range(tlb, pud, addr, next);
168 } while (pud++, addr = next, addr != end);
169
170 if (empty_pud) {
171 pgd_clear(pgd);
172 pud_free_tlb(tlb, empty_pud);
173 }
174}
175
176/*
177 * This function clears user-level page tables of a process.
178 * Unlike other pagetable walks, some memory layouts might give end 0.
179 * Must be called with pagetable lock held.
180 */
181void clear_page_range(struct mmu_gather *tlb,
182 unsigned long addr, unsigned long end)
183{
184 pgd_t *pgd;
185 unsigned long next;
186
187 pgd = pgd_offset(tlb->mm, addr);
188 do {
189 next = pgd_addr_end(addr, end);
190 if (pgd_none_or_clear_bad(pgd))
191 continue;
192 clear_pud_range(tlb, pgd, addr, next);
193 } while (pgd++, addr = next, addr != end);
194}
195
196pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
197{
198 if (!pmd_present(*pmd)) {
199 struct page *new;
200
201 spin_unlock(&mm->page_table_lock);
202 new = pte_alloc_one(mm, address);
203 spin_lock(&mm->page_table_lock);
204 if (!new)
205 return NULL;
206 /*
207 * Because we dropped the lock, we should re-check the
208 * entry, as somebody else could have populated it..
209 */
210 if (pmd_present(*pmd)) {
211 pte_free(new);
212 goto out;
213 }
214 mm->nr_ptes++;
215 inc_page_state(nr_page_table_pages);
216 pmd_populate(mm, pmd, new);
217 }
218out:
219 return pte_offset_map(pmd, address);
220}
221
222pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
223{
224 if (!pmd_present(*pmd)) {
225 pte_t *new;
226
227 spin_unlock(&mm->page_table_lock);
228 new = pte_alloc_one_kernel(mm, address);
229 spin_lock(&mm->page_table_lock);
230 if (!new)
231 return NULL;
232
233 /*
234 * Because we dropped the lock, we should re-check the
235 * entry, as somebody else could have populated it..
236 */
237 if (pmd_present(*pmd)) {
238 pte_free_kernel(new);
239 goto out;
240 }
241 pmd_populate_kernel(mm, pmd, new);
242 }
243out:
244 return pte_offset_kernel(pmd, address);
245}
246
247/*
248 * copy one vm_area from one task to the other. Assumes the page tables
249 * already present in the new task to be cleared in the whole range
250 * covered by this vma.
251 *
252 * dst->page_table_lock is held on entry and exit,
253 * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
254 */
255
/*
 * Duplicate a single pte from src_mm into dst_mm at @addr.
 *
 * Three cases are handled in turn:
 *  - pte not present: swap entries get an extra swap reference (and
 *    dst_mm is put on the mmlist if it is not on it yet); the pte is
 *    then copied verbatim;
 *  - pte present but the pfn is invalid or the page is reserved: the
 *    pte is copied verbatim, nothing is accounted;
 *  - ordinary page: the pte is write-protected in the source for COW
 *    mappings, marked clean for shared mappings, marked old, and the
 *    page gains a reference, rss accounting and an rmap duplicate.
 */
256static inline void
257copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
258 pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
259 unsigned long addr)
260{
261 pte_t pte = *src_pte;
262 struct page *page;
263 unsigned long pfn;
264
265 /* pte contains position in swap or file, so copy. */
266 if (unlikely(!pte_present(pte))) {
267 if (!pte_file(pte)) {
268 swap_duplicate(pte_to_swp_entry(pte));
269 /* make sure dst_mm is on swapoff's mmlist. */
270 if (unlikely(list_empty(&dst_mm->mmlist))) {
271 spin_lock(&mmlist_lock);
272 list_add(&dst_mm->mmlist, &src_mm->mmlist);
273 spin_unlock(&mmlist_lock);
274 }
275 }
276 set_pte_at(dst_mm, addr, dst_pte, pte);
277 return;
278 }
279
280 pfn = pte_pfn(pte);
281 /* the pte points outside of valid memory, the
282 * mapping is assumed to be good, meaningful
283 * and not mapped via rmap - duplicate the
284 * mapping as is.
285 */
286 page = NULL;
287 if (pfn_valid(pfn))
288 page = pfn_to_page(pfn);
289
290 if (!page || PageReserved(page)) {
291 set_pte_at(dst_mm, addr, dst_pte, pte);
292 return;
293 }
294
295 /*
296 * If it's a COW mapping, write protect it both
297 * in the parent and the child
298 */
299 if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) {
300 ptep_set_wrprotect(src_mm, addr, src_pte);
/* re-read so the child's copy carries the write-protected value */
301 pte = *src_pte;
302 }
303
304 /*
305 * If it's a shared mapping, mark it clean in
306 * the child
307 */
308 if (vm_flags & VM_SHARED)
309 pte = pte_mkclean(pte);
310 pte = pte_mkold(pte);
311 get_page(page);
312 inc_mm_counter(dst_mm, rss);
313 if (PageAnon(page))
314 inc_mm_counter(dst_mm, anon_rss);
315 set_pte_at(dst_mm, addr, dst_pte, pte);
316 page_dup_rmap(page);
317}
318
/*
 * Copy the ptes covering [addr, end) from src_pmd to dst_pmd.
 *
 * Work is done in batches: a "progress" counter grows by 1 for every
 * empty pte and by 8 for every copied one, and once it reaches 32 the
 * loop stops early if a reschedule or a lock break is wanted.  The
 * mappings and the source lock are then released, the destination
 * lock gets a chance to be rescheduled, and the copy resumes at the
 * current address via "again".
 *
 * Returns 0 on success, -ENOMEM if the destination pte table cannot
 * be allocated.
 */
319static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
320 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
321 unsigned long addr, unsigned long end)
322{
323 pte_t *src_pte, *dst_pte;
324 unsigned long vm_flags = vma->vm_flags;
325 int progress;
326
327again:
328 dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
329 if (!dst_pte)
330 return -ENOMEM;
331 src_pte = pte_offset_map_nested(src_pmd, addr);
332
333 progress = 0;
334 spin_lock(&src_mm->page_table_lock);
335 do {
336 /*
337 * We are holding two locks at this point - either of them
338 * could generate latencies in another task on another CPU.
339 */
340 if (progress >= 32 && (need_resched() ||
341 need_lockbreak(&src_mm->page_table_lock) ||
342 need_lockbreak(&dst_mm->page_table_lock)))
343 break;
344 if (pte_none(*src_pte)) {
345 progress++;
346 continue;
347 }
348 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr);
349 progress += 8;
350 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
351 spin_unlock(&src_mm->page_table_lock);
352
/* unmap using "pointer - 1" */
353 pte_unmap_nested(src_pte - 1);
354 pte_unmap(dst_pte - 1);
355 cond_resched_lock(&dst_mm->page_table_lock);
/* the loop stopped early: pick up again where it left off */
356 if (addr != end)
357 goto again;
358 return 0;
359}
360
361static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
362 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
363 unsigned long addr, unsigned long end)
364{
365 pmd_t *src_pmd, *dst_pmd;
366 unsigned long next;
367
368 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
369 if (!dst_pmd)
370 return -ENOMEM;
371 src_pmd = pmd_offset(src_pud, addr);
372 do {
373 next = pmd_addr_end(addr, end);
374 if (pmd_none_or_clear_bad(src_pmd))
375 continue;
376 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
377 vma, addr, next))
378 return -ENOMEM;
379 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
380 return 0;
381}
382
383static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
384 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
385 unsigned long addr, unsigned long end)
386{
387 pud_t *src_pud, *dst_pud;
388 unsigned long next;
389
390 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
391 if (!dst_pud)
392 return -ENOMEM;
393 src_pud = pud_offset(src_pgd, addr);
394 do {
395 next = pud_addr_end(addr, end);
396 if (pud_none_or_clear_bad(src_pud))
397 continue;
398 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
399 vma, addr, next))
400 return -ENOMEM;
401 } while (dst_pud++, src_pud++, addr = next, addr != end);
402 return 0;
403}
404
405int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
406 struct vm_area_struct *vma)
407{
408 pgd_t *src_pgd, *dst_pgd;
409 unsigned long next;
410 unsigned long addr = vma->vm_start;
411 unsigned long end = vma->vm_end;
412
413 if (is_vm_hugetlb_page(vma))
414 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
415
416 dst_pgd = pgd_offset(dst_mm, addr);
417 src_pgd = pgd_offset(src_mm, addr);
418 do {
419 next = pgd_addr_end(addr, end);
420 if (pgd_none_or_clear_bad(src_pgd))
421 continue;
422 if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
423 vma, addr, next))
424 return -ENOMEM;
425 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
426 return 0;
427}
428
/*
 * Tear down the ptes covering [addr, end) under one pmd.
 *
 * Present ptes are cleared and queued for TLB invalidation; if they map a
 * usable struct page (valid pfn, not reserved) the page's dirty/accessed
 * state is propagated, rmap and counters are updated and the page is handed
 * to the mmu_gather.  Non-present (swap/file) ptes are freed and cleared
 * only when no zap_details filter is in effect.  @details, when non-NULL,
 * restricts which present pages are zapped (see the checks below).
 */
429static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
430 unsigned long addr, unsigned long end,
431 struct zap_details *details)
432{
433 pte_t *pte;
434
435 pte = pte_offset_map(pmd, addr);
436 do {
437 pte_t ptent = *pte;
438 if (pte_none(ptent))
439 continue;
440 if (pte_present(ptent)) {
/* page stays NULL for an invalid pfn or a reserved page. */
441 struct page *page = NULL;
442 unsigned long pfn = pte_pfn(ptent);
443 if (pfn_valid(pfn)) {
444 page = pfn_to_page(pfn);
445 if (PageReserved(page))
446 page = NULL;
447 }
448 if (unlikely(details) && page) {
449 /*
450 * unmap_shared_mapping_pages() wants to
451 * invalidate cache without truncating:
452 * unmap shared but keep private pages.
453 */
454 if (details->check_mapping &&
455 details->check_mapping != page->mapping)
456 continue;
457 /*
458 * Each page->index must be checked when
459 * invalidating or truncating nonlinear.
460 */
461 if (details->nonlinear_vma &&
462 (page->index < details->first_index ||
463 page->index > details->last_index))
464 continue;
465 }
/* Clear the pte, keeping its old value for the dirty/young tests below. */
466 ptent = ptep_get_and_clear(tlb->mm, addr, pte);
467 tlb_remove_tlb_entry(tlb, pte, addr);
/* No struct page to account for: the cleared pte is all there was. */
468 if (unlikely(!page))
469 continue;
/*
 * Nonlinear vma and the page sits at a non-linear position: leave a
 * file pte recording page->index in place of the cleared entry.
 */
470 if (unlikely(details) && details->nonlinear_vma
471 && linear_page_index(details->nonlinear_vma,
472 addr) != page->index)
473 set_pte_at(tlb->mm, addr, pte,
474 pgoff_to_pte(page->index));
475 if (pte_dirty(ptent))
476 set_page_dirty(page);
477 if (PageAnon(page))
478 dec_mm_counter(tlb->mm, anon_rss);
479 else if (pte_young(ptent))
480 mark_page_accessed(page);
481 tlb->freed++;
482 page_remove_rmap(page);
483 tlb_remove_page(tlb, page);
484 continue;
485 }
486 /*
487 * If details->check_mapping, we leave swap entries;
488 * if details->nonlinear_vma, we leave file entries.
489 */
490 if (unlikely(details))
491 continue;
492 if (!pte_file(ptent))
493 free_swap_and_cache(pte_to_swp_entry(ptent));
494 pte_clear(tlb->mm, addr, pte);
495 } while (pte++, addr += PAGE_SIZE, addr != end);
/* pte was advanced past the last entry by the loop condition. */
496 pte_unmap(pte - 1);
497}
498
499static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
500 unsigned long addr, unsigned long end,
501 struct zap_details *details)
502{
503 pmd_t *pmd;
504 unsigned long next;
505
506 pmd = pmd_offset(pud, addr);
507 do {
508 next = pmd_addr_end(addr, end);
509 if (pmd_none_or_clear_bad(pmd))
510 continue;
511 zap_pte_range(tlb, pmd, addr, next, details);
512 } while (pmd++, addr = next, addr != end);
513}
514
515static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
516 unsigned long addr, unsigned long end,
517 struct zap_details *details)
518{
519 pud_t *pud;
520 unsigned long next;
521
522 pud = pud_offset(pgd, addr);
523 do {
524 next = pud_addr_end(addr, end);
525 if (pud_none_or_clear_bad(pud))
526 continue;
527 zap_pmd_range(tlb, pud, addr, next, details);
528 } while (pud++, addr = next, addr != end);
529}
530
531static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
532 unsigned long addr, unsigned long end,
533 struct zap_details *details)
534{
535 pgd_t *pgd;
536 unsigned long next;
537
538 if (details && !details->check_mapping && !details->nonlinear_vma)
539 details = NULL;
540
541 BUG_ON(addr >= end);
542 tlb_start_vma(tlb, vma);
543 pgd = pgd_offset(vma->vm_mm, addr);
544 do {
545 next = pgd_addr_end(addr, end);
546 if (pgd_none_or_clear_bad(pgd))
547 continue;
548 zap_pud_range(tlb, pgd, addr, next, details);
549 } while (pgd++, addr = next, addr != end);
550 tlb_end_vma(tlb, vma);
551}
552
/*
 * ZAP_BLOCK_SIZE is the byte budget unmap_vmas() zaps before it flushes the
 * gathered TLB state and checks whether it should reschedule or break a
 * lock.  With CONFIG_PREEMPT the budget is 8 pages; otherwise 1024 pages.
 */
553#ifdef CONFIG_PREEMPT
554# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
555#else
556/* No preempt: go for improved straight-line efficiency */
557# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
558#endif
559
560/**
561 * unmap_vmas - unmap a range of memory covered by a list of vma's
562 * @tlbp: address of the caller's struct mmu_gather
563 * @mm: the controlling mm_struct
564 * @vma: the starting vma
565 * @start_addr: virtual address at which to start unmapping
566 * @end_addr: virtual address at which to end unmapping
567 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
568 * @details: details of nonlinear truncation or shared cache invalidation
569 *
570 * Returns the number of vma's which were covered by the unmapping.
571 *
572 * Unmap all pages in the vma list. Called under page_table_lock.
573 *
574 * We aim to not hold page_table_lock for too long (for scheduling latency
575 * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
576 * return the ending mmu_gather to the caller.
577 *
578 * Only addresses between @start_addr and @end_addr will be unmapped.
579 *
580 * The VMA list must be sorted in ascending virtual address order.
581 *
582 * unmap_vmas() assumes that the caller will flush the whole unmapped address
583 * range after unmap_vmas() returns. So the only responsibility here is to
584 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
585 * drops the lock and schedules.
586 */
587int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
588 struct vm_area_struct *vma, unsigned long start_addr,
589 unsigned long end_addr, unsigned long *nr_accounted,
590 struct zap_details *details)
591{
/* zap_bytes: bytes still allowed before the next flush/latency check. */
592 unsigned long zap_bytes = ZAP_BLOCK_SIZE;
593 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
594 int tlb_start_valid = 0;
595 int ret = 0;
/* i_mmap_lock is only non-NULL when details was supplied and carries one. */
596 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
597 int fullmm = tlb_is_full_mm(*tlbp);
598
599 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
600 unsigned long start;
601 unsigned long end;
602
/* Clip [start_addr, end_addr) to this vma; skip it if nothing overlaps. */
603 start = max(vma->vm_start, start_addr);
604 if (start >= vma->vm_end)
605 continue;
606 end = min(vma->vm_end, end_addr);
607 if (end <= vma->vm_start)
608 continue;
609
610 if (vma->vm_flags & VM_ACCOUNT)
611 *nr_accounted += (end - start) >> PAGE_SHIFT;
612
613 ret++;
614 while (start != end) {
615 unsigned long block;
616
/* Remember where the current gather batch began, for tlb_finish_mmu(). */
617 if (!tlb_start_valid) {
618 tlb_start = start;
619 tlb_start_valid = 1;
620 }
621
/* Hugetlb ranges are unmapped in one go; others in budget-sized blocks. */
622 if (is_vm_hugetlb_page(vma)) {
623 block = end - start;
624 unmap_hugepage_range(vma, start, end);
625 } else {
626 block = min(zap_bytes, end - start);
627 unmap_page_range(*tlbp, vma, start,
628 start + block, details);
629 }
630
631 start += block;
/*
 * A hugetlb block may exceed the remaining budget and wrap zap_bytes;
 * the signed comparison below treats that as "budget exhausted".
 */
632 zap_bytes -= block;
633 if ((long)zap_bytes > 0)
634 continue;
635
/* Budget used up: flush what has been gathered so far. */
636 tlb_finish_mmu(*tlbp, tlb_start, start);
637
638 if (need_resched() ||
639 need_lockbreak(&mm->page_table_lock) ||
640 (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
/*
 * With an i_mmap_lock in play we do not reschedule here: hand the
 * caller a fresh gather, record how far we got in break_addr, and
 * return early.
 */
641 if (i_mmap_lock) {
642 /* must reset count of rss freed */
643 *tlbp = tlb_gather_mmu(mm, fullmm);
644 details->break_addr = start;
645 goto out;
646 }
647 spin_unlock(&mm->page_table_lock);
648 cond_resched();
649 spin_lock(&mm->page_table_lock);
650 }
651
/* Start a new gather batch with a full budget. */
652 *tlbp = tlb_gather_mmu(mm, fullmm);
653 tlb_start_valid = 0;
654 zap_bytes = ZAP_BLOCK_SIZE;
655 }
656 }
657out:
658 return ret;
659}
660
661/**
662 * zap_page_range - remove user pages in a given range
663 * @vma: vm_area_struct holding the applicable pages
664 * @address: starting address of pages to zap
665 * @size: number of bytes to zap
666 * @details: details of nonlinear truncation or shared cache invalidation
667 */
668void zap_page_range(struct vm_area_struct *vma, unsigned long address,
669 unsigned long size, struct zap_details *details)
670{
671 struct mm_struct *mm = vma->vm_mm;
672 struct mmu_gather *tlb;
673 unsigned long end = address + size;
674 unsigned long nr_accounted = 0;
675
676 if (is_vm_hugetlb_page(vma)) {
677 zap_hugepage_range(vma, address, size);
678 return;
679 }
680
681 lru_add_drain();
682 spin_lock(&mm->page_table_lock);
683 tlb = tlb_gather_mmu(mm, 0);
684 unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
685 tlb_finish_mmu(tlb, address, end);
686 spin_unlock(&mm->page_table_lock);
687}
688
689/*
690 * Do a quick page-table lookup for a single page.
691 * mm->page_table_lock must be held.
692 */
693static struct page *
694__follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
695{
696 pgd_t *pgd;
697 pud_t *pud;
698 pmd_t *pmd;
699 pte_t *ptep, pte;
700 unsigned long pfn;
701 struct page *page;
702
703 page = follow_huge_addr(mm, address, write);
704 if (! IS_ERR(page))
705 return page;
706
707 pgd = pgd_offset(mm, address);
708 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
709 goto out;
710
711 pud = pud_offset(pgd, address);
712 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
713 goto out;
714
715 pmd = pmd_offset(pud, address);
716 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
717 goto out;
718 if (pmd_huge(*pmd))
719 return follow_huge_pmd(mm, address, pmd, write);
720
721 ptep = pte_offset_map(pmd, address);
722 if (!ptep)
723 goto out;
724
725 pte = *ptep;
726 pte_unmap(ptep);
727 if (pte_present(pte)) {
728 if (write && !pte_write(pte))
729 goto out;
730 if (read && !pte_read(pte))
731 goto out;
732 pfn = pte_pfn(pte);
733 if (pfn_valid(pfn)) {
734 page = pfn_to_page(pfn);
735 if (write && !pte_dirty(pte) && !PageDirty(page))
736 set_page_dirty(page);
737 mark_page_accessed(page);
738 return page;
739 }
740 }
741
742out:
743 return NULL;
744}
745
/*
 * Look up the page mapped at @address without demanding read permission;
 * @write is passed through to __follow_page() unchanged.
 */
struct page *
follow_page(struct mm_struct *mm, unsigned long address, int write)
{
	struct page *found;

	found = __follow_page(mm, address, /*read*/0, write);
	return found;
}
751
/*
 * Returns 1 if __follow_page() finds a page at @address when asked for
 * read (and not write) access, 0 otherwise.
 */
int
check_user_page_readable(struct mm_struct *mm, unsigned long address)
{
	struct page *found = __follow_page(mm, address, /*read*/1, /*write*/0);

	return found ? 1 : 0;
}

EXPORT_SYMBOL(check_user_page_readable);
759
760/*
761 * Given a physical address, is there a useful struct page pointing to
762 * it? This may become more complex in the future if we start dealing
763 * with IO-aperture pages for direct-IO.
764 */
765
766static inline struct page *get_page_map(struct page *page)
767{
768 if (!pfn_valid(page_to_pfn(page)))
769 return NULL;
770 return page;
771}
772
773
774static inline int
775untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
776 unsigned long address)
777{
778 pgd_t *pgd;
779 pud_t *pud;
780 pmd_t *pmd;
781
782 /* Check if the vma is for an anonymous mapping. */
783 if (vma->vm_ops && vma->vm_ops->nopage)
784 return 0;
785
786 /* Check if page directory entry exists. */
787 pgd = pgd_offset(mm, address);
788 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
789 return 1;
790
791 pud = pud_offset(pgd, address);
792 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
793 return 1;
794
795 /* Check if page middle directory entry exists. */
796 pmd = pmd_offset(pud, address);
797 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
798 return 1;
799
800 /* There is a pte slot for 'address' in 'mm'. */
801 return 0;
802}
803
804
805int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
806 unsigned long start, int len, int write, int force,
807 struct page **pages, struct vm_area_struct **vmas)
808{
809 int i;
810 unsigned int flags;
811
812 /*
813 * Require read or write permissions.
814 * If 'force' is set, we only require the "MAY" flags.
815 */
816 flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
817 flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
818 i = 0;
819
820 do {
821 struct vm_area_struct * vma;
822
823 vma = find_extend_vma(mm, start);
824 if (!vma && in_gate_area(tsk, start)) {
825 unsigned long pg = start & PAGE_MASK;
826 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
827 pgd_t *pgd;
828 pud_t *pud;
829 pmd_t *pmd;
830 pte_t *pte;
831 if (write) /* user gate pages are read-only */
832 return i ? : -EFAULT;
833 if (pg > TASK_SIZE)
834 pgd = pgd_offset_k(pg);
835 else
836 pgd = pgd_offset_gate(mm, pg);
837 BUG_ON(pgd_none(*pgd));
838 pud = pud_offset(pgd, pg);
839 BUG_ON(pud_none(*pud));
840 pmd = pmd_offset(pud, pg);
841 BUG_ON(pmd_none(*pmd));
842 pte = pte_offset_map(pmd, pg);
843 BUG_ON(pte_none(*pte));
844 if (pages) {
845 pages[i] = pte_page(*pte);
846 get_page(pages[i]);
847 }
848 pte_unmap(pte);
849 if (vmas)
850 vmas[i] = gate_vma;
851 i++;
852 start += PAGE_SIZE;
853 len--;
854 continue;
855 }
856
857 if (!vma || (vma->vm_flags & VM_IO)
858 || !(flags & vma->vm_flags))
859 return i ? : -EFAULT;
860
861 if (is_vm_hugetlb_page(vma)) {
862 i = follow_hugetlb_page(mm, vma, pages, vmas,
863 &start, &len, i);
864 continue;
865 }
866 spin_lock(&mm->page_table_lock);
867 do {
868 struct page *map;
869 int lookup_write = write;
870
871 cond_resched_lock(&mm->page_table_lock);
872 while (!(map = follow_page(mm, start, lookup_write))) {
873 /*
874 * Shortcut for anonymous pages. We don't want
875 * to force the creation of pages tables for
876 * insanly big anonymously mapped areas that
877 * nobody touched so far. This is important
878 * for doing a core dump for these mappings.
879 */
880 if (!lookup_write &&
881 untouched_anonymous_page(mm,vma,start)) {
882 map = ZERO_PAGE(start);
883 break;
884 }
885 spin_unlock(&mm->page_table_lock);
886 switch (handle_mm_fault(mm,vma,start,write)) {
887 case VM_FAULT_MINOR:
888 tsk->min_flt++;
889 break;
890 case VM_FAULT_MAJOR:
891 tsk->maj_flt++;
892 break;
893 case VM_FAULT_SIGBUS:
894 return i ? i : -EFAULT;
895 case VM_FAULT_OOM:
896 return i ? i : -ENOMEM;
897 default:
898 BUG();
899 }
900 /*
901 * Now that we have performed a write fault
902 * and surely no longer have a shared page we
903 * shouldn't write, we shouldn't ignore an
904 * unwritable page in the page table if
905 * we are forcing write access.
906 */
907 lookup_write = write && !force;
908 spin_lock(&mm->page_table_lock);
909 }
910 if (pages) {
911 pages[i] = get_page_map(map);
912 if (!pages[i]) {
913 spin_unlock(&mm->page_table_lock);
914 while (i--)
915 page_cache_release(pages[i]);
916 i = -EFAULT;
917 goto out;
918 }
919 flush_dcache_page(pages[i]);
920 if (!PageReserved(pages[i]))
921 page_cache_get(pages[i]);
922 }
923 if (vmas)
924 vmas[i] = vma;
925 i++;
926 start += PAGE_SIZE;
927 len--;
928 } while(len && start < vma->vm_end);
929 spin_unlock(&mm->page_table_lock);
930 } while(len);
931out:
932 return i;
933}
934
935EXPORT_SYMBOL(get_user_pages);
936
937static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
938 unsigned long addr, unsigned long end, pgprot_t prot)
939{
940 pte_t *pte;
941
942 pte = pte_alloc_map(mm, pmd, addr);
943 if (!pte)
944 return -ENOMEM;
945 do {
946 pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
947 BUG_ON(!pte_none(*pte));
948 set_pte_at(mm, addr, pte, zero_pte);
949 } while (pte++, addr += PAGE_SIZE, addr != end);
950 pte_unmap(pte - 1);
951 return 0;
952}
953
954static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
955 unsigned long addr, unsigned long end, pgprot_t prot)
956{
957 pmd_t *pmd;
958 unsigned long next;
959
960 pmd = pmd_alloc(mm, pud, addr);
961 if (!pmd)
962 return -ENOMEM;
963 do {
964 next = pmd_addr_end(addr, end);
965 if (zeromap_pte_range(mm, pmd, addr, next, prot))
966 return -ENOMEM;
967 } while (pmd++, addr = next, addr != end);
968 return 0;
969}
970
971static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
972 unsigned long addr, unsigned long end, pgprot_t prot)
973{
974 pud_t *pud;
975 unsigned long next;
976
977 pud = pud_alloc(mm, pgd, addr);
978 if (!pud)
979 return -ENOMEM;
980 do {
981 next = pud_addr_end(addr, end);
982 if (zeromap_pmd_range(mm, pud, addr, next, prot))
983 return -ENOMEM;
984 } while (pud++, addr = next, addr != end);
985 return 0;
986}
987
988int zeromap_page_range(struct vm_area_struct *vma,
989 unsigned long addr, unsigned long size, pgprot_t prot)
990{
991 pgd_t *pgd;
992 unsigned long next;
993 unsigned long end = addr + size;
994 struct mm_struct *mm = vma->vm_mm;
995 int err;
996
997 BUG_ON(addr >= end);
998 pgd = pgd_offset(mm, addr);
999 flush_cache_range(vma, addr, end);
1000 spin_lock(&mm->page_table_lock);
1001 do {
1002 next = pgd_addr_end(addr, end);
1003 err = zeromap_pud_range(mm, pgd, addr, next, prot);
1004 if (err)
1005 break;
1006 } while (pgd++, addr = next, addr != end);
1007 spin_unlock(&mm->page_table_lock);
1008 return err;
1009}
1010
1011/*
1012 * maps a range of physical memory into the requested pages. the old
1013 * mappings are removed. any references to nonexistent pages results
1014 * in null mappings (currently treated as "copy-on-access")
1015 */
1016static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1017 unsigned long addr, unsigned long end,
1018 unsigned long pfn, pgprot_t prot)
1019{
1020 pte_t *pte;
1021
1022 pte = pte_alloc_map(mm, pmd, addr);
1023 if (!pte)
1024 return -ENOMEM;
1025 do {
1026 BUG_ON(!pte_none(*pte));
1027 if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
1028 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1029 pfn++;
1030 } while (pte++, addr += PAGE_SIZE, addr != end);
1031 pte_unmap(pte - 1);
1032 return 0;
1033}
1034
1035static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1036 unsigned long addr, unsigned long end,
1037 unsigned long pfn, pgprot_t prot)
1038{
1039 pmd_t *pmd;
1040 unsigned long next;
1041
1042 pfn -= addr >> PAGE_SHIFT;
1043 pmd = pmd_alloc(mm, pud, addr);
1044 if (!pmd)
1045 return -ENOMEM;
1046 do {
1047 next = pmd_addr_end(addr, end);
1048 if (remap_pte_range(mm, pmd, addr, next,
1049 pfn + (addr >> PAGE_SHIFT), prot))
1050 return -ENOMEM;
1051 } while (pmd++, addr = next, addr != end);
1052 return 0;
1053}
1054
1055static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1056 unsigned long addr, unsigned long end,
1057 unsigned long pfn, pgprot_t prot)
1058{
1059 pud_t *pud;
1060 unsigned long next;
1061
1062 pfn -= addr >> PAGE_SHIFT;
1063 pud = pud_alloc(mm, pgd, addr);
1064 if (!pud)
1065 return -ENOMEM;
1066 do {
1067 next = pud_addr_end(addr, end);
1068 if (remap_pmd_range(mm, pud, addr, next,
1069 pfn + (addr >> PAGE_SHIFT), prot))
1070 return -ENOMEM;
1071 } while (pud++, addr = next, addr != end);
1072 return 0;
1073}
1074
1075/* Note: this is only safe if the mm semaphore is held when called. */
1076int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1077 unsigned long pfn, unsigned long size, pgprot_t prot)
1078{
1079 pgd_t *pgd;
1080 unsigned long next;
1081 unsigned long end = addr + size;
1082 struct mm_struct *mm = vma->vm_mm;
1083 int err;
1084
1085 /*
1086 * Physically remapped pages are special. Tell the
1087 * rest of the world about it:
1088 * VM_IO tells people not to look at these pages
1089 * (accesses can have side effects).
1090 * VM_RESERVED tells swapout not to try to touch
1091 * this region.
1092 */
1093 vma->vm_flags |= VM_IO | VM_RESERVED;
1094
1095 BUG_ON(addr >= end);
1096 pfn -= addr >> PAGE_SHIFT;
1097 pgd = pgd_offset(mm, addr);
1098 flush_cache_range(vma, addr, end);
1099 spin_lock(&mm->page_table_lock);
1100 do {
1101 next = pgd_addr_end(addr, end);
1102 err = remap_pud_range(mm, pgd, addr, next,
1103 pfn + (addr >> PAGE_SHIFT), prot);
1104 if (err)
1105 break;
1106 } while (pgd++, addr = next, addr != end);
1107 spin_unlock(&mm->page_table_lock);
1108 return err;
1109}
1110EXPORT_SYMBOL(remap_pfn_range);
1111
1112/*
1113 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
1114 * servicing faults for write access. In the normal case, do always want
1115 * pte_mkwrite. But get_user_pages can cause write faults for mappings
1116 * that do not have writing enabled, when used by access_process_vm.
1117 */
1118static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1119{
1120 if (likely(vma->vm_flags & VM_WRITE))
1121 pte = pte_mkwrite(pte);
1122 return pte;
1123}
1124
1125/*
1126 * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
1127 */
1128static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
1129 pte_t *page_table)
1130{
1131 pte_t entry;
1132
1133 entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
1134 vma);
1135 ptep_establish(vma, address, page_table, entry);
1136 update_mmu_cache(vma, address, entry);
1137 lazy_mmu_prot_update(entry);
1138}
1139
1140/*
1141 * This routine handles present pages, when users try to write
1142 * to a shared page. It is done by copying the page to a new address
1143 * and decrementing the shared-page counter for the old page.
1144 *
1145 * Goto-purists beware: the only reason for goto's here is that it results
1146 * in better assembly code.. The "default" path will see no jumps at all.
1147 *
1148 * Note that this routine assumes that the protection checks have been
1149 * done by the caller (the low-level page fault routine in most cases).
1150 * Thus we can safely just mark it writable once we've done any necessary
1151 * COW.
1152 *
1153 * We also mark the page dirty at this point even though the page will
1154 * change only once the write actually happens. This avoids a few races,
1155 * and potentially makes it more efficient.
1156 *
1157 * We hold the mm semaphore and the page_table_lock on entry and exit
1158 * with the page_table_lock released.
1159 */
1160static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1161 unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
1162{
1163 struct page *old_page, *new_page;
1164 unsigned long pfn = pte_pfn(pte);
1165 pte_t entry;
1166
1167 if (unlikely(!pfn_valid(pfn))) {
1168 /*
1169 * This should really halt the system so it can be debugged or
1170 * at least the kernel stops what it's doing before it corrupts
1171 * data, but for the moment just pretend this is OOM.
1172 */
1173 pte_unmap(page_table);
1174 printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
1175 address);
1176 spin_unlock(&mm->page_table_lock);
1177 return VM_FAULT_OOM;
1178 }
1179 old_page = pfn_to_page(pfn);
1180
/*
 * Fast path: if the page lock can be taken without waiting, ask
 * can_share_swap_page() whether the page may simply be reused.  If so,
 * make the existing pte dirty, young and (if the vma permits) writable
 * in place and finish without copying.
 */
1181 if (!TestSetPageLocked(old_page)) {
1182 int reuse = can_share_swap_page(old_page);
1183 unlock_page(old_page);
1184 if (reuse) {
1185 flush_cache_page(vma, address, pfn);
1186 entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
1187 vma);
1188 ptep_set_access_flags(vma, address, page_table, entry, 1);
1189 update_mmu_cache(vma, address, entry);
1190 lazy_mmu_prot_update(entry);
1191 pte_unmap(page_table);
1192 spin_unlock(&mm->page_table_lock);
1193 return VM_FAULT_MINOR;
1194 }
1195 }
1196 pte_unmap(page_table);
1197
1198 /*
1199 * Ok, we need to copy. Oh, well..
1200 */
/*
 * Pin the old page before page_table_lock is dropped for the
 * allocation below; reserved pages are not pinned here.
 */
1201 if (!PageReserved(old_page))
1202 page_cache_get(old_page);
1203 spin_unlock(&mm->page_table_lock);
1204
1205 if (unlikely(anon_vma_prepare(vma)))
1206 goto no_new_page;
/* Copying the zero page: a freshly zeroed page needs no copy step. */
1207 if (old_page == ZERO_PAGE(address)) {
1208 new_page = alloc_zeroed_user_highpage(vma, address);
1209 if (!new_page)
1210 goto no_new_page;
1211 } else {
1212 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1213 if (!new_page)
1214 goto no_new_page;
1215 copy_user_highpage(new_page, old_page, address);
1216 }
1217 /*
1218 * Re-check the pte - we dropped the lock
1219 */
1220 spin_lock(&mm->page_table_lock);
1221 page_table = pte_offset_map(pmd, address);
1222 if (likely(pte_same(*page_table, pte))) {
/*
 * The pte is unchanged: switch the mapping over to new_page.  A
 * reserved old page was not counted in rss, so count the new one now;
 * otherwise drop the old page's rmap.
 */
1223 if (PageAnon(old_page))
1224 dec_mm_counter(mm, anon_rss);
1225 if (PageReserved(old_page))
1226 inc_mm_counter(mm, rss);
1227 else
1228 page_remove_rmap(old_page);
1229 flush_cache_page(vma, address, pfn);
1230 break_cow(vma, new_page, address, page_table);
1231 lru_cache_add_active(new_page);
1232 page_add_anon_rmap(new_page, vma, address);
1233
1234 /* Free the old page.. */
1235 new_page = old_page;
1236 }
/*
 * If the pte was replaced, new_page now aliases old_page and old_page
 * is released twice below.  If the pte had changed under us, the unused
 * copy in new_page and the pinned old_page are each released once.
 */
1237 pte_unmap(page_table);
1238 page_cache_release(new_page);
1239 page_cache_release(old_page);
1240 spin_unlock(&mm->page_table_lock);
1241 return VM_FAULT_MINOR;
1242
/* Reached with page_table_lock already dropped: just undo the pin. */
1243no_new_page:
1244 page_cache_release(old_page);
1245 return VM_FAULT_OOM;
1246}
1247
1248/*
1249 * Helper functions for unmap_mapping_range().
1250 *
1251 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
1252 *
1253 * We have to restart searching the prio_tree whenever we drop the lock,
1254 * since the iterator is only valid while the lock is held, and anyway
1255 * a later vma might be split and reinserted earlier while lock dropped.
1256 *
1257 * The list of nonlinear vmas could be handled more efficiently, using
1258 * a placeholder, but handle it in the same way until a need is shown.
1259 * It is important to search the prio_tree before nonlinear list: a vma
1260 * may become nonlinear and be shifted from prio_tree to nonlinear list
1261 * while the lock is dropped; but never shifted from list to prio_tree.
1262 *
1263 * In order to make forward progress despite restarting the search,
1264 * vm_truncate_count is used to mark a vma as now dealt with, so we can
1265 * quickly skip it next time around. Since the prio_tree search only
1266 * shows us those vmas affected by unmapping the range in question, we
1267 * can't efficiently keep all vmas in step with mapping->truncate_count:
1268 * so instead reset them all whenever it wraps back to 0 (then go to 1).
1269 * mapping->truncate_count and vma->vm_truncate_count are protected by
1270 * i_mmap_lock.
1271 *
1272 * In order to make forward progress despite repeatedly restarting some
1273 * large vma, note the break_addr set by unmap_vmas when it breaks out:
1274 * and restart from that address when we reach that vma again. It might
1275 * have been split or merged, shrunk or extended, but never shifted: so
1276 * restart_addr remains valid so long as it remains in the vma's range.
1277 * unmap_mapping_range forces truncate_count to leap over page-aligned
1278 * values so we can save vma's restart_addr in its truncate_count field.
1279 */
/*
 * True when the bits selected by ~PAGE_MASK are all clear in
 * truncate_count, i.e. the field holds a saved restart address
 * rather than a generation count.
 */
1280#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
1281
1282static void reset_vma_truncate_counts(struct address_space *mapping)
1283{
1284 struct vm_area_struct *vma;
1285 struct prio_tree_iter iter;
1286
1287 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
1288 vma->vm_truncate_count = 0;
1289 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
1290 vma->vm_truncate_count = 0;
1291}
1292
/*
 * Zap [start_addr, end_addr) of one vma on behalf of unmap_mapping_range().
 *
 * Returns 0 when the vma has been fully dealt with and no lock break is
 * wanted.  Returns -EINTR after details->i_mmap_lock has been dropped and
 * retaken, in which case the caller restarts its search.  Progress is
 * recorded in vma->vm_truncate_count: either details->truncate_count
 * (vma finished) or the address at which to resume.
 */
1293static int unmap_mapping_range_vma(struct vm_area_struct *vma,
1294 unsigned long start_addr, unsigned long end_addr,
1295 struct zap_details *details)
1296{
1297 unsigned long restart_addr;
1298 int need_break;
1299
1300again:
/* Resume from a previously saved restart address if it lies further in. */
1301 restart_addr = vma->vm_truncate_count;
1302 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
1303 start_addr = restart_addr;
1304 if (start_addr >= end_addr) {
1305 /* Top of vma has been split off since last time */
1306 vma->vm_truncate_count = details->truncate_count;
1307 return 0;
1308 }
1309 }
1310
/*
 * Preset break_addr to "finished"; unmap_vmas() overwrites it with the
 * address it reached if it breaks out early.
 */
1311 details->break_addr = end_addr;
1312 zap_page_range(vma, start_addr, end_addr - start_addr, details);
1313
1314 /*
1315 * We cannot rely on the break test in unmap_vmas:
1316 * on the one hand, we don't want to restart our loop
1317 * just because that broke out for the page_table_lock;
1318 * on the other hand, it does no test when vma is small.
1319 */
1320 need_break = need_resched() ||
1321 need_lockbreak(details->i_mmap_lock);
1322
1323 if (details->break_addr >= end_addr) {
1324 /* We have now completed this vma: mark it so */
1325 vma->vm_truncate_count = details->truncate_count;
1326 if (!need_break)
1327 return 0;
1328 } else {
1329 /* Note restart_addr in vma's truncate_count field */
1330 vma->vm_truncate_count = details->break_addr;
1331 if (!need_break)
1332 goto again;
1333 }
1334
/* Give others a chance at i_mmap_lock; the caller's iterator is now stale. */
1335 spin_unlock(details->i_mmap_lock);
1336 cond_resched();
1337 spin_lock(details->i_mmap_lock);
1338 return -EINTR;
1339}
1340
1341static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
1342 struct zap_details *details)
1343{
1344 struct vm_area_struct *vma;
1345 struct prio_tree_iter iter;
1346 pgoff_t vba, vea, zba, zea;
1347
1348restart:
1349 vma_prio_tree_foreach(vma, &iter, root,
1350 details->first_index, details->last_index) {
1351 /* Skip quickly over those we have already dealt with */
1352 if (vma->vm_truncate_count == details->truncate_count)
1353 continue;
1354
1355 vba = vma->vm_pgoff;
1356 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
1357 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
1358 zba = details->first_index;
1359 if (zba < vba)
1360 zba = vba;
1361 zea = details->last_index;
1362 if (zea > vea)
1363 zea = vea;
1364
1365 if (unmap_mapping_range_vma(vma,
1366 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
1367 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
1368 details) < 0)
1369 goto restart;
1370 }
1371}
1372
1373static inline void unmap_mapping_range_list(struct list_head *head,
1374 struct zap_details *details)
1375{
1376 struct vm_area_struct *vma;
1377
1378 /*
1379 * In nonlinear VMAs there is no correspondence between virtual address
1380 * offset and file offset. So we must perform an exhaustive search
1381 * across *all* the pages in each nonlinear VMA, not just the pages
1382 * whose virtual address lies outside the file truncation point.
1383 */
1384restart:
1385 list_for_each_entry(vma, head, shared.vm_set.list) {
1386 /* Skip quickly over those we have already dealt with */
1387 if (vma->vm_truncate_count == details->truncate_count)
1388 continue;
1389 details->nonlinear_vma = vma;
1390 if (unmap_mapping_range_vma(vma, vma->vm_start,
1391 vma->vm_end, details) < 0)
1392 goto restart;
1393 }
1394}
1395
1396/**
1397 * unmap_mapping_range - unmap the portion of all mmaps
1398 * in the specified address_space corresponding to the specified
1399 * page range in the underlying file.
1400 * @address_space: the address space containing mmaps to be unmapped.
1401 * @holebegin: byte in first page to unmap, relative to the start of
1402 * the underlying file. This will be rounded down to a PAGE_SIZE
1403 * boundary. Note that this is different from vmtruncate(), which
1404 * must keep the partial page. In contrast, we must get rid of
1405 * partial pages.
1406 * @holelen: size of prospective hole in bytes. This will be rounded
1407 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
1408 * end of the file.
1409 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
1410 * but 0 when invalidating pagecache, don't throw away private data.
1411 */
1412void unmap_mapping_range(struct address_space *mapping,
1413 loff_t const holebegin, loff_t const holelen, int even_cows)
1414{
1415 struct zap_details details;
1416 pgoff_t hba = holebegin >> PAGE_SHIFT;
1417 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1418
1419 /* Check for overflow. */
1420 if (sizeof(holelen) > sizeof(hlen)) {
1421 long long holeend =
1422 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1423 if (holeend & ~(long long)ULONG_MAX)
1424 hlen = ULONG_MAX - hba + 1;
1425 }
1426
1427 details.check_mapping = even_cows? NULL: mapping;
1428 details.nonlinear_vma = NULL;
1429 details.first_index = hba;
1430 details.last_index = hba + hlen - 1;
1431 if (details.last_index < details.first_index)
1432 details.last_index = ULONG_MAX;
1433 details.i_mmap_lock = &mapping->i_mmap_lock;
1434
1435 spin_lock(&mapping->i_mmap_lock);
1436
1437 /* serialize i_size write against truncate_count write */
1438 smp_wmb();
1439 /* Protect against page faults, and endless unmapping loops */
1440 mapping->truncate_count++;
1441 /*
1442 * For archs where spin_lock has inclusive semantics like ia64
1443 * this smp_mb() will prevent to read pagetable contents
1444 * before the truncate_count increment is visible to
1445 * other cpus.
1446 */
1447 smp_mb();
1448 if (unlikely(is_restart_addr(mapping->truncate_count))) {
1449 if (mapping->truncate_count == 0)
1450 reset_vma_truncate_counts(mapping);
1451 mapping->truncate_count++;
1452 }
1453 details.truncate_count = mapping->truncate_count;
1454
1455 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
1456 unmap_mapping_range_tree(&mapping->i_mmap, &details);
1457 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
1458 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
1459 spin_unlock(&mapping->i_mmap_lock);
1460}
1461EXPORT_SYMBOL(unmap_mapping_range);
1462
1463/*
1464 * Handle all mappings that got truncated by a "truncate()"
1465 * system call.
1466 *
1467 * NOTE! We have to be ready to update the memory sharing
1468 * between the file and the memory map for a potential last
1469 * incomplete page. Ugly, but necessary.
1470 */
1471int vmtruncate(struct inode * inode, loff_t offset)
1472{
1473 struct address_space *mapping = inode->i_mapping;
1474 unsigned long limit;
1475
1476 if (inode->i_size < offset)
1477 goto do_expand;
1478 /*
1479 * truncation of in-use swapfiles is disallowed - it would cause
1480 * subsequent swapout to scribble on the now-freed blocks.
1481 */
1482 if (IS_SWAPFILE(inode))
1483 goto out_busy;
1484 i_size_write(inode, offset);
1485 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1486 truncate_inode_pages(mapping, offset);
1487 goto out_truncate;
1488
1489do_expand:
1490 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1491 if (limit != RLIM_INFINITY && offset > limit)
1492 goto out_sig;
1493 if (offset > inode->i_sb->s_maxbytes)
1494 goto out_big;
1495 i_size_write(inode, offset);
1496
1497out_truncate:
1498 if (inode->i_op && inode->i_op->truncate)
1499 inode->i_op->truncate(inode);
1500 return 0;
1501out_sig:
1502 send_sig(SIGXFSZ, current, 0);
1503out_big:
1504 return -EFBIG;
1505out_busy:
1506 return -ETXTBSY;
1507}
1508
1509EXPORT_SYMBOL(vmtruncate);
1510
1511/*
1512 * Primitive swap readahead code. We simply read an aligned block of
1513 * (1 << page_cluster) entries in the swap area. This method is chosen
1514 * because it doesn't cost us any seek time. We also make sure to queue
1515 * the 'original' request together with the readahead ones...
1516 *
1517 * This has been extended to use the NUMA policies from the mm triggering
1518 * the readahead.
1519 *
1520 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
1521 */
1522void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
1523{
1524#ifdef CONFIG_NUMA
1525 struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
1526#endif
1527 int i, num;
1528 struct page *new_page;
1529 unsigned long offset;
1530
1531 /*
1532 * Get the number of handles we should do readahead io to.
1533 */
1534 num = valid_swaphandles(entry, &offset);
1535 for (i = 0; i < num; offset++, i++) {
1536 /* Ok, do the async read-ahead now */
1537 new_page = read_swap_cache_async(swp_entry(swp_type(entry),
1538 offset), vma, addr);
1539 if (!new_page)
1540 break;
1541 page_cache_release(new_page);
1542#ifdef CONFIG_NUMA
1543 /*
1544 * Find the next applicable VMA for the NUMA policy.
1545 */
1546 addr += PAGE_SIZE;
1547 if (addr == 0)
1548 vma = NULL;
1549 if (vma) {
1550 if (addr >= vma->vm_end) {
1551 vma = next_vma;
1552 next_vma = vma ? vma->vm_next : NULL;
1553 }
1554 if (vma && addr < vma->vm_start)
1555 vma = NULL;
1556 } else {
1557 if (next_vma && addr >= next_vma->vm_start) {
1558 vma = next_vma;
1559 next_vma = vma->vm_next;
1560 }
1561 }
1562#endif
1563 }
1564 lru_add_drain(); /* Push any new pages onto the LRU now */
1565}
1566
1567/*
1568 * We hold the mm semaphore and the page_table_lock on entry and
1569 * should release the pagetable lock on exit..
1570 */
1571static int do_swap_page(struct mm_struct * mm,
1572 struct vm_area_struct * vma, unsigned long address,
1573 pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
1574{
1575 struct page *page;
1576 swp_entry_t entry = pte_to_swp_entry(orig_pte);
1577 pte_t pte;
1578 int ret = VM_FAULT_MINOR;
1579
1580 pte_unmap(page_table);
1581 spin_unlock(&mm->page_table_lock);
1582 page = lookup_swap_cache(entry);
1583 if (!page) {
1584 swapin_readahead(entry, address, vma);
1585 page = read_swap_cache_async(entry, vma, address);
1586 if (!page) {
1587 /*
1588 * Back out if somebody else faulted in this pte while
1589 * we released the page table lock.
1590 */
1591 spin_lock(&mm->page_table_lock);
1592 page_table = pte_offset_map(pmd, address);
1593 if (likely(pte_same(*page_table, orig_pte)))
1594 ret = VM_FAULT_OOM;
1595 else
1596 ret = VM_FAULT_MINOR;
1597 pte_unmap(page_table);
1598 spin_unlock(&mm->page_table_lock);
1599 goto out;
1600 }
1601
1602 /* Had to read the page from swap area: Major fault */
1603 ret = VM_FAULT_MAJOR;
1604 inc_page_state(pgmajfault);
1605 grab_swap_token();
1606 }
1607
1608 mark_page_accessed(page);
1609 lock_page(page);
1610
1611 /*
1612 * Back out if somebody else faulted in this pte while we
1613 * released the page table lock.
1614 */
1615 spin_lock(&mm->page_table_lock);
1616 page_table = pte_offset_map(pmd, address);
1617 if (unlikely(!pte_same(*page_table, orig_pte))) {
1618 pte_unmap(page_table);
1619 spin_unlock(&mm->page_table_lock);
1620 unlock_page(page);
1621 page_cache_release(page);
1622 ret = VM_FAULT_MINOR;
1623 goto out;
1624 }
1625
1626 /* The page isn't present yet, go ahead with the fault. */
1627
1628 swap_free(entry);
1629 if (vm_swap_full())
1630 remove_exclusive_swap_page(page);
1631
1632 inc_mm_counter(mm, rss);
1633 pte = mk_pte(page, vma->vm_page_prot);
1634 if (write_access && can_share_swap_page(page)) {
1635 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
1636 write_access = 0;
1637 }
1638 unlock_page(page);
1639
1640 flush_icache_page(vma, page);
1641 set_pte_at(mm, address, page_table, pte);
1642 page_add_anon_rmap(page, vma, address);
1643
1644 if (write_access) {
1645 if (do_wp_page(mm, vma, address,
1646 page_table, pmd, pte) == VM_FAULT_OOM)
1647 ret = VM_FAULT_OOM;
1648 goto out;
1649 }
1650
1651 /* No need to invalidate - it was non-present before */
1652 update_mmu_cache(vma, address, pte);
1653 lazy_mmu_prot_update(pte);
1654 pte_unmap(page_table);
1655 spin_unlock(&mm->page_table_lock);
1656out:
1657 return ret;
1658}
1659
1660/*
1661 * We are called with the MM semaphore and page_table_lock
1662 * spinlock held to protect against concurrent faults in
1663 * multithreaded programs.
1664 */
1665static int
1666do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1667 pte_t *page_table, pmd_t *pmd, int write_access,
1668 unsigned long addr)
1669{
1670 pte_t entry;
1671 struct page * page = ZERO_PAGE(addr);
1672
1673 /* Read-only mapping of ZERO_PAGE. */
1674 entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
1675
1676 /* ..except if it's a write access */
1677 if (write_access) {
1678 /* Allocate our own private page. */
1679 pte_unmap(page_table);
1680 spin_unlock(&mm->page_table_lock);
1681
1682 if (unlikely(anon_vma_prepare(vma)))
1683 goto no_mem;
1684 page = alloc_zeroed_user_highpage(vma, addr);
1685 if (!page)
1686 goto no_mem;
1687
1688 spin_lock(&mm->page_table_lock);
1689 page_table = pte_offset_map(pmd, addr);
1690
1691 if (!pte_none(*page_table)) {
1692 pte_unmap(page_table);
1693 page_cache_release(page);
1694 spin_unlock(&mm->page_table_lock);
1695 goto out;
1696 }
1697 inc_mm_counter(mm, rss);
1698 entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
1699 vma->vm_page_prot)),
1700 vma);
1701 lru_cache_add_active(page);
1702 SetPageReferenced(page);
1703 page_add_anon_rmap(page, vma, addr);
1704 }
1705
1706 set_pte_at(mm, addr, page_table, entry);
1707 pte_unmap(page_table);
1708
1709 /* No need to invalidate - it was non-present before */
1710 update_mmu_cache(vma, addr, entry);
1711 lazy_mmu_prot_update(entry);
1712 spin_unlock(&mm->page_table_lock);
1713out:
1714 return VM_FAULT_MINOR;
1715no_mem:
1716 return VM_FAULT_OOM;
1717}
1718
1719/*
1720 * do_no_page() tries to create a new page mapping. It aggressively
1721 * tries to share with existing pages, but makes a separate copy if
1722 * the "write_access" parameter is true in order to avoid the next
1723 * page fault.
1724 *
1725 * As this is called only for pages that do not currently exist, we
1726 * do not need to flush old virtual caches or the TLB.
1727 *
1728 * This is called with the MM semaphore held and the page table
1729 * spinlock held. Exit with the spinlock released.
1730 */
1731static int
1732do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1733 unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
1734{
1735 struct page * new_page;
1736 struct address_space *mapping = NULL;
1737 pte_t entry;
1738 unsigned int sequence = 0;
1739 int ret = VM_FAULT_MINOR;
1740 int anon = 0;
1741
1742 if (!vma->vm_ops || !vma->vm_ops->nopage)
1743 return do_anonymous_page(mm, vma, page_table,
1744 pmd, write_access, address);
1745 pte_unmap(page_table);
1746 spin_unlock(&mm->page_table_lock);
1747
1748 if (vma->vm_file) {
1749 mapping = vma->vm_file->f_mapping;
1750 sequence = mapping->truncate_count;
1751 smp_rmb(); /* serializes i_size against truncate_count */
1752 }
1753retry:
1754 cond_resched();
1755 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
1756 /*
1757 * No smp_rmb is needed here as long as there's a full
1758 * spin_lock/unlock sequence inside the ->nopage callback
1759 * (for the pagecache lookup) that acts as an implicit
1760 * smp_mb() and prevents the i_size read to happen
1761 * after the next truncate_count read.
1762 */
1763
1764 /* no page was available -- either SIGBUS or OOM */
1765 if (new_page == NOPAGE_SIGBUS)
1766 return VM_FAULT_SIGBUS;
1767 if (new_page == NOPAGE_OOM)
1768 return VM_FAULT_OOM;
1769
1770 /*
1771 * Should we do an early C-O-W break?
1772 */
1773 if (write_access && !(vma->vm_flags & VM_SHARED)) {
1774 struct page *page;
1775
1776 if (unlikely(anon_vma_prepare(vma)))
1777 goto oom;
1778 page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1779 if (!page)
1780 goto oom;
1781 copy_user_highpage(page, new_page, address);
1782 page_cache_release(new_page);
1783 new_page = page;
1784 anon = 1;
1785 }
1786
1787 spin_lock(&mm->page_table_lock);
1788 /*
1789 * For a file-backed vma, someone could have truncated or otherwise
1790 * invalidated this page. If unmap_mapping_range got called,
1791 * retry getting the page.
1792 */
1793 if (mapping && unlikely(sequence != mapping->truncate_count)) {
1794 sequence = mapping->truncate_count;
1795 spin_unlock(&mm->page_table_lock);
1796 page_cache_release(new_page);
1797 goto retry;
1798 }
1799 page_table = pte_offset_map(pmd, address);
1800
1801 /*
1802 * This silly early PAGE_DIRTY setting removes a race
1803 * due to the bad i386 page protection. But it's valid
1804 * for other architectures too.
1805 *
1806 * Note that if write_access is true, we either now have
1807 * an exclusive copy of the page, or this is a shared mapping,
1808 * so we can make it writable and dirty to avoid having to
1809 * handle that later.
1810 */
1811 /* Only go through if we didn't race with anybody else... */
1812 if (pte_none(*page_table)) {
1813 if (!PageReserved(new_page))
1814 inc_mm_counter(mm, rss);
1815
1816 flush_icache_page(vma, new_page);
1817 entry = mk_pte(new_page, vma->vm_page_prot);
1818 if (write_access)
1819 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1820 set_pte_at(mm, address, page_table, entry);
1821 if (anon) {
1822 lru_cache_add_active(new_page);
1823 page_add_anon_rmap(new_page, vma, address);
1824 } else
1825 page_add_file_rmap(new_page);
1826 pte_unmap(page_table);
1827 } else {
1828 /* One of our sibling threads was faster, back out. */
1829 pte_unmap(page_table);
1830 page_cache_release(new_page);
1831 spin_unlock(&mm->page_table_lock);
1832 goto out;
1833 }
1834
1835 /* no need to invalidate: a not-present page shouldn't be cached */
1836 update_mmu_cache(vma, address, entry);
1837 lazy_mmu_prot_update(entry);
1838 spin_unlock(&mm->page_table_lock);
1839out:
1840 return ret;
1841oom:
1842 page_cache_release(new_page);
1843 ret = VM_FAULT_OOM;
1844 goto out;
1845}
1846
1847/*
1848 * Fault of a previously existing named mapping. Repopulate the pte
1849 * from the encoded file_pte if possible. This enables swappable
1850 * nonlinear vmas.
1851 */
1852static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
1853 unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
1854{
1855 unsigned long pgoff;
1856 int err;
1857
1858 BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
1859 /*
1860 * Fall back to the linear mapping if the fs does not support
1861 * ->populate:
1862 */
1863 if (!vma->vm_ops || !vma->vm_ops->populate ||
1864 (write_access && !(vma->vm_flags & VM_SHARED))) {
1865 pte_clear(mm, address, pte);
1866 return do_no_page(mm, vma, address, write_access, pte, pmd);
1867 }
1868
1869 pgoff = pte_to_pgoff(*pte);
1870
1871 pte_unmap(pte);
1872 spin_unlock(&mm->page_table_lock);
1873
1874 err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
1875 if (err == -ENOMEM)
1876 return VM_FAULT_OOM;
1877 if (err)
1878 return VM_FAULT_SIGBUS;
1879 return VM_FAULT_MAJOR;
1880}
1881
1882/*
1883 * These routines also need to handle stuff like marking pages dirty
1884 * and/or accessed for architectures that don't do it in hardware (most
1885 * RISC architectures). The early dirtying is also good on the i386.
1886 *
1887 * There is also a hook called "update_mmu_cache()" that architectures
1888 * with external mmu caches can use to update those (ie the Sparc or
1889 * PowerPC hashed page tables that act as extended TLBs).
1890 *
1891 * Note the "page_table_lock". It is to protect against kswapd removing
1892 * pages from under us. Note that kswapd only ever _removes_ pages, never
1893 * adds them. As such, once we have noticed that the page is not present,
1894 * we can drop the lock early.
1895 *
1896 * The adding of pages is protected by the MM semaphore (which we hold),
1897 * so we don't need to worry about a page being suddenly been added into
1898 * our VM.
1899 *
1900 * We enter with the pagetable spinlock held, we are supposed to
1901 * release it when done.
1902 */
1903static inline int handle_pte_fault(struct mm_struct *mm,
1904 struct vm_area_struct * vma, unsigned long address,
1905 int write_access, pte_t *pte, pmd_t *pmd)
1906{
1907 pte_t entry;
1908
1909 entry = *pte;
1910 if (!pte_present(entry)) {
1911 /*
1912 * If it truly wasn't present, we know that kswapd
1913 * and the PTE updates will not touch it later. So
1914 * drop the lock.
1915 */
1916 if (pte_none(entry))
1917 return do_no_page(mm, vma, address, write_access, pte, pmd);
1918 if (pte_file(entry))
1919 return do_file_page(mm, vma, address, write_access, pte, pmd);
1920 return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
1921 }
1922
1923 if (write_access) {
1924 if (!pte_write(entry))
1925 return do_wp_page(mm, vma, address, pte, pmd, entry);
1926
1927 entry = pte_mkdirty(entry);
1928 }
1929 entry = pte_mkyoung(entry);
1930 ptep_set_access_flags(vma, address, pte, entry, write_access);
1931 update_mmu_cache(vma, address, entry);
1932 lazy_mmu_prot_update(entry);
1933 pte_unmap(pte);
1934 spin_unlock(&mm->page_table_lock);
1935 return VM_FAULT_MINOR;
1936}
1937
1938/*
1939 * By the time we get here, we already hold the mm semaphore
1940 */
1941int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
1942 unsigned long address, int write_access)
1943{
1944 pgd_t *pgd;
1945 pud_t *pud;
1946 pmd_t *pmd;
1947 pte_t *pte;
1948
1949 __set_current_state(TASK_RUNNING);
1950
1951 inc_page_state(pgfault);
1952
1953 if (is_vm_hugetlb_page(vma))
1954 return VM_FAULT_SIGBUS; /* mapping truncation does this. */
1955
1956 /*
1957 * We need the page table lock to synchronize with kswapd
1958 * and the SMP-safe atomic PTE updates.
1959 */
1960 pgd = pgd_offset(mm, address);
1961 spin_lock(&mm->page_table_lock);
1962
1963 pud = pud_alloc(mm, pgd, address);
1964 if (!pud)
1965 goto oom;
1966
1967 pmd = pmd_alloc(mm, pud, address);
1968 if (!pmd)
1969 goto oom;
1970
1971 pte = pte_alloc_map(mm, pmd, address);
1972 if (!pte)
1973 goto oom;
1974
1975 return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
1976
1977 oom:
1978 spin_unlock(&mm->page_table_lock);
1979 return VM_FAULT_OOM;
1980}
1981
1982#ifndef __PAGETABLE_PUD_FOLDED
1983/*
1984 * Allocate page upper directory.
1985 *
1986 * We've already handled the fast-path in-line, and we own the
1987 * page table lock.
1988 */
1989pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
1990{
1991 pud_t *new;
1992
1993 spin_unlock(&mm->page_table_lock);
1994 new = pud_alloc_one(mm, address);
1995 spin_lock(&mm->page_table_lock);
1996 if (!new)
1997 return NULL;
1998
1999 /*
2000 * Because we dropped the lock, we should re-check the
2001 * entry, as somebody else could have populated it..
2002 */
2003 if (pgd_present(*pgd)) {
2004 pud_free(new);
2005 goto out;
2006 }
2007 pgd_populate(mm, pgd, new);
2008 out:
2009 return pud_offset(pgd, address);
2010}
2011#endif /* __PAGETABLE_PUD_FOLDED */
2012
2013#ifndef __PAGETABLE_PMD_FOLDED
2014/*
2015 * Allocate page middle directory.
2016 *
2017 * We've already handled the fast-path in-line, and we own the
2018 * page table lock.
2019 */
2020pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2021{
2022 pmd_t *new;
2023
2024 spin_unlock(&mm->page_table_lock);
2025 new = pmd_alloc_one(mm, address);
2026 spin_lock(&mm->page_table_lock);
2027 if (!new)
2028 return NULL;
2029
2030 /*
2031 * Because we dropped the lock, we should re-check the
2032 * entry, as somebody else could have populated it..
2033 */
2034#ifndef __ARCH_HAS_4LEVEL_HACK
2035 if (pud_present(*pud)) {
2036 pmd_free(new);
2037 goto out;
2038 }
2039 pud_populate(mm, pud, new);
2040#else
2041 if (pgd_present(*pud)) {
2042 pmd_free(new);
2043 goto out;
2044 }
2045 pgd_populate(mm, pud, new);
2046#endif /* __ARCH_HAS_4LEVEL_HACK */
2047
2048 out:
2049 return pmd_offset(pud, address);
2050}
2051#endif /* __PAGETABLE_PMD_FOLDED */
2052
2053int make_pages_present(unsigned long addr, unsigned long end)
2054{
2055 int ret, len, write;
2056 struct vm_area_struct * vma;
2057
2058 vma = find_vma(current->mm, addr);
2059 if (!vma)
2060 return -1;
2061 write = (vma->vm_flags & VM_WRITE) != 0;
2062 if (addr >= end)
2063 BUG();
2064 if (end > vma->vm_end)
2065 BUG();
2066 len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
2067 ret = get_user_pages(current, current->mm, addr,
2068 len, write, 0, NULL, NULL);
2069 if (ret < 0)
2070 return ret;
2071 return ret == len ? 0 : -1;
2072}
2073
2074/*
2075 * Map a vmalloc()-space virtual address to the physical page.
2076 */
2077struct page * vmalloc_to_page(void * vmalloc_addr)
2078{
2079 unsigned long addr = (unsigned long) vmalloc_addr;
2080 struct page *page = NULL;
2081 pgd_t *pgd = pgd_offset_k(addr);
2082 pud_t *pud;
2083 pmd_t *pmd;
2084 pte_t *ptep, pte;
2085
2086 if (!pgd_none(*pgd)) {
2087 pud = pud_offset(pgd, addr);
2088 if (!pud_none(*pud)) {
2089 pmd = pmd_offset(pud, addr);
2090 if (!pmd_none(*pmd)) {
2091 ptep = pte_offset_map(pmd, addr);
2092 pte = *ptep;
2093 if (pte_present(pte))
2094 page = pte_page(pte);
2095 pte_unmap(ptep);
2096 }
2097 }
2098 }
2099 return page;
2100}
2101
2102EXPORT_SYMBOL(vmalloc_to_page);
2103
/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 *
 * The result of vmalloc_to_page() is converted without a NULL check, so
 * the address must actually be mapped.
 */
unsigned long vmalloc_to_pfn(void * vmalloc_addr)
{
	struct page *page = vmalloc_to_page(vmalloc_addr);

	return page_to_pfn(page);
}

EXPORT_SYMBOL(vmalloc_to_pfn);
2113
2114/*
2115 * update_mem_hiwater
2116 * - update per process rss and vm high water data
2117 */
2118void update_mem_hiwater(struct task_struct *tsk)
2119{
2120 if (tsk->mm) {
2121 unsigned long rss = get_mm_counter(tsk->mm, rss);
2122
2123 if (tsk->mm->hiwater_rss < rss)
2124 tsk->mm->hiwater_rss = rss;
2125 if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
2126 tsk->mm->hiwater_vm = tsk->mm->total_vm;
2127 }
2128}
2129
2130#if !defined(__HAVE_ARCH_GATE_AREA)
2131
2132#if defined(AT_SYSINFO_EHDR)
2133struct vm_area_struct gate_vma;
2134
2135static int __init gate_vma_init(void)
2136{
2137 gate_vma.vm_mm = NULL;
2138 gate_vma.vm_start = FIXADDR_USER_START;
2139 gate_vma.vm_end = FIXADDR_USER_END;
2140 gate_vma.vm_page_prot = PAGE_READONLY;
2141 gate_vma.vm_flags = 0;
2142 return 0;
2143}
2144__initcall(gate_vma_init);
2145#endif
2146
2147struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
2148{
2149#ifdef AT_SYSINFO_EHDR
2150 return &gate_vma;
2151#else
2152 return NULL;
2153#endif
2154}
2155
2156int in_gate_area_no_task(unsigned long addr)
2157{
2158#ifdef AT_SYSINFO_EHDR
2159 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
2160 return 1;
2161#endif
2162 return 0;
2163}
2164
2165#endif /* __HAVE_ARCH_GATE_AREA */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
new file mode 100644
index 00000000000..a3b44a671ce
--- /dev/null
+++ b/mm/mempolicy.c
@@ -0,0 +1,1138 @@
1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, version 2.
6 *
7 * NUMA policy allows the user to give hints in which node(s) memory should
8 * be allocated.
9 *
10 * Support four policies per VMA and per process:
11 *
12 * The VMA policy has priority over the process policy for a page fault.
13 *
14 * interleave Allocate memory interleaved over a set of nodes,
15 * with normal fallback if it fails.
16 * For VMA based allocations this interleaves based on the
17 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy a process counter
19 * is used.
20 * bind Only allocate memory on a specific set of nodes,
21 * no fallback.
22 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default
26 * process policy.
27 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default.
30 *
31 * The process policy is applied for most non interrupt memory allocations
32 * in that process' context. Interrupts ignore the policies and always
33 * try to allocate on the local CPU. The VMA policy is only applied for memory
34 * allocations for a VMA in the VM.
35 *
36 * Currently there are a few corner cases in swapping where the policy
37 * is not applied, but the majority should be handled. When process policy
38 * is used it is not remembered over swap outs/swap ins.
39 *
40 * Only the highest zone in the zone hierarchy gets policied. Allocations
41 * requesting a lower zone just use default policy. This implies that
42 * on systems with highmem kernel lowmem allocations don't get policied.
43 * Same with GFP_DMA allocations.
44 *
45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46 * all users and remembered even when nobody has memory mapped.
47 */
48
49/* Notebook:
50 fix mmap readahead to honour policy and enable policy for any page cache
51 object
52 statistics for bigpages
53 global policy for page cache? currently it uses process policy. Requires
54 first item above.
55 handle mremap for shared memory (currently ignored for the policy)
56 grows down?
57 make bind policy root only? It can trigger oom much faster and the
58 kernel is not always grateful with that.
59 could replace all the switch()es with a mempolicy_ops structure.
60*/
61
62#include <linux/mempolicy.h>
63#include <linux/mm.h>
64#include <linux/highmem.h>
65#include <linux/hugetlb.h>
66#include <linux/kernel.h>
67#include <linux/sched.h>
68#include <linux/mm.h>
69#include <linux/nodemask.h>
70#include <linux/cpuset.h>
71#include <linux/gfp.h>
72#include <linux/slab.h>
73#include <linux/string.h>
74#include <linux/module.h>
75#include <linux/interrupt.h>
76#include <linux/init.h>
77#include <linux/compat.h>
78#include <linux/mempolicy.h>
79#include <asm/tlbflush.h>
80#include <asm/uaccess.h>
81
82static kmem_cache_t *policy_cache;
83static kmem_cache_t *sn_cache;
84
85#define PDprintk(fmt...)
86
87/* Highest zone. A specific allocation for a zone below that is not
88 policied. */
89static int policy_zone;
90
91static struct mempolicy default_policy = {
92 .refcnt = ATOMIC_INIT(1), /* never free it */
93 .policy = MPOL_DEFAULT,
94};
95
96/* Check if all specified nodes are online */
97static int nodes_online(unsigned long *nodes)
98{
99 DECLARE_BITMAP(online2, MAX_NUMNODES);
100
101 bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
102 if (bitmap_empty(online2, MAX_NUMNODES))
103 set_bit(0, online2);
104 if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
105 return -EINVAL;
106 return 0;
107}
108
109/* Do sanity checking on a policy */
110static int mpol_check_policy(int mode, unsigned long *nodes)
111{
112 int empty = bitmap_empty(nodes, MAX_NUMNODES);
113
114 switch (mode) {
115 case MPOL_DEFAULT:
116 if (!empty)
117 return -EINVAL;
118 break;
119 case MPOL_BIND:
120 case MPOL_INTERLEAVE:
121 /* Preferred will only use the first bit, but allow
122 more for now. */
123 if (empty)
124 return -EINVAL;
125 break;
126 }
127 return nodes_online(nodes);
128}
129
130/* Copy a node mask from user space. */
131static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
132 unsigned long maxnode, int mode)
133{
134 unsigned long k;
135 unsigned long nlongs;
136 unsigned long endmask;
137
138 --maxnode;
139 bitmap_zero(nodes, MAX_NUMNODES);
140 if (maxnode == 0 || !nmask)
141 return 0;
142
143 nlongs = BITS_TO_LONGS(maxnode);
144 if ((maxnode % BITS_PER_LONG) == 0)
145 endmask = ~0UL;
146 else
147 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
148
149 /* When the user specified more nodes than supported just check
150 if the non supported part is all zero. */
151 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
152 if (nlongs > PAGE_SIZE/sizeof(long))
153 return -EINVAL;
154 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
155 unsigned long t;
156 if (get_user(t, nmask + k))
157 return -EFAULT;
158 if (k == nlongs - 1) {
159 if (t & endmask)
160 return -EINVAL;
161 } else if (t)
162 return -EINVAL;
163 }
164 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
165 endmask = ~0UL;
166 }
167
168 if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
169 return -EFAULT;
170 nodes[nlongs-1] &= endmask;
171 /* Update current mems_allowed */
172 cpuset_update_current_mems_allowed();
173 /* Ignore nodes not set in current->mems_allowed */
174 cpuset_restrict_to_mems_allowed(nodes);
175 return mpol_check_policy(mode, nodes);
176}
177
178/* Generate a custom zonelist for the BIND policy. */
179static struct zonelist *bind_zonelist(unsigned long *nodes)
180{
181 struct zonelist *zl;
182 int num, max, nd;
183
184 max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
185 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
186 if (!zl)
187 return NULL;
188 num = 0;
189 for (nd = find_first_bit(nodes, MAX_NUMNODES);
190 nd < MAX_NUMNODES;
191 nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
192 int k;
193 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
194 struct zone *z = &NODE_DATA(nd)->node_zones[k];
195 if (!z->present_pages)
196 continue;
197 zl->zones[num++] = z;
198 if (k > policy_zone)
199 policy_zone = k;
200 }
201 }
202 BUG_ON(num >= max);
203 zl->zones[num] = NULL;
204 return zl;
205}
206
207/* Create a new policy */
208static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
209{
210 struct mempolicy *policy;
211
212 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
213 if (mode == MPOL_DEFAULT)
214 return NULL;
215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
216 if (!policy)
217 return ERR_PTR(-ENOMEM);
218 atomic_set(&policy->refcnt, 1);
219 switch (mode) {
220 case MPOL_INTERLEAVE:
221 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
222 break;
223 case MPOL_PREFERRED:
224 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
225 if (policy->v.preferred_node >= MAX_NUMNODES)
226 policy->v.preferred_node = -1;
227 break;
228 case MPOL_BIND:
229 policy->v.zonelist = bind_zonelist(nodes);
230 if (policy->v.zonelist == NULL) {
231 kmem_cache_free(policy_cache, policy);
232 return ERR_PTR(-ENOMEM);
233 }
234 break;
235 }
236 policy->policy = mode;
237 return policy;
238}
239
240/* Ensure all existing pages follow the policy. */
241static int
242verify_pages(struct mm_struct *mm,
243 unsigned long addr, unsigned long end, unsigned long *nodes)
244{
245 while (addr < end) {
246 struct page *p;
247 pte_t *pte;
248 pmd_t *pmd;
249 pud_t *pud;
250 pgd_t *pgd;
251 pgd = pgd_offset(mm, addr);
252 if (pgd_none(*pgd)) {
253 unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
254 if (next > addr)
255 break;
256 addr = next;
257 continue;
258 }
259 pud = pud_offset(pgd, addr);
260 if (pud_none(*pud)) {
261 addr = (addr + PUD_SIZE) & PUD_MASK;
262 continue;
263 }
264 pmd = pmd_offset(pud, addr);
265 if (pmd_none(*pmd)) {
266 addr = (addr + PMD_SIZE) & PMD_MASK;
267 continue;
268 }
269 p = NULL;
270 pte = pte_offset_map(pmd, addr);
271 if (pte_present(*pte))
272 p = pte_page(*pte);
273 pte_unmap(pte);
274 if (p) {
275 unsigned nid = page_to_nid(p);
276 if (!test_bit(nid, nodes))
277 return -EIO;
278 }
279 addr += PAGE_SIZE;
280 }
281 return 0;
282}
283
284/* Step 1: check the range */
285static struct vm_area_struct *
286check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
287 unsigned long *nodes, unsigned long flags)
288{
289 int err;
290 struct vm_area_struct *first, *vma, *prev;
291
292 first = find_vma(mm, start);
293 if (!first)
294 return ERR_PTR(-EFAULT);
295 prev = NULL;
296 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
297 if (!vma->vm_next && vma->vm_end < end)
298 return ERR_PTR(-EFAULT);
299 if (prev && prev->vm_end < vma->vm_start)
300 return ERR_PTR(-EFAULT);
301 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
302 err = verify_pages(vma->vm_mm,
303 vma->vm_start, vma->vm_end, nodes);
304 if (err) {
305 first = ERR_PTR(err);
306 break;
307 }
308 }
309 prev = vma;
310 }
311 return first;
312}
313
314/* Apply policy to a single VMA */
315static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
316{
317 int err = 0;
318 struct mempolicy *old = vma->vm_policy;
319
320 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
321 vma->vm_start, vma->vm_end, vma->vm_pgoff,
322 vma->vm_ops, vma->vm_file,
323 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
324
325 if (vma->vm_ops && vma->vm_ops->set_policy)
326 err = vma->vm_ops->set_policy(vma, new);
327 if (!err) {
328 mpol_get(new);
329 vma->vm_policy = new;
330 mpol_free(old);
331 }
332 return err;
333}
334
335/* Step 2: apply policy to a range and do splits. */
336static int mbind_range(struct vm_area_struct *vma, unsigned long start,
337 unsigned long end, struct mempolicy *new)
338{
339 struct vm_area_struct *next;
340 int err;
341
342 err = 0;
343 for (; vma && vma->vm_start < end; vma = next) {
344 next = vma->vm_next;
345 if (vma->vm_start < start)
346 err = split_vma(vma->vm_mm, vma, start, 1);
347 if (!err && vma->vm_end > end)
348 err = split_vma(vma->vm_mm, vma, end, 0);
349 if (!err)
350 err = policy_vma(vma, new);
351 if (err)
352 break;
353 }
354 return err;
355}
356
357/* Change policy for a memory range */
358asmlinkage long sys_mbind(unsigned long start, unsigned long len,
359 unsigned long mode,
360 unsigned long __user *nmask, unsigned long maxnode,
361 unsigned flags)
362{
363 struct vm_area_struct *vma;
364 struct mm_struct *mm = current->mm;
365 struct mempolicy *new;
366 unsigned long end;
367 DECLARE_BITMAP(nodes, MAX_NUMNODES);
368 int err;
369
370 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
371 return -EINVAL;
372 if (start & ~PAGE_MASK)
373 return -EINVAL;
374 if (mode == MPOL_DEFAULT)
375 flags &= ~MPOL_MF_STRICT;
376 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
377 end = start + len;
378 if (end < start)
379 return -EINVAL;
380 if (end == start)
381 return 0;
382
383 err = get_nodes(nodes, nmask, maxnode, mode);
384 if (err)
385 return err;
386
387 new = mpol_new(mode, nodes);
388 if (IS_ERR(new))
389 return PTR_ERR(new);
390
391 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
392 mode,nodes[0]);
393
394 down_write(&mm->mmap_sem);
395 vma = check_range(mm, start, end, nodes, flags);
396 err = PTR_ERR(vma);
397 if (!IS_ERR(vma))
398 err = mbind_range(vma, start, end, new);
399 up_write(&mm->mmap_sem);
400 mpol_free(new);
401 return err;
402}
403
404/* Set the process memory policy */
405asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
406 unsigned long maxnode)
407{
408 int err;
409 struct mempolicy *new;
410 DECLARE_BITMAP(nodes, MAX_NUMNODES);
411
412 if (mode > MPOL_MAX)
413 return -EINVAL;
414 err = get_nodes(nodes, nmask, maxnode, mode);
415 if (err)
416 return err;
417 new = mpol_new(mode, nodes);
418 if (IS_ERR(new))
419 return PTR_ERR(new);
420 mpol_free(current->mempolicy);
421 current->mempolicy = new;
422 if (new && new->policy == MPOL_INTERLEAVE)
423 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
424 return 0;
425}
426
427/* Fill a zone bitmap for a policy */
428static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
429{
430 int i;
431
432 bitmap_zero(nodes, MAX_NUMNODES);
433 switch (p->policy) {
434 case MPOL_BIND:
435 for (i = 0; p->v.zonelist->zones[i]; i++)
436 __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
437 break;
438 case MPOL_DEFAULT:
439 break;
440 case MPOL_INTERLEAVE:
441 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
442 break;
443 case MPOL_PREFERRED:
444 /* or use current node instead of online map? */
445 if (p->v.preferred_node < 0)
446 bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
447 else
448 __set_bit(p->v.preferred_node, nodes);
449 break;
450 default:
451 BUG();
452 }
453}
454
455static int lookup_node(struct mm_struct *mm, unsigned long addr)
456{
457 struct page *p;
458 int err;
459
460 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
461 if (err >= 0) {
462 err = page_to_nid(p);
463 put_page(p);
464 }
465 return err;
466}
467
468/* Copy a kernel node mask to user space */
469static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
470 void *nodes, unsigned nbytes)
471{
472 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
473
474 if (copy > nbytes) {
475 if (copy > PAGE_SIZE)
476 return -EINVAL;
477 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
478 return -EFAULT;
479 copy = nbytes;
480 }
481 return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
482}
483
484/* Retrieve NUMA policy */
/*
 * Reports a NUMA policy to user space.
 *
 * Without MPOL_F_ADDR the policy examined is current->mempolicy and @addr
 * must be 0. With MPOL_F_ADDR it is the policy of the VMA containing
 * @addr (taken from vm_ops->get_policy when the VMA provides one); no VMA
 * at @addr gives -EFAULT. A NULL policy is reported as default_policy.
 *
 * The value stored through @policy is normally the policy mode. With
 * MPOL_F_NODE it is a node id instead: the node of the page at @addr when
 * MPOL_F_ADDR is also set, or current->il_next when the task policy is
 * MPOL_INTERLEAVE; any other MPOL_F_NODE request is -EINVAL.
 *
 * When @nmask is non-NULL, @maxnode must be at least MAX_NUMNODES and the
 * policy's node set is copied out. Returns 0 or a negative errno.
 */
485asmlinkage long sys_get_mempolicy(int __user *policy,
486 unsigned long __user *nmask,
487 unsigned long maxnode,
488 unsigned long addr, unsigned long flags)
489{
490 int err, pval;
491 struct mm_struct *mm = current->mm;
492 struct vm_area_struct *vma = NULL;
493 struct mempolicy *pol = current->mempolicy;
494
495 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
496 return -EINVAL;
497 if (nmask != NULL && maxnode < MAX_NUMNODES)
498 return -EINVAL;
499 if (flags & MPOL_F_ADDR) {
/* mmap_sem stays held for reading while vma is non-NULL */
500 down_read(&mm->mmap_sem);
501 vma = find_vma_intersection(mm, addr, addr+1);
502 if (!vma) {
503 up_read(&mm->mmap_sem);
504 return -EFAULT;
505 }
506 if (vma->vm_ops && vma->vm_ops->get_policy)
507 pol = vma->vm_ops->get_policy(vma, addr);
508 else
509 pol = vma->vm_policy;
510 } else if (addr)
511 return -EINVAL;
512
513 if (!pol)
514 pol = &default_policy;
515
516 if (flags & MPOL_F_NODE) {
517 if (flags & MPOL_F_ADDR) {
/* report the node of the page at addr */
518 err = lookup_node(mm, addr);
519 if (err < 0)
520 goto out;
521 pval = err;
522 } else if (pol == current->mempolicy &&
523 pol->policy == MPOL_INTERLEAVE) {
524 pval = current->il_next;
525 } else {
526 err = -EINVAL;
527 goto out;
528 }
529 } else
530 pval = pol->policy;
531
/*
 * Release mmap_sem before the user-space stores below; clearing vma keeps
 * the out: path from unlocking a second time.
 * NOTE(review): pol is still used for get_zonemask() after the lock is
 * dropped - confirm a VMA policy cannot be freed in between.
 */
532 if (vma) {
533 up_read(&current->mm->mmap_sem);
534 vma = NULL;
535 }
536
537 if (policy && put_user(pval, policy))
538 return -EFAULT;
539
540 err = 0;
541 if (nmask) {
542 DECLARE_BITMAP(nodes, MAX_NUMNODES);
543 get_zonemask(pol, nodes);
544 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
545 }
546
/* error exits taken while mmap_sem is still held land here */
547 out:
548 if (vma)
549 up_read(&current->mm->mmap_sem);
550 return err;
551}
552
553#ifdef CONFIG_COMPAT
554
555asmlinkage long compat_sys_get_mempolicy(int __user *policy,
556 compat_ulong_t __user *nmask,
557 compat_ulong_t maxnode,
558 compat_ulong_t addr, compat_ulong_t flags)
559{
560 long err;
561 unsigned long __user *nm = NULL;
562 unsigned long nr_bits, alloc_size;
563 DECLARE_BITMAP(bm, MAX_NUMNODES);
564
565 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
566 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
567
568 if (nmask)
569 nm = compat_alloc_user_space(alloc_size);
570
571 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
572
573 if (!err && nmask) {
574 err = copy_from_user(bm, nm, alloc_size);
575 /* ensure entire bitmap is zeroed */
576 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
577 err |= compat_put_bitmap(nmask, bm, nr_bits);
578 }
579
580 return err;
581}
582
583asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
584 compat_ulong_t maxnode)
585{
586 long err = 0;
587 unsigned long __user *nm = NULL;
588 unsigned long nr_bits, alloc_size;
589 DECLARE_BITMAP(bm, MAX_NUMNODES);
590
591 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
592 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
593
594 if (nmask) {
595 err = compat_get_bitmap(bm, nmask, nr_bits);
596 nm = compat_alloc_user_space(alloc_size);
597 err |= copy_to_user(nm, bm, alloc_size);
598 }
599
600 if (err)
601 return -EFAULT;
602
603 return sys_set_mempolicy(mode, nm, nr_bits+1);
604}
605
606asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
607 compat_ulong_t mode, compat_ulong_t __user *nmask,
608 compat_ulong_t maxnode, compat_ulong_t flags)
609{
610 long err = 0;
611 unsigned long __user *nm = NULL;
612 unsigned long nr_bits, alloc_size;
613 DECLARE_BITMAP(bm, MAX_NUMNODES);
614
615 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
616 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
617
618 if (nmask) {
619 err = compat_get_bitmap(bm, nmask, nr_bits);
620 nm = compat_alloc_user_space(alloc_size);
621 err |= copy_to_user(nm, bm, alloc_size);
622 }
623
624 if (err)
625 return -EFAULT;
626
627 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
628}
629
630#endif
631
632/* Return effective policy for a VMA */
633static struct mempolicy *
634get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
635{
636 struct mempolicy *pol = current->mempolicy;
637
638 if (vma) {
639 if (vma->vm_ops && vma->vm_ops->get_policy)
640 pol = vma->vm_ops->get_policy(vma, addr);
641 else if (vma->vm_policy &&
642 vma->vm_policy->policy != MPOL_DEFAULT)
643 pol = vma->vm_policy;
644 }
645 if (!pol)
646 pol = &default_policy;
647 return pol;
648}
649
650/* Return a zonelist representing a mempolicy */
651static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
652{
653 int nd;
654
655 switch (policy->policy) {
656 case MPOL_PREFERRED:
657 nd = policy->v.preferred_node;
658 if (nd < 0)
659 nd = numa_node_id();
660 break;
661 case MPOL_BIND:
662 /* Lower zones don't get a policy applied */
663 /* Careful: current->mems_allowed might have moved */
664 if (gfp >= policy_zone)
665 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
666 return policy->v.zonelist;
667 /*FALL THROUGH*/
668 case MPOL_INTERLEAVE: /* should not happen */
669 case MPOL_DEFAULT:
670 nd = numa_node_id();
671 break;
672 default:
673 nd = 0;
674 BUG();
675 }
676 return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
677}
678
679/* Do dynamic interleaving for a process */
680static unsigned interleave_nodes(struct mempolicy *policy)
681{
682 unsigned nid, next;
683 struct task_struct *me = current;
684
685 nid = me->il_next;
686 BUG_ON(nid >= MAX_NUMNODES);
687 next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
688 if (next >= MAX_NUMNODES)
689 next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
690 me->il_next = next;
691 return nid;
692}
693
694/* Do static interleaving for a VMA with known offset. */
695static unsigned offset_il_node(struct mempolicy *pol,
696 struct vm_area_struct *vma, unsigned long off)
697{
698 unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
699 unsigned target = (unsigned)off % nnodes;
700 int c;
701 int nid = -1;
702
703 c = 0;
704 do {
705 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
706 c++;
707 } while (c <= target);
708 BUG_ON(nid >= MAX_NUMNODES);
709 BUG_ON(!test_bit(nid, pol->v.nodes));
710 return nid;
711}
712
713/* Allocate a page in interleaved policy.
714 Own path because it needs to do special accounting. */
715static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
716{
717 struct zonelist *zl;
718 struct page *page;
719
720 BUG_ON(!node_online(nid));
721 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
722 page = __alloc_pages(gfp, order, zl);
723 if (page && page_zone(page) == zl->zones[0]) {
724 zl->zones[0]->pageset[get_cpu()].interleave_hit++;
725 put_cpu();
726 }
727 return page;
728}
729
730/**
731 * alloc_page_vma - Allocate a page for a VMA.
732 *
733 * @gfp:
734 * %GFP_USER user allocation.
735 * %GFP_KERNEL kernel allocations,
736 * %GFP_HIGHMEM highmem/user allocations,
737 * %GFP_FS allocation should not call back into a file system.
738 * %GFP_ATOMIC don't sleep.
739 *
740 * @vma: Pointer to VMA or NULL if not available.
741 * @addr: Virtual Address of the allocation. Must be inside the VMA.
742 *
743 * This function allocates a page from the kernel page pool and applies
744 * a NUMA policy associated with the VMA or the current process.
745 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
746 * mm_struct of the VMA to prevent it from going away. Should be used for
747 * all allocations for pages that will be mapped into
748 * user space. Returns NULL when no page can be allocated.
749 *
750 * Should be called with the mm_sem of the vma hold.
751 */
752struct page *
753alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
754{
755 struct mempolicy *pol = get_vma_policy(vma, addr);
756
757 cpuset_update_current_mems_allowed();
758
759 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
760 unsigned nid;
761 if (vma) {
762 unsigned long off;
763 BUG_ON(addr >= vma->vm_end);
764 BUG_ON(addr < vma->vm_start);
765 off = vma->vm_pgoff;
766 off += (addr - vma->vm_start) >> PAGE_SHIFT;
767 nid = offset_il_node(pol, vma, off);
768 } else {
769 /* fall back to process interleaving */
770 nid = interleave_nodes(pol);
771 }
772 return alloc_page_interleave(gfp, 0, nid);
773 }
774 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
775}
776
777/**
778 * alloc_pages_current - Allocate pages.
779 *
780 * @gfp:
781 * %GFP_USER user allocation,
782 * %GFP_KERNEL kernel allocation,
783 * %GFP_HIGHMEM highmem allocation,
784 * %GFP_FS don't call back into a file system.
785 * %GFP_ATOMIC don't sleep.
786 * @order: Power of two of allocation size in pages. 0 is a single page.
787 *
788 * Allocate a page from the kernel page pool. When not in
789 * interrupt context and apply the current process NUMA policy.
790 * Returns NULL when no page can be allocated.
791 *
792 * Don't call cpuset_update_current_mems_allowed() unless
793 * 1) it's ok to take cpuset_sem (can WAIT), and
794 * 2) allocating for current task (not interrupt).
795 */
796struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
797{
798 struct mempolicy *pol = current->mempolicy;
799
800 if ((gfp & __GFP_WAIT) && !in_interrupt())
801 cpuset_update_current_mems_allowed();
802 if (!pol || in_interrupt())
803 pol = &default_policy;
804 if (pol->policy == MPOL_INTERLEAVE)
805 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
806 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
807}
808EXPORT_SYMBOL(alloc_pages_current);
809
810/* Slow path of a mempolicy copy */
811struct mempolicy *__mpol_copy(struct mempolicy *old)
812{
813 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
814
815 if (!new)
816 return ERR_PTR(-ENOMEM);
817 *new = *old;
818 atomic_set(&new->refcnt, 1);
819 if (new->policy == MPOL_BIND) {
820 int sz = ksize(old->v.zonelist);
821 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
822 if (!new->v.zonelist) {
823 kmem_cache_free(policy_cache, new);
824 return ERR_PTR(-ENOMEM);
825 }
826 memcpy(new->v.zonelist, old->v.zonelist, sz);
827 }
828 return new;
829}
830
831/* Slow path of a mempolicy comparison */
832int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
833{
834 if (!a || !b)
835 return 0;
836 if (a->policy != b->policy)
837 return 0;
838 switch (a->policy) {
839 case MPOL_DEFAULT:
840 return 1;
841 case MPOL_INTERLEAVE:
842 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
843 case MPOL_PREFERRED:
844 return a->v.preferred_node == b->v.preferred_node;
845 case MPOL_BIND: {
846 int i;
847 for (i = 0; a->v.zonelist->zones[i]; i++)
848 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
849 return 0;
850 return b->v.zonelist->zones[i] == NULL;
851 }
852 default:
853 BUG();
854 return 0;
855 }
856}
857
858/* Slow path of a mpol destructor. */
859void __mpol_free(struct mempolicy *p)
860{
861 if (!atomic_dec_and_test(&p->refcnt))
862 return;
863 if (p->policy == MPOL_BIND)
864 kfree(p->v.zonelist);
865 p->policy = MPOL_DEFAULT;
866 kmem_cache_free(policy_cache, p);
867}
868
869/*
870 * Hugetlb policy. Same as above, just works with node numbers instead of
871 * zonelists.
872 */
873
874/* Find first node suitable for an allocation */
875int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
876{
877 struct mempolicy *pol = get_vma_policy(vma, addr);
878
879 switch (pol->policy) {
880 case MPOL_DEFAULT:
881 return numa_node_id();
882 case MPOL_BIND:
883 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
884 case MPOL_INTERLEAVE:
885 return interleave_nodes(pol);
886 case MPOL_PREFERRED:
887 return pol->v.preferred_node >= 0 ?
888 pol->v.preferred_node : numa_node_id();
889 }
890 BUG();
891 return 0;
892}
893
894/* Find secondary valid nodes for an allocation */
895int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
896{
897 struct mempolicy *pol = get_vma_policy(vma, addr);
898
899 switch (pol->policy) {
900 case MPOL_PREFERRED:
901 case MPOL_DEFAULT:
902 case MPOL_INTERLEAVE:
903 return 1;
904 case MPOL_BIND: {
905 struct zone **z;
906 for (z = pol->v.zonelist->zones; *z; z++)
907 if ((*z)->zone_pgdat->node_id == nid)
908 return 1;
909 return 0;
910 }
911 default:
912 BUG();
913 return 0;
914 }
915}
916
917/*
918 * Shared memory backing store policy support.
919 *
920 * Remember policies even when nobody has shared memory mapped.
921 * The policies are kept in Red-Black tree linked from the inode.
922 * They are protected by the sp->lock spinlock, which should be held
923 * for any accesses to the tree.
924 */
925
926/* lookup first element intersecting start-end */
927/* Caller holds sp->lock */
928static struct sp_node *
929sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
930{
931 struct rb_node *n = sp->root.rb_node;
932
933 while (n) {
934 struct sp_node *p = rb_entry(n, struct sp_node, nd);
935
936 if (start >= p->end)
937 n = n->rb_right;
938 else if (end <= p->start)
939 n = n->rb_left;
940 else
941 break;
942 }
943 if (!n)
944 return NULL;
945 for (;;) {
946 struct sp_node *w = NULL;
947 struct rb_node *prev = rb_prev(n);
948 if (!prev)
949 break;
950 w = rb_entry(prev, struct sp_node, nd);
951 if (w->end <= start)
952 break;
953 n = prev;
954 }
955 return rb_entry(n, struct sp_node, nd);
956}
957
958/* Insert a new shared policy into the list. */
959/* Caller holds sp->lock */
960static void sp_insert(struct shared_policy *sp, struct sp_node *new)
961{
962 struct rb_node **p = &sp->root.rb_node;
963 struct rb_node *parent = NULL;
964 struct sp_node *nd;
965
966 while (*p) {
967 parent = *p;
968 nd = rb_entry(parent, struct sp_node, nd);
969 if (new->start < nd->start)
970 p = &(*p)->rb_left;
971 else if (new->end > nd->end)
972 p = &(*p)->rb_right;
973 else
974 BUG();
975 }
976 rb_link_node(&new->nd, parent, p);
977 rb_insert_color(&new->nd, &sp->root);
978 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
979 new->policy ? new->policy->policy : 0);
980}
981
982/* Find shared policy intersecting idx */
983struct mempolicy *
984mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
985{
986 struct mempolicy *pol = NULL;
987 struct sp_node *sn;
988
989 if (!sp->root.rb_node)
990 return NULL;
991 spin_lock(&sp->lock);
992 sn = sp_lookup(sp, idx, idx+1);
993 if (sn) {
994 mpol_get(sn->policy);
995 pol = sn->policy;
996 }
997 spin_unlock(&sp->lock);
998 return pol;
999}
1000
1001static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1002{
1003 PDprintk("deleting %lx-l%x\n", n->start, n->end);
1004 rb_erase(&n->nd, &sp->root);
1005 mpol_free(n->policy);
1006 kmem_cache_free(sn_cache, n);
1007}
1008
1009struct sp_node *
1010sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1011{
1012 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1013
1014 if (!n)
1015 return NULL;
1016 n->start = start;
1017 n->end = end;
1018 mpol_get(pol);
1019 n->policy = pol;
1020 return n;
1021}
1022
1023/* Replace a policy range. */
/*
 * Makes [start, end) in @sp carry the policy node @new; with @new == NULL
 * the range is only cleared. Existing nodes that overlap the range are
 * deleted when fully covered, trimmed when they stick out on one side,
 * and split in two when a single old node spans the whole new range.
 *
 * The split needs a second node (new2). sp_alloc() allocates with
 * GFP_KERNEL, so sp->lock is released for the allocation and the scan is
 * restarted afterwards. Returns 0, or -ENOMEM if new2 cannot be
 * allocated (in which case @new has not been inserted).
 */
1024static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1025 unsigned long end, struct sp_node *new)
1026{
1027 struct sp_node *n, *new2 = NULL;
1028
1029restart:
1030 spin_lock(&sp->lock);
1031 n = sp_lookup(sp, start, end);
1032 /* Take care of old policies in the same range. */
1033 while (n && n->start < end) {
/* fetched before n may be deleted below */
1034 struct rb_node *next = rb_next(&n->nd);
1035 if (n->start >= start) {
/* old node starts inside the range: drop it or trim its head */
1036 if (n->end <= end)
1037 sp_delete(sp, n);
1038 else
1039 n->start = end;
1040 } else {
1041 /* Old policy spanning whole new range. */
1042 if (n->end > end) {
1043 if (!new2) {
/*
 * NOTE(review): n->end and n->policy are read after sp->lock has been
 * released - confirm nothing can free or change n concurrently.
 */
1044 spin_unlock(&sp->lock);
1045 new2 = sp_alloc(end, n->end, n->policy);
1046 if (!new2)
1047 return -ENOMEM;
1048 goto restart;
1049 }
/* old node keeps the part before start, new2 the part after end */
1050 n->end = start;
1051 sp_insert(sp, new2);
1052 new2 = NULL;
1053 break;
1054 } else
1055 n->end = start;
1056 }
1057 if (!next)
1058 break;
1059 n = rb_entry(next, struct sp_node, nd);
1060 }
1061 if (new)
1062 sp_insert(sp, new);
1063 spin_unlock(&sp->lock);
/* new2 was allocated but not inserted on the second pass: release it */
1064 if (new2) {
1065 mpol_free(new2->policy);
1066 kmem_cache_free(sn_cache, new2);
1067 }
1068 return 0;
1069}
1070
1071int mpol_set_shared_policy(struct shared_policy *info,
1072 struct vm_area_struct *vma, struct mempolicy *npol)
1073{
1074 int err;
1075 struct sp_node *new = NULL;
1076 unsigned long sz = vma_pages(vma);
1077
1078 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1079 vma->vm_pgoff,
1080 sz, npol? npol->policy : -1,
1081 npol ? npol->v.nodes[0] : -1);
1082
1083 if (npol) {
1084 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1085 if (!new)
1086 return -ENOMEM;
1087 }
1088 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1089 if (err && new)
1090 kmem_cache_free(sn_cache, new);
1091 return err;
1092}
1093
1094/* Free a backing policy store on inode delete. */
1095void mpol_free_shared_policy(struct shared_policy *p)
1096{
1097 struct sp_node *n;
1098 struct rb_node *next;
1099
1100 if (!p->root.rb_node)
1101 return;
1102 spin_lock(&p->lock);
1103 next = rb_first(&p->root);
1104 while (next) {
1105 n = rb_entry(next, struct sp_node, nd);
1106 next = rb_next(&n->nd);
1107 mpol_free(n->policy);
1108 kmem_cache_free(sn_cache, n);
1109 }
1110 spin_unlock(&p->lock);
1111 p->root = RB_ROOT;
1112}
1113
1114/* assumes fs == KERNEL_DS */
1115void __init numa_policy_init(void)
1116{
1117 policy_cache = kmem_cache_create("numa_policy",
1118 sizeof(struct mempolicy),
1119 0, SLAB_PANIC, NULL, NULL);
1120
1121 sn_cache = kmem_cache_create("shared_policy_node",
1122 sizeof(struct sp_node),
1123 0, SLAB_PANIC, NULL, NULL);
1124
1125 /* Set interleaving policy for system init. This way not all
1126 the data structures allocated at system boot end up in node zero. */
1127
1128 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1129 MAX_NUMNODES) < 0)
1130 printk("numa_policy_init: interleaving failed\n");
1131}
1132
1133/* Reset policy of current process to default.
1134 * Assumes fs == KERNEL_DS */
1135void numa_default_policy(void)
1136{
1137 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
1138}
diff --git a/mm/mempool.c b/mm/mempool.c
new file mode 100644
index 00000000000..b014ffeaa41
--- /dev/null
+++ b/mm/mempool.c
@@ -0,0 +1,290 @@
1/*
2 * linux/mm/mempool.c
3 *
4 * memory buffer pool support. Such pools are mostly used
5 * for guaranteed, deadlock-free memory allocations during
6 * extreme VM load.
7 *
8 * started by Ingo Molnar, Copyright (C) 2001
9 */
10
11#include <linux/mm.h>
12#include <linux/slab.h>
13#include <linux/module.h>
14#include <linux/mempool.h>
15#include <linux/blkdev.h>
16#include <linux/writeback.h>
17
18static void add_element(mempool_t *pool, void *element)
19{
20 BUG_ON(pool->curr_nr >= pool->min_nr);
21 pool->elements[pool->curr_nr++] = element;
22}
23
24static void *remove_element(mempool_t *pool)
25{
26 BUG_ON(pool->curr_nr <= 0);
27 return pool->elements[--pool->curr_nr];
28}
29
30static void free_pool(mempool_t *pool)
31{
32 while (pool->curr_nr) {
33 void *element = remove_element(pool);
34 pool->free(element, pool->pool_data);
35 }
36 kfree(pool->elements);
37 kfree(pool);
38}
39
40/**
41 * mempool_create - create a memory pool
42 * @min_nr: the minimum number of elements guaranteed to be
43 * allocated for this pool.
44 * @alloc_fn: user-defined element-allocation function.
45 * @free_fn: user-defined element-freeing function.
46 * @pool_data: optional private data available to the user-defined functions.
47 *
48 * this function creates and allocates a guaranteed size, preallocated
49 * memory pool. The pool can be used from the mempool_alloc and mempool_free
50 * functions. This function might sleep. Both the alloc_fn() and the free_fn()
51 * functions might sleep - as long as the mempool_alloc function is not called
52 * from IRQ contexts.
53 */
54mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
55 mempool_free_t *free_fn, void *pool_data)
56{
57 mempool_t *pool;
58
59 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
60 if (!pool)
61 return NULL;
62 memset(pool, 0, sizeof(*pool));
63 pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
64 if (!pool->elements) {
65 kfree(pool);
66 return NULL;
67 }
68 spin_lock_init(&pool->lock);
69 pool->min_nr = min_nr;
70 pool->pool_data = pool_data;
71 init_waitqueue_head(&pool->wait);
72 pool->alloc = alloc_fn;
73 pool->free = free_fn;
74
75 /*
76 * First pre-allocate the guaranteed number of buffers.
77 */
78 while (pool->curr_nr < pool->min_nr) {
79 void *element;
80
81 element = pool->alloc(GFP_KERNEL, pool->pool_data);
82 if (unlikely(!element)) {
83 free_pool(pool);
84 return NULL;
85 }
86 add_element(pool, element);
87 }
88 return pool;
89}
90EXPORT_SYMBOL(mempool_create);
91
92/**
93 * mempool_resize - resize an existing memory pool
94 * @pool: pointer to the memory pool which was allocated via
95 * mempool_create().
96 * @new_min_nr: the new minimum number of elements guaranteed to be
97 * allocated for this pool.
98 * @gfp_mask: the usual allocation bitmask.
99 *
100 * This function shrinks/grows the pool. In the case of growing,
101 * it cannot be guaranteed that the pool will be grown to the new
102 * size immediately, but new mempool_free() calls will refill it.
103 *
104 * Note, the caller must guarantee that no mempool_destroy is called
105 * while this function is running. mempool_alloc() & mempool_free()
106 * might be called (eg. from IRQ contexts) while this function executes.
 *
 * Returns -ENOMEM if the larger element array cannot be allocated and 0
 * otherwise, including when refilling the grown pool stops early because
 * an element allocation failed.
107 */
108int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask)
109{
110 void *element;
111 void **new_elements;
112 unsigned long flags;
113
114 BUG_ON(new_min_nr <= 0);
115
116 spin_lock_irqsave(&pool->lock, flags);
117 if (new_min_nr <= pool->min_nr) {
/* Shrink: free surplus elements, dropping the lock around each free */
118 while (new_min_nr < pool->curr_nr) {
119 element = remove_element(pool);
120 spin_unlock_irqrestore(&pool->lock, flags);
121 pool->free(element, pool->pool_data);
122 spin_lock_irqsave(&pool->lock, flags);
123 }
124 pool->min_nr = new_min_nr;
125 goto out_unlock;
126 }
127 spin_unlock_irqrestore(&pool->lock, flags);
128
129 /* Grow the pool */
130 new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
131 if (!new_elements)
132 return -ENOMEM;
133
/* min_nr may have changed while the lock was dropped, so check again */
134 spin_lock_irqsave(&pool->lock, flags);
135 if (unlikely(new_min_nr <= pool->min_nr)) {
136 /* Raced, other resize will do our work */
137 spin_unlock_irqrestore(&pool->lock, flags);
138 kfree(new_elements);
139 goto out;
140 }
/* move the currently reserved elements over to the larger array */
141 memcpy(new_elements, pool->elements,
142 pool->curr_nr * sizeof(*new_elements));
143 kfree(pool->elements);
144 pool->elements = new_elements;
145 pool->min_nr = new_min_nr;
146
/* Refill up to the new minimum; the lock is not held across alloc() */
147 while (pool->curr_nr < pool->min_nr) {
148 spin_unlock_irqrestore(&pool->lock, flags);
149 element = pool->alloc(gfp_mask, pool->pool_data);
150 if (!element)
151 goto out;
152 spin_lock_irqsave(&pool->lock, flags);
153 if (pool->curr_nr < pool->min_nr) {
154 add_element(pool, element);
155 } else {
156 spin_unlock_irqrestore(&pool->lock, flags);
157 pool->free(element, pool->pool_data); /* Raced */
158 goto out;
159 }
160 }
/* out_unlock is reached with the lock held, out with it released */
161out_unlock:
162 spin_unlock_irqrestore(&pool->lock, flags);
163out:
164 return 0;
165}
166EXPORT_SYMBOL(mempool_resize);
167
168/**
169 * mempool_destroy - deallocate a memory pool
170 * @pool: pointer to the memory pool which was allocated via
171 * mempool_create().
172 *
173 * this function only sleeps if the free_fn() function sleeps. The caller
174 * has to guarantee that all elements have been returned to the pool (ie:
175 * freed) prior to calling mempool_destroy().
176 */
177void mempool_destroy(mempool_t *pool)
178{
179 if (pool->curr_nr != pool->min_nr)
180 BUG(); /* There were outstanding elements */
181 free_pool(pool);
182}
183EXPORT_SYMBOL(mempool_destroy);
184
185/**
186 * mempool_alloc - allocate an element from a specific memory pool
187 * @pool: pointer to the memory pool which was allocated via
188 * mempool_create().
189 * @gfp_mask: the usual allocation bitmask.
190 *
191 * this function only sleeps if the alloc_fn function sleeps or
192 * returns NULL. Note that due to preallocation, this function
193 * *never* fails when called from process contexts. (it might
194 * fail if called from an IRQ context.)
195 */
196void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
197{
198 void *element;
199 unsigned long flags;
200 DEFINE_WAIT(wait);
/* same mask with __GFP_WAIT and __GFP_IO cleared, for the first attempt */
201 int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
202
203 might_sleep_if(gfp_mask & __GFP_WAIT);
204repeat_alloc:
/* first try the underlying allocator with the reduced mask */
205 element = pool->alloc(gfp_nowait|__GFP_NOWARN, pool->pool_data);
206 if (likely(element != NULL))
207 return element;
208
209 /*
210 * If the pool is less than 50% full and we can perform effective
211 * page reclaim then try harder to allocate an element.
212 */
213 mb();
214 if ((gfp_mask & __GFP_FS) && (gfp_mask != gfp_nowait) &&
215 (pool->curr_nr <= pool->min_nr/2)) {
216 element = pool->alloc(gfp_mask, pool->pool_data);
217 if (likely(element != NULL))
218 return element;
219 }
220
221 /*
222 * Kick the VM at this point.
223 */
224 wakeup_bdflush(0);
225
/* fall back to the preallocated reserve */
226 spin_lock_irqsave(&pool->lock, flags);
227 if (likely(pool->curr_nr)) {
228 element = remove_element(pool);
229 spin_unlock_irqrestore(&pool->lock, flags);
230 return element;
231 }
232 spin_unlock_irqrestore(&pool->lock, flags);
233
234 /* We must not sleep in the GFP_ATOMIC case */
235 if (!(gfp_mask & __GFP_WAIT))
236 return NULL;
237
/*
 * Queue on pool->wait (woken from mempool_free()), re-check curr_nr and
 * sleep only if the reserve is still empty, then retry from the top.
 */
238 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
239 mb();
240 if (!pool->curr_nr)
241 io_schedule();
242 finish_wait(&pool->wait, &wait);
243
244 goto repeat_alloc;
245}
246EXPORT_SYMBOL(mempool_alloc);
247
248/**
249 * mempool_free - return an element to the pool.
250 * @element: pool element pointer.
251 * @pool: pointer to the memory pool which was allocated via
252 * mempool_create().
253 *
254 * this function only sleeps if the free_fn() function sleeps.
255 */
256void mempool_free(void *element, mempool_t *pool)
257{
258 unsigned long flags;
259
260 mb();
261 if (pool->curr_nr < pool->min_nr) {
262 spin_lock_irqsave(&pool->lock, flags);
263 if (pool->curr_nr < pool->min_nr) {
264 add_element(pool, element);
265 spin_unlock_irqrestore(&pool->lock, flags);
266 wake_up(&pool->wait);
267 return;
268 }
269 spin_unlock_irqrestore(&pool->lock, flags);
270 }
271 pool->free(element, pool->pool_data);
272}
273EXPORT_SYMBOL(mempool_free);
274
275/*
276 * A commonly used alloc and free fn.
277 */
278void *mempool_alloc_slab(unsigned int __nocast gfp_mask, void *pool_data)
279{
280 kmem_cache_t *mem = (kmem_cache_t *) pool_data;
281 return kmem_cache_alloc(mem, gfp_mask);
282}
283EXPORT_SYMBOL(mempool_alloc_slab);
284
285void mempool_free_slab(void *element, void *pool_data)
286{
287 kmem_cache_t *mem = (kmem_cache_t *) pool_data;
288 kmem_cache_free(mem, element);
289}
290EXPORT_SYMBOL(mempool_free_slab);
diff --git a/mm/mincore.c b/mm/mincore.c
new file mode 100644
index 00000000000..07833dc5829
--- /dev/null
+++ b/mm/mincore.c
@@ -0,0 +1,191 @@
1/*
2 * linux/mm/mincore.c
3 *
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
6
7/*
8 * The mincore() system call.
9 */
10#include <linux/slab.h>
11#include <linux/pagemap.h>
12#include <linux/mm.h>
13#include <linux/mman.h>
14#include <linux/syscalls.h>
15
16#include <asm/uaccess.h>
17#include <asm/pgtable.h>
18
19/*
20 * Later we can get more picky about what "in core" means precisely.
21 * For now, simply check to see if the page is in the page cache,
22 * and is up to date; i.e. that no page-in operation would be required
23 * at this time if an application were to map and access this page.
24 */
25static unsigned char mincore_page(struct vm_area_struct * vma,
26 unsigned long pgoff)
27{
28 unsigned char present = 0;
29 struct address_space * as = vma->vm_file->f_mapping;
30 struct page * page;
31
32 page = find_get_page(as, pgoff);
33 if (page) {
34 present = PageUptodate(page);
35 page_cache_release(page);
36 }
37
38 return present;
39}
40
41static long mincore_vma(struct vm_area_struct * vma,
42 unsigned long start, unsigned long end, unsigned char __user * vec)
43{
44 long error, i, remaining;
45 unsigned char * tmp;
46
47 error = -ENOMEM;
48 if (!vma->vm_file)
49 return error;
50
51 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
52 if (end > vma->vm_end)
53 end = vma->vm_end;
54 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
55
56 error = -EAGAIN;
57 tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
58 if (!tmp)
59 return error;
60
61 /* (end - start) is # of pages, and also # of bytes in "vec */
62 remaining = (end - start),
63
64 error = 0;
65 for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
66 int j = 0;
67 long thispiece = (remaining < PAGE_SIZE) ?
68 remaining : PAGE_SIZE;
69
70 while (j < thispiece)
71 tmp[j++] = mincore_page(vma, start++);
72
73 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
74 error = -EFAULT;
75 break;
76 }
77 }
78
79 free_page((unsigned long) tmp);
80 return error;
81}
82
83/*
84 * The mincore(2) system call.
85 *
86 * mincore() returns the memory residency status of the pages in the
87 * current process's address space specified by [addr, addr + len).
88 * The status is returned in a vector of bytes. The least significant
89 * bit of each byte is 1 if the referenced page is in memory, otherwise
90 * it is zero.
91 *
92 * Because the status of a page can change after mincore() checks it
93 * but before it returns to the application, the returned vector may
94 * contain stale information. Only locked pages are guaranteed to
95 * remain in memory.
96 *
97 * return values:
98 * zero - success
99 * -EFAULT - vec points to an illegal address
100 * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
101 * -ENOMEM - Addresses in the range [addr, addr + len] are
102 * invalid for the address space of this process, or
103 * specify one or more pages which are not currently
104 * mapped
105 * -EAGAIN - A kernel resource was temporarily unavailable.
106 */
107asmlinkage long sys_mincore(unsigned long start, size_t len,
108 unsigned char __user * vec)
109{
110 int index = 0;
111 unsigned long end, limit;
112 struct vm_area_struct * vma;
113 size_t max;
114 int unmapped_error = 0;
115 long error;
116
117 /* check the arguments */
118 if (start & ~PAGE_CACHE_MASK)
119 goto einval;
120
121 if (start < FIRST_USER_PGD_NR * PGDIR_SIZE)
122 goto enomem;
123
124 limit = TASK_SIZE;
125 if (start >= limit)
126 goto enomem;
127
128 if (!len)
129 return 0;
130
131 max = limit - start;
132 len = PAGE_CACHE_ALIGN(len);
133 if (len > max || !len)
134 goto enomem;
135
136 end = start + len;
137
138 /* check the output buffer whilst holding the lock */
139 error = -EFAULT;
140 down_read(&current->mm->mmap_sem);
141
142 if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT))
143 goto out;
144
145 /*
146 * If the interval [start,end) covers some unmapped address
147 * ranges, just ignore them, but return -ENOMEM at the end.
148 */
149 error = 0;
150
151 vma = find_vma(current->mm, start);
152 while (vma) {
153 /* Here start < vma->vm_end. */
154 if (start < vma->vm_start) {
155 unmapped_error = -ENOMEM;
156 start = vma->vm_start;
157 }
158
159 /* Here vma->vm_start <= start < vma->vm_end. */
160 if (end <= vma->vm_end) {
161 if (start < end) {
162 error = mincore_vma(vma, start, end,
163 &vec[index]);
164 if (error)
165 goto out;
166 }
167 error = unmapped_error;
168 goto out;
169 }
170
171 /* Here vma->vm_start <= start < vma->vm_end < end. */
172 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
173 if (error)
174 goto out;
175 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
176 start = vma->vm_end;
177 vma = vma->vm_next;
178 }
179
180 /* we found a hole in the area queried if we arrive here */
181 error = -ENOMEM;
182
183out:
184 up_read(&current->mm->mmap_sem);
185 return error;
186
187einval:
188 return -EINVAL;
189enomem:
190 return -ENOMEM;
191}
diff --git a/mm/mlock.c b/mm/mlock.c
new file mode 100644
index 00000000000..4ae3a46ff76
--- /dev/null
+++ b/mm/mlock.c
@@ -0,0 +1,253 @@
1/*
2 * linux/mm/mlock.c
3 *
4 * (C) Copyright 1995 Linus Torvalds
5 * (C) Copyright 2002 Christoph Hellwig
6 */
7
8#include <linux/mman.h>
9#include <linux/mm.h>
10#include <linux/mempolicy.h>
11#include <linux/syscalls.h>
12
13
14static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
15 unsigned long start, unsigned long end, unsigned int newflags)
16{
17 struct mm_struct * mm = vma->vm_mm;
18 pgoff_t pgoff;
19 int pages;
20 int ret = 0;
21
22 if (newflags == vma->vm_flags) {
23 *prev = vma;
24 goto out;
25 }
26
27 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
28 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
29 vma->vm_file, pgoff, vma_policy(vma));
30 if (*prev) {
31 vma = *prev;
32 goto success;
33 }
34
35 *prev = vma;
36
37 if (start != vma->vm_start) {
38 ret = split_vma(mm, vma, start, 1);
39 if (ret)
40 goto out;
41 }
42
43 if (end != vma->vm_end) {
44 ret = split_vma(mm, vma, end, 0);
45 if (ret)
46 goto out;
47 }
48
49success:
50 /*
51 * vm_flags is protected by the mmap_sem held in write mode.
52 * It's okay if try_to_unmap_one unmaps a page just after we
53 * set VM_LOCKED, make_pages_present below will bring it back.
54 */
55 vma->vm_flags = newflags;
56
57 /*
58 * Keep track of amount of locked VM.
59 */
60 pages = (end - start) >> PAGE_SHIFT;
61 if (newflags & VM_LOCKED) {
62 pages = -pages;
63 if (!(newflags & VM_IO))
64 ret = make_pages_present(start, end);
65 }
66
67 vma->vm_mm->locked_vm -= pages;
68out:
69 if (ret == -ENOMEM)
70 ret = -EAGAIN;
71 return ret;
72}
73
74static int do_mlock(unsigned long start, size_t len, int on)
75{
76 unsigned long nstart, end, tmp;
77 struct vm_area_struct * vma, * prev;
78 int error;
79
80 len = PAGE_ALIGN(len);
81 end = start + len;
82 if (end < start)
83 return -EINVAL;
84 if (end == start)
85 return 0;
86 vma = find_vma_prev(current->mm, start, &prev);
87 if (!vma || vma->vm_start > start)
88 return -ENOMEM;
89
90 if (start > vma->vm_start)
91 prev = vma;
92
93 for (nstart = start ; ; ) {
94 unsigned int newflags;
95
96 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
97
98 newflags = vma->vm_flags | VM_LOCKED;
99 if (!on)
100 newflags &= ~VM_LOCKED;
101
102 tmp = vma->vm_end;
103 if (tmp > end)
104 tmp = end;
105 error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
106 if (error)
107 break;
108 nstart = tmp;
109 if (nstart < prev->vm_end)
110 nstart = prev->vm_end;
111 if (nstart >= end)
112 break;
113
114 vma = prev->vm_next;
115 if (!vma || vma->vm_start != nstart) {
116 error = -ENOMEM;
117 break;
118 }
119 }
120 return error;
121}
122
123asmlinkage long sys_mlock(unsigned long start, size_t len)
124{
125 unsigned long locked;
126 unsigned long lock_limit;
127 int error = -ENOMEM;
128
129 if (!can_do_mlock())
130 return -EPERM;
131
132 down_write(&current->mm->mmap_sem);
133 len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
134 start &= PAGE_MASK;
135
136 locked = len >> PAGE_SHIFT;
137 locked += current->mm->locked_vm;
138
139 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
140 lock_limit >>= PAGE_SHIFT;
141
142 /* check against resource limits */
143 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
144 error = do_mlock(start, len, 1);
145 up_write(&current->mm->mmap_sem);
146 return error;
147}
148
149asmlinkage long sys_munlock(unsigned long start, size_t len)
150{
151 int ret;
152
153 down_write(&current->mm->mmap_sem);
154 len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
155 start &= PAGE_MASK;
156 ret = do_mlock(start, len, 0);
157 up_write(&current->mm->mmap_sem);
158 return ret;
159}
160
161static int do_mlockall(int flags)
162{
163 struct vm_area_struct * vma, * prev = NULL;
164 unsigned int def_flags = 0;
165
166 if (flags & MCL_FUTURE)
167 def_flags = VM_LOCKED;
168 current->mm->def_flags = def_flags;
169 if (flags == MCL_FUTURE)
170 goto out;
171
172 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
173 unsigned int newflags;
174
175 newflags = vma->vm_flags | VM_LOCKED;
176 if (!(flags & MCL_CURRENT))
177 newflags &= ~VM_LOCKED;
178
179 /* Ignore errors */
180 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
181 }
182out:
183 return 0;
184}
185
186asmlinkage long sys_mlockall(int flags)
187{
188 unsigned long lock_limit;
189 int ret = -EINVAL;
190
191 if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
192 goto out;
193
194 ret = -EPERM;
195 if (!can_do_mlock())
196 goto out;
197
198 down_write(&current->mm->mmap_sem);
199
200 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
201 lock_limit >>= PAGE_SHIFT;
202
203 ret = -ENOMEM;
204 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
205 capable(CAP_IPC_LOCK))
206 ret = do_mlockall(flags);
207 up_write(&current->mm->mmap_sem);
208out:
209 return ret;
210}
211
212asmlinkage long sys_munlockall(void)
213{
214 int ret;
215
216 down_write(&current->mm->mmap_sem);
217 ret = do_mlockall(0);
218 up_write(&current->mm->mmap_sem);
219 return ret;
220}
221
222/*
223 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
224 * shm segments) get accounted against the user_struct instead.
225 */
226static DEFINE_SPINLOCK(shmlock_user_lock);
227
228int user_shm_lock(size_t size, struct user_struct *user)
229{
230 unsigned long lock_limit, locked;
231 int allowed = 0;
232
233 locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
234 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
235 lock_limit >>= PAGE_SHIFT;
236 spin_lock(&shmlock_user_lock);
237 if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
238 goto out;
239 get_uid(user);
240 user->locked_shm += locked;
241 allowed = 1;
242out:
243 spin_unlock(&shmlock_user_lock);
244 return allowed;
245}
246
247void user_shm_unlock(size_t size, struct user_struct *user)
248{
249 spin_lock(&shmlock_user_lock);
250 user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
251 spin_unlock(&shmlock_user_lock);
252 free_uid(user);
253}
diff --git a/mm/mmap.c b/mm/mmap.c
new file mode 100644
index 00000000000..a95ebda2744
--- /dev/null
+++ b/mm/mmap.c
@@ -0,0 +1,2082 @@
1/*
2 * mm/mmap.c
3 *
4 * Written by obz.
5 *
6 * Address space accounting code <alan@redhat.com>
7 */
8
9#include <linux/slab.h>
10#include <linux/mm.h>
11#include <linux/shm.h>
12#include <linux/mman.h>
13#include <linux/pagemap.h>
14#include <linux/swap.h>
15#include <linux/syscalls.h>
16#include <linux/init.h>
17#include <linux/file.h>
18#include <linux/fs.h>
19#include <linux/personality.h>
20#include <linux/security.h>
21#include <linux/hugetlb.h>
22#include <linux/profile.h>
23#include <linux/module.h>
24#include <linux/mount.h>
25#include <linux/mempolicy.h>
26#include <linux/rmap.h>
27
28#include <asm/uaccess.h>
29#include <asm/cacheflush.h>
30#include <asm/tlb.h>
31
32/*
33 * WARNING: the debugging will use recursive algorithms so never enable this
34 * unless you know what you are doing.
35 */
36#undef DEBUG_MM_RB
37
/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type	prot
 *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
 * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 */
/*
 * Sixteen page-protection values: the eight __Pxxx entries followed by
 * the eight __Sxxx entries.
 * NOTE(review): presumably indexed by the low protection/shared bits of
 * vm_flags -- confirm against the users of this table.
 */
pgprot_t protection_map[16] = {
	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};

/* Overcommit policy consulted by __vm_enough_memory(). */
int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
/* Percentage of RAM counted as committable in the strict policy. */
int sysctl_overcommit_ratio = 50;	/* default is 50% */
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
/* Running total adjusted through vm_acct_memory()/vm_unacct_memory(). */
atomic_t vm_committed_space = ATOMIC_INIT(0);
62
63/*
64 * Check that a process has enough memory to allocate a new virtual
65 * mapping. 0 means there is enough memory for the allocation to
66 * succeed and -ENOMEM implies there is not.
67 *
68 * We currently support three overcommit policies, which are set via the
69 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
70 *
71 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
72 * Additional code 2002 Jul 20 by Robert Love.
73 *
74 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
75 *
76 * Note this is a helper function intended to be used by LSMs which
77 * wish to use this logic.
78 */
79int __vm_enough_memory(long pages, int cap_sys_admin)
80{
81 unsigned long free, allowed;
82
83 vm_acct_memory(pages);
84
85 /*
86 * Sometimes we want to use more memory than we have
87 */
88 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
89 return 0;
90
91 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
92 unsigned long n;
93
94 free = get_page_cache_size();
95 free += nr_swap_pages;
96
97 /*
98 * Any slabs which are created with the
99 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
100 * which are reclaimable, under pressure. The dentry
101 * cache and most inode caches should fall into this
102 */
103 free += atomic_read(&slab_reclaim_pages);
104
105 /*
106 * Leave the last 3% for root
107 */
108 if (!cap_sys_admin)
109 free -= free / 32;
110
111 if (free > pages)
112 return 0;
113
114 /*
115 * nr_free_pages() is very expensive on large systems,
116 * only call if we're about to fail.
117 */
118 n = nr_free_pages();
119 if (!cap_sys_admin)
120 n -= n / 32;
121 free += n;
122
123 if (free > pages)
124 return 0;
125 vm_unacct_memory(pages);
126 return -ENOMEM;
127 }
128
129 allowed = (totalram_pages - hugetlb_total_pages())
130 * sysctl_overcommit_ratio / 100;
131 /*
132 * Leave the last 3% for root
133 */
134 if (!cap_sys_admin)
135 allowed -= allowed / 32;
136 allowed += total_swap_pages;
137
138 /* Don't let a single process grow too big:
139 leave 3% of the size of this process for other processes */
140 allowed -= current->mm->total_vm / 32;
141
142 if (atomic_read(&vm_committed_space) < allowed)
143 return 0;
144
145 vm_unacct_memory(pages);
146
147 return -ENOMEM;
148}
149
150EXPORT_SYMBOL(sysctl_overcommit_memory);
151EXPORT_SYMBOL(sysctl_overcommit_ratio);
152EXPORT_SYMBOL(sysctl_max_map_count);
153EXPORT_SYMBOL(vm_committed_space);
154EXPORT_SYMBOL(__vm_enough_memory);
155
156/*
157 * Requires inode->i_mapping->i_mmap_lock
158 */
159static void __remove_shared_vm_struct(struct vm_area_struct *vma,
160 struct file *file, struct address_space *mapping)
161{
162 if (vma->vm_flags & VM_DENYWRITE)
163 atomic_inc(&file->f_dentry->d_inode->i_writecount);
164 if (vma->vm_flags & VM_SHARED)
165 mapping->i_mmap_writable--;
166
167 flush_dcache_mmap_lock(mapping);
168 if (unlikely(vma->vm_flags & VM_NONLINEAR))
169 list_del_init(&vma->shared.vm_set.list);
170 else
171 vma_prio_tree_remove(vma, &mapping->i_mmap);
172 flush_dcache_mmap_unlock(mapping);
173}
174
175/*
176 * Remove one vm structure and free it.
177 */
178static void remove_vm_struct(struct vm_area_struct *vma)
179{
180 struct file *file = vma->vm_file;
181
182 might_sleep();
183 if (file) {
184 struct address_space *mapping = file->f_mapping;
185 spin_lock(&mapping->i_mmap_lock);
186 __remove_shared_vm_struct(vma, file, mapping);
187 spin_unlock(&mapping->i_mmap_lock);
188 }
189 if (vma->vm_ops && vma->vm_ops->close)
190 vma->vm_ops->close(vma);
191 if (file)
192 fput(file);
193 anon_vma_unlink(vma);
194 mpol_free(vma_policy(vma));
195 kmem_cache_free(vm_area_cachep, vma);
196}
197
198/*
199 * sys_brk() for the most part doesn't need the global kernel
200 * lock, except when an application is doing something nasty
201 * like trying to un-brk an area that has already been mapped
202 * to a regular file. in this case, the unmapping will need
203 * to invoke file system routines that need the global lock.
204 */
205asmlinkage unsigned long sys_brk(unsigned long brk)
206{
207 unsigned long rlim, retval;
208 unsigned long newbrk, oldbrk;
209 struct mm_struct *mm = current->mm;
210
211 down_write(&mm->mmap_sem);
212
213 if (brk < mm->end_code)
214 goto out;
215 newbrk = PAGE_ALIGN(brk);
216 oldbrk = PAGE_ALIGN(mm->brk);
217 if (oldbrk == newbrk)
218 goto set_brk;
219
220 /* Always allow shrinking brk. */
221 if (brk <= mm->brk) {
222 if (!do_munmap(mm, newbrk, oldbrk-newbrk))
223 goto set_brk;
224 goto out;
225 }
226
227 /* Check against rlimit.. */
228 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
229 if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
230 goto out;
231
232 /* Check against existing mmap mappings. */
233 if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
234 goto out;
235
236 /* Ok, looks good - let it rip. */
237 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
238 goto out;
239set_brk:
240 mm->brk = brk;
241out:
242 retval = mm->brk;
243 up_write(&mm->mmap_sem);
244 return retval;
245}
246
247#ifdef DEBUG_MM_RB
248static int browse_rb(struct rb_root *root)
249{
250 int i = 0, j;
251 struct rb_node *nd, *pn = NULL;
252 unsigned long prev = 0, pend = 0;
253
254 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
255 struct vm_area_struct *vma;
256 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
257 if (vma->vm_start < prev)
258 printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
259 if (vma->vm_start < pend)
260 printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
261 if (vma->vm_start > vma->vm_end)
262 printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
263 i++;
264 pn = nd;
265 }
266 j = 0;
267 for (nd = pn; nd; nd = rb_prev(nd)) {
268 j++;
269 }
270 if (i != j)
271 printk("backwards %d, forwards %d\n", j, i), i = 0;
272 return i;
273}
274
275void validate_mm(struct mm_struct *mm)
276{
277 int bug = 0;
278 int i = 0;
279 struct vm_area_struct *tmp = mm->mmap;
280 while (tmp) {
281 tmp = tmp->vm_next;
282 i++;
283 }
284 if (i != mm->map_count)
285 printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
286 i = browse_rb(&mm->mm_rb);
287 if (i != mm->map_count)
288 printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
289 if (bug)
290 BUG();
291}
292#else
293#define validate_mm(mm) do { } while (0)
294#endif
295
296static struct vm_area_struct *
297find_vma_prepare(struct mm_struct *mm, unsigned long addr,
298 struct vm_area_struct **pprev, struct rb_node ***rb_link,
299 struct rb_node ** rb_parent)
300{
301 struct vm_area_struct * vma;
302 struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
303
304 __rb_link = &mm->mm_rb.rb_node;
305 rb_prev = __rb_parent = NULL;
306 vma = NULL;
307
308 while (*__rb_link) {
309 struct vm_area_struct *vma_tmp;
310
311 __rb_parent = *__rb_link;
312 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
313
314 if (vma_tmp->vm_end > addr) {
315 vma = vma_tmp;
316 if (vma_tmp->vm_start <= addr)
317 return vma;
318 __rb_link = &__rb_parent->rb_left;
319 } else {
320 rb_prev = __rb_parent;
321 __rb_link = &__rb_parent->rb_right;
322 }
323 }
324
325 *pprev = NULL;
326 if (rb_prev)
327 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
328 *rb_link = __rb_link;
329 *rb_parent = __rb_parent;
330 return vma;
331}
332
333static inline void
334__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
335 struct vm_area_struct *prev, struct rb_node *rb_parent)
336{
337 if (prev) {
338 vma->vm_next = prev->vm_next;
339 prev->vm_next = vma;
340 } else {
341 mm->mmap = vma;
342 if (rb_parent)
343 vma->vm_next = rb_entry(rb_parent,
344 struct vm_area_struct, vm_rb);
345 else
346 vma->vm_next = NULL;
347 }
348}
349
350void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
351 struct rb_node **rb_link, struct rb_node *rb_parent)
352{
353 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
354 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
355}
356
357static inline void __vma_link_file(struct vm_area_struct *vma)
358{
359 struct file * file;
360
361 file = vma->vm_file;
362 if (file) {
363 struct address_space *mapping = file->f_mapping;
364
365 if (vma->vm_flags & VM_DENYWRITE)
366 atomic_dec(&file->f_dentry->d_inode->i_writecount);
367 if (vma->vm_flags & VM_SHARED)
368 mapping->i_mmap_writable++;
369
370 flush_dcache_mmap_lock(mapping);
371 if (unlikely(vma->vm_flags & VM_NONLINEAR))
372 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
373 else
374 vma_prio_tree_insert(vma, &mapping->i_mmap);
375 flush_dcache_mmap_unlock(mapping);
376 }
377}
378
/*
 * Link vma into the mm: first the vma list, then the rbtree, then its
 * anon_vma.  The file mapping is not touched here.
 */
static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
	struct vm_area_struct *prev, struct rb_node **rb_link,
	struct rb_node *rb_parent)
{
	/* list */
	__vma_link_list(mm, vma, prev, rb_parent);
	/* rbtree */
	__vma_link_rb(mm, vma, rb_link, rb_parent);
	/* anon_vma */
	__anon_vma_link(vma);
}
388
389static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
390 struct vm_area_struct *prev, struct rb_node **rb_link,
391 struct rb_node *rb_parent)
392{
393 struct address_space *mapping = NULL;
394
395 if (vma->vm_file)
396 mapping = vma->vm_file->f_mapping;
397
398 if (mapping) {
399 spin_lock(&mapping->i_mmap_lock);
400 vma->vm_truncate_count = mapping->truncate_count;
401 }
402 anon_vma_lock(vma);
403
404 __vma_link(mm, vma, prev, rb_link, rb_parent);
405 __vma_link_file(vma);
406
407 anon_vma_unlock(vma);
408 if (mapping)
409 spin_unlock(&mapping->i_mmap_lock);
410
411 mm->map_count++;
412 validate_mm(mm);
413}
414
415/*
416 * Helper for vma_adjust in the split_vma insert case:
417 * insert vm structure into list and rbtree and anon_vma,
418 * but it has already been inserted into prio_tree earlier.
419 */
420static void
421__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
422{
423 struct vm_area_struct * __vma, * prev;
424 struct rb_node ** rb_link, * rb_parent;
425
426 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
427 if (__vma && __vma->vm_start < vma->vm_end)
428 BUG();
429 __vma_link(mm, vma, prev, rb_link, rb_parent);
430 mm->map_count++;
431}
432
433static inline void
434__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
435 struct vm_area_struct *prev)
436{
437 prev->vm_next = vma->vm_next;
438 rb_erase(&vma->vm_rb, &mm->mm_rb);
439 if (mm->mmap_cache == vma)
440 mm->mmap_cache = prev;
441}
442
/*
 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
 * is already present in an i_mmap tree without adjusting the tree.
 * The following helper function should be used when such adjustments
 * are necessary.  The "insert" vma (if any) is to be inserted
 * before we drop the necessary locks.
 *
 * @vma:    the vma whose start/end/pgoff are being changed
 * @start:  new vm_start
 * @end:    new vm_end
 * @pgoff:  new vm_pgoff
 * @insert: a vma to link in while the locks are held, or NULL
 *
 * When insert is NULL and vma has a successor, the new end decides
 * what happens to that successor: it is removed (one or two vmas),
 * has its start moved up, or has its start moved down.
 */
void vma_adjust(struct vm_area_struct *vma, unsigned long start,
	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *next = vma->vm_next;
	/* the vma that grows at its neighbour's expense, if any */
	struct vm_area_struct *importer = NULL;
	struct address_space *mapping = NULL;
	/* prio tree to re-sort in; stays NULL for anon or nonlinear vmas */
	struct prio_tree_root *root = NULL;
	struct file *file = vma->vm_file;
	struct anon_vma *anon_vma = NULL;
	/* pages by which next's start moves (negative: moves down) */
	long adjust_next = 0;
	/* 0: keep next, 1: remove next, 2: remove next and the one after */
	int remove_next = 0;

	if (next && !insert) {
		if (end >= next->vm_end) {
			/*
			 * vma expands, overlapping all the next, and
			 * perhaps the one after too (mprotect case 6).
			 */
again:			remove_next = 1 + (end > next->vm_end);
			end = next->vm_end;
			anon_vma = next->anon_vma;
			importer = vma;
		} else if (end > next->vm_start) {
			/*
			 * vma expands, overlapping part of the next:
			 * mprotect case 5 shifting the boundary up.
			 */
			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
			anon_vma = next->anon_vma;
			importer = vma;
		} else if (end < vma->vm_end) {
			/*
			 * vma shrinks, and !insert tells it's not
			 * split_vma inserting another: so it must be
			 * mprotect case 4 shifting the boundary down.
			 */
			adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
			anon_vma = next->anon_vma;
			importer = next;
		}
	}

	if (file) {
		mapping = file->f_mapping;
		if (!(vma->vm_flags & VM_NONLINEAR))
			root = &mapping->i_mmap;
		spin_lock(&mapping->i_mmap_lock);
		if (importer &&
		    vma->vm_truncate_count != next->vm_truncate_count) {
			/*
			 * unmap_mapping_range might be in progress:
			 * ensure that the expanding vma is rescanned.
			 */
			importer->vm_truncate_count = 0;
		}
		if (insert) {
			insert->vm_truncate_count = vma->vm_truncate_count;
			/*
			 * Put into prio_tree now, so instantiated pages
			 * are visible to arm/parisc __flush_dcache_page
			 * throughout; but we cannot insert into address
			 * space until vma start or end is updated.
			 */
			__vma_link_file(insert);
		}
	}

	/*
	 * When changing only vma->vm_end, we don't really need
	 * anon_vma lock: but is that case worth optimizing out?
	 */
	/* vma's own anon_vma takes precedence over the one read from next */
	if (vma->anon_vma)
		anon_vma = vma->anon_vma;
	if (anon_vma) {
		spin_lock(&anon_vma->lock);
		/*
		 * Easily overlooked: when mprotect shifts the boundary,
		 * make sure the expanding vma has anon_vma set if the
		 * shrinking vma had, to cover any anon pages imported.
		 */
		if (importer && !importer->anon_vma) {
			importer->anon_vma = anon_vma;
			__anon_vma_link(importer);
		}
	}

	/* Take the affected vmas out of the prio tree before editing them... */
	if (root) {
		flush_dcache_mmap_lock(mapping);
		vma_prio_tree_remove(vma, root);
		if (adjust_next)
			vma_prio_tree_remove(next, root);
	}

	vma->vm_start = start;
	vma->vm_end = end;
	vma->vm_pgoff = pgoff;
	if (adjust_next) {
		next->vm_start += adjust_next << PAGE_SHIFT;
		next->vm_pgoff += adjust_next;
	}

	/* ...and put them back with their new bounds. */
	if (root) {
		if (adjust_next)
			vma_prio_tree_insert(next, root);
		vma_prio_tree_insert(vma, root);
		flush_dcache_mmap_unlock(mapping);
	}

	if (remove_next) {
		/*
		 * vma_merge has merged next into vma, and needs
		 * us to remove next before dropping the locks.
		 */
		__vma_unlink(mm, next, vma);
		if (file)
			__remove_shared_vm_struct(next, file, mapping);
		if (next->anon_vma)
			__anon_vma_merge(vma, next);
	} else if (insert) {
		/*
		 * split_vma has split insert from vma, and needs
		 * us to insert it before dropping the locks
		 * (it may either follow vma or precede it).
		 */
		__insert_vm_struct(mm, insert);
	}

	if (anon_vma)
		spin_unlock(&anon_vma->lock);
	if (mapping)
		spin_unlock(&mapping->i_mmap_lock);

	/* With the locks dropped, release what the removed vma held. */
	if (remove_next) {
		if (file)
			fput(file);
		mm->map_count--;
		mpol_free(vma_policy(next));
		kmem_cache_free(vm_area_cachep, next);
		/*
		 * In mprotect's case 6 (see comments on vma_merge),
		 * we must remove another next too. It would clutter
		 * up the code too much to do both in one go.
		 */
		if (remove_next == 2) {
			next = vma->vm_next;
			goto again;
		}
	}

	validate_mm(mm);
}
602
603/*
604 * If the vma has a ->close operation then the driver probably needs to release
605 * per-vma resources, so we don't attempt to merge those.
606 */
607#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)
608
609static inline int is_mergeable_vma(struct vm_area_struct *vma,
610 struct file *file, unsigned long vm_flags)
611{
612 if (vma->vm_flags != vm_flags)
613 return 0;
614 if (vma->vm_file != file)
615 return 0;
616 if (vma->vm_ops && vma->vm_ops->close)
617 return 0;
618 return 1;
619}
620
/*
 * Two anon_vma assignments are compatible when at least one of them is
 * unset (NULL) or when both are the same anon_vma.
 * Returns 1 if compatible, 0 otherwise.
 */
static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
					struct anon_vma *anon_vma2)
{
	if (anon_vma1 && anon_vma2)
		return anon_vma1 == anon_vma2;
	return 1;
}
626
627/*
628 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
629 * in front of (at a lower virtual address and file offset than) the vma.
630 *
631 * We cannot merge two vmas if they have differently assigned (non-NULL)
632 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
633 *
634 * We don't check here for the merged mmap wrapping around the end of pagecache
635 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
636 * wrap, nor mmaps which cover the final page at index -1UL.
637 */
638static int
639can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
640 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
641{
642 if (is_mergeable_vma(vma, file, vm_flags) &&
643 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
644 if (vma->vm_pgoff == vm_pgoff)
645 return 1;
646 }
647 return 0;
648}
649
650/*
651 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
652 * beyond (at a higher virtual address and file offset than) the vma.
653 *
654 * We cannot merge two vmas if they have differently assigned (non-NULL)
655 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
656 */
657static int
658can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
659 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
660{
661 if (is_mergeable_vma(vma, file, vm_flags) &&
662 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
663 pgoff_t vm_pglen;
664 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
665 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
666 return 1;
667 }
668 return 0;
669}
670
/*
 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
 * whether that can be merged with its predecessor or its successor.
 * Or both (it neatly fills a hole).
 *
 * In most cases - when called for mmap, brk or mremap - [addr,end) is
 * certain not to be mapped by the time vma_merge is called; but when
 * called for mprotect, it is certain to be already mapped (either at
 * an offset within prev, or at the start of next), and the flags of
 * this area are about to be changed to vm_flags - and the no-change
 * case has already been eliminated.
 *
 * The following mprotect cases have to be considered, where AAAA is
 * the area passed down from mprotect_fixup, never extending beyond one
 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
 *
 *     AAAA             AAAA                AAAA          AAAA
 *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
 *    cannot merge    might become    might become    might become
 *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
 *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
 *    mremap move:                                    PPPPNNNNNNNN 8
 *        AAAA
 *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
 *    might become    case 1 below    case 2 below    case 3 below
 *
 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
 * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
 *
 * Returns the vma that now covers the request after a successful
 * merge, or NULL if no merge was possible.
 */
struct vm_area_struct *vma_merge(struct mm_struct *mm,
			struct vm_area_struct *prev, unsigned long addr,
			unsigned long end, unsigned long vm_flags,
		     	struct anon_vma *anon_vma, struct file *file,
			pgoff_t pgoff, struct mempolicy *policy)
{
	/* length of the request in pages */
	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
	struct vm_area_struct *area, *next;

	/*
	 * We later require that vma->vm_flags == vm_flags,
	 * so this tests vma->vm_flags & VM_SPECIAL, too.
	 */
	if (vm_flags & VM_SPECIAL)
		return NULL;

	if (prev)
		next = prev->vm_next;
	else
		next = mm->mmap;
	/* area: the vma right after prev, before next is possibly advanced */
	area = next;
	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
		next = next->vm_next;

	/*
	 * Can it merge with the predecessor?
	 */
	if (prev && prev->vm_end == addr &&
  			mpol_equal(vma_policy(prev), policy) &&
			can_vma_merge_after(prev, vm_flags,
						anon_vma, file, pgoff)) {
		/*
		 * OK, it can.  Can we now merge in the successor as well?
		 */
		if (next && end == next->vm_start &&
				mpol_equal(policy, vma_policy(next)) &&
				can_vma_merge_before(next, vm_flags,
					anon_vma, file, pgoff+pglen) &&
				is_mergeable_anon_vma(prev->anon_vma,
						      next->anon_vma)) {
							/* cases 1, 6 */
			vma_adjust(prev, prev->vm_start,
				next->vm_end, prev->vm_pgoff, NULL);
		} else					/* cases 2, 5, 7 */
			vma_adjust(prev, prev->vm_start,
				end, prev->vm_pgoff, NULL);
		return prev;
	}

	/*
	 * Can this new request be merged in front of next?
	 */
	if (next && end == next->vm_start &&
 			mpol_equal(policy, vma_policy(next)) &&
			can_vma_merge_before(next, vm_flags,
					anon_vma, file, pgoff+pglen)) {
		if (prev && addr < prev->vm_end)	/* case 4 */
			vma_adjust(prev, prev->vm_start,
				addr, prev->vm_pgoff, NULL);
		else					/* cases 3, 8 */
			vma_adjust(area, addr, next->vm_end,
				next->vm_pgoff - pglen, NULL);
		return area;
	}

	return NULL;
}
767
768/*
769 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
770 * neighbouring vmas for a suitable anon_vma, before it goes off
771 * to allocate a new anon_vma. It checks because a repetitive
772 * sequence of mprotects and faults may otherwise lead to distinct
773 * anon_vmas being allocated, preventing vma merge in subsequent
774 * mprotect.
775 */
776struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
777{
778 struct vm_area_struct *near;
779 unsigned long vm_flags;
780
781 near = vma->vm_next;
782 if (!near)
783 goto try_prev;
784
785 /*
786 * Since only mprotect tries to remerge vmas, match flags
787 * which might be mprotected into each other later on.
788 * Neither mlock nor madvise tries to remerge at present,
789 * so leave their flags as obstructing a merge.
790 */
791 vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
792 vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
793
794 if (near->anon_vma && vma->vm_end == near->vm_start &&
795 mpol_equal(vma_policy(vma), vma_policy(near)) &&
796 can_vma_merge_before(near, vm_flags,
797 NULL, vma->vm_file, vma->vm_pgoff +
798 ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
799 return near->anon_vma;
800try_prev:
801 /*
802 * It is potentially slow to have to call find_vma_prev here.
803 * But it's only on the first write fault on the vma, not
804 * every time, and we could devise a way to avoid it later
805 * (e.g. stash info in next's anon_vma_node when assigning
806 * an anon_vma, or when trying vma_merge). Another time.
807 */
808 if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma)
809 BUG();
810 if (!near)
811 goto none;
812
813 vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
814 vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
815
816 if (near->anon_vma && near->vm_end == vma->vm_start &&
817 mpol_equal(vma_policy(near), vma_policy(vma)) &&
818 can_vma_merge_after(near, vm_flags,
819 NULL, vma->vm_file, vma->vm_pgoff))
820 return near->anon_vma;
821none:
822 /*
823 * There's no absolute need to look only at touching neighbours:
824 * we could search further afield for "compatible" anon_vmas.
825 * But it would probably just be a waste of time searching,
826 * or lead to too many vmas hanging off the same anon_vma.
827 * We're trying to allow mprotect remerging later on,
828 * not trying to minimize memory used for anon_vmas.
829 */
830 return NULL;
831}
832
833#ifdef CONFIG_PROC_FS
834void __vm_stat_account(struct mm_struct *mm, unsigned long flags,
835 struct file *file, long pages)
836{
837 const unsigned long stack_flags
838 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
839
840#ifdef CONFIG_HUGETLB
841 if (flags & VM_HUGETLB) {
842 if (!(flags & VM_DONTCOPY))
843 mm->shared_vm += pages;
844 return;
845 }
846#endif /* CONFIG_HUGETLB */
847
848 if (file) {
849 mm->shared_vm += pages;
850 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
851 mm->exec_vm += pages;
852 } else if (flags & stack_flags)
853 mm->stack_vm += pages;
854 if (flags & (VM_RESERVED|VM_IO))
855 mm->reserved_vm += pages;
856}
857#endif /* CONFIG_PROC_FS */
858
859/*
860 * The caller must hold down_write(current->mm->mmap_sem).
861 */
862
863unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
864 unsigned long len, unsigned long prot,
865 unsigned long flags, unsigned long pgoff)
866{
867 struct mm_struct * mm = current->mm;
868 struct vm_area_struct * vma, * prev;
869 struct inode *inode;
870 unsigned int vm_flags;
871 int correct_wcount = 0;
872 int error;
873 struct rb_node ** rb_link, * rb_parent;
874 int accountable = 1;
875 unsigned long charged = 0, reqprot = prot;
876
877 if (file) {
878 if (is_file_hugepages(file))
879 accountable = 0;
880
881 if (!file->f_op || !file->f_op->mmap)
882 return -ENODEV;
883
884 if ((prot & PROT_EXEC) &&
885 (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
886 return -EPERM;
887 }
888 /*
889 * Does the application expect PROT_READ to imply PROT_EXEC?
890 *
891 * (the exception is when the underlying filesystem is noexec
892 * mounted, in which case we dont add PROT_EXEC.)
893 */
894 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
895 if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
896 prot |= PROT_EXEC;
897
898 if (!len)
899 return -EINVAL;
900
901 /* Careful about overflows.. */
902 len = PAGE_ALIGN(len);
903 if (!len || len > TASK_SIZE)
904 return -ENOMEM;
905
906 /* offset overflow? */
907 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
908 return -EOVERFLOW;
909
910 /* Too many mappings? */
911 if (mm->map_count > sysctl_max_map_count)
912 return -ENOMEM;
913
914 /* Obtain the address to map to. we verify (or select) it and ensure
915 * that it represents a valid section of the address space.
916 */
917 addr = get_unmapped_area(file, addr, len, pgoff, flags);
918 if (addr & ~PAGE_MASK)
919 return addr;
920
921 /* Do simple checking here so the lower-level routines won't have
922 * to. we assume access permissions have been handled by the open
923 * of the memory object, so we don't do any here.
924 */
925 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
926 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
927
928 if (flags & MAP_LOCKED) {
929 if (!can_do_mlock())
930 return -EPERM;
931 vm_flags |= VM_LOCKED;
932 }
933 /* mlock MCL_FUTURE? */
934 if (vm_flags & VM_LOCKED) {
935 unsigned long locked, lock_limit;
936 locked = mm->locked_vm << PAGE_SHIFT;
937 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
938 locked += len;
939 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
940 return -EAGAIN;
941 }
942
943 inode = file ? file->f_dentry->d_inode : NULL;
944
945 if (file) {
946 switch (flags & MAP_TYPE) {
947 case MAP_SHARED:
948 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
949 return -EACCES;
950
951 /*
952 * Make sure we don't allow writing to an append-only
953 * file..
954 */
955 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
956 return -EACCES;
957
958 /*
959 * Make sure there are no mandatory locks on the file.
960 */
961 if (locks_verify_locked(inode))
962 return -EAGAIN;
963
964 vm_flags |= VM_SHARED | VM_MAYSHARE;
965 if (!(file->f_mode & FMODE_WRITE))
966 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
967
968 /* fall through */
969 case MAP_PRIVATE:
970 if (!(file->f_mode & FMODE_READ))
971 return -EACCES;
972 break;
973
974 default:
975 return -EINVAL;
976 }
977 } else {
978 switch (flags & MAP_TYPE) {
979 case MAP_SHARED:
980 vm_flags |= VM_SHARED | VM_MAYSHARE;
981 break;
982 case MAP_PRIVATE:
983 /*
984 * Set pgoff according to addr for anon_vma.
985 */
986 pgoff = addr >> PAGE_SHIFT;
987 break;
988 default:
989 return -EINVAL;
990 }
991 }
992
993 error = security_file_mmap(file, reqprot, prot, flags);
994 if (error)
995 return error;
996
997 /* Clear old maps */
998 error = -ENOMEM;
999munmap_back:
1000 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
1001 if (vma && vma->vm_start < addr + len) {
1002 if (do_munmap(mm, addr, len))
1003 return -ENOMEM;
1004 goto munmap_back;
1005 }
1006
1007 /* Check against address space limit. */
1008 if ((mm->total_vm << PAGE_SHIFT) + len
1009 > current->signal->rlim[RLIMIT_AS].rlim_cur)
1010 return -ENOMEM;
1011
1012 if (accountable && (!(flags & MAP_NORESERVE) ||
1013 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
1014 if (vm_flags & VM_SHARED) {
1015 /* Check memory availability in shmem_file_setup? */
1016 vm_flags |= VM_ACCOUNT;
1017 } else if (vm_flags & VM_WRITE) {
1018 /*
1019 * Private writable mapping: check memory availability
1020 */
1021 charged = len >> PAGE_SHIFT;
1022 if (security_vm_enough_memory(charged))
1023 return -ENOMEM;
1024 vm_flags |= VM_ACCOUNT;
1025 }
1026 }
1027
1028 /*
1029 * Can we just expand an old private anonymous mapping?
1030 * The VM_SHARED test is necessary because shmem_zero_setup
1031 * will create the file object for a shared anonymous map below.
1032 */
1033 if (!file && !(vm_flags & VM_SHARED) &&
1034 vma_merge(mm, prev, addr, addr + len, vm_flags,
1035 NULL, NULL, pgoff, NULL))
1036 goto out;
1037
1038 /*
1039 * Determine the object being mapped and call the appropriate
1040 * specific mapper. the address has already been validated, but
1041 * not unmapped, but the maps are removed from the list.
1042 */
1043 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1044 if (!vma) {
1045 error = -ENOMEM;
1046 goto unacct_error;
1047 }
1048 memset(vma, 0, sizeof(*vma));
1049
1050 vma->vm_mm = mm;
1051 vma->vm_start = addr;
1052 vma->vm_end = addr + len;
1053 vma->vm_flags = vm_flags;
1054 vma->vm_page_prot = protection_map[vm_flags & 0x0f];
1055 vma->vm_pgoff = pgoff;
1056
1057 if (file) {
1058 error = -EINVAL;
1059 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1060 goto free_vma;
1061 if (vm_flags & VM_DENYWRITE) {
1062 error = deny_write_access(file);
1063 if (error)
1064 goto free_vma;
1065 correct_wcount = 1;
1066 }
1067 vma->vm_file = file;
1068 get_file(file);
1069 error = file->f_op->mmap(file, vma);
1070 if (error)
1071 goto unmap_and_free_vma;
1072 } else if (vm_flags & VM_SHARED) {
1073 error = shmem_zero_setup(vma);
1074 if (error)
1075 goto free_vma;
1076 }
1077
1078 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
1079 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
1080 * that memory reservation must be checked; but that reservation
1081 * belongs to shared memory object, not to vma: so now clear it.
1082 */
1083 if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
1084 vma->vm_flags &= ~VM_ACCOUNT;
1085
1086 /* Can addr have changed??
1087 *
1088 * Answer: Yes, several device drivers can do it in their
1089 * f_op->mmap method. -DaveM
1090 */
1091 addr = vma->vm_start;
1092 pgoff = vma->vm_pgoff;
1093 vm_flags = vma->vm_flags;
1094
1095 if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
1096 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
1097 file = vma->vm_file;
1098 vma_link(mm, vma, prev, rb_link, rb_parent);
1099 if (correct_wcount)
1100 atomic_inc(&inode->i_writecount);
1101 } else {
1102 if (file) {
1103 if (correct_wcount)
1104 atomic_inc(&inode->i_writecount);
1105 fput(file);
1106 }
1107 mpol_free(vma_policy(vma));
1108 kmem_cache_free(vm_area_cachep, vma);
1109 }
1110out:
1111 mm->total_vm += len >> PAGE_SHIFT;
1112 __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1113 if (vm_flags & VM_LOCKED) {
1114 mm->locked_vm += len >> PAGE_SHIFT;
1115 make_pages_present(addr, addr + len);
1116 }
1117 if (flags & MAP_POPULATE) {
1118 up_write(&mm->mmap_sem);
1119 sys_remap_file_pages(addr, len, 0,
1120 pgoff, flags & MAP_NONBLOCK);
1121 down_write(&mm->mmap_sem);
1122 }
1123 return addr;
1124
1125unmap_and_free_vma:
1126 if (correct_wcount)
1127 atomic_inc(&inode->i_writecount);
1128 vma->vm_file = NULL;
1129 fput(file);
1130
1131 /* Undo any partial mapping done by a device driver. */
1132 zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
1133free_vma:
1134 kmem_cache_free(vm_area_cachep, vma);
1135unacct_error:
1136 if (charged)
1137 vm_unacct_memory(charged);
1138 return error;
1139}
1140
1141EXPORT_SYMBOL(do_mmap_pgoff);
1142
1143/* Get an address range which is currently unmapped.
1144 * For shmat() with addr=0.
1145 *
1146 * Ugly calling convention alert:
1147 * Return value with the low bits set means error value,
1148 * ie
1149 * if (ret & ~PAGE_MASK)
1150 * error = ret;
1151 *
1152 * This function "knows" that -ENOMEM has the bits set.
1153 */
1154#ifndef HAVE_ARCH_UNMAPPED_AREA
1155unsigned long
1156arch_get_unmapped_area(struct file *filp, unsigned long addr,
1157 unsigned long len, unsigned long pgoff, unsigned long flags)
1158{
1159 struct mm_struct *mm = current->mm;
1160 struct vm_area_struct *vma;
1161 unsigned long start_addr;
1162
1163 if (len > TASK_SIZE)
1164 return -ENOMEM;
1165
1166 if (addr) {
1167 addr = PAGE_ALIGN(addr);
1168 vma = find_vma(mm, addr);
1169 if (TASK_SIZE - len >= addr &&
1170 (!vma || addr + len <= vma->vm_start))
1171 return addr;
1172 }
1173 start_addr = addr = mm->free_area_cache;
1174
1175full_search:
1176 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
1177 /* At this point: (!vma || addr < vma->vm_end). */
1178 if (TASK_SIZE - len < addr) {
1179 /*
1180 * Start a new search - just in case we missed
1181 * some holes.
1182 */
1183 if (start_addr != TASK_UNMAPPED_BASE) {
1184 start_addr = addr = TASK_UNMAPPED_BASE;
1185 goto full_search;
1186 }
1187 return -ENOMEM;
1188 }
1189 if (!vma || addr + len <= vma->vm_start) {
1190 /*
1191 * Remember the place where we stopped the search:
1192 */
1193 mm->free_area_cache = addr + len;
1194 return addr;
1195 }
1196 addr = vma->vm_end;
1197 }
1198}
1199#endif
1200
1201void arch_unmap_area(struct vm_area_struct *area)
1202{
1203 /*
1204 * Is this a new hole at the lowest possible address?
1205 */
1206 if (area->vm_start >= TASK_UNMAPPED_BASE &&
1207 area->vm_start < area->vm_mm->free_area_cache)
1208 area->vm_mm->free_area_cache = area->vm_start;
1209}
1210
1211/*
1212 * This mmap-allocator allocates new areas top-down from below the
1213 * stack's low limit (the base):
1214 */
1215#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
1216unsigned long
1217arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1218 const unsigned long len, const unsigned long pgoff,
1219 const unsigned long flags)
1220{
1221 struct vm_area_struct *vma;
1222 struct mm_struct *mm = current->mm;
1223 unsigned long addr = addr0;
1224
1225 /* requested length too big for entire address space */
1226 if (len > TASK_SIZE)
1227 return -ENOMEM;
1228
1229 /* requesting a specific address */
1230 if (addr) {
1231 addr = PAGE_ALIGN(addr);
1232 vma = find_vma(mm, addr);
1233 if (TASK_SIZE - len >= addr &&
1234 (!vma || addr + len <= vma->vm_start))
1235 return addr;
1236 }
1237
1238 /* either no address requested or can't fit in requested address hole */
1239 addr = mm->free_area_cache;
1240
1241 /* make sure it can fit in the remaining address space */
1242 if (addr >= len) {
1243 vma = find_vma(mm, addr-len);
1244 if (!vma || addr <= vma->vm_start)
1245 /* remember the address as a hint for next time */
1246 return (mm->free_area_cache = addr-len);
1247 }
1248
1249 addr = mm->mmap_base-len;
1250
1251 do {
1252 /*
1253 * Lookup failure means no vma is above this address,
1254 * else if new region fits below vma->vm_start,
1255 * return with success:
1256 */
1257 vma = find_vma(mm, addr);
1258 if (!vma || addr+len <= vma->vm_start)
1259 /* remember the address as a hint for next time */
1260 return (mm->free_area_cache = addr);
1261
1262 /* try just below the current vma->vm_start */
1263 addr = vma->vm_start-len;
1264 } while (len <= vma->vm_start);
1265
1266 /*
1267 * A failed mmap() very likely causes application failure,
1268 * so fall back to the bottom-up function here. This scenario
1269 * can happen with large stack limits and large mmap()
1270 * allocations.
1271 */
1272 mm->free_area_cache = TASK_UNMAPPED_BASE;
1273 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
1274 /*
1275 * Restore the topdown base:
1276 */
1277 mm->free_area_cache = mm->mmap_base;
1278
1279 return addr;
1280}
1281#endif
1282
1283void arch_unmap_area_topdown(struct vm_area_struct *area)
1284{
1285 /*
1286 * Is this a new hole at the highest possible address?
1287 */
1288 if (area->vm_end > area->vm_mm->free_area_cache)
1289 area->vm_mm->free_area_cache = area->vm_end;
1290
1291 /* dont allow allocations above current base */
1292 if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base)
1293 area->vm_mm->free_area_cache = area->vm_mm->mmap_base;
1294}
1295
1296unsigned long
1297get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1298 unsigned long pgoff, unsigned long flags)
1299{
1300 if (flags & MAP_FIXED) {
1301 unsigned long ret;
1302
1303 if (addr > TASK_SIZE - len)
1304 return -ENOMEM;
1305 if (addr & ~PAGE_MASK)
1306 return -EINVAL;
1307 if (file && is_file_hugepages(file)) {
1308 /*
1309 * Check if the given range is hugepage aligned, and
1310 * can be made suitable for hugepages.
1311 */
1312 ret = prepare_hugepage_range(addr, len);
1313 } else {
1314 /*
1315 * Ensure that a normal request is not falling in a
1316 * reserved hugepage range. For some archs like IA-64,
1317 * there is a separate region for hugepages.
1318 */
1319 ret = is_hugepage_only_range(current->mm, addr, len);
1320 }
1321 if (ret)
1322 return -EINVAL;
1323 return addr;
1324 }
1325
1326 if (file && file->f_op && file->f_op->get_unmapped_area)
1327 return file->f_op->get_unmapped_area(file, addr, len,
1328 pgoff, flags);
1329
1330 return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
1331}
1332
1333EXPORT_SYMBOL(get_unmapped_area);
1334
1335/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
1336struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
1337{
1338 struct vm_area_struct *vma = NULL;
1339
1340 if (mm) {
1341 /* Check the cache first. */
1342 /* (Cache hit rate is typically around 35%.) */
1343 vma = mm->mmap_cache;
1344 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1345 struct rb_node * rb_node;
1346
1347 rb_node = mm->mm_rb.rb_node;
1348 vma = NULL;
1349
1350 while (rb_node) {
1351 struct vm_area_struct * vma_tmp;
1352
1353 vma_tmp = rb_entry(rb_node,
1354 struct vm_area_struct, vm_rb);
1355
1356 if (vma_tmp->vm_end > addr) {
1357 vma = vma_tmp;
1358 if (vma_tmp->vm_start <= addr)
1359 break;
1360 rb_node = rb_node->rb_left;
1361 } else
1362 rb_node = rb_node->rb_right;
1363 }
1364 if (vma)
1365 mm->mmap_cache = vma;
1366 }
1367 }
1368 return vma;
1369}
1370
1371EXPORT_SYMBOL(find_vma);
1372
1373/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
1374struct vm_area_struct *
1375find_vma_prev(struct mm_struct *mm, unsigned long addr,
1376 struct vm_area_struct **pprev)
1377{
1378 struct vm_area_struct *vma = NULL, *prev = NULL;
1379 struct rb_node * rb_node;
1380 if (!mm)
1381 goto out;
1382
1383 /* Guard against addr being lower than the first VMA */
1384 vma = mm->mmap;
1385
1386 /* Go through the RB tree quickly. */
1387 rb_node = mm->mm_rb.rb_node;
1388
1389 while (rb_node) {
1390 struct vm_area_struct *vma_tmp;
1391 vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
1392
1393 if (addr < vma_tmp->vm_end) {
1394 rb_node = rb_node->rb_left;
1395 } else {
1396 prev = vma_tmp;
1397 if (!prev->vm_next || (addr < prev->vm_next->vm_end))
1398 break;
1399 rb_node = rb_node->rb_right;
1400 }
1401 }
1402
1403out:
1404 *pprev = prev;
1405 return prev ? prev->vm_next : vma;
1406}
1407
1408/*
1409 * Verify that the stack growth is acceptable and
1410 * update accounting. This is shared with both the
1411 * grow-up and grow-down cases.
1412 */
1413static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
1414{
1415 struct mm_struct *mm = vma->vm_mm;
1416 struct rlimit *rlim = current->signal->rlim;
1417
1418 /* address space limit tests */
1419 if (mm->total_vm + grow > rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT)
1420 return -ENOMEM;
1421
1422 /* Stack limit test */
1423 if (size > rlim[RLIMIT_STACK].rlim_cur)
1424 return -ENOMEM;
1425
1426 /* mlock limit tests */
1427 if (vma->vm_flags & VM_LOCKED) {
1428 unsigned long locked;
1429 unsigned long limit;
1430 locked = mm->locked_vm + grow;
1431 limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
1432 if (locked > limit && !capable(CAP_IPC_LOCK))
1433 return -ENOMEM;
1434 }
1435
1436 /*
1437 * Overcommit.. This must be the final test, as it will
1438 * update security statistics.
1439 */
1440 if (security_vm_enough_memory(grow))
1441 return -ENOMEM;
1442
1443 /* Ok, everything looks good - let it rip */
1444 mm->total_vm += grow;
1445 if (vma->vm_flags & VM_LOCKED)
1446 mm->locked_vm += grow;
1447 __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
1448 return 0;
1449}
1450
1451#ifdef CONFIG_STACK_GROWSUP
1452/*
1453 * vma is the first one with address > vma->vm_end. Have to extend vma.
1454 */
1455int expand_stack(struct vm_area_struct * vma, unsigned long address)
1456{
1457 int error;
1458
1459 if (!(vma->vm_flags & VM_GROWSUP))
1460 return -EFAULT;
1461
1462 /*
1463 * We must make sure the anon_vma is allocated
1464 * so that the anon_vma locking is not a noop.
1465 */
1466 if (unlikely(anon_vma_prepare(vma)))
1467 return -ENOMEM;
1468 anon_vma_lock(vma);
1469
1470 /*
1471 * vma->vm_start/vm_end cannot change under us because the caller
1472 * is required to hold the mmap_sem in read mode. We need the
1473 * anon_vma lock to serialize against concurrent expand_stacks.
1474 */
1475 address += 4 + PAGE_SIZE - 1;
1476 address &= PAGE_MASK;
1477 error = 0;
1478
1479 /* Somebody else might have raced and expanded it already */
1480 if (address > vma->vm_end) {
1481 unsigned long size, grow;
1482
1483 size = address - vma->vm_start;
1484 grow = (address - vma->vm_end) >> PAGE_SHIFT;
1485
1486 error = acct_stack_growth(vma, size, grow);
1487 if (!error)
1488 vma->vm_end = address;
1489 }
1490 anon_vma_unlock(vma);
1491 return error;
1492}
1493
1494struct vm_area_struct *
1495find_extend_vma(struct mm_struct *mm, unsigned long addr)
1496{
1497 struct vm_area_struct *vma, *prev;
1498
1499 addr &= PAGE_MASK;
1500 vma = find_vma_prev(mm, addr, &prev);
1501 if (vma && (vma->vm_start <= addr))
1502 return vma;
1503 if (!prev || expand_stack(prev, addr))
1504 return NULL;
1505 if (prev->vm_flags & VM_LOCKED) {
1506 make_pages_present(addr, prev->vm_end);
1507 }
1508 return prev;
1509}
1510#else
1511/*
1512 * vma is the first one with address < vma->vm_start. Have to extend vma.
1513 */
1514int expand_stack(struct vm_area_struct *vma, unsigned long address)
1515{
1516 int error;
1517
1518 /*
1519 * We must make sure the anon_vma is allocated
1520 * so that the anon_vma locking is not a noop.
1521 */
1522 if (unlikely(anon_vma_prepare(vma)))
1523 return -ENOMEM;
1524 anon_vma_lock(vma);
1525
1526 /*
1527 * vma->vm_start/vm_end cannot change under us because the caller
1528 * is required to hold the mmap_sem in read mode. We need the
1529 * anon_vma lock to serialize against concurrent expand_stacks.
1530 */
1531 address &= PAGE_MASK;
1532 error = 0;
1533
1534 /* Somebody else might have raced and expanded it already */
1535 if (address < vma->vm_start) {
1536 unsigned long size, grow;
1537
1538 size = vma->vm_end - address;
1539 grow = (vma->vm_start - address) >> PAGE_SHIFT;
1540
1541 error = acct_stack_growth(vma, size, grow);
1542 if (!error) {
1543 vma->vm_start = address;
1544 vma->vm_pgoff -= grow;
1545 }
1546 }
1547 anon_vma_unlock(vma);
1548 return error;
1549}
1550
1551struct vm_area_struct *
1552find_extend_vma(struct mm_struct * mm, unsigned long addr)
1553{
1554 struct vm_area_struct * vma;
1555 unsigned long start;
1556
1557 addr &= PAGE_MASK;
1558 vma = find_vma(mm,addr);
1559 if (!vma)
1560 return NULL;
1561 if (vma->vm_start <= addr)
1562 return vma;
1563 if (!(vma->vm_flags & VM_GROWSDOWN))
1564 return NULL;
1565 start = vma->vm_start;
1566 if (expand_stack(vma, addr))
1567 return NULL;
1568 if (vma->vm_flags & VM_LOCKED) {
1569 make_pages_present(addr, start);
1570 }
1571 return vma;
1572}
1573#endif
1574
1575/*
1576 * Try to free as many page directory entries as we can,
1577 * without having to work very hard at actually scanning
1578 * the page tables themselves.
1579 *
1580 * Right now we try to free page tables if we have a nice
1581 * PGDIR-aligned area that got free'd up. We could be more
1582 * granular if we want to, but this is fast and simple,
1583 * and covers the bad cases.
1584 *
1585 * "prev", if it exists, points to a vma before the one
1586 * we just free'd - but there's no telling how much before.
1587 */
1588static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
1589 unsigned long start, unsigned long end)
1590{
1591 unsigned long first = start & PGDIR_MASK;
1592 unsigned long last = end + PGDIR_SIZE - 1;
1593 struct mm_struct *mm = tlb->mm;
1594
1595 if (last > MM_VM_SIZE(mm) || last < end)
1596 last = MM_VM_SIZE(mm);
1597
1598 if (!prev) {
1599 prev = mm->mmap;
1600 if (!prev)
1601 goto no_mmaps;
1602 if (prev->vm_end > start) {
1603 if (last > prev->vm_start)
1604 last = prev->vm_start;
1605 goto no_mmaps;
1606 }
1607 }
1608 for (;;) {
1609 struct vm_area_struct *next = prev->vm_next;
1610
1611 if (next) {
1612 if (next->vm_start < start) {
1613 prev = next;
1614 continue;
1615 }
1616 if (last > next->vm_start)
1617 last = next->vm_start;
1618 }
1619 if (prev->vm_end > first)
1620 first = prev->vm_end;
1621 break;
1622 }
1623no_mmaps:
1624 if (last < first) /* for arches with discontiguous pgd indices */
1625 return;
1626 if (first < FIRST_USER_PGD_NR * PGDIR_SIZE)
1627 first = FIRST_USER_PGD_NR * PGDIR_SIZE;
1628 /* No point trying to free anything if we're in the same pte page */
1629 if ((first & PMD_MASK) < (last & PMD_MASK)) {
1630 clear_page_range(tlb, first, last);
1631 flush_tlb_pgtables(mm, first, last);
1632 }
1633}
1634
1635/* Normal function to fix up a mapping
1636 * This function is the default for when an area has no specific
1637 * function. This may be used as part of a more specific routine.
1638 *
1639 * By the time this function is called, the area struct has been
1640 * removed from the process mapping list.
1641 */
1642static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
1643{
1644 size_t len = area->vm_end - area->vm_start;
1645
1646 area->vm_mm->total_vm -= len >> PAGE_SHIFT;
1647 if (area->vm_flags & VM_LOCKED)
1648 area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
1649 vm_stat_unaccount(area);
1650 area->vm_mm->unmap_area(area);
1651 remove_vm_struct(area);
1652}
1653
1654/*
1655 * Update the VMA and inode share lists.
1656 *
1657 * Ok - we have the memory areas we should free on the 'free' list,
1658 * so release them, and do the vma updates.
1659 */
1660static void unmap_vma_list(struct mm_struct *mm,
1661 struct vm_area_struct *mpnt)
1662{
1663 do {
1664 struct vm_area_struct *next = mpnt->vm_next;
1665 unmap_vma(mm, mpnt);
1666 mpnt = next;
1667 } while (mpnt != NULL);
1668 validate_mm(mm);
1669}
1670
1671/*
1672 * Get rid of page table information in the indicated region.
1673 *
1674 * Called with the page table lock held.
1675 */
1676static void unmap_region(struct mm_struct *mm,
1677 struct vm_area_struct *vma,
1678 struct vm_area_struct *prev,
1679 unsigned long start,
1680 unsigned long end)
1681{
1682 struct mmu_gather *tlb;
1683 unsigned long nr_accounted = 0;
1684
1685 lru_add_drain();
1686 tlb = tlb_gather_mmu(mm, 0);
1687 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
1688 vm_unacct_memory(nr_accounted);
1689
1690 if (is_hugepage_only_range(mm, start, end - start))
1691 hugetlb_free_pgtables(tlb, prev, start, end);
1692 else
1693 free_pgtables(tlb, prev, start, end);
1694 tlb_finish_mmu(tlb, start, end);
1695}
1696
1697/*
1698 * Create a list of vma's touched by the unmap, removing them from the mm's
1699 * vma list as we go..
1700 */
1701static void
1702detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1703 struct vm_area_struct *prev, unsigned long end)
1704{
1705 struct vm_area_struct **insertion_point;
1706 struct vm_area_struct *tail_vma = NULL;
1707
1708 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1709 do {
1710 rb_erase(&vma->vm_rb, &mm->mm_rb);
1711 mm->map_count--;
1712 tail_vma = vma;
1713 vma = vma->vm_next;
1714 } while (vma && vma->vm_start < end);
1715 *insertion_point = vma;
1716 tail_vma->vm_next = NULL;
1717 mm->mmap_cache = NULL; /* Kill the cache. */
1718}
1719
1720/*
1721 * Split a vma into two pieces at address 'addr', a new vma is allocated
1722 * either for the first part or the the tail.
1723 */
1724int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1725 unsigned long addr, int new_below)
1726{
1727 struct mempolicy *pol;
1728 struct vm_area_struct *new;
1729
1730 if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
1731 return -EINVAL;
1732
1733 if (mm->map_count >= sysctl_max_map_count)
1734 return -ENOMEM;
1735
1736 new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1737 if (!new)
1738 return -ENOMEM;
1739
1740 /* most fields are the same, copy all, and then fixup */
1741 *new = *vma;
1742
1743 if (new_below)
1744 new->vm_end = addr;
1745 else {
1746 new->vm_start = addr;
1747 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
1748 }
1749
1750 pol = mpol_copy(vma_policy(vma));
1751 if (IS_ERR(pol)) {
1752 kmem_cache_free(vm_area_cachep, new);
1753 return PTR_ERR(pol);
1754 }
1755 vma_set_policy(new, pol);
1756
1757 if (new->vm_file)
1758 get_file(new->vm_file);
1759
1760 if (new->vm_ops && new->vm_ops->open)
1761 new->vm_ops->open(new);
1762
1763 if (new_below)
1764 vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
1765 ((addr - new->vm_start) >> PAGE_SHIFT), new);
1766 else
1767 vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1768
1769 return 0;
1770}
1771
1772/* Munmap is split into 2 main parts -- this part which finds
1773 * what needs doing, and the areas themselves, which do the
1774 * work. This now handles partial unmappings.
1775 * Jeremy Fitzhardinge <jeremy@goop.org>
1776 */
/*
 * do_munmap - remove every mapping overlapping [start, start + len) in @mm.
 * @mm:    address space to operate on
 * @start: start address; must be page aligned
 * @len:   length in bytes; rounded up to a whole number of pages
 *
 * Returns 0 on success, including the case where nothing is mapped in
 * the range.  Returns -EINVAL when @start is unaligned or beyond
 * TASK_SIZE, when @len does not fit below TASK_SIZE, or when @len rounds
 * to zero.  If a vma straddling either end of the range cannot be split,
 * the error from split_vma() is returned.
 *
 * sys_munmap() calls this with mm->mmap_sem held for writing.
 */
1777int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1778{
1779 unsigned long end;
1780 struct vm_area_struct *mpnt, *prev, *last;
1781
1782 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
1783 return -EINVAL;
1784
1785 if ((len = PAGE_ALIGN(len)) == 0)
1786 return -EINVAL;
1787
1788 /* Find the first overlapping VMA */
1789 mpnt = find_vma_prev(mm, start, &prev);
1790 if (!mpnt)
1791 return 0;
1792 /* we have start < mpnt->vm_end */
1793
1794 /* if it doesn't overlap, we have nothing.. */
1795 end = start + len;
1796 if (mpnt->vm_start >= end)
1797 return 0;
1798
1799 /*
1800 * If we need to split any vma, do it now to save pain later.
1801 *
1802 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
1803 * unmapped vm_area_struct will remain in use: so lower split_vma
1804 * places tmp vma above, and higher split_vma places tmp vma below.
1805 */
1806 if (start > mpnt->vm_start) {
1807 int error = split_vma(mm, mpnt, start, 0);
1808 if (error)
1809 return error;
	/*
	 * mpnt keeps the part below start, so it now precedes the
	 * range that is about to be removed.
	 */
1810 prev = mpnt;
1811 }
1812
1813 /* Does it split the last one? */
1814 last = find_vma(mm, end);
1815 if (last && end > last->vm_start) {
1816 int error = split_vma(mm, last, end, 1);
1817 if (error)
1818 return error;
1819 }
	/*
	 * First vma to remove: the one after prev, or the head of the
	 * mm's vma list when there is no predecessor.
	 */
1820 mpnt = prev? prev->vm_next: mm->mmap;
1821
1822 /*
1823 * Remove the vma's, and unmap the actual pages
1824 */
1825 detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
1826 spin_lock(&mm->page_table_lock);
1827 unmap_region(mm, mpnt, prev, start, end);
1828 spin_unlock(&mm->page_table_lock);
1829
1830 /* Fix up all other VM information */
1831 unmap_vma_list(mm, mpnt);
1832
1833 return 0;
1834}
1835
1836EXPORT_SYMBOL(do_munmap);
1837
1838asmlinkage long sys_munmap(unsigned long addr, size_t len)
1839{
1840 int ret;
1841 struct mm_struct *mm = current->mm;
1842
1843 profile_munmap(addr);
1844
1845 down_write(&mm->mmap_sem);
1846 ret = do_munmap(mm, addr, len);
1847 up_write(&mm->mmap_sem);
1848 return ret;
1849}
1850
/*
 * Debugging check, compiled in only with CONFIG_DEBUG_KERNEL: if a read
 * lock on mm->mmap_sem can be taken, emit a warning and drop the read
 * lock again.  Without CONFIG_DEBUG_KERNEL this does nothing.
 */
static inline void verify_mm_writelocked(struct mm_struct *mm)
{
#ifdef CONFIG_DEBUG_KERNEL
	int got_read_lock = down_read_trylock(&mm->mmap_sem);

	if (unlikely(got_read_lock)) {
		WARN_ON(1);
		up_read(&mm->mmap_sem);
	}
#endif
}
1860
1861/*
1862 * this is really a simplified "do_mmap". it only handles
1863 * anonymous maps. eventually we may be able to do some
1864 * brk-specific accounting here.
1865 */
/*
 * do_brk - map [addr, addr + len) as anonymous memory in current->mm.
 * @addr: start address of the new area
 * @len:  length in bytes; rounded up to a whole number of pages
 *
 * Any existing mappings in the range are removed first.  The new area
 * is merged into a neighbouring vma when vma_merge() allows it,
 * otherwise a fresh vma is allocated and linked in.
 *
 * Returns @addr on success (also when @len rounds to zero).  On failure
 * a negative errno is returned through the unsigned long return type:
 * -EINVAL for a range that wraps or exceeds TASK_SIZE, -EAGAIN when the
 * RLIMIT_MEMLOCK check fails, -ENOMEM otherwise.
 */
1866unsigned long do_brk(unsigned long addr, unsigned long len)
1867{
1868 struct mm_struct * mm = current->mm;
1869 struct vm_area_struct * vma, * prev;
1870 unsigned long flags;
1871 struct rb_node ** rb_link, * rb_parent;
1872 pgoff_t pgoff = addr >> PAGE_SHIFT;
1873
1874 len = PAGE_ALIGN(len);
1875 if (!len)
1876 return addr;
1877
1878 if ((addr + len) > TASK_SIZE || (addr + len) < addr)
1879 return -EINVAL;
1880
1881 /*
1882 * mlock MCL_FUTURE?
1883 */
1884 if (mm->def_flags & VM_LOCKED) {
1885 unsigned long locked, lock_limit;
1886 locked = mm->locked_vm << PAGE_SHIFT;
1887 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1888 locked += len;
1889 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1890 return -EAGAIN;
1891 }
1892
1893 /*
1894 * mm->mmap_sem is required to protect against another thread
1895 * changing the mappings in case we sleep.
1896 */
1897 verify_mm_writelocked(mm);
1898
1899 /*
1900 * Clear old maps. this also does some error checking for us
1901 */
1902 munmap_back:
1903 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
1904 if (vma && vma->vm_start < addr + len) {
1905 if (do_munmap(mm, addr, len))
1906 return -ENOMEM;
	/* The vma layout changed: look up the insertion point again. */
1907 goto munmap_back;
1908 }
1909
1910 /* Check against address space limits *after* clearing old maps... */
1911 if ((mm->total_vm << PAGE_SHIFT) + len
1912 > current->signal->rlim[RLIMIT_AS].rlim_cur)
1913 return -ENOMEM;
1914
1915 if (mm->map_count > sysctl_max_map_count)
1916 return -ENOMEM;
1917
1918 if (security_vm_enough_memory(len >> PAGE_SHIFT))
1919 return -ENOMEM;
1920
1921 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1922
1923 /* Can we just expand an old private anonymous mapping? */
1924 if (vma_merge(mm, prev, addr, addr + len, flags,
1925 NULL, NULL, pgoff, NULL))
1926 goto out;
1927
1928 /*
1929 * create a vma struct for an anonymous mapping
1930 */
1931 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1932 if (!vma) {
	/* Give back the pages charged by security_vm_enough_memory(). */
1933 vm_unacct_memory(len >> PAGE_SHIFT);
1934 return -ENOMEM;
1935 }
1936 memset(vma, 0, sizeof(*vma));
1937
1938 vma->vm_mm = mm;
1939 vma->vm_start = addr;
1940 vma->vm_end = addr + len;
1941 vma->vm_pgoff = pgoff;
1942 vma->vm_flags = flags;
1943 vma->vm_page_prot = protection_map[flags & 0x0f];
1944 vma_link(mm, vma, prev, rb_link, rb_parent);
1945out:
	/* Reached for both the merged and the newly linked vma. */
1946 mm->total_vm += len >> PAGE_SHIFT;
1947 if (flags & VM_LOCKED) {
1948 mm->locked_vm += len >> PAGE_SHIFT;
1949 make_pages_present(addr, addr + len);
1950 }
1951 return addr;
1952}
1953
1954EXPORT_SYMBOL(do_brk);
1955
1956/* Release all mmaps. */
/*
 * exit_mmap - tear down every mapping of @mm.
 *
 * Under mm->page_table_lock: unmaps all vmas, returns the accounted
 * pages via vm_unacct_memory(), clears the user page-table range, and
 * resets the mm's vma list, rbtree, rss, total_vm and locked_vm.
 * After the lock is dropped the saved vma list is walked and each vma
 * is passed to remove_vm_struct().
 */
1957void exit_mmap(struct mm_struct *mm)
1958{
1959 struct mmu_gather *tlb;
1960 struct vm_area_struct *vma;
1961 unsigned long nr_accounted = 0;
1962
1963 lru_add_drain();
1964
1965 spin_lock(&mm->page_table_lock);
1966
1967 tlb = tlb_gather_mmu(mm, 1);
1968 flush_cache_mm(mm);
1969 /* Use ~0UL here to ensure all VMAs in the mm are unmapped */
1970 mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
1971 ~0UL, &nr_accounted, NULL);
1972 vm_unacct_memory(nr_accounted);
1973 BUG_ON(mm->map_count); /* This is just debugging */
1974 clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm));
1975
1976 tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
1977
	/* Keep the list head locally; the mm itself is reset to empty. */
1978 vma = mm->mmap;
1979 mm->mmap = mm->mmap_cache = NULL;
1980 mm->mm_rb = RB_ROOT;
1981 set_mm_counter(mm, rss, 0);
1982 mm->total_vm = 0;
1983 mm->locked_vm = 0;
1984
1985 spin_unlock(&mm->page_table_lock);
1986
1987 /*
1988 * Walk the list again, actually closing and freeing it
1989 * without holding any MM locks.
1990 */
1991 while (vma) {
	/* Read the link first: vma is handed to remove_vm_struct(). */
1992 struct vm_area_struct *next = vma->vm_next;
1993 remove_vm_struct(vma);
1994 vma = next;
1995 }
1996}
1997
1998/* Insert vm structure into process list sorted by address
1999 * and into the inode's i_mmap tree. If vm_file is non-NULL
2000 * then i_mmap_lock is taken here.
2001 */
2002int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2003{
2004 struct vm_area_struct * __vma, * prev;
2005 struct rb_node ** rb_link, * rb_parent;
2006
2007 /*
2008 * The vm_pgoff of a purely anonymous vma should be irrelevant
2009 * until its first write fault, when page's anon_vma and index
2010 * are set. But now set the vm_pgoff it will almost certainly
2011 * end up with (unless mremap moves it elsewhere before that
2012 * first wfault), so /proc/pid/maps tells a consistent story.
2013 *
2014 * By setting it to reflect the virtual start address of the
2015 * vma, merges and splits can happen in a seamless way, just
2016 * using the existing file pgoff checks and manipulations.
2017 * Similarly in do_mmap_pgoff and in do_brk.
2018 */
2019 if (!vma->vm_file) {
2020 BUG_ON(vma->anon_vma);
2021 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2022 }
2023 __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
2024 if (__vma && __vma->vm_start < vma->vm_end)
2025 return -ENOMEM;
2026 vma_link(mm, vma, prev, rb_link, rb_parent);
2027 return 0;
2028}
2029
2030/*
2031 * Copy the vma structure to a new location in the same mm,
2032 * prior to moving page table entries, to effect an mremap move.
2033 */
/*
 * @vmap:  in/out pointer to the source vma; redirected to the returned
 *         vma when the source turns out to lie inside the merged result
 * @addr:  start address of the new location
 * @len:   length of the new area in bytes
 * @pgoff: page offset for the new area (recomputed from @addr for an
 *         anonymous vma that has no anon_vma yet)
 *
 * Returns the vma now covering [addr, addr + len): either an existing
 * vma extended by vma_merge() or a freshly allocated copy.  Returns
 * NULL when the vma or its memory policy cannot be allocated.
 */
2034struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2035 unsigned long addr, unsigned long len, pgoff_t pgoff)
2036{
2037 struct vm_area_struct *vma = *vmap;
2038 unsigned long vma_start = vma->vm_start;
2039 struct mm_struct *mm = vma->vm_mm;
2040 struct vm_area_struct *new_vma, *prev;
2041 struct rb_node **rb_link, *rb_parent;
2042 struct mempolicy *pol;
2043
2044 /*
2045 * If anonymous vma has not yet been faulted, update new pgoff
2046 * to match new location, to increase its chance of merging.
2047 */
2048 if (!vma->vm_file && !vma->anon_vma)
2049 pgoff = addr >> PAGE_SHIFT;
2050
2051 find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
2052 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2053 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
2054 if (new_vma) {
2055 /*
2056 * Source vma may have been merged into new_vma
2057 */
	/* vma_start was sampled before the merge for this comparison. */
2058 if (vma_start >= new_vma->vm_start &&
2059 vma_start < new_vma->vm_end)
2060 *vmap = new_vma;
2061 } else {
2062 new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2063 if (new_vma) {
	/* Start from a full copy, then fix up the per-location fields. */
2064 *new_vma = *vma;
2065 pol = mpol_copy(vma_policy(vma));
2066 if (IS_ERR(pol)) {
2067 kmem_cache_free(vm_area_cachep, new_vma);
2068 return NULL;
2069 }
2070 vma_set_policy(new_vma, pol);
2071 new_vma->vm_start = addr;
2072 new_vma->vm_end = addr + len;
2073 new_vma->vm_pgoff = pgoff;
	/* The copy holds its own file reference and open() call. */
2074 if (new_vma->vm_file)
2075 get_file(new_vma->vm_file);
2076 if (new_vma->vm_ops && new_vma->vm_ops->open)
2077 new_vma->vm_ops->open(new_vma);
2078 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2079 }
2080 }
2081 return new_vma;
2082}
diff --git a/mm/mprotect.c b/mm/mprotect.c
new file mode 100644
index 00000000000..e9fbd013ad9
--- /dev/null
+++ b/mm/mprotect.c
@@ -0,0 +1,282 @@
1/*
2 * mm/mprotect.c
3 *
4 * (C) Copyright 1994 Linus Torvalds
5 * (C) Copyright 2002 Christoph Hellwig
6 *
7 * Address space accounting code <alan@redhat.com>
8 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
9 */
10
11#include <linux/mm.h>
12#include <linux/hugetlb.h>
13#include <linux/slab.h>
14#include <linux/shm.h>
15#include <linux/mman.h>
16#include <linux/fs.h>
17#include <linux/highmem.h>
18#include <linux/security.h>
19#include <linux/mempolicy.h>
20#include <linux/personality.h>
21#include <linux/syscalls.h>
22
23#include <asm/uaccess.h>
24#include <asm/pgtable.h>
25#include <asm/cacheflush.h>
26#include <asm/tlbflush.h>
27
/*
 * Apply @newprot to every present pte covering [addr, end) under @pmd.
 * Non-present entries are left untouched.  Reached from
 * change_protection(), which takes mm->page_table_lock around the walk.
 */
28static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
29 unsigned long addr, unsigned long end, pgprot_t newprot)
30{
31 pte_t *pte;
32
33 pte = pte_offset_map(pmd, addr);
34 do {
35 if (pte_present(*pte)) {
36 pte_t ptent;
37
38 /* Avoid an SMP race with hardware updated dirty/clean
39 * bits by wiping the pte and then setting the new pte
40 * into place.
41 */
42 ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
43 set_pte_at(mm, addr, pte, ptent);
44 lazy_mmu_prot_update(ptent);
45 }
46 } while (pte++, addr += PAGE_SIZE, addr != end);
	/* pte was advanced one past the last entry handled by the loop. */
47 pte_unmap(pte - 1);
48}
49
50static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
51 unsigned long addr, unsigned long end, pgprot_t newprot)
52{
53 pmd_t *pmd;
54 unsigned long next;
55
56 pmd = pmd_offset(pud, addr);
57 do {
58 next = pmd_addr_end(addr, end);
59 if (pmd_none_or_clear_bad(pmd))
60 continue;
61 change_pte_range(mm, pmd, addr, next, newprot);
62 } while (pmd++, addr = next, addr != end);
63}
64
65static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
66 unsigned long addr, unsigned long end, pgprot_t newprot)
67{
68 pud_t *pud;
69 unsigned long next;
70
71 pud = pud_offset(pgd, addr);
72 do {
73 next = pud_addr_end(addr, end);
74 if (pud_none_or_clear_bad(pud))
75 continue;
76 change_pmd_range(mm, pud, addr, next, newprot);
77 } while (pud++, addr = next, addr != end);
78}
79
/*
 * Rewrite the page-table protection bits for [addr, end) of @vma to
 * @newprot.  The range must be non-empty (BUG_ON otherwise).  The cache
 * is flushed for the range before the walk; the walk and the final TLB
 * flush both happen under mm->page_table_lock.
 */
80static void change_protection(struct vm_area_struct *vma,
81 unsigned long addr, unsigned long end, pgprot_t newprot)
82{
83 struct mm_struct *mm = vma->vm_mm;
84 pgd_t *pgd;
85 unsigned long next;
	/* addr is advanced by the loop; keep the origin for the TLB flush. */
86 unsigned long start = addr;
87
88 BUG_ON(addr >= end);
89 pgd = pgd_offset(mm, addr);
90 flush_cache_range(vma, addr, end);
91 spin_lock(&mm->page_table_lock);
92 do {
93 next = pgd_addr_end(addr, end);
94 if (pgd_none_or_clear_bad(pgd))
95 continue;
96 change_pud_range(mm, pgd, addr, next, newprot);
97 } while (pgd++, addr = next, addr != end);
98 flush_tlb_range(vma, start, end);
99 spin_unlock(&mm->page_table_lock);
100}
101
/*
 * mprotect_fixup - give [start, end) inside @vma the flags @newflags.
 * @vma:      vma containing the range
 * @pprev:    in: vma preceding the range, handed to vma_merge();
 *            out: the vma that now covers the range
 * @start:    start of the range
 * @end:      end of the range
 * @newflags: complete new vm_flags value for the range
 *
 * Tries to merge the range with its neighbours first; failing that,
 * splits @vma at @start and/or @end so the range has a vma of its own.
 * Then updates vm_flags and vm_page_prot, rewrites the page tables and
 * moves the per-mm statistics from the old flags to the new ones.
 *
 * Returns 0 on success, -ENOMEM when the extra commit charge is
 * refused, or the error from split_vma() (the charge is then undone).
 */
102static int
103mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
104 unsigned long start, unsigned long end, unsigned long newflags)
105{
106 struct mm_struct *mm = vma->vm_mm;
107 unsigned long oldflags = vma->vm_flags;
108 long nrpages = (end - start) >> PAGE_SHIFT;
109 unsigned long charged = 0;
110 pgprot_t newprot;
111 pgoff_t pgoff;
112 int error;
113
	/* Nothing to change; still report the covering vma to the caller. */
114 if (newflags == oldflags) {
115 *pprev = vma;
116 return 0;
117 }
118
119 /*
120 * If we make a private mapping writable we increase our commit;
121 * but (without finer accounting) cannot reduce our commit if we
122 * make it unwritable again.
123 *
124 * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
125 * a MAP_NORESERVE private mapping to writable will now reserve.
126 */
127 if (newflags & VM_WRITE) {
128 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
129 charged = nrpages;
130 if (security_vm_enough_memory(charged))
131 return -ENOMEM;
132 newflags |= VM_ACCOUNT;
133 }
134 }
135
136 newprot = protection_map[newflags & 0xf];
137
138 /*
139 * First try to merge with previous and/or next vma.
140 */
141 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
142 *pprev = vma_merge(mm, *pprev, start, end, newflags,
143 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
144 if (*pprev) {
145 vma = *pprev;
146 goto success;
147 }
148
149 *pprev = vma;
150
151 if (start != vma->vm_start) {
152 error = split_vma(mm, vma, start, 1);
153 if (error)
154 goto fail;
155 }
156
157 if (end != vma->vm_end) {
158 error = split_vma(mm, vma, end, 0);
159 if (error)
160 goto fail;
161 }
162
163success:
164 /*
165 * vm_flags and vm_page_prot are protected by the mmap_sem
166 * held in write mode.
167 */
168 vma->vm_flags = newflags;
169 vma->vm_page_prot = newprot;
170 change_protection(vma, start, end, newprot);
	/* Move the pages from the old flags' statistics to the new ones. */
171 __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
172 __vm_stat_account(mm, newflags, vma->vm_file, nrpages);
173 return 0;
174
175fail:
	/* charged is 0 unless a commit charge was taken above. */
176 vm_unacct_memory(charged);
177 return error;
178}
179
/*
 * mprotect(2) entry point: change the protection of [start, start + len)
 * in the calling process to @prot.
 *
 * @prot may additionally carry PROT_GROWSDOWN or PROT_GROWSUP (not
 * both), which extends the range to the start, respectively the end, of
 * the vma containing it; that vma must have the matching VM_GROWS*
 * flag.
 *
 * Returns 0 on success.  Errors produced here: -EINVAL (unaligned
 * start, unknown prot bits, both grow flags, or a grow flag on a vma
 * that does not grow that way), -ENOMEM (range wraps, or is not fully
 * covered by vmas), -EACCES (hugetlb vma, or a permission the vma's
 * VM_MAY* bits do not allow), plus whatever security_file_mprotect()
 * or mprotect_fixup() return.
 */
180asmlinkage long
181sys_mprotect(unsigned long start, size_t len, unsigned long prot)
182{
183 unsigned long vm_flags, nstart, end, tmp, reqprot;
184 struct vm_area_struct *vma, *prev;
185 int error = -EINVAL;
186 const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
187 prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
188 if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
189 return -EINVAL;
190
191 if (start & ~PAGE_MASK)
192 return -EINVAL;
193 if (!len)
194 return 0;
195 len = PAGE_ALIGN(len);
196 end = start + len;
197 if (end <= start)
198 return -ENOMEM;
199 if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM))
200 return -EINVAL;
201
	/* Keep the caller's request; prot itself may gain PROT_EXEC below. */
202 reqprot = prot;
203 /*
204 * Does the application expect PROT_READ to imply PROT_EXEC:
205 */
206 if (unlikely((prot & PROT_READ) &&
207 (current->personality & READ_IMPLIES_EXEC)))
208 prot |= PROT_EXEC;
209
210 vm_flags = calc_vm_prot_bits(prot);
211
212 down_write(&current->mm->mmap_sem);
213
214 vma = find_vma_prev(current->mm, start, &prev);
215 error = -ENOMEM;
216 if (!vma)
217 goto out;
218 if (unlikely(grows & PROT_GROWSDOWN)) {
219 if (vma->vm_start >= end)
220 goto out;
221 start = vma->vm_start;
222 error = -EINVAL;
223 if (!(vma->vm_flags & VM_GROWSDOWN))
224 goto out;
225 }
226 else {
227 if (vma->vm_start > start)
228 goto out;
229 if (unlikely(grows & PROT_GROWSUP)) {
230 end = vma->vm_end;
231 error = -EINVAL;
232 if (!(vma->vm_flags & VM_GROWSUP))
233 goto out;
234 }
235 }
	/* The range starts inside vma, so vma itself precedes the range. */
236 if (start > vma->vm_start)
237 prev = vma;
238
239 for (nstart = start ; ; ) {
240 unsigned long newflags;
241
242 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
243
244 if (is_vm_hugetlb_page(vma)) {
245 error = -EACCES;
246 goto out;
247 }
248
249 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
250
	/*
	 * Shifting right by 4 lines bits 4-7 of newflags up with bits
	 * 0-3; any low bit set without its shifted counterpart is
	 * refused.  (Presumably bits 4-7 are the VM_MAY* permissions
	 * -- the flag layout is not visible here.)
	 */
251 if ((newflags & ~(newflags >> 4)) & 0xf) {
252 error = -EACCES;
253 goto out;
254 }
255
256 error = security_file_mprotect(vma, reqprot, prot);
257 if (error)
258 goto out;
259
	/* Clamp this step to the end of the current vma. */
260 tmp = vma->vm_end;
261 if (tmp > end)
262 tmp = end;
263 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
264 if (error)
265 goto out;
266 nstart = tmp;
267
	/* A merge may have left prev extending past tmp. */
268 if (nstart < prev->vm_end)
269 nstart = prev->vm_end;
270 if (nstart >= end)
271 goto out;
272
273 vma = prev->vm_next;
	/* A hole in the middle of the requested range is an error. */
274 if (!vma || vma->vm_start != nstart) {
275 error = -ENOMEM;
276 goto out;
277 }
278 }
279out:
280 up_write(&current->mm->mmap_sem);
281 return error;
282}
diff --git a/mm/mremap.c b/mm/mremap.c
new file mode 100644
index 00000000000..0d1c1b9c7a0
--- /dev/null
+++ b/mm/mremap.c
@@ -0,0 +1,426 @@
1/*
2 * mm/mremap.c
3 *
4 * (C) Copyright 1996 Linus Torvalds
5 *
6 * Address space accounting code <alan@redhat.com>
7 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
8 */
9
10#include <linux/mm.h>
11#include <linux/hugetlb.h>
12#include <linux/slab.h>
13#include <linux/shm.h>
14#include <linux/mman.h>
15#include <linux/swap.h>
16#include <linux/fs.h>
17#include <linux/highmem.h>
18#include <linux/security.h>
19#include <linux/syscalls.h>
20
21#include <asm/uaccess.h>
22#include <asm/cacheflush.h>
23#include <asm/tlbflush.h>
24
25static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr)
26{
27 pgd_t *pgd;
28 pud_t *pud;
29 pmd_t *pmd;
30 pte_t *pte = NULL;
31
32 pgd = pgd_offset(mm, addr);
33 if (pgd_none_or_clear_bad(pgd))
34 goto end;
35
36 pud = pud_offset(pgd, addr);
37 if (pud_none_or_clear_bad(pud))
38 goto end;
39
40 pmd = pmd_offset(pud, addr);
41 if (pmd_none_or_clear_bad(pmd))
42 goto end;
43
44 pte = pte_offset_map_nested(pmd, addr);
45 if (pte_none(*pte)) {
46 pte_unmap_nested(pte);
47 pte = NULL;
48 }
49end:
50 return pte;
51}
52
53static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
54{
55 pgd_t *pgd;
56 pud_t *pud;
57 pmd_t *pmd;
58
59 pgd = pgd_offset(mm, addr);
60 if (pgd_none_or_clear_bad(pgd))
61 return NULL;
62
63 pud = pud_offset(pgd, addr);
64 if (pud_none_or_clear_bad(pud))
65 return NULL;
66
67 pmd = pmd_offset(pud, addr);
68 if (pmd_none_or_clear_bad(pmd))
69 return NULL;
70
71 return pte_offset_map(pmd, addr);
72}
73
74static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
75{
76 pgd_t *pgd;
77 pud_t *pud;
78 pmd_t *pmd;
79 pte_t *pte = NULL;
80
81 pgd = pgd_offset(mm, addr);
82
83 pud = pud_alloc(mm, pgd, addr);
84 if (!pud)
85 return NULL;
86 pmd = pmd_alloc(mm, pud, addr);
87 if (pmd)
88 pte = pte_alloc_map(mm, pmd, addr);
89 return pte;
90}
91
/*
 * move_one_page - move the pte for @old_addr in @vma to @new_addr.
 *
 * Takes the file mapping's i_mmap_lock (file-backed vmas only) and then
 * mm->page_table_lock.  If there is no pte at @old_addr nothing is
 * moved and 0 is returned.
 *
 * Returns 0 on success, or -ENOMEM when the source pte is still there
 * but no destination pte could be obtained.
 */
92static int
93move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
94 struct vm_area_struct *new_vma, unsigned long new_addr)
95{
96 struct address_space *mapping = NULL;
97 struct mm_struct *mm = vma->vm_mm;
98 int error = 0;
99 pte_t *src, *dst;
100
101 if (vma->vm_file) {
102 /*
103 * Subtle point from Rajesh Venkatasubramanian: before
104 * moving file-based ptes, we must lock vmtruncate out,
105 * since it might clean the dst vma before the src vma,
106 * and we propagate stale pages into the dst afterward.
107 */
108 mapping = vma->vm_file->f_mapping;
109 spin_lock(&mapping->i_mmap_lock);
110 if (new_vma->vm_truncate_count &&
111 new_vma->vm_truncate_count != vma->vm_truncate_count)
112 new_vma->vm_truncate_count = 0;
113 }
114 spin_lock(&mm->page_table_lock);
115
116 src = get_one_pte_map_nested(mm, old_addr);
117 if (src) {
118 /*
119 * Look to see whether alloc_one_pte_map needs to perform a
120 * memory allocation. If it does then we need to drop the
121 * atomic kmap
122 */
123 dst = get_one_pte_map(mm, new_addr);
124 if (unlikely(!dst)) {
125 pte_unmap_nested(src);
126 if (mapping)
127 spin_unlock(&mapping->i_mmap_lock);
128 dst = alloc_one_pte_map(mm, new_addr);
	/*
	 * Take i_mmap_lock back.  If the trylock fails, release
	 * page_table_lock and take both in the order used at the
	 * top of this function.
	 */
129 if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
130 spin_unlock(&mm->page_table_lock);
131 spin_lock(&mapping->i_mmap_lock);
132 spin_lock(&mm->page_table_lock);
133 }
134 src = get_one_pte_map_nested(mm, old_addr);
135 }
136 /*
137 * Since alloc_one_pte_map can drop and re-acquire
138 * page_table_lock, we should re-check the src entry...
139 */
140 if (src) {
141 if (dst) {
142 pte_t pte;
143 pte = ptep_clear_flush(vma, old_addr, src);
144 set_pte_at(mm, new_addr, dst, pte);
145 } else
146 error = -ENOMEM;
147 pte_unmap_nested(src);
148 }
149 if (dst)
150 pte_unmap(dst);
151 }
152 spin_unlock(&mm->page_table_lock);
153 if (mapping)
154 spin_unlock(&mapping->i_mmap_lock);
155 return error;
156}
157
158static unsigned long move_page_tables(struct vm_area_struct *vma,
159 unsigned long old_addr, struct vm_area_struct *new_vma,
160 unsigned long new_addr, unsigned long len)
161{
162 unsigned long offset;
163
164 flush_cache_range(vma, old_addr, old_addr + len);
165
166 /*
167 * This is not the clever way to do this, but we're taking the
168 * easy way out on the assumption that most remappings will be
169 * only a few pages.. This also makes error recovery easier.
170 */
171 for (offset = 0; offset < len; offset += PAGE_SIZE) {
172 if (move_one_page(vma, old_addr + offset,
173 new_vma, new_addr + offset) < 0)
174 break;
175 cond_resched();
176 }
177 return offset;
178}
179
180static unsigned long move_vma(struct vm_area_struct *vma,
181 unsigned long old_addr, unsigned long old_len,
182 unsigned long new_len, unsigned long new_addr)
183{
184 struct mm_struct *mm = vma->vm_mm;
185 struct vm_area_struct *new_vma;
186 unsigned long vm_flags = vma->vm_flags;
187 unsigned long new_pgoff;
188 unsigned long moved_len;
189 unsigned long excess = 0;
190 int split = 0;
191
192 /*
193 * We'd prefer to avoid failure later on in do_munmap:
194 * which may split one vma into three before unmapping.
195 */
196 if (mm->map_count >= sysctl_max_map_count - 3)
197 return -ENOMEM;
198
199 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
200 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
201 if (!new_vma)
202 return -ENOMEM;
203
204 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
205 if (moved_len < old_len) {
206 /*
207 * On error, move entries back from new area to old,
208 * which will succeed since page tables still there,
209 * and then proceed to unmap new area instead of old.
210 */
211 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
212 vma = new_vma;
213 old_len = new_len;
214 old_addr = new_addr;
215 new_addr = -ENOMEM;
216 }
217
218 /* Conceal VM_ACCOUNT so old reservation is not undone */
219 if (vm_flags & VM_ACCOUNT) {
220 vma->vm_flags &= ~VM_ACCOUNT;
221 excess = vma->vm_end - vma->vm_start - old_len;
222 if (old_addr > vma->vm_start &&
223 old_addr + old_len < vma->vm_end)
224 split = 1;
225 }
226
227 if (do_munmap(mm, old_addr, old_len) < 0) {
228 /* OOM: unable to split vma, just get accounts right */
229 vm_unacct_memory(excess >> PAGE_SHIFT);
230 excess = 0;
231 }
232
233 /* Restore VM_ACCOUNT if one or two pieces of vma left */
234 if (excess) {
235 vma->vm_flags |= VM_ACCOUNT;
236 if (split)
237 vma->vm_next->vm_flags |= VM_ACCOUNT;
238 }
239
240 mm->total_vm += new_len >> PAGE_SHIFT;
241 __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
242 if (vm_flags & VM_LOCKED) {
243 mm->locked_vm += new_len >> PAGE_SHIFT;
244 if (new_len > old_len)
245 make_pages_present(new_addr + old_len,
246 new_addr + new_len);
247 }
248
249 return new_addr;
250}
251
252/*
253 * Expand (or shrink) an existing mapping, potentially moving it at the
254 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
255 *
256 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
257 * This option implies MREMAP_MAYMOVE.
258 */
/*
 * @addr:     page-aligned start of the existing mapping
 * @old_len:  current length (rounded up to pages; zero is allowed)
 * @new_len:  requested length (rounded up to pages; must be non-zero)
 * @flags:    MREMAP_MAYMOVE and/or MREMAP_FIXED
 * @new_addr: target address, used only with MREMAP_FIXED
 *
 * Returns the (possibly new) start address on success, or a negative
 * errno through the unsigned long return type.  sys_mremap() calls this
 * with current->mm->mmap_sem held for writing.
 */
259unsigned long do_mremap(unsigned long addr,
260 unsigned long old_len, unsigned long new_len,
261 unsigned long flags, unsigned long new_addr)
262{
263 struct vm_area_struct *vma;
264 unsigned long ret = -EINVAL;
265 unsigned long charged = 0;
266
267 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
268 goto out;
269
270 if (addr & ~PAGE_MASK)
271 goto out;
272
273 old_len = PAGE_ALIGN(old_len);
274 new_len = PAGE_ALIGN(new_len);
275
276 /*
277 * We allow a zero old-len as a special case
278 * for DOS-emu "duplicate shm area" thing. But
279 * a zero new-len is nonsensical.
280 */
281 if (!new_len)
282 goto out;
283
284 /* new_addr is only valid if MREMAP_FIXED is specified */
285 if (flags & MREMAP_FIXED) {
286 if (new_addr & ~PAGE_MASK)
287 goto out;
288 if (!(flags & MREMAP_MAYMOVE))
289 goto out;
290
291 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
292 goto out;
293
294 /* Check if the location we're moving into overlaps the
295 * old location at all, and fail if it does.
296 */
297 if ((new_addr <= addr) && (new_addr+new_len) > addr)
298 goto out;
299
300 if ((addr <= new_addr) && (addr+old_len) > new_addr)
301 goto out;
302
	/* Clear whatever is currently mapped at the fixed target. */
303 ret = do_munmap(current->mm, new_addr, new_len);
304 if (ret)
305 goto out;
306 }
307
308 /*
309 * Always allow a shrinking remap: that just unmaps
310 * the unnecessary pages..
311 * do_munmap does all the needed commit accounting
312 */
313 if (old_len >= new_len) {
314 ret = do_munmap(current->mm, addr+new_len, old_len - new_len);
	/* With equal lengths the zero-length unmap's error is ignored. */
315 if (ret && old_len != new_len)
316 goto out;
317 ret = addr;
318 if (!(flags & MREMAP_FIXED) || (new_addr == addr))
319 goto out;
320 old_len = new_len;
321 }
322
323 /*
324 * Ok, we need to grow.. or relocate.
325 */
326 ret = -EFAULT;
327 vma = find_vma(current->mm, addr);
328 if (!vma || vma->vm_start > addr)
329 goto out;
330 if (is_vm_hugetlb_page(vma)) {
331 ret = -EINVAL;
332 goto out;
333 }
334 /* We can't remap across vm area boundaries */
335 if (old_len > vma->vm_end - addr)
336 goto out;
337 if (vma->vm_flags & VM_DONTEXPAND) {
338 if (new_len > old_len)
339 goto out;
340 }
341 if (vma->vm_flags & VM_LOCKED) {
342 unsigned long locked, lock_limit;
343 locked = current->mm->locked_vm << PAGE_SHIFT;
344 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
345 locked += new_len - old_len;
346 ret = -EAGAIN;
347 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
348 goto out;
349 }
350 ret = -ENOMEM;
351 if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len)
352 > current->signal->rlim[RLIMIT_AS].rlim_cur)
353 goto out;
354
355 if (vma->vm_flags & VM_ACCOUNT) {
356 charged = (new_len - old_len) >> PAGE_SHIFT;
	/* Nothing was charged on failure, so skip the uncharge at out:. */
357 if (security_vm_enough_memory(charged))
358 goto out_nc;
359 }
360
361 /* old_len exactly to the end of the area..
362 * And we're not relocating the area.
363 */
364 if (old_len == vma->vm_end - addr &&
365 !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
366 (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
367 unsigned long max_addr = TASK_SIZE;
368 if (vma->vm_next)
369 max_addr = vma->vm_next->vm_start;
370 /* can we just expand the current mapping? */
371 if (max_addr - addr >= new_len) {
372 int pages = (new_len - old_len) >> PAGE_SHIFT;
373
374 vma_adjust(vma, vma->vm_start,
375 addr + new_len, vma->vm_pgoff, NULL);
376
377 current->mm->total_vm += pages;
378 __vm_stat_account(vma->vm_mm, vma->vm_flags,
379 vma->vm_file, pages);
380 if (vma->vm_flags & VM_LOCKED) {
381 current->mm->locked_vm += pages;
382 make_pages_present(addr + old_len,
383 addr + new_len);
384 }
385 ret = addr;
386 goto out;
387 }
388 }
389
390 /*
391 * We weren't able to just expand or shrink the area,
392 * we need to create a new one and move it..
393 */
394 ret = -ENOMEM;
395 if (flags & MREMAP_MAYMOVE) {
396 if (!(flags & MREMAP_FIXED)) {
397 unsigned long map_flags = 0;
398 if (vma->vm_flags & VM_MAYSHARE)
399 map_flags |= MAP_SHARED;
400
401 new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
402 vma->vm_pgoff, map_flags);
403 ret = new_addr;
	/* A value that is not page aligned is treated as an error code. */
404 if (new_addr & ~PAGE_MASK)
405 goto out;
406 }
407 ret = move_vma(vma, addr, old_len, new_len, new_addr);
408 }
409out:
	/* ret is an error (not page aligned): undo any commit charge. */
410 if (ret & ~PAGE_MASK)
411 vm_unacct_memory(charged);
412out_nc:
413 return ret;
414}
415
416asmlinkage unsigned long sys_mremap(unsigned long addr,
417 unsigned long old_len, unsigned long new_len,
418 unsigned long flags, unsigned long new_addr)
419{
420 unsigned long ret;
421
422 down_write(&current->mm->mmap_sem);
423 ret = do_mremap(addr, old_len, new_len, flags, new_addr);
424 up_write(&current->mm->mmap_sem);
425 return ret;
426}
diff --git a/mm/msync.c b/mm/msync.c
new file mode 100644
index 00000000000..090f426bca7
--- /dev/null
+++ b/mm/msync.c
@@ -0,0 +1,236 @@
1/*
2 * linux/mm/msync.c
3 *
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
6
7/*
8 * The msync() system call.
9 */
10#include <linux/slab.h>
11#include <linux/pagemap.h>
12#include <linux/mm.h>
13#include <linux/mman.h>
14#include <linux/hugetlb.h>
15#include <linux/syscalls.h>
16
17#include <asm/pgtable.h>
18#include <asm/tlbflush.h>
19
20/*
21 * Called with mm->page_table_lock held to protect against other
22 * threads/the swapper from ripping pte's out from under us.
23 */
24
/*
 * For every present pte covering [addr, end) under @pmd whose page has
 * a valid pfn and is not PageReserved: clear the dirty state of the pte
 * and of the page, and if either was dirty mark the page dirty with
 * set_page_dirty().
 */
25static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
26 unsigned long addr, unsigned long end)
27{
28 pte_t *pte;
29
30 pte = pte_offset_map(pmd, addr);
31 do {
32 unsigned long pfn;
33 struct page *page;
34
35 if (!pte_present(*pte))
36 continue;
37 pfn = pte_pfn(*pte);
38 if (!pfn_valid(pfn))
39 continue;
40 page = pfn_to_page(pfn);
41 if (PageReserved(page))
42 continue;
43
44 if (ptep_clear_flush_dirty(vma, addr, pte) ||
45 page_test_and_clear_dirty(page))
46 set_page_dirty(page);
47 } while (pte++, addr += PAGE_SIZE, addr != end);
	/* pte was advanced one past the last entry handled by the loop. */
48 pte_unmap(pte - 1);
49}
50
51static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
52 unsigned long addr, unsigned long end)
53{
54 pmd_t *pmd;
55 unsigned long next;
56
57 pmd = pmd_offset(pud, addr);
58 do {
59 next = pmd_addr_end(addr, end);
60 if (pmd_none_or_clear_bad(pmd))
61 continue;
62 sync_pte_range(vma, pmd, addr, next);
63 } while (pmd++, addr = next, addr != end);
64}
65
66static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
67 unsigned long addr, unsigned long end)
68{
69 pud_t *pud;
70 unsigned long next;
71
72 pud = pud_offset(pgd, addr);
73 do {
74 next = pud_addr_end(addr, end);
75 if (pud_none_or_clear_bad(pud))
76 continue;
77 sync_pmd_range(vma, pud, addr, next);
78 } while (pud++, addr = next, addr != end);
79}
80
/*
 * Transfer pte dirty state to the pages for [addr, end) of @vma by
 * walking the page tables under mm->page_table_lock.  Does nothing for
 * hugetlb vmas.  The range must be non-empty (BUG_ON otherwise).
 */
81static void sync_page_range(struct vm_area_struct *vma,
82 unsigned long addr, unsigned long end)
83{
84 struct mm_struct *mm = vma->vm_mm;
85 pgd_t *pgd;
86 unsigned long next;
87
88 /* For hugepages we can't go walking the page table normally,
89 * but that's ok, hugetlbfs is memory based, so we don't need
90 * to do anything more on an msync() */
91 if (is_vm_hugetlb_page(vma))
92 return;
93
94 BUG_ON(addr >= end);
95 pgd = pgd_offset(mm, addr);
96 flush_cache_range(vma, addr, end);
97 spin_lock(&mm->page_table_lock);
98 do {
99 next = pgd_addr_end(addr, end);
100 if (pgd_none_or_clear_bad(pgd))
101 continue;
102 sync_pud_range(vma, pgd, addr, next);
103 } while (pgd++, addr = next, addr != end);
104 spin_unlock(&mm->page_table_lock);
105}
106
#ifdef CONFIG_PREEMPT
/*
 * Run sync_page_range() over [addr, end) of @vma in pieces of at most
 * 64KB, calling cond_resched() after each piece.
 */
static inline void filemap_sync(struct vm_area_struct *vma,
				unsigned long addr, unsigned long end)
{
	const size_t step = 64 * 1024;	/* bytes */
	unsigned long stop;

	for (;;) {
		stop = addr + step;
		/* Clamp on overshoot and on address wrap-around. */
		if (stop < addr || stop > end)
			stop = end;
		sync_page_range(vma, addr, stop);
		cond_resched();
		if (stop == end)
			break;
		addr = stop;
	}
}
#else
/*
 * Without CONFIG_PREEMPT the whole of [addr, end) is handed to
 * sync_page_range() in a single call.
 */
static inline void filemap_sync(struct vm_area_struct *vma,
				unsigned long addr, unsigned long end)
{
	sync_page_range(vma, addr, end);
}
#endif
129
130/*
131 * MS_SYNC syncs the entire file - including mappings.
132 *
133 * MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just
134 * marks the relevant pages dirty. The application may now run fsync() to
135 * write out the dirty pages and wait on the writeout and check the result.
136 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
137 * async writeout immediately.
138 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
139 * applications.
140 */
/*
 * msync_interval - apply msync @flags to [addr, end) within @vma.
 *
 * Returns -EBUSY when MS_INVALIDATE is requested on a VM_LOCKED vma.
 * For a shared file mapping the dirty state is first pushed to the
 * pages with filemap_sync(); with MS_SYNC the file's mapping is then
 * written out, the file's fsync method (if any) is called, and the
 * writeout is waited for.  The first error seen is returned.  For any
 * other vma nothing is done and 0 is returned.
 */
141static int msync_interval(struct vm_area_struct *vma,
142 unsigned long addr, unsigned long end, int flags)
143{
144 int ret = 0;
145 struct file *file = vma->vm_file;
146
147 if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
148 return -EBUSY;
149
150 if (file && (vma->vm_flags & VM_SHARED)) {
151 filemap_sync(vma, addr, end);
152
153 if (flags & MS_SYNC) {
154 struct address_space *mapping = file->f_mapping;
155 int err;
156
157 ret = filemap_fdatawrite(mapping);
158 if (file->f_op && file->f_op->fsync) {
159 /*
160 * We don't take i_sem here because mmap_sem
161 * is already held.
162 */
163 err = file->f_op->fsync(file,file->f_dentry,1);
164 if (err && !ret)
165 ret = err;
166 }
167 err = filemap_fdatawait(mapping);
168 if (!ret)
169 ret = err;
170 }
171 }
172 return ret;
173}
174
/*
 * msync(2) entry point: apply @flags to every vma overlapping
 * [start, start + len) in the calling process, holding mmap_sem for
 * reading.
 *
 * Returns 0 on success; -EINVAL for unknown flags, an unaligned @start,
 * or MS_ASYNC combined with MS_SYNC; -ENOMEM when the range wraps or
 * contains unmapped addresses (mapped parts are still processed); or
 * the first error from msync_interval().
 *
 * PF_SYNCWRITE is set on the current task for the duration of an
 * MS_SYNC call and cleared on every exit path.
 */
175asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
176{
177 unsigned long end;
178 struct vm_area_struct *vma;
179 int unmapped_error, error = -EINVAL;
180
181 if (flags & MS_SYNC)
182 current->flags |= PF_SYNCWRITE;
183
184 down_read(&current->mm->mmap_sem);
185 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
186 goto out;
187 if (start & ~PAGE_MASK)
188 goto out;
189 if ((flags & MS_ASYNC) && (flags & MS_SYNC))
190 goto out;
191 error = -ENOMEM;
	/* Round len up to a whole number of pages. */
192 len = (len + ~PAGE_MASK) & PAGE_MASK;
193 end = start + len;
194 if (end < start)
195 goto out;
196 error = 0;
197 if (end == start)
198 goto out;
199 /*
200 * If the interval [start,end) covers some unmapped address ranges,
201 * just ignore them, but return -ENOMEM at the end.
202 */
203 vma = find_vma(current->mm, start);
204 unmapped_error = 0;
205 for (;;) {
206 /* Still start < end. */
207 error = -ENOMEM;
208 if (!vma)
209 goto out;
210 /* Here start < vma->vm_end. */
211 if (start < vma->vm_start) {
	/* Hole before this vma: remember it and skip ahead. */
212 unmapped_error = -ENOMEM;
213 start = vma->vm_start;
214 }
215 /* Here vma->vm_start <= start < vma->vm_end. */
216 if (end <= vma->vm_end) {
217 if (start < end) {
218 error = msync_interval(vma, start, end, flags);
219 if (error)
220 goto out;
221 }
222 error = unmapped_error;
223 goto out;
224 }
225 /* Here vma->vm_start <= start < vma->vm_end < end. */
226 error = msync_interval(vma, start, vma->vm_end, flags);
227 if (error)
228 goto out;
229 start = vma->vm_end;
230 vma = vma->vm_next;
231 }
232out:
233 up_read(&current->mm->mmap_sem);
234 current->flags &= ~PF_SYNCWRITE;
235 return error;
236}
diff --git a/mm/nommu.c b/mm/nommu.c
new file mode 100644
index 00000000000..b293ec1cc4e
--- /dev/null
+++ b/mm/nommu.c
@@ -0,0 +1,1180 @@
1/*
2 * linux/mm/nommu.c
3 *
4 * Replacement code for mm functions to support CPU's that don't
5 * have any form of memory management unit (thus no virtual memory).
6 *
7 * See Documentation/nommu-mmap.txt
8 *
9 * Copyright (c) 2004-2005 David Howells <dhowells@redhat.com>
10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
13 */
14
15#include <linux/mm.h>
16#include <linux/mman.h>
17#include <linux/swap.h>
18#include <linux/file.h>
19#include <linux/highmem.h>
20#include <linux/pagemap.h>
21#include <linux/slab.h>
22#include <linux/vmalloc.h>
23#include <linux/ptrace.h>
24#include <linux/blkdev.h>
25#include <linux/backing-dev.h>
26#include <linux/mount.h>
27#include <linux/personality.h>
28#include <linux/security.h>
29#include <linux/syscalls.h>
30
31#include <asm/uaccess.h>
32#include <asm/tlb.h>
33#include <asm/tlbflush.h>
34
/* core memory-layout globals defined here for !MMU builds */
35void *high_memory;
36struct page *mem_map;
37unsigned long max_mapnr;
38unsigned long num_physpages;
/*
 * Running byte totals for mappings and their bookkeeping structures:
 * askedalloc sums the sizes that were requested, realalloc sums what
 * kobjsize() reports for the same objects.  They are adjusted in
 * do_mmap_pgoff(), put_vma(), do_munmap(), exit_mmap() and do_mremap().
 */
39unsigned long askedalloc, realalloc;
/* compared against the permitted limit in __vm_enough_memory() */
40atomic_t vm_committed_space = ATOMIC_INIT(0);
41int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
42int sysctl_overcommit_ratio = 50; /* default is 50% */
43int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
44int heap_stack_gap = 0;
45
46EXPORT_SYMBOL(mem_map);
47EXPORT_SYMBOL(sysctl_max_map_count);
48EXPORT_SYMBOL(sysctl_overcommit_memory);
49EXPORT_SYMBOL(sysctl_overcommit_ratio);
50EXPORT_SYMBOL(vm_committed_space);
51EXPORT_SYMBOL(__vm_enough_memory);
52
53/* list of shareable VMAs */
/*
 * rbtree ordered by vm_start (equal starts are ordered by VMA address, see
 * add_nommu_vma()); nommu_vma_sem is taken for writing around tree updates
 * in do_mmap_pgoff() and put_vma().
 */
54struct rb_root nommu_vma_tree = RB_ROOT;
55DECLARE_RWSEM(nommu_vma_sem);
56
/* intentionally empty operations table */
57struct vm_operations_struct generic_file_vm_ops = {
58};
59
60/*
61 * Handle all mappings that got truncated by a "truncate()"
62 * system call.
63 *
64 * NOTE! We have to be ready to update the memory sharing
65 * between the file and the memory map for a potential last
66 * incomplete page. Ugly, but necessary.
67 */
68int vmtruncate(struct inode *inode, loff_t offset)
69{
70 struct address_space *mapping = inode->i_mapping;
71 unsigned long limit;
72
73 if (inode->i_size < offset)
74 goto do_expand;
75 i_size_write(inode, offset);
76
77 truncate_inode_pages(mapping, offset);
78 goto out_truncate;
79
80do_expand:
81 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
82 if (limit != RLIM_INFINITY && offset > limit)
83 goto out_sig;
84 if (offset > inode->i_sb->s_maxbytes)
85 goto out;
86 i_size_write(inode, offset);
87
88out_truncate:
89 if (inode->i_op && inode->i_op->truncate)
90 inode->i_op->truncate(inode);
91 return 0;
92out_sig:
93 send_sig(SIGXFSZ, current, 0);
94out:
95 return -EFBIG;
96}
97
98EXPORT_SYMBOL(vmtruncate);
99
100/*
101 * Return the total memory allocated for this pointer, not
102 * just what the caller asked for.
103 *
104 * Doesn't have to be accurate, i.e. may have races.
105 */
106unsigned int kobjsize(const void *objp)
107{
108 struct page *page;
109
110 if (!objp || !((page = virt_to_page(objp))))
111 return 0;
112
113 if (PageSlab(page))
114 return ksize(objp);
115
116 BUG_ON(page->index < 0);
117 BUG_ON(page->index >= MAX_ORDER);
118
119 return (PAGE_SIZE << page->index);
120}
121
122/*
123 * The nommu dodgy version :-)
124 */
125int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
126 unsigned long start, int len, int write, int force,
127 struct page **pages, struct vm_area_struct **vmas)
128{
129 int i;
130 static struct vm_area_struct dummy_vma;
131
132 for (i = 0; i < len; i++) {
133 if (pages) {
134 pages[i] = virt_to_page(start);
135 if (pages[i])
136 page_cache_get(pages[i]);
137 }
138 if (vmas)
139 vmas[i] = &dummy_vma;
140 start += PAGE_SIZE;
141 }
142 return(i);
143}
144
145DEFINE_RWLOCK(vmlist_lock);
146struct vm_struct *vmlist;
147
/*
 * Free a vmalloc-family allocation.  __vmalloc() in this file allocates
 * with kmalloc(), so the memory is returned with kfree().
 */
void vfree(void *addr)
{
	kfree(addr);
}
152
153void *__vmalloc(unsigned long size, int gfp_mask, pgprot_t prot)
154{
155 /*
156 * kmalloc doesn't like __GFP_HIGHMEM for some reason
157 */
158 return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM);
159}
160
/* Translate a vmalloc address to its struct page via virt_to_page(). */
struct page *vmalloc_to_page(void *addr)
{
	struct page *pg = virt_to_page(addr);

	return pg;
}
165
/* Translate a vmalloc address to a page frame number. */
unsigned long vmalloc_to_pfn(void *addr)
{
	struct page *pg = virt_to_page(addr);

	return page_to_pfn(pg);
}
170
171
/*
 * Copy @count bytes from the vmalloc-side address @addr into @buf.
 *
 * As in vwrite(), @count is clamped so that @addr + @count cannot wrap
 * around the top of the address space.  Returns the number of bytes copied.
 */
long vread(char *buf, char *addr, unsigned long count)
{
	/* Don't allow overflow */
	if ((unsigned long) addr + count < count)
		count = -(unsigned long) addr;

	memcpy(buf, addr, count);
	return count;
}
177
/*
 * Copy @count bytes from @buf to the vmalloc-side address @addr.
 *
 * @count is first clamped so that @addr + @count cannot wrap around the top
 * of the address space.  Returns the number of bytes copied.
 */
long vwrite(char *buf, char *addr, unsigned long count)
{
	unsigned long dest = (unsigned long) addr;

	/* would dest + count wrap?  then stop at the end of the address space */
	if (dest + count < count)
		count = -dest;

	memcpy(addr, buf, count);
	return count;
}
187
188/*
189 * vmalloc - allocate virtually continguos memory
190 *
191 * @size: allocation size
192 *
193 * Allocate enough pages to cover @size from the page level
194 * allocator and map them into continguos kernel virtual space.
195 *
196 * For tight cotrol over page level allocator and protection flags
197 * use __vmalloc() instead.
198 */
199void *vmalloc(unsigned long size)
200{
201 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
202}
203
204/*
205 * vmalloc_32 - allocate virtually continguos memory (32bit addressable)
206 *
207 * @size: allocation size
208 *
209 * Allocate enough 32bit PA addressable pages to cover @size from the
210 * page level allocator and map them into continguos kernel virtual space.
211 */
212void *vmalloc_32(unsigned long size)
213{
214 return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
215}
216
217void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
218{
219 BUG();
220 return NULL;
221}
222
/* Counterpart of vmap(): not supported here, so any call is a kernel bug. */
void vunmap(void *addr)
{
	BUG();
}
227
228/*
229 * sys_brk() for the most part doesn't need the global kernel
230 * lock, except when an application is doing something nasty
231 * like trying to un-brk an area that has already been mapped
232 * to a regular file. in this case, the unmapping will need
233 * to invoke file system routines that need the global lock.
234 */
235asmlinkage unsigned long sys_brk(unsigned long brk)
236{
237 struct mm_struct *mm = current->mm;
238
239 if (brk < mm->start_brk || brk > mm->context.end_brk)
240 return mm->brk;
241
242 if (mm->brk == brk)
243 return mm->brk;
244
245 /*
246 * Always allow shrinking brk
247 */
248 if (brk <= mm->brk) {
249 mm->brk = brk;
250 return brk;
251 }
252
253 /*
254 * Ok, looks good - let it rip.
255 */
256 return mm->brk = brk;
257}
258
#ifdef DEBUG
/*
 * Debugging aid: print every entry on the current process's mapping list
 * together with the VMA it refers to (allocated size, start address and
 * usage count).
 */
static void show_process_blocks(void)
{
	struct vm_list_struct *vml;

	printk("Process blocks %d:", current->pid);

	/* context.vmlist is itself the head pointer, so start from its value
	 * rather than from its address */
	for (vml = current->mm->context.vmlist; vml; vml = vml->next) {
		printk(" %p: %p", vml, vml->vma);
		if (vml->vma)
			printk(" (%u @%lx #%d)",
			       kobjsize((void *) vml->vma->vm_start),
			       vml->vma->vm_start,
			       atomic_read(&vml->vma->vm_usage));
		printk(vml->next ? " ->" : ".\n");
	}
}
#endif /* DEBUG */
277
278static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
279{
280 struct vm_area_struct *vma;
281 struct rb_node *n = nommu_vma_tree.rb_node;
282
283 while (n) {
284 vma = rb_entry(n, struct vm_area_struct, vm_rb);
285
286 if (start < vma->vm_start)
287 n = n->rb_left;
288 else if (start > vma->vm_start)
289 n = n->rb_right;
290 else
291 return vma;
292 }
293
294 return NULL;
295}
296
297static void add_nommu_vma(struct vm_area_struct *vma)
298{
299 struct vm_area_struct *pvma;
300 struct address_space *mapping;
301 struct rb_node **p = &nommu_vma_tree.rb_node;
302 struct rb_node *parent = NULL;
303
304 /* add the VMA to the mapping */
305 if (vma->vm_file) {
306 mapping = vma->vm_file->f_mapping;
307
308 flush_dcache_mmap_lock(mapping);
309 vma_prio_tree_insert(vma, &mapping->i_mmap);
310 flush_dcache_mmap_unlock(mapping);
311 }
312
313 /* add the VMA to the master list */
314 while (*p) {
315 parent = *p;
316 pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
317
318 if (vma->vm_start < pvma->vm_start) {
319 p = &(*p)->rb_left;
320 }
321 else if (vma->vm_start > pvma->vm_start) {
322 p = &(*p)->rb_right;
323 }
324 else {
325 /* mappings are at the same address - this can only
326 * happen for shared-mem chardevs and shared file
327 * mappings backed by ramfs/tmpfs */
328 BUG_ON(!(pvma->vm_flags & VM_SHARED));
329
330 if (vma < pvma)
331 p = &(*p)->rb_left;
332 else if (vma > pvma)
333 p = &(*p)->rb_right;
334 else
335 BUG();
336 }
337 }
338
339 rb_link_node(&vma->vm_rb, parent, p);
340 rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
341}
342
343static void delete_nommu_vma(struct vm_area_struct *vma)
344{
345 struct address_space *mapping;
346
347 /* remove the VMA from the mapping */
348 if (vma->vm_file) {
349 mapping = vma->vm_file->f_mapping;
350
351 flush_dcache_mmap_lock(mapping);
352 vma_prio_tree_remove(vma, &mapping->i_mmap);
353 flush_dcache_mmap_unlock(mapping);
354 }
355
356 /* remove from the master list */
357 rb_erase(&vma->vm_rb, &nommu_vma_tree);
358}
359
360/*
361 * determine whether a mapping should be permitted and, if so, what sort of
362 * mapping we're capable of supporting
363 */
364static int validate_mmap_request(struct file *file,
365 unsigned long addr,
366 unsigned long len,
367 unsigned long prot,
368 unsigned long flags,
369 unsigned long pgoff,
370 unsigned long *_capabilities)
371{
372 unsigned long capabilities;
373 unsigned long reqprot = prot;
374 int ret;
375
376 /* do the simple checks first */
377 if (flags & MAP_FIXED || addr) {
378 printk(KERN_DEBUG
379 "%d: Can't do fixed-address/overlay mmap of RAM\n",
380 current->pid);
381 return -EINVAL;
382 }
383
384 if ((flags & MAP_TYPE) != MAP_PRIVATE &&
385 (flags & MAP_TYPE) != MAP_SHARED)
386 return -EINVAL;
387
388 if (PAGE_ALIGN(len) == 0)
389 return addr;
390
391 if (len > TASK_SIZE)
392 return -EINVAL;
393
394 /* offset overflow? */
395 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
396 return -EINVAL;
397
398 if (file) {
399 /* validate file mapping requests */
400 struct address_space *mapping;
401
402 /* files must support mmap */
403 if (!file->f_op || !file->f_op->mmap)
404 return -ENODEV;
405
406 /* work out if what we've got could possibly be shared
407 * - we support chardevs that provide their own "memory"
408 * - we support files/blockdevs that are memory backed
409 */
410 mapping = file->f_mapping;
411 if (!mapping)
412 mapping = file->f_dentry->d_inode->i_mapping;
413
414 capabilities = 0;
415 if (mapping && mapping->backing_dev_info)
416 capabilities = mapping->backing_dev_info->capabilities;
417
418 if (!capabilities) {
419 /* no explicit capabilities set, so assume some
420 * defaults */
421 switch (file->f_dentry->d_inode->i_mode & S_IFMT) {
422 case S_IFREG:
423 case S_IFBLK:
424 capabilities = BDI_CAP_MAP_COPY;
425 break;
426
427 case S_IFCHR:
428 capabilities =
429 BDI_CAP_MAP_DIRECT |
430 BDI_CAP_READ_MAP |
431 BDI_CAP_WRITE_MAP;
432 break;
433
434 default:
435 return -EINVAL;
436 }
437 }
438
439 /* eliminate any capabilities that we can't support on this
440 * device */
441 if (!file->f_op->get_unmapped_area)
442 capabilities &= ~BDI_CAP_MAP_DIRECT;
443 if (!file->f_op->read)
444 capabilities &= ~BDI_CAP_MAP_COPY;
445
446 if (flags & MAP_SHARED) {
447 /* do checks for writing, appending and locking */
448 if ((prot & PROT_WRITE) &&
449 !(file->f_mode & FMODE_WRITE))
450 return -EACCES;
451
452 if (IS_APPEND(file->f_dentry->d_inode) &&
453 (file->f_mode & FMODE_WRITE))
454 return -EACCES;
455
456 if (locks_verify_locked(file->f_dentry->d_inode))
457 return -EAGAIN;
458
459 if (!(capabilities & BDI_CAP_MAP_DIRECT))
460 return -ENODEV;
461
462 if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
463 ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
464 ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
465 ) {
466 printk("MAP_SHARED not completely supported on !MMU\n");
467 return -EINVAL;
468 }
469
470 /* we mustn't privatise shared mappings */
471 capabilities &= ~BDI_CAP_MAP_COPY;
472 }
473 else {
474 /* we're going to read the file into private memory we
475 * allocate */
476 if (!(capabilities & BDI_CAP_MAP_COPY))
477 return -ENODEV;
478
479 /* we don't permit a private writable mapping to be
480 * shared with the backing device */
481 if (prot & PROT_WRITE)
482 capabilities &= ~BDI_CAP_MAP_DIRECT;
483 }
484
485 /* handle executable mappings and implied executable
486 * mappings */
487 if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
488 if (prot & PROT_EXEC)
489 return -EPERM;
490 }
491 else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
492 /* handle implication of PROT_EXEC by PROT_READ */
493 if (current->personality & READ_IMPLIES_EXEC) {
494 if (capabilities & BDI_CAP_EXEC_MAP)
495 prot |= PROT_EXEC;
496 }
497 }
498 else if ((prot & PROT_READ) &&
499 (prot & PROT_EXEC) &&
500 !(capabilities & BDI_CAP_EXEC_MAP)
501 ) {
502 /* backing file is not executable, try to copy */
503 capabilities &= ~BDI_CAP_MAP_DIRECT;
504 }
505 }
506 else {
507 /* anonymous mappings are always memory backed and can be
508 * privately mapped
509 */
510 capabilities = BDI_CAP_MAP_COPY;
511
512 /* handle PROT_EXEC implication by PROT_READ */
513 if ((prot & PROT_READ) &&
514 (current->personality & READ_IMPLIES_EXEC))
515 prot |= PROT_EXEC;
516 }
517
518 /* allow the security API to have its say */
519 ret = security_file_mmap(file, reqprot, prot, flags);
520 if (ret < 0)
521 return ret;
522
523 /* looks okay */
524 *_capabilities = capabilities;
525 return 0;
526}
527
528/*
529 * we've determined that we can make the mapping, now translate what we
530 * now know into VMA flags
531 */
532static unsigned long determine_vm_flags(struct file *file,
533 unsigned long prot,
534 unsigned long flags,
535 unsigned long capabilities)
536{
537 unsigned long vm_flags;
538
539 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
540 vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
541 /* vm_flags |= mm->def_flags; */
542
543 if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
544 /* attempt to share read-only copies of mapped file chunks */
545 if (file && !(prot & PROT_WRITE))
546 vm_flags |= VM_MAYSHARE;
547 }
548 else {
549 /* overlay a shareable mapping on the backing device or inode
550 * if possible - used for chardevs, ramfs/tmpfs/shmfs and
551 * romfs/cramfs */
552 if (flags & MAP_SHARED)
553 vm_flags |= VM_MAYSHARE | VM_SHARED;
554 else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0)
555 vm_flags |= VM_MAYSHARE;
556 }
557
558 /* refuse to let anyone share private mappings with this process if
559 * it's being traced - otherwise breakpoints set in it may interfere
560 * with another untraced process
561 */
562 if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED))
563 vm_flags &= ~VM_MAYSHARE;
564
565 return vm_flags;
566}
567
568/*
569 * set up a shared mapping on a file
570 */
571static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len)
572{
573 int ret;
574
575 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
576 if (ret != -ENOSYS)
577 return ret;
578
579 /* getting an ENOSYS error indicates that direct mmap isn't
580 * possible (as opposed to tried but failed) so we'll fall
581 * through to making a private copy of the data and mapping
582 * that if we can */
583 return -ENODEV;
584}
585
586/*
587 * set up a private mapping or an anonymous shared mapping
588 */
589static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
590{
591 void *base;
592 int ret;
593
594 /* invoke the file's mapping function so that it can keep track of
595 * shared mappings on devices or memory
596 * - VM_MAYSHARE will be set if it may attempt to share
597 */
598 if (vma->vm_file) {
599 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
600 if (ret != -ENOSYS) {
601 /* shouldn't return success if we're not sharing */
602 BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE));
603 return ret; /* success or a real error */
604 }
605
606 /* getting an ENOSYS error indicates that direct mmap isn't
607 * possible (as opposed to tried but failed) so we'll try to
608 * make a private copy of the data and map that instead */
609 }
610
611 /* allocate some memory to hold the mapping
612 * - note that this may not return a page-aligned address if the object
613 * we're allocating is smaller than a page
614 */
615 base = kmalloc(len, GFP_KERNEL);
616 if (!base)
617 goto enomem;
618
619 vma->vm_start = (unsigned long) base;
620 vma->vm_end = vma->vm_start + len;
621 vma->vm_flags |= VM_MAPPED_COPY;
622
623#ifdef WARN_ON_SLACK
624 if (len + WARN_ON_SLACK <= kobjsize(result))
625 printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n",
626 len, current->pid, kobjsize(result) - len);
627#endif
628
629 if (vma->vm_file) {
630 /* read the contents of a file into the copy */
631 mm_segment_t old_fs;
632 loff_t fpos;
633
634 fpos = vma->vm_pgoff;
635 fpos <<= PAGE_SHIFT;
636
637 old_fs = get_fs();
638 set_fs(KERNEL_DS);
639 ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
640 set_fs(old_fs);
641
642 if (ret < 0)
643 goto error_free;
644
645 /* clear the last little bit */
646 if (ret < len)
647 memset(base + ret, 0, len - ret);
648
649 } else {
650 /* if it's an anonymous mapping, then just clear it */
651 memset(base, 0, len);
652 }
653
654 return 0;
655
656error_free:
657 kfree(base);
658 vma->vm_start = 0;
659 return ret;
660
661enomem:
662 printk("Allocation of length %lu from process %d failed\n",
663 len, current->pid);
664 show_free_areas();
665 return -ENOMEM;
666}
667
668/*
669 * handle mapping creation for uClinux
670 */
671unsigned long do_mmap_pgoff(struct file *file,
672 unsigned long addr,
673 unsigned long len,
674 unsigned long prot,
675 unsigned long flags,
676 unsigned long pgoff)
677{
678 struct vm_list_struct *vml = NULL;
679 struct vm_area_struct *vma = NULL;
680 struct rb_node *rb;
681 unsigned long capabilities, vm_flags;
682 void *result;
683 int ret;
684
685 /* decide whether we should attempt the mapping, and if so what sort of
686 * mapping */
687 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
688 &capabilities);
689 if (ret < 0)
690 return ret;
691
692 /* we've determined that we can make the mapping, now translate what we
693 * now know into VMA flags */
694 vm_flags = determine_vm_flags(file, prot, flags, capabilities);
695
696 /* we're going to need to record the mapping if it works */
697 vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
698 if (!vml)
699 goto error_getting_vml;
700 memset(vml, 0, sizeof(*vml));
701
702 down_write(&nommu_vma_sem);
703
704 /* if we want to share, we need to check for VMAs created by other
705 * mmap() calls that overlap with our proposed mapping
706 * - we can only share with an exact match on most regular files
707 * - shared mappings on character devices and memory backed files are
708 * permitted to overlap inexactly as far as we are concerned for in
709 * these cases, sharing is handled in the driver or filesystem rather
710 * than here
711 */
712 if (vm_flags & VM_MAYSHARE) {
713 unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
714 unsigned long vmpglen;
715
716 for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) {
717 vma = rb_entry(rb, struct vm_area_struct, vm_rb);
718
719 if (!(vma->vm_flags & VM_MAYSHARE))
720 continue;
721
722 /* search for overlapping mappings on the same file */
723 if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode)
724 continue;
725
726 if (vma->vm_pgoff >= pgoff + pglen)
727 continue;
728
729 vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1;
730 vmpglen >>= PAGE_SHIFT;
731 if (pgoff >= vma->vm_pgoff + vmpglen)
732 continue;
733
734 /* handle inexactly overlapping matches between mappings */
735 if (vma->vm_pgoff != pgoff || vmpglen != pglen) {
736 if (!(capabilities & BDI_CAP_MAP_DIRECT))
737 goto sharing_violation;
738 continue;
739 }
740
741 /* we've found a VMA we can share */
742 atomic_inc(&vma->vm_usage);
743
744 vml->vma = vma;
745 result = (void *) vma->vm_start;
746 goto shared;
747 }
748
749 vma = NULL;
750
751 /* obtain the address at which to make a shared mapping
752 * - this is the hook for quasi-memory character devices to
753 * tell us the location of a shared mapping
754 */
755 if (file && file->f_op->get_unmapped_area) {
756 addr = file->f_op->get_unmapped_area(file, addr, len,
757 pgoff, flags);
758 if (IS_ERR((void *) addr)) {
759 ret = addr;
760 if (ret != (unsigned long) -ENOSYS)
761 goto error;
762
763 /* the driver refused to tell us where to site
764 * the mapping so we'll have to attempt to copy
765 * it */
766 ret = (unsigned long) -ENODEV;
767 if (!(capabilities & BDI_CAP_MAP_COPY))
768 goto error;
769
770 capabilities &= ~BDI_CAP_MAP_DIRECT;
771 }
772 }
773 }
774
775 /* we're going to need a VMA struct as well */
776 vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
777 if (!vma)
778 goto error_getting_vma;
779
780 memset(vma, 0, sizeof(*vma));
781 INIT_LIST_HEAD(&vma->anon_vma_node);
782 atomic_set(&vma->vm_usage, 1);
783 if (file)
784 get_file(file);
785 vma->vm_file = file;
786 vma->vm_flags = vm_flags;
787 vma->vm_start = addr;
788 vma->vm_end = addr + len;
789 vma->vm_pgoff = pgoff;
790
791 vml->vma = vma;
792
793 /* set up the mapping */
794 if (file && vma->vm_flags & VM_SHARED)
795 ret = do_mmap_shared_file(vma, len);
796 else
797 ret = do_mmap_private(vma, len);
798 if (ret < 0)
799 goto error;
800
801 /* okay... we have a mapping; now we have to register it */
802 result = (void *) vma->vm_start;
803
804 if (vma->vm_flags & VM_MAPPED_COPY) {
805 realalloc += kobjsize(result);
806 askedalloc += len;
807 }
808
809 realalloc += kobjsize(vma);
810 askedalloc += sizeof(*vma);
811
812 current->mm->total_vm += len >> PAGE_SHIFT;
813
814 add_nommu_vma(vma);
815
816 shared:
817 realalloc += kobjsize(vml);
818 askedalloc += sizeof(*vml);
819
820 vml->next = current->mm->context.vmlist;
821 current->mm->context.vmlist = vml;
822
823 up_write(&nommu_vma_sem);
824
825 if (prot & PROT_EXEC)
826 flush_icache_range((unsigned long) result,
827 (unsigned long) result + len);
828
829#ifdef DEBUG
830 printk("do_mmap:\n");
831 show_process_blocks();
832#endif
833
834 return (unsigned long) result;
835
836 error:
837 up_write(&nommu_vma_sem);
838 kfree(vml);
839 if (vma) {
840 fput(vma->vm_file);
841 kfree(vma);
842 }
843 return ret;
844
845 sharing_violation:
846 up_write(&nommu_vma_sem);
847 printk("Attempt to share mismatched mappings\n");
848 kfree(vml);
849 return -EINVAL;
850
851 error_getting_vma:
852 up_write(&nommu_vma_sem);
853 kfree(vml);
854 printk("Allocation of vml for %lu byte allocation from process %d failed\n",
855 len, current->pid);
856 show_free_areas();
857 return -ENOMEM;
858
859 error_getting_vml:
860 printk("Allocation of vml for %lu byte allocation from process %d failed\n",
861 len, current->pid);
862 show_free_areas();
863 return -ENOMEM;
864}
865
866/*
867 * handle mapping disposal for uClinux
868 */
869static void put_vma(struct vm_area_struct *vma)
870{
871 if (vma) {
872 down_write(&nommu_vma_sem);
873
874 if (atomic_dec_and_test(&vma->vm_usage)) {
875 delete_nommu_vma(vma);
876
877 if (vma->vm_ops && vma->vm_ops->close)
878 vma->vm_ops->close(vma);
879
880 /* IO memory and memory shared directly out of the pagecache from
881 * ramfs/tmpfs mustn't be released here */
882 if (vma->vm_flags & VM_MAPPED_COPY) {
883 realalloc -= kobjsize((void *) vma->vm_start);
884 askedalloc -= vma->vm_end - vma->vm_start;
885 kfree((void *) vma->vm_start);
886 }
887
888 realalloc -= kobjsize(vma);
889 askedalloc -= sizeof(*vma);
890
891 if (vma->vm_file)
892 fput(vma->vm_file);
893 kfree(vma);
894 }
895
896 up_write(&nommu_vma_sem);
897 }
898}
899
900int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
901{
902 struct vm_list_struct *vml, **parent;
903 unsigned long end = addr + len;
904
905#ifdef DEBUG
906 printk("do_munmap:\n");
907#endif
908
909 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next)
910 if ((*parent)->vma->vm_start == addr &&
911 (*parent)->vma->vm_end == end)
912 goto found;
913
914 printk("munmap of non-mmaped memory by process %d (%s): %p\n",
915 current->pid, current->comm, (void *) addr);
916 return -EINVAL;
917
918 found:
919 vml = *parent;
920
921 put_vma(vml->vma);
922
923 *parent = vml->next;
924 realalloc -= kobjsize(vml);
925 askedalloc -= sizeof(*vml);
926 kfree(vml);
927 mm->total_vm -= len >> PAGE_SHIFT;
928
929#ifdef DEBUG
930 show_process_blocks();
931#endif
932
933 return 0;
934}
935
936/* Release all mmaps. */
937void exit_mmap(struct mm_struct * mm)
938{
939 struct vm_list_struct *tmp;
940
941 if (mm) {
942#ifdef DEBUG
943 printk("Exit_mmap:\n");
944#endif
945
946 mm->total_vm = 0;
947
948 while ((tmp = mm->context.vmlist)) {
949 mm->context.vmlist = tmp->next;
950 put_vma(tmp->vma);
951
952 realalloc -= kobjsize(tmp);
953 askedalloc -= sizeof(*tmp);
954 kfree(tmp);
955 }
956
957#ifdef DEBUG
958 show_process_blocks();
959#endif
960 }
961}
962
963asmlinkage long sys_munmap(unsigned long addr, size_t len)
964{
965 int ret;
966 struct mm_struct *mm = current->mm;
967
968 down_write(&mm->mmap_sem);
969 ret = do_munmap(mm, addr, len);
970 up_write(&mm->mmap_sem);
971 return ret;
972}
973
/* Not supported in this file: every request fails with -ENOMEM. */
unsigned long do_brk(unsigned long addr, unsigned long len)
{
	return (unsigned long) -ENOMEM;
}
978
979/*
980 * Expand (or shrink) an existing mapping, potentially moving it at the
981 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
982 *
983 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
984 * This option implies MREMAP_MAYMOVE.
985 *
986 * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the
987 * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable
988 */
989unsigned long do_mremap(unsigned long addr,
990 unsigned long old_len, unsigned long new_len,
991 unsigned long flags, unsigned long new_addr)
992{
993 struct vm_list_struct *vml = NULL;
994
995 /* insanity checks first */
996 if (new_len == 0)
997 return (unsigned long) -EINVAL;
998
999 if (flags & MREMAP_FIXED && new_addr != addr)
1000 return (unsigned long) -EINVAL;
1001
1002 for (vml = current->mm->context.vmlist; vml; vml = vml->next)
1003 if (vml->vma->vm_start == addr)
1004 goto found;
1005
1006 return (unsigned long) -EINVAL;
1007
1008 found:
1009 if (vml->vma->vm_end != vml->vma->vm_start + old_len)
1010 return (unsigned long) -EFAULT;
1011
1012 if (vml->vma->vm_flags & VM_MAYSHARE)
1013 return (unsigned long) -EPERM;
1014
1015 if (new_len > kobjsize((void *) addr))
1016 return (unsigned long) -ENOMEM;
1017
1018 /* all checks complete - do it */
1019 vml->vma->vm_end = vml->vma->vm_start + new_len;
1020
1021 askedalloc -= old_len;
1022 askedalloc += new_len;
1023
1024 return vml->vma->vm_start;
1025}
1026
1027/*
1028 * Look up the first VMA which satisfies addr < vm_end, NULL if none
1029 */
1030struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1031{
1032 struct vm_list_struct *vml;
1033
1034 for (vml = mm->context.vmlist; vml; vml = vml->next)
1035 if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end)
1036 return vml->vma;
1037
1038 return NULL;
1039}
1040
1041EXPORT_SYMBOL(find_vma);
1042
1043struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
1044{
1045 return NULL;
1046}
1047
1048struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
1049{
1050 return NULL;
1051}
1052
1053int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
1054 unsigned long to, unsigned long size, pgprot_t prot)
1055{
1056 return -EPERM;
1057}
1058
/* Stub: intentionally does nothing. */
void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}
1062
/* Stub: no address is ever chosen here; every request fails with -ENOMEM. */
unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
	unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return (unsigned long) -ENOMEM;
}
1068
/* Stub: intentionally does nothing. */
void arch_unmap_area(struct vm_area_struct *area)
{
}
1072
1073void update_mem_hiwater(struct task_struct *tsk)
1074{
1075 unsigned long rss = get_mm_counter(tsk->mm, rss);
1076
1077 if (likely(tsk->mm)) {
1078 if (tsk->mm->hiwater_rss < rss)
1079 tsk->mm->hiwater_rss = rss;
1080 if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
1081 tsk->mm->hiwater_vm = tsk->mm->total_vm;
1082 }
1083}
1084
1085void unmap_mapping_range(struct address_space *mapping,
1086 loff_t const holebegin, loff_t const holelen,
1087 int even_cows)
1088{
1089}
1090
1091/*
1092 * Check that a process has enough memory to allocate a new virtual
1093 * mapping. 0 means there is enough memory for the allocation to
1094 * succeed and -ENOMEM implies there is not.
1095 *
1096 * We currently support three overcommit policies, which are set via the
1097 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
1098 *
1099 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
1100 * Additional code 2002 Jul 20 by Robert Love.
1101 *
1102 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
1103 *
1104 * Note this is a helper function intended to be used by LSMs which
1105 * wish to use this logic.
1106 */
1107int __vm_enough_memory(long pages, int cap_sys_admin)
1108{
1109 unsigned long free, allowed;
1110
1111 vm_acct_memory(pages);
1112
1113 /*
1114 * Sometimes we want to use more memory than we have
1115 */
1116 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
1117 return 0;
1118
1119 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
1120 unsigned long n;
1121
1122 free = get_page_cache_size();
1123 free += nr_swap_pages;
1124
1125 /*
1126 * Any slabs which are created with the
1127 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
1128 * which are reclaimable, under pressure. The dentry
1129 * cache and most inode caches should fall into this
1130 */
1131 free += atomic_read(&slab_reclaim_pages);
1132
1133 /*
1134 * Leave the last 3% for root
1135 */
1136 if (!cap_sys_admin)
1137 free -= free / 32;
1138
1139 if (free > pages)
1140 return 0;
1141
1142 /*
1143 * nr_free_pages() is very expensive on large systems,
1144 * only call if we're about to fail.
1145 */
1146 n = nr_free_pages();
1147 if (!cap_sys_admin)
1148 n -= n / 32;
1149 free += n;
1150
1151 if (free > pages)
1152 return 0;
1153 vm_unacct_memory(pages);
1154 return -ENOMEM;
1155 }
1156
1157 allowed = totalram_pages * sysctl_overcommit_ratio / 100;
1158 /*
1159 * Leave the last 3% for root
1160 */
1161 if (!cap_sys_admin)
1162 allowed -= allowed / 32;
1163 allowed += total_swap_pages;
1164
1165 /* Don't let a single process grow too big:
1166 leave 3% of the size of this process for other processes */
1167 allowed -= current->mm->total_vm / 32;
1168
1169 if (atomic_read(&vm_committed_space) < allowed)
1170 return 0;
1171
1172 vm_unacct_memory(pages);
1173
1174 return -ENOMEM;
1175}
1176
/* Stub: no address is ever reported as lying in a gate area. */
int in_gate_area_no_task(unsigned long addr)
{
	return 0;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
new file mode 100644
index 00000000000..9595a0f6c4b
--- /dev/null
+++ b/mm/oom_kill.c
@@ -0,0 +1,292 @@
1/*
2 * linux/mm/oom_kill.c
3 *
4 * Copyright (C) 1998,2000 Rik van Riel
5 * Thanks go out to Claus Fischer for some serious inspiration and
6 * for goading me into coding this file...
7 *
8 * The routines in this file are used to kill a process when
9 * we're seriously out of memory. This gets called from kswapd()
10 * in linux/mm/vmscan.c when we really run out of memory.
11 *
12 * Since we won't call these routines often (on a well-configured
13 * machine) this file will double as a 'coding guide' and a signpost
14 * for newbie kernel hackers. It features several pointers to major
15 * kernel subsystems and hints as to where to find out what things do.
16 */
17
18#include <linux/mm.h>
19#include <linux/sched.h>
20#include <linux/swap.h>
21#include <linux/timex.h>
22#include <linux/jiffies.h>
23
24/* #define DEBUG */
25
26/**
27 * badness - calculate a numeric value for how bad this task has been
28 * @p: task struct of which task we should calculate
29 * @uptime: current uptime in seconds
30 *
31 * The formula used is relatively simple and documented inline in the
32 * function. The main rationale is that we want to select a good task
33 * to kill when we run out of memory.
34 *
35 * Good in this context means that:
36 * 1) we lose the minimum amount of work done
37 * 2) we recover a large amount of memory
38 * 3) we don't kill anything innocent of eating tons of memory
39 * 4) we want to kill the minimum amount of processes (one)
40 * 5) we try to kill the process the user expects us to kill, this
41 * algorithm has been meticulously tuned to meet the principle
42 * of least surprise ... (be careful when you change it)
43 */
44
45unsigned long badness(struct task_struct *p, unsigned long uptime)
46{
47 unsigned long points, cpu_time, run_time, s;
48 struct list_head *tsk;
49
50 if (!p->mm)
51 return 0;
52
53 /*
54 * The memory size of the process is the basis for the badness.
55 */
56 points = p->mm->total_vm;
57
58 /*
59 * Processes which fork a lot of child processes are likely
60 * a good choice. We add the vmsize of the childs if they
61 * have an own mm. This prevents forking servers to flood the
62 * machine with an endless amount of childs
63 */
64 list_for_each(tsk, &p->children) {
65 struct task_struct *chld;
66 chld = list_entry(tsk, struct task_struct, sibling);
67 if (chld->mm != p->mm && chld->mm)
68 points += chld->mm->total_vm;
69 }
70
71 /*
72 * CPU time is in tens of seconds and run time is in thousands
73 * of seconds. There is no particular reason for this other than
74 * that it turned out to work very well in practice.
75 */
76 cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
77 >> (SHIFT_HZ + 3);
78
79 if (uptime >= p->start_time.tv_sec)
80 run_time = (uptime - p->start_time.tv_sec) >> 10;
81 else
82 run_time = 0;
83
84 s = int_sqrt(cpu_time);
85 if (s)
86 points /= s;
87 s = int_sqrt(int_sqrt(run_time));
88 if (s)
89 points /= s;
90
91 /*
92 * Niced processes are most likely less important, so double
93 * their badness points.
94 */
95 if (task_nice(p) > 0)
96 points *= 2;
97
98 /*
99 * Superuser processes are usually more important, so we make it
100 * less likely that we kill those.
101 */
102 if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
103 p->uid == 0 || p->euid == 0)
104 points /= 4;
105
106 /*
107 * We don't want to kill a process with direct hardware access.
108 * Not only could that mess up the hardware, but usually users
109 * tend to only have this flag set on applications they think
110 * of as important.
111 */
112 if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
113 points /= 4;
114
115 /*
116 * Adjust the score by oomkilladj.
117 */
118 if (p->oomkilladj) {
119 if (p->oomkilladj > 0)
120 points <<= p->oomkilladj;
121 else
122 points >>= -(p->oomkilladj);
123 }
124
125#ifdef DEBUG
126 printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
127 p->pid, p->comm, points);
128#endif
129 return points;
130}
131
132/*
133 * Simple selection loop. We choose the process with the highest
134 * number of 'points'. We expect the caller will lock the tasklist.
135 *
136 * (not docbooked, we don't want this one cluttering up the manual)
137 */
138static struct task_struct * select_bad_process(void)
139{
140 unsigned long maxpoints = 0;
141 struct task_struct *g, *p;
142 struct task_struct *chosen = NULL;
143 struct timespec uptime;
144
145 do_posix_clock_monotonic_gettime(&uptime);
146 do_each_thread(g, p)
147 /* skip the init task with pid == 1 */
148 if (p->pid > 1) {
149 unsigned long points;
150
151 /*
152 * This is in the process of releasing memory so wait it
153 * to finish before killing some other task by mistake.
154 */
155 if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) &&
156 !(p->flags & PF_DEAD))
157 return ERR_PTR(-1UL);
158 if (p->flags & PF_SWAPOFF)
159 return p;
160
161 points = badness(p, uptime.tv_sec);
162 if (points > maxpoints || !chosen) {
163 chosen = p;
164 maxpoints = points;
165 }
166 }
167 while_each_thread(g, p);
168 return chosen;
169}
170
171/**
172 * We must be careful though to never send SIGKILL to a process with
173 * CAP_SYS_RAWIO set, send SIGTERM instead (but it's unlikely that
174 * we select a process with CAP_SYS_RAWIO set).
175 */
176static void __oom_kill_task(task_t *p)
177{
178 if (p->pid == 1) {
179 WARN_ON(1);
180 printk(KERN_WARNING "tried to kill init!\n");
181 return;
182 }
183
184 task_lock(p);
185 if (!p->mm || p->mm == &init_mm) {
186 WARN_ON(1);
187 printk(KERN_WARNING "tried to kill an mm-less task!\n");
188 task_unlock(p);
189 return;
190 }
191 task_unlock(p);
192 printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);
193
194 /*
195 * We give our sacrificial lamb high priority and access to
196 * all the memory it needs. That way it should be able to
197 * exit() and clear out its resources quickly...
198 */
199 p->time_slice = HZ;
200 set_tsk_thread_flag(p, TIF_MEMDIE);
201
202 force_sig(SIGKILL, p);
203}
204
205static struct mm_struct *oom_kill_task(task_t *p)
206{
207 struct mm_struct *mm = get_task_mm(p);
208 task_t * g, * q;
209
210 if (!mm)
211 return NULL;
212 if (mm == &init_mm) {
213 mmput(mm);
214 return NULL;
215 }
216
217 __oom_kill_task(p);
218 /*
219 * kill all processes that share the ->mm (i.e. all threads),
220 * but are in a different thread group
221 */
222 do_each_thread(g, q)
223 if (q->mm == mm && q->tgid != p->tgid)
224 __oom_kill_task(q);
225 while_each_thread(g, q);
226
227 return mm;
228}
229
230static struct mm_struct *oom_kill_process(struct task_struct *p)
231{
232 struct mm_struct *mm;
233 struct task_struct *c;
234 struct list_head *tsk;
235
236 /* Try to kill a child first */
237 list_for_each(tsk, &p->children) {
238 c = list_entry(tsk, struct task_struct, sibling);
239 if (c->mm == p->mm)
240 continue;
241 mm = oom_kill_task(c);
242 if (mm)
243 return mm;
244 }
245 return oom_kill_task(p);
246}
247
248/**
249 * out_of_memory - kill the "best" process when we run out of memory
250 *
251 * If we run out of memory, we have the choice between either
252 * killing a random task (bad), letting the system crash (worse)
253 * OR try to be smart about which process to kill. Note that we
254 * don't have to be perfect here, we just have to be good.
255 */
256void out_of_memory(unsigned int __nocast gfp_mask)
257{
258 struct mm_struct *mm = NULL;
259 task_t * p;
260
261 read_lock(&tasklist_lock);
262retry:
263 p = select_bad_process();
264
265 if (PTR_ERR(p) == -1UL)
266 goto out;
267
268 /* Found nothing?!?! Either we hang forever, or we panic. */
269 if (!p) {
270 read_unlock(&tasklist_lock);
271 show_free_areas();
272 panic("Out of memory and no killable processes...\n");
273 }
274
275 printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
276 show_free_areas();
277 mm = oom_kill_process(p);
278 if (!mm)
279 goto retry;
280
281 out:
282 read_unlock(&tasklist_lock);
283 if (mm)
284 mmput(mm);
285
286 /*
287 * Give "p" a good chance of killing itself before we
288 * retry to allocate memory.
289 */
290 __set_current_state(TASK_INTERRUPTIBLE);
291 schedule_timeout(1);
292}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
new file mode 100644
index 00000000000..6ddd6a29c73
--- /dev/null
+++ b/mm/page-writeback.c
@@ -0,0 +1,819 @@
1/*
2 * mm/page-writeback.c.
3 *
4 * Copyright (C) 2002, Linus Torvalds.
5 *
6 * Contains functions related to writing back dirty pages at the
7 * address_space level.
8 *
9 * 10Apr2002 akpm@zip.com.au
10 * Initial version
11 */
12
13#include <linux/kernel.h>
14#include <linux/module.h>
15#include <linux/spinlock.h>
16#include <linux/fs.h>
17#include <linux/mm.h>
18#include <linux/swap.h>
19#include <linux/slab.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/init.h>
23#include <linux/backing-dev.h>
24#include <linux/blkdev.h>
25#include <linux/mpage.h>
26#include <linux/percpu.h>
27#include <linux/notifier.h>
28#include <linux/smp.h>
29#include <linux/sysctl.h>
30#include <linux/cpu.h>
31#include <linux/syscalls.h>
32
33/*
34 * The maximum number of pages to writeout in a single bdflush/kupdate
35 * operation. We do this so we don't hold I_LOCK against an inode for
36 * enormous amounts of time, which would block a userspace task which has
37 * been forced to throttle against that inode. Also, the code reevaluates
38 * the dirty state each time it has written this many pages.
39 */
40#define MAX_WRITEBACK_PAGES 1024
41
42/*
43 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
44 * will look to see if it needs to force writeback or throttling.
45 */
46static long ratelimit_pages = 32;
47
48static long total_pages; /* The total number of pages in the machine. */
49static int dirty_exceeded; /* Dirty mem may be over limit */
50
51/*
52 * When balance_dirty_pages decides that the caller needs to perform some
53 * non-background writeback, this is how many pages it will attempt to write.
54 * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
55 * large amounts of I/O are submitted.
56 */
57static inline long sync_writeback_pages(void)
58{
59 return ratelimit_pages + ratelimit_pages / 2;
60}
61
62/* The following parameters are exported via /proc/sys/vm */
63
64/*
65 * Start background writeback (via pdflush) at this percentage
66 */
67int dirty_background_ratio = 10;
68
69/*
70 * The generator of dirty data starts writeback at this percentage
71 */
72int vm_dirty_ratio = 40;
73
74/*
75 * The interval between `kupdate'-style writebacks, in centiseconds
76 * (hundredths of a second)
77 */
78int dirty_writeback_centisecs = 5 * 100;
79
80/*
81 * The longest number of centiseconds for which data is allowed to remain dirty
82 */
83int dirty_expire_centisecs = 30 * 100;
84
85/*
86 * Flag that makes the machine dump writes/reads and block dirtyings.
87 */
88int block_dump;
89
90/*
91 * Flag that puts the machine in "laptop mode".
92 */
93int laptop_mode;
94
95EXPORT_SYMBOL(laptop_mode);
96
97/* End of sysctl-exported parameters */
98
99
100static void background_writeout(unsigned long _min_pages);
101
/*
 * Snapshot of the page-state counters consulted by the writeback code.
 * Filled in by get_writeback_state().
 */
struct writeback_state {
	unsigned long nr_dirty;		/* read_page_state(nr_dirty) */
	unsigned long nr_unstable;	/* read_page_state(nr_unstable) */
	unsigned long nr_mapped;	/* read_page_state(nr_mapped) */
	unsigned long nr_writeback;	/* read_page_state(nr_writeback) */
};
109
110static void get_writeback_state(struct writeback_state *wbs)
111{
112 wbs->nr_dirty = read_page_state(nr_dirty);
113 wbs->nr_unstable = read_page_state(nr_unstable);
114 wbs->nr_mapped = read_page_state(nr_mapped);
115 wbs->nr_writeback = read_page_state(nr_writeback);
116}
117
118/*
119 * Work out the current dirty-memory clamping and background writeout
120 * thresholds.
121 *
122 * The main aim here is to lower them aggressively if there is a lot of mapped
123 * memory around, to avoid stressing page reclaim with lots of unreclaimable
124 * pages. It is better to clamp down on writers than to start swapping and
125 * performing lots of scanning.
126 *
127 * We only allow 1/2 of the currently-unmapped memory to be dirtied.
128 *
129 * We don't permit the clamping level to fall below 5% - that is getting rather
130 * excessive.
131 *
132 * We make sure that the background writeout level is below the adjusted
133 * clamping level.
134 */
135static void
136get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
137 struct address_space *mapping)
138{
139 int background_ratio; /* Percentages */
140 int dirty_ratio;
141 int unmapped_ratio;
142 long background;
143 long dirty;
144 unsigned long available_memory = total_pages;
145 struct task_struct *tsk;
146
147 get_writeback_state(wbs);
148
149#ifdef CONFIG_HIGHMEM
150 /*
151 * If this mapping can only allocate from low memory,
152 * we exclude high memory from our count.
153 */
154 if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
155 available_memory -= totalhigh_pages;
156#endif
157
158
159 unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages;
160
161 dirty_ratio = vm_dirty_ratio;
162 if (dirty_ratio > unmapped_ratio / 2)
163 dirty_ratio = unmapped_ratio / 2;
164
165 if (dirty_ratio < 5)
166 dirty_ratio = 5;
167
168 background_ratio = dirty_background_ratio;
169 if (background_ratio >= dirty_ratio)
170 background_ratio = dirty_ratio / 2;
171
172 background = (background_ratio * available_memory) / 100;
173 dirty = (dirty_ratio * available_memory) / 100;
174 tsk = current;
175 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
176 background += background / 4;
177 dirty += dirty / 4;
178 }
179 *pbackground = background;
180 *pdirty = dirty;
181}
182
183/*
184 * balance_dirty_pages() must be called by processes which are generating dirty
185 * data. It looks at the number of dirty pages in the machine and will force
186 * the caller to perform writeback if the system is over `vm_dirty_ratio'.
187 * If we're over `background_thresh' then pdflush is woken to perform some
188 * writeout.
189 */
190static void balance_dirty_pages(struct address_space *mapping)
191{
192 struct writeback_state wbs;
193 long nr_reclaimable;
194 long background_thresh;
195 long dirty_thresh;
196 unsigned long pages_written = 0;
197 unsigned long write_chunk = sync_writeback_pages();
198
199 struct backing_dev_info *bdi = mapping->backing_dev_info;
200
201 for (;;) {
202 struct writeback_control wbc = {
203 .bdi = bdi,
204 .sync_mode = WB_SYNC_NONE,
205 .older_than_this = NULL,
206 .nr_to_write = write_chunk,
207 };
208
209 get_dirty_limits(&wbs, &background_thresh,
210 &dirty_thresh, mapping);
211 nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
212 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
213 break;
214
215 dirty_exceeded = 1;
216
217 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
218 * Unstable writes are a feature of certain networked
219 * filesystems (i.e. NFS) in which data may have been
220 * written to the server's write cache, but has not yet
221 * been flushed to permanent storage.
222 */
223 if (nr_reclaimable) {
224 writeback_inodes(&wbc);
225 get_dirty_limits(&wbs, &background_thresh,
226 &dirty_thresh, mapping);
227 nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
228 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
229 break;
230 pages_written += write_chunk - wbc.nr_to_write;
231 if (pages_written >= write_chunk)
232 break; /* We've done our duty */
233 }
234 blk_congestion_wait(WRITE, HZ/10);
235 }
236
237 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
238 dirty_exceeded = 0;
239
240 if (writeback_in_progress(bdi))
241 return; /* pdflush is already working this queue */
242
243 /*
244 * In laptop mode, we wait until hitting the higher threshold before
245 * starting background writeout, and then write out all the way down
246 * to the lower threshold. So slow writers cause minimal disk activity.
247 *
248 * In normal mode, we start background writeout at the lower
249 * background_thresh, to keep the amount of dirty memory low.
250 */
251 if ((laptop_mode && pages_written) ||
252 (!laptop_mode && (nr_reclaimable > background_thresh)))
253 pdflush_operation(background_writeout, 0);
254}
255
256/**
257 * balance_dirty_pages_ratelimited - balance dirty memory state
258 * @mapping: address_space which was dirtied
259 *
260 * Processes which are dirtying memory should call in here once for each page
261 * which was newly dirtied. The function will periodically check the system's
262 * dirty state and will initiate writeback if needed.
263 *
264 * On really big machines, get_writeback_state is expensive, so try to avoid
265 * calling it too often (ratelimiting). But once we're over the dirty memory
266 * limit we decrease the ratelimiting by a lot, to prevent individual processes
267 * from overshooting the limit by (ratelimit_pages) each.
268 */
269void balance_dirty_pages_ratelimited(struct address_space *mapping)
270{
271 static DEFINE_PER_CPU(int, ratelimits) = 0;
272 long ratelimit;
273
274 ratelimit = ratelimit_pages;
275 if (dirty_exceeded)
276 ratelimit = 8;
277
278 /*
279 * Check the rate limiting. Also, we do not want to throttle real-time
280 * tasks in balance_dirty_pages(). Period.
281 */
282 if (get_cpu_var(ratelimits)++ >= ratelimit) {
283 __get_cpu_var(ratelimits) = 0;
284 put_cpu_var(ratelimits);
285 balance_dirty_pages(mapping);
286 return;
287 }
288 put_cpu_var(ratelimits);
289}
290EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
291
292void throttle_vm_writeout(void)
293{
294 struct writeback_state wbs;
295 long background_thresh;
296 long dirty_thresh;
297
298 for ( ; ; ) {
299 get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
300
301 /*
302 * Boost the allowable dirty threshold a bit for page
303 * allocators so they don't get DoS'ed by heavy writers
304 */
305 dirty_thresh += dirty_thresh / 10; /* wheeee... */
306
307 if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh)
308 break;
309 blk_congestion_wait(WRITE, HZ/10);
310 }
311}
312
313
314/*
315 * writeback at least _min_pages, and keep writing until the amount of dirty
316 * memory is less than the background threshold, or until we're all clean.
317 */
318static void background_writeout(unsigned long _min_pages)
319{
320 long min_pages = _min_pages;
321 struct writeback_control wbc = {
322 .bdi = NULL,
323 .sync_mode = WB_SYNC_NONE,
324 .older_than_this = NULL,
325 .nr_to_write = 0,
326 .nonblocking = 1,
327 };
328
329 for ( ; ; ) {
330 struct writeback_state wbs;
331 long background_thresh;
332 long dirty_thresh;
333
334 get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
335 if (wbs.nr_dirty + wbs.nr_unstable < background_thresh
336 && min_pages <= 0)
337 break;
338 wbc.encountered_congestion = 0;
339 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
340 wbc.pages_skipped = 0;
341 writeback_inodes(&wbc);
342 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
343 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
344 /* Wrote less than expected */
345 blk_congestion_wait(WRITE, HZ/10);
346 if (!wbc.encountered_congestion)
347 break;
348 }
349 }
350}
351
352/*
353 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
354 * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
355 * -1 if all pdflush threads were busy.
356 */
357int wakeup_bdflush(long nr_pages)
358{
359 if (nr_pages == 0) {
360 struct writeback_state wbs;
361
362 get_writeback_state(&wbs);
363 nr_pages = wbs.nr_dirty + wbs.nr_unstable;
364 }
365 return pdflush_operation(background_writeout, nr_pages);
366}
367
368static void wb_timer_fn(unsigned long unused);
369static void laptop_timer_fn(unsigned long unused);
370
371static struct timer_list wb_timer =
372 TIMER_INITIALIZER(wb_timer_fn, 0, 0);
373static struct timer_list laptop_mode_wb_timer =
374 TIMER_INITIALIZER(laptop_timer_fn, 0, 0);
375
376/*
377 * Periodic writeback of "old" data.
378 *
379 * Define "old": the first time one of an inode's pages is dirtied, we mark the
380 * dirtying-time in the inode's address_space. So this periodic writeback code
381 * just walks the superblock inode list, writing back any inodes which are
382 * older than a specific point in time.
383 *
384 * Try to run once per dirty_writeback_centisecs. But if a writeback event
385 * takes longer than a dirty_writeback_centisecs interval, then leave a
386 * one-second gap.
387 *
388 * older_than_this takes precedence over nr_to_write. So we'll only write back
389 * all dirty pages if they are all attached to "old" mappings.
390 */
391static void wb_kupdate(unsigned long arg)
392{
393 unsigned long oldest_jif;
394 unsigned long start_jif;
395 unsigned long next_jif;
396 long nr_to_write;
397 struct writeback_state wbs;
398 struct writeback_control wbc = {
399 .bdi = NULL,
400 .sync_mode = WB_SYNC_NONE,
401 .older_than_this = &oldest_jif,
402 .nr_to_write = 0,
403 .nonblocking = 1,
404 .for_kupdate = 1,
405 };
406
407 sync_supers();
408
409 get_writeback_state(&wbs);
410 oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
411 start_jif = jiffies;
412 next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
413 nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
414 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
415 while (nr_to_write > 0) {
416 wbc.encountered_congestion = 0;
417 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
418 writeback_inodes(&wbc);
419 if (wbc.nr_to_write > 0) {
420 if (wbc.encountered_congestion)
421 blk_congestion_wait(WRITE, HZ/10);
422 else
423 break; /* All the old data is written */
424 }
425 nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
426 }
427 if (time_before(next_jif, jiffies + HZ))
428 next_jif = jiffies + HZ;
429 if (dirty_writeback_centisecs)
430 mod_timer(&wb_timer, next_jif);
431}
432
433/*
434 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
435 */
436int dirty_writeback_centisecs_handler(ctl_table *table, int write,
437 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
438{
439 proc_dointvec(table, write, file, buffer, length, ppos);
440 if (dirty_writeback_centisecs) {
441 mod_timer(&wb_timer,
442 jiffies + (dirty_writeback_centisecs * HZ) / 100);
443 } else {
444 del_timer(&wb_timer);
445 }
446 return 0;
447}
448
449static void wb_timer_fn(unsigned long unused)
450{
451 if (pdflush_operation(wb_kupdate, 0) < 0)
452 mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
453}
454
/*
 * pdflush work function queued by laptop_timer_fn(): calls sys_sync().
 * The argument is ignored.
 */
static void laptop_flush(unsigned long unused)
{
	(void)unused;

	sys_sync();
}
459
460static void laptop_timer_fn(unsigned long unused)
461{
462 pdflush_operation(laptop_flush, 0);
463}
464
465/*
466 * We've spun up the disk and we're in laptop mode: schedule writeback
467 * of all dirty data a few seconds from now. If the flush is already scheduled
468 * then push it back - the user is still using the disk.
469 */
470void laptop_io_completion(void)
471{
472 mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
473}
474
475/*
476 * We're in laptop mode and we've just synced. The sync's writes will have
477 * caused another writeback to be scheduled by laptop_io_completion.
478 * Nothing needs to be written back anymore, so we unschedule the writeback.
479 */
480void laptop_sync_completion(void)
481{
482 del_timer(&laptop_mode_wb_timer);
483}
484
485/*
486 * If ratelimit_pages is too high then we can get into dirty-data overload
487 * if a large number of processes all perform writes at the same time.
488 * If it is too low then SMP machines will call the (expensive)
489 * get_writeback_state too often.
490 *
491 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
492 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
493 * thresholds before writeback cuts in.
494 *
495 * But the limit should not be set too high. Because it also controls the
496 * amount of memory which the balance_dirty_pages() caller has to write back.
497 * If this is too large then the caller will block on the IO queue all the
498 * time. So limit it to four megabytes - the balance_dirty_pages() caller
499 * will write six megabyte chunks, max.
500 */
501
502static void set_ratelimit(void)
503{
504 ratelimit_pages = total_pages / (num_online_cpus() * 32);
505 if (ratelimit_pages < 16)
506 ratelimit_pages = 16;
507 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
508 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
509}
510
511static int
512ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
513{
514 set_ratelimit();
515 return 0;
516}
517
518static struct notifier_block ratelimit_nb = {
519 .notifier_call = ratelimit_handler,
520 .next = NULL,
521};
522
523/*
524 * If the machine has a large highmem:lowmem ratio then scale back the default
525 * dirty memory thresholds: allowing too much dirty highmem pins an excessive
526 * number of buffer_heads.
527 */
528void __init page_writeback_init(void)
529{
530 long buffer_pages = nr_free_buffer_pages();
531 long correction;
532
533 total_pages = nr_free_pagecache_pages();
534
535 correction = (100 * 4 * buffer_pages) / total_pages;
536
537 if (correction < 100) {
538 dirty_background_ratio *= correction;
539 dirty_background_ratio /= 100;
540 vm_dirty_ratio *= correction;
541 vm_dirty_ratio /= 100;
542
543 if (dirty_background_ratio <= 0)
544 dirty_background_ratio = 1;
545 if (vm_dirty_ratio <= 0)
546 vm_dirty_ratio = 1;
547 }
548 mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
549 set_ratelimit();
550 register_cpu_notifier(&ratelimit_nb);
551}
552
553int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
554{
555 if (wbc->nr_to_write <= 0)
556 return 0;
557 if (mapping->a_ops->writepages)
558 return mapping->a_ops->writepages(mapping, wbc);
559 return generic_writepages(mapping, wbc);
560}
561
562/**
563 * write_one_page - write out a single page and optionally wait on I/O
564 *
565 * @page: the page to write
566 * @wait: if true, wait on writeout
567 *
568 * The page must be locked by the caller and will be unlocked upon return.
569 *
570 * write_one_page() returns a negative error code if I/O failed.
571 */
572int write_one_page(struct page *page, int wait)
573{
574 struct address_space *mapping = page->mapping;
575 int ret = 0;
576 struct writeback_control wbc = {
577 .sync_mode = WB_SYNC_ALL,
578 .nr_to_write = 1,
579 };
580
581 BUG_ON(!PageLocked(page));
582
583 if (wait)
584 wait_on_page_writeback(page);
585
586 if (clear_page_dirty_for_io(page)) {
587 page_cache_get(page);
588 ret = mapping->a_ops->writepage(page, &wbc);
589 if (ret == 0 && wait) {
590 wait_on_page_writeback(page);
591 if (PageError(page))
592 ret = -EIO;
593 }
594 page_cache_release(page);
595 } else {
596 unlock_page(page);
597 }
598 return ret;
599}
600EXPORT_SYMBOL(write_one_page);
601
602/*
603 * For address_spaces which do not use buffers. Just tag the page as dirty in
604 * its radix tree.
605 *
606 * This is also used when a single buffer is being dirtied: we want to set the
607 * page dirty in that case, but not all the buffers. This is a "bottom-up"
608 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
609 *
610 * Most callers have locked the page, which pins the address_space in memory.
611 * But zap_pte_range() does not lock the page, however in that case the
612 * mapping is pinned by the vma's ->vm_file reference.
613 *
614 * We take care to handle the case where the page was truncated from the
615 * mapping by re-checking page_mapping() inside tree_lock.
616 */
617int __set_page_dirty_nobuffers(struct page *page)
618{
619 int ret = 0;
620
621 if (!TestSetPageDirty(page)) {
622 struct address_space *mapping = page_mapping(page);
623 struct address_space *mapping2;
624
625 if (mapping) {
626 write_lock_irq(&mapping->tree_lock);
627 mapping2 = page_mapping(page);
628 if (mapping2) { /* Race with truncate? */
629 BUG_ON(mapping2 != mapping);
630 if (mapping_cap_account_dirty(mapping))
631 inc_page_state(nr_dirty);
632 radix_tree_tag_set(&mapping->page_tree,
633 page_index(page), PAGECACHE_TAG_DIRTY);
634 }
635 write_unlock_irq(&mapping->tree_lock);
636 if (mapping->host) {
637 /* !PageAnon && !swapper_space */
638 __mark_inode_dirty(mapping->host,
639 I_DIRTY_PAGES);
640 }
641 }
642 }
643 return ret;
644}
645EXPORT_SYMBOL(__set_page_dirty_nobuffers);
646
647/*
648 * When a writepage implementation decides that it doesn't want to write this
649 * page for some reason, it should redirty the locked page via
650 * redirty_page_for_writepage() and it should then unlock the page and return 0
651 */
652int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
653{
654 wbc->pages_skipped++;
655 return __set_page_dirty_nobuffers(page);
656}
657EXPORT_SYMBOL(redirty_page_for_writepage);
658
659/*
660 * If the mapping doesn't provide a set_page_dirty a_op, then
661 * just fall through and assume that it wants buffer_heads.
662 */
663int fastcall set_page_dirty(struct page *page)
664{
665 struct address_space *mapping = page_mapping(page);
666
667 if (likely(mapping)) {
668 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
669 if (spd)
670 return (*spd)(page);
671 return __set_page_dirty_buffers(page);
672 }
673 if (!PageDirty(page))
674 SetPageDirty(page);
675 return 0;
676}
677EXPORT_SYMBOL(set_page_dirty);
678
679/*
680 * set_page_dirty() is racy if the caller has no reference against
681 * page->mapping->host, and if the page is unlocked. This is because another
682 * CPU could truncate the page off the mapping and then free the mapping.
683 *
684 * Usually, the page _is_ locked, or the caller is a user-space process which
685 * holds a reference on the inode by having an open file.
686 *
687 * In other cases, the page should be locked before running set_page_dirty().
688 */
int set_page_dirty_lock(struct page *page)
{
	int rc;

	/* Run set_page_dirty() with the page locked and pass on its result. */
	lock_page(page);
	rc = set_page_dirty(page);
	unlock_page(page);

	return rc;
}
698EXPORT_SYMBOL(set_page_dirty_lock);
699
700/*
701 * Clear a page's dirty flag, while caring for dirty memory accounting.
702 * Returns true if the page was previously dirty.
703 */
704int test_clear_page_dirty(struct page *page)
705{
706 struct address_space *mapping = page_mapping(page);
707 unsigned long flags;
708
709 if (mapping) {
710 write_lock_irqsave(&mapping->tree_lock, flags);
711 if (TestClearPageDirty(page)) {
712 radix_tree_tag_clear(&mapping->page_tree,
713 page_index(page),
714 PAGECACHE_TAG_DIRTY);
715 write_unlock_irqrestore(&mapping->tree_lock, flags);
716 if (mapping_cap_account_dirty(mapping))
717 dec_page_state(nr_dirty);
718 return 1;
719 }
720 write_unlock_irqrestore(&mapping->tree_lock, flags);
721 return 0;
722 }
723 return TestClearPageDirty(page);
724}
725EXPORT_SYMBOL(test_clear_page_dirty);
726
727/*
728 * Clear a page's dirty flag, while caring for dirty memory accounting.
729 * Returns true if the page was previously dirty.
730 *
731 * This is for preparing to put the page under writeout. We leave the page
732 * tagged as dirty in the radix tree so that a concurrent write-for-sync
733 * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
734 * implementation will run either set_page_writeback() or set_page_dirty(),
735 * at which stage we bring the page's dirty flag and radix-tree dirty tag
736 * back into sync.
737 *
738 * This incoherency between the page's dirty flag and radix-tree tag is
739 * unfortunate, but it only exists while the page is locked.
740 */
741int clear_page_dirty_for_io(struct page *page)
742{
743 struct address_space *mapping = page_mapping(page);
744
745 if (mapping) {
746 if (TestClearPageDirty(page)) {
747 if (mapping_cap_account_dirty(mapping))
748 dec_page_state(nr_dirty);
749 return 1;
750 }
751 return 0;
752 }
753 return TestClearPageDirty(page);
754}
755EXPORT_SYMBOL(clear_page_dirty_for_io);
756
757int test_clear_page_writeback(struct page *page)
758{
759 struct address_space *mapping = page_mapping(page);
760 int ret;
761
762 if (mapping) {
763 unsigned long flags;
764
765 write_lock_irqsave(&mapping->tree_lock, flags);
766 ret = TestClearPageWriteback(page);
767 if (ret)
768 radix_tree_tag_clear(&mapping->page_tree,
769 page_index(page),
770 PAGECACHE_TAG_WRITEBACK);
771 write_unlock_irqrestore(&mapping->tree_lock, flags);
772 } else {
773 ret = TestClearPageWriteback(page);
774 }
775 return ret;
776}
777
778int test_set_page_writeback(struct page *page)
779{
780 struct address_space *mapping = page_mapping(page);
781 int ret;
782
783 if (mapping) {
784 unsigned long flags;
785
786 write_lock_irqsave(&mapping->tree_lock, flags);
787 ret = TestSetPageWriteback(page);
788 if (!ret)
789 radix_tree_tag_set(&mapping->page_tree,
790 page_index(page),
791 PAGECACHE_TAG_WRITEBACK);
792 if (!PageDirty(page))
793 radix_tree_tag_clear(&mapping->page_tree,
794 page_index(page),
795 PAGECACHE_TAG_DIRTY);
796 write_unlock_irqrestore(&mapping->tree_lock, flags);
797 } else {
798 ret = TestSetPageWriteback(page);
799 }
800 return ret;
801
802}
803EXPORT_SYMBOL(test_set_page_writeback);
804
805/*
806 * Return true if any of the pages in the mapping are marked with the
807 * passed tag.
808 */
809int mapping_tagged(struct address_space *mapping, int tag)
810{
811 unsigned long flags;
812 int ret;
813
814 read_lock_irqsave(&mapping->tree_lock, flags);
815 ret = radix_tree_tagged(&mapping->page_tree, tag);
816 read_unlock_irqrestore(&mapping->tree_lock, flags);
817 return ret;
818}
819EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
new file mode 100644
index 00000000000..c73dbbc1cd8
--- /dev/null
+++ b/mm/page_alloc.c
@@ -0,0 +1,2220 @@
1/*
2 * linux/mm/page_alloc.c
3 *
4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c
6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */
16
17#include <linux/config.h>
18#include <linux/stddef.h>
19#include <linux/mm.h>
20#include <linux/swap.h>
21#include <linux/interrupt.h>
22#include <linux/pagemap.h>
23#include <linux/bootmem.h>
24#include <linux/compiler.h>
25#include <linux/module.h>
26#include <linux/suspend.h>
27#include <linux/pagevec.h>
28#include <linux/blkdev.h>
29#include <linux/slab.h>
30#include <linux/notifier.h>
31#include <linux/topology.h>
32#include <linux/sysctl.h>
33#include <linux/cpu.h>
34#include <linux/cpuset.h>
35#include <linux/nodemask.h>
36#include <linux/vmalloc.h>
37
38#include <asm/tlbflush.h>
39#include "internal.h"
40
/*
 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
 * initializer cleaner
 */
/* Statically initialised with only bit 0 of the first word set. */
nodemask_t node_online_map = { { [0] = 1UL } };
nodemask_t node_possible_map = NODE_MASK_ALL;
struct pglist_data *pgdat_list;
/* Global page counters; totalram_pages and nr_swap_pages are exported below. */
unsigned long totalram_pages;
unsigned long totalhigh_pages;
long nr_swap_pages;
51
/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
61
EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);

/*
 * Used by page_zone() to look up the address of the struct zone whose
 * id is encoded in the upper bits of page->flags
 */
struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
EXPORT_SYMBOL(zone_table);

/* Printable zone names, MAX_NR_ZONES entries. */
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;

/* Page counters kept in init data (__initdata). */
unsigned long __initdata nr_kernel_pages;
unsigned long __initdata nr_all_pages;
77
78/*
79 * Temporary debugging check for pages not lying within a given zone.
80 */
81static int bad_range(struct zone *zone, struct page *page)
82{
83 if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
84 return 1;
85 if (page_to_pfn(page) < zone->zone_start_pfn)
86 return 1;
87#ifdef CONFIG_HOLES_IN_ZONE
88 if (!pfn_valid(page_to_pfn(page)))
89 return 1;
90#endif
91 if (zone != page_zone(page))
92 return 1;
93 return 0;
94}
95
/*
 * Report a page found in an unexpected state and try to patch it up.
 *
 * @function: name of the caller, printed in the report
 * @page:     the offending page
 *
 * Prints the page's flags, mapping, mapcount and count plus a backtrace,
 * then clears the listed state bits, zeroes the refcount and mapcount,
 * drops the mapping pointer and sets TAINT_BAD_PAGE.
 */
static void bad_page(const char *function, struct page *page)
{
	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
		function, current->comm, page);
	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
		(int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
		page->mapping, page_mapcount(page), page_count(page));
	printk(KERN_EMERG "Backtrace:\n");
	dump_stack();
	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
	/* Best-effort fix-up: strip the state bits listed here. */
	page->flags &= ~(1 << PG_private |
			1 << PG_locked	|
			1 << PG_lru	|
			1 << PG_active	|
			1 << PG_dirty	|
			1 << PG_swapcache |
			1 << PG_writeback);
	set_page_count(page, 0);
	reset_page_mapcount(page);
	page->mapping = NULL;
	tainted |= TAINT_BAD_PAGE;
}
118
119#ifndef CONFIG_HUGETLB_PAGE
120#define prep_compound_page(page, order) do { } while (0)
121#define destroy_compound_page(page, order) do { } while (0)
122#else
123/*
124 * Higher-order pages are called "compound pages". They are structured thusly:
125 *
126 * The first PAGE_SIZE page is called the "head page".
127 *
128 * The remaining PAGE_SIZE pages are called "tail pages".
129 *
130 * All pages have PG_compound set. All pages have their ->private pointing at
131 * the head page (even the head page has this).
132 *
133 * The first tail page's ->mapping, if non-zero, holds the address of the
134 * compound page's put_page() function.
135 *
136 * The order of the allocation is stored in the first tail page's ->index
137 * This is only for debug at present. This usage means that zero-order pages
138 * may not be compound.
139 */
140static void prep_compound_page(struct page *page, unsigned long order)
141{
142 int i;
143 int nr_pages = 1 << order;
144
145 page[1].mapping = NULL;
146 page[1].index = order;
147 for (i = 0; i < nr_pages; i++) {
148 struct page *p = page + i;
149
150 SetPageCompound(p);
151 p->private = (unsigned long)page;
152 }
153}
154
155static void destroy_compound_page(struct page *page, unsigned long order)
156{
157 int i;
158 int nr_pages = 1 << order;
159
160 if (!PageCompound(page))
161 return;
162
163 if (page[1].index != order)
164 bad_page(__FUNCTION__, page);
165
166 for (i = 0; i < nr_pages; i++) {
167 struct page *p = page + i;
168
169 if (!PageCompound(p))
170 bad_page(__FUNCTION__, page);
171 if (p->private != (unsigned long)page)
172 bad_page(__FUNCTION__, page);
173 ClearPageCompound(p);
174 }
175}
176#endif /* CONFIG_HUGETLB_PAGE */
177
/*
 * Functions for dealing with a page's order in the buddy system.
 * zone->lock is already acquired when we use these.
 * So, we don't need atomic page->flags operations here.
 */
/* Return the order stored in page->private. */
static inline unsigned long page_order(struct page *page) {
	return page->private;
}
186
/* Record @order in page->private and set PG_private (non-atomic variant). */
static inline void set_page_order(struct page *page, int order) {
	page->private = order;
	__SetPagePrivate(page);
}
191
/* Clear PG_private (non-atomic variant) and reset page->private to 0. */
static inline void rmv_page_order(struct page *page)
{
	__ClearPagePrivate(page);
	page->private = 0;
}
197
198/*
199 * Locate the struct page for both the matching buddy in our
200 * pair (buddy1) and the combined O(n+1) page they form (page).
201 *
202 * 1) Any buddy B1 will have an order O twin B2 which satisfies
203 * the following equation:
204 * B2 = B1 ^ (1 << O)
205 * For example, if the starting buddy (buddy2) is #8 its order
206 * 1 buddy is #10:
207 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
208 *
209 * 2) Any buddy B will have an order O+1 parent P which
210 * satisfies the following equation:
211 * P = B & ~(1 << O)
212 *
213 * Assumption: *_mem_map is contigious at least up to MAX_ORDER
214 */
215static inline struct page *
216__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
217{
218 unsigned long buddy_idx = page_idx ^ (1 << order);
219
220 return page + (buddy_idx - page_idx);
221}
222
/*
 * Return the index of the order+1 block that contains @page_idx, i.e.
 * @page_idx with bit @order cleared.
 *
 * The mask is built in unsigned long: an int-typed (1 << order) is
 * undefined for order 31 and, once complemented and widened, would also
 * clear the upper bits of @page_idx.
 */
static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);
}
228
/*
 * This function checks whether a page is free && is the buddy.
 * We can coalesce a page and its buddy if
 * (a) the buddy is free &&
 * (b) the buddy is on the buddy system &&
 * (c) a page and its buddy have the same order.
 * For recording a page's order, we use page->private and PG_private.
 *
 * Returns 1 if @page can be merged at @order, 0 otherwise.
 */
static inline int page_is_buddy(struct page *page, int order)
{
	if (!PagePrivate(page))
		return 0;
	if (page_order(page) != order)
		return 0;
	if (PageReserved(page))
		return 0;
	return page_count(page) == 0;
}
247
/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length of (1 << order) and marked with PG_private.  A page's
 * order is recorded in its page->private field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- wli
 */

static inline void __free_pages_bulk (struct page *page,
		struct zone *zone, unsigned int order)
{
	unsigned long page_idx;
	int order_size = 1 << order;

	if (unlikely(order))
		destroy_compound_page(page, order);

	/* Index of the page within its MAX_ORDER-sized window of pfns. */
	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

	/* The block must be aligned to its own size and lie in this zone. */
	BUG_ON(page_idx & (order_size - 1));
	BUG_ON(bad_range(zone, page));

	zone->free_pages += order_size;
	/* Merge with free buddies for as long as one is available. */
	while (order < MAX_ORDER-1) {
		unsigned long combined_idx;
		struct free_area *area;
		struct page *buddy;

		combined_idx = __find_combined_index(page_idx, order);
		buddy = __page_find_buddy(page, page_idx, order);

		if (bad_range(zone, buddy))
			break;
		if (!page_is_buddy(buddy, order))
			break;
		/* Move the buddy up one level. */
		list_del(&buddy->lru);
		area = zone->free_area + order;
		area->nr_free--;
		rmv_page_order(buddy);
		/* Continue from the head of the merged, order+1 block. */
		page = page + (combined_idx - page_idx);
		page_idx = combined_idx;
		order++;
	}
	/* Put the (possibly merged) block on the free list of its final order. */
	set_page_order(page, order);
	list_add(&page->lru, &zone->free_area[order].free_list);
	zone->free_area[order].nr_free++;
}
311
312static inline void free_pages_check(const char *function, struct page *page)
313{
314 if ( page_mapcount(page) ||
315 page->mapping != NULL ||
316 page_count(page) != 0 ||
317 (page->flags & (
318 1 << PG_lru |
319 1 << PG_private |
320 1 << PG_locked |
321 1 << PG_active |
322 1 << PG_reclaim |
323 1 << PG_slab |
324 1 << PG_swapcache |
325 1 << PG_writeback )))
326 bad_page(function, page);
327 if (PageDirty(page))
328 ClearPageDirty(page);
329}
330
/*
 * Frees a list of pages.
 * Assumes all pages on list are in same zone, and of same order.
 * count is the maximum number of pages to free; freeing stops early when
 * the list becomes empty.  Note that a count of 0 frees nothing (the loop
 * condition tests count-- before each iteration).
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 *
 * Returns the number of pages actually freed.
 */
static int
free_pages_bulk(struct zone *zone, int count,
		struct list_head *list, unsigned int order)
{
	unsigned long flags;
	struct page *page = NULL;
	int ret = 0;

	spin_lock_irqsave(&zone->lock, flags);
	zone->all_unreclaimable = 0;
	zone->pages_scanned = 0;
	while (!list_empty(list) && count--) {
		/* Take pages from the tail of the list. */
		page = list_entry(list->prev, struct page, lru);
		/* have to delete it as __free_pages_bulk list manipulates */
		list_del(&page->lru);
		__free_pages_bulk(page, zone, order);
		ret++;
	}
	spin_unlock_irqrestore(&zone->lock, flags);
	return ret;
}
363
/*
 * Free a block of 1 << order pages straight to the buddy allocator.
 *
 * Accounts the pages in pgfree, runs free_pages_check() on every page of
 * the block, then hands the head page to free_pages_bulk() via a
 * temporary one-entry list.
 */
void __free_pages_ok(struct page *page, unsigned int order)
{
	LIST_HEAD(list);
	int i;

	arch_free_page(page, order);

	mod_page_state(pgfree, 1 << order);

#ifndef CONFIG_MMU
	/* Without an MMU every page of the block holds a reference
	 * (see set_page_refs()); drop those of the tail pages here. */
	if (order > 0)
		for (i = 1 ; i < (1 << order) ; ++i)
			__put_page(page + i);
#endif

	for (i = 0 ; i < (1 << order) ; ++i)
		free_pages_check(__FUNCTION__, page + i);
	list_add(&page->lru, &list);
	kernel_map_pages(page, 1<<order, 0);
	free_pages_bulk(page_zone(page), 1, &list, order);
}
385
386
/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- wli
 *
 * @page: head of a free block of order @high taken off @area's list
 * @low:  order actually wanted by the caller
 * @high: order of the block being split
 * @area: free_area entry for order @high
 *
 * Repeatedly halves the block, putting the upper half back on the free
 * list of the next lower order, until the block is of order @low.
 * Returns @page.
 */
static inline struct page *
expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area)
{
	unsigned long size = 1 << high;

	while (high > low) {
		/* Step down to the free_area of the next lower order. */
		area--;
		high--;
		size >>= 1;
		BUG_ON(bad_range(zone, &page[size]));
		/* The upper half becomes a free block of order 'high'. */
		list_add(&page[size].lru, &area->free_list);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
	return page;
}
418
/*
 * Set the initial reference count(s) of a freshly allocated block.
 *
 * With CONFIG_MMU only the head page's count is set to 1; without it,
 * every page of the 1 << order block gets a count of 1.
 */
void set_page_refs(struct page *page, int order)
{
#ifdef CONFIG_MMU
	set_page_count(page, 1);
#else
	int i;

	/*
	 * We need to reference all the pages for this order, otherwise if
	 * anyone accesses one of the pages with (get/put) it will be freed.
	 * - eg: access_process_vm()
	 */
	for (i = 0; i < (1 << order); i++)
		set_page_count(page + i, 1);
#endif /* CONFIG_MMU */
}
435
/*
 * This page is about to be returned from the page allocator.
 *
 * Reports via bad_page() if the head page still has a mapping, a
 * mapcount or any of the listed state bits; then clears the remaining
 * per-use flags, zeroes ->private, sets up reference counts and calls
 * kernel_map_pages() for the whole block.
 */
static void prep_new_page(struct page *page, int order)
{
	if (page->mapping || page_mapcount(page) ||
	    (page->flags & (
			1 << PG_private	|
			1 << PG_locked	|
			1 << PG_lru	|
			1 << PG_active	|
			1 << PG_dirty	|
			1 << PG_reclaim	|
			1 << PG_swapcache |
			1 << PG_writeback )))
		bad_page(__FUNCTION__, page);

	/* Drop flags that may be left over from the page's previous use. */
	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
			1 << PG_referenced | 1 << PG_arch_1 |
			1 << PG_checked | 1 << PG_mappedtodisk);
	page->private = 0;
	set_page_refs(page, order);
	kernel_map_pages(page, 1 << order, 1);
}
460
461/*
462 * Do the hard work of removing an element from the buddy allocator.
463 * Call me with the zone->lock already held.
464 */
465static struct page *__rmqueue(struct zone *zone, unsigned int order)
466{
467 struct free_area * area;
468 unsigned int current_order;
469 struct page *page;
470
471 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
472 area = zone->free_area + current_order;
473 if (list_empty(&area->free_list))
474 continue;
475
476 page = list_entry(area->free_list.next, struct page, lru);
477 list_del(&page->lru);
478 rmv_page_order(page);
479 area->nr_free--;
480 zone->free_pages -= 1UL << order;
481 return expand(zone, page, order, current_order, area);
482 }
483
484 return NULL;
485}
486
487/*
488 * Obtain a specified number of elements from the buddy allocator, all under
489 * a single hold of the lock, for efficiency. Add them to the supplied list.
490 * Returns the number of new pages which were placed at *list.
491 */
492static int rmqueue_bulk(struct zone *zone, unsigned int order,
493 unsigned long count, struct list_head *list)
494{
495 unsigned long flags;
496 int i;
497 int allocated = 0;
498 struct page *page;
499
500 spin_lock_irqsave(&zone->lock, flags);
501 for (i = 0; i < count; ++i) {
502 page = __rmqueue(zone, order);
503 if (page == NULL)
504 break;
505 allocated++;
506 list_add_tail(&page->lru, list);
507 }
508 spin_unlock_irqrestore(&zone->lock, flags);
509 return allocated;
510}
511
#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
/*
 * Return every page held on @cpu's per-cpu lists, in every zone, to the
 * buddy allocator.
 */
static void __drain_pages(unsigned int cpu)
{
	struct zone *zone;

	for_each_zone(zone) {
		struct per_cpu_pageset *pset = &zone->pageset[cpu];
		int idx;

		for (idx = 0; idx < ARRAY_SIZE(pset->pcp); idx++) {
			struct per_cpu_pages *pcp = &pset->pcp[idx];

			pcp->count -= free_pages_bulk(zone, pcp->count,
						      &pcp->list, 0);
		}
	}
}
#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
532
533#ifdef CONFIG_PM
534
/*
 * Flag every page that currently sits on one of @zone's buddy free lists
 * with NosaveFree.  Runs entirely under zone->lock; does nothing for a
 * zone that spans no pages.
 */
void mark_free_pages(struct zone *zone)
{
	unsigned long zone_pfn, flags;
	int order;
	struct list_head *curr;

	if (!zone->spanned_pages)
		return;

	spin_lock_irqsave(&zone->lock, flags);
	/* First clear the flag on every page the zone spans ... */
	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
		ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));

	/* ... then set it on each page of every free block. */
	for (order = MAX_ORDER - 1; order >= 0; --order)
		list_for_each(curr, &zone->free_area[order].free_list) {
			unsigned long start_pfn, i;

			start_pfn = page_to_pfn(list_entry(curr, struct page, lru));

			for (i=0; i < (1<<order); i++)
				SetPageNosaveFree(pfn_to_page(start_pfn+i));
	}
	spin_unlock_irqrestore(&zone->lock, flags);
}
559
/*
 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 * Interrupts are disabled around the drain so the CPU id stays valid and
 * the per-cpu lists are not touched from interrupt context meanwhile.
 */
void drain_local_pages(void)
{
	unsigned long flags;

	local_irq_save(flags);	
	__drain_pages(smp_processor_id());
	local_irq_restore(flags);	
}
571#endif /* CONFIG_PM */
572
/*
 * Update the per-cpu NUMA allocation counters after a page was taken from
 * zone @z on behalf of a request whose preferred zone is the first entry
 * of @zonelist.  Compiles to nothing without CONFIG_NUMA.
 */
static void zone_statistics(struct zonelist *zonelist, struct zone *z)
{
#ifdef CONFIG_NUMA
	unsigned long flags;
	int cpu;
	struct zone *preferred = zonelist->zones[0];
	pg_data_t *pg = z->zone_pgdat;
	pg_data_t *orig = preferred->zone_pgdat;
	struct per_cpu_pageset *p;

	local_irq_save(flags);
	cpu = smp_processor_id();
	p = &z->pageset[cpu];

	/* Hit if the page came from the preferred zone's node. */
	if (pg != orig) {
		p->numa_miss++;
		preferred->pageset[cpu].numa_foreign++;
	} else {
		p->numa_hit++;
	}

	/* Independently, record whether that node is the one we run on. */
	if (pg == NODE_DATA(numa_node_id()))
		p->local_node++;
	else
		p->other_node++;
	local_irq_restore(flags);
#endif
}
598
/*
 * Free a 0-order page
 *
 * @page: page to free
 * @cold: index into the per-cpu pcp[] array selecting which list the page
 *        goes on (free_hot_page() passes 0, free_cold_page() passes 1)
 *
 * The page is placed on the current CPU's per-cpu list for its zone.  If
 * that list already holds pcp->high pages or more, pcp->batch pages are
 * first returned to the buddy allocator.
 */
static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
	struct zone *zone = page_zone(page);
	struct per_cpu_pages *pcp;
	unsigned long flags;

	arch_free_page(page, 0);

	kernel_map_pages(page, 1, 0);
	inc_page_state(pgfree);
	/* Clear the mapping of anonymous pages so free_pages_check() passes. */
	if (PageAnon(page))
		page->mapping = NULL;
	free_pages_check(__FUNCTION__, page);
	/* get_cpu() is paired with put_cpu() at the end of the function. */
	pcp = &zone->pageset[get_cpu()].pcp[cold];
	local_irq_save(flags);
	if (pcp->count >= pcp->high)
		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
	list_add(&page->lru, &pcp->list);
	pcp->count++;
	local_irq_restore(flags);
	put_cpu();
}
625
/* Free a 0-order page onto the per-cpu list selected by cold == 0. */
void fastcall free_hot_page(struct page *page)
{
	free_hot_cold_page(page, 0);
}
630
/* Free a 0-order page onto the per-cpu list selected by cold == 1. */
void fastcall free_cold_page(struct page *page)
{
	free_hot_cold_page(page, 1);
}
635
/*
 * Zero all 1 << order pages of a freshly allocated block using
 * clear_highpage().  Triggers BUG if @gfp_flags has __GFP_HIGHMEM set
 * without __GFP_WAIT.
 */
static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags)
{
	int i;

	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
	for(i = 0; i < (1 << order); i++)
		clear_highpage(page + i);
}
644
/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 * or two.
 *
 * Allocate a block of 1 << order pages from @zone.  Order-0 requests are
 * served from the current CPU's per-cpu list (hot or cold according to
 * __GFP_COLD), refilling it from the buddy allocator when it runs low;
 * anything else, or an empty per-cpu list, goes to __rmqueue() under
 * zone->lock.  Returns the prepared page, or NULL on failure.
 */
static struct page *
buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
{
	unsigned long flags;
	struct page *page = NULL;
	int cold = !!(gfp_flags & __GFP_COLD);

	if (order == 0) {
		struct per_cpu_pages *pcp;

		pcp = &zone->pageset[get_cpu()].pcp[cold];
		local_irq_save(flags);
		/* Top the list up with a batch when at or below the low mark. */
		if (pcp->count <= pcp->low)
			pcp->count += rmqueue_bulk(zone, 0,
						pcp->batch, &pcp->list);
		if (pcp->count) {
			page = list_entry(pcp->list.next, struct page, lru);
			list_del(&page->lru);
			pcp->count--;
		}
		local_irq_restore(flags);
		put_cpu();
	}

	/* Higher orders, or nothing available on the per-cpu list. */
	if (page == NULL) {
		spin_lock_irqsave(&zone->lock, flags);
		page = __rmqueue(zone, order);
		spin_unlock_irqrestore(&zone->lock, flags);
	}

	if (page != NULL) {
		BUG_ON(bad_range(zone, page));
		mod_page_state_zone(zone, pgalloc, 1 << order);
		prep_new_page(page, order);

		if (gfp_flags & __GFP_ZERO)
			prep_zero_page(page, order, gfp_flags);

		if (order && (gfp_flags & __GFP_COMP))
			prep_compound_page(page, order);
	}
	return page;
}
693
694/*
695 * Return 1 if free pages are above 'mark'. This takes into account the order
696 * of the allocation.
697 */
698int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
699 int classzone_idx, int can_try_harder, int gfp_high)
700{
701 /* free_pages my go negative - that's OK */
702 long min = mark, free_pages = z->free_pages - (1 << order) + 1;
703 int o;
704
705 if (gfp_high)
706 min -= min / 2;
707 if (can_try_harder)
708 min -= min / 4;
709
710 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
711 return 0;
712 for (o = 0; o < order; o++) {
713 /* At the next order, this order's pages become unavailable */
714 free_pages -= z->free_area[o].nr_free << o;
715
716 /* Require fewer higher order pages to be free */
717 min >>= 1;
718
719 if (free_pages <= min)
720 return 0;
721 }
722 return 1;
723}
724
/*
 * This is the 'heart' of the zoned buddy allocator.
 *
 * @gfp_mask: allocation flags
 * @order:    the request is for 1 << order pages
 * @zonelist: NULL-terminated list of candidate zones, tried in order
 *
 * Tries the zonelist with progressively lower watermarks, then (if the
 * caller may sleep) runs direct reclaim and retries.  Returns the
 * allocated page, or NULL on failure.
 */
struct page * fastcall
__alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
		struct zonelist *zonelist)
{
	const int wait = gfp_mask & __GFP_WAIT;
	struct zone **zones, *z;
	struct page *page;
	struct reclaim_state reclaim_state;
	struct task_struct *p = current;
	int i;
	int classzone_idx;
	int do_retry;
	int can_try_harder;
	int did_some_progress;

	might_sleep_if(wait);

	/*
	 * The caller may dip into page reserves a bit more if the caller
	 * cannot run direct reclaim, or if the caller has realtime scheduling
	 * policy
	 */
	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;

	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */

	if (unlikely(zones[0] == NULL)) {
		/* Should this ever happen?? */
		return NULL;
	}

	classzone_idx = zone_idx(zones[0]);

restart:
	/* Go through the zonelist once, looking for a zone with enough free */
	for (i = 0; (z = zones[i]) != NULL; i++) {

		if (!zone_watermark_ok(z, order, z->pages_low,
				       classzone_idx, 0, 0))
			continue;

		if (!cpuset_zone_allowed(z))
			continue;

		page = buffered_rmqueue(z, order, gfp_mask);
		if (page)
			goto got_pg;
	}

	/* Nothing above pages_low: wake kswapd for every zone in the list. */
	for (i = 0; (z = zones[i]) != NULL; i++)
		wakeup_kswapd(z, order);

	/*
	 * Go through the zonelist again. Let __GFP_HIGH and allocations
	 * coming from realtime tasks to go deeper into reserves
	 *
	 * This is the last chance, in general, before the goto nopage.
	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
	 */
	for (i = 0; (z = zones[i]) != NULL; i++) {
		if (!zone_watermark_ok(z, order, z->pages_min,
				       classzone_idx, can_try_harder,
				       gfp_mask & __GFP_HIGH))
			continue;

		if (wait && !cpuset_zone_allowed(z))
			continue;

		page = buffered_rmqueue(z, order, gfp_mask);
		if (page)
			goto got_pg;
	}

	/* This allocation should allow future memory freeing. */
	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) {
		/* go through the zonelist yet again, ignoring mins */
		for (i = 0; (z = zones[i]) != NULL; i++) {
			if (!cpuset_zone_allowed(z))
				continue;
			page = buffered_rmqueue(z, order, gfp_mask);
			if (page)
				goto got_pg;
		}
		goto nopage;
	}

	/* Atomic allocations - we can't balance anything */
	if (!wait)
		goto nopage;

rebalance:
	cond_resched();

	/* We now go into synchronous reclaim */
	p->flags |= PF_MEMALLOC;
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	did_some_progress = try_to_free_pages(zones, gfp_mask, order);

	p->reclaim_state = NULL;
	p->flags &= ~PF_MEMALLOC;

	cond_resched();

	if (likely(did_some_progress)) {
		/*
		 * Reclaim made progress: go through the zonelist once more
		 * against the pages_min watermark, with the same relaxations
		 * (can_try_harder, __GFP_HIGH) as the pre-reclaim pass.
		 */
		for (i = 0; (z = zones[i]) != NULL; i++) {
			if (!zone_watermark_ok(z, order, z->pages_min,
					       classzone_idx, can_try_harder,
					       gfp_mask & __GFP_HIGH))
				continue;

			if (!cpuset_zone_allowed(z))
				continue;

			page = buffered_rmqueue(z, order, gfp_mask);
			if (page)
				goto got_pg;
		}
	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
		/*
		 * Go through the zonelist yet one more time, keep
		 * very high watermark here, this is only to catch
		 * a parallel oom killing, we must fail if we're still
		 * under heavy pressure.
		 */
		for (i = 0; (z = zones[i]) != NULL; i++) {
			if (!zone_watermark_ok(z, order, z->pages_high,
					       classzone_idx, 0, 0))
				continue;

			if (!cpuset_zone_allowed(z))
				continue;

			page = buffered_rmqueue(z, order, gfp_mask);
			if (page)
				goto got_pg;
		}

		/* Still nothing: invoke the OOM handler and start over. */
		out_of_memory(gfp_mask);
		goto restart;
	}

	/*
	 * Don't let big-order allocations loop unless the caller explicitly
	 * requests that.  Wait for some write requests to complete then retry.
	 *
	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
	 * <= 3, but that may not be true in other implementations.
	 */
	do_retry = 0;
	if (!(gfp_mask & __GFP_NORETRY)) {
		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
			do_retry = 1;
		if (gfp_mask & __GFP_NOFAIL)
			do_retry = 1;
	}
	if (do_retry) {
		blk_congestion_wait(WRITE, HZ/50);
		goto rebalance;
	}

nopage:
	/* Rate-limited failure report, suppressed by __GFP_NOWARN. */
	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
		printk(KERN_WARNING "%s: page allocation failure."
			" order:%d, mode:0x%x\n",
			p->comm, order, gfp_mask);
		dump_stack();
	}
	return NULL;
got_pg:
	/* z is the zone the page was taken from. */
	zone_statistics(zonelist, z);
	return page;
}

EXPORT_SYMBOL(__alloc_pages);
910
911/*
912 * Common helper functions.
913 */
914fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned int order)
915{
916 struct page * page;
917 page = alloc_pages(gfp_mask, order);
918 if (!page)
919 return 0;
920 return (unsigned long) page_address(page);
921}
922
923EXPORT_SYMBOL(__get_free_pages);
924
925fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask)
926{
927 struct page * page;
928
929 /*
930 * get_zeroed_page() returns a 32-bit address, which cannot represent
931 * a highmem page
932 */
933 BUG_ON(gfp_mask & __GFP_HIGHMEM);
934
935 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
936 if (page)
937 return (unsigned long) page_address(page);
938 return 0;
939}
940
941EXPORT_SYMBOL(get_zeroed_page);
942
943void __pagevec_free(struct pagevec *pvec)
944{
945 int i = pagevec_count(pvec);
946
947 while (--i >= 0)
948 free_hot_cold_page(pvec->pages[i], pvec->cold);
949}
950
951fastcall void __free_pages(struct page *page, unsigned int order)
952{
953 if (!PageReserved(page) && put_page_testzero(page)) {
954 if (order == 0)
955 free_hot_page(page);
956 else
957 __free_pages_ok(page, order);
958 }
959}
960
961EXPORT_SYMBOL(__free_pages);
962
963fastcall void free_pages(unsigned long addr, unsigned int order)
964{
965 if (addr != 0) {
966 BUG_ON(!virt_addr_valid((void *)addr));
967 __free_pages(virt_to_page((void *)addr), order);
968 }
969}
970
971EXPORT_SYMBOL(free_pages);
972
/*
 * Total amount of free (allocatable) RAM:
 * the sum of free_pages over every zone, in pages.  The counters are read
 * without taking zone->lock.
 */
unsigned int nr_free_pages(void)
{
	unsigned int sum = 0;
	struct zone *zone;

	for_each_zone(zone)
		sum += zone->free_pages;

	return sum;
}

EXPORT_SYMBOL(nr_free_pages);
988
#ifdef CONFIG_NUMA
/* Sum of free_pages over all MAX_NR_ZONES zones of node @pgdat. */
unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
{
	unsigned int i, sum = 0;

	for (i = 0; i < MAX_NR_ZONES; i++)
		sum += pgdat->node_zones[i].free_pages;

	return sum;
}
#endif
1000
1001static unsigned int nr_free_zone_pages(int offset)
1002{
1003 pg_data_t *pgdat;
1004 unsigned int sum = 0;
1005
1006 for_each_pgdat(pgdat) {
1007 struct zonelist *zonelist = pgdat->node_zonelists + offset;
1008 struct zone **zonep = zonelist->zones;
1009 struct zone *zone;
1010
1011 for (zone = *zonep++; zone; zone = *zonep++) {
1012 unsigned long size = zone->present_pages;
1013 unsigned long high = zone->pages_high;
1014 if (size > high)
1015 sum += size - high;
1016 }
1017 }
1018
1019 return sum;
1020}
1021
/*
 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
 *
 * Computed by nr_free_zone_pages() over the GFP_USER zonelist, i.e. from
 * present_pages minus pages_high rather than from the free-page counters.
 */
unsigned int nr_free_buffer_pages(void)
{
	return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
}
1029
/*
 * Amount of free RAM allocatable within all zones
 *
 * Computed by nr_free_zone_pages() over the GFP_HIGHUSER zonelist, i.e.
 * from present_pages minus pages_high rather than from the free-page
 * counters.
 */
unsigned int nr_free_pagecache_pages(void)
{
	return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
}
1037
#ifdef CONFIG_HIGHMEM
/* Sum of free_pages in the ZONE_HIGHMEM zone of every node. */
unsigned int nr_free_highpages (void)
{
	pg_data_t *pgdat;
	unsigned int pages = 0;

	for_each_pgdat(pgdat)
		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;

	return pages;
}
#endif
1050
#ifdef CONFIG_NUMA
/* Print "Node <id> " for the node that owns @zone (no trailing newline). */
static void show_node(struct zone *zone)
{
	printk("Node %d ", zone->zone_pgdat->node_id);
}
#else
/* Without NUMA there is no node prefix to print. */
#define show_node(zone)	do { } while (0)
#endif
1059
/*
 * Accumulate the page_state information across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
/* Per-cpu page_state counters, zero-initialised; summed by the readers below. */
static DEFINE_PER_CPU(struct page_state, page_states) = {0};

atomic_t nr_pagecache = ATOMIC_INIT(0);
EXPORT_SYMBOL(nr_pagecache);
#ifdef CONFIG_SMP
/* Per-cpu companion counter, defined only on SMP builds. */
DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
#endif
1072
/*
 * Sum the first @nr unsigned long fields of every online CPU's
 * page_states into *@ret.
 *
 * @ret: result buffer; fully zeroed first, so fields beyond @nr read 0
 * @nr:  number of leading unsigned long words of struct page_state to sum
 */
void __get_page_state(struct page_state *ret, int nr)
{
	int cpu = 0;

	memset(ret, 0, sizeof(*ret));

	cpu = first_cpu(cpu_online_map);
	while (cpu < NR_CPUS) {
		unsigned long *in, *out, off;

		in = (unsigned long *)&per_cpu(page_states, cpu);

		/* Advance now so the next CPU's data can be prefetched
		 * while this CPU's counters are being added up. */
		cpu = next_cpu(cpu, cpu_online_map);

		if (cpu < NR_CPUS)
			prefetch(&per_cpu(page_states, cpu));

		out = (unsigned long *)ret;
		for (off = 0; off < nr; off++)
			*out++ += *in++;
	}
}
1095
/*
 * Fill *@ret with the summed counters from the start of struct page_state
 * up to and including the field named by GET_PAGE_STATE_LAST; later
 * fields are left at zero by __get_page_state().
 */
void get_page_state(struct page_state *ret)
{
	int nr;

	/* Word index of the last field wanted ... */
	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
	nr /= sizeof(unsigned long);

	/* ... plus one to turn the index into a count. */
	__get_page_state(ret, nr + 1);
}
1105
/* Fill *@ret with the sum of every counter in struct page_state. */
void get_full_page_state(struct page_state *ret)
{
	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
}
1110
1111unsigned long __read_page_state(unsigned offset)
1112{
1113 unsigned long ret = 0;
1114 int cpu;
1115
1116 for_each_online_cpu(cpu) {
1117 unsigned long in;
1118
1119 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
1120 ret += *((unsigned long *)in);
1121 }
1122 return ret;
1123}
1124
1125void __mod_page_state(unsigned offset, unsigned long delta)
1126{
1127 unsigned long flags;
1128 void* ptr;
1129
1130 local_irq_save(flags);
1131 ptr = &__get_cpu_var(page_states);
1132 *(unsigned long*)(ptr + offset) += delta;
1133 local_irq_restore(flags);
1134}
1135
1136EXPORT_SYMBOL(__mod_page_state);
1137
1138void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1139 unsigned long *free, struct pglist_data *pgdat)
1140{
1141 struct zone *zones = pgdat->node_zones;
1142 int i;
1143
1144 *active = 0;
1145 *inactive = 0;
1146 *free = 0;
1147 for (i = 0; i < MAX_NR_ZONES; i++) {
1148 *active += zones[i].nr_active;
1149 *inactive += zones[i].nr_inactive;
1150 *free += zones[i].free_pages;
1151 }
1152}
1153
1154void get_zone_counts(unsigned long *active,
1155 unsigned long *inactive, unsigned long *free)
1156{
1157 struct pglist_data *pgdat;
1158
1159 *active = 0;
1160 *inactive = 0;
1161 *free = 0;
1162 for_each_pgdat(pgdat) {
1163 unsigned long l, m, n;
1164 __get_zone_counts(&l, &m, &n, pgdat);
1165 *active += l;
1166 *inactive += m;
1167 *free += n;
1168 }
1169}
1170
1171void si_meminfo(struct sysinfo *val)
1172{
1173 val->totalram = totalram_pages;
1174 val->sharedram = 0;
1175 val->freeram = nr_free_pages();
1176 val->bufferram = nr_blockdev_pages();
1177#ifdef CONFIG_HIGHMEM
1178 val->totalhigh = totalhigh_pages;
1179 val->freehigh = nr_free_highpages();
1180#else
1181 val->totalhigh = 0;
1182 val->freehigh = 0;
1183#endif
1184 val->mem_unit = PAGE_SIZE;
1185}
1186
1187EXPORT_SYMBOL(si_meminfo);
1188
#ifdef CONFIG_NUMA
/*
 * Per-node counterpart of si_meminfo(): fill the total/free RAM and
 * highmem fields of @val from node @nid, in units of PAGE_SIZE.
 * Only those fields and mem_unit are written.
 */
void si_meminfo_node(struct sysinfo *val, int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	struct zone *highmem = &pgdat->node_zones[ZONE_HIGHMEM];

	val->totalram = pgdat->node_present_pages;
	val->freeram = nr_free_pages_pgdat(pgdat);
	val->totalhigh = highmem->present_pages;
	val->freehigh = highmem->free_pages;
	val->mem_unit = PAGE_SIZE;
}
#endif
1201
1202#define K(x) ((x) << (PAGE_SHIFT-10))
1203
1204/*
1205 * Show free area list (used inside shift_scroll-lock stuff)
1206 * We also calculate the percentage fragmentation. We do this by counting the
1207 * memory on each free list with the exception of the first item on the list.
1208 */
1209void show_free_areas(void)
1210{
1211 struct page_state ps;
1212 int cpu, temperature;
1213 unsigned long active;
1214 unsigned long inactive;
1215 unsigned long free;
1216 struct zone *zone;
1217
1218 for_each_zone(zone) {
1219 show_node(zone);
1220 printk("%s per-cpu:", zone->name);
1221
1222 if (!zone->present_pages) {
1223 printk(" empty\n");
1224 continue;
1225 } else
1226 printk("\n");
1227
1228 for (cpu = 0; cpu < NR_CPUS; ++cpu) {
1229 struct per_cpu_pageset *pageset;
1230
1231 if (!cpu_possible(cpu))
1232 continue;
1233
1234 pageset = zone->pageset + cpu;
1235
1236 for (temperature = 0; temperature < 2; temperature++)
1237 printk("cpu %d %s: low %d, high %d, batch %d\n",
1238 cpu,
1239 temperature ? "cold" : "hot",
1240 pageset->pcp[temperature].low,
1241 pageset->pcp[temperature].high,
1242 pageset->pcp[temperature].batch);
1243 }
1244 }
1245
1246 get_page_state(&ps);
1247 get_zone_counts(&active, &inactive, &free);
1248
1249 printk("\nFree pages: %11ukB (%ukB HighMem)\n",
1250 K(nr_free_pages()),
1251 K(nr_free_highpages()));
1252
1253 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
1254 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1255 active,
1256 inactive,
1257 ps.nr_dirty,
1258 ps.nr_writeback,
1259 ps.nr_unstable,
1260 nr_free_pages(),
1261 ps.nr_slab,
1262 ps.nr_mapped,
1263 ps.nr_page_table_pages);
1264
1265 for_each_zone(zone) {
1266 int i;
1267
1268 show_node(zone);
1269 printk("%s"
1270 " free:%lukB"
1271 " min:%lukB"
1272 " low:%lukB"
1273 " high:%lukB"
1274 " active:%lukB"
1275 " inactive:%lukB"
1276 " present:%lukB"
1277 " pages_scanned:%lu"
1278 " all_unreclaimable? %s"
1279 "\n",
1280 zone->name,
1281 K(zone->free_pages),
1282 K(zone->pages_min),
1283 K(zone->pages_low),
1284 K(zone->pages_high),
1285 K(zone->nr_active),
1286 K(zone->nr_inactive),
1287 K(zone->present_pages),
1288 zone->pages_scanned,
1289 (zone->all_unreclaimable ? "yes" : "no")
1290 );
1291 printk("lowmem_reserve[]:");
1292 for (i = 0; i < MAX_NR_ZONES; i++)
1293 printk(" %lu", zone->lowmem_reserve[i]);
1294 printk("\n");
1295 }
1296
1297 for_each_zone(zone) {
1298 unsigned long nr, flags, order, total = 0;
1299
1300 show_node(zone);
1301 printk("%s: ", zone->name);
1302 if (!zone->present_pages) {
1303 printk("empty\n");
1304 continue;
1305 }
1306
1307 spin_lock_irqsave(&zone->lock, flags);
1308 for (order = 0; order < MAX_ORDER; order++) {
1309 nr = zone->free_area[order].nr_free;
1310 total += nr << order;
1311 printk("%lu*%lukB ", nr, K(1UL) << order);
1312 }
1313 spin_unlock_irqrestore(&zone->lock, flags);
1314 printk("= %lukB\n", K(total));
1315 }
1316
1317 show_swap_cache_info();
1318}
1319
1320/*
1321 * Builds allocation fallback zone lists.
1322 */
1323static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
1324{
1325 switch (k) {
1326 struct zone *zone;
1327 default:
1328 BUG();
1329 case ZONE_HIGHMEM:
1330 zone = pgdat->node_zones + ZONE_HIGHMEM;
1331 if (zone->present_pages) {
1332#ifndef CONFIG_HIGHMEM
1333 BUG();
1334#endif
1335 zonelist->zones[j++] = zone;
1336 }
1337 case ZONE_NORMAL:
1338 zone = pgdat->node_zones + ZONE_NORMAL;
1339 if (zone->present_pages)
1340 zonelist->zones[j++] = zone;
1341 case ZONE_DMA:
1342 zone = pgdat->node_zones + ZONE_DMA;
1343 if (zone->present_pages)
1344 zonelist->zones[j++] = zone;
1345 }
1346
1347 return j;
1348}
1349
1350#ifdef CONFIG_NUMA
1351#define MAX_NODE_LOAD (num_online_nodes())
1352static int __initdata node_load[MAX_NUMNODES];
1353/**
1354 * find_next_best_node - find the next node that should appear in a given
1355 * node's fallback list
1356 * @node: node whose fallback list we're appending
1357 * @used_node_mask: nodemask_t of already used nodes
1358 *
1359 * We use a number of factors to determine which is the next node that should
1360 * appear on a given node's fallback list. The node should not have appeared
1361 * already in @node's fallback list, and it should be the next closest node
1362 * according to the distance array (which contains arbitrary distance values
1363 * from each node to each node in the system), and should also prefer nodes
1364 * with no CPUs, since presumably they'll have very little allocation pressure
1365 * on them otherwise.
1366 * It returns -1 if no node is found.
1367 */
1368static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
1369{
1370 int i, n, val;
1371 int min_val = INT_MAX;
1372 int best_node = -1;
1373
1374 for_each_online_node(i) {
1375 cpumask_t tmp;
1376
1377 /* Start from local node */
1378 n = (node+i) % num_online_nodes();
1379
1380 /* Don't want a node to appear more than once */
1381 if (node_isset(n, *used_node_mask))
1382 continue;
1383
1384 /* Use the local node if we haven't already */
1385 if (!node_isset(node, *used_node_mask)) {
1386 best_node = node;
1387 break;
1388 }
1389
1390 /* Use the distance array to find the distance */
1391 val = node_distance(node, n);
1392
1393 /* Give preference to headless and unused nodes */
1394 tmp = node_to_cpumask(n);
1395 if (!cpus_empty(tmp))
1396 val += PENALTY_FOR_NODE_WITH_CPUS;
1397
1398 /* Slight preference for less loaded node */
1399 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
1400 val += node_load[n];
1401
1402 if (val < min_val) {
1403 min_val = val;
1404 best_node = n;
1405 }
1406 }
1407
1408 if (best_node >= 0)
1409 node_set(best_node, *used_node_mask);
1410
1411 return best_node;
1412}
1413
1414static void __init build_zonelists(pg_data_t *pgdat)
1415{
1416 int i, j, k, node, local_node;
1417 int prev_node, load;
1418 struct zonelist *zonelist;
1419 nodemask_t used_mask;
1420
1421 /* initialize zonelists */
1422 for (i = 0; i < GFP_ZONETYPES; i++) {
1423 zonelist = pgdat->node_zonelists + i;
1424 zonelist->zones[0] = NULL;
1425 }
1426
1427 /* NUMA-aware ordering of nodes */
1428 local_node = pgdat->node_id;
1429 load = num_online_nodes();
1430 prev_node = local_node;
1431 nodes_clear(used_mask);
1432 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1433 /*
1434 * We don't want to pressure a particular node.
1435 * So adding penalty to the first node in same
1436 * distance group to make it round-robin.
1437 */
1438 if (node_distance(local_node, node) !=
1439 node_distance(local_node, prev_node))
1440 node_load[node] += load;
1441 prev_node = node;
1442 load--;
1443 for (i = 0; i < GFP_ZONETYPES; i++) {
1444 zonelist = pgdat->node_zonelists + i;
1445 for (j = 0; zonelist->zones[j] != NULL; j++);
1446
1447 k = ZONE_NORMAL;
1448 if (i & __GFP_HIGHMEM)
1449 k = ZONE_HIGHMEM;
1450 if (i & __GFP_DMA)
1451 k = ZONE_DMA;
1452
1453 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1454 zonelist->zones[j] = NULL;
1455 }
1456 }
1457}
1458
1459#else /* CONFIG_NUMA */
1460
1461static void __init build_zonelists(pg_data_t *pgdat)
1462{
1463 int i, j, k, node, local_node;
1464
1465 local_node = pgdat->node_id;
1466 for (i = 0; i < GFP_ZONETYPES; i++) {
1467 struct zonelist *zonelist;
1468
1469 zonelist = pgdat->node_zonelists + i;
1470
1471 j = 0;
1472 k = ZONE_NORMAL;
1473 if (i & __GFP_HIGHMEM)
1474 k = ZONE_HIGHMEM;
1475 if (i & __GFP_DMA)
1476 k = ZONE_DMA;
1477
1478 j = build_zonelists_node(pgdat, zonelist, j, k);
1479 /*
1480 * Now we build the zonelist so that it contains the zones
1481 * of all the other nodes.
1482 * We don't want to pressure a particular node, so when
1483 * building the zones for node N, we make sure that the
1484 * zones coming right after the local ones are those from
1485 * node N+1 (modulo N)
1486 */
1487 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1488 if (!node_online(node))
1489 continue;
1490 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1491 }
1492 for (node = 0; node < local_node; node++) {
1493 if (!node_online(node))
1494 continue;
1495 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1496 }
1497
1498 zonelist->zones[j] = NULL;
1499 }
1500}
1501
1502#endif /* CONFIG_NUMA */
1503
1504void __init build_all_zonelists(void)
1505{
1506 int i;
1507
1508 for_each_online_node(i)
1509 build_zonelists(NODE_DATA(i));
1510 printk("Built %i zonelists\n", num_online_nodes());
1511 cpuset_init_current_mems_allowed();
1512}
1513
/*
 * Helper functions to size the waitqueue hash table.
 * The table should be large enough that collisions between waiters on
 * different pages are rare.  In practice the number of active page
 * waitqueues on typical systems is very low (less than 200), so the
 * sizing below is conservative even though it looks large.
 *
 * PAGES_PER_WAITQUEUE is the ratio of pages to waitqueues, i.e. it
 * determines the size of the waitq table for a given number of pages.
 */
#define PAGES_PER_WAITQUEUE 256

/*
 * Return the number of hash waitqueues for a zone spanning @pages pages:
 * pages / PAGES_PER_WAITQUEUE rounded up to a power of two and clamped
 * to the range [4, 4096].
 */
static inline unsigned long wait_table_size(unsigned long pages)
{
	unsigned long wanted = pages / PAGES_PER_WAITQUEUE;
	unsigned long size;

	/* Smallest power of two that is >= wanted */
	for (size = 1; size < wanted; size <<= 1)
		;

	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	if (size > 4096UL)
		size = 4096UL;

	if (size < 4UL)
		size = 4UL;

	return size;
}
1545
/*
 * Integer logarithm of the wait table size, so that shifts can be used
 * later to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
 *
 * Computed as the index of the first zero bit of ~size, i.e. the index
 * of the lowest set bit of @size.
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	unsigned long inverted = ~size;

	return ffz(inverted);
}
1555
1556#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1557
1558static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1559 unsigned long *zones_size, unsigned long *zholes_size)
1560{
1561 unsigned long realtotalpages, totalpages = 0;
1562 int i;
1563
1564 for (i = 0; i < MAX_NR_ZONES; i++)
1565 totalpages += zones_size[i];
1566 pgdat->node_spanned_pages = totalpages;
1567
1568 realtotalpages = totalpages;
1569 if (zholes_size)
1570 for (i = 0; i < MAX_NR_ZONES; i++)
1571 realtotalpages -= zholes_size[i];
1572 pgdat->node_present_pages = realtotalpages;
1573 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1574}
1575
1576
1577/*
1578 * Initially all pages are reserved - free ones are freed
1579 * up by free_all_bootmem() once the early boot process is
1580 * done. Non-atomic initialization, single-pass.
1581 */
1582void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1583 unsigned long start_pfn)
1584{
1585 struct page *start = pfn_to_page(start_pfn);
1586 struct page *page;
1587
1588 for (page = start; page < (start + size); page++) {
1589 set_page_zone(page, NODEZONE(nid, zone));
1590 set_page_count(page, 0);
1591 reset_page_mapcount(page);
1592 SetPageReserved(page);
1593 INIT_LIST_HEAD(&page->lru);
1594#ifdef WANT_PAGE_VIRTUAL
1595 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1596 if (!is_highmem_idx(zone))
1597 set_page_address(page, __va(start_pfn << PAGE_SHIFT));
1598#endif
1599 start_pfn++;
1600 }
1601}
1602
1603void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1604 unsigned long size)
1605{
1606 int order;
1607 for (order = 0; order < MAX_ORDER ; order++) {
1608 INIT_LIST_HEAD(&zone->free_area[order].free_list);
1609 zone->free_area[order].nr_free = 0;
1610 }
1611}
1612
1613#ifndef __HAVE_ARCH_MEMMAP_INIT
1614#define memmap_init(size, nid, zone, start_pfn) \
1615 memmap_init_zone((size), (nid), (zone), (start_pfn))
1616#endif
1617
1618/*
1619 * Set up the zone data structures:
1620 * - mark all pages reserved
1621 * - mark all memory queues empty
1622 * - clear the memory bitmaps
1623 */
1624static void __init free_area_init_core(struct pglist_data *pgdat,
1625 unsigned long *zones_size, unsigned long *zholes_size)
1626{
1627 unsigned long i, j;
1628 const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
1629 int cpu, nid = pgdat->node_id;
1630 unsigned long zone_start_pfn = pgdat->node_start_pfn;
1631
1632 pgdat->nr_zones = 0;
1633 init_waitqueue_head(&pgdat->kswapd_wait);
1634 pgdat->kswapd_max_order = 0;
1635
1636 for (j = 0; j < MAX_NR_ZONES; j++) {
1637 struct zone *zone = pgdat->node_zones + j;
1638 unsigned long size, realsize;
1639 unsigned long batch;
1640
1641 zone_table[NODEZONE(nid, j)] = zone;
1642 realsize = size = zones_size[j];
1643 if (zholes_size)
1644 realsize -= zholes_size[j];
1645
1646 if (j == ZONE_DMA || j == ZONE_NORMAL)
1647 nr_kernel_pages += realsize;
1648 nr_all_pages += realsize;
1649
1650 zone->spanned_pages = size;
1651 zone->present_pages = realsize;
1652 zone->name = zone_names[j];
1653 spin_lock_init(&zone->lock);
1654 spin_lock_init(&zone->lru_lock);
1655 zone->zone_pgdat = pgdat;
1656 zone->free_pages = 0;
1657
1658 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
1659
1660 /*
1661 * The per-cpu-pages pools are set to around 1000th of the
1662 * size of the zone. But no more than 1/4 of a meg - there's
1663 * no point in going beyond the size of L2 cache.
1664 *
1665 * OK, so we don't know how big the cache is. So guess.
1666 */
1667 batch = zone->present_pages / 1024;
1668 if (batch * PAGE_SIZE > 256 * 1024)
1669 batch = (256 * 1024) / PAGE_SIZE;
1670 batch /= 4; /* We effectively *= 4 below */
1671 if (batch < 1)
1672 batch = 1;
1673
1674 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1675 struct per_cpu_pages *pcp;
1676
1677 pcp = &zone->pageset[cpu].pcp[0]; /* hot */
1678 pcp->count = 0;
1679 pcp->low = 2 * batch;
1680 pcp->high = 6 * batch;
1681 pcp->batch = 1 * batch;
1682 INIT_LIST_HEAD(&pcp->list);
1683
1684 pcp = &zone->pageset[cpu].pcp[1]; /* cold */
1685 pcp->count = 0;
1686 pcp->low = 0;
1687 pcp->high = 2 * batch;
1688 pcp->batch = 1 * batch;
1689 INIT_LIST_HEAD(&pcp->list);
1690 }
1691 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
1692 zone_names[j], realsize, batch);
1693 INIT_LIST_HEAD(&zone->active_list);
1694 INIT_LIST_HEAD(&zone->inactive_list);
1695 zone->nr_scan_active = 0;
1696 zone->nr_scan_inactive = 0;
1697 zone->nr_active = 0;
1698 zone->nr_inactive = 0;
1699 if (!size)
1700 continue;
1701
1702 /*
1703 * The per-page waitqueue mechanism uses hashed waitqueues
1704 * per zone.
1705 */
1706 zone->wait_table_size = wait_table_size(size);
1707 zone->wait_table_bits =
1708 wait_table_bits(zone->wait_table_size);
1709 zone->wait_table = (wait_queue_head_t *)
1710 alloc_bootmem_node(pgdat, zone->wait_table_size
1711 * sizeof(wait_queue_head_t));
1712
1713 for(i = 0; i < zone->wait_table_size; ++i)
1714 init_waitqueue_head(zone->wait_table + i);
1715
1716 pgdat->nr_zones = j+1;
1717
1718 zone->zone_mem_map = pfn_to_page(zone_start_pfn);
1719 zone->zone_start_pfn = zone_start_pfn;
1720
1721 if ((zone_start_pfn) & (zone_required_alignment-1))
1722 printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
1723
1724 memmap_init(size, nid, j, zone_start_pfn);
1725
1726 zone_start_pfn += size;
1727
1728 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
1729 }
1730}
1731
1732static void __init alloc_node_mem_map(struct pglist_data *pgdat)
1733{
1734 unsigned long size;
1735
1736 /* Skip empty nodes */
1737 if (!pgdat->node_spanned_pages)
1738 return;
1739
1740 /* ia64 gets its own node_mem_map, before this, without bootmem */
1741 if (!pgdat->node_mem_map) {
1742 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
1743 pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
1744 }
1745#ifndef CONFIG_DISCONTIGMEM
1746 /*
1747 * With no DISCONTIG, the global mem_map is just set as node 0's
1748 */
1749 if (pgdat == NODE_DATA(0))
1750 mem_map = NODE_DATA(0)->node_mem_map;
1751#endif
1752}
1753
1754void __init free_area_init_node(int nid, struct pglist_data *pgdat,
1755 unsigned long *zones_size, unsigned long node_start_pfn,
1756 unsigned long *zholes_size)
1757{
1758 pgdat->node_id = nid;
1759 pgdat->node_start_pfn = node_start_pfn;
1760 calculate_zone_totalpages(pgdat, zones_size, zholes_size);
1761
1762 alloc_node_mem_map(pgdat);
1763
1764 free_area_init_core(pgdat, zones_size, zholes_size);
1765}
1766
1767#ifndef CONFIG_DISCONTIGMEM
1768static bootmem_data_t contig_bootmem_data;
1769struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
1770
1771EXPORT_SYMBOL(contig_page_data);
1772
1773void __init free_area_init(unsigned long *zones_size)
1774{
1775 free_area_init_node(0, &contig_page_data, zones_size,
1776 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
1777}
1778#endif
1779
1780#ifdef CONFIG_PROC_FS
1781
1782#include <linux/seq_file.h>
1783
1784static void *frag_start(struct seq_file *m, loff_t *pos)
1785{
1786 pg_data_t *pgdat;
1787 loff_t node = *pos;
1788
1789 for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
1790 --node;
1791
1792 return pgdat;
1793}
1794
1795static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1796{
1797 pg_data_t *pgdat = (pg_data_t *)arg;
1798
1799 (*pos)++;
1800 return pgdat->pgdat_next;
1801}
1802
/* seq_file ->stop: frag_start() acquires nothing, so nothing to release. */
static void frag_stop(struct seq_file *m, void *arg)
{
}
1806
1807/*
1808 * This walks the free areas for each zone.
1809 */
1810static int frag_show(struct seq_file *m, void *arg)
1811{
1812 pg_data_t *pgdat = (pg_data_t *)arg;
1813 struct zone *zone;
1814 struct zone *node_zones = pgdat->node_zones;
1815 unsigned long flags;
1816 int order;
1817
1818 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1819 if (!zone->present_pages)
1820 continue;
1821
1822 spin_lock_irqsave(&zone->lock, flags);
1823 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1824 for (order = 0; order < MAX_ORDER; ++order)
1825 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
1826 spin_unlock_irqrestore(&zone->lock, flags);
1827 seq_putc(m, '\n');
1828 }
1829 return 0;
1830}
1831
1832struct seq_operations fragmentation_op = {
1833 .start = frag_start,
1834 .next = frag_next,
1835 .stop = frag_stop,
1836 .show = frag_show,
1837};
1838
/*
 * Names printed by vmstat_show().  Entry N labels the Nth unsigned long
 * of the struct page_state snapshot, so the order here has to follow
 * the member order of that structure.
 */
static char *vmstat_text[] = {
	/* current page state */
	"nr_dirty",
	"nr_writeback",
	"nr_unstable",
	"nr_page_table_pages",
	"nr_mapped",
	"nr_slab",

	/* paging and swap I/O */
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",
	"pgalloc_high",

	"pgalloc_normal",
	"pgalloc_dma",
	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",
	"pgrefill_high",
	"pgrefill_normal",
	"pgrefill_dma",

	"pgsteal_high",
	"pgsteal_normal",
	"pgsteal_dma",
	"pgscan_kswapd_high",
	"pgscan_kswapd_normal",

	"pgscan_kswapd_dma",
	"pgscan_direct_high",
	"pgscan_direct_normal",
	"pgscan_direct_dma",
	"pginodesteal",

	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
};
1885
1886static void *vmstat_start(struct seq_file *m, loff_t *pos)
1887{
1888 struct page_state *ps;
1889
1890 if (*pos >= ARRAY_SIZE(vmstat_text))
1891 return NULL;
1892
1893 ps = kmalloc(sizeof(*ps), GFP_KERNEL);
1894 m->private = ps;
1895 if (!ps)
1896 return ERR_PTR(-ENOMEM);
1897 get_full_page_state(ps);
1898 ps->pgpgin /= 2; /* sectors -> kbytes */
1899 ps->pgpgout /= 2;
1900 return (unsigned long *)ps + *pos;
1901}
1902
1903static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1904{
1905 (*pos)++;
1906 if (*pos >= ARRAY_SIZE(vmstat_text))
1907 return NULL;
1908 return (unsigned long *)m->private + *pos;
1909}
1910
1911static int vmstat_show(struct seq_file *m, void *arg)
1912{
1913 unsigned long *l = arg;
1914 unsigned long off = l - (unsigned long *)m->private;
1915
1916 seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
1917 return 0;
1918}
1919
1920static void vmstat_stop(struct seq_file *m, void *arg)
1921{
1922 kfree(m->private);
1923 m->private = NULL;
1924}
1925
1926struct seq_operations vmstat_op = {
1927 .start = vmstat_start,
1928 .next = vmstat_next,
1929 .stop = vmstat_stop,
1930 .show = vmstat_show,
1931};
1932
1933#endif /* CONFIG_PROC_FS */
1934
#ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU hotplug callback.  On CPU_DEAD, fold the dead CPU's local
 * pagecache count into nr_pagecache, drain its pages via
 * __drain_pages() and move its page_states counters onto the CPU
 * running this notifier.  Other actions are ignored.
 * Always returns NOTIFY_OK.
 */
static int page_alloc_cpu_notify(struct notifier_block *self,
				 unsigned long action, void *hcpu)
{
	const int nr_counters =
		sizeof(struct page_state) / sizeof(unsigned long);
	int cpu = (unsigned long)hcpu;
	unsigned long *mine, *dead;
	long *local_count;
	int idx;

	if (action != CPU_DEAD)
		return NOTIFY_OK;

	/* Drain local pagecache count. */
	local_count = &per_cpu(nr_pagecache_local, cpu);
	atomic_add(*local_count, &nr_pagecache);
	*local_count = 0;

	local_irq_disable();
	__drain_pages(cpu);

	/* Add dead cpu's page_states to our own. */
	mine = (unsigned long *)&__get_cpu_var(page_states);
	dead = (unsigned long *)&per_cpu(page_states, cpu);
	for (idx = 0; idx < nr_counters; idx++) {
		mine[idx] += dead[idx];
		dead[idx] = 0;
	}

	local_irq_enable();

	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
1968
1969void __init page_alloc_init(void)
1970{
1971 hotcpu_notifier(page_alloc_cpu_notify, 0);
1972}
1973
1974/*
1975 * setup_per_zone_lowmem_reserve - called whenever
1976 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
1977 * has a correct pages reserved value, so an adequate number of
1978 * pages are left in the zone after a successful __alloc_pages().
1979 */
1980static void setup_per_zone_lowmem_reserve(void)
1981{
1982 struct pglist_data *pgdat;
1983 int j, idx;
1984
1985 for_each_pgdat(pgdat) {
1986 for (j = 0; j < MAX_NR_ZONES; j++) {
1987 struct zone *zone = pgdat->node_zones + j;
1988 unsigned long present_pages = zone->present_pages;
1989
1990 zone->lowmem_reserve[j] = 0;
1991
1992 for (idx = j-1; idx >= 0; idx--) {
1993 struct zone *lower_zone;
1994
1995 if (sysctl_lowmem_reserve_ratio[idx] < 1)
1996 sysctl_lowmem_reserve_ratio[idx] = 1;
1997
1998 lower_zone = pgdat->node_zones + idx;
1999 lower_zone->lowmem_reserve[j] = present_pages /
2000 sysctl_lowmem_reserve_ratio[idx];
2001 present_pages += lower_zone->present_pages;
2002 }
2003 }
2004 }
2005}
2006
2007/*
2008 * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
2009 * that the pages_{min,low,high} values for each zone are set correctly
2010 * with respect to min_free_kbytes.
2011 */
2012static void setup_per_zone_pages_min(void)
2013{
2014 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
2015 unsigned long lowmem_pages = 0;
2016 struct zone *zone;
2017 unsigned long flags;
2018
2019 /* Calculate total number of !ZONE_HIGHMEM pages */
2020 for_each_zone(zone) {
2021 if (!is_highmem(zone))
2022 lowmem_pages += zone->present_pages;
2023 }
2024
2025 for_each_zone(zone) {
2026 spin_lock_irqsave(&zone->lru_lock, flags);
2027 if (is_highmem(zone)) {
2028 /*
2029 * Often, highmem doesn't need to reserve any pages.
2030 * But the pages_min/low/high values are also used for
2031 * batching up page reclaim activity so we need a
2032 * decent value here.
2033 */
2034 int min_pages;
2035
2036 min_pages = zone->present_pages / 1024;
2037 if (min_pages < SWAP_CLUSTER_MAX)
2038 min_pages = SWAP_CLUSTER_MAX;
2039 if (min_pages > 128)
2040 min_pages = 128;
2041 zone->pages_min = min_pages;
2042 } else {
2043 /* if it's a lowmem zone, reserve a number of pages
2044 * proportionate to the zone's size.
2045 */
2046 zone->pages_min = (pages_min * zone->present_pages) /
2047 lowmem_pages;
2048 }
2049
2050 /*
2051 * When interpreting these watermarks, just keep in mind that:
2052 * zone->pages_min == (zone->pages_min * 4) / 4;
2053 */
2054 zone->pages_low = (zone->pages_min * 5) / 4;
2055 zone->pages_high = (zone->pages_min * 6) / 4;
2056 spin_unlock_irqrestore(&zone->lru_lock, flags);
2057 }
2058}
2059
2060/*
2061 * Initialise min_free_kbytes.
2062 *
2063 * For small machines we want it small (128k min). For large machines
2064 * we want it large (64MB max). But it is not linear, because network
2065 * bandwidth does not increase linearly with machine size. We use
2066 *
2067 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
2068 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
2069 *
2070 * which yields
2071 *
2072 * 16MB: 512k
2073 * 32MB: 724k
2074 * 64MB: 1024k
2075 * 128MB: 1448k
2076 * 256MB: 2048k
2077 * 512MB: 2896k
2078 * 1024MB: 4096k
2079 * 2048MB: 5792k
2080 * 4096MB: 8192k
2081 * 8192MB: 11584k
2082 * 16384MB: 16384k
2083 */
2084static int __init init_per_zone_pages_min(void)
2085{
2086 unsigned long lowmem_kbytes;
2087
2088 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
2089
2090 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
2091 if (min_free_kbytes < 128)
2092 min_free_kbytes = 128;
2093 if (min_free_kbytes > 65536)
2094 min_free_kbytes = 65536;
2095 setup_per_zone_pages_min();
2096 setup_per_zone_lowmem_reserve();
2097 return 0;
2098}
2099module_init(init_per_zone_pages_min)
2100
2101/*
2102 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
2103 * that we can call two helper functions whenever min_free_kbytes
2104 * changes.
2105 */
2106int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
2107 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2108{
2109 proc_dointvec(table, write, file, buffer, length, ppos);
2110 setup_per_zone_pages_min();
2111 return 0;
2112}
2113
2114/*
2115 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
2116 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
2117 * whenever sysctl_lowmem_reserve_ratio changes.
2118 *
2119 * The reserve ratio obviously has absolutely no relation with the
2120 * pages_min watermarks. The lowmem reserve ratio can only make sense
2121 * if in function of the boot time zone sizes.
2122 */
2123int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
2124 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2125{
2126 proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2127 setup_per_zone_lowmem_reserve();
2128 return 0;
2129}
2130
2131__initdata int hashdist = HASHDIST_DEFAULT;
2132
2133#ifdef CONFIG_NUMA
2134static int __init set_hashdist(char *str)
2135{
2136 if (!str)
2137 return 0;
2138 hashdist = simple_strtoul(str, &str, 0);
2139 return 1;
2140}
2141__setup("hashdist=", set_hashdist);
2142#endif
2143
2144/*
2145 * allocate a large system hash table from bootmem
2146 * - it is assumed that the hash table must contain an exact power-of-2
2147 * quantity of entries
2148 * - limit is the number of hash buckets, not the total allocation size
2149 */
2150void *__init alloc_large_system_hash(const char *tablename,
2151 unsigned long bucketsize,
2152 unsigned long numentries,
2153 int scale,
2154 int flags,
2155 unsigned int *_hash_shift,
2156 unsigned int *_hash_mask,
2157 unsigned long limit)
2158{
2159 unsigned long long max = limit;
2160 unsigned long log2qty, size;
2161 void *table = NULL;
2162
2163 /* allow the kernel cmdline to have a say */
2164 if (!numentries) {
2165 /* round applicable memory size up to nearest megabyte */
2166 numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
2167 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
2168 numentries >>= 20 - PAGE_SHIFT;
2169 numentries <<= 20 - PAGE_SHIFT;
2170
2171 /* limit to 1 bucket per 2^scale bytes of low memory */
2172 if (scale > PAGE_SHIFT)
2173 numentries >>= (scale - PAGE_SHIFT);
2174 else
2175 numentries <<= (PAGE_SHIFT - scale);
2176 }
2177 /* rounded up to nearest power of 2 in size */
2178 numentries = 1UL << (long_log2(numentries) + 1);
2179
2180 /* limit allocation size to 1/16 total memory by default */
2181 if (max == 0) {
2182 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
2183 do_div(max, bucketsize);
2184 }
2185
2186 if (numentries > max)
2187 numentries = max;
2188
2189 log2qty = long_log2(numentries);
2190
2191 do {
2192 size = bucketsize << log2qty;
2193 if (flags & HASH_EARLY)
2194 table = alloc_bootmem(size);
2195 else if (hashdist)
2196 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
2197 else {
2198 unsigned long order;
2199 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
2200 ;
2201 table = (void*) __get_free_pages(GFP_ATOMIC, order);
2202 }
2203 } while (!table && size > PAGE_SIZE && --log2qty);
2204
2205 if (!table)
2206 panic("Failed to allocate %s hash table\n", tablename);
2207
2208 printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
2209 tablename,
2210 (1U << log2qty),
2211 long_log2(size) - PAGE_SHIFT,
2212 size);
2213
2214 if (_hash_shift)
2215 *_hash_shift = log2qty;
2216 if (_hash_mask)
2217 *_hash_mask = (1 << log2qty) - 1;
2218
2219 return table;
2220}
diff --git a/mm/page_io.c b/mm/page_io.c
new file mode 100644
index 00000000000..667c76df1ec
--- /dev/null
+++ b/mm/page_io.c
@@ -0,0 +1,160 @@
1/*
2 * linux/mm/page_io.c
3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 *
6 * Swap reorganised 29.12.95,
7 * Asynchronous swapping added 30.12.95. Stephen Tweedie
8 * Removed race in async swapping. 14.4.1996. Bruno Haible
9 * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
10 * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
11 */
12
13#include <linux/mm.h>
14#include <linux/kernel_stat.h>
15#include <linux/pagemap.h>
16#include <linux/swap.h>
17#include <linux/bio.h>
18#include <linux/swapops.h>
19#include <linux/writeback.h>
20#include <asm/pgtable.h>
21
22static struct bio *get_swap_bio(unsigned int __nocast gfp_flags, pgoff_t index,
23 struct page *page, bio_end_io_t end_io)
24{
25 struct bio *bio;
26
27 bio = bio_alloc(gfp_flags, 1);
28 if (bio) {
29 struct swap_info_struct *sis;
30 swp_entry_t entry = { .val = index, };
31
32 sis = get_swap_info_struct(swp_type(entry));
33 bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
34 (PAGE_SIZE >> 9);
35 bio->bi_bdev = sis->bdev;
36 bio->bi_io_vec[0].bv_page = page;
37 bio->bi_io_vec[0].bv_len = PAGE_SIZE;
38 bio->bi_io_vec[0].bv_offset = 0;
39 bio->bi_vcnt = 1;
40 bio->bi_idx = 0;
41 bio->bi_size = PAGE_SIZE;
42 bio->bi_end_io = end_io;
43 }
44 return bio;
45}
46
47static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
48{
49 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
50 struct page *page = bio->bi_io_vec[0].bv_page;
51
52 if (bio->bi_size)
53 return 1;
54
55 if (!uptodate)
56 SetPageError(page);
57 end_page_writeback(page);
58 bio_put(bio);
59 return 0;
60}
61
62static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
63{
64 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
65 struct page *page = bio->bi_io_vec[0].bv_page;
66
67 if (bio->bi_size)
68 return 1;
69
70 if (!uptodate) {
71 SetPageError(page);
72 ClearPageUptodate(page);
73 } else {
74 SetPageUptodate(page);
75 }
76 unlock_page(page);
77 bio_put(bio);
78 return 0;
79}
80
81/*
82 * We may have stale swap cache pages in memory: notice
83 * them here and get rid of the unnecessary final write.
84 */
85int swap_writepage(struct page *page, struct writeback_control *wbc)
86{
87 struct bio *bio;
88 int ret = 0, rw = WRITE;
89
90 if (remove_exclusive_swap_page(page)) {
91 unlock_page(page);
92 goto out;
93 }
94 bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write);
95 if (bio == NULL) {
96 set_page_dirty(page);
97 unlock_page(page);
98 ret = -ENOMEM;
99 goto out;
100 }
101 if (wbc->sync_mode == WB_SYNC_ALL)
102 rw |= (1 << BIO_RW_SYNC);
103 inc_page_state(pswpout);
104 set_page_writeback(page);
105 unlock_page(page);
106 submit_bio(rw, bio);
107out:
108 return ret;
109}
110
111int swap_readpage(struct file *file, struct page *page)
112{
113 struct bio *bio;
114 int ret = 0;
115
116 BUG_ON(!PageLocked(page));
117 ClearPageUptodate(page);
118 bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read);
119 if (bio == NULL) {
120 unlock_page(page);
121 ret = -ENOMEM;
122 goto out;
123 }
124 inc_page_state(pswpin);
125 submit_bio(READ, bio);
126out:
127 return ret;
128}
129
130#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_PM_DISK)
131/*
132 * A scruffy utility function to read or write an arbitrary swap page
133 * and wait on the I/O. The caller must have a ref on the page.
134 *
135 * We use end_swap_bio_read() even for writes, because it happens to do what
136 * we want.
137 */
138int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
139{
140 struct bio *bio;
141 int ret = 0;
142
143 lock_page(page);
144
145 bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read);
146 if (bio == NULL) {
147 unlock_page(page);
148 ret = -ENOMEM;
149 goto out;
150 }
151
152 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
153 wait_on_page_locked(page);
154
155 if (!PageUptodate(page) || PageError(page))
156 ret = -EIO;
157out:
158 return ret;
159}
160#endif
diff --git a/mm/pdflush.c b/mm/pdflush.c
new file mode 100644
index 00000000000..38ce279cc8c
--- /dev/null
+++ b/mm/pdflush.c
@@ -0,0 +1,228 @@
1/*
2 * mm/pdflush.c - worker threads for writing back filesystem data
3 *
4 * Copyright (C) 2002, Linus Torvalds.
5 *
6 * 09Apr2002 akpm@zip.com.au
7 * Initial version
8 * 29Feb2004 kaos@sgi.com
9 * Move worker thread creation to kthread to avoid chewing
10 * up stack space with nested calls to kernel_thread.
11 */
12
13#include <linux/sched.h>
14#include <linux/list.h>
15#include <linux/signal.h>
16#include <linux/spinlock.h>
17#include <linux/gfp.h>
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/fs.h> // Needed by writeback.h
21#include <linux/writeback.h> // Prototypes pdflush_operation()
22#include <linux/kthread.h>
23
24
25/*
26 * Minimum and maximum number of pdflush instances
27 */
28#define MIN_PDFLUSH_THREADS 2 /* pdflush_init() starts this many; a worker does not exit at or below this count */
29#define MAX_PDFLUSH_THREADS 8 /* __pdflush() starts no further worker once this count is reached */
30
/* Forward declaration: __pdflush() calls this before its definition further down. */
31static void start_one_pdflush_thread(void);
32
33
34/*
35 * The pdflush threads are worker threads for writing back dirty data.
36 * Ideally, we'd like one thread per active disk spindle. But the disk
37 * topology is very hard to divine at this level. Instead, we take
38 * care in various places to prevent more than one pdflush thread from
39 * performing writeback against a single filesystem. pdflush threads
40 * have the PF_FLUSHER flag set in current->flags to aid in this.
41 */
42
43/*
44 * All the pdflush threads. Protected by pdflush_lock
 *
 * More precisely, the list holds the pdflush_work of every worker that is
 * currently idle: __pdflush() puts its own entry on the list before going
 * to sleep, and pdflush_operation() unlinks the entry of the worker it
 * hands a job to.
45 */
46static LIST_HEAD(pdflush_list);
47static DEFINE_SPINLOCK(pdflush_lock);
48
49/*
50 * The count of currently-running pdflush threads. Protected
51 * by pdflush_lock.
52 *
53 * Readable by sysctl, but not writable. Published to userspace at
54 * /proc/sys/vm/nr_pdflush_threads.
 *
 * Updates happen under pdflush_lock in __pdflush(); the thread-creation
 * check in __pdflush() deliberately reads it without the lock.
55 */
56int nr_pdflush_threads = 0;
57
58/*
59 * The time at which the pdflush thread pool last went empty
 *
 * Set to jiffies by pdflush_operation() when it takes the last idle worker
 * off pdflush_list; compared against jiffies in __pdflush() to decide
 * whether another worker should be started.
60 */
61static unsigned long last_empty_jifs;
62
63/*
64 * The pdflush thread.
65 *
66 * Thread pool management algorithm:
67 *
68 * - The minimum and maximum number of pdflush instances are bound
69 * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
70 *
71 * - If there have been no idle pdflush instances for 1 second, create
72 * a new one.
73 *
74 * - If the least-recently-went-to-sleep pdflush thread has been asleep
75 * for more than one second, terminate a thread.
76 */
77
78/*
79 * A structure for passing work to a pdflush thread. Also for passing
80 * state information between pdflush threads. Protected by pdflush_lock.
81 */
82struct pdflush_work {
83 struct task_struct *who; /* The thread */
84 void (*fn)(unsigned long); /* A callback function */
85 unsigned long arg0; /* An argument to the callback */
86 struct list_head list; /* On pdflush_list, when idle */
87 unsigned long when_i_went_to_sleep;
88};
89
90static int __pdflush(struct pdflush_work *my_work)
91{
92 current->flags |= PF_FLUSHER;
93 my_work->fn = NULL;
94 my_work->who = current;
95 INIT_LIST_HEAD(&my_work->list);
96
97 spin_lock_irq(&pdflush_lock);
98 nr_pdflush_threads++;
99 for ( ; ; ) {
100 struct pdflush_work *pdf;
101
102 set_current_state(TASK_INTERRUPTIBLE);
103 list_move(&my_work->list, &pdflush_list);
104 my_work->when_i_went_to_sleep = jiffies;
105 spin_unlock_irq(&pdflush_lock);
106
107 schedule();
108 if (try_to_freeze(PF_FREEZE)) {
109 spin_lock_irq(&pdflush_lock);
110 continue;
111 }
112
113 spin_lock_irq(&pdflush_lock);
114 if (!list_empty(&my_work->list)) {
115 printk("pdflush: bogus wakeup!\n");
116 my_work->fn = NULL;
117 continue;
118 }
119 if (my_work->fn == NULL) {
120 printk("pdflush: NULL work function\n");
121 continue;
122 }
123 spin_unlock_irq(&pdflush_lock);
124
125 (*my_work->fn)(my_work->arg0);
126
127 /*
128 * Thread creation: For how long have there been zero
129 * available threads?
130 */
131 if (jiffies - last_empty_jifs > 1 * HZ) {
132 /* unlocked list_empty() test is OK here */
133 if (list_empty(&pdflush_list)) {
134 /* unlocked test is OK here */
135 if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
136 start_one_pdflush_thread();
137 }
138 }
139
140 spin_lock_irq(&pdflush_lock);
141 my_work->fn = NULL;
142
143 /*
144 * Thread destruction: For how long has the sleepiest
145 * thread slept?
146 */
147 if (list_empty(&pdflush_list))
148 continue;
149 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
150 continue;
151 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
152 if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) {
153 /* Limit exit rate */
154 pdf->when_i_went_to_sleep = jiffies;
155 break; /* exeunt */
156 }
157 }
158 nr_pdflush_threads--;
159 spin_unlock_irq(&pdflush_lock);
160 return 0;
161}
162
163/*
164 * Of course, my_work wants to be just a local in __pdflush(). It is
165 * separated out in this manner to hopefully prevent the compiler from
166 * performing unfortunate optimisations against the auto variables. Because
167 * these are visible to other tasks and CPUs. (No problem has actually
168 * been observed. This is just paranoia).
169 */
170static int pdflush(void *dummy)
171{
172 struct pdflush_work my_work;
173
174 /*
175 * pdflush can spend a lot of time doing encryption via dm-crypt. We
176 * don't want to do that at keventd's priority.
177 */
178 set_user_nice(current, 0);
179 return __pdflush(&my_work);
180}
181
182/*
183 * Attempt to wake up a pdflush thread, and get it to do some work for you.
184 * Returns zero if it indeed managed to find a worker thread, and passed your
185 * payload to it.
186 */
187int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
188{
189 unsigned long flags;
190 int ret = 0;
191
192 if (fn == NULL)
193 BUG(); /* Hard to diagnose if it's deferred */
194
195 spin_lock_irqsave(&pdflush_lock, flags);
196 if (list_empty(&pdflush_list)) {
197 spin_unlock_irqrestore(&pdflush_lock, flags);
198 ret = -1;
199 } else {
200 struct pdflush_work *pdf;
201
202 pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
203 list_del_init(&pdf->list);
204 if (list_empty(&pdflush_list))
205 last_empty_jifs = jiffies;
206 pdf->fn = fn;
207 pdf->arg0 = arg0;
208 wake_up_process(pdf->who);
209 spin_unlock_irqrestore(&pdflush_lock, flags);
210 }
211 return ret;
212}
213
214static void start_one_pdflush_thread(void)
215{
216 kthread_run(pdflush, NULL, "pdflush");
217}
218
219static int __init pdflush_init(void)
220{
221 int i;
222
223 for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
224 start_one_pdflush_thread();
225 return 0;
226}
227
228module_init(pdflush_init);
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
new file mode 100644
index 00000000000..b4e76c25f95
--- /dev/null
+++ b/mm/prio_tree.c
@@ -0,0 +1,207 @@
1/*
2 * mm/prio_tree.c - priority search tree for mapping->i_mmap
3 *
4 * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
5 *
6 * This file is released under the GPL v2.
7 *
8 * Based on the radix priority search tree proposed by Edward M. McCreight
9 * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
10 *
11 * 02Feb2004 Initial version
12 */
13
14#include <linux/mm.h>
15#include <linux/prio_tree.h>
16
17/*
18 * See lib/prio_tree.c for details on the general radix priority search tree
19 * code.
20 */
21
22/*
23 * The following #defines are mirrored from lib/prio_tree.c. They're only used
24 * for debugging, and should be removed (along with the debugging code using
25 * them) when switching also VMAs to the regular prio_tree code.
26 */
27
/* Radix index of a vma: its vm_pgoff (presumably the first file page it maps). */
28#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
/* Length of the vma's address range, in pages. */
29#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
30/* avoid overflow */
/*
 * Heap index of a vma: vm_pgoff plus (size in pages - 1), i.e. the last
 * page of the range rather than one past it.
 */
31#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
32
33/*
34 * Radix priority search tree for address_space->i_mmap
35 *
36 * For each vma that maps a unique set of file pages i.e., unique [radix_index,
37 * heap_index] value, we have a corresponding priority search tree node. If
38 * multiple vmas have identical [radix_index, heap_index] value, then one of
39 * them is used as a tree node and others are stored in a vm_set list. The tree
40 * node points to the first vma (head) of the list using vm_set.head.
41 *
42 * prio_tree_root
43 * |
44 * A vm_set.head
45 * / \ /
46 * L R -> H-I-J-K-M-N-O-P-Q-S
47 * ^ ^ <-- vm_set.list -->
48 * tree nodes
49 *
50 * We need some way to identify whether a vma is a tree node, head of a vm_set
51 * list, or just a member of a vm_set list. We cannot use vm_flags to store
52 * such information. The reason is, in the above figure, it is possible that
53 * vm_flags' of R and H are covered by the different mmap_sems. When R is
54 * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
55 * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
56 * That's why some trick involving shared.vm_set.parent is used for identifying
57 * tree nodes and list head nodes.
58 *
59 * vma radix priority search tree node rules:
60 *
61 * vma->shared.vm_set.parent != NULL ==> a tree node
62 * vma->shared.vm_set.head != NULL ==> list of others mapping same range
63 * vma->shared.vm_set.head == NULL ==> no others map the same range
64 *
65 * vma->shared.vm_set.parent == NULL
66 * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
67 * vma->shared.vm_set.head == NULL ==> a list node
68 */
69
70/*
71 * Add a new vma known to map the same set of pages as the old vma:
72 * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
73 * Note that it just happens to work correctly on i_mmap_nonlinear too.
74 */
75void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
76{
77 /* Leave these BUG_ONs till prio_tree patch stabilizes */
78 BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
79 BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
80
81 vma->shared.vm_set.head = NULL;
82 vma->shared.vm_set.parent = NULL;
83
84 if (!old->shared.vm_set.parent)
85 list_add(&vma->shared.vm_set.list,
86 &old->shared.vm_set.list);
87 else if (old->shared.vm_set.head)
88 list_add_tail(&vma->shared.vm_set.list,
89 &old->shared.vm_set.head->shared.vm_set.list);
90 else {
91 INIT_LIST_HEAD(&vma->shared.vm_set.list);
92 vma->shared.vm_set.head = old;
93 old->shared.vm_set.head = vma;
94 }
95}
96
97void vma_prio_tree_insert(struct vm_area_struct *vma,
98 struct prio_tree_root *root)
99{
100 struct prio_tree_node *ptr;
101 struct vm_area_struct *old;
102
103 vma->shared.vm_set.head = NULL;
104
105 ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
106 if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
107 old = prio_tree_entry(ptr, struct vm_area_struct,
108 shared.prio_tree_node);
109 vma_prio_tree_add(vma, old);
110 }
111}
112
113void vma_prio_tree_remove(struct vm_area_struct *vma,
114 struct prio_tree_root *root)
115{
116 struct vm_area_struct *node, *head, *new_head;
117
118 if (!vma->shared.vm_set.head) {
119 if (!vma->shared.vm_set.parent)
120 list_del_init(&vma->shared.vm_set.list);
121 else
122 raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
123 } else {
124 /* Leave this BUG_ON till prio_tree patch stabilizes */
125 BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
126 if (vma->shared.vm_set.parent) {
127 head = vma->shared.vm_set.head;
128 if (!list_empty(&head->shared.vm_set.list)) {
129 new_head = list_entry(
130 head->shared.vm_set.list.next,
131 struct vm_area_struct,
132 shared.vm_set.list);
133 list_del_init(&head->shared.vm_set.list);
134 } else
135 new_head = NULL;
136
137 raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
138 &head->shared.prio_tree_node);
139 head->shared.vm_set.head = new_head;
140 if (new_head)
141 new_head->shared.vm_set.head = head;
142
143 } else {
144 node = vma->shared.vm_set.head;
145 if (!list_empty(&vma->shared.vm_set.list)) {
146 new_head = list_entry(
147 vma->shared.vm_set.list.next,
148 struct vm_area_struct,
149 shared.vm_set.list);
150 list_del_init(&vma->shared.vm_set.list);
151 node->shared.vm_set.head = new_head;
152 new_head->shared.vm_set.head = node;
153 } else
154 node->shared.vm_set.head = NULL;
155 }
156 }
157}
158
159/*
160 * Helper function to enumerate vmas that map a given file page or a set of
161 * contiguous file pages. The function returns vmas that at least map a single
162 * page in the given range of contiguous file pages.
163 */
164struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
165 struct prio_tree_iter *iter)
166{
167 struct prio_tree_node *ptr;
168 struct vm_area_struct *next;
169
170 if (!vma) {
171 /*
172 * First call is with NULL vma
173 */
174 ptr = prio_tree_next(iter);
175 if (ptr) {
176 next = prio_tree_entry(ptr, struct vm_area_struct,
177 shared.prio_tree_node);
178 prefetch(next->shared.vm_set.head);
179 return next;
180 } else
181 return NULL;
182 }
183
184 if (vma->shared.vm_set.parent) {
185 if (vma->shared.vm_set.head) {
186 next = vma->shared.vm_set.head;
187 prefetch(next->shared.vm_set.list.next);
188 return next;
189 }
190 } else {
191 next = list_entry(vma->shared.vm_set.list.next,
192 struct vm_area_struct, shared.vm_set.list);
193 if (!next->shared.vm_set.head) {
194 prefetch(next->shared.vm_set.list.next);
195 return next;
196 }
197 }
198
199 ptr = prio_tree_next(iter);
200 if (ptr) {
201 next = prio_tree_entry(ptr, struct vm_area_struct,
202 shared.prio_tree_node);
203 prefetch(next->shared.vm_set.head);
204 return next;
205 } else
206 return NULL;
207}
diff --git a/mm/readahead.c b/mm/readahead.c
new file mode 100644
index 00000000000..b840e7c6ea7
--- /dev/null
+++ b/mm/readahead.c
@@ -0,0 +1,557 @@
1/*
2 * mm/readahead.c - address_space-level file readahead.
3 *
4 * Copyright (C) 2002, Linus Torvalds
5 *
6 * 09Apr2002 akpm@zip.com.au
7 * Initial version.
8 */
9
10#include <linux/kernel.h>
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/module.h>
14#include <linux/blkdev.h>
15#include <linux/backing-dev.h>
16#include <linux/pagevec.h>
17
/*
 * Unplug callback that does nothing; installed as the unplug_io_fn of
 * default_backing_dev_info.
 */
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}
EXPORT_SYMBOL(default_unplug_io_fn);
22
23struct backing_dev_info default_backing_dev_info = {
24 .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
25 .state = 0,
26 .capabilities = BDI_CAP_MAP_COPY,
27 .unplug_io_fn = default_unplug_io_fn,
28};
29EXPORT_SYMBOL_GPL(default_backing_dev_info);
30
31/*
32 * Initialise a struct file's readahead state. Assumes that the caller has
33 * memset *ra to zero.
34 */
35void
36file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
37{
38 ra->ra_pages = mapping->backing_dev_info->ra_pages;
39 ra->prev_page = -1;
40}
41
42/*
43 * Return max readahead size for this inode in number-of-pages.
44 */
45static inline unsigned long get_max_readahead(struct file_ra_state *ra)
46{
47 return ra->ra_pages;
48}
49
50static inline unsigned long get_min_readahead(struct file_ra_state *ra)
51{
52 return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
53}
54
55static inline void ra_off(struct file_ra_state *ra)
56{
57 ra->start = 0;
58 ra->flags = 0;
59 ra->size = 0;
60 ra->ahead_start = 0;
61 ra->ahead_size = 0;
62 return;
63}
64
/*
 * Pick the initial window size, in pages, for a request of @size pages
 * under a maximum of @max pages. @size is first rounded up to a power of
 * two, then:
 *
 *  - small  (rounded <= max / 64): the square of the rounded size;
 *  - medium (rounded <= max / 4):  max / 4;
 *  - large  (anything else):       max.
 *
 * Example for a 32-page maximum (128k with 4k pages): requests of 1-8
 * pages start with an 8-page window, larger ones with the full 32 pages.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long rounded = roundup_pow_of_two(size);

	if (rounded <= max / 64)
		return rounded * rounded;
	if (rounded <= max / 4)
		return max / 4;
	return max;
}
83
84/*
85 * Set the new window size, this is called only when I/O is to be submitted,
86 * not for each call to readahead. If a cache miss occured, reduce next I/O
87 * size, else increase depending on how close to max we are.
88 */
89static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
90{
91 unsigned long max = get_max_readahead(ra);
92 unsigned long min = get_min_readahead(ra);
93 unsigned long cur = ra->size;
94 unsigned long newsize;
95
96 if (ra->flags & RA_FLAG_MISS) {
97 ra->flags &= ~RA_FLAG_MISS;
98 newsize = max((cur - 2), min);
99 } else if (cur < max / 16) {
100 newsize = 4 * cur;
101 } else {
102 newsize = 2 * cur;
103 }
104 return min(newsize, max);
105}
106
/* The struct page at the tail ((head)->prev) of a list linked through page->lru. */
107#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
108
109/**
110 * read_cache_pages - populate an address space with some pages, and
111 * start reads against them.
112 * @mapping: the address_space
113 * @pages: The address of a list_head which contains the target pages. These
114 * pages have their ->index populated and are otherwise uninitialised.
115 * @filler: callback routine for filling a single page.
116 * @data: private data for the callback routine.
117 *
118 * Hides the details of the LRU cache etc from the filesystems.
119 */
120int read_cache_pages(struct address_space *mapping, struct list_head *pages,
121 int (*filler)(void *, struct page *), void *data)
122{
123 struct page *page;
124 struct pagevec lru_pvec;
125 int ret = 0;
126
127 pagevec_init(&lru_pvec, 0);
128
129 while (!list_empty(pages)) {
130 page = list_to_page(pages);
131 list_del(&page->lru);
132 if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
133 page_cache_release(page);
134 continue;
135 }
136 ret = filler(data, page);
137 if (!pagevec_add(&lru_pvec, page))
138 __pagevec_lru_add(&lru_pvec);
139 if (ret) {
140 while (!list_empty(pages)) {
141 struct page *victim;
142
143 victim = list_to_page(pages);
144 list_del(&victim->lru);
145 page_cache_release(victim);
146 }
147 break;
148 }
149 }
150 pagevec_lru_add(&lru_pvec);
151 return ret;
152}
153
154EXPORT_SYMBOL(read_cache_pages);
155
156static int read_pages(struct address_space *mapping, struct file *filp,
157 struct list_head *pages, unsigned nr_pages)
158{
159 unsigned page_idx;
160 struct pagevec lru_pvec;
161 int ret = 0;
162
163 if (mapping->a_ops->readpages) {
164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
165 goto out;
166 }
167
168 pagevec_init(&lru_pvec, 0);
169 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
170 struct page *page = list_to_page(pages);
171 list_del(&page->lru);
172 if (!add_to_page_cache(page, mapping,
173 page->index, GFP_KERNEL)) {
174 mapping->a_ops->readpage(filp, page);
175 if (!pagevec_add(&lru_pvec, page))
176 __pagevec_lru_add(&lru_pvec);
177 } else {
178 page_cache_release(page);
179 }
180 }
181 pagevec_lru_add(&lru_pvec);
182out:
183 return ret;
184}
185
186/*
187 * Readahead design.
188 *
189 * The fields in struct file_ra_state represent the most-recently-executed
190 * readahead attempt:
191 *
192 * start: Page index at which we started the readahead
193 * size: Number of pages in that read
194 * Together, these form the "current window".
195 * Together, start and size represent the `readahead window'.
196 * prev_page: The page which the readahead algorithm most-recently inspected.
197 * It is mainly used to detect sequential file reading.
198 * If page_cache_readahead sees that it is again being called for
199 * a page which it just looked at, it can return immediately without
200 * making any state changes.
201 * ahead_start,
202 * ahead_size: Together, these form the "ahead window".
203 * ra_pages: The externally controlled max readahead for this fd.
204 *
205 * When readahead is in the off state (size == 0), readahead is disabled.
206 * In this state, prev_page is used to detect the resumption of sequential I/O.
207 *
208 * The readahead code manages two windows - the "current" and the "ahead"
209 * windows. The intent is that while the application is walking the pages
210 * in the current window, I/O is underway on the ahead window. When the
211 * current window is fully traversed, it is replaced by the ahead window
212 * and the ahead window is invalidated. When this copying happens, the
213 * new current window's pages are probably still locked. So
214 * we submit a new batch of I/O immediately, creating a new ahead window.
215 *
216 * So:
217 *
218 * ----|----------------|----------------|-----
219 * ^start ^start+size
220 * ^ahead_start ^ahead_start+ahead_size
221 *
222 * ^ When this page is read, we submit I/O for the
223 * ahead window.
224 *
225 * A `readahead hit' occurs when a read request is made against a page which is
226 * the next sequential page. Ahead window calculations are done only when it
227 * is time to submit a new IO. The code ramps up the size aggressively at first,
228 * but slows down as it approaches max_readahead.
229 *
230 * Any seek/random IO will result in readahead being turned off. It will resume
231 * at the first sequential access.
232 *
233 * There is a special-case: if the first page which the application tries to
234 * read happens to be the first page of the file, it is assumed that a linear
235 * read is about to happen and the window is immediately set to the initial size
236 * based on I/O request size and the max_readahead.
237 *
238 * This function is to be called for every read request, rather than when
239 * it is time to perform readahead. It is called only once for the entire I/O
240 * regardless of size unless readahead is unable to start enough I/O to satisfy
241 * the request (I/O request > max_readahead).
242 */
243
244/*
245 * do_page_cache_readahead actually reads a chunk of disk. It allocates all
246 * the pages first, then submits them all for I/O. This avoids the very bad
247 * behaviour which would occur if page allocations are causing VM writeback.
248 * We really don't want to intermingle reads and writes like that.
249 *
250 * Returns the number of pages requested, or the maximum amount of I/O allowed.
251 *
252 * do_page_cache_readahead() returns -1 if it encountered request queue
253 * congestion.
254 */
255static int
256__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
257 unsigned long offset, unsigned long nr_to_read)
258{
259 struct inode *inode = mapping->host;
260 struct page *page;
261 unsigned long end_index; /* The last page we want to read */
262 LIST_HEAD(page_pool);
263 int page_idx;
264 int ret = 0;
265 loff_t isize = i_size_read(inode);
266
267 if (isize == 0)
268 goto out;
269
270 end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
271
272 /*
273 * Preallocate as many pages as we will need.
274 */
275 read_lock_irq(&mapping->tree_lock);
276 for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
277 unsigned long page_offset = offset + page_idx;
278
279 if (page_offset > end_index)
280 break;
281
282 page = radix_tree_lookup(&mapping->page_tree, page_offset);
283 if (page)
284 continue;
285
286 read_unlock_irq(&mapping->tree_lock);
287 page = page_cache_alloc_cold(mapping);
288 read_lock_irq(&mapping->tree_lock);
289 if (!page)
290 break;
291 page->index = page_offset;
292 list_add(&page->lru, &page_pool);
293 ret++;
294 }
295 read_unlock_irq(&mapping->tree_lock);
296
297 /*
298 * Now start the IO. We ignore I/O errors - if the page is not
299 * uptodate then the caller will launch readpage again, and
300 * will then handle the error.
301 */
302 if (ret)
303 read_pages(mapping, filp, &page_pool, ret);
304 BUG_ON(!list_empty(&page_pool));
305out:
306 return ret;
307}
308
309/*
310 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
311 * memory at once.
312 */
313int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
314 unsigned long offset, unsigned long nr_to_read)
315{
316 int ret = 0;
317
318 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
319 return -EINVAL;
320
321 while (nr_to_read) {
322 int err;
323
324 unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
325
326 if (this_chunk > nr_to_read)
327 this_chunk = nr_to_read;
328 err = __do_page_cache_readahead(mapping, filp,
329 offset, this_chunk);
330 if (err < 0) {
331 ret = err;
332 break;
333 }
334 ret += err;
335 offset += this_chunk;
336 nr_to_read -= this_chunk;
337 }
338 return ret;
339}
340
341/*
342 * Check how effective readahead is being. If the amount of started IO is
343 * less than expected then the file is partly or fully in pagecache and
344 * readahead isn't helping.
345 *
346 */
347static inline int check_ra_success(struct file_ra_state *ra,
348 unsigned long nr_to_read, unsigned long actual)
349{
350 if (actual == 0) {
351 ra->cache_hit += nr_to_read;
352 if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
353 ra_off(ra);
354 ra->flags |= RA_FLAG_INCACHE;
355 return 0;
356 }
357 } else {
358 ra->cache_hit=0;
359 }
360 return 1;
361}
362
363/*
364 * This version skips the IO if the queue is read-congested, and will tell the
365 * block layer to abandon the readahead if request allocation would block.
366 *
367 * force_page_cache_readahead() will ignore queue congestion and will block on
368 * request queues.
369 */
370int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
371 unsigned long offset, unsigned long nr_to_read)
372{
373 if (bdi_read_congested(mapping->backing_dev_info))
374 return -1;
375
376 return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
377}
378
379/*
380 * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
381 * is set wait till the read completes. Otherwise attempt to read without
382 * blocking.
383 * Returns 1 meaning 'success' if read is succesfull without switching off
384 * readhaead mode. Otherwise return failure.
385 */
386static int
387blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
388 unsigned long offset, unsigned long nr_to_read,
389 struct file_ra_state *ra, int block)
390{
391 int actual;
392
393 if (!block && bdi_read_congested(mapping->backing_dev_info))
394 return 0;
395
396 actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
397
398 return check_ra_success(ra, nr_to_read, actual);
399}
400
401static int make_ahead_window(struct address_space *mapping, struct file *filp,
402 struct file_ra_state *ra, int force)
403{
404 int block, ret;
405
406 ra->ahead_size = get_next_ra_size(ra);
407 ra->ahead_start = ra->start + ra->size;
408
409 block = force || (ra->prev_page >= ra->ahead_start);
410 ret = blockable_page_cache_readahead(mapping, filp,
411 ra->ahead_start, ra->ahead_size, ra, block);
412
413 if (!ret && !force) {
414 /* A read failure in blocking mode, implies pages are
415 * all cached. So we can safely assume we have taken
416 * care of all the pages requested in this call.
417 * A read failure in non-blocking mode, implies we are
418 * reading more pages than requested in this call. So
419 * we safely assume we have taken care of all the pages
420 * requested in this call.
421 *
422 * Just reset the ahead window in case we failed due to
423 * congestion. The ahead window will any way be closed
424 * in case we failed due to excessive page cache hits.
425 */
426 ra->ahead_start = 0;
427 ra->ahead_size = 0;
428 }
429
430 return ret;
431}
432
433/*
434 * page_cache_readahead is the main function. If performs the adaptive
435 * readahead window size management and submits the readahead I/O.
436 */
437unsigned long
438page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
439 struct file *filp, unsigned long offset,
440 unsigned long req_size)
441{
442 unsigned long max, newsize;
443 int sequential;
444
445 /*
446 * We avoid doing extra work and bogusly perturbing the readahead
447 * window expansion logic.
448 */
449 if (offset == ra->prev_page && --req_size)
450 ++offset;
451
452 /* Note that prev_page == -1 if it is a first read */
453 sequential = (offset == ra->prev_page + 1);
454 ra->prev_page = offset;
455
456 max = get_max_readahead(ra);
457 newsize = min(req_size, max);
458
459 /* No readahead or sub-page sized read or file already in cache */
460 if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
461 goto out;
462
463 ra->prev_page += newsize - 1;
464
465 /*
466 * Special case - first read at start of file. We'll assume it's
467 * a whole-file read and grow the window fast. Or detect first
468 * sequential access
469 */
470 if (sequential && ra->size == 0) {
471 ra->size = get_init_ra_size(newsize, max);
472 ra->start = offset;
473 if (!blockable_page_cache_readahead(mapping, filp, offset,
474 ra->size, ra, 1))
475 goto out;
476
477 /*
478 * If the request size is larger than our max readahead, we
479 * at least want to be sure that we get 2 IOs in flight and
480 * we know that we will definitly need the new I/O.
481 * once we do this, subsequent calls should be able to overlap
482 * IOs,* thus preventing stalls. so issue the ahead window
483 * immediately.
484 */
485 if (req_size >= max)
486 make_ahead_window(mapping, filp, ra, 1);
487
488 goto out;
489 }
490
491 /*
492 * Now handle the random case:
493 * partial page reads and first access were handled above,
494 * so this must be the next page otherwise it is random
495 */
496 if (!sequential) {
497 ra_off(ra);
498 blockable_page_cache_readahead(mapping, filp, offset,
499 newsize, ra, 1);
500 goto out;
501 }
502
503 /*
504 * If we get here we are doing sequential IO and this was not the first
505 * occurence (ie we have an existing window)
506 */
507
508 if (ra->ahead_start == 0) { /* no ahead window yet */
509 if (!make_ahead_window(mapping, filp, ra, 0))
510 goto out;
511 }
512 /*
513 * Already have an ahead window, check if we crossed into it.
514 * If so, shift windows and issue a new ahead window.
515 * Only return the #pages that are in the current window, so that
516 * we get called back on the first page of the ahead window which
517 * will allow us to submit more IO.
518 */
519 if (ra->prev_page >= ra->ahead_start) {
520 ra->start = ra->ahead_start;
521 ra->size = ra->ahead_size;
522 make_ahead_window(mapping, filp, ra, 0);
523 }
524
525out:
526 return ra->prev_page + 1;
527}
528
529/*
530 * handle_ra_miss() is called when it is known that a page which should have
531 * been present in the pagecache (we just did some readahead there) was in fact
532 * not found. This will happen if it was evicted by the VM (readahead
533 * thrashing)
534 *
535 * Turn on the cache miss flag in the RA struct, this will cause the RA code
536 * to reduce the RA size on the next read.
537 */
538void handle_ra_miss(struct address_space *mapping,
539 struct file_ra_state *ra, pgoff_t offset)
540{
541 ra->flags |= RA_FLAG_MISS;
542 ra->flags &= ~RA_FLAG_INCACHE;
543}
544
/*
 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
 * sensible upper limit: @nr capped at half of the inactive plus free page
 * counts reported for the current node.
 */
unsigned long max_sane_readahead(unsigned long nr)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long free;
	unsigned long ceiling;

	__get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id()));
	ceiling = (inactive + free) / 2;

	return min(nr, ceiling);
}
diff --git a/mm/rmap.c b/mm/rmap.c
new file mode 100644
index 00000000000..884d6d1928b
--- /dev/null
+++ b/mm/rmap.c
@@ -0,0 +1,862 @@
1/*
2 * mm/rmap.c - physical to virtual reverse mappings
3 *
4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5 * Released under the General Public License (GPL).
6 *
7 * Simple, low overhead reverse mapping scheme.
8 * Please try to keep this thing as modular as possible.
9 *
10 * Provides methods for unmapping each kind of mapped page:
11 * the anon methods track anonymous pages, and
12 * the file methods track pages belonging to an inode.
13 *
14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
18 */
19
20/*
21 * Lock ordering in mm:
22 *
23 * inode->i_sem (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem
25 *
26 * When a page fault occurs in writing from user to file, down_read
27 * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within
28 * down_read of mmap_sem; i_sem and down_write of mmap_sem are never
29 * taken together; in truncation, i_sem is taken outermost.
30 *
31 * mm->mmap_sem
32 * page->flags PG_locked (lock_page)
33 * mapping->i_mmap_lock
34 * anon_vma->lock
35 * mm->page_table_lock
36 * zone->lru_lock (in mark_page_accessed)
37 * swap_list_lock (in swap_free etc's swap_info_get)
38 * mmlist_lock (in mmput, drain_mmlist and others)
39 * swap_device_lock (in swap_duplicate, swap_info_get)
40 * mapping->private_lock (in __set_page_dirty_buffers)
41 * inode_lock (in set_page_dirty's __mark_inode_dirty)
42 * sb_lock (within inode_lock in fs/fs-writeback.c)
43 * mapping->tree_lock (widely used, in set_page_dirty,
44 * in arch-dependent flush_dcache_mmap_lock,
45 * within inode_lock in __sync_single_inode)
46 */
47
48#include <linux/mm.h>
49#include <linux/pagemap.h>
50#include <linux/swap.h>
51#include <linux/swapops.h>
52#include <linux/slab.h>
53#include <linux/init.h>
54#include <linux/rmap.h>
55#include <linux/rcupdate.h>
56
57#include <asm/tlbflush.h>
58
59//#define RMAP_DEBUG /* can be enabled only for debugging */
60
61kmem_cache_t *anon_vma_cachep;
62
/*
 * Debug-only sanity check (compiled in when RMAP_DEBUG is defined):
 * walk the vma list of find_vma's anon_vma and BUG unless find_vma is
 * on it.  Also BUG if the list is implausibly long (> 100000 entries).
 * Without RMAP_DEBUG this is an empty function.
 */
static inline void validate_anon_vma(struct vm_area_struct *find_vma)
{
#ifdef RMAP_DEBUG
	struct anon_vma *av = find_vma->anon_vma;
	struct vm_area_struct *cur;
	unsigned int nr_seen = 0;
	int present = 0;

	list_for_each_entry(cur, &av->head, anon_vma_node) {
		nr_seen++;
		BUG_ON(nr_seen > 100000);
		if (cur == find_vma)
			present = 1;
	}
	BUG_ON(!present);
#endif
}
80
81/* This must be called under the mmap_sem. */
82int anon_vma_prepare(struct vm_area_struct *vma)
83{
84 struct anon_vma *anon_vma = vma->anon_vma;
85
86 might_sleep();
87 if (unlikely(!anon_vma)) {
88 struct mm_struct *mm = vma->vm_mm;
89 struct anon_vma *allocated, *locked;
90
91 anon_vma = find_mergeable_anon_vma(vma);
92 if (anon_vma) {
93 allocated = NULL;
94 locked = anon_vma;
95 spin_lock(&locked->lock);
96 } else {
97 anon_vma = anon_vma_alloc();
98 if (unlikely(!anon_vma))
99 return -ENOMEM;
100 allocated = anon_vma;
101 locked = NULL;
102 }
103
104 /* page_table_lock to protect against threads */
105 spin_lock(&mm->page_table_lock);
106 if (likely(!vma->anon_vma)) {
107 vma->anon_vma = anon_vma;
108 list_add(&vma->anon_vma_node, &anon_vma->head);
109 allocated = NULL;
110 }
111 spin_unlock(&mm->page_table_lock);
112
113 if (locked)
114 spin_unlock(&locked->lock);
115 if (unlikely(allocated))
116 anon_vma_free(allocated);
117 }
118 return 0;
119}
120
121void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
122{
123 BUG_ON(vma->anon_vma != next->anon_vma);
124 list_del(&next->anon_vma_node);
125}
126
127void __anon_vma_link(struct vm_area_struct *vma)
128{
129 struct anon_vma *anon_vma = vma->anon_vma;
130
131 if (anon_vma) {
132 list_add(&vma->anon_vma_node, &anon_vma->head);
133 validate_anon_vma(vma);
134 }
135}
136
137void anon_vma_link(struct vm_area_struct *vma)
138{
139 struct anon_vma *anon_vma = vma->anon_vma;
140
141 if (anon_vma) {
142 spin_lock(&anon_vma->lock);
143 list_add(&vma->anon_vma_node, &anon_vma->head);
144 validate_anon_vma(vma);
145 spin_unlock(&anon_vma->lock);
146 }
147}
148
149void anon_vma_unlink(struct vm_area_struct *vma)
150{
151 struct anon_vma *anon_vma = vma->anon_vma;
152 int empty;
153
154 if (!anon_vma)
155 return;
156
157 spin_lock(&anon_vma->lock);
158 validate_anon_vma(vma);
159 list_del(&vma->anon_vma_node);
160
161 /* We must garbage collect the anon_vma if it's empty */
162 empty = list_empty(&anon_vma->head);
163 spin_unlock(&anon_vma->lock);
164
165 if (empty)
166 anon_vma_free(anon_vma);
167}
168
169static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
170{
171 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
172 SLAB_CTOR_CONSTRUCTOR) {
173 struct anon_vma *anon_vma = data;
174
175 spin_lock_init(&anon_vma->lock);
176 INIT_LIST_HEAD(&anon_vma->head);
177 }
178}
179
180void __init anon_vma_init(void)
181{
182 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
183 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
184}
185
186/*
187 * Getting a lock on a stable anon_vma from a page off the LRU is
188 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
189 */
190static struct anon_vma *page_lock_anon_vma(struct page *page)
191{
192 struct anon_vma *anon_vma = NULL;
193 unsigned long anon_mapping;
194
195 rcu_read_lock();
196 anon_mapping = (unsigned long) page->mapping;
197 if (!(anon_mapping & PAGE_MAPPING_ANON))
198 goto out;
199 if (!page_mapped(page))
200 goto out;
201
202 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
203 spin_lock(&anon_vma->lock);
204out:
205 rcu_read_unlock();
206 return anon_vma;
207}
208
209/*
210 * At what user virtual address is page expected in vma?
211 */
212static inline unsigned long
213vma_address(struct page *page, struct vm_area_struct *vma)
214{
215 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
216 unsigned long address;
217
218 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
219 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
220 /* page should be within any vma from prio_tree_next */
221 BUG_ON(!PageAnon(page));
222 return -EFAULT;
223 }
224 return address;
225}
226
227/*
228 * At what user virtual address is page expected in vma? checking that the
229 * page matches the vma: currently only used by unuse_process, on anon pages.
230 */
231unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
232{
233 if (PageAnon(page)) {
234 if ((void *)vma->anon_vma !=
235 (void *)page->mapping - PAGE_MAPPING_ANON)
236 return -EFAULT;
237 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
238 if (vma->vm_file->f_mapping != page->mapping)
239 return -EFAULT;
240 } else
241 return -EFAULT;
242 return vma_address(page, vma);
243}
244
245/*
246 * Subfunctions of page_referenced: page_referenced_one called
247 * repeatedly from either page_referenced_anon or page_referenced_file.
248 */
249static int page_referenced_one(struct page *page,
250 struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token)
251{
252 struct mm_struct *mm = vma->vm_mm;
253 unsigned long address;
254 pgd_t *pgd;
255 pud_t *pud;
256 pmd_t *pmd;
257 pte_t *pte;
258 int referenced = 0;
259
260 if (!get_mm_counter(mm, rss))
261 goto out;
262 address = vma_address(page, vma);
263 if (address == -EFAULT)
264 goto out;
265
266 spin_lock(&mm->page_table_lock);
267
268 pgd = pgd_offset(mm, address);
269 if (!pgd_present(*pgd))
270 goto out_unlock;
271
272 pud = pud_offset(pgd, address);
273 if (!pud_present(*pud))
274 goto out_unlock;
275
276 pmd = pmd_offset(pud, address);
277 if (!pmd_present(*pmd))
278 goto out_unlock;
279
280 pte = pte_offset_map(pmd, address);
281 if (!pte_present(*pte))
282 goto out_unmap;
283
284 if (page_to_pfn(page) != pte_pfn(*pte))
285 goto out_unmap;
286
287 if (ptep_clear_flush_young(vma, address, pte))
288 referenced++;
289
290 if (mm != current->mm && !ignore_token && has_swap_token(mm))
291 referenced++;
292
293 (*mapcount)--;
294
295out_unmap:
296 pte_unmap(pte);
297out_unlock:
298 spin_unlock(&mm->page_table_lock);
299out:
300 return referenced;
301}
302
303static int page_referenced_anon(struct page *page, int ignore_token)
304{
305 unsigned int mapcount;
306 struct anon_vma *anon_vma;
307 struct vm_area_struct *vma;
308 int referenced = 0;
309
310 anon_vma = page_lock_anon_vma(page);
311 if (!anon_vma)
312 return referenced;
313
314 mapcount = page_mapcount(page);
315 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
316 referenced += page_referenced_one(page, vma, &mapcount,
317 ignore_token);
318 if (!mapcount)
319 break;
320 }
321 spin_unlock(&anon_vma->lock);
322 return referenced;
323}
324
325/**
326 * page_referenced_file - referenced check for object-based rmap
327 * @page: the page we're checking references on.
328 *
329 * For an object-based mapped page, find all the places it is mapped and
330 * check/clear the referenced flag. This is done by following the page->mapping
331 * pointer, then walking the chain of vmas it holds. It returns the number
332 * of references it found.
333 *
334 * This function is only called from page_referenced for object-based pages.
335 */
336static int page_referenced_file(struct page *page, int ignore_token)
337{
338 unsigned int mapcount;
339 struct address_space *mapping = page->mapping;
340 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
341 struct vm_area_struct *vma;
342 struct prio_tree_iter iter;
343 int referenced = 0;
344
345 /*
346 * The caller's checks on page->mapping and !PageAnon have made
347 * sure that this is a file page: the check for page->mapping
348 * excludes the case just before it gets set on an anon page.
349 */
350 BUG_ON(PageAnon(page));
351
352 /*
353 * The page lock not only makes sure that page->mapping cannot
354 * suddenly be NULLified by truncation, it makes sure that the
355 * structure at mapping cannot be freed and reused yet,
356 * so we can safely take mapping->i_mmap_lock.
357 */
358 BUG_ON(!PageLocked(page));
359
360 spin_lock(&mapping->i_mmap_lock);
361
362 /*
363 * i_mmap_lock does not stabilize mapcount at all, but mapcount
364 * is more likely to be accurate if we note it after spinning.
365 */
366 mapcount = page_mapcount(page);
367
368 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
369 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
370 == (VM_LOCKED|VM_MAYSHARE)) {
371 referenced++;
372 break;
373 }
374 referenced += page_referenced_one(page, vma, &mapcount,
375 ignore_token);
376 if (!mapcount)
377 break;
378 }
379
380 spin_unlock(&mapping->i_mmap_lock);
381 return referenced;
382}
383
384/**
385 * page_referenced - test if the page was referenced
386 * @page: the page to test
387 * @is_locked: caller holds lock on the page
388 *
389 * Quick test_and_clear_referenced for all mappings to a page,
390 * returns the number of ptes which referenced the page.
391 */
392int page_referenced(struct page *page, int is_locked, int ignore_token)
393{
394 int referenced = 0;
395
396 if (!swap_token_default_timeout)
397 ignore_token = 1;
398
399 if (page_test_and_clear_young(page))
400 referenced++;
401
402 if (TestClearPageReferenced(page))
403 referenced++;
404
405 if (page_mapped(page) && page->mapping) {
406 if (PageAnon(page))
407 referenced += page_referenced_anon(page, ignore_token);
408 else if (is_locked)
409 referenced += page_referenced_file(page, ignore_token);
410 else if (TestSetPageLocked(page))
411 referenced++;
412 else {
413 if (page->mapping)
414 referenced += page_referenced_file(page,
415 ignore_token);
416 unlock_page(page);
417 }
418 }
419 return referenced;
420}
421
422/**
423 * page_add_anon_rmap - add pte mapping to an anonymous page
424 * @page: the page to add the mapping to
425 * @vma: the vm area in which the mapping is added
426 * @address: the user virtual address mapped
427 *
428 * The caller needs to hold the mm->page_table_lock.
429 */
430void page_add_anon_rmap(struct page *page,
431 struct vm_area_struct *vma, unsigned long address)
432{
433 struct anon_vma *anon_vma = vma->anon_vma;
434 pgoff_t index;
435
436 BUG_ON(PageReserved(page));
437 BUG_ON(!anon_vma);
438
439 inc_mm_counter(vma->vm_mm, anon_rss);
440
441 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
442 index = (address - vma->vm_start) >> PAGE_SHIFT;
443 index += vma->vm_pgoff;
444 index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
445
446 if (atomic_inc_and_test(&page->_mapcount)) {
447 page->index = index;
448 page->mapping = (struct address_space *) anon_vma;
449 inc_page_state(nr_mapped);
450 }
451 /* else checking page index and mapping is racy */
452}
453
454/**
455 * page_add_file_rmap - add pte mapping to a file page
456 * @page: the page to add the mapping to
457 *
458 * The caller needs to hold the mm->page_table_lock.
459 */
460void page_add_file_rmap(struct page *page)
461{
462 BUG_ON(PageAnon(page));
463 if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
464 return;
465
466 if (atomic_inc_and_test(&page->_mapcount))
467 inc_page_state(nr_mapped);
468}
469
470/**
471 * page_remove_rmap - take down pte mapping from a page
472 * @page: page to remove mapping from
473 *
474 * Caller needs to hold the mm->page_table_lock.
475 */
476void page_remove_rmap(struct page *page)
477{
478 BUG_ON(PageReserved(page));
479
480 if (atomic_add_negative(-1, &page->_mapcount)) {
481 BUG_ON(page_mapcount(page) < 0);
482 /*
483 * It would be tidy to reset the PageAnon mapping here,
484 * but that might overwrite a racing page_add_anon_rmap
485 * which increments mapcount after us but sets mapping
486 * before us: so leave the reset to free_hot_cold_page,
487 * and remember that it's only reliable while mapped.
488 * Leaving it set also helps swapoff to reinstate ptes
489 * faster for those pages still in swapcache.
490 */
491 if (page_test_and_clear_dirty(page))
492 set_page_dirty(page);
493 dec_page_state(nr_mapped);
494 }
495}
496
497/*
498 * Subfunctions of try_to_unmap: try_to_unmap_one called
499 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
500 */
501static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
502{
503 struct mm_struct *mm = vma->vm_mm;
504 unsigned long address;
505 pgd_t *pgd;
506 pud_t *pud;
507 pmd_t *pmd;
508 pte_t *pte;
509 pte_t pteval;
510 int ret = SWAP_AGAIN;
511
512 if (!get_mm_counter(mm, rss))
513 goto out;
514 address = vma_address(page, vma);
515 if (address == -EFAULT)
516 goto out;
517
518 /*
519 * We need the page_table_lock to protect us from page faults,
520 * munmap, fork, etc...
521 */
522 spin_lock(&mm->page_table_lock);
523
524 pgd = pgd_offset(mm, address);
525 if (!pgd_present(*pgd))
526 goto out_unlock;
527
528 pud = pud_offset(pgd, address);
529 if (!pud_present(*pud))
530 goto out_unlock;
531
532 pmd = pmd_offset(pud, address);
533 if (!pmd_present(*pmd))
534 goto out_unlock;
535
536 pte = pte_offset_map(pmd, address);
537 if (!pte_present(*pte))
538 goto out_unmap;
539
540 if (page_to_pfn(page) != pte_pfn(*pte))
541 goto out_unmap;
542
543 /*
544 * If the page is mlock()d, we cannot swap it out.
545 * If it's recently referenced (perhaps page_referenced
546 * skipped over this mm) then we should reactivate it.
547 */
548 if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
549 ptep_clear_flush_young(vma, address, pte)) {
550 ret = SWAP_FAIL;
551 goto out_unmap;
552 }
553
554 /*
555 * Don't pull an anonymous page out from under get_user_pages.
556 * GUP carefully breaks COW and raises page count (while holding
557 * page_table_lock, as we have here) to make sure that the page
558 * cannot be freed. If we unmap that page here, a user write
559 * access to the virtual address will bring back the page, but
560 * its raised count will (ironically) be taken to mean it's not
561 * an exclusive swap page, do_wp_page will replace it by a copy
562 * page, and the user never get to see the data GUP was holding
563 * the original page for.
564 *
565 * This test is also useful for when swapoff (unuse_process) has
566 * to drop page lock: its reference to the page stops existing
567 * ptes from being unmapped, so swapoff can make progress.
568 */
569 if (PageSwapCache(page) &&
570 page_count(page) != page_mapcount(page) + 2) {
571 ret = SWAP_FAIL;
572 goto out_unmap;
573 }
574
575 /* Nuke the page table entry. */
576 flush_cache_page(vma, address, page_to_pfn(page));
577 pteval = ptep_clear_flush(vma, address, pte);
578
579 /* Move the dirty bit to the physical page now the pte is gone. */
580 if (pte_dirty(pteval))
581 set_page_dirty(page);
582
583 if (PageAnon(page)) {
584 swp_entry_t entry = { .val = page->private };
585 /*
586 * Store the swap location in the pte.
587 * See handle_pte_fault() ...
588 */
589 BUG_ON(!PageSwapCache(page));
590 swap_duplicate(entry);
591 if (list_empty(&mm->mmlist)) {
592 spin_lock(&mmlist_lock);
593 list_add(&mm->mmlist, &init_mm.mmlist);
594 spin_unlock(&mmlist_lock);
595 }
596 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
597 BUG_ON(pte_file(*pte));
598 dec_mm_counter(mm, anon_rss);
599 }
600
601 inc_mm_counter(mm, rss);
602 page_remove_rmap(page);
603 page_cache_release(page);
604
605out_unmap:
606 pte_unmap(pte);
607out_unlock:
608 spin_unlock(&mm->page_table_lock);
609out:
610 return ret;
611}
612
613/*
614 * objrmap doesn't work for nonlinear VMAs because the assumption that
615 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
616 * Consequently, given a particular page and its ->index, we cannot locate the
617 * ptes which are mapping that page without an exhaustive linear search.
618 *
619 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
620 * maps the file to which the target page belongs. The ->vm_private_data field
621 * holds the current cursor into that scan. Successive searches will circulate
622 * around the vma's virtual address space.
623 *
624 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
625 * more scanning pressure is placed against them as well. Eventually pages
626 * will become fully unmapped and are eligible for eviction.
627 *
628 * For very sparsely populated VMAs this is a little inefficient - chances are
629 * there won't be many ptes located within the scan cluster. In this case
630 * maybe we could scan further - to the end of the pte page, perhaps.
631 */
632#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
633#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
634
635static void try_to_unmap_cluster(unsigned long cursor,
636 unsigned int *mapcount, struct vm_area_struct *vma)
637{
638 struct mm_struct *mm = vma->vm_mm;
639 pgd_t *pgd;
640 pud_t *pud;
641 pmd_t *pmd;
642 pte_t *pte;
643 pte_t pteval;
644 struct page *page;
645 unsigned long address;
646 unsigned long end;
647 unsigned long pfn;
648
649 /*
650 * We need the page_table_lock to protect us from page faults,
651 * munmap, fork, etc...
652 */
653 spin_lock(&mm->page_table_lock);
654
655 address = (vma->vm_start + cursor) & CLUSTER_MASK;
656 end = address + CLUSTER_SIZE;
657 if (address < vma->vm_start)
658 address = vma->vm_start;
659 if (end > vma->vm_end)
660 end = vma->vm_end;
661
662 pgd = pgd_offset(mm, address);
663 if (!pgd_present(*pgd))
664 goto out_unlock;
665
666 pud = pud_offset(pgd, address);
667 if (!pud_present(*pud))
668 goto out_unlock;
669
670 pmd = pmd_offset(pud, address);
671 if (!pmd_present(*pmd))
672 goto out_unlock;
673
674 for (pte = pte_offset_map(pmd, address);
675 address < end; pte++, address += PAGE_SIZE) {
676
677 if (!pte_present(*pte))
678 continue;
679
680 pfn = pte_pfn(*pte);
681 if (!pfn_valid(pfn))
682 continue;
683
684 page = pfn_to_page(pfn);
685 BUG_ON(PageAnon(page));
686 if (PageReserved(page))
687 continue;
688
689 if (ptep_clear_flush_young(vma, address, pte))
690 continue;
691
692 /* Nuke the page table entry. */
693 flush_cache_page(vma, address, pfn);
694 pteval = ptep_clear_flush(vma, address, pte);
695
696 /* If nonlinear, store the file page offset in the pte. */
697 if (page->index != linear_page_index(vma, address))
698 set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
699
700 /* Move the dirty bit to the physical page now the pte is gone. */
701 if (pte_dirty(pteval))
702 set_page_dirty(page);
703
704 page_remove_rmap(page);
705 page_cache_release(page);
706 dec_mm_counter(mm, rss);
707 (*mapcount)--;
708 }
709
710 pte_unmap(pte);
711
712out_unlock:
713 spin_unlock(&mm->page_table_lock);
714}
715
716static int try_to_unmap_anon(struct page *page)
717{
718 struct anon_vma *anon_vma;
719 struct vm_area_struct *vma;
720 int ret = SWAP_AGAIN;
721
722 anon_vma = page_lock_anon_vma(page);
723 if (!anon_vma)
724 return ret;
725
726 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
727 ret = try_to_unmap_one(page, vma);
728 if (ret == SWAP_FAIL || !page_mapped(page))
729 break;
730 }
731 spin_unlock(&anon_vma->lock);
732 return ret;
733}
734
735/**
736 * try_to_unmap_file - unmap file page using the object-based rmap method
737 * @page: the page to unmap
738 *
739 * Find all the mappings of a page using the mapping pointer and the vma chains
740 * contained in the address_space struct it points to.
741 *
742 * This function is only called from try_to_unmap for object-based pages.
743 */
744static int try_to_unmap_file(struct page *page)
745{
746 struct address_space *mapping = page->mapping;
747 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
748 struct vm_area_struct *vma;
749 struct prio_tree_iter iter;
750 int ret = SWAP_AGAIN;
751 unsigned long cursor;
752 unsigned long max_nl_cursor = 0;
753 unsigned long max_nl_size = 0;
754 unsigned int mapcount;
755
756 spin_lock(&mapping->i_mmap_lock);
757 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
758 ret = try_to_unmap_one(page, vma);
759 if (ret == SWAP_FAIL || !page_mapped(page))
760 goto out;
761 }
762
763 if (list_empty(&mapping->i_mmap_nonlinear))
764 goto out;
765
766 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
767 shared.vm_set.list) {
768 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
769 continue;
770 cursor = (unsigned long) vma->vm_private_data;
771 if (cursor > max_nl_cursor)
772 max_nl_cursor = cursor;
773 cursor = vma->vm_end - vma->vm_start;
774 if (cursor > max_nl_size)
775 max_nl_size = cursor;
776 }
777
778 if (max_nl_size == 0) { /* any nonlinears locked or reserved */
779 ret = SWAP_FAIL;
780 goto out;
781 }
782
783 /*
784 * We don't try to search for this page in the nonlinear vmas,
785 * and page_referenced wouldn't have found it anyway. Instead
786 * just walk the nonlinear vmas trying to age and unmap some.
787 * The mapcount of the page we came in with is irrelevant,
788 * but even so use it as a guide to how hard we should try?
789 */
790 mapcount = page_mapcount(page);
791 if (!mapcount)
792 goto out;
793 cond_resched_lock(&mapping->i_mmap_lock);
794
795 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
796 if (max_nl_cursor == 0)
797 max_nl_cursor = CLUSTER_SIZE;
798
799 do {
800 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
801 shared.vm_set.list) {
802 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
803 continue;
804 cursor = (unsigned long) vma->vm_private_data;
805 while (get_mm_counter(vma->vm_mm, rss) &&
806 cursor < max_nl_cursor &&
807 cursor < vma->vm_end - vma->vm_start) {
808 try_to_unmap_cluster(cursor, &mapcount, vma);
809 cursor += CLUSTER_SIZE;
810 vma->vm_private_data = (void *) cursor;
811 if ((int)mapcount <= 0)
812 goto out;
813 }
814 vma->vm_private_data = (void *) max_nl_cursor;
815 }
816 cond_resched_lock(&mapping->i_mmap_lock);
817 max_nl_cursor += CLUSTER_SIZE;
818 } while (max_nl_cursor <= max_nl_size);
819
820 /*
821 * Don't loop forever (perhaps all the remaining pages are
822 * in locked vmas). Reset cursor on all unreserved nonlinear
823 * vmas, now forgetting on which ones it had fallen behind.
824 */
825 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
826 shared.vm_set.list) {
827 if (!(vma->vm_flags & VM_RESERVED))
828 vma->vm_private_data = NULL;
829 }
830out:
831 spin_unlock(&mapping->i_mmap_lock);
832 return ret;
833}
834
835/**
836 * try_to_unmap - try to remove all page table mappings to a page
837 * @page: the page to get unmapped
838 *
839 * Tries to remove all the page table entries which are mapping this
840 * page, used in the pageout path. Caller must hold the page lock.
841 * Return values are:
842 *
843 * SWAP_SUCCESS - we succeeded in removing all mappings
844 * SWAP_AGAIN - we missed a mapping, try again later
845 * SWAP_FAIL - the page is unswappable
846 */
847int try_to_unmap(struct page *page)
848{
849 int ret;
850
851 BUG_ON(PageReserved(page));
852 BUG_ON(!PageLocked(page));
853
854 if (PageAnon(page))
855 ret = try_to_unmap_anon(page);
856 else
857 ret = try_to_unmap_file(page);
858
859 if (!page_mapped(page))
860 ret = SWAP_SUCCESS;
861 return ret;
862}
diff --git a/mm/shmem.c b/mm/shmem.c
new file mode 100644
index 00000000000..61574b81d97
--- /dev/null
+++ b/mm/shmem.c
@@ -0,0 +1,2326 @@
1/*
2 * Resizable virtual memory filesystem for Linux.
3 *
4 * Copyright (C) 2000 Linus Torvalds.
5 * 2000 Transmeta Corp.
6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2004 Hugh Dickins.
10 * Copyright (C) 2002-2004 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 *
13 * Extended attribute support for tmpfs:
14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
16 *
17 * This file is released under the GPL.
18 */
19
20/*
21 * This virtual memory filesystem is heavily based on the ramfs. It
22 * extends ramfs by the ability to use swap and honor resource limits
23 * which makes it a completely usable filesystem.
24 */
25
26#include <linux/config.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/devfs_fs_kernel.h>
30#include <linux/fs.h>
31#include <linux/mm.h>
32#include <linux/mman.h>
33#include <linux/file.h>
34#include <linux/swap.h>
35#include <linux/pagemap.h>
36#include <linux/string.h>
37#include <linux/slab.h>
38#include <linux/backing-dev.h>
39#include <linux/shmem_fs.h>
40#include <linux/mount.h>
41#include <linux/writeback.h>
42#include <linux/vfs.h>
43#include <linux/blkdev.h>
44#include <linux/security.h>
45#include <linux/swapops.h>
46#include <linux/mempolicy.h>
47#include <linux/namei.h>
48#include <linux/xattr.h>
49#include <asm/uaccess.h>
50#include <asm/div64.h>
51#include <asm/pgtable.h>
52
53/* This magic number is used in glibc for posix shared memory */
54#define TMPFS_MAGIC 0x01021994
55
/* Number of unsigned-long-sized slots that fit in one page-cache page */
56#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
/* Slots reachable through one page of pointers to such pages */
57#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
/* One page-cache page expressed in 512-byte units (used for i_blocks) */
58#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
59
/*
 * Largest page index the swap vector can describe: the direct slots, plus
 * ENTRIES_PER_PAGEPAGE/2 entries behind the first half of the top
 * directory, plus ENTRIES_PER_PAGE times as many behind the second half.
 * SHMEM_MAX_BYTES is the same limit expressed as a byte size.
 */
60#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
61#define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
62
/*
 * Convert a byte size to a count of PAGE_SIZE units for memory accounting
 * (PAGE_CACHE_ALIGN presumably rounds up to a page-cache page - confirm).
 */
63#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
64
65/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
66#define SHMEM_PAGEIN VM_READ
67#define SHMEM_TRUNCATE VM_WRITE
68
69/* Definition to limit shmem_truncate's steps between cond_rescheds */
70#define LATENCY_LIMIT 64
71
72/* Pretend that each entry is of this size in directory's i_size */
73#define BOGO_DIRENT_SIZE 20
74
75/* Keep swapped page count in private field of indirect struct page */
76#define nr_swapped private
77
78/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
/*
 * With SGP_QUICK, shmem_getpage returns 0 and leaves *pagep untouched when
 * no uptodate page is found in the page cache.
 * With SGP_READ, shmem_getpage may return 0 with *pagep still NULL when
 * the index has neither a cached page nor a swap entry.
 */
79enum sgp_type {
80 SGP_QUICK, /* don't try more than file page cache lookup */
81 SGP_READ, /* don't exceed i_size, don't allocate page */
82 SGP_CACHE, /* don't exceed i_size, may allocate page */
83 SGP_WRITE, /* may exceed i_size, may allocate page */
84};
85
86static int shmem_getpage(struct inode *inode, unsigned long idx,
87 struct page **pagep, enum sgp_type sgp, int *type);
88
89static inline struct page *shmem_dir_alloc(unsigned int gfp_mask)
90{
91 /*
92 * The above definition of ENTRIES_PER_PAGE, and the use of
93 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
94 * might be reconsidered if it ever diverges from PAGE_SIZE.
95 */
96 return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
97}
98
99static inline void shmem_dir_free(struct page *page)
100{
101 __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
102}
103
104static struct page **shmem_dir_map(struct page *page)
105{
106 return (struct page **)kmap_atomic(page, KM_USER0);
107}
108
109static inline void shmem_dir_unmap(struct page **dir)
110{
111 kunmap_atomic(dir, KM_USER0);
112}
113
114static swp_entry_t *shmem_swp_map(struct page *page)
115{
116 return (swp_entry_t *)kmap_atomic(page, KM_USER1);
117}
118
119static inline void shmem_swp_balance_unmap(void)
120{
121 /*
122 * When passing a pointer to an i_direct entry, to code which
123 * also handles indirect entries and so will shmem_swp_unmap,
124 * we must arrange for the preempt count to remain in balance.
125 * What kmap_atomic of a lowmem page does depends on config
126 * and architecture, so pretend to kmap_atomic some lowmem page.
127 */
128 (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
129}
130
131static inline void shmem_swp_unmap(swp_entry_t *entry)
132{
133 kunmap_atomic(entry, KM_USER1);
134}
135
136static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
137{
138 return sb->s_fs_info;
139}
140
141/*
142 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
143 * for shared memory and for shared anonymous (/dev/zero) mappings
144 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
145 * consistent with the pre-accounting of private mappings ...
146 */
147static inline int shmem_acct_size(unsigned long flags, loff_t size)
148{
149 return (flags & VM_ACCOUNT)?
150 security_vm_enough_memory(VM_ACCT(size)): 0;
151}
152
153static inline void shmem_unacct_size(unsigned long flags, loff_t size)
154{
155 if (flags & VM_ACCOUNT)
156 vm_unacct_memory(VM_ACCT(size));
157}
158
159/*
160 * ... whereas tmpfs objects are accounted incrementally as
161 * pages are allocated, in order to allow huge sparse files.
162 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
163 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
164 */
165static inline int shmem_acct_block(unsigned long flags)
166{
167 return (flags & VM_ACCOUNT)?
168 0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
169}
170
171static inline void shmem_unacct_blocks(unsigned long flags, long pages)
172{
173 if (!(flags & VM_ACCOUNT))
174 vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
175}
176
/* Operation tables: declared here, presumably defined later in this file */
177static struct super_operations shmem_ops;
178static struct address_space_operations shmem_aops;
179static struct file_operations shmem_file_operations;
180static struct inode_operations shmem_inode_operations;
181static struct inode_operations shmem_dir_inode_operations;
182static struct inode_operations shmem_special_inode_operations;
183static struct vm_operations_struct shmem_vm_ops;
184
/* Backing device description: no readahead, no dirty accounting/writeback */
185static struct backing_dev_info shmem_backing_dev_info = {
186 .ra_pages = 0, /* No readahead */
187 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
188 .unplug_io_fn = default_unplug_io_fn,
189};
190
/*
 * List of inodes (linked through shmem_inode_info.swaplist) which have had
 * a page moved to swap by shmem_writepage; shmem_unuse walks it.
 * List updates are made under shmem_swaplist_lock.
 */
191static LIST_HEAD(shmem_swaplist);
192static DEFINE_SPINLOCK(shmem_swaplist_lock);
193
194static void shmem_free_blocks(struct inode *inode, long pages)
195{
196 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
197 if (sbinfo) {
198 spin_lock(&sbinfo->stat_lock);
199 sbinfo->free_blocks += pages;
200 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
201 spin_unlock(&sbinfo->stat_lock);
202 }
203}
204
205/*
206 * shmem_recalc_inode - recalculate the size of an inode
207 *
208 * @inode: inode to recalc
209 *
210 * We have to calculate the free blocks since the mm can drop
211 * undirtied hole pages behind our back.
212 *
213 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped
214 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
215 *
216 * It has to be called with the spinlock held.
217 */
218static void shmem_recalc_inode(struct inode *inode)
219{
220 struct shmem_inode_info *info = SHMEM_I(inode);
221 long freed;
222
223 freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
224 if (freed > 0) {
225 info->alloced -= freed;
226 shmem_unacct_blocks(info->flags, freed);
227 shmem_free_blocks(inode, freed);
228 }
229}
230
231/*
232 * shmem_swp_entry - find the swap vector position in the info structure
233 *
234 * @info: info structure for the inode
235 * @index: index of the page to find
236 * @page: optional page to add to the structure. Has to be preset to
237 * all zeros
238 *
239 * If there is no space allocated yet it will return NULL when
240 * page is NULL, else it will use the page for the needed block,
241 * setting it to NULL on return to indicate that it has been used.
242 *
243 * The swap vector is organized the following way:
244 *
245 * There are SHMEM_NR_DIRECT entries directly stored in the
246 * shmem_inode_info structure. So small files do not need an additional
247 * allocation.
248 *
249 * For pages with index >= SHMEM_NR_DIRECT there is the pointer
250 * i_indirect which points to a page which holds in the first half
251 * doubly indirect blocks, in the second half triple indirect blocks:
252 *
253 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
254 * following layout (for SHMEM_NR_DIRECT == 16):
255 *
256 * i_indirect -> dir --> 16-19
257 * | +-> 20-23
258 * |
259 * +-->dir2 --> 24-27
260 * | +-> 28-31
261 * | +-> 32-35
262 * | +-> 36-39
263 * |
264 * +-->dir3 --> 40-43
265 * +-> 44-47
266 * +-> 48-51
267 * +-> 52-55
268 */
269static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
270{
271 unsigned long offset;
272 struct page **dir;
273 struct page *subdir;
274
	/*
	 * Direct slot: it lives in the info structure, so no page gets
	 * mapped; take a dummy atomic kmap so the caller's later
	 * shmem_swp_unmap stays balanced.
	 */
275 if (index < SHMEM_NR_DIRECT) {
276 shmem_swp_balance_unmap();
277 return info->i_direct+index;
278 }
	/*
	 * No top-level directory yet: adopt the caller's page if one was
	 * offered, and return NULL either way so the caller comes back.
	 */
279 if (!info->i_indirect) {
280 if (page) {
281 info->i_indirect = *page;
282 *page = NULL;
283 }
284 return NULL; /* need another page */
285 }
286
	/* offset: slot within the leaf page; index: which leaf page */
287 index -= SHMEM_NR_DIRECT;
288 offset = index % ENTRIES_PER_PAGE;
289 index /= ENTRIES_PER_PAGE;
290 dir = shmem_dir_map(info->i_indirect);
291
	/*
	 * Leaves beyond the first half of the top directory are reached
	 * through one more level: a middle directory page.
	 */
292 if (index >= ENTRIES_PER_PAGE/2) {
293 index -= ENTRIES_PER_PAGE/2;
294 dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
295 index %= ENTRIES_PER_PAGE;
296 subdir = *dir;
297 if (!subdir) {
298 if (page) {
299 *dir = *page;
300 *page = NULL;
301 }
302 shmem_dir_unmap(dir);
303 return NULL; /* need another page */
304 }
305 shmem_dir_unmap(dir);
306 dir = shmem_dir_map(subdir);
307 }
308
	/* dir now addresses the directory slot which points to the leaf */
309 dir += index;
310 subdir = *dir;
311 if (!subdir) {
312 if (!page || !(subdir = *page)) {
313 shmem_dir_unmap(dir);
314 return NULL; /* need a page */
315 }
316 *dir = subdir;
317 *page = NULL;
318 }
	/*
	 * Drop the directory mapping and return the entry inside the leaf,
	 * mapped by shmem_swp_map: caller releases it with shmem_swp_unmap.
	 */
319 shmem_dir_unmap(dir);
320 return shmem_swp_map(subdir) + offset;
321}
322
323static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
324{
325 long incdec = value? 1: -1;
326
327 entry->val = value;
328 info->swapped += incdec;
329 if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
330 kmap_atomic_to_page(entry)->nr_swapped += incdec;
331}
332
333/*
334 * shmem_swp_alloc - get the position of the swap entry for the page.
335 * If it does not exist allocate the entry.
336 *
337 * @info: info structure for the inode
338 * @index: index of the page to find
339 * @sgp: check and recheck i_size? skip allocation?
 *
 * Called with info->lock held; the lock is dropped and retaken around
 * each allocation of a swap vector page.
 *
 * Returns a mapped entry pointer (release it with shmem_swp_unmap), or
 * ERR_PTR(-EINVAL) if @index is at or beyond i_size (unless SGP_WRITE),
 * ERR_PTR(-ENOSPC) if the superblock has no more than one free block,
 * ERR_PTR(-ENOMEM) if a swap vector page could not be allocated.
340 */
341static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
342{
343 struct inode *inode = &info->vfs_inode;
344 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
345 struct page *page = NULL;
346 swp_entry_t *entry;
347
348 if (sgp != SGP_WRITE &&
349 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
350 return ERR_PTR(-EINVAL);
351
	/*
	 * Each NULL from shmem_swp_entry means a vector page is missing:
	 * allocate one and offer it on the next pass. shmem_swp_entry
	 * clears 'page' when it consumes the offer.
	 */
352 while (!(entry = shmem_swp_entry(info, index, &page))) {
	/*
	 * SGP_READ never allocates: hand back a mapping of the zero page
	 * (presumably so the caller reads an empty entry - confirm).
	 */
353 if (sgp == SGP_READ)
354 return shmem_swp_map(ZERO_PAGE(0));
355 /*
356 * Test free_blocks against 1 not 0, since we have 1 data
357 * page (and perhaps indirect index pages) yet to allocate:
358 * a waste to allocate index if we cannot allocate data.
359 */
360 if (sbinfo) {
361 spin_lock(&sbinfo->stat_lock);
362 if (sbinfo->free_blocks <= 1) {
363 spin_unlock(&sbinfo->stat_lock);
364 return ERR_PTR(-ENOSPC);
365 }
366 sbinfo->free_blocks--;
367 inode->i_blocks += BLOCKS_PER_PAGE;
368 spin_unlock(&sbinfo->stat_lock);
369 }
370
	/* Allocate outside info->lock */
371 spin_unlock(&info->lock);
372 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
373 if (page) {
374 page->nr_swapped = 0;
375 }
376 spin_lock(&info->lock);
377
378 if (!page) {
379 shmem_free_blocks(inode, 1);
380 return ERR_PTR(-ENOMEM);
381 }
	/* i_size is rechecked now that the lock has been retaken */
382 if (sgp != SGP_WRITE &&
383 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
384 entry = ERR_PTR(-EINVAL);
385 break;
386 }
	/*
	 * Raise next_index before the next pass, which may install the
	 * page and then drop info->lock again.
	 */
387 if (info->next_index <= index)
388 info->next_index = index + 1;
389 }
	/* An unconsumed page: undo its block accounting and free it */
390 if (page) {
391 /* another task gave its page, or truncated the file */
392 shmem_free_blocks(inode, 1);
393 shmem_dir_free(page);
394 }
395 if (info->next_index <= index && !IS_ERR(entry))
396 info->next_index = index + 1;
397 return entry;
398}
399
400/*
401 * shmem_free_swp - free some swap entries in a directory
402 *
403 * @dir: pointer to the directory
404 * @edir: pointer after last entry of the directory
405 */
406static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
407{
408 swp_entry_t *ptr;
409 int freed = 0;
410
411 for (ptr = dir; ptr < edir; ptr++) {
412 if (ptr->val) {
413 free_swap_and_cache(*ptr);
414 *ptr = (swp_entry_t){0};
415 freed++;
416 }
417 }
418 return freed;
419}
420
421static int shmem_map_and_free_swp(struct page *subdir,
422 int offset, int limit, struct page ***dir)
423{
424 swp_entry_t *ptr;
425 int freed = 0;
426
427 ptr = shmem_swp_map(subdir);
428 for (; offset < limit; offset += LATENCY_LIMIT) {
429 int size = limit - offset;
430 if (size > LATENCY_LIMIT)
431 size = LATENCY_LIMIT;
432 freed += shmem_free_swp(ptr+offset, ptr+offset+size);
433 if (need_resched()) {
434 shmem_swp_unmap(ptr);
435 if (*dir) {
436 shmem_dir_unmap(*dir);
437 *dir = NULL;
438 }
439 cond_resched();
440 ptr = shmem_swp_map(subdir);
441 }
442 }
443 shmem_swp_unmap(ptr);
444 return freed;
445}
446
447static void shmem_free_pages(struct list_head *next)
448{
449 struct page *page;
450 int freed = 0;
451
452 do {
453 page = container_of(next, struct page, lru);
454 next = next->next;
455 shmem_dir_free(page);
456 freed++;
457 if (freed >= LATENCY_LIMIT) {
458 cond_resched();
459 freed = 0;
460 }
461 } while (next);
462}
463
/*
 * shmem_truncate - discard swap entries and swap vector pages which lie
 * beyond the inode's current i_size.
 *
 * Swap entries from the first index past i_size up to info->next_index are
 * freed, and swap vector pages no longer needed are collected on a local
 * list and freed at the end. info->next_index is lowered to the new end.
 */
464static void shmem_truncate(struct inode *inode)
465{
466 struct shmem_inode_info *info = SHMEM_I(inode);
467 unsigned long idx;
468 unsigned long size;
469 unsigned long limit;
470 unsigned long stage;
471 unsigned long diroff;
472 struct page **dir;
473 struct page *topdir;
474 struct page *middir;
475 struct page *subdir;
476 swp_entry_t *ptr;
477 LIST_HEAD(pages_to_free);
478 long nr_pages_to_free = 0;
479 long nr_swaps_freed = 0;
480 int offset;
481 int freed;
482
483 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	/* idx: first page index wholly beyond the new size */
484 idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
485 if (idx >= info->next_index)
486 return;
487
	/*
	 * Flag the truncation in progress, and remember the old extent
	 * in 'limit' while lowering next_index to the new end.
	 */
488 spin_lock(&info->lock);
489 info->flags |= SHMEM_TRUNCATE;
490 limit = info->next_index;
491 info->next_index = idx;
492 topdir = info->i_indirect;
	/* Nothing beyond the direct slots survives: detach the top directory */
493 if (topdir && idx <= SHMEM_NR_DIRECT) {
494 info->i_indirect = NULL;
495 nr_pages_to_free++;
496 list_add(&topdir->lru, &pages_to_free);
497 }
498 spin_unlock(&info->lock);
499
	/* Free swap entries held in the direct slots from idx onwards */
500 if (info->swapped && idx < SHMEM_NR_DIRECT) {
501 ptr = info->i_direct;
502 size = limit;
503 if (size > SHMEM_NR_DIRECT)
504 size = SHMEM_NR_DIRECT;
505 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
506 }
507 if (!topdir)
508 goto done2;
509
	/*
	 * From here on idx and limit are relative to the start of the
	 * indirect area. idx is rounded down to the start of a vector page,
	 * with 'offset' the first slot to free within that page.
	 */
510 BUG_ON(limit <= SHMEM_NR_DIRECT);
511 limit -= SHMEM_NR_DIRECT;
512 idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
513 offset = idx % ENTRIES_PER_PAGE;
514 idx -= offset;
515
	/*
	 * 'stage' is the index at which the walk must move on to the next
	 * middle directory; 'middir' is the directory page being walked
	 * and 'diroff' the slot within it.
	 */
516 dir = shmem_dir_map(topdir);
517 stage = ENTRIES_PER_PAGEPAGE/2;
518 if (idx < ENTRIES_PER_PAGEPAGE/2) {
519 middir = topdir;
520 diroff = idx/ENTRIES_PER_PAGE;
521 } else {
522 dir += ENTRIES_PER_PAGE/2;
523 dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
524 while (stage <= idx)
525 stage += ENTRIES_PER_PAGEPAGE;
526 middir = *dir;
527 if (*dir) {
528 diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
529 ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
	/* Truncating from its very first slot: the whole middir goes */
530 if (!diroff && !offset) {
531 *dir = NULL;
532 nr_pages_to_free++;
533 list_add(&middir->lru, &pages_to_free);
534 }
535 shmem_dir_unmap(dir);
536 dir = shmem_dir_map(middir);
537 } else {
	/* No middle directory here: resume at the next stage boundary */
538 diroff = 0;
539 offset = 0;
540 idx = stage;
541 }
542 }
543
	/* One pass per swap vector page */
544 for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
545 if (unlikely(idx == stage)) {
	/* Advance to the next populated middle directory, and detach it */
546 shmem_dir_unmap(dir);
547 dir = shmem_dir_map(topdir) +
548 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
549 while (!*dir) {
550 dir++;
551 idx += ENTRIES_PER_PAGEPAGE;
552 if (idx >= limit)
553 goto done1;
554 }
555 stage = idx + ENTRIES_PER_PAGEPAGE;
556 middir = *dir;
557 *dir = NULL;
558 nr_pages_to_free++;
559 list_add(&middir->lru, &pages_to_free);
560 shmem_dir_unmap(dir);
561 cond_resched();
562 dir = shmem_dir_map(middir);
563 diroff = 0;
564 }
565 subdir = dir[diroff];
566 if (subdir && subdir->nr_swapped) {
567 size = limit - idx;
568 if (size > ENTRIES_PER_PAGE)
569 size = ENTRIES_PER_PAGE;
570 freed = shmem_map_and_free_swp(subdir,
571 offset, size, &dir);
	/* shmem_map_and_free_swp unmaps dir if it had to reschedule */
572 if (!dir)
573 dir = shmem_dir_map(middir);
574 nr_swaps_freed += freed;
	/*
	 * Only a partially truncated page (offset != 0) stays in the
	 * tree, and only then is info->lock taken for the update.
	 */
575 if (offset)
576 spin_lock(&info->lock);
577 subdir->nr_swapped -= freed;
578 if (offset)
579 spin_unlock(&info->lock);
580 BUG_ON(subdir->nr_swapped > offset);
581 }
	/*
	 * A page truncated from its first slot is detached for freeing;
	 * a partially truncated page is kept, and offset resets to 0.
	 */
582 if (offset)
583 offset = 0;
584 else if (subdir) {
585 dir[diroff] = NULL;
586 nr_pages_to_free++;
587 list_add(&subdir->lru, &pages_to_free);
588 }
589 }
590done1:
591 shmem_dir_unmap(dir);
592done2:
593 if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
594 /*
595 * Call truncate_inode_pages again: racing shmem_unuse_inode
596 * may have swizzled a page in from swap since vmtruncate or
597 * generic_delete_inode did it, before we lowered next_index.
598 * Also, though shmem_getpage checks i_size before adding to
599 * cache, no recheck after: so fix the narrow window there too.
600 */
601 truncate_inode_pages(inode->i_mapping, inode->i_size);
602 }
603
	/* Truncation over: settle the swap and block accounting */
604 spin_lock(&info->lock);
605 info->flags &= ~SHMEM_TRUNCATE;
606 info->swapped -= nr_swaps_freed;
607 if (nr_pages_to_free)
608 shmem_free_blocks(inode, nr_pages_to_free);
609 shmem_recalc_inode(inode);
610 spin_unlock(&info->lock);
611
612 /*
613 * Empty swap vector directory pages to be freed?
614 */
615 if (!list_empty(&pages_to_free)) {
	/* NULL-terminate the chain: shmem_free_pages stops at NULL */
616 pages_to_free.prev->next = NULL;
617 shmem_free_pages(pages_to_free.next);
618 }
619}
620
621static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
622{
623 struct inode *inode = dentry->d_inode;
624 struct page *page = NULL;
625 int error;
626
627 if (attr->ia_valid & ATTR_SIZE) {
628 if (attr->ia_size < inode->i_size) {
629 /*
630 * If truncating down to a partial page, then
631 * if that page is already allocated, hold it
632 * in memory until the truncation is over, so
633 * truncate_partial_page cannnot miss it were
634 * it assigned to swap.
635 */
636 if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
637 (void) shmem_getpage(inode,
638 attr->ia_size>>PAGE_CACHE_SHIFT,
639 &page, SGP_READ, NULL);
640 }
641 /*
642 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
643 * detect if any pages might have been added to cache
644 * after truncate_inode_pages. But we needn't bother
645 * if it's being fully truncated to zero-length: the
646 * nrpages check is efficient enough in that case.
647 */
648 if (attr->ia_size) {
649 struct shmem_inode_info *info = SHMEM_I(inode);
650 spin_lock(&info->lock);
651 info->flags &= ~SHMEM_PAGEIN;
652 spin_unlock(&info->lock);
653 }
654 }
655 }
656
657 error = inode_change_ok(inode, attr);
658 if (!error)
659 error = inode_setattr(inode, attr);
660 if (page)
661 page_cache_release(page);
662 return error;
663}
664
665static void shmem_delete_inode(struct inode *inode)
666{
667 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
668 struct shmem_inode_info *info = SHMEM_I(inode);
669
670 if (inode->i_op->truncate == shmem_truncate) {
671 shmem_unacct_size(info->flags, inode->i_size);
672 inode->i_size = 0;
673 shmem_truncate(inode);
674 if (!list_empty(&info->swaplist)) {
675 spin_lock(&shmem_swaplist_lock);
676 list_del_init(&info->swaplist);
677 spin_unlock(&shmem_swaplist_lock);
678 }
679 }
680 if (sbinfo) {
681 BUG_ON(inode->i_blocks);
682 spin_lock(&sbinfo->stat_lock);
683 sbinfo->free_inodes++;
684 spin_unlock(&sbinfo->stat_lock);
685 }
686 clear_inode(inode);
687}
688
689static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
690{
691 swp_entry_t *ptr;
692
693 for (ptr = dir; ptr < edir; ptr++) {
694 if (ptr->val == entry.val)
695 return ptr - dir;
696 }
697 return -1;
698}
699
/*
 * shmem_unuse_inode - look for swap @entry in one inode's swap vector.
 *
 * The direct slots are searched first, then every swap vector page which
 * has a non-zero nr_swapped count, up to info->next_index. If the entry
 * is found, an attempt is made to move @page from the swap cache into the
 * inode's page cache at the matching index, clearing the entry on success.
 *
 * Returns 1 if the entry was found (even if the move failed), else 0.
 */
700static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
701{
702 struct inode *inode;
703 unsigned long idx;
704 unsigned long size;
705 unsigned long limit;
706 unsigned long stage;
707 struct page **dir;
708 struct page *subdir;
709 swp_entry_t *ptr;
710 int offset;
711
712 idx = 0;
713 ptr = info->i_direct;
714 spin_lock(&info->lock);
715 limit = info->next_index;
716 size = limit;
717 if (size > SHMEM_NR_DIRECT)
718 size = SHMEM_NR_DIRECT;
719 offset = shmem_find_swp(entry, ptr, ptr+size);
720 if (offset >= 0) {
	/* Nothing is mapped for a direct slot: balance the unmap at found */
721 shmem_swp_balance_unmap();
722 goto found;
723 }
724 if (!info->i_indirect)
725 goto lost2;
726
727 dir = shmem_dir_map(info->i_indirect);
	/* 'stage': index at which the walk moves to the next middle directory */
728 stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
729
730 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
731 if (unlikely(idx == stage)) {
	/* dir was already advanced by the loop step, hence dir-1 */
732 shmem_dir_unmap(dir-1);
733 dir = shmem_dir_map(info->i_indirect) +
734 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
735 while (!*dir) {
736 dir++;
737 idx += ENTRIES_PER_PAGEPAGE;
738 if (idx >= limit)
739 goto lost1;
740 }
741 stage = idx + ENTRIES_PER_PAGEPAGE;
742 subdir = *dir;
743 shmem_dir_unmap(dir);
744 dir = shmem_dir_map(subdir);
745 }
746 subdir = *dir;
	/* Only map and search vector pages which record swapped entries */
747 if (subdir && subdir->nr_swapped) {
748 ptr = shmem_swp_map(subdir);
749 size = limit - idx;
750 if (size > ENTRIES_PER_PAGE)
751 size = ENTRIES_PER_PAGE;
752 offset = shmem_find_swp(entry, ptr, ptr+size);
753 if (offset >= 0) {
754 shmem_dir_unmap(dir);
755 goto found;
756 }
757 shmem_swp_unmap(ptr);
758 }
759 }
760lost1:
761 shmem_dir_unmap(dir-1);
762lost2:
763 spin_unlock(&info->lock);
764 return 0;
765found:
	/* ptr is mapped, or balanced for a direct slot; info->lock is held */
766 idx += offset;
767 inode = &info->vfs_inode;
768 if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
769 info->flags |= SHMEM_PAGEIN;
770 shmem_swp_set(info, ptr + offset, 0);
771 }
772 shmem_swp_unmap(ptr);
773 spin_unlock(&info->lock);
774 /*
775 * Decrement swap count even when the entry is left behind:
776 * try_to_unuse will skip over mms, then reincrement count.
777 */
778 swap_free(entry);
779 return 1;
780}
781
782/*
783 * shmem_unuse() search for an eventually swapped out shmem page.
784 */
785int shmem_unuse(swp_entry_t entry, struct page *page)
786{
787 struct list_head *p, *next;
788 struct shmem_inode_info *info;
789 int found = 0;
790
791 spin_lock(&shmem_swaplist_lock);
792 list_for_each_safe(p, next, &shmem_swaplist) {
793 info = list_entry(p, struct shmem_inode_info, swaplist);
794 if (!info->swapped)
795 list_del_init(&info->swaplist);
796 else if (shmem_unuse_inode(info, entry, page)) {
797 /* move head to start search for next from here */
798 list_move_tail(&shmem_swaplist, &info->swaplist);
799 found = 1;
800 break;
801 }
802 }
803 spin_unlock(&shmem_swaplist_lock);
804 return found;
805}
806
807/*
808 * Move the page from the page cache to the swap cache.
 *
 * The page must be locked and not mapped. On success the swap entry is
 * recorded in the inode's swap vector, the inode is put on shmem_swaplist
 * if it was not on it, the page is unlocked and 0 is returned. Otherwise
 * the page is redirtied and WRITEPAGE_ACTIVATE is returned with the page
 * still locked.
809 */
810static int shmem_writepage(struct page *page, struct writeback_control *wbc)
811{
812 struct shmem_inode_info *info;
813 swp_entry_t *entry, swap;
814 struct address_space *mapping;
815 unsigned long index;
816 struct inode *inode;
817
818 BUG_ON(!PageLocked(page));
819 BUG_ON(page_mapped(page));
820
821 mapping = page->mapping;
822 index = page->index;
823 inode = mapping->host;
824 info = SHMEM_I(inode);
	/* Inodes flagged VM_LOCKED are never written to swap */
825 if (info->flags & VM_LOCKED)
826 goto redirty;
827 swap = get_swap_page();
	/* No swap entry could be obtained */
828 if (!swap.val)
829 goto redirty;
830
831 spin_lock(&info->lock);
832 shmem_recalc_inode(inode);
	/* A page beyond next_index is only expected while truncating */
833 if (index >= info->next_index) {
834 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
835 goto unlock;
836 }
	/* NULL page argument: lookup only; the slot must exist and be empty */
837 entry = shmem_swp_entry(info, index, NULL);
838 BUG_ON(!entry);
839 BUG_ON(entry->val);
840
841 if (move_to_swap_cache(page, swap) == 0) {
842 shmem_swp_set(info, entry, swap.val);
843 shmem_swp_unmap(entry);
844 spin_unlock(&info->lock);
845 if (list_empty(&info->swaplist)) {
846 spin_lock(&shmem_swaplist_lock);
847 /* move instead of add in case we're racing */
848 list_move_tail(&info->swaplist, &shmem_swaplist);
849 spin_unlock(&shmem_swaplist_lock);
850 }
851 unlock_page(page);
852 return 0;
853 }
854
	/* The move failed: release the entry mapping, lock and swap entry */
855 shmem_swp_unmap(entry);
856unlock:
857 spin_unlock(&info->lock);
858 swap_free(swap);
859redirty:
860 set_page_dirty(page);
861 return WRITEPAGE_ACTIVATE; /* Return with the page locked */
862}
863
864#ifdef CONFIG_NUMA
865static struct page *shmem_swapin_async(struct shared_policy *p,
866 swp_entry_t entry, unsigned long idx)
867{
868 struct page *page;
869 struct vm_area_struct pvma;
870
871 /* Create a pseudo vma that just contains the policy */
872 memset(&pvma, 0, sizeof(struct vm_area_struct));
873 pvma.vm_end = PAGE_SIZE;
874 pvma.vm_pgoff = idx;
875 pvma.vm_policy = mpol_shared_policy_lookup(p, idx);
876 page = read_swap_cache_async(entry, &pvma, 0);
877 mpol_free(pvma.vm_policy);
878 return page;
879}
880
881struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry,
882 unsigned long idx)
883{
884 struct shared_policy *p = &info->policy;
885 int i, num;
886 struct page *page;
887 unsigned long offset;
888
889 num = valid_swaphandles(entry, &offset);
890 for (i = 0; i < num; offset++, i++) {
891 page = shmem_swapin_async(p,
892 swp_entry(swp_type(entry), offset), idx);
893 if (!page)
894 break;
895 page_cache_release(page);
896 }
897 lru_add_drain(); /* Push any new pages onto the LRU now */
898 return shmem_swapin_async(p, entry, idx);
899}
900
901static struct page *
902shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info,
903 unsigned long idx)
904{
905 struct vm_area_struct pvma;
906 struct page *page;
907
908 memset(&pvma, 0, sizeof(struct vm_area_struct));
909 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
910 pvma.vm_pgoff = idx;
911 pvma.vm_end = PAGE_SIZE;
912 page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
913 mpol_free(pvma.vm_policy);
914 return page;
915}
916#else
917static inline struct page *
918shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
919{
920 swapin_readahead(entry, 0, NULL);
921 return read_swap_cache_async(entry, NULL, 0);
922}
923
924static inline struct page *
925shmem_alloc_page(unsigned int __nocast gfp,struct shmem_inode_info *info,
926 unsigned long idx)
927{
928 return alloc_page(gfp | __GFP_ZERO);
929}
930#endif
931
932/*
933 * shmem_getpage - either get the page from swap or allocate a new one
934 *
935 * If we allocate a new one we do not mark it dirty. That's up to the
936 * vm. If we swap it in we mark it dirty since we also free the swap
937 * entry since a page cannot live in both the swap and page cache
 *
 * @inode: inode the page belongs to
 * @idx: page index within the file
 * @pagep: in: NULL, or a locked page supplied by the caller;
 *         out: the page found, read in or allocated
 * @sgp: how far the lookup may go (see enum sgp_type)
 * @type: if not NULL, *type is raised from VM_FAULT_MINOR to
 *        VM_FAULT_MAJOR when the page has to be read in from swap
 *
 * Returns 0 on success. A page found or created here is unlocked before
 * being stored in *pagep; a page passed in by the caller stays locked.
 * With SGP_QUICK, or with SGP_READ over a hole, 0 may be returned with
 * *pagep unchanged. Errors: -EFBIG if @idx >= SHMEM_MAX_INDEX, -ENOSPC,
 * -ENOMEM, -EIO, or an error from shmem_swp_alloc.
938 */
939static int shmem_getpage(struct inode *inode, unsigned long idx,
940 struct page **pagep, enum sgp_type sgp, int *type)
941{
942 struct address_space *mapping = inode->i_mapping;
943 struct shmem_inode_info *info = SHMEM_I(inode);
944 struct shmem_sb_info *sbinfo;
945 struct page *filepage = *pagep;
946 struct page *swappage;
947 swp_entry_t *entry;
948 swp_entry_t swap;
949 int error;
950
951 if (idx >= SHMEM_MAX_INDEX)
952 return -EFBIG;
953 /*
954 * Normally, filepage is NULL on entry, and either found
955 * uptodate immediately, or allocated and zeroed, or read
956 * in under swappage, which is then assigned to filepage.
957 * But shmem_prepare_write passes in a locked filepage,
958 * which may be found not uptodate by other callers too,
959 * and may need to be copied from the swappage read in.
960 */
961repeat:
962 if (!filepage)
963 filepage = find_lock_page(mapping, idx);
964 if (filepage && PageUptodate(filepage))
965 goto done;
966 error = 0;
	/* SGP_QUICK stops here, returning 0 without a page */
967 if (sgp == SGP_QUICK)
968 goto failed;
969
970 spin_lock(&info->lock);
971 shmem_recalc_inode(inode);
972 entry = shmem_swp_alloc(info, idx, sgp);
973 if (IS_ERR(entry)) {
974 spin_unlock(&info->lock);
975 error = PTR_ERR(entry);
976 goto failed;
977 }
978 swap = *entry;
979
	/* Case 1: the swap vector holds an entry, the data is on swap */
980 if (swap.val) {
981 /* Look it up and read it in.. */
982 swappage = lookup_swap_cache(swap);
983 if (!swappage) {
984 shmem_swp_unmap(entry);
985 spin_unlock(&info->lock);
986 /* here we actually do the io */
987 if (type && *type == VM_FAULT_MINOR) {
988 inc_page_state(pgmajfault);
989 *type = VM_FAULT_MAJOR;
990 }
991 swappage = shmem_swapin(info, swap, idx);
992 if (!swappage) {
	/*
	 * No page came back: if the entry is unchanged report
	 * -ENOMEM, if it has changed meanwhile start over.
	 */
993 spin_lock(&info->lock);
994 entry = shmem_swp_alloc(info, idx, sgp);
995 if (IS_ERR(entry))
996 error = PTR_ERR(entry);
997 else {
998 if (entry->val == swap.val)
999 error = -ENOMEM;
1000 shmem_swp_unmap(entry);
1001 }
1002 spin_unlock(&info->lock);
1003 if (error)
1004 goto failed;
1005 goto repeat;
1006 }
	/* Wait for the read, then look everything up again */
1007 wait_on_page_locked(swappage);
1008 page_cache_release(swappage);
1009 goto repeat;
1010 }
1011
1012 /* We have to do this with page locked to prevent races */
1013 if (TestSetPageLocked(swappage)) {
1014 shmem_swp_unmap(entry);
1015 spin_unlock(&info->lock);
1016 wait_on_page_locked(swappage);
1017 page_cache_release(swappage);
1018 goto repeat;
1019 }
1020 if (PageWriteback(swappage)) {
1021 shmem_swp_unmap(entry);
1022 spin_unlock(&info->lock);
1023 wait_on_page_writeback(swappage);
1024 unlock_page(swappage);
1025 page_cache_release(swappage);
1026 goto repeat;
1027 }
1028 if (!PageUptodate(swappage)) {
1029 shmem_swp_unmap(entry);
1030 spin_unlock(&info->lock);
1031 unlock_page(swappage);
1032 page_cache_release(swappage);
1033 error = -EIO;
1034 goto failed;
1035 }
1036
	/*
	 * swappage is now locked and uptodate. If a filepage exists,
	 * copy the data into it and drop the swap copy; otherwise move
	 * swappage itself into the file's page cache.
	 */
1037 if (filepage) {
1038 shmem_swp_set(info, entry, 0);
1039 shmem_swp_unmap(entry);
1040 delete_from_swap_cache(swappage);
1041 spin_unlock(&info->lock);
1042 copy_highpage(filepage, swappage);
1043 unlock_page(swappage);
1044 page_cache_release(swappage);
1045 flush_dcache_page(filepage);
1046 SetPageUptodate(filepage);
1047 set_page_dirty(filepage);
1048 swap_free(swap);
1049 } else if (!(error = move_from_swap_cache(
1050 swappage, idx, mapping))) {
1051 info->flags |= SHMEM_PAGEIN;
1052 shmem_swp_set(info, entry, 0);
1053 shmem_swp_unmap(entry);
1054 spin_unlock(&info->lock);
1055 filepage = swappage;
1056 swap_free(swap);
1057 } else {
1058 shmem_swp_unmap(entry);
1059 spin_unlock(&info->lock);
1060 unlock_page(swappage);
1061 page_cache_release(swappage);
1062 if (error == -ENOMEM) {
1063 /* let kswapd refresh zone for GFP_ATOMICs */
1064 blk_congestion_wait(WRITE, HZ/50);
1065 }
1066 goto repeat;
1067 }
	/* Case 2: SGP_READ without a swap entry, look in the cache only */
1068 } else if (sgp == SGP_READ && !filepage) {
1069 shmem_swp_unmap(entry);
1070 filepage = find_get_page(mapping, idx);
1071 if (filepage &&
1072 (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
1073 spin_unlock(&info->lock);
1074 wait_on_page_locked(filepage);
1075 page_cache_release(filepage);
1076 filepage = NULL;
1077 goto repeat;
1078 }
1079 spin_unlock(&info->lock);
	/* Case 3: neither on swap nor usable in cache, account a new block */
1080 } else {
1081 shmem_swp_unmap(entry);
1082 sbinfo = SHMEM_SB(inode->i_sb);
1083 if (sbinfo) {
1084 spin_lock(&sbinfo->stat_lock);
1085 if (sbinfo->free_blocks == 0 ||
1086 shmem_acct_block(info->flags)) {
1087 spin_unlock(&sbinfo->stat_lock);
1088 spin_unlock(&info->lock);
1089 error = -ENOSPC;
1090 goto failed;
1091 }
1092 sbinfo->free_blocks--;
1093 inode->i_blocks += BLOCKS_PER_PAGE;
1094 spin_unlock(&sbinfo->stat_lock);
1095 } else if (shmem_acct_block(info->flags)) {
1096 spin_unlock(&info->lock);
1097 error = -ENOSPC;
1098 goto failed;
1099 }
1100
1101 if (!filepage) {
	/* Allocate outside info->lock */
1102 spin_unlock(&info->lock);
1103 filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
1104 info,
1105 idx);
1106 if (!filepage) {
1107 shmem_unacct_blocks(info->flags, 1);
1108 shmem_free_blocks(inode, 1);
1109 error = -ENOMEM;
1110 goto failed;
1111 }
1112
	/*
	 * The lock was dropped: look the entry up again, and back out
	 * if that fails, if a swap entry has appeared, or if the page
	 * cannot be added to the page cache.
	 */
1113 spin_lock(&info->lock);
1114 entry = shmem_swp_alloc(info, idx, sgp);
1115 if (IS_ERR(entry))
1116 error = PTR_ERR(entry);
1117 else {
1118 swap = *entry;
1119 shmem_swp_unmap(entry);
1120 }
1121 if (error || swap.val || 0 != add_to_page_cache_lru(
1122 filepage, mapping, idx, GFP_ATOMIC)) {
1123 spin_unlock(&info->lock);
1124 page_cache_release(filepage);
1125 shmem_unacct_blocks(info->flags, 1);
1126 shmem_free_blocks(inode, 1);
1127 filepage = NULL;
1128 if (error)
1129 goto failed;
1130 goto repeat;
1131 }
1132 info->flags |= SHMEM_PAGEIN;
1133 }
1134
1135 info->alloced++;
1136 spin_unlock(&info->lock);
1137 flush_dcache_page(filepage);
1138 SetPageUptodate(filepage);
1139 }
1140done:
	/* A page not supplied by the caller is unlocked and handed over */
1141 if (*pagep != filepage) {
1142 unlock_page(filepage);
1143 *pagep = filepage;
1144 }
1145 return 0;
1146
1147failed:
	/* A page not supplied by the caller is unlocked and released */
1148 if (*pagep != filepage) {
1149 unlock_page(filepage);
1150 page_cache_release(filepage);
1151 }
1152 return error;
1153}
1154
1155struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
1156{
1157 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1158 struct page *page = NULL;
1159 unsigned long idx;
1160 int error;
1161
1162 idx = (address - vma->vm_start) >> PAGE_SHIFT;
1163 idx += vma->vm_pgoff;
1164 idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
1165 if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1166 return NOPAGE_SIGBUS;
1167
1168 error = shmem_getpage(inode, idx, &page, SGP_CACHE, type);
1169 if (error)
1170 return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
1171
1172 mark_page_accessed(page);
1173 return page;
1174}
1175
1176static int shmem_populate(struct vm_area_struct *vma,
1177 unsigned long addr, unsigned long len,
1178 pgprot_t prot, unsigned long pgoff, int nonblock)
1179{
1180 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1181 struct mm_struct *mm = vma->vm_mm;
1182 enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
1183 unsigned long size;
1184
1185 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
1186 if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size)
1187 return -EINVAL;
1188
1189 while ((long) len > 0) {
1190 struct page *page = NULL;
1191 int err;
1192 /*
1193 * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
1194 */
1195 err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
1196 if (err)
1197 return err;
1198 if (page) {
1199 mark_page_accessed(page);
1200 err = install_page(mm, vma, addr, page, prot);
1201 if (err) {
1202 page_cache_release(page);
1203 return err;
1204 }
1205 } else if (nonblock) {
1206 err = install_file_pte(mm, vma, addr, pgoff, prot);
1207 if (err)
1208 return err;
1209 }
1210
1211 len -= PAGE_SIZE;
1212 addr += PAGE_SIZE;
1213 pgoff++;
1214 }
1215 return 0;
1216}
1217
1218#ifdef CONFIG_NUMA
1219int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1220{
1221 struct inode *i = vma->vm_file->f_dentry->d_inode;
1222 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1223}
1224
1225struct mempolicy *
1226shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1227{
1228 struct inode *i = vma->vm_file->f_dentry->d_inode;
1229 unsigned long idx;
1230
1231 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1232 return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
1233}
1234#endif
1235
1236int shmem_lock(struct file *file, int lock, struct user_struct *user)
1237{
1238 struct inode *inode = file->f_dentry->d_inode;
1239 struct shmem_inode_info *info = SHMEM_I(inode);
1240 int retval = -ENOMEM;
1241
1242 spin_lock(&info->lock);
1243 if (lock && !(info->flags & VM_LOCKED)) {
1244 if (!user_shm_lock(inode->i_size, user))
1245 goto out_nomem;
1246 info->flags |= VM_LOCKED;
1247 }
1248 if (!lock && (info->flags & VM_LOCKED) && user) {
1249 user_shm_unlock(inode->i_size, user);
1250 info->flags &= ~VM_LOCKED;
1251 }
1252 retval = 0;
1253out_nomem:
1254 spin_unlock(&info->lock);
1255 return retval;
1256}
1257
1258static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1259{
1260 file_accessed(file);
1261 vma->vm_ops = &shmem_vm_ops;
1262 return 0;
1263}
1264
1265static struct inode *
1266shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1267{
1268 struct inode *inode;
1269 struct shmem_inode_info *info;
1270 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1271
1272 if (sbinfo) {
1273 spin_lock(&sbinfo->stat_lock);
1274 if (!sbinfo->free_inodes) {
1275 spin_unlock(&sbinfo->stat_lock);
1276 return NULL;
1277 }
1278 sbinfo->free_inodes--;
1279 spin_unlock(&sbinfo->stat_lock);
1280 }
1281
1282 inode = new_inode(sb);
1283 if (inode) {
1284 inode->i_mode = mode;
1285 inode->i_uid = current->fsuid;
1286 inode->i_gid = current->fsgid;
1287 inode->i_blksize = PAGE_CACHE_SIZE;
1288 inode->i_blocks = 0;
1289 inode->i_mapping->a_ops = &shmem_aops;
1290 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1291 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1292 info = SHMEM_I(inode);
1293 memset(info, 0, (char *)inode - (char *)info);
1294 spin_lock_init(&info->lock);
1295 INIT_LIST_HEAD(&info->swaplist);
1296
1297 switch (mode & S_IFMT) {
1298 default:
1299 inode->i_op = &shmem_special_inode_operations;
1300 init_special_inode(inode, mode, dev);
1301 break;
1302 case S_IFREG:
1303 inode->i_op = &shmem_inode_operations;
1304 inode->i_fop = &shmem_file_operations;
1305 mpol_shared_policy_init(&info->policy);
1306 break;
1307 case S_IFDIR:
1308 inode->i_nlink++;
1309 /* Some things misbehave if size == 0 on a directory */
1310 inode->i_size = 2 * BOGO_DIRENT_SIZE;
1311 inode->i_op = &shmem_dir_inode_operations;
1312 inode->i_fop = &simple_dir_operations;
1313 break;
1314 case S_IFLNK:
1315 /*
1316 * Must not load anything in the rbtree,
1317 * mpol_free_shared_policy will not be called.
1318 */
1319 mpol_shared_policy_init(&info->policy);
1320 break;
1321 }
1322 } else if (sbinfo) {
1323 spin_lock(&sbinfo->stat_lock);
1324 sbinfo->free_inodes++;
1325 spin_unlock(&sbinfo->stat_lock);
1326 }
1327 return inode;
1328}
1329
1330#ifdef CONFIG_TMPFS
1331
1332static int shmem_set_size(struct shmem_sb_info *sbinfo,
1333 unsigned long max_blocks, unsigned long max_inodes)
1334{
1335 int error;
1336 unsigned long blocks, inodes;
1337
1338 spin_lock(&sbinfo->stat_lock);
1339 blocks = sbinfo->max_blocks - sbinfo->free_blocks;
1340 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
1341 error = -EINVAL;
1342 if (max_blocks < blocks)
1343 goto out;
1344 if (max_inodes < inodes)
1345 goto out;
1346 error = 0;
1347 sbinfo->max_blocks = max_blocks;
1348 sbinfo->free_blocks = max_blocks - blocks;
1349 sbinfo->max_inodes = max_inodes;
1350 sbinfo->free_inodes = max_inodes - inodes;
1351out:
1352 spin_unlock(&sbinfo->stat_lock);
1353 return error;
1354}
1355
1356static struct inode_operations shmem_symlink_inode_operations;
1357static struct inode_operations shmem_symlink_inline_operations;
1358
1359/*
1360 * Normally tmpfs makes no use of shmem_prepare_write, but it
1361 * lets a tmpfs file be used read-write below the loop driver.
1362 */
1363static int
1364shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
1365{
1366 struct inode *inode = page->mapping->host;
1367 return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
1368}
1369
1370static ssize_t
1371shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
1372{
1373 struct inode *inode = file->f_dentry->d_inode;
1374 loff_t pos;
1375 unsigned long written;
1376 ssize_t err;
1377
1378 if ((ssize_t) count < 0)
1379 return -EINVAL;
1380
1381 if (!access_ok(VERIFY_READ, buf, count))
1382 return -EFAULT;
1383
1384 down(&inode->i_sem);
1385
1386 pos = *ppos;
1387 written = 0;
1388
1389 err = generic_write_checks(file, &pos, &count, 0);
1390 if (err || !count)
1391 goto out;
1392
1393 err = remove_suid(file->f_dentry);
1394 if (err)
1395 goto out;
1396
1397 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1398
1399 do {
1400 struct page *page = NULL;
1401 unsigned long bytes, index, offset;
1402 char *kaddr;
1403 int left;
1404
1405 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1406 index = pos >> PAGE_CACHE_SHIFT;
1407 bytes = PAGE_CACHE_SIZE - offset;
1408 if (bytes > count)
1409 bytes = count;
1410
1411 /*
1412 * We don't hold page lock across copy from user -
1413 * what would it guard against? - so no deadlock here.
1414 * But it still may be a good idea to prefault below.
1415 */
1416
1417 err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
1418 if (err)
1419 break;
1420
1421 left = bytes;
1422 if (PageHighMem(page)) {
1423 volatile unsigned char dummy;
1424 __get_user(dummy, buf);
1425 __get_user(dummy, buf + bytes - 1);
1426
1427 kaddr = kmap_atomic(page, KM_USER0);
1428 left = __copy_from_user_inatomic(kaddr + offset,
1429 buf, bytes);
1430 kunmap_atomic(kaddr, KM_USER0);
1431 }
1432 if (left) {
1433 kaddr = kmap(page);
1434 left = __copy_from_user(kaddr + offset, buf, bytes);
1435 kunmap(page);
1436 }
1437
1438 written += bytes;
1439 count -= bytes;
1440 pos += bytes;
1441 buf += bytes;
1442 if (pos > inode->i_size)
1443 i_size_write(inode, pos);
1444
1445 flush_dcache_page(page);
1446 set_page_dirty(page);
1447 mark_page_accessed(page);
1448 page_cache_release(page);
1449
1450 if (left) {
1451 pos -= left;
1452 written -= left;
1453 err = -EFAULT;
1454 break;
1455 }
1456
1457 /*
1458 * Our dirty pages are not counted in nr_dirty,
1459 * and we do not attempt to balance dirty pages.
1460 */
1461
1462 cond_resched();
1463 } while (count);
1464
1465 *ppos = pos;
1466 if (written)
1467 err = written;
1468out:
1469 up(&inode->i_sem);
1470 return err;
1471}
1472
1473static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1474{
1475 struct inode *inode = filp->f_dentry->d_inode;
1476 struct address_space *mapping = inode->i_mapping;
1477 unsigned long index, offset;
1478
1479 index = *ppos >> PAGE_CACHE_SHIFT;
1480 offset = *ppos & ~PAGE_CACHE_MASK;
1481
1482 for (;;) {
1483 struct page *page = NULL;
1484 unsigned long end_index, nr, ret;
1485 loff_t i_size = i_size_read(inode);
1486
1487 end_index = i_size >> PAGE_CACHE_SHIFT;
1488 if (index > end_index)
1489 break;
1490 if (index == end_index) {
1491 nr = i_size & ~PAGE_CACHE_MASK;
1492 if (nr <= offset)
1493 break;
1494 }
1495
1496 desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
1497 if (desc->error) {
1498 if (desc->error == -EINVAL)
1499 desc->error = 0;
1500 break;
1501 }
1502
1503 /*
1504 * We must evaluate after, since reads (unlike writes)
1505 * are called without i_sem protection against truncate
1506 */
1507 nr = PAGE_CACHE_SIZE;
1508 i_size = i_size_read(inode);
1509 end_index = i_size >> PAGE_CACHE_SHIFT;
1510 if (index == end_index) {
1511 nr = i_size & ~PAGE_CACHE_MASK;
1512 if (nr <= offset) {
1513 if (page)
1514 page_cache_release(page);
1515 break;
1516 }
1517 }
1518 nr -= offset;
1519
1520 if (page) {
1521 /*
1522 * If users can be writing to this page using arbitrary
1523 * virtual addresses, take care about potential aliasing
1524 * before reading the page on the kernel side.
1525 */
1526 if (mapping_writably_mapped(mapping))
1527 flush_dcache_page(page);
1528 /*
1529 * Mark the page accessed if we read the beginning.
1530 */
1531 if (!offset)
1532 mark_page_accessed(page);
1533 } else
1534 page = ZERO_PAGE(0);
1535
1536 /*
1537 * Ok, we have the page, and it's up-to-date, so
1538 * now we can copy it to user space...
1539 *
1540 * The actor routine returns how many bytes were actually used..
1541 * NOTE! This may not be the same as how much of a user buffer
1542 * we filled up (we may be padding etc), so we can only update
1543 * "pos" here (the actor routine has to update the user buffer
1544 * pointers and the remaining count).
1545 */
1546 ret = actor(desc, page, offset, nr);
1547 offset += ret;
1548 index += offset >> PAGE_CACHE_SHIFT;
1549 offset &= ~PAGE_CACHE_MASK;
1550
1551 page_cache_release(page);
1552 if (ret != nr || !desc->count)
1553 break;
1554
1555 cond_resched();
1556 }
1557
1558 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1559 file_accessed(filp);
1560}
1561
1562static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1563{
1564 read_descriptor_t desc;
1565
1566 if ((ssize_t) count < 0)
1567 return -EINVAL;
1568 if (!access_ok(VERIFY_WRITE, buf, count))
1569 return -EFAULT;
1570 if (!count)
1571 return 0;
1572
1573 desc.written = 0;
1574 desc.count = count;
1575 desc.arg.buf = buf;
1576 desc.error = 0;
1577
1578 do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1579 if (desc.written)
1580 return desc.written;
1581 return desc.error;
1582}
1583
1584static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
1585 size_t count, read_actor_t actor, void *target)
1586{
1587 read_descriptor_t desc;
1588
1589 if (!count)
1590 return 0;
1591
1592 desc.written = 0;
1593 desc.count = count;
1594 desc.arg.data = target;
1595 desc.error = 0;
1596
1597 do_shmem_file_read(in_file, ppos, &desc, actor);
1598 if (desc.written)
1599 return desc.written;
1600 return desc.error;
1601}
1602
1603static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
1604{
1605 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1606
1607 buf->f_type = TMPFS_MAGIC;
1608 buf->f_bsize = PAGE_CACHE_SIZE;
1609 buf->f_namelen = NAME_MAX;
1610 if (sbinfo) {
1611 spin_lock(&sbinfo->stat_lock);
1612 buf->f_blocks = sbinfo->max_blocks;
1613 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
1614 buf->f_files = sbinfo->max_inodes;
1615 buf->f_ffree = sbinfo->free_inodes;
1616 spin_unlock(&sbinfo->stat_lock);
1617 }
1618 /* else leave those fields 0 like simple_statfs */
1619 return 0;
1620}
1621
1622/*
1623 * File creation. Allocate an inode, and we're done..
1624 */
1625static int
1626shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1627{
1628 struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
1629 int error = -ENOSPC;
1630
1631 if (inode) {
1632 if (dir->i_mode & S_ISGID) {
1633 inode->i_gid = dir->i_gid;
1634 if (S_ISDIR(mode))
1635 inode->i_mode |= S_ISGID;
1636 }
1637 dir->i_size += BOGO_DIRENT_SIZE;
1638 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1639 d_instantiate(dentry, inode);
1640 dget(dentry); /* Extra count - pin the dentry in core */
1641 error = 0;
1642 }
1643 return error;
1644}
1645
1646static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1647{
1648 int error;
1649
1650 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1651 return error;
1652 dir->i_nlink++;
1653 return 0;
1654}
1655
1656static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
1657 struct nameidata *nd)
1658{
1659 return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1660}
1661
1662/*
1663 * Link a file..
1664 */
1665static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1666{
1667 struct inode *inode = old_dentry->d_inode;
1668 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1669
1670 /*
1671 * No ordinary (disk based) filesystem counts links as inodes;
1672 * but each new link needs a new dentry, pinning lowmem, and
1673 * tmpfs dentries cannot be pruned until they are unlinked.
1674 */
1675 if (sbinfo) {
1676 spin_lock(&sbinfo->stat_lock);
1677 if (!sbinfo->free_inodes) {
1678 spin_unlock(&sbinfo->stat_lock);
1679 return -ENOSPC;
1680 }
1681 sbinfo->free_inodes--;
1682 spin_unlock(&sbinfo->stat_lock);
1683 }
1684
1685 dir->i_size += BOGO_DIRENT_SIZE;
1686 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1687 inode->i_nlink++;
1688 atomic_inc(&inode->i_count); /* New dentry reference */
1689 dget(dentry); /* Extra pinning count for the created dentry */
1690 d_instantiate(dentry, inode);
1691 return 0;
1692}
1693
1694static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1695{
1696 struct inode *inode = dentry->d_inode;
1697
1698 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
1699 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1700 if (sbinfo) {
1701 spin_lock(&sbinfo->stat_lock);
1702 sbinfo->free_inodes++;
1703 spin_unlock(&sbinfo->stat_lock);
1704 }
1705 }
1706
1707 dir->i_size -= BOGO_DIRENT_SIZE;
1708 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1709 inode->i_nlink--;
1710 dput(dentry); /* Undo the count from "create" - this does all the work */
1711 return 0;
1712}
1713
1714static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1715{
1716 if (!simple_empty(dentry))
1717 return -ENOTEMPTY;
1718
1719 dir->i_nlink--;
1720 return shmem_unlink(dir, dentry);
1721}
1722
1723/*
1724 * The VFS layer already does all the dentry stuff for rename,
1725 * we just have to decrement the usage count for the target if
1726 * it exists so that the VFS layer correctly free's it when it
1727 * gets overwritten.
1728 */
1729static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
1730{
1731 struct inode *inode = old_dentry->d_inode;
1732 int they_are_dirs = S_ISDIR(inode->i_mode);
1733
1734 if (!simple_empty(new_dentry))
1735 return -ENOTEMPTY;
1736
1737 if (new_dentry->d_inode) {
1738 (void) shmem_unlink(new_dir, new_dentry);
1739 if (they_are_dirs)
1740 old_dir->i_nlink--;
1741 } else if (they_are_dirs) {
1742 old_dir->i_nlink--;
1743 new_dir->i_nlink++;
1744 }
1745
1746 old_dir->i_size -= BOGO_DIRENT_SIZE;
1747 new_dir->i_size += BOGO_DIRENT_SIZE;
1748 old_dir->i_ctime = old_dir->i_mtime =
1749 new_dir->i_ctime = new_dir->i_mtime =
1750 inode->i_ctime = CURRENT_TIME;
1751 return 0;
1752}
1753
1754static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1755{
1756 int error;
1757 int len;
1758 struct inode *inode;
1759 struct page *page = NULL;
1760 char *kaddr;
1761 struct shmem_inode_info *info;
1762
1763 len = strlen(symname) + 1;
1764 if (len > PAGE_CACHE_SIZE)
1765 return -ENAMETOOLONG;
1766
1767 inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
1768 if (!inode)
1769 return -ENOSPC;
1770
1771 info = SHMEM_I(inode);
1772 inode->i_size = len-1;
1773 if (len <= (char *)inode - (char *)info) {
1774 /* do it inline */
1775 memcpy(info, symname, len);
1776 inode->i_op = &shmem_symlink_inline_operations;
1777 } else {
1778 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
1779 if (error) {
1780 iput(inode);
1781 return error;
1782 }
1783 inode->i_op = &shmem_symlink_inode_operations;
1784 kaddr = kmap_atomic(page, KM_USER0);
1785 memcpy(kaddr, symname, len);
1786 kunmap_atomic(kaddr, KM_USER0);
1787 set_page_dirty(page);
1788 page_cache_release(page);
1789 }
1790 if (dir->i_mode & S_ISGID)
1791 inode->i_gid = dir->i_gid;
1792 dir->i_size += BOGO_DIRENT_SIZE;
1793 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1794 d_instantiate(dentry, inode);
1795 dget(dentry);
1796 return 0;
1797}
1798
1799static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
1800{
1801 nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
1802 return 0;
1803}
1804
1805static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1806{
1807 struct page *page = NULL;
1808 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1809 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1810 return 0;
1811}
1812
1813static void shmem_put_link(struct dentry *dentry, struct nameidata *nd)
1814{
1815 if (!IS_ERR(nd_get_link(nd))) {
1816 struct page *page;
1817
1818 page = find_get_page(dentry->d_inode->i_mapping, 0);
1819 if (!page)
1820 BUG();
1821 kunmap(page);
1822 mark_page_accessed(page);
1823 page_cache_release(page);
1824 page_cache_release(page);
1825 }
1826}
1827
1828static struct inode_operations shmem_symlink_inline_operations = {
1829 .readlink = generic_readlink,
1830 .follow_link = shmem_follow_link_inline,
1831#ifdef CONFIG_TMPFS_XATTR
1832 .setxattr = generic_setxattr,
1833 .getxattr = generic_getxattr,
1834 .listxattr = generic_listxattr,
1835 .removexattr = generic_removexattr,
1836#endif
1837};
1838
1839static struct inode_operations shmem_symlink_inode_operations = {
1840 .truncate = shmem_truncate,
1841 .readlink = generic_readlink,
1842 .follow_link = shmem_follow_link,
1843 .put_link = shmem_put_link,
1844#ifdef CONFIG_TMPFS_XATTR
1845 .setxattr = generic_setxattr,
1846 .getxattr = generic_getxattr,
1847 .listxattr = generic_listxattr,
1848 .removexattr = generic_removexattr,
1849#endif
1850};
1851
1852static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
1853{
1854 char *this_char, *value, *rest;
1855
1856 while ((this_char = strsep(&options, ",")) != NULL) {
1857 if (!*this_char)
1858 continue;
1859 if ((value = strchr(this_char,'=')) != NULL) {
1860 *value++ = 0;
1861 } else {
1862 printk(KERN_ERR
1863 "tmpfs: No value for mount option '%s'\n",
1864 this_char);
1865 return 1;
1866 }
1867
1868 if (!strcmp(this_char,"size")) {
1869 unsigned long long size;
1870 size = memparse(value,&rest);
1871 if (*rest == '%') {
1872 size <<= PAGE_SHIFT;
1873 size *= totalram_pages;
1874 do_div(size, 100);
1875 rest++;
1876 }
1877 if (*rest)
1878 goto bad_val;
1879 *blocks = size >> PAGE_CACHE_SHIFT;
1880 } else if (!strcmp(this_char,"nr_blocks")) {
1881 *blocks = memparse(value,&rest);
1882 if (*rest)
1883 goto bad_val;
1884 } else if (!strcmp(this_char,"nr_inodes")) {
1885 *inodes = memparse(value,&rest);
1886 if (*rest)
1887 goto bad_val;
1888 } else if (!strcmp(this_char,"mode")) {
1889 if (!mode)
1890 continue;
1891 *mode = simple_strtoul(value,&rest,8);
1892 if (*rest)
1893 goto bad_val;
1894 } else if (!strcmp(this_char,"uid")) {
1895 if (!uid)
1896 continue;
1897 *uid = simple_strtoul(value,&rest,0);
1898 if (*rest)
1899 goto bad_val;
1900 } else if (!strcmp(this_char,"gid")) {
1901 if (!gid)
1902 continue;
1903 *gid = simple_strtoul(value,&rest,0);
1904 if (*rest)
1905 goto bad_val;
1906 } else {
1907 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
1908 this_char);
1909 return 1;
1910 }
1911 }
1912 return 0;
1913
1914bad_val:
1915 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
1916 value, this_char);
1917 return 1;
1918
1919}
1920
1921static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
1922{
1923 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1924 unsigned long max_blocks = 0;
1925 unsigned long max_inodes = 0;
1926
1927 if (sbinfo) {
1928 max_blocks = sbinfo->max_blocks;
1929 max_inodes = sbinfo->max_inodes;
1930 }
1931 if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes))
1932 return -EINVAL;
1933 /* Keep it simple: disallow limited <-> unlimited remount */
1934 if ((max_blocks || max_inodes) == !sbinfo)
1935 return -EINVAL;
1936 /* But allow the pointless unlimited -> unlimited remount */
1937 if (!sbinfo)
1938 return 0;
1939 return shmem_set_size(sbinfo, max_blocks, max_inodes);
1940}
1941#endif
1942
1943static void shmem_put_super(struct super_block *sb)
1944{
1945 kfree(sb->s_fs_info);
1946 sb->s_fs_info = NULL;
1947}
1948
1949#ifdef CONFIG_TMPFS_XATTR
1950static struct xattr_handler *shmem_xattr_handlers[];
1951#else
1952#define shmem_xattr_handlers NULL
1953#endif
1954
1955static int shmem_fill_super(struct super_block *sb,
1956 void *data, int silent)
1957{
1958 struct inode *inode;
1959 struct dentry *root;
1960 int mode = S_IRWXUGO | S_ISVTX;
1961 uid_t uid = current->fsuid;
1962 gid_t gid = current->fsgid;
1963 int err = -ENOMEM;
1964
1965#ifdef CONFIG_TMPFS
1966 unsigned long blocks = 0;
1967 unsigned long inodes = 0;
1968
1969 /*
1970 * Per default we only allow half of the physical ram per
1971 * tmpfs instance, limiting inodes to one per page of lowmem;
1972 * but the internal instance is left unlimited.
1973 */
1974 if (!(sb->s_flags & MS_NOUSER)) {
1975 blocks = totalram_pages / 2;
1976 inodes = totalram_pages - totalhigh_pages;
1977 if (inodes > blocks)
1978 inodes = blocks;
1979
1980 if (shmem_parse_options(data, &mode,
1981 &uid, &gid, &blocks, &inodes))
1982 return -EINVAL;
1983 }
1984
1985 if (blocks || inodes) {
1986 struct shmem_sb_info *sbinfo;
1987 sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
1988 if (!sbinfo)
1989 return -ENOMEM;
1990 sb->s_fs_info = sbinfo;
1991 spin_lock_init(&sbinfo->stat_lock);
1992 sbinfo->max_blocks = blocks;
1993 sbinfo->free_blocks = blocks;
1994 sbinfo->max_inodes = inodes;
1995 sbinfo->free_inodes = inodes;
1996 }
1997 sb->s_xattr = shmem_xattr_handlers;
1998#else
1999 sb->s_flags |= MS_NOUSER;
2000#endif
2001
2002 sb->s_maxbytes = SHMEM_MAX_BYTES;
2003 sb->s_blocksize = PAGE_CACHE_SIZE;
2004 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2005 sb->s_magic = TMPFS_MAGIC;
2006 sb->s_op = &shmem_ops;
2007 inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
2008 if (!inode)
2009 goto failed;
2010 inode->i_uid = uid;
2011 inode->i_gid = gid;
2012 root = d_alloc_root(inode);
2013 if (!root)
2014 goto failed_iput;
2015 sb->s_root = root;
2016 return 0;
2017
2018failed_iput:
2019 iput(inode);
2020failed:
2021 shmem_put_super(sb);
2022 return err;
2023}
2024
2025static kmem_cache_t *shmem_inode_cachep;
2026
2027static struct inode *shmem_alloc_inode(struct super_block *sb)
2028{
2029 struct shmem_inode_info *p;
2030 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL);
2031 if (!p)
2032 return NULL;
2033 return &p->vfs_inode;
2034}
2035
2036static void shmem_destroy_inode(struct inode *inode)
2037{
2038 if ((inode->i_mode & S_IFMT) == S_IFREG) {
2039 /* only struct inode is valid if it's an inline symlink */
2040 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2041 }
2042 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2043}
2044
2045static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
2046{
2047 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2048
2049 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2050 SLAB_CTOR_CONSTRUCTOR) {
2051 inode_init_once(&p->vfs_inode);
2052 }
2053}
2054
2055static int init_inodecache(void)
2056{
2057 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2058 sizeof(struct shmem_inode_info),
2059 0, 0, init_once, NULL);
2060 if (shmem_inode_cachep == NULL)
2061 return -ENOMEM;
2062 return 0;
2063}
2064
2065static void destroy_inodecache(void)
2066{
2067 if (kmem_cache_destroy(shmem_inode_cachep))
2068 printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
2069}
2070
2071static struct address_space_operations shmem_aops = {
2072 .writepage = shmem_writepage,
2073 .set_page_dirty = __set_page_dirty_nobuffers,
2074#ifdef CONFIG_TMPFS
2075 .prepare_write = shmem_prepare_write,
2076 .commit_write = simple_commit_write,
2077#endif
2078};
2079
2080static struct file_operations shmem_file_operations = {
2081 .mmap = shmem_mmap,
2082#ifdef CONFIG_TMPFS
2083 .llseek = generic_file_llseek,
2084 .read = shmem_file_read,
2085 .write = shmem_file_write,
2086 .fsync = simple_sync_file,
2087 .sendfile = shmem_file_sendfile,
2088#endif
2089};
2090
2091static struct inode_operations shmem_inode_operations = {
2092 .truncate = shmem_truncate,
2093 .setattr = shmem_notify_change,
2094#ifdef CONFIG_TMPFS_XATTR
2095 .setxattr = generic_setxattr,
2096 .getxattr = generic_getxattr,
2097 .listxattr = generic_listxattr,
2098 .removexattr = generic_removexattr,
2099#endif
2100};
2101
2102static struct inode_operations shmem_dir_inode_operations = {
2103#ifdef CONFIG_TMPFS
2104 .create = shmem_create,
2105 .lookup = simple_lookup,
2106 .link = shmem_link,
2107 .unlink = shmem_unlink,
2108 .symlink = shmem_symlink,
2109 .mkdir = shmem_mkdir,
2110 .rmdir = shmem_rmdir,
2111 .mknod = shmem_mknod,
2112 .rename = shmem_rename,
2113#ifdef CONFIG_TMPFS_XATTR
2114 .setxattr = generic_setxattr,
2115 .getxattr = generic_getxattr,
2116 .listxattr = generic_listxattr,
2117 .removexattr = generic_removexattr,
2118#endif
2119#endif
2120};
2121
2122static struct inode_operations shmem_special_inode_operations = {
2123#ifdef CONFIG_TMPFS_XATTR
2124 .setxattr = generic_setxattr,
2125 .getxattr = generic_getxattr,
2126 .listxattr = generic_listxattr,
2127 .removexattr = generic_removexattr,
2128#endif
2129};
2130
2131static struct super_operations shmem_ops = {
2132 .alloc_inode = shmem_alloc_inode,
2133 .destroy_inode = shmem_destroy_inode,
2134#ifdef CONFIG_TMPFS
2135 .statfs = shmem_statfs,
2136 .remount_fs = shmem_remount_fs,
2137#endif
2138 .delete_inode = shmem_delete_inode,
2139 .drop_inode = generic_delete_inode,
2140 .put_super = shmem_put_super,
2141};
2142
2143static struct vm_operations_struct shmem_vm_ops = {
2144 .nopage = shmem_nopage,
2145 .populate = shmem_populate,
2146#ifdef CONFIG_NUMA
2147 .set_policy = shmem_set_policy,
2148 .get_policy = shmem_get_policy,
2149#endif
2150};
2151
2152
2153#ifdef CONFIG_TMPFS_SECURITY
2154
2155static size_t shmem_xattr_security_list(struct inode *inode, char *list, size_t list_len,
2156 const char *name, size_t name_len)
2157{
2158 return security_inode_listsecurity(inode, list, list_len);
2159}
2160
2161static int shmem_xattr_security_get(struct inode *inode, const char *name, void *buffer, size_t size)
2162{
2163 if (strcmp(name, "") == 0)
2164 return -EINVAL;
2165 return security_inode_getsecurity(inode, name, buffer, size);
2166}
2167
2168static int shmem_xattr_security_set(struct inode *inode, const char *name, const void *value, size_t size, int flags)
2169{
2170 if (strcmp(name, "") == 0)
2171 return -EINVAL;
2172 return security_inode_setsecurity(inode, name, value, size, flags);
2173}
2174
2175static struct xattr_handler shmem_xattr_security_handler = {
2176 .prefix = XATTR_SECURITY_PREFIX,
2177 .list = shmem_xattr_security_list,
2178 .get = shmem_xattr_security_get,
2179 .set = shmem_xattr_security_set,
2180};
2181
2182#endif /* CONFIG_TMPFS_SECURITY */
2183
2184#ifdef CONFIG_TMPFS_XATTR
2185
2186static struct xattr_handler *shmem_xattr_handlers[] = {
2187#ifdef CONFIG_TMPFS_SECURITY
2188 &shmem_xattr_security_handler,
2189#endif
2190 NULL
2191};
2192
2193#endif /* CONFIG_TMPFS_XATTR */
2194
2195static struct super_block *shmem_get_sb(struct file_system_type *fs_type,
2196 int flags, const char *dev_name, void *data)
2197{
2198 return get_sb_nodev(fs_type, flags, data, shmem_fill_super);
2199}
2200
2201static struct file_system_type tmpfs_fs_type = {
2202 .owner = THIS_MODULE,
2203 .name = "tmpfs",
2204 .get_sb = shmem_get_sb,
2205 .kill_sb = kill_litter_super,
2206};
2207static struct vfsmount *shm_mnt;
2208
2209static int __init init_tmpfs(void)
2210{
2211 int error;
2212
2213 error = init_inodecache();
2214 if (error)
2215 goto out3;
2216
2217 error = register_filesystem(&tmpfs_fs_type);
2218 if (error) {
2219 printk(KERN_ERR "Could not register tmpfs\n");
2220 goto out2;
2221 }
2222#ifdef CONFIG_TMPFS
2223 devfs_mk_dir("shm");
2224#endif
2225 shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER,
2226 tmpfs_fs_type.name, NULL);
2227 if (IS_ERR(shm_mnt)) {
2228 error = PTR_ERR(shm_mnt);
2229 printk(KERN_ERR "Could not kern_mount tmpfs\n");
2230 goto out1;
2231 }
2232 return 0;
2233
2234out1:
2235 unregister_filesystem(&tmpfs_fs_type);
2236out2:
2237 destroy_inodecache();
2238out3:
2239 shm_mnt = ERR_PTR(error);
2240 return error;
2241}
2242module_init(init_tmpfs)
2243
2244/*
2245 * shmem_file_setup - get an unlinked file living in tmpfs
2246 *
2247 * @name: name for dentry (to be seen in /proc/<pid>/maps
2248 * @size: size to be set for the file
2249 *
2250 */
2251struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2252{
2253 int error;
2254 struct file *file;
2255 struct inode *inode;
2256 struct dentry *dentry, *root;
2257 struct qstr this;
2258
2259 if (IS_ERR(shm_mnt))
2260 return (void *)shm_mnt;
2261
2262 if (size < 0 || size > SHMEM_MAX_BYTES)
2263 return ERR_PTR(-EINVAL);
2264
2265 if (shmem_acct_size(flags, size))
2266 return ERR_PTR(-ENOMEM);
2267
2268 error = -ENOMEM;
2269 this.name = name;
2270 this.len = strlen(name);
2271 this.hash = 0; /* will go */
2272 root = shm_mnt->mnt_root;
2273 dentry = d_alloc(root, &this);
2274 if (!dentry)
2275 goto put_memory;
2276
2277 error = -ENFILE;
2278 file = get_empty_filp();
2279 if (!file)
2280 goto put_dentry;
2281
2282 error = -ENOSPC;
2283 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
2284 if (!inode)
2285 goto close_file;
2286
2287 SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
2288 d_instantiate(dentry, inode);
2289 inode->i_size = size;
2290 inode->i_nlink = 0; /* It is unlinked */
2291 file->f_vfsmnt = mntget(shm_mnt);
2292 file->f_dentry = dentry;
2293 file->f_mapping = inode->i_mapping;
2294 file->f_op = &shmem_file_operations;
2295 file->f_mode = FMODE_WRITE | FMODE_READ;
2296 return file;
2297
2298close_file:
2299 put_filp(file);
2300put_dentry:
2301 dput(dentry);
2302put_memory:
2303 shmem_unacct_size(flags, size);
2304 return ERR_PTR(error);
2305}
2306
2307/*
2308 * shmem_zero_setup - setup a shared anonymous mapping
2309 *
2310 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
2311 */
2312int shmem_zero_setup(struct vm_area_struct *vma)
2313{
2314 struct file *file;
2315 loff_t size = vma->vm_end - vma->vm_start;
2316
2317 file = shmem_file_setup("dev/zero", size, vma->vm_flags);
2318 if (IS_ERR(file))
2319 return PTR_ERR(file);
2320
2321 if (vma->vm_file)
2322 fput(vma->vm_file);
2323 vma->vm_file = file;
2324 vma->vm_ops = &shmem_vm_ops;
2325 return 0;
2326}
diff --git a/mm/slab.c b/mm/slab.c
new file mode 100644
index 00000000000..ec660d85ddd
--- /dev/null
+++ b/mm/slab.c
@@ -0,0 +1,3060 @@
1/*
2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk)
5 *
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7 *
8 * Major cleanup, different bufctl logic, per-cpu arrays
9 * (c) 2000 Manfred Spraul
10 *
11 * Cleanup, make the head arrays unconditional, preparation for NUMA
12 * (c) 2002 Manfred Spraul
13 *
14 * An implementation of the Slab Allocator as described in outline in;
15 * UNIX Internals: The New Frontiers by Uresh Vahalia
16 * Pub: Prentice Hall ISBN 0-13-101908-2
17 * or with a little more detail in;
18 * The Slab Allocator: An Object-Caching Kernel Memory Allocator
19 * Jeff Bonwick (Sun Microsystems).
20 * Presented at: USENIX Summer 1994 Technical Conference
21 *
22 * The memory is organized in caches, one cache for each object type.
23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24 * Each cache consists of many slabs (they are small (usually one
25 * page long) and always contiguous), and each slab contains multiple
26 * initialized objects.
27 *
28 * This means, that your constructor is used only for newly allocated
29 * slabs and you must pass objects with the same initializations to
30 * kmem_cache_free.
31 *
32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33 * normal). If you need a special memory type, then must create a new
34 * cache for that memory type.
35 *
36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37 * full slabs with 0 free objects
38 * partial slabs
39 * empty slabs with no allocated objects
40 *
41 * If partial slabs exist, then new allocations come from these slabs,
42 * otherwise from empty slabs or new slabs are allocated.
43 *
44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46 *
47 * Each cache has a short per-cpu head array, most allocs
48 * and frees go into that array, and if that array overflows, then 1/2
49 * of the entries in the array are given back into the global cache.
50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations.
52 *
53 * The c_cpuarray may not be read with enabled local interrupts -
54 * it's changed with a smp_call_function().
55 *
56 * SMP synchronization:
57 * constructors and destructors are called without any locking.
58 * Several members in kmem_cache_t and struct slab never change, they
59 * are accessed without any locking.
60 * The per-cpu arrays are never accessed from the wrong cpu, no locking,
61 * and local interrupts are disabled so slab code is preempt-safe.
62 * The non-constant members are protected with a per-cache irq spinlock.
63 *
64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65 * in 2000 - many ideas in the current implementation are derived from
66 * his patch.
67 *
68 * Further notes from the original documentation:
69 *
70 * 11 April '97. Started multi-threading - markhe
71 * The global cache-chain is protected by the semaphore 'cache_chain_sem'.
72 * The sem is only needed when accessing/extending the cache-chain, which
73 * can never happen inside an interrupt (kmem_cache_create(),
74 * kmem_cache_shrink() and kmem_cache_reap()).
75 *
76 * At present, each engine can be growing a cache. This should be blocked.
77 *
78 */
79
80#include <linux/config.h>
81#include <linux/slab.h>
82#include <linux/mm.h>
83#include <linux/swap.h>
84#include <linux/cache.h>
85#include <linux/interrupt.h>
86#include <linux/init.h>
87#include <linux/compiler.h>
88#include <linux/seq_file.h>
89#include <linux/notifier.h>
90#include <linux/kallsyms.h>
91#include <linux/cpu.h>
92#include <linux/sysctl.h>
93#include <linux/module.h>
94#include <linux/rcupdate.h>
95
96#include <asm/uaccess.h>
97#include <asm/cacheflush.h>
98#include <asm/tlbflush.h>
99#include <asm/page.h>
100
101/*
102 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
103 * SLAB_RED_ZONE & SLAB_POISON.
104 * 0 for faster, smaller code (especially in the critical paths).
105 *
106 * STATS - 1 to collect stats for /proc/slabinfo.
107 * 0 for faster, smaller code (especially in the critical paths).
108 *
109 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
110 */
111
112#ifdef CONFIG_DEBUG_SLAB
113#define DEBUG 1
114#define STATS 1
115#define FORCED_DEBUG 1
116#else
117#define DEBUG 0
118#define STATS 0
119#define FORCED_DEBUG 0
120#endif
121
122
123/* Shouldn't this be in a header file somewhere? */
124#define BYTES_PER_WORD sizeof(void *)
125
126#ifndef cache_line_size
127#define cache_line_size() L1_CACHE_BYTES
128#endif
129
130#ifndef ARCH_KMALLOC_MINALIGN
131/*
132 * Enforce a minimum alignment for the kmalloc caches.
133 * Usually, the kmalloc caches are cache_line_size() aligned, except when
134 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
135 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
136 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
137 * Note that this flag disables some debug features.
138 */
139#define ARCH_KMALLOC_MINALIGN 0
140#endif
141
142#ifndef ARCH_SLAB_MINALIGN
143/*
144 * Enforce a minimum alignment for all caches.
145 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
146 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
147 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
148 * some debug features.
149 */
150#define ARCH_SLAB_MINALIGN 0
151#endif
152
153#ifndef ARCH_KMALLOC_FLAGS
154#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
155#endif
156
157/* Legal flag mask for kmem_cache_create(). */
158#if DEBUG
159# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
160 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
161 SLAB_NO_REAP | SLAB_CACHE_DMA | \
162 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
163 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
164 SLAB_DESTROY_BY_RCU)
165#else
166# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
167 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
168 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
169 SLAB_DESTROY_BY_RCU)
170#endif
171
172/*
173 * kmem_bufctl_t:
174 *
175 * Bufctl's are used for linking objs within a slab
176 * linked offsets.
177 *
178 * This implementation relies on "struct page" for locating the cache &
179 * slab an object belongs to.
180 * This allows the bufctl structure to be small (one int), but limits
181 * the number of objects a slab (not a cache) can contain when off-slab
182 * bufctls are used. The limit is the size of the largest general cache
183 * that does not use off-slab slabs.
184 * For 32bit archs with 4 kB pages, this is 56.
185 * This is not serious, as it is only for large objects, when it is unwise
186 * to have too many per slab.
187 * Note: This limit can be raised by introducing a general cache whose size
188 * is less than 512 (PAGE_SIZE>>3), but greater than 256.
189 */
190
191#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
192#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
193#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2)
194
195/* Max number of objs-per-slab for caches which use off-slab slabs.
196 * Needed to avoid a possible looping condition in cache_grow().
197 */
198static unsigned long offslab_limit;
199
200/*
201 * struct slab
202 *
203 * Manages the objs in a slab. Placed either at the beginning of mem allocated
204 * for a slab, or allocated from a general cache.
205 * Slabs are chained into three lists: fully used, partial, fully free slabs.
206 */
207struct slab {
208 struct list_head list;
209 unsigned long colouroff;
210 void *s_mem; /* including colour offset */
211 unsigned int inuse; /* num of objs active in slab */
212 kmem_bufctl_t free;
213};
214
215/*
216 * struct slab_rcu
217 *
218 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
219 * arrange for kmem_freepages to be called via RCU. This is useful if
220 * we need to approach a kernel structure obliquely, from its address
221 * obtained without the usual locking. We can lock the structure to
222 * stabilize it and check it's still at the given address, only if we
223 * can be sure that the memory has not been meanwhile reused for some
224 * other kind of object (which our subsystem's lock might corrupt).
225 *
226 * rcu_read_lock before reading the address, then rcu_read_unlock after
227 * taking the spinlock within the structure expected at that address.
228 *
229 * We assume struct slab_rcu can overlay struct slab when destroying.
230 */
231struct slab_rcu {
232 struct rcu_head head;
233 kmem_cache_t *cachep;
234 void *addr;
235};
236
237/*
238 * struct array_cache
239 *
240 * Per cpu structures
241 * Purpose:
242 * - LIFO ordering, to hand out cache-warm objects from _alloc
243 * - reduce the number of linked list operations
244 * - reduce spinlock operations
245 *
246 * The limit is stored in the per-cpu structure to reduce the data cache
247 * footprint.
248 *
249 */
250struct array_cache {
251 unsigned int avail;
252 unsigned int limit;
253 unsigned int batchcount;
254 unsigned int touched;
255};
256
257/* bootstrap: The caches do not work without cpuarrays anymore,
258 * but the cpuarrays are allocated from the generic caches...
259 */
260#define BOOT_CPUCACHE_ENTRIES 1
261struct arraycache_init {
262 struct array_cache cache;
263 void * entries[BOOT_CPUCACHE_ENTRIES];
264};
265
266/*
267 * The slab lists of all objects.
268 * Hopefully reduce the internal fragmentation
269 * NUMA: The spinlock could be moved from the kmem_cache_t
270 * into this structure, too. Figure out what causes
271 * fewer cross-node spinlock operations.
272 */
273struct kmem_list3 {
274 struct list_head slabs_partial; /* partial list first, better asm code */
275 struct list_head slabs_full;
276 struct list_head slabs_free;
277 unsigned long free_objects;
278 int free_touched;
279 unsigned long next_reap;
280 struct array_cache *shared;
281};
282
283#define LIST3_INIT(parent) \
284 { \
285 .slabs_full = LIST_HEAD_INIT(parent.slabs_full), \
286 .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \
287 .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \
288 }
289#define list3_data(cachep) \
290 (&(cachep)->lists)
291
292/* NUMA: per-node */
293#define list3_data_ptr(cachep, ptr) \
294 list3_data(cachep)
295
296/*
297 * kmem_cache_t
298 *
299 * manages a cache.
300 */
301
302struct kmem_cache_s {
303/* 1) per-cpu data, touched during every alloc/free */
304 struct array_cache *array[NR_CPUS];
305 unsigned int batchcount;
306 unsigned int limit;
307/* 2) touched by every alloc & free from the backend */
308 struct kmem_list3 lists;
309 /* NUMA: kmem_3list_t *nodelists[MAX_NUMNODES] */
310 unsigned int objsize;
311 unsigned int flags; /* constant flags */
312 unsigned int num; /* # of objs per slab */
313 unsigned int free_limit; /* upper limit of objects in the lists */
314 spinlock_t spinlock;
315
316/* 3) cache_grow/shrink */
317 /* order of pgs per slab (2^n) */
318 unsigned int gfporder;
319
320 /* force GFP flags, e.g. GFP_DMA */
321 unsigned int gfpflags;
322
323 size_t colour; /* cache colouring range */
324 unsigned int colour_off; /* colour offset */
325 unsigned int colour_next; /* cache colouring */
326 kmem_cache_t *slabp_cache;
327 unsigned int slab_size;
328 unsigned int dflags; /* dynamic flags */
329
330 /* constructor func */
331 void (*ctor)(void *, kmem_cache_t *, unsigned long);
332
333 /* de-constructor func */
334 void (*dtor)(void *, kmem_cache_t *, unsigned long);
335
336/* 4) cache creation/removal */
337 const char *name;
338 struct list_head next;
339
340/* 5) statistics */
341#if STATS
342 unsigned long num_active;
343 unsigned long num_allocations;
344 unsigned long high_mark;
345 unsigned long grown;
346 unsigned long reaped;
347 unsigned long errors;
348 unsigned long max_freeable;
349 unsigned long node_allocs;
350 atomic_t allochit;
351 atomic_t allocmiss;
352 atomic_t freehit;
353 atomic_t freemiss;
354#endif
355#if DEBUG
356 int dbghead;
357 int reallen;
358#endif
359};
360
361#define CFLGS_OFF_SLAB (0x80000000UL)
362#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
363
364#define BATCHREFILL_LIMIT 16
365/* Optimization question: fewer reaps means less
366 * probability for unnecessary cpucache drain/refill cycles.
367 *
368 * OTOH the cpuarrays can contain lots of objects,
369 * which could lock up otherwise freeable slabs.
370 */
371#define REAPTIMEOUT_CPUC (2*HZ)
372#define REAPTIMEOUT_LIST3 (4*HZ)
373
374#if STATS
375#define STATS_INC_ACTIVE(x) ((x)->num_active++)
376#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
377#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
378#define STATS_INC_GROWN(x) ((x)->grown++)
379#define STATS_INC_REAPED(x) ((x)->reaped++)
380#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \
381 (x)->high_mark = (x)->num_active; \
382 } while (0)
383#define STATS_INC_ERR(x) ((x)->errors++)
384#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
385#define STATS_SET_FREEABLE(x, i) \
386 do { if ((x)->max_freeable < i) \
387 (x)->max_freeable = i; \
388 } while (0)
389
390#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
391#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
392#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
393#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
394#else
395#define STATS_INC_ACTIVE(x) do { } while (0)
396#define STATS_DEC_ACTIVE(x) do { } while (0)
397#define STATS_INC_ALLOCED(x) do { } while (0)
398#define STATS_INC_GROWN(x) do { } while (0)
399#define STATS_INC_REAPED(x) do { } while (0)
400#define STATS_SET_HIGH(x) do { } while (0)
401#define STATS_INC_ERR(x) do { } while (0)
402#define STATS_INC_NODEALLOCS(x) do { } while (0)
403#define STATS_SET_FREEABLE(x, i) \
404 do { } while (0)
405
406#define STATS_INC_ALLOCHIT(x) do { } while (0)
407#define STATS_INC_ALLOCMISS(x) do { } while (0)
408#define STATS_INC_FREEHIT(x) do { } while (0)
409#define STATS_INC_FREEMISS(x) do { } while (0)
410#endif
411
412#if DEBUG
413/* Magic nums for obj red zoning.
414 * Placed in the first word before and the first word after an obj.
415 */
416#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */
417#define RED_ACTIVE 0x170FC2A5UL /* when obj is active */
418
419/* ...and for poisoning */
420#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */
421#define POISON_FREE 0x6b /* for use-after-free poisoning */
422#define POISON_END 0xa5 /* end-byte of poisoning */
423
424/* memory layout of objects:
425 * 0 : objp
426 * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that
427 * the end of an object is aligned with the end of the real
428 * allocation. Catches writes behind the end of the allocation.
429 * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1:
430 * redzone word.
431 * cachep->dbghead: The real object.
432 * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
433 * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
434 */
435static int obj_dbghead(kmem_cache_t *cachep)
436{
437 return cachep->dbghead;
438}
439
440static int obj_reallen(kmem_cache_t *cachep)
441{
442 return cachep->reallen;
443}
444
445static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp)
446{
447 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
448 return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD);
449}
450
451static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
452{
453 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
454 if (cachep->flags & SLAB_STORE_USER)
455 return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD);
456 return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD);
457}
458
459static void **dbg_userword(kmem_cache_t *cachep, void *objp)
460{
461 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
462 return (void**)(objp+cachep->objsize-BYTES_PER_WORD);
463}
464
465#else
466
467#define obj_dbghead(x) 0
468#define obj_reallen(cachep) (cachep->objsize)
469#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;})
470#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;})
471#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
472
473#endif
474
475/*
476 * Maximum size of an obj (in 2^order pages)
477 * and absolute limit for the gfp order.
478 */
479#if defined(CONFIG_LARGE_ALLOCS)
480#define MAX_OBJ_ORDER 13 /* up to 32Mb */
481#define MAX_GFP_ORDER 13 /* up to 32Mb */
482#elif defined(CONFIG_MMU)
483#define MAX_OBJ_ORDER 5 /* 32 pages */
484#define MAX_GFP_ORDER 5 /* 32 pages */
485#else
486#define MAX_OBJ_ORDER 8 /* up to 1Mb */
487#define MAX_GFP_ORDER 8 /* up to 1Mb */
488#endif
489
490/*
491 * Do not go above this order unless 0 objects fit into the slab.
492 */
493#define BREAK_GFP_ORDER_HI 1
494#define BREAK_GFP_ORDER_LO 0
495static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
496
497/* Macros for storing/retrieving the cachep and or slab from the
498 * global 'mem_map'. These are used to find the slab an obj belongs to.
499 * With kfree(), these are used to find the cache which an obj belongs to.
500 */
501#define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x))
502#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next)
503#define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x))
504#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev)
505
506/* These are the default caches for kmalloc. Custom caches can have other sizes. */
507struct cache_sizes malloc_sizes[] = {
508#define CACHE(x) { .cs_size = (x) },
509#include <linux/kmalloc_sizes.h>
510 CACHE(ULONG_MAX)
511#undef CACHE
512};
513EXPORT_SYMBOL(malloc_sizes);
514
515/* Must match cache_sizes above. Out of line to keep cache footprint low. */
516struct cache_names {
517 char *name;
518 char *name_dma;
519};
520
521static struct cache_names __initdata cache_names[] = {
522#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
523#include <linux/kmalloc_sizes.h>
524 { NULL, }
525#undef CACHE
526};
527
528static struct arraycache_init initarray_cache __initdata =
529 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
530static struct arraycache_init initarray_generic =
531 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
532
533/* internal cache of cache description objs */
534static kmem_cache_t cache_cache = {
535 .lists = LIST3_INIT(cache_cache.lists),
536 .batchcount = 1,
537 .limit = BOOT_CPUCACHE_ENTRIES,
538 .objsize = sizeof(kmem_cache_t),
539 .flags = SLAB_NO_REAP,
540 .spinlock = SPIN_LOCK_UNLOCKED,
541 .name = "kmem_cache",
542#if DEBUG
543 .reallen = sizeof(kmem_cache_t),
544#endif
545};
546
547/* Guard access to the cache-chain. */
548static struct semaphore cache_chain_sem;
549static struct list_head cache_chain;
550
551/*
552 * vm_enough_memory() looks at this to determine how many
553 * slab-allocated pages are possibly freeable under pressure
554 *
555 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
556 */
557atomic_t slab_reclaim_pages;
558EXPORT_SYMBOL(slab_reclaim_pages);
559
560/*
561 * chicken and egg problem: delay the per-cpu array allocation
562 * until the general caches are up.
563 */
564static enum {
565 NONE,
566 PARTIAL,
567 FULL
568} g_cpucache_up;
569
570static DEFINE_PER_CPU(struct work_struct, reap_work);
571
572static void free_block(kmem_cache_t* cachep, void** objpp, int len);
573static void enable_cpucache (kmem_cache_t *cachep);
574static void cache_reap (void *unused);
575
576static inline void **ac_entry(struct array_cache *ac)
577{
578 return (void**)(ac+1);
579}
580
581static inline struct array_cache *ac_data(kmem_cache_t *cachep)
582{
583 return cachep->array[smp_processor_id()];
584}
585
586static inline kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags)
587{
588 struct cache_sizes *csizep = malloc_sizes;
589
590#if DEBUG
591 /* This happens if someone tries to call
592 * kmem_cache_create(), or __kmalloc(), before
593 * the generic caches are initialized.
594 */
595 BUG_ON(csizep->cs_cachep == NULL);
596#endif
597 while (size > csizep->cs_size)
598 csizep++;
599
600 /*
601 * Really subtile: The last entry with cs->cs_size==ULONG_MAX
602 * has cs_{dma,}cachep==NULL. Thus no special case
603 * for large kmalloc calls required.
604 */
605 if (unlikely(gfpflags & GFP_DMA))
606 return csizep->cs_dmacachep;
607 return csizep->cs_cachep;
608}
609
610/* Cal the num objs, wastage, and bytes left over for a given slab size. */
611static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
612 int flags, size_t *left_over, unsigned int *num)
613{
614 int i;
615 size_t wastage = PAGE_SIZE<<gfporder;
616 size_t extra = 0;
617 size_t base = 0;
618
619 if (!(flags & CFLGS_OFF_SLAB)) {
620 base = sizeof(struct slab);
621 extra = sizeof(kmem_bufctl_t);
622 }
623 i = 0;
624 while (i*size + ALIGN(base+i*extra, align) <= wastage)
625 i++;
626 if (i > 0)
627 i--;
628
629 if (i > SLAB_LIMIT)
630 i = SLAB_LIMIT;
631
632 *num = i;
633 wastage -= i*size;
634 wastage -= ALIGN(base+i*extra, align);
635 *left_over = wastage;
636}
637
638#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
639
640static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
641{
642 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
643 function, cachep->name, msg);
644 dump_stack();
645}
646
647/*
648 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
649 * via the workqueue/eventd.
650 * Add the CPU number into the expiration time to minimize the possibility of
651 * the CPUs getting into lockstep and contending for the global cache chain
652 * lock.
653 */
654static void __devinit start_cpu_timer(int cpu)
655{
656 struct work_struct *reap_work = &per_cpu(reap_work, cpu);
657
658 /*
659 * When this gets called from do_initcalls via cpucache_init(),
660 * init_workqueues() has already run, so keventd will be setup
661 * at that time.
662 */
663 if (keventd_up() && reap_work->func == NULL) {
664 INIT_WORK(reap_work, cache_reap, NULL);
665 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
666 }
667}
668
669static struct array_cache *alloc_arraycache(int cpu, int entries,
670 int batchcount)
671{
672 int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
673 struct array_cache *nc = NULL;
674
675 if (cpu != -1) {
676 kmem_cache_t *cachep;
677 cachep = kmem_find_general_cachep(memsize, GFP_KERNEL);
678 if (cachep)
679 nc = kmem_cache_alloc_node(cachep, cpu_to_node(cpu));
680 }
681 if (!nc)
682 nc = kmalloc(memsize, GFP_KERNEL);
683 if (nc) {
684 nc->avail = 0;
685 nc->limit = entries;
686 nc->batchcount = batchcount;
687 nc->touched = 0;
688 }
689 return nc;
690}
691
692static int __devinit cpuup_callback(struct notifier_block *nfb,
693 unsigned long action, void *hcpu)
694{
695 long cpu = (long)hcpu;
696 kmem_cache_t* cachep;
697
698 switch (action) {
699 case CPU_UP_PREPARE:
700 down(&cache_chain_sem);
701 list_for_each_entry(cachep, &cache_chain, next) {
702 struct array_cache *nc;
703
704 nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
705 if (!nc)
706 goto bad;
707
708 spin_lock_irq(&cachep->spinlock);
709 cachep->array[cpu] = nc;
710 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
711 + cachep->num;
712 spin_unlock_irq(&cachep->spinlock);
713
714 }
715 up(&cache_chain_sem);
716 break;
717 case CPU_ONLINE:
718 start_cpu_timer(cpu);
719 break;
720#ifdef CONFIG_HOTPLUG_CPU
721 case CPU_DEAD:
722 /* fall thru */
723 case CPU_UP_CANCELED:
724 down(&cache_chain_sem);
725
726 list_for_each_entry(cachep, &cache_chain, next) {
727 struct array_cache *nc;
728
729 spin_lock_irq(&cachep->spinlock);
730 /* cpu is dead; no one can alloc from it. */
731 nc = cachep->array[cpu];
732 cachep->array[cpu] = NULL;
733 cachep->free_limit -= cachep->batchcount;
734 free_block(cachep, ac_entry(nc), nc->avail);
735 spin_unlock_irq(&cachep->spinlock);
736 kfree(nc);
737 }
738 up(&cache_chain_sem);
739 break;
740#endif
741 }
742 return NOTIFY_OK;
743bad:
744 up(&cache_chain_sem);
745 return NOTIFY_BAD;
746}
747
748static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
749
750/* Initialisation.
751 * Called after the gfp() functions have been enabled, and before smp_init().
752 */
753void __init kmem_cache_init(void)
754{
755 size_t left_over;
756 struct cache_sizes *sizes;
757 struct cache_names *names;
758
759 /*
760 * Fragmentation resistance on low memory - only use bigger
761 * page orders on machines with more than 32MB of memory.
762 */
763 if (num_physpages > (32 << 20) >> PAGE_SHIFT)
764 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
765
766
767 /* Bootstrap is tricky, because several objects are allocated
768 * from caches that do not exist yet:
769 * 1) initialize the cache_cache cache: it contains the kmem_cache_t
770 * structures of all caches, except cache_cache itself: cache_cache
771 * is statically allocated.
772 * Initially an __init data area is used for the head array, it's
773 * replaced with a kmalloc allocated array at the end of the bootstrap.
774 * 2) Create the first kmalloc cache.
775 * The kmem_cache_t for the new cache is allocated normally. An __init
776 * data area is used for the head array.
777 * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
778 * 4) Replace the __init data head arrays for cache_cache and the first
779 * kmalloc cache with kmalloc allocated arrays.
780 * 5) Resize the head arrays of the kmalloc caches to their final sizes.
781 */
782
783 /* 1) create the cache_cache */
784 init_MUTEX(&cache_chain_sem);
785 INIT_LIST_HEAD(&cache_chain);
786 list_add(&cache_cache.next, &cache_chain);
787 cache_cache.colour_off = cache_line_size();
788 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
789
790 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
791
792 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
793 &left_over, &cache_cache.num);
794 if (!cache_cache.num)
795 BUG();
796
797 cache_cache.colour = left_over/cache_cache.colour_off;
798 cache_cache.colour_next = 0;
799 cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
800 sizeof(struct slab), cache_line_size());
801
802 /* 2+3) create the kmalloc caches */
803 sizes = malloc_sizes;
804 names = cache_names;
805
806 while (sizes->cs_size != ULONG_MAX) {
807 /* For performance, all the general caches are L1 aligned.
808 * This should be particularly beneficial on SMP boxes, as it
809 * eliminates "false sharing".
810 * Note for systems short on memory removing the alignment will
811 * allow tighter packing of the smaller caches. */
812 sizes->cs_cachep = kmem_cache_create(names->name,
813 sizes->cs_size, ARCH_KMALLOC_MINALIGN,
814 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
815
816 /* Inc off-slab bufctl limit until the ceiling is hit. */
817 if (!(OFF_SLAB(sizes->cs_cachep))) {
818 offslab_limit = sizes->cs_size-sizeof(struct slab);
819 offslab_limit /= sizeof(kmem_bufctl_t);
820 }
821
822 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
823 sizes->cs_size, ARCH_KMALLOC_MINALIGN,
824 (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC),
825 NULL, NULL);
826
827 sizes++;
828 names++;
829 }
830 /* 4) Replace the bootstrap head arrays */
831 {
832 void * ptr;
833
834 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
835 local_irq_disable();
836 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
837 memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
838 cache_cache.array[smp_processor_id()] = ptr;
839 local_irq_enable();
840
841 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
842 local_irq_disable();
843 BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
844 memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
845 sizeof(struct arraycache_init));
846 malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
847 local_irq_enable();
848 }
849
850 /* 5) resize the head arrays to their final sizes */
851 {
852 kmem_cache_t *cachep;
853 down(&cache_chain_sem);
854 list_for_each_entry(cachep, &cache_chain, next)
855 enable_cpucache(cachep);
856 up(&cache_chain_sem);
857 }
858
859 /* Done! */
860 g_cpucache_up = FULL;
861
862 /* Register a cpu startup notifier callback
863 * that initializes ac_data for all new cpus
864 */
865 register_cpu_notifier(&cpucache_notifier);
866
867
868 /* The reap timers are started later, with a module init call:
869 * That part of the kernel is not yet operational.
870 */
871}
872
873static int __init cpucache_init(void)
874{
875 int cpu;
876
877 /*
878 * Register the timers that return unneeded
879 * pages to gfp.
880 */
881 for (cpu = 0; cpu < NR_CPUS; cpu++) {
882 if (cpu_online(cpu))
883 start_cpu_timer(cpu);
884 }
885
886 return 0;
887}
888
889__initcall(cpucache_init);
890
891/*
892 * Interface to system's page allocator. No need to hold the cache-lock.
893 *
894 * If we requested dmaable memory, we will get it. Even if we
895 * did not request dmaable memory, we might get it, but that
896 * would be relatively rare and ignorable.
897 */
898static void *kmem_getpages(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
899{
900 struct page *page;
901 void *addr;
902 int i;
903
904 flags |= cachep->gfpflags;
905 if (likely(nodeid == -1)) {
906 page = alloc_pages(flags, cachep->gfporder);
907 } else {
908 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
909 }
910 if (!page)
911 return NULL;
912 addr = page_address(page);
913
914 i = (1 << cachep->gfporder);
915 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
916 atomic_add(i, &slab_reclaim_pages);
917 add_page_state(nr_slab, i);
918 while (i--) {
919 SetPageSlab(page);
920 page++;
921 }
922 return addr;
923}
924
925/*
926 * Interface to system's page release.
927 */
928static void kmem_freepages(kmem_cache_t *cachep, void *addr)
929{
930 unsigned long i = (1<<cachep->gfporder);
931 struct page *page = virt_to_page(addr);
932 const unsigned long nr_freed = i;
933
934 while (i--) {
935 if (!TestClearPageSlab(page))
936 BUG();
937 page++;
938 }
939 sub_page_state(nr_slab, nr_freed);
940 if (current->reclaim_state)
941 current->reclaim_state->reclaimed_slab += nr_freed;
942 free_pages((unsigned long)addr, cachep->gfporder);
943 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
944 atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
945}
946
947static void kmem_rcu_free(struct rcu_head *head)
948{
949 struct slab_rcu *slab_rcu = (struct slab_rcu *) head;
950 kmem_cache_t *cachep = slab_rcu->cachep;
951
952 kmem_freepages(cachep, slab_rcu->addr);
953 if (OFF_SLAB(cachep))
954 kmem_cache_free(cachep->slabp_cache, slab_rcu);
955}
956
957#if DEBUG
958
959#ifdef CONFIG_DEBUG_PAGEALLOC
960static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
961 unsigned long caller)
962{
963 int size = obj_reallen(cachep);
964
965 addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)];
966
967 if (size < 5*sizeof(unsigned long))
968 return;
969
970 *addr++=0x12345678;
971 *addr++=caller;
972 *addr++=smp_processor_id();
973 size -= 3*sizeof(unsigned long);
974 {
975 unsigned long *sptr = &caller;
976 unsigned long svalue;
977
978 while (!kstack_end(sptr)) {
979 svalue = *sptr++;
980 if (kernel_text_address(svalue)) {
981 *addr++=svalue;
982 size -= sizeof(unsigned long);
983 if (size <= sizeof(unsigned long))
984 break;
985 }
986 }
987
988 }
989 *addr++=0x87654321;
990}
991#endif
992
993static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
994{
995 int size = obj_reallen(cachep);
996 addr = &((char*)addr)[obj_dbghead(cachep)];
997
998 memset(addr, val, size);
999 *(unsigned char *)(addr+size-1) = POISON_END;
1000}
1001
1002static void dump_line(char *data, int offset, int limit)
1003{
1004 int i;
1005 printk(KERN_ERR "%03x:", offset);
1006 for (i=0;i<limit;i++) {
1007 printk(" %02x", (unsigned char)data[offset+i]);
1008 }
1009 printk("\n");
1010}
1011#endif
1012
1013#if DEBUG
1014
1015static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
1016{
1017 int i, size;
1018 char *realobj;
1019
1020 if (cachep->flags & SLAB_RED_ZONE) {
1021 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1022 *dbg_redzone1(cachep, objp),
1023 *dbg_redzone2(cachep, objp));
1024 }
1025
1026 if (cachep->flags & SLAB_STORE_USER) {
1027 printk(KERN_ERR "Last user: [<%p>]",
1028 *dbg_userword(cachep, objp));
1029 print_symbol("(%s)",
1030 (unsigned long)*dbg_userword(cachep, objp));
1031 printk("\n");
1032 }
1033 realobj = (char*)objp+obj_dbghead(cachep);
1034 size = obj_reallen(cachep);
1035 for (i=0; i<size && lines;i+=16, lines--) {
1036 int limit;
1037 limit = 16;
1038 if (i+limit > size)
1039 limit = size-i;
1040 dump_line(realobj, i, limit);
1041 }
1042}
1043
1044static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1045{
1046 char *realobj;
1047 int size, i;
1048 int lines = 0;
1049
1050 realobj = (char*)objp+obj_dbghead(cachep);
1051 size = obj_reallen(cachep);
1052
1053 for (i=0;i<size;i++) {
1054 char exp = POISON_FREE;
1055 if (i == size-1)
1056 exp = POISON_END;
1057 if (realobj[i] != exp) {
1058 int limit;
1059 /* Mismatch ! */
1060 /* Print header */
1061 if (lines == 0) {
1062 printk(KERN_ERR "Slab corruption: start=%p, len=%d\n",
1063 realobj, size);
1064 print_objinfo(cachep, objp, 0);
1065 }
1066 /* Hexdump the affected line */
1067 i = (i/16)*16;
1068 limit = 16;
1069 if (i+limit > size)
1070 limit = size-i;
1071 dump_line(realobj, i, limit);
1072 i += 16;
1073 lines++;
1074 /* Limit to 5 lines */
1075 if (lines > 5)
1076 break;
1077 }
1078 }
1079 if (lines != 0) {
1080 /* Print some data about the neighboring objects, if they
1081 * exist:
1082 */
1083 struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));
1084 int objnr;
1085
1086 objnr = (objp-slabp->s_mem)/cachep->objsize;
1087 if (objnr) {
1088 objp = slabp->s_mem+(objnr-1)*cachep->objsize;
1089 realobj = (char*)objp+obj_dbghead(cachep);
1090 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1091 realobj, size);
1092 print_objinfo(cachep, objp, 2);
1093 }
1094 if (objnr+1 < cachep->num) {
1095 objp = slabp->s_mem+(objnr+1)*cachep->objsize;
1096 realobj = (char*)objp+obj_dbghead(cachep);
1097 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1098 realobj, size);
1099 print_objinfo(cachep, objp, 2);
1100 }
1101 }
1102}
1103#endif
1104
1105/* Destroy all the objs in a slab, and release the mem back to the system.
1106 * Before calling the slab must have been unlinked from the cache.
1107 * The cache-lock is not held/needed.
1108 */
1109static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1110{
1111 void *addr = slabp->s_mem - slabp->colouroff;
1112
1113#if DEBUG
1114 int i;
1115 for (i = 0; i < cachep->num; i++) {
1116 void *objp = slabp->s_mem + cachep->objsize * i;
1117
1118 if (cachep->flags & SLAB_POISON) {
1119#ifdef CONFIG_DEBUG_PAGEALLOC
1120 if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
1121 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1);
1122 else
1123 check_poison_obj(cachep, objp);
1124#else
1125 check_poison_obj(cachep, objp);
1126#endif
1127 }
1128 if (cachep->flags & SLAB_RED_ZONE) {
1129 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1130 slab_error(cachep, "start of a freed object "
1131 "was overwritten");
1132 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1133 slab_error(cachep, "end of a freed object "
1134 "was overwritten");
1135 }
1136 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1137 (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0);
1138 }
1139#else
1140 if (cachep->dtor) {
1141 int i;
1142 for (i = 0; i < cachep->num; i++) {
1143 void* objp = slabp->s_mem+cachep->objsize*i;
1144 (cachep->dtor)(objp, cachep, 0);
1145 }
1146 }
1147#endif
1148
1149 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1150 struct slab_rcu *slab_rcu;
1151
1152 slab_rcu = (struct slab_rcu *) slabp;
1153 slab_rcu->cachep = cachep;
1154 slab_rcu->addr = addr;
1155 call_rcu(&slab_rcu->head, kmem_rcu_free);
1156 } else {
1157 kmem_freepages(cachep, addr);
1158 if (OFF_SLAB(cachep))
1159 kmem_cache_free(cachep->slabp_cache, slabp);
1160 }
1161}
1162
1163/**
1164 * kmem_cache_create - Create a cache.
1165 * @name: A string which is used in /proc/slabinfo to identify this cache.
1166 * @size: The size of objects to be created in this cache.
1167 * @align: The required alignment for the objects.
1168 * @flags: SLAB flags
1169 * @ctor: A constructor for the objects.
1170 * @dtor: A destructor for the objects.
1171 *
1172 * Returns a ptr to the cache on success, NULL on failure.
 * If %SLAB_PANIC is set in @flags, a failure panics instead of returning.
1173 * Cannot be called within an interrupt, but can be interrupted.
1174 * The @ctor is run when new pages are allocated by the cache
1175 * and the @dtor is run before the pages are handed back.
 * A @dtor without a @ctor, a NULL @name, an out-of-range @size, unknown
 * @flags, or a name that duplicates an existing cache all trigger BUG().
1176 *
1177 * @name must be valid until the cache is destroyed. This implies that
1178 * the module calling this has to destroy the cache before getting
1179 * unloaded.
1180 *
1181 * The flags are
1182 *
1183 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
1184 * to catch references to uninitialised memory.
1185 *
1186 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
1187 * for buffer overruns.
1188 *
1189 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
1190 * memory pressure.
1191 *
1192 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1193 * cacheline. This can be beneficial if you're counting cycles as closely
1194 * as davem.
1195 */
1196kmem_cache_t *
1197kmem_cache_create (const char *name, size_t size, size_t align,
1198 unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
1199 void (*dtor)(void*, kmem_cache_t *, unsigned long))
1200{
1201 size_t left_over, slab_size, ralign;
1202 kmem_cache_t *cachep = NULL;
1203
1204 /*
1205 * Sanity checks... these are all serious usage bugs.
1206 */
1207 if ((!name) ||
1208 in_interrupt() ||
1209 (size < BYTES_PER_WORD) ||
1210 (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
1211 (dtor && !ctor)) {
1212 printk(KERN_ERR "%s: Early error in slab %s\n",
1213 __FUNCTION__, name);
1214 BUG();
1215 }
1216
1217#if DEBUG
1218 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
1219 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
1220 /* No constructor, but initial state check requested */
1221 printk(KERN_ERR "%s: No con, but init state check "
1222 "requested - %s\n", __FUNCTION__, name);
1223 flags &= ~SLAB_DEBUG_INITIAL;
1224 }
1225
1226#if FORCED_DEBUG
1227 /*
1228 * Enable redzoning and last user accounting, except for caches with
1229 * large objects, if the increased size would increase the object size
1230 * above the next power of two: caches with object sizes just above a
1231 * power of two have a significant amount of internal fragmentation.
1232 */
1233 if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
1234 flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
1235 if (!(flags & SLAB_DESTROY_BY_RCU))
1236 flags |= SLAB_POISON;
1237#endif
 /* Poisoning and RCU-deferred destruction are mutually exclusive. */
1238 if (flags & SLAB_DESTROY_BY_RCU)
1239 BUG_ON(flags & SLAB_POISON);
1240#endif
 /* RCU-destroyed caches must not have a destructor. */
1241 if (flags & SLAB_DESTROY_BY_RCU)
1242 BUG_ON(dtor);
1243
1244 /*
1245 * Always check flags, a caller might be expecting debug
1246 * support which isn't available.
1247 */
1248 if (flags & ~CREATE_MASK)
1249 BUG();
1250
1251 /* Check that size is in terms of words. This is needed to avoid
1252 * unaligned accesses for some archs when redzoning is used, and makes
1253 * sure any on-slab bufctl's are also correctly aligned.
1254 */
1255 if (size & (BYTES_PER_WORD-1)) {
1256 size += (BYTES_PER_WORD-1);
1257 size &= ~(BYTES_PER_WORD-1);
1258 }
1259
1260 /* calculate out the final buffer alignment: */
1261 /* 1) arch recommendation: can be overridden for debug */
1262 if (flags & SLAB_HWCACHE_ALIGN) {
1263 /* Default alignment: as specified by the arch code.
1264 * Except if an object is really small, then squeeze multiple
1265 * objects into one cacheline.
1266 */
1267 ralign = cache_line_size();
1268 while (size <= ralign/2)
1269 ralign /= 2;
1270 } else {
1271 ralign = BYTES_PER_WORD;
1272 }
1273 /* 2) arch mandated alignment: disables debug if necessary */
1274 if (ralign < ARCH_SLAB_MINALIGN) {
1275 ralign = ARCH_SLAB_MINALIGN;
1276 if (ralign > BYTES_PER_WORD)
1277 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
1278 }
1279 /* 3) caller mandated alignment: disables debug if necessary */
1280 if (ralign < align) {
1281 ralign = align;
1282 if (ralign > BYTES_PER_WORD)
1283 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
1284 }
1285 /* 4) Store it. Note that the debug code below can reduce
1286 * the alignment to BYTES_PER_WORD.
1287 */
1288 align = ralign;
1289
1290 /* Get cache's description obj. */
1291 cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
1292 if (!cachep)
1293 goto opps;
1294 memset(cachep, 0, sizeof(kmem_cache_t));
1295
1296#if DEBUG
 /* Remember the caller-visible size before debug padding is added. */
1297 cachep->reallen = size;
1298
1299 if (flags & SLAB_RED_ZONE) {
1300 /* redzoning only works with word aligned caches */
1301 align = BYTES_PER_WORD;
1302
1303 /* add space for red zone words */
1304 cachep->dbghead += BYTES_PER_WORD;
1305 size += 2*BYTES_PER_WORD;
1306 }
1307 if (flags & SLAB_STORE_USER) {
1308 /* user store requires word alignment and
1309 * one word storage behind the end of the real
1310 * object.
1311 */
1312 align = BYTES_PER_WORD;
1313 size += BYTES_PER_WORD;
1314 }
1315#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
 /* Pad mid-sized objects to a full page, with the padding placed in the debug head. */
1316 if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
1317 cachep->dbghead += PAGE_SIZE - size;
1318 size = PAGE_SIZE;
1319 }
1320#endif
1321#endif
1322
1323 /* Determine if the slab management is 'on' or 'off' slab. */
1324 if (size >= (PAGE_SIZE>>3))
1325 /*
1326 * Size is large, assume best to place the slab management obj
1327 * off-slab (should allow better packing of objs).
1328 */
1329 flags |= CFLGS_OFF_SLAB;
1330
1331 size = ALIGN(size, align);
1332
1333 if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
1334 /*
1335 * A VFS-reclaimable slab tends to have most allocations
1336 * as GFP_NOFS and we really don't want to have to be allocating
1337 * higher-order pages when we are unable to shrink dcache.
1338 */
1339 cachep->gfporder = 0;
1340 cache_estimate(cachep->gfporder, size, align, flags,
1341 &left_over, &cachep->num);
1342 } else {
1343 /*
1344 * Calculate size (in pages) of slabs, and the num of objs per
1345 * slab. This could be made much more intelligent. For now,
1346 * try to avoid using high page-orders for slabs. When the
1347 * gfp() funcs are more friendly towards high-order requests,
1348 * this should be changed.
1349 */
1350 do {
 /* Set after stepping the order back down; the next estimate is then final. */
1351 unsigned int break_flag = 0;
1352cal_wastage:
1353 cache_estimate(cachep->gfporder, size, align, flags,
1354 &left_over, &cachep->num);
1355 if (break_flag)
1356 break;
1357 if (cachep->gfporder >= MAX_GFP_ORDER)
1358 break;
1359 if (!cachep->num)
1360 goto next;
1361 if (flags & CFLGS_OFF_SLAB &&
1362 cachep->num > offslab_limit) {
1363 /* This num of objs will cause problems. */
1364 cachep->gfporder--;
1365 break_flag++;
1366 goto cal_wastage;
1367 }
1368
1369 /*
1370 * Large num of objs is good, but v. large slabs are
1371 * currently bad for the gfp()s.
1372 */
1373 if (cachep->gfporder >= slab_break_gfp_order)
1374 break;
1375
1376 if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
1377 break; /* Acceptable internal fragmentation. */
1378next:
1379 cachep->gfporder++;
1380 } while (1);
1381 }
1382
 /* No page order up to the limit could hold even one object: give up. */
1383 if (!cachep->num) {
1384 printk("kmem_cache_create: couldn't create cache %s.\n", name);
1385 kmem_cache_free(&cache_cache, cachep);
1386 cachep = NULL;
1387 goto opps;
1388 }
1389 slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
1390 + sizeof(struct slab), align);
1391
1392 /*
1393 * If the slab has been placed off-slab, and we have enough space then
1394 * move it on-slab. This is at the expense of any extra colouring.
1395 */
1396 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
1397 flags &= ~CFLGS_OFF_SLAB;
1398 left_over -= slab_size;
1399 }
1400
1401 if (flags & CFLGS_OFF_SLAB) {
1402 /* really off slab. No need for manual alignment */
1403 slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab);
1404 }
1405
1406 cachep->colour_off = cache_line_size();
1407 /* Offset must be a multiple of the alignment. */
1408 if (cachep->colour_off < align)
1409 cachep->colour_off = align;
1410 cachep->colour = left_over/cachep->colour_off;
1411 cachep->slab_size = slab_size;
1412 cachep->flags = flags;
1413 cachep->gfpflags = 0;
1414 if (flags & SLAB_CACHE_DMA)
1415 cachep->gfpflags |= GFP_DMA;
1416 spin_lock_init(&cachep->spinlock);
1417 cachep->objsize = size;
1418 /* NUMA */
1419 INIT_LIST_HEAD(&cachep->lists.slabs_full);
1420 INIT_LIST_HEAD(&cachep->lists.slabs_partial);
1421 INIT_LIST_HEAD(&cachep->lists.slabs_free);
1422
1423 if (flags & CFLGS_OFF_SLAB)
1424 cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
1425 cachep->ctor = ctor;
1426 cachep->dtor = dtor;
1427 cachep->name = name;
1428
1429 /* Don't let CPUs come and go */
1430 lock_cpu_hotplug();
1431
1432 if (g_cpucache_up == FULL) {
1433 enable_cpucache(cachep);
1434 } else {
1435 if (g_cpucache_up == NONE) {
1436 /* Note: the first kmem_cache_create must create
1437 * the cache that's used by kmalloc(24), otherwise
1438 * the creation of further caches will BUG().
1439 */
1440 cachep->array[smp_processor_id()] = &initarray_generic.cache;
1441 g_cpucache_up = PARTIAL;
1442 } else {
1443 cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
1444 }
 /* A failed kmalloc() above is fatal. */
1445 BUG_ON(!ac_data(cachep));
1446 ac_data(cachep)->avail = 0;
1447 ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1448 ac_data(cachep)->batchcount = 1;
1449 ac_data(cachep)->touched = 0;
1450 cachep->batchcount = 1;
1451 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1452 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
1453 + cachep->num;
1454 }
1455
1456 cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
1457 ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
1458
1459 /* Need the semaphore to access the chain. */
1460 down(&cache_chain_sem);
1461 {
1462 struct list_head *p;
1463 mm_segment_t old_fs;
1464
 /* NOTE(review): set_fs(KERNEL_DS) presumably lets __get_user() below probe pc->name, a kernel pointer. */
1465 old_fs = get_fs();
1466 set_fs(KERNEL_DS);
1467 list_for_each(p, &cache_chain) {
1468 kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
1469 char tmp;
1470 /* This happens when the module gets unloaded and doesn't
1471 destroy its slab cache and no one else reuses the vmalloc
1472 area of the module. Print a warning. */
1473 if (__get_user(tmp,pc->name)) {
1474 printk("SLAB: cache with size %d has lost its name\n",
1475 pc->objsize);
1476 continue;
1477 }
1478 if (!strcmp(pc->name,name)) {
1479 printk("kmem_cache_create: duplicate cache %s\n",name);
1480 up(&cache_chain_sem);
1481 unlock_cpu_hotplug();
1482 BUG();
1483 }
1484 }
1485 set_fs(old_fs);
1486 }
1487
1488 /* cache setup completed, link it into the list */
1489 list_add(&cachep->next, &cache_chain);
1490 up(&cache_chain_sem);
1491 unlock_cpu_hotplug();
 /* Common exit: cachep is NULL here only when creation failed. */
1492opps:
1493 if (!cachep && (flags & SLAB_PANIC))
1494 panic("kmem_cache_create(): failed to create slab `%s'\n",
1495 name);
1496 return cachep;
1497}
1498EXPORT_SYMBOL(kmem_cache_create);
1499
1500#if DEBUG
/* Debug assertion: BUG() unless local interrupts are disabled. */
static void check_irq_off(void)
{
	if (!irqs_disabled())
		BUG();
}
1505
/* Debug assertion: BUG() if local interrupts are disabled. */
static void check_irq_on(void)
{
	if (irqs_disabled())
		BUG();
}
1510
1511static void check_spinlock_acquired(kmem_cache_t *cachep)
1512{
1513#ifdef CONFIG_SMP
1514 check_irq_off();
1515 BUG_ON(spin_trylock(&cachep->spinlock));
1516#endif
1517}
1518#else
1519#define check_irq_off() do { } while(0)
1520#define check_irq_on() do { } while(0)
1521#define check_spinlock_acquired(x) do { } while(0)
1522#endif
1523
/*
 * Run func(arg) on the local CPU (with interrupts disabled around the call)
 * and then on the other CPUs through smp_call_function(), whose non-zero
 * return is treated as fatal. Must be entered with interrupts enabled;
 * preemption is disabled for the duration.
 */
static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
{
	int err;

	check_irq_on();
	preempt_disable();

	/* Local CPU first. */
	local_irq_disable();
	func(arg);
	local_irq_enable();

	/* Then everybody else. */
	err = smp_call_function(func, arg, 1, 1);
	if (err)
		BUG();

	preempt_enable();
}
1541
1542static void drain_array_locked(kmem_cache_t* cachep,
1543 struct array_cache *ac, int force);
1544
1545static void do_drain(void *arg)
1546{
1547 kmem_cache_t *cachep = (kmem_cache_t*)arg;
1548 struct array_cache *ac;
1549
1550 check_irq_off();
1551 ac = ac_data(cachep);
1552 spin_lock(&cachep->spinlock);
1553 free_block(cachep, &ac_entry(ac)[0], ac->avail);
1554 spin_unlock(&cachep->spinlock);
1555 ac->avail = 0;
1556}
1557
1558static void drain_cpu_caches(kmem_cache_t *cachep)
1559{
1560 smp_call_function_all_cpus(do_drain, cachep);
1561 check_irq_on();
1562 spin_lock_irq(&cachep->spinlock);
1563 if (cachep->lists.shared)
1564 drain_array_locked(cachep, cachep->lists.shared, 1);
1565 spin_unlock_irq(&cachep->spinlock);
1566}
1567
1568
1569/* NUMA shrink all list3s */
1570static int __cache_shrink(kmem_cache_t *cachep)
1571{
1572 struct slab *slabp;
1573 int ret;
1574
1575 drain_cpu_caches(cachep);
1576
1577 check_irq_on();
1578 spin_lock_irq(&cachep->spinlock);
1579
1580 for(;;) {
1581 struct list_head *p;
1582
1583 p = cachep->lists.slabs_free.prev;
1584 if (p == &cachep->lists.slabs_free)
1585 break;
1586
1587 slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list);
1588#if DEBUG
1589 if (slabp->inuse)
1590 BUG();
1591#endif
1592 list_del(&slabp->list);
1593
1594 cachep->lists.free_objects -= cachep->num;
1595 spin_unlock_irq(&cachep->spinlock);
1596 slab_destroy(cachep, slabp);
1597 spin_lock_irq(&cachep->spinlock);
1598 }
1599 ret = !list_empty(&cachep->lists.slabs_full) ||
1600 !list_empty(&cachep->lists.slabs_partial);
1601 spin_unlock_irq(&cachep->spinlock);
1602 return ret;
1603}
1604
1605/**
1606 * kmem_cache_shrink - Shrink a cache.
1607 * @cachep: The cache to shrink.
1608 *
1609 * Releases as many slabs as possible for a cache.
1610 * To help debugging, a zero exit status indicates all slabs were released.
1611 */
1612int kmem_cache_shrink(kmem_cache_t *cachep)
1613{
1614 if (!cachep || in_interrupt())
1615 BUG();
1616
1617 return __cache_shrink(cachep);
1618}
1619EXPORT_SYMBOL(kmem_cache_shrink);
1620
1621/**
1622 * kmem_cache_destroy - delete a cache
1623 * @cachep: the cache to destroy
1624 *
1625 * Remove a kmem_cache_t object from the slab cache.
1626 * Returns 0 on success.
1627 *
1628 * It is expected this function will be called by a module when it is
1629 * unloaded. This will remove the cache completely, and avoid a duplicate
1630 * cache being allocated each time a module is loaded and unloaded, if the
1631 * module doesn't have persistent in-kernel storage across loads and unloads.
1632 *
1633 * The cache must be empty before calling this function.
1634 *
1635 * The caller must guarantee that noone will allocate memory from the cache
1636 * during the kmem_cache_destroy().
1637 */
1638int kmem_cache_destroy(kmem_cache_t * cachep)
1639{
1640 int i;
1641
1642 if (!cachep || in_interrupt())
1643 BUG();
1644
1645 /* Don't let CPUs to come and go */
1646 lock_cpu_hotplug();
1647
1648 /* Find the cache in the chain of caches. */
1649 down(&cache_chain_sem);
1650 /*
1651 * the chain is never empty, cache_cache is never destroyed
1652 */
1653 list_del(&cachep->next);
1654 up(&cache_chain_sem);
1655
1656 if (__cache_shrink(cachep)) {
1657 slab_error(cachep, "Can't free all objects");
1658 down(&cache_chain_sem);
1659 list_add(&cachep->next,&cache_chain);
1660 up(&cache_chain_sem);
1661 unlock_cpu_hotplug();
1662 return 1;
1663 }
1664
1665 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
1666 synchronize_kernel();
1667
1668 /* no cpu_online check required here since we clear the percpu
1669 * array on cpu offline and set this to NULL.
1670 */
1671 for (i = 0; i < NR_CPUS; i++)
1672 kfree(cachep->array[i]);
1673
1674 /* NUMA: free the list3 structures */
1675 kfree(cachep->lists.shared);
1676 cachep->lists.shared = NULL;
1677 kmem_cache_free(&cache_cache, cachep);
1678
1679 unlock_cpu_hotplug();
1680
1681 return 0;
1682}
1683EXPORT_SYMBOL(kmem_cache_destroy);
1684
1685/* Get the memory for a slab management obj. */
1686static struct slab* alloc_slabmgmt(kmem_cache_t *cachep,
1687 void *objp, int colour_off, unsigned int __nocast local_flags)
1688{
1689 struct slab *slabp;
1690
1691 if (OFF_SLAB(cachep)) {
1692 /* Slab management obj is off-slab. */
1693 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
1694 if (!slabp)
1695 return NULL;
1696 } else {
1697 slabp = objp+colour_off;
1698 colour_off += cachep->slab_size;
1699 }
1700 slabp->inuse = 0;
1701 slabp->colouroff = colour_off;
1702 slabp->s_mem = objp+colour_off;
1703
1704 return slabp;
1705}
1706
1707static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
1708{
1709 return (kmem_bufctl_t *)(slabp+1);
1710}
1711
1712static void cache_init_objs(kmem_cache_t *cachep,
1713 struct slab *slabp, unsigned long ctor_flags)
1714{
1715 int i;
1716
1717 for (i = 0; i < cachep->num; i++) {
1718 void* objp = slabp->s_mem+cachep->objsize*i;
1719#if DEBUG
1720 /* need to poison the objs? */
1721 if (cachep->flags & SLAB_POISON)
1722 poison_obj(cachep, objp, POISON_FREE);
1723 if (cachep->flags & SLAB_STORE_USER)
1724 *dbg_userword(cachep, objp) = NULL;
1725
1726 if (cachep->flags & SLAB_RED_ZONE) {
1727 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
1728 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
1729 }
1730 /*
1731 * Constructors are not allowed to allocate memory from
1732 * the same cache which they are a constructor for.
1733 * Otherwise, deadlock. They must also be threaded.
1734 */
1735 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
1736 cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags);
1737
1738 if (cachep->flags & SLAB_RED_ZONE) {
1739 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1740 slab_error(cachep, "constructor overwrote the"
1741 " end of an object");
1742 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1743 slab_error(cachep, "constructor overwrote the"
1744 " start of an object");
1745 }
1746 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
1747 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
1748#else
1749 if (cachep->ctor)
1750 cachep->ctor(objp, cachep, ctor_flags);
1751#endif
1752 slab_bufctl(slabp)[i] = i+1;
1753 }
1754 slab_bufctl(slabp)[i-1] = BUFCTL_END;
1755 slabp->free = 0;
1756}
1757
1758static void kmem_flagcheck(kmem_cache_t *cachep, unsigned int flags)
1759{
1760 if (flags & SLAB_DMA) {
1761 if (!(cachep->gfpflags & GFP_DMA))
1762 BUG();
1763 } else {
1764 if (cachep->gfpflags & GFP_DMA)
1765 BUG();
1766 }
1767}
1768
1769static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
1770{
1771 int i;
1772 struct page *page;
1773
1774 /* Nasty!!!!!! I hope this is OK. */
1775 i = 1 << cachep->gfporder;
1776 page = virt_to_page(objp);
1777 do {
1778 SET_PAGE_CACHE(page, cachep);
1779 SET_PAGE_SLAB(page, slabp);
1780 page++;
1781 } while (--i);
1782}
1783
1784/*
1785 * Grow (by 1) the number of slabs within a cache. This is called by
1786 * kmem_cache_alloc() when there are no active objs left in a cache.
 *
 * Returns 1 when a new slab was added to the free list, 0 otherwise
 * (SLAB_NO_GROW requested, or the page / slab-descriptor allocation failed).
 * Entered with interrupts disabled; when __GFP_WAIT is set they are enabled
 * around the allocations and disabled again before returning.
1787 */
1788static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
1789{
1790 struct slab *slabp;
1791 void *objp;
1792 size_t offset;
1793 unsigned int local_flags;
1794 unsigned long ctor_flags;
1795
1796 /* Be lazy and only check for valid flags here,
1797 * keeping it out of the critical path in kmem_cache_alloc().
1798 */
1799 if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
1800 BUG();
1801 if (flags & SLAB_NO_GROW)
1802 return 0;
1803
1804 ctor_flags = SLAB_CTOR_CONSTRUCTOR;
1805 local_flags = (flags & SLAB_LEVEL_MASK);
1806 if (!(local_flags & __GFP_WAIT))
1807 /*
1808 * Not allowed to sleep. Need to tell a constructor about
1809 * this - it might need to know...
1810 */
1811 ctor_flags |= SLAB_CTOR_ATOMIC;
1812
1813 /* About to mess with non-constant members - lock. */
1814 check_irq_off();
1815 spin_lock(&cachep->spinlock);
1816
1817 /* Get colour for the slab, and cal the next value. */
1818 offset = cachep->colour_next;
1819 cachep->colour_next++;
1820 if (cachep->colour_next >= cachep->colour)
1821 cachep->colour_next = 0;
 /* Convert the colour index into a byte offset. */
1822 offset *= cachep->colour_off;
1823
1824 spin_unlock(&cachep->spinlock);
1825
1826 if (local_flags & __GFP_WAIT)
1827 local_irq_enable();
1828
1829 /*
1830 * The test for missing atomic flag is performed here, rather than
1831 * the more obvious place, simply to reduce the critical path length
1832 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
1833 * will eventually be caught here (where it matters).
1834 */
1835 kmem_flagcheck(cachep, flags);
1836
1837
1838 /* Get mem for the objs. */
1839 if (!(objp = kmem_getpages(cachep, flags, nodeid)))
1840 goto failed;
1841
1842 /* Get slab management. */
1843 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
1844 goto opps1;
1845
1846 set_slab_attr(cachep, slabp, objp);
1847
1848 cache_init_objs(cachep, slabp, ctor_flags);
1849
1850 if (local_flags & __GFP_WAIT)
1851 local_irq_disable();
1852 check_irq_off();
1853 spin_lock(&cachep->spinlock);
1854
1855 /* Make slab active. */
1856 list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free));
1857 STATS_INC_GROWN(cachep);
1858 list3_data(cachep)->free_objects += cachep->num;
1859 spin_unlock(&cachep->spinlock);
1860 return 1;
 /* Descriptor allocation failed: give the pages back. */
1861opps1:
1862 kmem_freepages(cachep, objp);
 /* Restore the interrupts-off state the caller entered with. */
1863failed:
1864 if (local_flags & __GFP_WAIT)
1865 local_irq_disable();
1866 return 0;
1867}
1868
1869#if DEBUG
1870
1871/*
1872 * Perform extra freeing checks:
1873 * - detect bad pointers.
1874 * - POISON/RED_ZONE checking
1875 * - destructor calls, for caches with POISON+dtor
1876 */
1877static void kfree_debugcheck(const void *objp)
1878{
1879 struct page *page;
1880
1881 if (!virt_addr_valid(objp)) {
1882 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
1883 (unsigned long)objp);
1884 BUG();
1885 }
1886 page = virt_to_page(objp);
1887 if (!PageSlab(page)) {
1888 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp);
1889 BUG();
1890 }
1891}
1892
/*
 * DEBUG-only validation of an object that is being freed.
 *
 * @objp is the pointer the caller was given; it is moved back by the debug
 * head so that it addresses the start of the padded object, and that
 * adjusted pointer is what gets returned. Along the way the owning cache is
 * cross-checked, red zones are verified and reset to RED_INACTIVE, @caller
 * is stored as last user, the object's position inside its slab is
 * verified, and the object is poisoned (or unmapped) when SLAB_POISON is set.
 */
1893static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
1894 void *caller)
1895{
1896 struct page *page;
1897 unsigned int objnr;
1898 struct slab *slabp;
1899
 /* Step back from the payload to the start of the debug-padded object. */
1900 objp -= obj_dbghead(cachep);
1901 kfree_debugcheck(objp);
1902 page = virt_to_page(objp);
1903
 /* The page's recorded cache must be the one the caller is freeing into. */
1904 if (GET_PAGE_CACHE(page) != cachep) {
1905 printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n",
1906 GET_PAGE_CACHE(page),cachep);
1907 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
1908 printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name);
1909 WARN_ON(1);
1910 }
1911 slabp = GET_PAGE_SLAB(page);
1912
1913 if (cachep->flags & SLAB_RED_ZONE) {
1914 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
1915 slab_error(cachep, "double free, or memory outside"
1916 " object was overwritten");
1917 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
1918 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp));
1919 }
1920 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
1921 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
1922 }
1923 if (cachep->flags & SLAB_STORE_USER)
1924 *dbg_userword(cachep, objp) = caller;
1925
1926 objnr = (objp-slabp->s_mem)/cachep->objsize;
1927
 /* The pointer must name an object slot exactly, not its interior. */
1928 BUG_ON(objnr >= cachep->num);
1929 BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize);
1930
1931 if (cachep->flags & SLAB_DEBUG_INITIAL) {
1932 /* Need to call the slab's constructor so the
1933 * caller can perform a verify of its state (debugging).
1934 * Called without the cache-lock held.
1935 */
1936 cachep->ctor(objp+obj_dbghead(cachep),
1937 cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
1938 }
1939 if (cachep->flags & SLAB_POISON && cachep->dtor) {
1940 /* we want to cache poison the object,
1941 * call the destruction callback
1942 */
1943 cachep->dtor(objp+obj_dbghead(cachep), cachep, 0);
1944 }
1945 if (cachep->flags & SLAB_POISON) {
1946#ifdef CONFIG_DEBUG_PAGEALLOC
 /* Page-sized off-slab objects are unmapped instead of poisoned. */
1947 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
1948 store_stackinfo(cachep, objp, (unsigned long)caller);
1949 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
1950 } else {
1951 poison_obj(cachep, objp, POISON_FREE);
1952 }
1953#else
1954 poison_obj(cachep, objp, POISON_FREE);
1955#endif
1956 }
1957 return objp;
1958}
1959
1960static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
1961{
1962 kmem_bufctl_t i;
1963 int entries = 0;
1964
1965 check_spinlock_acquired(cachep);
1966 /* Check slab's freelist to see if this obj is there. */
1967 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
1968 entries++;
1969 if (entries > cachep->num || i >= cachep->num)
1970 goto bad;
1971 }
1972 if (entries != cachep->num - slabp->inuse) {
1973bad:
1974 printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
1975 cachep->name, cachep->num, slabp, slabp->inuse);
1976 for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) {
1977 if ((i%16)==0)
1978 printk("\n%03x:", i);
1979 printk(" %02x", ((unsigned char*)slabp)[i]);
1980 }
1981 printk("\n");
1982 BUG();
1983 }
1984}
1985#else
1986#define kfree_debugcheck(x) do { } while(0)
1987#define cache_free_debugcheck(x,objp,z) (objp)
1988#define check_slabp(x,y) do { } while(0)
1989#endif
1990
/*
 * Refill the calling CPU's (empty) array cache and hand out one object.
 *
 * Objects are taken from the shared array cache when it has any; otherwise
 * from partial slabs and then free slabs. If nothing could be obtained the
 * cache is grown by one slab and the refill is retried. Returns an object
 * pointer, or NULL when growing failed and no objects turned up meanwhile.
 * Must be called with interrupts disabled.
 */
1991static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags)
1992{
1993 int batchcount;
1994 struct kmem_list3 *l3;
1995 struct array_cache *ac;
1996
1997 check_irq_off();
1998 ac = ac_data(cachep);
1999retry:
2000 batchcount = ac->batchcount;
2001 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2002 /* if there was little recent activity on this
2003 * cache, then perform only a partial refill.
2004 * Otherwise we could generate refill bouncing.
2005 */
2006 batchcount = BATCHREFILL_LIMIT;
2007 }
2008 l3 = list3_data(cachep);
2009
 /* Refill is only legal when the per-cpu array is empty. */
2010 BUG_ON(ac->avail > 0);
2011 spin_lock(&cachep->spinlock);
2012 if (l3->shared) {
2013 struct array_cache *shared_array = l3->shared;
2014 if (shared_array->avail) {
 /* Move up to batchcount pointers from the top of the shared array. */
2015 if (batchcount > shared_array->avail)
2016 batchcount = shared_array->avail;
2017 shared_array->avail -= batchcount;
2018 ac->avail = batchcount;
2019 memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail],
2020 sizeof(void*)*batchcount);
2021 shared_array->touched = 1;
2022 goto alloc_done;
2023 }
2024 }
2025 while (batchcount > 0) {
2026 struct list_head *entry;
2027 struct slab *slabp;
2028 /* Get slab alloc is to come from. */
2029 entry = l3->slabs_partial.next;
2030 if (entry == &l3->slabs_partial) {
2031 l3->free_touched = 1;
2032 entry = l3->slabs_free.next;
2033 if (entry == &l3->slabs_free)
2034 goto must_grow;
2035 }
2036
2037 slabp = list_entry(entry, struct slab, list);
2038 check_slabp(cachep, slabp);
2039 check_spinlock_acquired(cachep);
2040 while (slabp->inuse < cachep->num && batchcount--) {
2041 kmem_bufctl_t next;
2042 STATS_INC_ALLOCED(cachep);
2043 STATS_INC_ACTIVE(cachep);
2044 STATS_SET_HIGH(cachep);
2045
2046 /* get obj pointer */
2047 ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize;
2048
2049 slabp->inuse++;
2050 next = slab_bufctl(slabp)[slabp->free];
2051#if DEBUG
2052 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2053#endif
2054 slabp->free = next;
2055 }
2056 check_slabp(cachep, slabp);
2057
2058 /* move slabp to correct slabp list: */
2059 list_del(&slabp->list);
2060 if (slabp->free == BUFCTL_END)
2061 list_add(&slabp->list, &l3->slabs_full);
2062 else
2063 list_add(&slabp->list, &l3->slabs_partial);
2064 }
2065
 /* Account for the objects moved from the slab lists into the array. */
2066must_grow:
2067 l3->free_objects -= ac->avail;
2068alloc_done:
2069 spin_unlock(&cachep->spinlock);
2070
2071 if (unlikely(!ac->avail)) {
2072 int x;
2073 x = cache_grow(cachep, flags, -1);
2074
2075 // cache_grow can reenable interrupts, then ac could change.
2076 ac = ac_data(cachep);
2077 if (!x && ac->avail == 0) // no objects in sight? abort
2078 return NULL;
2079
2080 if (!ac->avail) // objects refilled by interrupt?
2081 goto retry;
2082 }
2083 ac->touched = 1;
2084 return ac_entry(ac)[--ac->avail];
2085}
2086
2087static inline void
2088cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags)
2089{
2090 might_sleep_if(flags & __GFP_WAIT);
2091#if DEBUG
2092 kmem_flagcheck(cachep, flags);
2093#endif
2094}
2095
#if DEBUG
/*
 * Debug-only processing of an object that was just taken from the cache.
 *
 * @cachep: cache the object came from
 * @flags:  allocation flags; only __GFP_WAIT is inspected here
 * @objp:   raw object pointer (before the debug head is skipped), may be NULL
 * @caller: address recorded in the user word when SLAB_STORE_USER is set
 *
 * Verifies and re-arms the poison pattern, records the caller, validates
 * and activates both red zones, advances the pointer by obj_dbghead() and,
 * for poisoned caches that have a constructor, runs the constructor.
 * Returns the pointer to hand to the caller; a NULL @objp is passed through.
 */
static void *
cache_alloc_debugcheck_after(kmem_cache_t *cachep,
			unsigned long flags, void *objp, void *caller)
{
	if (objp == NULL)
		return NULL;

	if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
		/* Page-sized off-slab objects get mapped back in instead of
		 * having their poison pattern checked. */
		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
			kernel_map_pages(virt_to_page(objp),
					cachep->objsize/PAGE_SIZE, 1);
		else
			check_poison_obj(cachep, objp);
#else
		check_poison_obj(cachep, objp);
#endif
		poison_obj(cachep, objp, POISON_INUSE);
	}

	if (cachep->flags & SLAB_STORE_USER)
		*dbg_userword(cachep, objp) = caller;

	if (cachep->flags & SLAB_RED_ZONE) {
		/* A free object must have both zones marked inactive. */
		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
			slab_error(cachep, "double free, or memory outside"
						" object was overwritten");
			printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
					objp, *dbg_redzone1(cachep, objp),
					*dbg_redzone2(cachep, objp));
		}
		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
	}

	/* Step over the debug head to reach the caller-visible object. */
	objp += obj_dbghead(cachep);

	if (cachep->ctor && (cachep->flags & SLAB_POISON)) {
		unsigned long ctor_flags;

		if (flags & __GFP_WAIT)
			ctor_flags = SLAB_CTOR_CONSTRUCTOR;
		else
			ctor_flags = SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_ATOMIC;

		cachep->ctor(objp, cachep, ctor_flags);
	}
	return objp;
}
#else
/* Without DEBUG the object is returned untouched. */
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
#endif
2141
2142
2143static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
2144{
2145 unsigned long save_flags;
2146 void* objp;
2147 struct array_cache *ac;
2148
2149 cache_alloc_debugcheck_before(cachep, flags);
2150
2151 local_irq_save(save_flags);
2152 ac = ac_data(cachep);
2153 if (likely(ac->avail)) {
2154 STATS_INC_ALLOCHIT(cachep);
2155 ac->touched = 1;
2156 objp = ac_entry(ac)[--ac->avail];
2157 } else {
2158 STATS_INC_ALLOCMISS(cachep);
2159 objp = cache_alloc_refill(cachep, flags);
2160 }
2161 local_irq_restore(save_flags);
2162 objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0));
2163 return objp;
2164}
2165
2166/*
2167 * NUMA: different approach needed if the spinlock is moved into
2168 * the l3 structure
2169 */
2170
2171static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
2172{
2173 int i;
2174
2175 check_spinlock_acquired(cachep);
2176
2177 /* NUMA: move add into loop */
2178 cachep->lists.free_objects += nr_objects;
2179
2180 for (i = 0; i < nr_objects; i++) {
2181 void *objp = objpp[i];
2182 struct slab *slabp;
2183 unsigned int objnr;
2184
2185 slabp = GET_PAGE_SLAB(virt_to_page(objp));
2186 list_del(&slabp->list);
2187 objnr = (objp - slabp->s_mem) / cachep->objsize;
2188 check_slabp(cachep, slabp);
2189#if DEBUG
2190 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2191 printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n",
2192 cachep->name, objp);
2193 BUG();
2194 }
2195#endif
2196 slab_bufctl(slabp)[objnr] = slabp->free;
2197 slabp->free = objnr;
2198 STATS_DEC_ACTIVE(cachep);
2199 slabp->inuse--;
2200 check_slabp(cachep, slabp);
2201
2202 /* fixup slab chains */
2203 if (slabp->inuse == 0) {
2204 if (cachep->lists.free_objects > cachep->free_limit) {
2205 cachep->lists.free_objects -= cachep->num;
2206 slab_destroy(cachep, slabp);
2207 } else {
2208 list_add(&slabp->list,
2209 &list3_data_ptr(cachep, objp)->slabs_free);
2210 }
2211 } else {
2212 /* Unconditionally move a slab to the end of the
2213 * partial list on free - maximum time for the
2214 * other objects to be freed, too.
2215 */
2216 list_add_tail(&slabp->list,
2217 &list3_data_ptr(cachep, objp)->slabs_partial);
2218 }
2219 }
2220}
2221
2222static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2223{
2224 int batchcount;
2225
2226 batchcount = ac->batchcount;
2227#if DEBUG
2228 BUG_ON(!batchcount || batchcount > ac->avail);
2229#endif
2230 check_irq_off();
2231 spin_lock(&cachep->spinlock);
2232 if (cachep->lists.shared) {
2233 struct array_cache *shared_array = cachep->lists.shared;
2234 int max = shared_array->limit-shared_array->avail;
2235 if (max) {
2236 if (batchcount > max)
2237 batchcount = max;
2238 memcpy(&ac_entry(shared_array)[shared_array->avail],
2239 &ac_entry(ac)[0],
2240 sizeof(void*)*batchcount);
2241 shared_array->avail += batchcount;
2242 goto free_done;
2243 }
2244 }
2245
2246 free_block(cachep, &ac_entry(ac)[0], batchcount);
2247free_done:
2248#if STATS
2249 {
2250 int i = 0;
2251 struct list_head *p;
2252
2253 p = list3_data(cachep)->slabs_free.next;
2254 while (p != &(list3_data(cachep)->slabs_free)) {
2255 struct slab *slabp;
2256
2257 slabp = list_entry(p, struct slab, list);
2258 BUG_ON(slabp->inuse);
2259
2260 i++;
2261 p = p->next;
2262 }
2263 STATS_SET_FREEABLE(cachep, i);
2264 }
2265#endif
2266 spin_unlock(&cachep->spinlock);
2267 ac->avail -= batchcount;
2268 memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount],
2269 sizeof(void*)*ac->avail);
2270}
2271
2272/*
2273 * __cache_free
2274 * Release an obj back to its cache. If the obj has a constructed
2275 * state, it must be in this state _before_ it is released.
2276 *
2277 * Called with disabled ints.
2278 */
2279static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2280{
2281 struct array_cache *ac = ac_data(cachep);
2282
2283 check_irq_off();
2284 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
2285
2286 if (likely(ac->avail < ac->limit)) {
2287 STATS_INC_FREEHIT(cachep);
2288 ac_entry(ac)[ac->avail++] = objp;
2289 return;
2290 } else {
2291 STATS_INC_FREEMISS(cachep);
2292 cache_flusharray(cachep, ac);
2293 ac_entry(ac)[ac->avail++] = objp;
2294 }
2295}
2296
2297/**
2298 * kmem_cache_alloc - Allocate an object
2299 * @cachep: The cache to allocate from.
2300 * @flags: See kmalloc().
2301 *
2302 * Allocate an object from this cache. The flags are only relevant
2303 * if the cache has no available objects.
2304 */
2305void *kmem_cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
2306{
2307 return __cache_alloc(cachep, flags);
2308}
2309EXPORT_SYMBOL(kmem_cache_alloc);
2310
2311/**
2312 * kmem_ptr_validate - check if an untrusted pointer might
2313 * be a slab entry.
2314 * @cachep: the cache we're checking against
2315 * @ptr: pointer to validate
2316 *
2317 * This verifies that the untrusted pointer looks sane:
2318 * it is _not_ a guarantee that the pointer is actually
2319 * part of the slab cache in question, but it at least
2320 * validates that the pointer can be dereferenced and
2321 * looks half-way sane.
2322 *
2323 * Currently only used for dentry validation.
2324 */
2325int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2326{
2327 unsigned long addr = (unsigned long) ptr;
2328 unsigned long min_addr = PAGE_OFFSET;
2329 unsigned long align_mask = BYTES_PER_WORD-1;
2330 unsigned long size = cachep->objsize;
2331 struct page *page;
2332
2333 if (unlikely(addr < min_addr))
2334 goto out;
2335 if (unlikely(addr > (unsigned long)high_memory - size))
2336 goto out;
2337 if (unlikely(addr & align_mask))
2338 goto out;
2339 if (unlikely(!kern_addr_valid(addr)))
2340 goto out;
2341 if (unlikely(!kern_addr_valid(addr + size - 1)))
2342 goto out;
2343 page = virt_to_page(ptr);
2344 if (unlikely(!PageSlab(page)))
2345 goto out;
2346 if (unlikely(GET_PAGE_CACHE(page) != cachep))
2347 goto out;
2348 return 1;
2349out:
2350 return 0;
2351}
2352
#ifdef CONFIG_NUMA
/**
 * kmem_cache_alloc_node - Allocate an object on the specified node
 * @cachep: The cache to allocate from.
 * @nodeid: node number of the target node.
 *
 * Identical to kmem_cache_alloc, except that this function is slow
 * and can sleep: it takes no flags argument and always grows the cache
 * with GFP_KERNEL.  It tries to allocate memory on the given node, which
 * can improve the performance for cpu bound structures.
 *
 * The partial and free slab lists are searched for a slab whose memory
 * lives on @nodeid; if none is found the cache is grown on that node and
 * the search is repeated.  After three unsuccessful passes the first
 * partial or free slab is used regardless of its node.
 *
 * Returns the object, or NULL if the cache could not be grown.
 * Must be called with interrupts enabled.
 */
void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid)
{
	int loop;
	void *objp;
	struct slab *slabp;
	kmem_bufctl_t next;

	for (loop = 0;;loop++) {
		struct list_head *q;

		objp = NULL;
		check_irq_on();
		spin_lock_irq(&cachep->spinlock);
		/* walk through all partial and empty slabs and find one
		 * from the right node; once loop > 2 any slab will do */
		list_for_each(q,&cachep->lists.slabs_partial) {
			slabp = list_entry(q, struct slab, list);

			if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
					loop > 2)
				goto got_slabp;
		}
		list_for_each(q, &cachep->lists.slabs_free) {
			slabp = list_entry(q, struct slab, list);

			if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
					loop > 2)
				goto got_slabp;
		}
		spin_unlock_irq(&cachep->spinlock);

		/* Nothing usable: grow the cache on the requested node.
		 * cache_grow() is entered with local interrupts disabled. */
		local_irq_disable();
		if (!cache_grow(cachep, GFP_KERNEL, nodeid)) {
			local_irq_enable();
			return NULL;
		}
		local_irq_enable();
	}
got_slabp:
	/* found one: allocate object (cachep->spinlock is still held) */
	check_slabp(cachep, slabp);
	check_spinlock_acquired(cachep);

	STATS_INC_ALLOCED(cachep);
	STATS_INC_ACTIVE(cachep);
	STATS_SET_HIGH(cachep);
	STATS_INC_NODEALLOCS(cachep);

	objp = slabp->s_mem + slabp->free*cachep->objsize;

	/* Pop the object off the slab's bufctl free chain. */
	slabp->inuse++;
	next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
#endif
	slabp->free = next;
	check_slabp(cachep, slabp);

	/* move slabp to correct slabp list: */
	list_del(&slabp->list);
	if (slabp->free == BUFCTL_END)
		list_add(&slabp->list, &cachep->lists.slabs_full);
	else
		list_add(&slabp->list, &cachep->lists.slabs_partial);

	list3_data(cachep)->free_objects--;
	spin_unlock_irq(&cachep->spinlock);

	objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp,
					__builtin_return_address(0));
	return objp;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);

#endif
2439
2440/**
2441 * kmalloc - allocate memory
2442 * @size: how many bytes of memory are required.
2443 * @flags: the type of memory to allocate.
2444 *
2445 * kmalloc is the normal method of allocating memory
2446 * in the kernel.
2447 *
2448 * The @flags argument may be one of:
2449 *
2450 * %GFP_USER - Allocate memory on behalf of user. May sleep.
2451 *
2452 * %GFP_KERNEL - Allocate normal kernel ram. May sleep.
2453 *
2454 * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers.
2455 *
2456 * Additionally, the %GFP_DMA flag may be set to indicate the memory
2457 * must be suitable for DMA. This can mean different things on different
2458 * platforms. For example, on i386, it means that the memory must come
2459 * from the first 16MB.
2460 */
2461void *__kmalloc(size_t size, unsigned int __nocast flags)
2462{
2463 kmem_cache_t *cachep;
2464
2465 cachep = kmem_find_general_cachep(size, flags);
2466 if (unlikely(cachep == NULL))
2467 return NULL;
2468 return __cache_alloc(cachep, flags);
2469}
2470EXPORT_SYMBOL(__kmalloc);
2471
#ifdef CONFIG_SMP
/**
 * __alloc_percpu - allocate one copy of the object for every possible
 * cpu in the system, zeroing them.
 * Objects should be dereferenced using the per_cpu_ptr macro only.
 *
 * @size: how many bytes of memory are required.
 * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
 *         (Not referenced by this implementation.)
 *
 * Returns the bitwise complement of the bookkeeping pointer, so that a
 * dereference without the wrapper macros is caught, or NULL if no general
 * cache fits @size or any allocation fails.  On failure every copy that
 * was already allocated is freed again.
 */
void *__alloc_percpu(size_t size, size_t align)
{
	int i;
	kmem_cache_t *cachep;
	struct percpu_data *pdata;

	/* The cache only depends on @size, so look it up once.  The lookup
	 * can come back empty (__kmalloc checks for that too); bail out
	 * instead of handing NULL to kmem_cache_alloc_node(). */
	cachep = kmem_find_general_cachep(size, GFP_KERNEL);
	if (unlikely(cachep == NULL))
		return NULL;

	pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
	if (!pdata)
		return NULL;

	for (i = 0; i < NR_CPUS; i++) {
		if (!cpu_possible(i))
			continue;
		pdata->ptrs[i] = kmem_cache_alloc_node(cachep, cpu_to_node(i));

		if (!pdata->ptrs[i])
			goto unwind_oom;
		memset(pdata->ptrs[i], 0, size);
	}

	/* Catch derefs w/o wrappers */
	return (void *) (~(unsigned long) pdata);

unwind_oom:
	/* Free the copies allocated for the cpus before the failing one. */
	while (--i >= 0) {
		if (!cpu_possible(i))
			continue;
		kfree(pdata->ptrs[i]);
	}
	kfree(pdata);
	return NULL;
}
EXPORT_SYMBOL(__alloc_percpu);
#endif
2515
2516/**
2517 * kmem_cache_free - Deallocate an object
2518 * @cachep: The cache the allocation was from.
2519 * @objp: The previously allocated object.
2520 *
2521 * Free an object which was previously allocated from this
2522 * cache.
2523 */
2524void kmem_cache_free(kmem_cache_t *cachep, void *objp)
2525{
2526 unsigned long flags;
2527
2528 local_irq_save(flags);
2529 __cache_free(cachep, objp);
2530 local_irq_restore(flags);
2531}
2532EXPORT_SYMBOL(kmem_cache_free);
2533
2534/**
2535 * kcalloc - allocate memory for an array. The memory is set to zero.
2536 * @n: number of elements.
2537 * @size: element size.
2538 * @flags: the type of memory to allocate.
2539 */
2540void *kcalloc(size_t n, size_t size, unsigned int __nocast flags)
2541{
2542 void *ret = NULL;
2543
2544 if (n != 0 && size > INT_MAX / n)
2545 return ret;
2546
2547 ret = kmalloc(n * size, flags);
2548 if (ret)
2549 memset(ret, 0, n * size);
2550 return ret;
2551}
2552EXPORT_SYMBOL(kcalloc);
2553
2554/**
2555 * kfree - free previously allocated memory
2556 * @objp: pointer returned by kmalloc.
2557 *
2558 * Don't free memory not originally allocated by kmalloc()
2559 * or you will run into trouble.
2560 */
2561void kfree(const void *objp)
2562{
2563 kmem_cache_t *c;
2564 unsigned long flags;
2565
2566 if (unlikely(!objp))
2567 return;
2568 local_irq_save(flags);
2569 kfree_debugcheck(objp);
2570 c = GET_PAGE_CACHE(virt_to_page(objp));
2571 __cache_free(c, (void*)objp);
2572 local_irq_restore(flags);
2573}
2574EXPORT_SYMBOL(kfree);
2575
#ifdef CONFIG_SMP
/**
 * free_percpu - free previously allocated percpu memory
 * @objp: pointer returned by alloc_percpu.
 *
 * Frees the copy belonging to every possible cpu and then the bookkeeping
 * structure itself.  A NULL @objp is ignored, like kfree(NULL).
 *
 * Don't free memory not originally allocated by alloc_percpu()
 * The complemented objp is to check for that.
 */
void
free_percpu(const void *objp)
{
	int i;
	struct percpu_data *p;

	/* Complementing NULL would yield an all-ones pointer that the loop
	 * below would dereference, so bail out early. */
	if (unlikely(!objp))
		return;

	p = (struct percpu_data *) (~(unsigned long) objp);

	for (i = 0; i < NR_CPUS; i++) {
		if (!cpu_possible(i))
			continue;
		kfree(p->ptrs[i]);
	}
	kfree(p);
}
EXPORT_SYMBOL(free_percpu);
#endif
2599
2600unsigned int kmem_cache_size(kmem_cache_t *cachep)
2601{
2602 return obj_reallen(cachep);
2603}
2604EXPORT_SYMBOL(kmem_cache_size);
2605
2606struct ccupdate_struct {
2607 kmem_cache_t *cachep;
2608 struct array_cache *new[NR_CPUS];
2609};
2610
2611static void do_ccupdate_local(void *info)
2612{
2613 struct ccupdate_struct *new = (struct ccupdate_struct *)info;
2614 struct array_cache *old;
2615
2616 check_irq_off();
2617 old = ac_data(new->cachep);
2618
2619 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
2620 new->new[smp_processor_id()] = old;
2621}
2622
2623
2624static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
2625 int shared)
2626{
2627 struct ccupdate_struct new;
2628 struct array_cache *new_shared;
2629 int i;
2630
2631 memset(&new.new,0,sizeof(new.new));
2632 for (i = 0; i < NR_CPUS; i++) {
2633 if (cpu_online(i)) {
2634 new.new[i] = alloc_arraycache(i, limit, batchcount);
2635 if (!new.new[i]) {
2636 for (i--; i >= 0; i--) kfree(new.new[i]);
2637 return -ENOMEM;
2638 }
2639 } else {
2640 new.new[i] = NULL;
2641 }
2642 }
2643 new.cachep = cachep;
2644
2645 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
2646
2647 check_irq_on();
2648 spin_lock_irq(&cachep->spinlock);
2649 cachep->batchcount = batchcount;
2650 cachep->limit = limit;
2651 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num;
2652 spin_unlock_irq(&cachep->spinlock);
2653
2654 for (i = 0; i < NR_CPUS; i++) {
2655 struct array_cache *ccold = new.new[i];
2656 if (!ccold)
2657 continue;
2658 spin_lock_irq(&cachep->spinlock);
2659 free_block(cachep, ac_entry(ccold), ccold->avail);
2660 spin_unlock_irq(&cachep->spinlock);
2661 kfree(ccold);
2662 }
2663 new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
2664 if (new_shared) {
2665 struct array_cache *old;
2666
2667 spin_lock_irq(&cachep->spinlock);
2668 old = cachep->lists.shared;
2669 cachep->lists.shared = new_shared;
2670 if (old)
2671 free_block(cachep, ac_entry(old), old->avail);
2672 spin_unlock_irq(&cachep->spinlock);
2673 kfree(old);
2674 }
2675
2676 return 0;
2677}
2678
2679
2680static void enable_cpucache(kmem_cache_t *cachep)
2681{
2682 int err;
2683 int limit, shared;
2684
2685 /* The head array serves three purposes:
2686 * - create a LIFO ordering, i.e. return objects that are cache-warm
2687 * - reduce the number of spinlock operations.
2688 * - reduce the number of linked list operations on the slab and
2689 * bufctl chains: array operations are cheaper.
2690 * The numbers are guessed, we should auto-tune as described by
2691 * Bonwick.
2692 */
2693 if (cachep->objsize > 131072)
2694 limit = 1;
2695 else if (cachep->objsize > PAGE_SIZE)
2696 limit = 8;
2697 else if (cachep->objsize > 1024)
2698 limit = 24;
2699 else if (cachep->objsize > 256)
2700 limit = 54;
2701 else
2702 limit = 120;
2703
2704 /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound
2705 * allocation behaviour: Most allocs on one cpu, most free operations
2706 * on another cpu. For these cases, an efficient object passing between
2707 * cpus is necessary. This is provided by a shared array. The array
2708 * replaces Bonwick's magazine layer.
2709 * On uniprocessor, it's functionally equivalent (but less efficient)
2710 * to a larger limit. Thus disabled by default.
2711 */
2712 shared = 0;
2713#ifdef CONFIG_SMP
2714 if (cachep->objsize <= PAGE_SIZE)
2715 shared = 8;
2716#endif
2717
2718#if DEBUG
2719 /* With debugging enabled, large batchcount lead to excessively
2720 * long periods with disabled local interrupts. Limit the
2721 * batchcount
2722 */
2723 if (limit > 32)
2724 limit = 32;
2725#endif
2726 err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared);
2727 if (err)
2728 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
2729 cachep->name, -err);
2730}
2731
2732static void drain_array_locked(kmem_cache_t *cachep,
2733 struct array_cache *ac, int force)
2734{
2735 int tofree;
2736
2737 check_spinlock_acquired(cachep);
2738 if (ac->touched && !force) {
2739 ac->touched = 0;
2740 } else if (ac->avail) {
2741 tofree = force ? ac->avail : (ac->limit+4)/5;
2742 if (tofree > ac->avail) {
2743 tofree = (ac->avail+1)/2;
2744 }
2745 free_block(cachep, ac_entry(ac), tofree);
2746 ac->avail -= tofree;
2747 memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
2748 sizeof(void*)*ac->avail);
2749 }
2750}
2751
2752/**
2753 * cache_reap - Reclaim memory from caches.
2754 *
2755 * Called from workqueue/eventd every few seconds.
2756 * Purpose:
2757 * - clear the per-cpu caches for this CPU.
2758 * - return freeable pages to the main free memory pool.
2759 *
2760 * If we cannot acquire the cache chain semaphore then just give up - we'll
2761 * try again on the next iteration.
2762 */
2763static void cache_reap(void *unused)
2764{
2765 struct list_head *walk;
2766
2767 if (down_trylock(&cache_chain_sem)) {
2768 /* Give up. Setup the next iteration. */
2769 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
2770 return;
2771 }
2772
2773 list_for_each(walk, &cache_chain) {
2774 kmem_cache_t *searchp;
2775 struct list_head* p;
2776 int tofree;
2777 struct slab *slabp;
2778
2779 searchp = list_entry(walk, kmem_cache_t, next);
2780
2781 if (searchp->flags & SLAB_NO_REAP)
2782 goto next;
2783
2784 check_irq_on();
2785
2786 spin_lock_irq(&searchp->spinlock);
2787
2788 drain_array_locked(searchp, ac_data(searchp), 0);
2789
2790 if(time_after(searchp->lists.next_reap, jiffies))
2791 goto next_unlock;
2792
2793 searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3;
2794
2795 if (searchp->lists.shared)
2796 drain_array_locked(searchp, searchp->lists.shared, 0);
2797
2798 if (searchp->lists.free_touched) {
2799 searchp->lists.free_touched = 0;
2800 goto next_unlock;
2801 }
2802
2803 tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
2804 do {
2805 p = list3_data(searchp)->slabs_free.next;
2806 if (p == &(list3_data(searchp)->slabs_free))
2807 break;
2808
2809 slabp = list_entry(p, struct slab, list);
2810 BUG_ON(slabp->inuse);
2811 list_del(&slabp->list);
2812 STATS_INC_REAPED(searchp);
2813
2814 /* Safe to drop the lock. The slab is no longer
2815 * linked to the cache.
2816 * searchp cannot disappear, we hold
2817 * cache_chain_lock
2818 */
2819 searchp->lists.free_objects -= searchp->num;
2820 spin_unlock_irq(&searchp->spinlock);
2821 slab_destroy(searchp, slabp);
2822 spin_lock_irq(&searchp->spinlock);
2823 } while(--tofree > 0);
2824next_unlock:
2825 spin_unlock_irq(&searchp->spinlock);
2826next:
2827 cond_resched();
2828 }
2829 check_irq_on();
2830 up(&cache_chain_sem);
2831 /* Setup the next iteration */
2832 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
2833}
2834
2835#ifdef CONFIG_PROC_FS
2836
2837static void *s_start(struct seq_file *m, loff_t *pos)
2838{
2839 loff_t n = *pos;
2840 struct list_head *p;
2841
2842 down(&cache_chain_sem);
2843 if (!n) {
2844 /*
2845 * Output format version, so at least we can change it
2846 * without _too_ many complaints.
2847 */
2848#if STATS
2849 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
2850#else
2851 seq_puts(m, "slabinfo - version: 2.1\n");
2852#endif
2853 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
2854 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
2855 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
2856#if STATS
2857 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
2858 " <error> <maxfreeable> <freelimit> <nodeallocs>");
2859 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
2860#endif
2861 seq_putc(m, '\n');
2862 }
2863 p = cache_chain.next;
2864 while (n--) {
2865 p = p->next;
2866 if (p == &cache_chain)
2867 return NULL;
2868 }
2869 return list_entry(p, kmem_cache_t, next);
2870}
2871
2872static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2873{
2874 kmem_cache_t *cachep = p;
2875 ++*pos;
2876 return cachep->next.next == &cache_chain ? NULL
2877 : list_entry(cachep->next.next, kmem_cache_t, next);
2878}
2879
2880static void s_stop(struct seq_file *m, void *p)
2881{
2882 up(&cache_chain_sem);
2883}
2884
2885static int s_show(struct seq_file *m, void *p)
2886{
2887 kmem_cache_t *cachep = p;
2888 struct list_head *q;
2889 struct slab *slabp;
2890 unsigned long active_objs;
2891 unsigned long num_objs;
2892 unsigned long active_slabs = 0;
2893 unsigned long num_slabs;
2894 const char *name;
2895 char *error = NULL;
2896
2897 check_irq_on();
2898 spin_lock_irq(&cachep->spinlock);
2899 active_objs = 0;
2900 num_slabs = 0;
2901 list_for_each(q,&cachep->lists.slabs_full) {
2902 slabp = list_entry(q, struct slab, list);
2903 if (slabp->inuse != cachep->num && !error)
2904 error = "slabs_full accounting error";
2905 active_objs += cachep->num;
2906 active_slabs++;
2907 }
2908 list_for_each(q,&cachep->lists.slabs_partial) {
2909 slabp = list_entry(q, struct slab, list);
2910 if (slabp->inuse == cachep->num && !error)
2911 error = "slabs_partial inuse accounting error";
2912 if (!slabp->inuse && !error)
2913 error = "slabs_partial/inuse accounting error";
2914 active_objs += slabp->inuse;
2915 active_slabs++;
2916 }
2917 list_for_each(q,&cachep->lists.slabs_free) {
2918 slabp = list_entry(q, struct slab, list);
2919 if (slabp->inuse && !error)
2920 error = "slabs_free/inuse accounting error";
2921 num_slabs++;
2922 }
2923 num_slabs+=active_slabs;
2924 num_objs = num_slabs*cachep->num;
2925 if (num_objs - active_objs != cachep->lists.free_objects && !error)
2926 error = "free_objects accounting error";
2927
2928 name = cachep->name;
2929 if (error)
2930 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
2931
2932 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
2933 name, active_objs, num_objs, cachep->objsize,
2934 cachep->num, (1<<cachep->gfporder));
2935 seq_printf(m, " : tunables %4u %4u %4u",
2936 cachep->limit, cachep->batchcount,
2937 cachep->lists.shared->limit/cachep->batchcount);
2938 seq_printf(m, " : slabdata %6lu %6lu %6u",
2939 active_slabs, num_slabs, cachep->lists.shared->avail);
2940#if STATS
2941 { /* list3 stats */
2942 unsigned long high = cachep->high_mark;
2943 unsigned long allocs = cachep->num_allocations;
2944 unsigned long grown = cachep->grown;
2945 unsigned long reaped = cachep->reaped;
2946 unsigned long errors = cachep->errors;
2947 unsigned long max_freeable = cachep->max_freeable;
2948 unsigned long free_limit = cachep->free_limit;
2949 unsigned long node_allocs = cachep->node_allocs;
2950
2951 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu",
2952 allocs, high, grown, reaped, errors,
2953 max_freeable, free_limit, node_allocs);
2954 }
2955 /* cpu stats */
2956 {
2957 unsigned long allochit = atomic_read(&cachep->allochit);
2958 unsigned long allocmiss = atomic_read(&cachep->allocmiss);
2959 unsigned long freehit = atomic_read(&cachep->freehit);
2960 unsigned long freemiss = atomic_read(&cachep->freemiss);
2961
2962 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
2963 allochit, allocmiss, freehit, freemiss);
2964 }
2965#endif
2966 seq_putc(m, '\n');
2967 spin_unlock_irq(&cachep->spinlock);
2968 return 0;
2969}
2970
2971/*
2972 * slabinfo_op - iterator that generates /proc/slabinfo
2973 *
2974 * Output layout:
2975 * cache-name
2976 * num-active-objs
2977 * total-objs
2978 * object size
2979 * num-active-slabs
2980 * total-slabs
2981 * num-pages-per-slab
2982 * + further values on SMP and with statistics enabled
2983 */
2984
2985struct seq_operations slabinfo_op = {
2986 .start = s_start,
2987 .next = s_next,
2988 .stop = s_stop,
2989 .show = s_show,
2990};
2991
2992#define MAX_SLABINFO_WRITE 128
2993/**
2994 * slabinfo_write - Tuning for the slab allocator
2995 * @file: unused
2996 * @buffer: user buffer
2997 * @count: data length
2998 * @ppos: unused
2999 */
3000ssize_t slabinfo_write(struct file *file, const char __user *buffer,
3001 size_t count, loff_t *ppos)
3002{
3003 char kbuf[MAX_SLABINFO_WRITE+1], *tmp;
3004 int limit, batchcount, shared, res;
3005 struct list_head *p;
3006
3007 if (count > MAX_SLABINFO_WRITE)
3008 return -EINVAL;
3009 if (copy_from_user(&kbuf, buffer, count))
3010 return -EFAULT;
3011 kbuf[MAX_SLABINFO_WRITE] = '\0';
3012
3013 tmp = strchr(kbuf, ' ');
3014 if (!tmp)
3015 return -EINVAL;
3016 *tmp = '\0';
3017 tmp++;
3018 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
3019 return -EINVAL;
3020
3021 /* Find the cache in the chain of caches. */
3022 down(&cache_chain_sem);
3023 res = -EINVAL;
3024 list_for_each(p,&cache_chain) {
3025 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
3026
3027 if (!strcmp(cachep->name, kbuf)) {
3028 if (limit < 1 ||
3029 batchcount < 1 ||
3030 batchcount > limit ||
3031 shared < 0) {
3032 res = -EINVAL;
3033 } else {
3034 res = do_tune_cpucache(cachep, limit, batchcount, shared);
3035 }
3036 break;
3037 }
3038 }
3039 up(&cache_chain_sem);
3040 if (res >= 0)
3041 res = count;
3042 return res;
3043}
3044#endif
3045
3046unsigned int ksize(const void *objp)
3047{
3048 kmem_cache_t *c;
3049 unsigned long flags;
3050 unsigned int size = 0;
3051
3052 if (likely(objp != NULL)) {
3053 local_irq_save(flags);
3054 c = GET_PAGE_CACHE(virt_to_page(objp));
3055 size = kmem_cache_size(c);
3056 local_irq_restore(flags);
3057 }
3058
3059 return size;
3060}
diff --git a/mm/swap.c b/mm/swap.c
new file mode 100644
index 00000000000..7771d2803f6
--- /dev/null
+++ b/mm/swap.c
@@ -0,0 +1,485 @@
1/*
2 * linux/mm/swap.c
3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 */
6
7/*
8 * This file contains the default values for the operation of the
9 * Linux VM subsystem. Fine-tuning documentation can be found in
10 * Documentation/sysctl/vm.txt.
11 * Started 18.12.91
12 * Swap aging added 23.2.95, Stephen Tweedie.
13 * Buffermem limits added 12.3.98, Rik van Riel.
14 */
15
16#include <linux/mm.h>
17#include <linux/sched.h>
18#include <linux/kernel_stat.h>
19#include <linux/swap.h>
20#include <linux/mman.h>
21#include <linux/pagemap.h>
22#include <linux/pagevec.h>
23#include <linux/init.h>
24#include <linux/module.h>
25#include <linux/mm_inline.h>
26#include <linux/buffer_head.h> /* for try_to_release_page() */
27#include <linux/module.h>
28#include <linux/percpu_counter.h>
29#include <linux/percpu.h>
30#include <linux/cpu.h>
31#include <linux/notifier.h>
32#include <linux/init.h>
33
34/* How many pages do we try to swap or page in/out together? */
35int page_cluster;
36
37#ifdef CONFIG_HUGETLB_PAGE
38
39void put_page(struct page *page)
40{
41 if (unlikely(PageCompound(page))) {
42 page = (struct page *)page->private;
43 if (put_page_testzero(page)) {
44 void (*dtor)(struct page *page);
45
46 dtor = (void (*)(struct page *))page[1].mapping;
47 (*dtor)(page);
48 }
49 return;
50 }
51 if (!PageReserved(page) && put_page_testzero(page))
52 __page_cache_release(page);
53}
54EXPORT_SYMBOL(put_page);
55#endif
56
57/*
58 * Writeback is about to end against a page which has been marked for immediate
59 * reclaim. If it still appears to be reclaimable, move it to the tail of the
60 * inactive list. The page still has PageWriteback set, which will pin it.
61 *
62 * We don't expect many pages to come through here, so don't bother batching
63 * things up.
64 *
65 * To avoid placing the page at the tail of the LRU while PG_writeback is still
66 * set, this function will clear PG_writeback before performing the page
67 * motion. Do that inside the lru lock because once PG_writeback is cleared
68 * we may not touch the page.
69 *
70 * Returns zero if it cleared PG_writeback.
71 */
72int rotate_reclaimable_page(struct page *page)
73{
74 struct zone *zone;
75 unsigned long flags;
76
77 if (PageLocked(page))
78 return 1;
79 if (PageDirty(page))
80 return 1;
81 if (PageActive(page))
82 return 1;
83 if (!PageLRU(page))
84 return 1;
85
86 zone = page_zone(page);
87 spin_lock_irqsave(&zone->lru_lock, flags);
88 if (PageLRU(page) && !PageActive(page)) {
89 list_del(&page->lru);
90 list_add_tail(&page->lru, &zone->inactive_list);
91 inc_page_state(pgrotated);
92 }
93 if (!test_clear_page_writeback(page))
94 BUG();
95 spin_unlock_irqrestore(&zone->lru_lock, flags);
96 return 0;
97}
98
99/*
100 * FIXME: speed this up?
101 */
102void fastcall activate_page(struct page *page)
103{
104 struct zone *zone = page_zone(page);
105
106 spin_lock_irq(&zone->lru_lock);
107 if (PageLRU(page) && !PageActive(page)) {
108 del_page_from_inactive_list(zone, page);
109 SetPageActive(page);
110 add_page_to_active_list(zone, page);
111 inc_page_state(pgactivate);
112 }
113 spin_unlock_irq(&zone->lru_lock);
114}
115
116/*
117 * Mark a page as having seen activity.
118 *
119 * inactive,unreferenced -> inactive,referenced
120 * inactive,referenced -> active,unreferenced
121 * active,unreferenced -> active,referenced
122 */
123void fastcall mark_page_accessed(struct page *page)
124{
125 if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
126 activate_page(page);
127 ClearPageReferenced(page);
128 } else if (!PageReferenced(page)) {
129 SetPageReferenced(page);
130 }
131}
132
133EXPORT_SYMBOL(mark_page_accessed);
134
135/**
136 * lru_cache_add: add a page to the page lists
137 * @page: the page to add
138 */
139static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
140static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
141
142void fastcall lru_cache_add(struct page *page)
143{
144 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
145
146 page_cache_get(page);
147 if (!pagevec_add(pvec, page))
148 __pagevec_lru_add(pvec);
149 put_cpu_var(lru_add_pvecs);
150}
151
152void fastcall lru_cache_add_active(struct page *page)
153{
154 struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
155
156 page_cache_get(page);
157 if (!pagevec_add(pvec, page))
158 __pagevec_lru_add_active(pvec);
159 put_cpu_var(lru_add_active_pvecs);
160}
161
162void lru_add_drain(void)
163{
164 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
165
166 if (pagevec_count(pvec))
167 __pagevec_lru_add(pvec);
168 pvec = &__get_cpu_var(lru_add_active_pvecs);
169 if (pagevec_count(pvec))
170 __pagevec_lru_add_active(pvec);
171 put_cpu_var(lru_add_pvecs);
172}
173
174/*
175 * This path almost never happens for VM activity - pages are normally
176 * freed via pagevecs. But it gets used by networking.
177 */
178void fastcall __page_cache_release(struct page *page)
179{
180 unsigned long flags;
181 struct zone *zone = page_zone(page);
182
183 spin_lock_irqsave(&zone->lru_lock, flags);
184 if (TestClearPageLRU(page))
185 del_page_from_lru(zone, page);
186 if (page_count(page) != 0)
187 page = NULL;
188 spin_unlock_irqrestore(&zone->lru_lock, flags);
189 if (page)
190 free_hot_page(page);
191}
192
193EXPORT_SYMBOL(__page_cache_release);
194
195/*
196 * Batched page_cache_release(). Decrement the reference count on all the
197 * passed pages. If it fell to zero then remove the page from the LRU and
198 * free it.
199 *
200 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
201 * for the remainder of the operation.
202 *
203 * The locking in this function is against shrink_cache(): we recheck the
204 * page count inside the lock to see whether shrink_cache grabbed the page
205 * via the LRU. If it did, give up: shrink_cache will free it.
206 */
207void release_pages(struct page **pages, int nr, int cold)
208{
209 int i;
210 struct pagevec pages_to_free;
211 struct zone *zone = NULL;
212
213 pagevec_init(&pages_to_free, cold);
214 for (i = 0; i < nr; i++) {
215 struct page *page = pages[i];
216 struct zone *pagezone;
217
218 if (PageReserved(page) || !put_page_testzero(page))
219 continue;
220
221 pagezone = page_zone(page);
222 if (pagezone != zone) {
223 if (zone)
224 spin_unlock_irq(&zone->lru_lock);
225 zone = pagezone;
226 spin_lock_irq(&zone->lru_lock);
227 }
228 if (TestClearPageLRU(page))
229 del_page_from_lru(zone, page);
230 if (page_count(page) == 0) {
231 if (!pagevec_add(&pages_to_free, page)) {
232 spin_unlock_irq(&zone->lru_lock);
233 __pagevec_free(&pages_to_free);
234 pagevec_reinit(&pages_to_free);
235 zone = NULL; /* No lock is held */
236 }
237 }
238 }
239 if (zone)
240 spin_unlock_irq(&zone->lru_lock);
241
242 pagevec_free(&pages_to_free);
243}
244
245/*
246 * The pages which we're about to release may be in the deferred lru-addition
247 * queues. That would prevent them from really being freed right now. That's
248 * OK from a correctness point of view but is inefficient - those pages may be
249 * cache-warm and we want to give them back to the page allocator ASAP.
250 *
251 * So __pagevec_release() will drain those queues here. __pagevec_lru_add()
252 * and __pagevec_lru_add_active() call release_pages() directly to avoid
253 * mutual recursion.
254 */
255void __pagevec_release(struct pagevec *pvec)
256{
257 lru_add_drain();
258 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
259 pagevec_reinit(pvec);
260}
261
262/*
263 * pagevec_release() for pages which are known to not be on the LRU
264 *
265 * This function reinitialises the caller's pagevec.
266 */
267void __pagevec_release_nonlru(struct pagevec *pvec)
268{
269 int i;
270 struct pagevec pages_to_free;
271
272 pagevec_init(&pages_to_free, pvec->cold);
273 pages_to_free.cold = pvec->cold;
274 for (i = 0; i < pagevec_count(pvec); i++) {
275 struct page *page = pvec->pages[i];
276
277 BUG_ON(PageLRU(page));
278 if (put_page_testzero(page))
279 pagevec_add(&pages_to_free, page);
280 }
281 pagevec_free(&pages_to_free);
282 pagevec_reinit(pvec);
283}
284
285/*
286 * Add the passed pages to the LRU, then drop the caller's refcount
287 * on them. Reinitialises the caller's pagevec.
288 */
289void __pagevec_lru_add(struct pagevec *pvec)
290{
291 int i;
292 struct zone *zone = NULL;
293
294 for (i = 0; i < pagevec_count(pvec); i++) {
295 struct page *page = pvec->pages[i];
296 struct zone *pagezone = page_zone(page);
297
298 if (pagezone != zone) {
299 if (zone)
300 spin_unlock_irq(&zone->lru_lock);
301 zone = pagezone;
302 spin_lock_irq(&zone->lru_lock);
303 }
304 if (TestSetPageLRU(page))
305 BUG();
306 add_page_to_inactive_list(zone, page);
307 }
308 if (zone)
309 spin_unlock_irq(&zone->lru_lock);
310 release_pages(pvec->pages, pvec->nr, pvec->cold);
311 pagevec_reinit(pvec);
312}
313
314EXPORT_SYMBOL(__pagevec_lru_add);
315
316void __pagevec_lru_add_active(struct pagevec *pvec)
317{
318 int i;
319 struct zone *zone = NULL;
320
321 for (i = 0; i < pagevec_count(pvec); i++) {
322 struct page *page = pvec->pages[i];
323 struct zone *pagezone = page_zone(page);
324
325 if (pagezone != zone) {
326 if (zone)
327 spin_unlock_irq(&zone->lru_lock);
328 zone = pagezone;
329 spin_lock_irq(&zone->lru_lock);
330 }
331 if (TestSetPageLRU(page))
332 BUG();
333 if (TestSetPageActive(page))
334 BUG();
335 add_page_to_active_list(zone, page);
336 }
337 if (zone)
338 spin_unlock_irq(&zone->lru_lock);
339 release_pages(pvec->pages, pvec->nr, pvec->cold);
340 pagevec_reinit(pvec);
341}
342
343/*
344 * Try to drop buffers from the pages in a pagevec
345 */
346void pagevec_strip(struct pagevec *pvec)
347{
348 int i;
349
350 for (i = 0; i < pagevec_count(pvec); i++) {
351 struct page *page = pvec->pages[i];
352
353 if (PagePrivate(page) && !TestSetPageLocked(page)) {
354 try_to_release_page(page, 0);
355 unlock_page(page);
356 }
357 }
358}
359
360/**
361 * pagevec_lookup - gang pagecache lookup
362 * @pvec: Where the resulting pages are placed
363 * @mapping: The address_space to search
364 * @start: The starting page index
365 * @nr_pages: The maximum number of pages
366 *
367 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
368 * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a
369 * reference against the pages in @pvec.
370 *
371 * The search returns a group of mapping-contiguous pages with ascending
372 * indexes. There may be holes in the indices due to not-present pages.
373 *
374 * pagevec_lookup() returns the number of pages which were found.
375 */
376unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
377 pgoff_t start, unsigned nr_pages)
378{
379 pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
380 return pagevec_count(pvec);
381}
382
383unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
384 pgoff_t *index, int tag, unsigned nr_pages)
385{
386 pvec->nr = find_get_pages_tag(mapping, index, tag,
387 nr_pages, pvec->pages);
388 return pagevec_count(pvec);
389}
390
391
392#ifdef CONFIG_SMP
393/*
394 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
395 * CPUs
396 */
397#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
398
399static DEFINE_PER_CPU(long, committed_space) = 0;
400
401void vm_acct_memory(long pages)
402{
403 long *local;
404
405 preempt_disable();
406 local = &__get_cpu_var(committed_space);
407 *local += pages;
408 if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
409 atomic_add(*local, &vm_committed_space);
410 *local = 0;
411 }
412 preempt_enable();
413}
414EXPORT_SYMBOL(vm_acct_memory);
415
416#ifdef CONFIG_HOTPLUG_CPU
417static void lru_drain_cache(unsigned int cpu)
418{
419 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
420
421 /* CPU is dead, so no locking needed. */
422 if (pagevec_count(pvec))
423 __pagevec_lru_add(pvec);
424 pvec = &per_cpu(lru_add_active_pvecs, cpu);
425 if (pagevec_count(pvec))
426 __pagevec_lru_add_active(pvec);
427}
428
429/* Drop the CPU's cached committed space back into the central pool. */
430static int cpu_swap_callback(struct notifier_block *nfb,
431 unsigned long action,
432 void *hcpu)
433{
434 long *committed;
435
436 committed = &per_cpu(committed_space, (long)hcpu);
437 if (action == CPU_DEAD) {
438 atomic_add(*committed, &vm_committed_space);
439 *committed = 0;
440 lru_drain_cache((long)hcpu);
441 }
442 return NOTIFY_OK;
443}
444#endif /* CONFIG_HOTPLUG_CPU */
445#endif /* CONFIG_SMP */
446
447#ifdef CONFIG_SMP
448void percpu_counter_mod(struct percpu_counter *fbc, long amount)
449{
450 long count;
451 long *pcount;
452 int cpu = get_cpu();
453
454 pcount = per_cpu_ptr(fbc->counters, cpu);
455 count = *pcount + amount;
456 if (count >= FBC_BATCH || count <= -FBC_BATCH) {
457 spin_lock(&fbc->lock);
458 fbc->count += count;
459 spin_unlock(&fbc->lock);
460 count = 0;
461 }
462 *pcount = count;
463 put_cpu();
464}
465EXPORT_SYMBOL(percpu_counter_mod);
466#endif
467
468/*
469 * Perform any setup for the swap system
470 */
471void __init swap_setup(void)
472{
473 unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
474
475 /* Use a smaller cluster for small-memory machines */
476 if (megs < 16)
477 page_cluster = 2;
478 else
479 page_cluster = 3;
480 /*
481 * Right now other parts of the system means that we
482 * _really_ don't want to cluster much more
483 */
484 hotcpu_notifier(cpu_swap_callback, 0);
485}
diff --git a/mm/swap_state.c b/mm/swap_state.c
new file mode 100644
index 00000000000..a063a902ed0
--- /dev/null
+++ b/mm/swap_state.c
@@ -0,0 +1,382 @@
1/*
2 * linux/mm/swap_state.c
3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * Swap reorganised 29.12.95, Stephen Tweedie
6 *
7 * Rewritten to use page cache, (C) 1998 Stephen Tweedie
8 */
9#include <linux/module.h>
10#include <linux/mm.h>
11#include <linux/kernel_stat.h>
12#include <linux/swap.h>
13#include <linux/init.h>
14#include <linux/pagemap.h>
15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h>
17
18#include <asm/pgtable.h>
19
20/*
21 * swapper_space is a fiction, retained to simplify the path through
22 * vmscan's shrink_list, to make sync_page look nicer, and to allow
23 * future use of radix_tree tags in the swap cache.
24 */
25static struct address_space_operations swap_aops = {
26 .writepage = swap_writepage,
27 .sync_page = block_sync_page,
28 .set_page_dirty = __set_page_dirty_nobuffers,
29};
30
31static struct backing_dev_info swap_backing_dev_info = {
32 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
33 .unplug_io_fn = swap_unplug_io_fn,
34};
35
36struct address_space swapper_space = {
37 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
38 .tree_lock = RW_LOCK_UNLOCKED,
39 .a_ops = &swap_aops,
40 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
41 .backing_dev_info = &swap_backing_dev_info,
42};
43EXPORT_SYMBOL(swapper_space);
44
45#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
46
47static struct {
48 unsigned long add_total;
49 unsigned long del_total;
50 unsigned long find_success;
51 unsigned long find_total;
52 unsigned long noent_race;
53 unsigned long exist_race;
54} swap_cache_info;
55
56void show_swap_cache_info(void)
57{
58 printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
59 swap_cache_info.add_total, swap_cache_info.del_total,
60 swap_cache_info.find_success, swap_cache_info.find_total,
61 swap_cache_info.noent_race, swap_cache_info.exist_race);
62 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
63 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
64}
65
66/*
67 * __add_to_swap_cache resembles add_to_page_cache on swapper_space,
68 * but sets SwapCache flag and private instead of mapping and index.
69 */
70static int __add_to_swap_cache(struct page *page,
71 swp_entry_t entry, int gfp_mask)
72{
73 int error;
74
75 BUG_ON(PageSwapCache(page));
76 BUG_ON(PagePrivate(page));
77 error = radix_tree_preload(gfp_mask);
78 if (!error) {
79 write_lock_irq(&swapper_space.tree_lock);
80 error = radix_tree_insert(&swapper_space.page_tree,
81 entry.val, page);
82 if (!error) {
83 page_cache_get(page);
84 SetPageLocked(page);
85 SetPageSwapCache(page);
86 page->private = entry.val;
87 total_swapcache_pages++;
88 pagecache_acct(1);
89 }
90 write_unlock_irq(&swapper_space.tree_lock);
91 radix_tree_preload_end();
92 }
93 return error;
94}
95
96static int add_to_swap_cache(struct page *page, swp_entry_t entry)
97{
98 int error;
99
100 if (!swap_duplicate(entry)) {
101 INC_CACHE_INFO(noent_race);
102 return -ENOENT;
103 }
104 error = __add_to_swap_cache(page, entry, GFP_KERNEL);
105 /*
106 * Anon pages are already on the LRU, we don't run lru_cache_add here.
107 */
108 if (error) {
109 swap_free(entry);
110 if (error == -EEXIST)
111 INC_CACHE_INFO(exist_race);
112 return error;
113 }
114 INC_CACHE_INFO(add_total);
115 return 0;
116}
117
118/*
119 * This must be called only on pages that have
120 * been verified to be in the swap cache.
121 */
122void __delete_from_swap_cache(struct page *page)
123{
124 BUG_ON(!PageLocked(page));
125 BUG_ON(!PageSwapCache(page));
126 BUG_ON(PageWriteback(page));
127
128 radix_tree_delete(&swapper_space.page_tree, page->private);
129 page->private = 0;
130 ClearPageSwapCache(page);
131 total_swapcache_pages--;
132 pagecache_acct(-1);
133 INC_CACHE_INFO(del_total);
134}
135
136/**
137 * add_to_swap - allocate swap space for a page
138 * @page: page we want to move to swap
139 *
140 * Allocate swap space for the page and add the page to the
141 * swap cache. Caller needs to hold the page lock.
142 */
143int add_to_swap(struct page * page)
144{
145 swp_entry_t entry;
146 int pf_flags;
147 int err;
148
149 if (!PageLocked(page))
150 BUG();
151
152 for (;;) {
153 entry = get_swap_page();
154 if (!entry.val)
155 return 0;
156
157 /* Radix-tree node allocations are performing
158 * GFP_ATOMIC allocations under PF_MEMALLOC.
159 * They can completely exhaust the page allocator.
160 *
161 * So PF_MEMALLOC is dropped here. This causes the slab
162 * allocations to fail earlier, so radix-tree nodes will
163 * then be allocated from the mempool reserves.
164 *
165 * We're still using __GFP_HIGH for radix-tree node
166 * allocations, so some of the emergency pools are available,
167 * just not all of them.
168 */
169
170 pf_flags = current->flags;
171 current->flags &= ~PF_MEMALLOC;
172
173 /*
174 * Add it to the swap cache and mark it dirty
175 */
176 err = __add_to_swap_cache(page, entry, GFP_ATOMIC|__GFP_NOWARN);
177
178 if (pf_flags & PF_MEMALLOC)
179 current->flags |= PF_MEMALLOC;
180
181 switch (err) {
182 case 0: /* Success */
183 SetPageUptodate(page);
184 SetPageDirty(page);
185 INC_CACHE_INFO(add_total);
186 return 1;
187 case -EEXIST:
188 /* Raced with "speculative" read_swap_cache_async */
189 INC_CACHE_INFO(exist_race);
190 swap_free(entry);
191 continue;
192 default:
193 /* -ENOMEM radix-tree allocation failure */
194 swap_free(entry);
195 return 0;
196 }
197 }
198}
199
200/*
201 * This must be called only on pages that have
202 * been verified to be in the swap cache and locked.
203 * It will never put the page into the free list,
204 * the caller has a reference on the page.
205 */
206void delete_from_swap_cache(struct page *page)
207{
208 swp_entry_t entry;
209
210 BUG_ON(!PageSwapCache(page));
211 BUG_ON(!PageLocked(page));
212 BUG_ON(PageWriteback(page));
213 BUG_ON(PagePrivate(page));
214
215 entry.val = page->private;
216
217 write_lock_irq(&swapper_space.tree_lock);
218 __delete_from_swap_cache(page);
219 write_unlock_irq(&swapper_space.tree_lock);
220
221 swap_free(entry);
222 page_cache_release(page);
223}
224
225/*
226 * Strange swizzling function only for use by shmem_writepage
227 */
228int move_to_swap_cache(struct page *page, swp_entry_t entry)
229{
230 int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
231 if (!err) {
232 remove_from_page_cache(page);
233 page_cache_release(page); /* pagecache ref */
234 if (!swap_duplicate(entry))
235 BUG();
236 SetPageDirty(page);
237 INC_CACHE_INFO(add_total);
238 } else if (err == -EEXIST)
239 INC_CACHE_INFO(exist_race);
240 return err;
241}
242
243/*
244 * Strange swizzling function for shmem_getpage (and shmem_unuse)
245 */
246int move_from_swap_cache(struct page *page, unsigned long index,
247 struct address_space *mapping)
248{
249 int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
250 if (!err) {
251 delete_from_swap_cache(page);
252 /* shift page from clean_pages to dirty_pages list */
253 ClearPageDirty(page);
254 set_page_dirty(page);
255 }
256 return err;
257}
258
/*
 * If we are the only user, then try to free up the swap cache.
 *
 * Its ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * exclusive_swap_page() _with_ the lock.
 * 					- Marcelo
 *
 * Nothing is done if the page lock cannot be taken without waiting.
 */
static inline void free_swap_cache(struct page *page)
{
	if (!PageSwapCache(page))
		return;
	if (TestSetPageLocked(page))
		return;

	remove_exclusive_swap_page(page);
	unlock_page(page);
}
274
/*
 * Release one reference on @page, first trying to free any swap cache
 * associated with it if this is the last user of the page.  Can not do
 * a lock_page, as we are holding the page_table_lock spinlock.
 */
void free_page_and_swap_cache(struct page *page)
{
	/* Best effort only: free_swap_cache() backs off on a locked page. */
	free_swap_cache(page);

	page_cache_release(page);
}
285
/*
 * Passed an array of pages, drop them all from swapcache and then
 * release them.  They are removed from the LRU and freed if this is
 * their last use.  The array is worked through in chunks of 16 pages.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	const int chunk = 16;
	struct page **cursor = pages;
	int remaining = nr;

	lru_add_drain();
	while (remaining) {
		int batch = min(chunk, remaining);
		int idx;

		for (idx = 0; idx < batch; idx++)
			free_swap_cache(cursor[idx]);
		release_pages(cursor, batch, 0);

		cursor += batch;
		remaining -= batch;
	}
}
307
308/*
309 * Lookup a swap entry in the swap cache. A found page will be returned
310 * unlocked and with its refcount incremented - we rely on the kernel
311 * lock getting page table operations atomic even if we drop the page
312 * lock before returning.
313 */
314struct page * lookup_swap_cache(swp_entry_t entry)
315{
316 struct page *page;
317
318 page = find_get_page(&swapper_space, entry.val);
319
320 if (page)
321 INC_CACHE_INFO(find_success);
322
323 INC_CACHE_INFO(find_total);
324 return page;
325}
326
327/*
328 * Locate a page of swap in physical memory, reserving swap cache space
329 * and reading the disk if it is not already cached.
330 * A failure return means that either the page allocation failed or that
331 * the swap entry is no longer in use.
332 */
333struct page *read_swap_cache_async(swp_entry_t entry,
334 struct vm_area_struct *vma, unsigned long addr)
335{
336 struct page *found_page, *new_page = NULL;
337 int err;
338
339 do {
340 /*
341 * First check the swap cache. Since this is normally
342 * called after lookup_swap_cache() failed, re-calling
343 * that would confuse statistics.
344 */
345 found_page = find_get_page(&swapper_space, entry.val);
346 if (found_page)
347 break;
348
349 /*
350 * Get a new page to read into from swap.
351 */
352 if (!new_page) {
353 new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
354 if (!new_page)
355 break; /* Out of memory */
356 }
357
358 /*
359 * Associate the page with swap entry in the swap cache.
360 * May fail (-ENOENT) if swap entry has been freed since
361 * our caller observed it. May fail (-EEXIST) if there
362 * is already a page associated with this entry in the
363 * swap cache: added by a racing read_swap_cache_async,
364 * or by try_to_swap_out (or shmem_writepage) re-using
365 * the just freed swap entry for an existing page.
366 * May fail (-ENOMEM) if radix-tree node allocation failed.
367 */
368 err = add_to_swap_cache(new_page, entry);
369 if (!err) {
370 /*
371 * Initiate read into locked page and return.
372 */
373 lru_cache_add_active(new_page);
374 swap_readpage(NULL, new_page);
375 return new_page;
376 }
377 } while (err != -ENOENT && err != -ENOMEM);
378
379 if (new_page)
380 page_cache_release(new_page);
381 return found_page;
382}
diff --git a/mm/swapfile.c b/mm/swapfile.c
new file mode 100644
index 00000000000..a60e0075d55
--- /dev/null
+++ b/mm/swapfile.c
@@ -0,0 +1,1672 @@
1/*
2 * linux/mm/swapfile.c
3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * Swap reorganised 29.12.95, Stephen Tweedie
6 */
7
8#include <linux/config.h>
9#include <linux/mm.h>
10#include <linux/hugetlb.h>
11#include <linux/mman.h>
12#include <linux/slab.h>
13#include <linux/kernel_stat.h>
14#include <linux/swap.h>
15#include <linux/vmalloc.h>
16#include <linux/pagemap.h>
17#include <linux/namei.h>
18#include <linux/shm.h>
19#include <linux/blkdev.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/module.h>
25#include <linux/rmap.h>
26#include <linux/security.h>
27#include <linux/backing-dev.h>
28#include <linux/syscalls.h>
29
30#include <asm/pgtable.h>
31#include <asm/tlbflush.h>
32#include <linux/swapops.h>
33
34DEFINE_SPINLOCK(swaplock);
35unsigned int nr_swapfiles;
36long total_swap_pages;
37static int swap_overflow;
38
39EXPORT_SYMBOL(total_swap_pages);
40
41static const char Bad_file[] = "Bad swap file entry ";
42static const char Unused_file[] = "Unused swap file entry ";
43static const char Bad_offset[] = "Bad swap offset entry ";
44static const char Unused_offset[] = "Unused swap offset entry ";
45
46struct swap_list_t swap_list = {-1, -1};
47
48struct swap_info_struct swap_info[MAX_SWAPFILES];
49
50static DECLARE_MUTEX(swapon_sem);
51
52/*
53 * We need this because the bdev->unplug_fn can sleep and we cannot
54 * hold swap_list_lock while calling the unplug_fn. And swap_list_lock
55 * cannot be turned into a semaphore.
56 */
57static DECLARE_RWSEM(swap_unplug_sem);
58
59#define SWAPFILE_CLUSTER 256
60
61void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
62{
63 swp_entry_t entry;
64
65 down_read(&swap_unplug_sem);
66 entry.val = page->private;
67 if (PageSwapCache(page)) {
68 struct block_device *bdev = swap_info[swp_type(entry)].bdev;
69 struct backing_dev_info *bdi;
70
71 /*
72 * If the page is removed from swapcache from under us (with a
73 * racy try_to_unuse/swapoff) we need an additional reference
74 * count to avoid reading garbage from page->private above. If
75 * the WARN_ON triggers during a swapoff it maybe the race
76 * condition and it's harmless. However if it triggers without
77 * swapoff it signals a problem.
78 */
79 WARN_ON(page_count(page) <= 1);
80
81 bdi = bdev->bd_inode->i_mapping->backing_dev_info;
82 bdi->unplug_io_fn(bdi, page);
83 }
84 up_read(&swap_unplug_sem);
85}
86
87static inline int scan_swap_map(struct swap_info_struct *si)
88{
89 unsigned long offset;
90 /*
91 * We try to cluster swap pages by allocating them
92 * sequentially in swap. Once we've allocated
93 * SWAPFILE_CLUSTER pages this way, however, we resort to
94 * first-free allocation, starting a new cluster. This
95 * prevents us from scattering swap pages all over the entire
96 * swap partition, so that we reduce overall disk seek times
97 * between swap pages. -- sct */
98 if (si->cluster_nr) {
99 while (si->cluster_next <= si->highest_bit) {
100 offset = si->cluster_next++;
101 if (si->swap_map[offset])
102 continue;
103 si->cluster_nr--;
104 goto got_page;
105 }
106 }
107 si->cluster_nr = SWAPFILE_CLUSTER;
108
109 /* try to find an empty (even not aligned) cluster. */
110 offset = si->lowest_bit;
111 check_next_cluster:
112 if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
113 {
114 unsigned long nr;
115 for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
116 if (si->swap_map[nr])
117 {
118 offset = nr+1;
119 goto check_next_cluster;
120 }
121 /* We found a completly empty cluster, so start
122 * using it.
123 */
124 goto got_page;
125 }
126 /* No luck, so now go finegrined as usual. -Andrea */
127 for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
128 if (si->swap_map[offset])
129 continue;
130 si->lowest_bit = offset+1;
131 got_page:
132 if (offset == si->lowest_bit)
133 si->lowest_bit++;
134 if (offset == si->highest_bit)
135 si->highest_bit--;
136 if (si->lowest_bit > si->highest_bit) {
137 si->lowest_bit = si->max;
138 si->highest_bit = 0;
139 }
140 si->swap_map[offset] = 1;
141 si->inuse_pages++;
142 nr_swap_pages--;
143 si->cluster_next = offset+1;
144 return offset;
145 }
146 si->lowest_bit = si->max;
147 si->highest_bit = 0;
148 return 0;
149}
150
151swp_entry_t get_swap_page(void)
152{
153 struct swap_info_struct * p;
154 unsigned long offset;
155 swp_entry_t entry;
156 int type, wrapped = 0;
157
158 entry.val = 0; /* Out of memory */
159 swap_list_lock();
160 type = swap_list.next;
161 if (type < 0)
162 goto out;
163 if (nr_swap_pages <= 0)
164 goto out;
165
166 while (1) {
167 p = &swap_info[type];
168 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
169 swap_device_lock(p);
170 offset = scan_swap_map(p);
171 swap_device_unlock(p);
172 if (offset) {
173 entry = swp_entry(type,offset);
174 type = swap_info[type].next;
175 if (type < 0 ||
176 p->prio != swap_info[type].prio) {
177 swap_list.next = swap_list.head;
178 } else {
179 swap_list.next = type;
180 }
181 goto out;
182 }
183 }
184 type = p->next;
185 if (!wrapped) {
186 if (type < 0 || p->prio != swap_info[type].prio) {
187 type = swap_list.head;
188 wrapped = 1;
189 }
190 } else
191 if (type < 0)
192 goto out; /* out of swap space */
193 }
194out:
195 swap_list_unlock();
196 return entry;
197}
198
199static struct swap_info_struct * swap_info_get(swp_entry_t entry)
200{
201 struct swap_info_struct * p;
202 unsigned long offset, type;
203
204 if (!entry.val)
205 goto out;
206 type = swp_type(entry);
207 if (type >= nr_swapfiles)
208 goto bad_nofile;
209 p = & swap_info[type];
210 if (!(p->flags & SWP_USED))
211 goto bad_device;
212 offset = swp_offset(entry);
213 if (offset >= p->max)
214 goto bad_offset;
215 if (!p->swap_map[offset])
216 goto bad_free;
217 swap_list_lock();
218 if (p->prio > swap_info[swap_list.next].prio)
219 swap_list.next = type;
220 swap_device_lock(p);
221 return p;
222
223bad_free:
224 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
225 goto out;
226bad_offset:
227 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
228 goto out;
229bad_device:
230 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
231 goto out;
232bad_nofile:
233 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
234out:
235 return NULL;
236}
237
/*
 * Counterpart of swap_info_get(): drop the device lock of @p, then the
 * swap list lock.
 */
static void swap_info_put(struct swap_info_struct * p)
{
	swap_device_unlock(p);	/* inner lock first */
	swap_list_unlock();
}
243
244static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
245{
246 int count = p->swap_map[offset];
247
248 if (count < SWAP_MAP_MAX) {
249 count--;
250 p->swap_map[offset] = count;
251 if (!count) {
252 if (offset < p->lowest_bit)
253 p->lowest_bit = offset;
254 if (offset > p->highest_bit)
255 p->highest_bit = offset;
256 nr_swap_pages++;
257 p->inuse_pages--;
258 }
259 }
260 return count;
261}
262
263/*
264 * Caller has made sure that the swapdevice corresponding to entry
265 * is still around or has not been recycled.
266 */
267void swap_free(swp_entry_t entry)
268{
269 struct swap_info_struct * p;
270
271 p = swap_info_get(entry);
272 if (p) {
273 swap_entry_free(p, swp_offset(entry));
274 swap_info_put(p);
275 }
276}
277
278/*
279 * Check if we're the only user of a swap page,
280 * when the page is locked.
281 */
282static int exclusive_swap_page(struct page *page)
283{
284 int retval = 0;
285 struct swap_info_struct * p;
286 swp_entry_t entry;
287
288 entry.val = page->private;
289 p = swap_info_get(entry);
290 if (p) {
291 /* Is the only swap cache user the cache itself? */
292 if (p->swap_map[swp_offset(entry)] == 1) {
293 /* Recheck the page count with the swapcache lock held.. */
294 write_lock_irq(&swapper_space.tree_lock);
295 if (page_count(page) == 2)
296 retval = 1;
297 write_unlock_irq(&swapper_space.tree_lock);
298 }
299 swap_info_put(p);
300 }
301 return retval;
302}
303
/*
 * A swap cache entry can be used directly when there are no other
 * references to it.
 *
 * exclusive_swap_page() does the real work; the page count is examined
 * first so that its locks are only taken when the answer could be yes.
 * The page must be locked (BUG otherwise).  Returns 1 or 0.
 */
int can_share_swap_page(struct page *page)
{
	int count;

	if (!PageLocked(page))
		BUG();

	count = page_count(page);

	/* A single reference: shareable unless the page is reserved. */
	if (count == 1)
		return !PageReserved(page);

	/* Only counts of 2, or 3 with PagePrivate set, are candidates. */
	if (count != 2 && count != 3)
		return 0;
	if (count == 3 && !PagePrivate(page))
		return 0;

	if (!PageSwapCache(page))
		return 0;
	return exclusive_swap_page(page);
}
335
336/*
337 * Work out if there are any other processes sharing this
338 * swap cache page. Free it if you can. Return success.
339 */
340int remove_exclusive_swap_page(struct page *page)
341{
342 int retval;
343 struct swap_info_struct * p;
344 swp_entry_t entry;
345
346 BUG_ON(PagePrivate(page));
347 BUG_ON(!PageLocked(page));
348
349 if (!PageSwapCache(page))
350 return 0;
351 if (PageWriteback(page))
352 return 0;
353 if (page_count(page) != 2) /* 2: us + cache */
354 return 0;
355
356 entry.val = page->private;
357 p = swap_info_get(entry);
358 if (!p)
359 return 0;
360
361 /* Is the only swap cache user the cache itself? */
362 retval = 0;
363 if (p->swap_map[swp_offset(entry)] == 1) {
364 /* Recheck the page count with the swapcache lock held.. */
365 write_lock_irq(&swapper_space.tree_lock);
366 if ((page_count(page) == 2) && !PageWriteback(page)) {
367 __delete_from_swap_cache(page);
368 SetPageDirty(page);
369 retval = 1;
370 }
371 write_unlock_irq(&swapper_space.tree_lock);
372 }
373 swap_info_put(p);
374
375 if (retval) {
376 swap_free(entry);
377 page_cache_release(page);
378 }
379
380 return retval;
381}
382
383/*
384 * Free the swap entry like above, but also try to
385 * free the page cache entry if it is the last user.
386 */
387void free_swap_and_cache(swp_entry_t entry)
388{
389 struct swap_info_struct * p;
390 struct page *page = NULL;
391
392 p = swap_info_get(entry);
393 if (p) {
394 if (swap_entry_free(p, swp_offset(entry)) == 1)
395 page = find_trylock_page(&swapper_space, entry.val);
396 swap_info_put(p);
397 }
398 if (page) {
399 int one_user;
400
401 BUG_ON(PagePrivate(page));
402 page_cache_get(page);
403 one_user = (page_count(page) == 2);
404 /* Only cache user (+us), or swap space full? Free it! */
405 if (!PageWriteback(page) && (one_user || vm_swap_full())) {
406 delete_from_swap_cache(page);
407 SetPageDirty(page);
408 }
409 unlock_page(page);
410 page_cache_release(page);
411 }
412}
413
414/*
415 * Always set the resulting pte to be nowrite (the same as COW pages
416 * after one process has exited). We don't know just how many PTEs will
417 * share this swap entry, so be cautious and let do_wp_page work out
418 * what to do if a write is requested later.
419 *
420 * vma->vm_mm->page_table_lock is held.
421 */
422static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
423 unsigned long addr, swp_entry_t entry, struct page *page)
424{
425 inc_mm_counter(vma->vm_mm, rss);
426 get_page(page);
427 set_pte_at(vma->vm_mm, addr, pte,
428 pte_mkold(mk_pte(page, vma->vm_page_prot)));
429 page_add_anon_rmap(page, vma, addr);
430 swap_free(entry);
431 /*
432 * Move the page to the active list so it is not
433 * immediately swapped out again after swapon.
434 */
435 activate_page(page);
436}
437
438static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
439 unsigned long addr, unsigned long end,
440 swp_entry_t entry, struct page *page)
441{
442 pte_t *pte;
443 pte_t swp_pte = swp_entry_to_pte(entry);
444
445 pte = pte_offset_map(pmd, addr);
446 do {
447 /*
448 * swapoff spends a _lot_ of time in this loop!
449 * Test inline before going to call unuse_pte.
450 */
451 if (unlikely(pte_same(*pte, swp_pte))) {
452 unuse_pte(vma, pte, addr, entry, page);
453 pte_unmap(pte);
454 return 1;
455 }
456 } while (pte++, addr += PAGE_SIZE, addr != end);
457 pte_unmap(pte - 1);
458 return 0;
459}
460
461static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
462 unsigned long addr, unsigned long end,
463 swp_entry_t entry, struct page *page)
464{
465 pmd_t *pmd;
466 unsigned long next;
467
468 pmd = pmd_offset(pud, addr);
469 do {
470 next = pmd_addr_end(addr, end);
471 if (pmd_none_or_clear_bad(pmd))
472 continue;
473 if (unuse_pte_range(vma, pmd, addr, next, entry, page))
474 return 1;
475 } while (pmd++, addr = next, addr != end);
476 return 0;
477}
478
479static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
480 unsigned long addr, unsigned long end,
481 swp_entry_t entry, struct page *page)
482{
483 pud_t *pud;
484 unsigned long next;
485
486 pud = pud_offset(pgd, addr);
487 do {
488 next = pud_addr_end(addr, end);
489 if (pud_none_or_clear_bad(pud))
490 continue;
491 if (unuse_pmd_range(vma, pud, addr, next, entry, page))
492 return 1;
493 } while (pud++, addr = next, addr != end);
494 return 0;
495}
496
497static int unuse_vma(struct vm_area_struct *vma,
498 swp_entry_t entry, struct page *page)
499{
500 pgd_t *pgd;
501 unsigned long addr, end, next;
502
503 if (page->mapping) {
504 addr = page_address_in_vma(page, vma);
505 if (addr == -EFAULT)
506 return 0;
507 else
508 end = addr + PAGE_SIZE;
509 } else {
510 addr = vma->vm_start;
511 end = vma->vm_end;
512 }
513
514 pgd = pgd_offset(vma->vm_mm, addr);
515 do {
516 next = pgd_addr_end(addr, end);
517 if (pgd_none_or_clear_bad(pgd))
518 continue;
519 if (unuse_pud_range(vma, pgd, addr, next, entry, page))
520 return 1;
521 } while (pgd++, addr = next, addr != end);
522 return 0;
523}
524
525static int unuse_mm(struct mm_struct *mm,
526 swp_entry_t entry, struct page *page)
527{
528 struct vm_area_struct *vma;
529
530 if (!down_read_trylock(&mm->mmap_sem)) {
531 /*
532 * Our reference to the page stops try_to_unmap_one from
533 * unmapping its ptes, so swapoff can make progress.
534 */
535 unlock_page(page);
536 down_read(&mm->mmap_sem);
537 lock_page(page);
538 }
539 spin_lock(&mm->page_table_lock);
540 for (vma = mm->mmap; vma; vma = vma->vm_next) {
541 if (vma->anon_vma && unuse_vma(vma, entry, page))
542 break;
543 }
544 spin_unlock(&mm->page_table_lock);
545 up_read(&mm->mmap_sem);
546 /*
547 * Currently unuse_mm cannot fail, but leave error handling
548 * at call sites for now, since we change it from time to time.
549 */
550 return 0;
551}
552
553/*
554 * Scan swap_map from current position to next entry still in use.
555 * Recycle to start on reaching the end, returning 0 when empty.
556 */
557static int find_next_to_unuse(struct swap_info_struct *si, int prev)
558{
559 int max = si->max;
560 int i = prev;
561 int count;
562
563 /*
564 * No need for swap_device_lock(si) here: we're just looking
565 * for whether an entry is in use, not modifying it; false
566 * hits are okay, and sys_swapoff() has already prevented new
567 * allocations from this area (while holding swap_list_lock()).
568 */
569 for (;;) {
570 if (++i >= max) {
571 if (!prev) {
572 i = 0;
573 break;
574 }
575 /*
576 * No entries in use at top of swap_map,
577 * loop back to start and recheck there.
578 */
579 max = prev + 1;
580 prev = 0;
581 i = 1;
582 }
583 count = si->swap_map[i];
584 if (count && count != SWAP_MAP_BAD)
585 break;
586 }
587 return i;
588}
589
590/*
591 * We completely avoid races by reading each swap page in advance,
592 * and then search for the process using it. All the necessary
593 * page table adjustments can then be made atomically.
594 */
/*
 * try_to_unuse - drop every remaining reference to the in-use entries of
 * the swap area swap_info[type].
 *
 * For each entry reported by find_next_to_unuse() the page is obtained
 * with read_swap_cache_async() and locked; unuse_mm()/shmem_unuse() are
 * then applied to start_mm and to the mms reached through its mmlist for
 * as long as the swap_map count stays above 1.  Finally the page is
 * removed from the swap cache (or, for the shmem retry case, the count is
 * re-incremented).
 *
 * Returns 0 once find_next_to_unuse() reports nothing left, -EINTR when a
 * signal is pending, -ENOMEM when no page could be obtained for an entry
 * that is still in use, or a non-zero value returned by unuse_mm().
 */
595static int try_to_unuse(unsigned int type)
596{
597 struct swap_info_struct * si = &swap_info[type];
598 struct mm_struct *start_mm;
599 unsigned short *swap_map;
600 unsigned short swcount;
601 struct page *page;
602 swp_entry_t entry;
603 int i = 0;
604 int retval = 0;
605 int reset_overflow = 0;
606 int shmem;
607
608 /*
609 * When searching mms for an entry, a good strategy is to
610 * start at the first mm we freed the previous entry from
611 * (though actually we don't notice whether we or coincidence
612 * freed the entry). Initialize this start_mm with a hold.
613 *
614 * A simpler strategy would be to start at the last mm we
615 * freed the previous entry from; but that would take less
616 * advantage of mmlist ordering, which clusters forked mms
617 * together, child after parent. If we race with dup_mmap(), we
618 * prefer to resolve parent before child, lest we miss entries
619 * duplicated after we scanned child: using last mm would invert
620 * that. Though it's only a serious concern when an overflowed
621 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
622 */
623 start_mm = &init_mm;
624 atomic_inc(&init_mm.mm_users);
625
626 /*
627 * Keep on scanning until all entries have gone. Usually,
628 * one pass through swap_map is enough, but not necessarily:
629 * there are races when an instance of an entry might be missed.
630 */
631 while ((i = find_next_to_unuse(si, i)) != 0) {
632 if (signal_pending(current)) {
633 retval = -EINTR;
634 break;
635 }
636
637 /*
638 * Get a page for the entry, using the existing swap
639 * cache page if there is one. Otherwise, get a clean
640 * page and read the swap into it.
641 */
642 swap_map = &si->swap_map[i];
643 entry = swp_entry(type, i);
644 page = read_swap_cache_async(entry, NULL, 0);
645 if (!page) {
646 /*
647 * Either swap_duplicate() failed because entry
648 * has been freed independently, and will not be
649 * reused since sys_swapoff() already disabled
650 * allocation from here, or alloc_page() failed.
651 */
652 if (!*swap_map)
653 continue;
654 retval = -ENOMEM;
655 break;
656 }
657
658 /*
659 * Don't hold on to start_mm if it looks like exiting.
660 */
661 if (atomic_read(&start_mm->mm_users) == 1) {
662 mmput(start_mm);
663 start_mm = &init_mm;
664 atomic_inc(&init_mm.mm_users);
665 }
666
667 /*
668 * Wait for and lock page. When do_swap_page races with
669 * try_to_unuse, do_swap_page can handle the fault much
670 * faster than try_to_unuse can locate the entry. This
671 * apparently redundant "wait_on_page_locked" lets try_to_unuse
672 * defer to do_swap_page in such a case - in some tests,
673 * do_swap_page and try_to_unuse repeatedly compete.
674 */
675 wait_on_page_locked(page);
676 wait_on_page_writeback(page);
677 lock_page(page);
678 wait_on_page_writeback(page);
679
680 /*
681 * Remove all references to entry.
682 * Whenever we reach init_mm, there's no address space
683 * to search, but use it as a reminder to search shmem.
684 */
685 shmem = 0;
686 swcount = *swap_map;
687 if (swcount > 1) {
688 if (start_mm == &init_mm)
689 shmem = shmem_unuse(entry, page);
690 else
691 retval = unuse_mm(start_mm, entry, page);
692 }
693 if (*swap_map > 1) {
694 int set_start_mm = (*swap_map >= swcount);
695 struct list_head *p = &start_mm->mmlist;
696 struct mm_struct *new_start_mm = start_mm;
697 struct mm_struct *prev_mm = start_mm;
698 struct mm_struct *mm;
699
		/*
		 * new_start_mm and prev_mm both alias start_mm at this
		 * point; each takes its own mm_users reference, which is
		 * dropped by the mmput() calls further down.
		 */
700 atomic_inc(&new_start_mm->mm_users);
701 atomic_inc(&prev_mm->mm_users);
702 spin_lock(&mmlist_lock);
703 while (*swap_map > 1 && !retval &&
704 (p = p->next) != &start_mm->mmlist) {
705 mm = list_entry(p, struct mm_struct, mmlist);
			/*
			 * mm_users was 0 before the increment: undo it
			 * and skip this mm.
			 */
706 if (atomic_inc_return(&mm->mm_users) == 1) {
707 atomic_dec(&mm->mm_users);
708 continue;
709 }
			/*
			 * mmlist_lock is released for the work below and
			 * retaken at the bottom of the loop body; the
			 * reference just taken on mm keeps our list
			 * position (prev_mm) pinned meanwhile.
			 */
710 spin_unlock(&mmlist_lock);
711 mmput(prev_mm);
712 prev_mm = mm;
713
714 cond_resched();
715
716 swcount = *swap_map;
717 if (swcount <= 1)
718 ;
719 else if (mm == &init_mm) {
720 set_start_mm = 1;
721 shmem = shmem_unuse(entry, page);
722 } else
723 retval = unuse_mm(mm, entry, page);
			/*
			 * The count dropped while this mm was handled:
			 * remember it as the next start_mm.
			 */
724 if (set_start_mm && *swap_map < swcount) {
725 mmput(new_start_mm);
726 atomic_inc(&mm->mm_users);
727 new_start_mm = mm;
728 set_start_mm = 0;
729 }
730 spin_lock(&mmlist_lock);
731 }
732 spin_unlock(&mmlist_lock);
733 mmput(prev_mm);
734 mmput(start_mm);
735 start_mm = new_start_mm;
736 }
737 if (retval) {
738 unlock_page(page);
739 page_cache_release(page);
740 break;
741 }
742
743 /*
744 * How could swap count reach 0x7fff when the maximum
745 * pid is 0x7fff, and there's no way to repeat a swap
746 * page within an mm (except in shmem, where it's the
747 * shared object which takes the reference count)?
748 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
749 *
750 * If that's wrong, then we should worry more about
751 * exit_mmap() and do_munmap() cases described above:
752 * we might be resetting SWAP_MAP_MAX too early here.
753 * We know "Undead"s can happen, they're okay, so don't
754 * report them; but do report if we reset SWAP_MAP_MAX.
755 */
756 if (*swap_map == SWAP_MAP_MAX) {
757 swap_device_lock(si);
758 *swap_map = 1;
759 swap_device_unlock(si);
760 reset_overflow = 1;
761 }
762
763 /*
764 * If a reference remains (rare), we would like to leave
765 * the page in the swap cache; but try_to_unmap could
766 * then re-duplicate the entry once we drop page lock,
767 * so we might loop indefinitely; also, that page could
768 * not be swapped out to other storage meanwhile. So:
769 * delete from cache even if there's another reference,
770 * after ensuring that the data has been saved to disk -
771 * since if the reference remains (rarer), it will be
772 * read from disk into another page. Splitting into two
773 * pages would be incorrect if swap supported "shared
774 * private" pages, but they are handled by tmpfs files.
775 *
776 * Note shmem_unuse already deleted a swappage from
777 * the swap cache, unless the move to filepage failed:
778 * in which case it left swappage in cache, lowered its
779 * swap count to pass quickly through the loops above,
780 * and now we must reincrement count to try again later.
781 */
782 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
783 struct writeback_control wbc = {
784 .sync_mode = WB_SYNC_NONE,
785 };
786
787 swap_writepage(page, &wbc);
		/*
		 * NOTE(review): presumably swap_writepage() returns with
		 * the page unlocked, hence the relock - confirm.
		 */
788 lock_page(page);
789 wait_on_page_writeback(page);
790 }
791 if (PageSwapCache(page)) {
792 if (shmem)
793 swap_duplicate(entry);
794 else
795 delete_from_swap_cache(page);
796 }
797
798 /*
799 * So we could skip searching mms once swap count went
800 * to 1, we did not mark any present ptes as dirty: must
801 * mark page dirty so shrink_list will preserve it.
802 */
803 SetPageDirty(page);
804 unlock_page(page);
805 page_cache_release(page);
806
807 /*
808 * Make sure that we aren't completely killing
809 * interactive performance.
810 */
811 cond_resched();
812 }
813
	/* Drop the hold on whichever mm ended up as start_mm. */
814 mmput(start_mm);
815 if (reset_overflow) {
816 printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
817 swap_overflow = 0;
818 }
819 return retval;
820}
821
822/*
823 * After a successful try_to_unuse, if no swap is now in use, we know we
824 * can empty the mmlist. swap_list_lock must be held on entry and exit.
825 * Note that mmlist_lock nests inside swap_list_lock, and an mm must be
826 * added to the mmlist just after page_duplicate - before would be racy.
827 */
828static void drain_mmlist(void)
829{
830 struct list_head *p, *next;
831 unsigned int i;
832
833 for (i = 0; i < nr_swapfiles; i++)
834 if (swap_info[i].inuse_pages)
835 return;
836 spin_lock(&mmlist_lock);
837 list_for_each_safe(p, next, &init_mm.mmlist)
838 list_del_init(p);
839 spin_unlock(&mmlist_lock);
840}
841
842/*
843 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
844 * corresponds to page offset `offset'.
845 */
846sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
847{
848 struct swap_extent *se = sis->curr_swap_extent;
849 struct swap_extent *start_se = se;
850
851 for ( ; ; ) {
852 struct list_head *lh;
853
854 if (se->start_page <= offset &&
855 offset < (se->start_page + se->nr_pages)) {
856 return se->start_block + (offset - se->start_page);
857 }
858 lh = se->list.prev;
859 if (lh == &sis->extent_list)
860 lh = lh->prev;
861 se = list_entry(lh, struct swap_extent, list);
862 sis->curr_swap_extent = se;
863 BUG_ON(se == start_se); /* It *must* be present */
864 }
865}
866
867/*
868 * Free all of a swapdev's extent information
869 */
870static void destroy_swap_extents(struct swap_info_struct *sis)
871{
872 while (!list_empty(&sis->extent_list)) {
873 struct swap_extent *se;
874
875 se = list_entry(sis->extent_list.next,
876 struct swap_extent, list);
877 list_del(&se->list);
878 kfree(se);
879 }
880 sis->nr_extents = 0;
881}
882
883/*
884 * Add a block range (and the corresponding page range) into this swapdev's
885 * extent list. The extent list is kept sorted in block order.
886 *
887 * This function rather assumes that it is called in ascending sector_t order.
888 * It doesn't look for extent coalescing opportunities.
889 */
890static int
891add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
892 unsigned long nr_pages, sector_t start_block)
893{
894 struct swap_extent *se;
895 struct swap_extent *new_se;
896 struct list_head *lh;
897
898 lh = sis->extent_list.next; /* The highest-addressed block */
899 while (lh != &sis->extent_list) {
900 se = list_entry(lh, struct swap_extent, list);
901 if (se->start_block + se->nr_pages == start_block &&
902 se->start_page + se->nr_pages == start_page) {
903 /* Merge it */
904 se->nr_pages += nr_pages;
905 return 0;
906 }
907 lh = lh->next;
908 }
909
910 /*
911 * No merge. Insert a new extent, preserving ordering.
912 */
913 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
914 if (new_se == NULL)
915 return -ENOMEM;
916 new_se->start_page = start_page;
917 new_se->nr_pages = nr_pages;
918 new_se->start_block = start_block;
919
920 lh = sis->extent_list.prev; /* The lowest block */
921 while (lh != &sis->extent_list) {
922 se = list_entry(lh, struct swap_extent, list);
923 if (se->start_block > start_block)
924 break;
925 lh = lh->prev;
926 }
927 list_add_tail(&new_se->list, lh);
928 sis->nr_extents++;
929 return 0;
930}
931
932/*
933 * A `swap extent' is a simple thing which maps a contiguous range of pages
934 * onto a contiguous range of disk blocks. An ordered list of swap extents
935 * is built at swapon time and is then used at swap_writepage/swap_readpage
936 * time for locating where on disk a page belongs.
937 *
938 * If the swapfile is an S_ISBLK block device, a single extent is installed.
939 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
940 * swap files identically.
941 *
942 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
943 * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
944 * swapfiles are handled *identically* after swapon time.
945 *
946 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
947 * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
948 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
949 * requirements, they are simply tossed out - we will never use those blocks
950 * for swapping.
951 *
952 * For S_ISREG swapfiles we hold i_sem across the life of the swapon. This
953 * prevents root from shooting her foot off by ftruncating an in-use swapfile,
954 * which will scribble on the fs.
955 *
956 * The amount of disk space which a single swap extent represents varies.
957 * Typically it is in the 1-4 megabyte range. So we can have hundreds of
958 * extents in the list. To avoid much list walking, we cache the previous
959 * search location in `curr_swap_extent', and start new searches from there.
960 * This is extremely effective. The average number of iterations in
961 * map_swap_page() has been measured at about 0.3 per page. - akpm.
962 */
963static int setup_swap_extents(struct swap_info_struct *sis)
964{
965 struct inode *inode;
966 unsigned blocks_per_page;
967 unsigned long page_no;
968 unsigned blkbits;
969 sector_t probe_block;
970 sector_t last_block;
971 int ret;
972
973 inode = sis->swap_file->f_mapping->host;
974 if (S_ISBLK(inode->i_mode)) {
975 ret = add_swap_extent(sis, 0, sis->max, 0);
976 goto done;
977 }
978
979 blkbits = inode->i_blkbits;
980 blocks_per_page = PAGE_SIZE >> blkbits;
981
982 /*
983 * Map all the blocks into the extent list. This code doesn't try
984 * to be very smart.
985 */
986 probe_block = 0;
987 page_no = 0;
988 last_block = i_size_read(inode) >> blkbits;
989 while ((probe_block + blocks_per_page) <= last_block &&
990 page_no < sis->max) {
991 unsigned block_in_page;
992 sector_t first_block;
993
994 first_block = bmap(inode, probe_block);
995 if (first_block == 0)
996 goto bad_bmap;
997
998 /*
999 * It must be PAGE_SIZE aligned on-disk
1000 */
1001 if (first_block & (blocks_per_page - 1)) {
1002 probe_block++;
1003 goto reprobe;
1004 }
1005
1006 for (block_in_page = 1; block_in_page < blocks_per_page;
1007 block_in_page++) {
1008 sector_t block;
1009
1010 block = bmap(inode, probe_block + block_in_page);
1011 if (block == 0)
1012 goto bad_bmap;
1013 if (block != first_block + block_in_page) {
1014 /* Discontiguity */
1015 probe_block++;
1016 goto reprobe;
1017 }
1018 }
1019
1020 /*
1021 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
1022 */
1023 ret = add_swap_extent(sis, page_no, 1,
1024 first_block >> (PAGE_SHIFT - blkbits));
1025 if (ret)
1026 goto out;
1027 page_no++;
1028 probe_block += blocks_per_page;
1029reprobe:
1030 continue;
1031 }
1032 ret = 0;
1033 if (page_no == 0)
1034 ret = -EINVAL;
1035 sis->max = page_no;
1036 sis->highest_bit = page_no - 1;
1037done:
1038 sis->curr_swap_extent = list_entry(sis->extent_list.prev,
1039 struct swap_extent, list);
1040 goto out;
1041bad_bmap:
1042 printk(KERN_ERR "swapon: swapfile has holes\n");
1043 ret = -EINVAL;
1044out:
1045 return ret;
1046}
1047
1048#if 0 /* We don't need this yet */
1049#include <linux/backing-dev.h>
1050int page_queue_congested(struct page *page)
1051{
1052 struct backing_dev_info *bdi;
1053
1054 BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
1055
1056 if (PageSwapCache(page)) {
1057 swp_entry_t entry = { .val = page->private };
1058 struct swap_info_struct *sis;
1059
1060 sis = get_swap_info_struct(swp_type(entry));
1061 bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
1062 } else
1063 bdi = page->mapping->backing_dev_info;
1064 return bdi_write_congested(bdi);
1065}
1066#endif
1067
/*
 * sys_swapoff - the swapoff system call: stop swapping to specialfile.
 *
 * The path is opened only to find the active swap area whose swap_file
 * shares its mapping.  That area is unlinked from the swap list, its
 * pages are taken out of nr_swap_pages/total_swap_pages, SWP_WRITEOK is
 * cleared, and try_to_unuse() is run on it.  On success the area's
 * swap_map and extents are freed and its file is closed; on failure the
 * area is put back on the swap list.
 *
 * Returns 0 on success; -EPERM without CAP_SYS_ADMIN; the error from
 * getname() or filp_open(); -EINVAL when no active swap area uses the
 * file; -ENOMEM when security_vm_enough_memory(p->pages) fails; or the
 * error from try_to_unuse().
 */
1068asmlinkage long sys_swapoff(const char __user * specialfile)
1069{
1070 struct swap_info_struct * p = NULL;
1071 unsigned short *swap_map;
1072 struct file *swap_file, *victim;
1073 struct address_space *mapping;
1074 struct inode *inode;
1075 char * pathname;
1076 int i, type, prev;
1077 int err;
1078
1079 if (!capable(CAP_SYS_ADMIN))
1080 return -EPERM;
1081
1082 pathname = getname(specialfile);
1083 err = PTR_ERR(pathname);
1084 if (IS_ERR(pathname))
1085 goto out;
1086
1087 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
1088 putname(pathname);
1089 err = PTR_ERR(victim);
1090 if (IS_ERR(victim))
1091 goto out;
1092
1093 mapping = victim->f_mapping;
1094 prev = -1;
1095 swap_list_lock();
	/*
	 * Walk the swap list for an active area backed by the same
	 * mapping, tracking its predecessor in prev for the unlink below.
	 */
1096 for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
1097 p = swap_info + type;
1098 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
1099 if (p->swap_file->f_mapping == mapping)
1100 break;
1101 }
1102 prev = type;
1103 }
1104 if (type < 0) {
1105 err = -EINVAL;
1106 swap_list_unlock();
1107 goto out_dput;
1108 }
1109 if (!security_vm_enough_memory(p->pages))
1110 vm_unacct_memory(p->pages);
1111 else {
1112 err = -ENOMEM;
1113 swap_list_unlock();
1114 goto out_dput;
1115 }
	/* Unlink the area from the swap list. */
1116 if (prev < 0) {
1117 swap_list.head = p->next;
1118 } else {
1119 swap_info[prev].next = p->next;
1120 }
1121 if (type == swap_list.next) {
1122 /* just pick something that's safe... */
1123 swap_list.next = swap_list.head;
1124 }
1125 nr_swap_pages -= p->pages;
1126 total_swap_pages -= p->pages;
	/*
	 * Per the comments in find_next_to_unuse() and try_to_unuse(), the
	 * scan relies on no new allocations coming from this area from
	 * here on.
	 */
1127 p->flags &= ~SWP_WRITEOK;
1128 swap_list_unlock();
1129 current->flags |= PF_SWAPOFF;
1130 err = try_to_unuse(type);
1131 current->flags &= ~PF_SWAPOFF;
1132
1133 /* wait for any unplug function to finish */
1134 down_write(&swap_unplug_sem);
1135 up_write(&swap_unplug_sem);
1136
1137 if (err) {
1138 /* re-insert swap space back into swap_list */
1139 swap_list_lock();
	/* Find the first area whose priority is not above p's. */
1140 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
1141 if (p->prio >= swap_info[i].prio)
1142 break;
1143 p->next = i;
1144 if (prev < 0)
1145 swap_list.head = swap_list.next = p - swap_info;
1146 else
1147 swap_info[prev].next = p - swap_info;
1148 nr_swap_pages += p->pages;
1149 total_swap_pages += p->pages;
1150 p->flags |= SWP_WRITEOK;
1151 swap_list_unlock();
1152 goto out_dput;
1153 }
	/*
	 * Success: tear the area down.  swapon_sem is the semaphore the
	 * /proc/swaps iterator (swap_start/swap_stop) holds while listing.
	 */
1154 down(&swapon_sem);
1155 swap_list_lock();
1156 drain_mmlist();
1157 swap_device_lock(p);
1158 swap_file = p->swap_file;
1159 p->swap_file = NULL;
1160 p->max = 0;
1161 swap_map = p->swap_map;
1162 p->swap_map = NULL;
1163 p->flags = 0;
1164 destroy_swap_extents(p);
1165 swap_device_unlock(p);
1166 swap_list_unlock();
1167 up(&swapon_sem);
	/* The map was detached under the locks; free it outside them. */
1168 vfree(swap_map);
1169 inode = mapping->host;
1170 if (S_ISBLK(inode->i_mode)) {
1171 struct block_device *bdev = I_BDEV(inode);
1172 set_blocksize(bdev, p->old_block_size);
1173 bd_release(bdev);
1174 } else {
1175 down(&inode->i_sem);
1176 inode->i_flags &= ~S_SWAPFILE;
1177 up(&inode->i_sem);
1178 }
1179 filp_close(swap_file, NULL);
1180 err = 0;
1181
1182out_dput:
	/* victim was opened only to identify the mapping. */
1183 filp_close(victim, NULL);
1184out:
1185 return err;
1186}
1187
1188#ifdef CONFIG_PROC_FS
1189/* iterator */
1190static void *swap_start(struct seq_file *swap, loff_t *pos)
1191{
1192 struct swap_info_struct *ptr = swap_info;
1193 int i;
1194 loff_t l = *pos;
1195
1196 down(&swapon_sem);
1197
1198 for (i = 0; i < nr_swapfiles; i++, ptr++) {
1199 if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
1200 continue;
1201 if (!l--)
1202 return ptr;
1203 }
1204
1205 return NULL;
1206}
1207
1208static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1209{
1210 struct swap_info_struct *ptr = v;
1211 struct swap_info_struct *endptr = swap_info + nr_swapfiles;
1212
1213 for (++ptr; ptr < endptr; ptr++) {
1214 if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
1215 continue;
1216 ++*pos;
1217 return ptr;
1218 }
1219
1220 return NULL;
1221}
1222
1223static void swap_stop(struct seq_file *swap, void *v)
1224{
1225 up(&swapon_sem);
1226}
1227
1228static int swap_show(struct seq_file *swap, void *v)
1229{
1230 struct swap_info_struct *ptr = v;
1231 struct file *file;
1232 int len;
1233
1234 if (v == swap_info)
1235 seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1236
1237 file = ptr->swap_file;
1238 len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
1239 seq_printf(swap, "%*s%s\t%d\t%ld\t%d\n",
1240 len < 40 ? 40 - len : 1, " ",
1241 S_ISBLK(file->f_dentry->d_inode->i_mode) ?
1242 "partition" : "file\t",
1243 ptr->pages << (PAGE_SHIFT - 10),
1244 ptr->inuse_pages << (PAGE_SHIFT - 10),
1245 ptr->prio);
1246 return 0;
1247}
1248
1249static struct seq_operations swaps_op = {
1250 .start = swap_start,
1251 .next = swap_next,
1252 .stop = swap_stop,
1253 .show = swap_show
1254};
1255
1256static int swaps_open(struct inode *inode, struct file *file)
1257{
1258 return seq_open(file, &swaps_op);
1259}
1260
1261static struct file_operations proc_swaps_operations = {
1262 .open = swaps_open,
1263 .read = seq_read,
1264 .llseek = seq_lseek,
1265 .release = seq_release,
1266};
1267
1268static int __init procswaps_init(void)
1269{
1270 struct proc_dir_entry *entry;
1271
1272 entry = create_proc_entry("swaps", 0, NULL);
1273 if (entry)
1274 entry->proc_fops = &proc_swaps_operations;
1275 return 0;
1276}
1277__initcall(procswaps_init);
1278#endif /* CONFIG_PROC_FS */
1279
1280/*
1281 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1282 *
1283 * The swapon system call
1284 */
1285asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1286{
1287 struct swap_info_struct * p;
1288 char *name = NULL;
1289 struct block_device *bdev = NULL;
1290 struct file *swap_file = NULL;
1291 struct address_space *mapping;
1292 unsigned int type;
1293 int i, prev;
1294 int error;
1295 static int least_priority;
1296 union swap_header *swap_header = NULL;
1297 int swap_header_version;
1298 int nr_good_pages = 0;
1299 unsigned long maxpages = 1;
1300 int swapfilesize;
1301 unsigned short *swap_map;
1302 struct page *page = NULL;
1303 struct inode *inode = NULL;
1304 int did_down = 0;
1305
1306 if (!capable(CAP_SYS_ADMIN))
1307 return -EPERM;
1308 swap_list_lock();
1309 p = swap_info;
1310 for (type = 0 ; type < nr_swapfiles ; type++,p++)
1311 if (!(p->flags & SWP_USED))
1312 break;
1313 error = -EPERM;
1314 /*
1315 * Test if adding another swap device is possible. There are
1316 * two limiting factors: 1) the number of bits for the swap
1317 * type swp_entry_t definition and 2) the number of bits for
1318 * the swap type in the swap ptes as defined by the different
1319 * architectures. To honor both limitations a swap entry
1320 * with swap offset 0 and swap type ~0UL is created, encoded
1321 * to a swap pte, decoded to a swp_entry_t again and finally
1322 * the swap type part is extracted. This will mask all bits
1323 * from the initial ~0UL that can't be encoded in either the
1324 * swp_entry_t or the architecture definition of a swap pte.
1325 */
1326 if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
1327 swap_list_unlock();
1328 goto out;
1329 }
1330 if (type >= nr_swapfiles)
1331 nr_swapfiles = type+1;
1332 INIT_LIST_HEAD(&p->extent_list);
1333 p->flags = SWP_USED;
1334 p->nr_extents = 0;
1335 p->swap_file = NULL;
1336 p->old_block_size = 0;
1337 p->swap_map = NULL;
1338 p->lowest_bit = 0;
1339 p->highest_bit = 0;
1340 p->cluster_nr = 0;
1341 p->inuse_pages = 0;
1342 spin_lock_init(&p->sdev_lock);
1343 p->next = -1;
1344 if (swap_flags & SWAP_FLAG_PREFER) {
1345 p->prio =
1346 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
1347 } else {
1348 p->prio = --least_priority;
1349 }
1350 swap_list_unlock();
1351 name = getname(specialfile);
1352 error = PTR_ERR(name);
1353 if (IS_ERR(name)) {
1354 name = NULL;
1355 goto bad_swap_2;
1356 }
1357 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
1358 error = PTR_ERR(swap_file);
1359 if (IS_ERR(swap_file)) {
1360 swap_file = NULL;
1361 goto bad_swap_2;
1362 }
1363
1364 p->swap_file = swap_file;
1365 mapping = swap_file->f_mapping;
1366 inode = mapping->host;
1367
1368 error = -EBUSY;
1369 for (i = 0; i < nr_swapfiles; i++) {
1370 struct swap_info_struct *q = &swap_info[i];
1371
1372 if (i == type || !q->swap_file)
1373 continue;
1374 if (mapping == q->swap_file->f_mapping)
1375 goto bad_swap;
1376 }
1377
1378 error = -EINVAL;
1379 if (S_ISBLK(inode->i_mode)) {
1380 bdev = I_BDEV(inode);
1381 error = bd_claim(bdev, sys_swapon);
1382 if (error < 0) {
1383 bdev = NULL;
1384 goto bad_swap;
1385 }
1386 p->old_block_size = block_size(bdev);
1387 error = set_blocksize(bdev, PAGE_SIZE);
1388 if (error < 0)
1389 goto bad_swap;
1390 p->bdev = bdev;
1391 } else if (S_ISREG(inode->i_mode)) {
1392 p->bdev = inode->i_sb->s_bdev;
1393 down(&inode->i_sem);
1394 did_down = 1;
1395 if (IS_SWAPFILE(inode)) {
1396 error = -EBUSY;
1397 goto bad_swap;
1398 }
1399 } else {
1400 goto bad_swap;
1401 }
1402
1403 swapfilesize = i_size_read(inode) >> PAGE_SHIFT;
1404
1405 /*
1406 * Read the swap header.
1407 */
1408 if (!mapping->a_ops->readpage) {
1409 error = -EINVAL;
1410 goto bad_swap;
1411 }
1412 page = read_cache_page(mapping, 0,
1413 (filler_t *)mapping->a_ops->readpage, swap_file);
1414 if (IS_ERR(page)) {
1415 error = PTR_ERR(page);
1416 goto bad_swap;
1417 }
1418 wait_on_page_locked(page);
1419 if (!PageUptodate(page))
1420 goto bad_swap;
1421 kmap(page);
1422 swap_header = page_address(page);
1423
1424 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
1425 swap_header_version = 1;
1426 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
1427 swap_header_version = 2;
1428 else {
1429 printk("Unable to find swap-space signature\n");
1430 error = -EINVAL;
1431 goto bad_swap;
1432 }
1433
1434 switch (swap_header_version) {
1435 case 1:
1436 printk(KERN_ERR "version 0 swap is no longer supported. "
1437 "Use mkswap -v1 %s\n", name);
1438 error = -EINVAL;
1439 goto bad_swap;
1440 case 2:
1441 /* Check the swap header's sub-version and the size of
1442 the swap file and bad block lists */
1443 if (swap_header->info.version != 1) {
1444 printk(KERN_WARNING
1445 "Unable to handle swap header version %d\n",
1446 swap_header->info.version);
1447 error = -EINVAL;
1448 goto bad_swap;
1449 }
1450
1451 p->lowest_bit = 1;
1452 /*
1453 * Find out how many pages are allowed for a single swap
1454 * device. There are two limiting factors: 1) the number of
1455 * bits for the swap offset in the swp_entry_t type and
1456 * 2) the number of bits in the a swap pte as defined by
1457 * the different architectures. In order to find the
1458 * largest possible bit mask a swap entry with swap type 0
1459 * and swap offset ~0UL is created, encoded to a swap pte,
1460 * decoded to a swp_entry_t again and finally the swap
1461 * offset is extracted. This will mask all the bits from
1462 * the initial ~0UL mask that can't be encoded in either
1463 * the swp_entry_t or the architecture definition of a
1464 * swap pte.
1465 */
1466 maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1;
1467 if (maxpages > swap_header->info.last_page)
1468 maxpages = swap_header->info.last_page;
1469 p->highest_bit = maxpages - 1;
1470
1471 error = -EINVAL;
1472 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1473 goto bad_swap;
1474
1475 /* OK, set up the swap map and apply the bad block list */
1476 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
1477 error = -ENOMEM;
1478 goto bad_swap;
1479 }
1480
1481 error = 0;
1482 memset(p->swap_map, 0, maxpages * sizeof(short));
1483 for (i=0; i<swap_header->info.nr_badpages; i++) {
1484 int page = swap_header->info.badpages[i];
1485 if (page <= 0 || page >= swap_header->info.last_page)
1486 error = -EINVAL;
1487 else
1488 p->swap_map[page] = SWAP_MAP_BAD;
1489 }
1490 nr_good_pages = swap_header->info.last_page -
1491 swap_header->info.nr_badpages -
1492 1 /* header page */;
1493 if (error)
1494 goto bad_swap;
1495 }
1496
1497 if (swapfilesize && maxpages > swapfilesize) {
1498 printk(KERN_WARNING
1499 "Swap area shorter than signature indicates\n");
1500 error = -EINVAL;
1501 goto bad_swap;
1502 }
1503 if (!nr_good_pages) {
1504 printk(KERN_WARNING "Empty swap-file\n");
1505 error = -EINVAL;
1506 goto bad_swap;
1507 }
1508 p->swap_map[0] = SWAP_MAP_BAD;
1509 p->max = maxpages;
1510 p->pages = nr_good_pages;
1511
1512 error = setup_swap_extents(p);
1513 if (error)
1514 goto bad_swap;
1515
1516 down(&swapon_sem);
1517 swap_list_lock();
1518 swap_device_lock(p);
1519 p->flags = SWP_ACTIVE;
1520 nr_swap_pages += nr_good_pages;
1521 total_swap_pages += nr_good_pages;
1522 printk(KERN_INFO "Adding %dk swap on %s. Priority:%d extents:%d\n",
1523 nr_good_pages<<(PAGE_SHIFT-10), name,
1524 p->prio, p->nr_extents);
1525
1526 /* insert swap space into swap_list: */
1527 prev = -1;
1528 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1529 if (p->prio >= swap_info[i].prio) {
1530 break;
1531 }
1532 prev = i;
1533 }
1534 p->next = i;
1535 if (prev < 0) {
1536 swap_list.head = swap_list.next = p - swap_info;
1537 } else {
1538 swap_info[prev].next = p - swap_info;
1539 }
1540 swap_device_unlock(p);
1541 swap_list_unlock();
1542 up(&swapon_sem);
1543 error = 0;
1544 goto out;
1545bad_swap:
1546 if (bdev) {
1547 set_blocksize(bdev, p->old_block_size);
1548 bd_release(bdev);
1549 }
1550bad_swap_2:
1551 swap_list_lock();
1552 swap_map = p->swap_map;
1553 p->swap_file = NULL;
1554 p->swap_map = NULL;
1555 p->flags = 0;
1556 if (!(swap_flags & SWAP_FLAG_PREFER))
1557 ++least_priority;
1558 swap_list_unlock();
1559 destroy_swap_extents(p);
1560 vfree(swap_map);
1561 if (swap_file)
1562 filp_close(swap_file, NULL);
1563out:
1564 if (page && !IS_ERR(page)) {
1565 kunmap(page);
1566 page_cache_release(page);
1567 }
1568 if (name)
1569 putname(name);
1570 if (did_down) {
1571 if (!error)
1572 inode->i_flags |= S_SWAPFILE;
1573 up(&inode->i_sem);
1574 }
1575 return error;
1576}
1577
1578void si_swapinfo(struct sysinfo *val)
1579{
1580 unsigned int i;
1581 unsigned long nr_to_be_unused = 0;
1582
1583 swap_list_lock();
1584 for (i = 0; i < nr_swapfiles; i++) {
1585 if (!(swap_info[i].flags & SWP_USED) ||
1586 (swap_info[i].flags & SWP_WRITEOK))
1587 continue;
1588 nr_to_be_unused += swap_info[i].inuse_pages;
1589 }
1590 val->freeswap = nr_swap_pages + nr_to_be_unused;
1591 val->totalswap = total_swap_pages + nr_to_be_unused;
1592 swap_list_unlock();
1593}
1594
1595/*
1596 * Verify that a swap entry is valid and increment its swap map count.
1597 *
1598 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
1599 * "permanent", but will be reclaimed by the next swapoff.
1600 */
1601int swap_duplicate(swp_entry_t entry)
1602{
1603 struct swap_info_struct * p;
1604 unsigned long offset, type;
1605 int result = 0;
1606
1607 type = swp_type(entry);
1608 if (type >= nr_swapfiles)
1609 goto bad_file;
1610 p = type + swap_info;
1611 offset = swp_offset(entry);
1612
1613 swap_device_lock(p);
1614 if (offset < p->max && p->swap_map[offset]) {
1615 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
1616 p->swap_map[offset]++;
1617 result = 1;
1618 } else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
1619 if (swap_overflow++ < 5)
1620 printk(KERN_WARNING "swap_dup: swap entry overflow\n");
1621 p->swap_map[offset] = SWAP_MAP_MAX;
1622 result = 1;
1623 }
1624 }
1625 swap_device_unlock(p);
1626out:
1627 return result;
1628
1629bad_file:
1630 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
1631 goto out;
1632}
1633
1634struct swap_info_struct *
1635get_swap_info_struct(unsigned type)
1636{
1637 return &swap_info[type];
1638}
1639
1640/*
1641 * swap_device_lock prevents swap_map being freed. Don't grab an extra
1642 * reference on the swaphandle, it doesn't matter if it becomes unused.
1643 */
1644int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
1645{
1646 int ret = 0, i = 1 << page_cluster;
1647 unsigned long toff;
1648 struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
1649
1650 if (!page_cluster) /* no readahead */
1651 return 0;
1652 toff = (swp_offset(entry) >> page_cluster) << page_cluster;
1653 if (!toff) /* first page is swap header */
1654 toff++, i--;
1655 *offset = toff;
1656
1657 swap_device_lock(swapdev);
1658 do {
1659 /* Don't read-ahead past the end of the swap area */
1660 if (toff >= swapdev->max)
1661 break;
1662 /* Don't read in free or bad pages */
1663 if (!swapdev->swap_map[toff])
1664 break;
1665 if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
1666 break;
1667 toff++;
1668 ret++;
1669 } while (--i);
1670 swap_device_unlock(swapdev);
1671 return ret;
1672}
diff --git a/mm/thrash.c b/mm/thrash.c
new file mode 100644
index 00000000000..11461f7ad83
--- /dev/null
+++ b/mm/thrash.c
@@ -0,0 +1,102 @@
1/*
2 * mm/thrash.c
3 *
4 * Copyright (C) 2004, Red Hat, Inc.
5 * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
6 * Released under the GPL, see the file COPYING for details.
7 *
8 * Simple token based thrashing protection, using the algorithm
9 * described in: http://www.cs.wm.edu/~sjiang/token.pdf
10 */
11#include <linux/jiffies.h>
12#include <linux/mm.h>
13#include <linux/sched.h>
14#include <linux/swap.h>
15
16static DEFINE_SPINLOCK(swap_token_lock);
17static unsigned long swap_token_timeout;
18static unsigned long swap_token_check;
19struct mm_struct * swap_token_mm = &init_mm;
20
21#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
22#define SWAP_TOKEN_TIMEOUT 0
23/*
24 * Currently disabled; Needs further code to work at HZ * 300.
25 */
26unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT;
27
28/*
29 * Take the token away if the process had no page faults
30 * in the last interval, or if it has held the token for
31 * too long.
32 */
33#define SWAP_TOKEN_ENOUGH_RSS 1
34#define SWAP_TOKEN_TIMED_OUT 2
35static int should_release_swap_token(struct mm_struct *mm)
36{
37 int ret = 0;
38 if (!mm->recent_pagein)
39 ret = SWAP_TOKEN_ENOUGH_RSS;
40 else if (time_after(jiffies, swap_token_timeout))
41 ret = SWAP_TOKEN_TIMED_OUT;
42 mm->recent_pagein = 0;
43 return ret;
44}
45
46/*
47 * Try to grab the swapout protection token. We only try to
48 * grab it once every TOKEN_CHECK_INTERVAL, both to prevent
49 * SMP lock contention and to check that the process that held
50 * the token before is no longer thrashing.
51 */
52void grab_swap_token(void)
53{
54 struct mm_struct *mm;
55 int reason;
56
57 /* We have the token. Let others know we still need it. */
58 if (has_swap_token(current->mm)) {
59 current->mm->recent_pagein = 1;
60 return;
61 }
62
63 if (time_after(jiffies, swap_token_check)) {
64
65 /* Can't get swapout protection if we exceed our RSS limit. */
66 // if (current->mm->rss > current->mm->rlimit_rss)
67 // return;
68
69 /* ... or if we recently held the token. */
70 if (time_before(jiffies, current->mm->swap_token_time))
71 return;
72
73 if (!spin_trylock(&swap_token_lock))
74 return;
75
76 swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
77
78 mm = swap_token_mm;
79 if ((reason = should_release_swap_token(mm))) {
80 unsigned long eligible = jiffies;
81 if (reason == SWAP_TOKEN_TIMED_OUT) {
82 eligible += swap_token_default_timeout;
83 }
84 mm->swap_token_time = eligible;
85 swap_token_timeout = jiffies + swap_token_default_timeout;
86 swap_token_mm = current->mm;
87 }
88 spin_unlock(&swap_token_lock);
89 }
90 return;
91}
92
93/* Called on process exit. */
94void __put_swap_token(struct mm_struct *mm)
95{
96 spin_lock(&swap_token_lock);
97 if (likely(mm == swap_token_mm)) {
98 swap_token_mm = &init_mm;
99 swap_token_check = jiffies;
100 }
101 spin_unlock(&swap_token_lock);
102}
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
new file mode 100644
index 00000000000..c13a2161bca
--- /dev/null
+++ b/mm/tiny-shmem.c
@@ -0,0 +1,122 @@
1/*
2 * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code
3 *
4 * Matt Mackall <mpm@selenic.com> January, 2004
5 * derived from mm/shmem.c and fs/ramfs/inode.c
6 *
7 * This is intended for small system where the benefits of the full
8 * shmem code (swap-backed and resource-limited) are outweighed by
9 * their complexity. On systems without swap this code should be
10 * effectively equivalent, but much lighter weight.
11 */
12
13#include <linux/fs.h>
14#include <linux/init.h>
15#include <linux/devfs_fs_kernel.h>
16#include <linux/vfs.h>
17#include <linux/mount.h>
18#include <linux/file.h>
19#include <linux/mm.h>
20#include <linux/module.h>
21#include <linux/swap.h>
22#include <linux/ramfs.h>
23
24static struct file_system_type tmpfs_fs_type = {
25 .name = "tmpfs",
26 .get_sb = ramfs_get_sb,
27 .kill_sb = kill_litter_super,
28};
29
30static struct vfsmount *shm_mnt;
31
32static int __init init_tmpfs(void)
33{
34 register_filesystem(&tmpfs_fs_type);
35#ifdef CONFIG_TMPFS
36 devfs_mk_dir("shm");
37#endif
38 shm_mnt = kern_mount(&tmpfs_fs_type);
39 return 0;
40}
41module_init(init_tmpfs)
42
43/*
44 * shmem_file_setup - get an unlinked file living in tmpfs
45 *
46 * @name: name for dentry (to be seen in /proc/<pid>/maps)
47 * @size: size to be set for the file
48 *
49 */
50struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
51{
52 int error;
53 struct file *file;
54 struct inode *inode;
55 struct dentry *dentry, *root;
56 struct qstr this;
57
58 if (IS_ERR(shm_mnt))
59 return (void *)shm_mnt;
60
61 error = -ENOMEM;
62 this.name = name;
63 this.len = strlen(name);
64 this.hash = 0; /* will go */
65 root = shm_mnt->mnt_root;
66 dentry = d_alloc(root, &this);
67 if (!dentry)
68 goto put_memory;
69
70 error = -ENFILE;
71 file = get_empty_filp();
72 if (!file)
73 goto put_dentry;
74
75 error = -ENOSPC;
76 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
77 if (!inode)
78 goto close_file;
79
80 d_instantiate(dentry, inode);
81 inode->i_size = size;
82 inode->i_nlink = 0; /* It is unlinked */
83 file->f_vfsmnt = mntget(shm_mnt);
84 file->f_dentry = dentry;
85 file->f_mapping = inode->i_mapping;
86 file->f_op = &ramfs_file_operations;
87 file->f_mode = FMODE_WRITE | FMODE_READ;
88 return file;
89
90close_file:
91 put_filp(file);
92put_dentry:
93 dput(dentry);
94put_memory:
95 return ERR_PTR(error);
96}
97
98/*
99 * shmem_zero_setup - setup a shared anonymous mapping
100 *
101 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
102 */
103int shmem_zero_setup(struct vm_area_struct *vma)
104{
105 struct file *file;
106 loff_t size = vma->vm_end - vma->vm_start;
107
108 file = shmem_file_setup("dev/zero", size, vma->vm_flags);
109 if (IS_ERR(file))
110 return PTR_ERR(file);
111
112 if (vma->vm_file)
113 fput(vma->vm_file);
114 vma->vm_file = file;
115 vma->vm_ops = &generic_file_vm_ops;
116 return 0;
117}
118
119int shmem_unuse(swp_entry_t entry, struct page *page)
120{
121 return 0;
122}
diff --git a/mm/truncate.c b/mm/truncate.c
new file mode 100644
index 00000000000..c9a63f0b69a
--- /dev/null
+++ b/mm/truncate.c
@@ -0,0 +1,336 @@
1/*
2 * mm/truncate.c - code for taking down pages from address_spaces
3 *
4 * Copyright (C) 2002, Linus Torvalds
5 *
6 * 10Sep2002 akpm@zip.com.au
7 * Initial version.
8 */
9
10#include <linux/kernel.h>
11#include <linux/mm.h>
12#include <linux/module.h>
13#include <linux/pagemap.h>
14#include <linux/pagevec.h>
15#include <linux/buffer_head.h> /* grr. try_to_release_page,
16 block_invalidatepage */
17
18
19static int do_invalidatepage(struct page *page, unsigned long offset)
20{
21 int (*invalidatepage)(struct page *, unsigned long);
22 invalidatepage = page->mapping->a_ops->invalidatepage;
23 if (invalidatepage == NULL)
24 invalidatepage = block_invalidatepage;
25 return (*invalidatepage)(page, offset);
26}
27
28static inline void truncate_partial_page(struct page *page, unsigned partial)
29{
30 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
31 if (PagePrivate(page))
32 do_invalidatepage(page, partial);
33}
34
35/*
36 * If truncate cannot remove the fs-private metadata from the page, the page
37 * becomes anonymous. It will be left on the LRU and may even be mapped into
38 * user pagetables if we're racing with filemap_nopage().
39 *
40 * We need to bale out if page->mapping is no longer equal to the original
41 * mapping. This happens a) when the VM reclaimed the page while we waited on
42 * its lock, b) when a concurrent invalidate_inode_pages got there first and
43 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
44 */
45static void
46truncate_complete_page(struct address_space *mapping, struct page *page)
47{
48 if (page->mapping != mapping)
49 return;
50
51 if (PagePrivate(page))
52 do_invalidatepage(page, 0);
53
54 clear_page_dirty(page);
55 ClearPageUptodate(page);
56 ClearPageMappedToDisk(page);
57 remove_from_page_cache(page);
58 page_cache_release(page); /* pagecache ref */
59}
60
61/*
62 * This is for invalidate_inode_pages(). That function can be called at
63 * any time, and is not supposed to throw away dirty pages. But pages can
64 * be marked dirty at any time too. So we re-check the dirtiness inside
65 * ->tree_lock. That provides exclusion against the __set_page_dirty
66 * functions.
67 *
68 * Returns non-zero if the page was successfully invalidated.
69 */
70static int
71invalidate_complete_page(struct address_space *mapping, struct page *page)
72{
73 if (page->mapping != mapping)
74 return 0;
75
76 if (PagePrivate(page) && !try_to_release_page(page, 0))
77 return 0;
78
79 write_lock_irq(&mapping->tree_lock);
80 if (PageDirty(page)) {
81 write_unlock_irq(&mapping->tree_lock);
82 return 0;
83 }
84
85 BUG_ON(PagePrivate(page));
86 __remove_from_page_cache(page);
87 write_unlock_irq(&mapping->tree_lock);
88 ClearPageUptodate(page);
89 page_cache_release(page); /* pagecache ref */
90 return 1;
91}
92
93/**
94 * truncate_inode_pages - truncate *all* the pages from an offset
95 * @mapping: mapping to truncate
96 * @lstart: offset from which to truncate
97 *
98 * Truncate the page cache at a set offset, removing the pages that are beyond
99 * that offset (and zeroing out partial pages).
100 *
101 * Truncate takes two passes - the first pass is nonblocking. It will not
102 * block on page locks and it will not block on writeback. The second pass
103 * will wait. This is to prevent as much IO as possible in the affected region.
104 * The first pass will remove most pages, so the search cost of the second pass
105 * is low.
106 *
107 * When looking at page->index outside the page lock we need to be careful to
108 * copy it into a local to avoid races (it could change at any time).
109 *
110 * We pass down the cache-hot hint to the page freeing code. Even if the
111 * mapping is large, it is probably the case that the final pages are the most
112 * recently touched, and freeing happens in ascending file offset order.
113 *
114 * Called under (and serialised by) inode->i_sem.
115 */
116void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
117{
118 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
119 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
120 struct pagevec pvec;
121 pgoff_t next;
122 int i;
123
124 if (mapping->nrpages == 0)
125 return;
126
127 pagevec_init(&pvec, 0);
128 next = start;
129 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
130 for (i = 0; i < pagevec_count(&pvec); i++) {
131 struct page *page = pvec.pages[i];
132 pgoff_t page_index = page->index;
133
134 if (page_index > next)
135 next = page_index;
136 next++;
137 if (TestSetPageLocked(page))
138 continue;
139 if (PageWriteback(page)) {
140 unlock_page(page);
141 continue;
142 }
143 truncate_complete_page(mapping, page);
144 unlock_page(page);
145 }
146 pagevec_release(&pvec);
147 cond_resched();
148 }
149
150 if (partial) {
151 struct page *page = find_lock_page(mapping, start - 1);
152 if (page) {
153 wait_on_page_writeback(page);
154 truncate_partial_page(page, partial);
155 unlock_page(page);
156 page_cache_release(page);
157 }
158 }
159
160 next = start;
161 for ( ; ; ) {
162 cond_resched();
163 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
164 if (next == start)
165 break;
166 next = start;
167 continue;
168 }
169 for (i = 0; i < pagevec_count(&pvec); i++) {
170 struct page *page = pvec.pages[i];
171
172 lock_page(page);
173 wait_on_page_writeback(page);
174 if (page->index > next)
175 next = page->index;
176 next++;
177 truncate_complete_page(mapping, page);
178 unlock_page(page);
179 }
180 pagevec_release(&pvec);
181 }
182}
183
184EXPORT_SYMBOL(truncate_inode_pages);
185
186/**
187 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
188 * @mapping: the address_space which holds the pages to invalidate
189 * @start: the offset 'from' which to invalidate
190 * @end: the offset 'to' which to invalidate (inclusive)
191 *
192 * This function only removes the unlocked pages, if you want to
193 * remove all the pages of one inode, you must call truncate_inode_pages.
194 *
195 * invalidate_mapping_pages() will not block on IO activity. It will not
196 * invalidate pages which are dirty, locked, under writeback or mapped into
197 * pagetables.
198 */
199unsigned long invalidate_mapping_pages(struct address_space *mapping,
200 pgoff_t start, pgoff_t end)
201{
202 struct pagevec pvec;
203 pgoff_t next = start;
204 unsigned long ret = 0;
205 int i;
206
207 pagevec_init(&pvec, 0);
208 while (next <= end &&
209 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
210 for (i = 0; i < pagevec_count(&pvec); i++) {
211 struct page *page = pvec.pages[i];
212
213 if (TestSetPageLocked(page)) {
214 next++;
215 continue;
216 }
217 if (page->index > next)
218 next = page->index;
219 next++;
220 if (PageDirty(page) || PageWriteback(page))
221 goto unlock;
222 if (page_mapped(page))
223 goto unlock;
224 ret += invalidate_complete_page(mapping, page);
225unlock:
226 unlock_page(page);
227 if (next > end)
228 break;
229 }
230 pagevec_release(&pvec);
231 cond_resched();
232 }
233 return ret;
234}
235
236unsigned long invalidate_inode_pages(struct address_space *mapping)
237{
238 return invalidate_mapping_pages(mapping, 0, ~0UL);
239}
240
241EXPORT_SYMBOL(invalidate_inode_pages);
242
243/**
244 * invalidate_inode_pages2_range - remove range of pages from an address_space
245 * @mapping: the address_space
246 * @start: the page offset 'from' which to invalidate
247 * @end: the page offset 'to' which to invalidate (inclusive)
248 *
249 * Any pages which are found to be mapped into pagetables are unmapped prior to
250 * invalidation.
251 *
252 * Returns -EIO if any pages could not be invalidated.
253 */
254int invalidate_inode_pages2_range(struct address_space *mapping,
255 pgoff_t start, pgoff_t end)
256{
257 struct pagevec pvec;
258 pgoff_t next;
259 int i;
260 int ret = 0;
261 int did_range_unmap = 0;
262 int wrapped = 0;
263
264 pagevec_init(&pvec, 0);
265 next = start;
266 while (next <= end && !ret && !wrapped &&
267 pagevec_lookup(&pvec, mapping, next,
268 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
269 for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
270 struct page *page = pvec.pages[i];
271 pgoff_t page_index;
272 int was_dirty;
273
274 lock_page(page);
275 if (page->mapping != mapping) {
276 unlock_page(page);
277 continue;
278 }
279 page_index = page->index;
280 next = page_index + 1;
281 if (next == 0)
282 wrapped = 1;
283 if (page_index > end) {
284 unlock_page(page);
285 break;
286 }
287 wait_on_page_writeback(page);
288 while (page_mapped(page)) {
289 if (!did_range_unmap) {
290 /*
291 * Zap the rest of the file in one hit.
292 */
293 unmap_mapping_range(mapping,
294 page_index << PAGE_CACHE_SHIFT,
295 (end - page_index + 1)
296 << PAGE_CACHE_SHIFT,
297 0);
298 did_range_unmap = 1;
299 } else {
300 /*
301 * Just zap this page
302 */
303 unmap_mapping_range(mapping,
304 page_index << PAGE_CACHE_SHIFT,
305 PAGE_CACHE_SIZE, 0);
306 }
307 }
308 was_dirty = test_clear_page_dirty(page);
309 if (!invalidate_complete_page(mapping, page)) {
310 if (was_dirty)
311 set_page_dirty(page);
312 ret = -EIO;
313 }
314 unlock_page(page);
315 }
316 pagevec_release(&pvec);
317 cond_resched();
318 }
319 return ret;
320}
321EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
322
323/**
324 * invalidate_inode_pages2 - remove all pages from an address_space
325 * @mapping - the address_space
326 *
327 * Any pages which are found to be mapped into pagetables are unmapped prior to
328 * invalidation.
329 *
330 * Returns -EIO if any pages could not be invalidated.
331 */
332int invalidate_inode_pages2(struct address_space *mapping)
333{
334 return invalidate_inode_pages2_range(mapping, 0, -1);
335}
336EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
new file mode 100644
index 00000000000..c6182f6f130
--- /dev/null
+++ b/mm/vmalloc.c
@@ -0,0 +1,588 @@
1/*
2 * linux/mm/vmalloc.c
3 *
4 * Copyright (C) 1993 Linus Torvalds
5 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
6 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
7 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
8 */
9
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <linux/highmem.h>
13#include <linux/slab.h>
14#include <linux/spinlock.h>
15#include <linux/interrupt.h>
16
17#include <linux/vmalloc.h>
18
19#include <asm/uaccess.h>
20#include <asm/tlbflush.h>
21
22
23DEFINE_RWLOCK(vmlist_lock);
24struct vm_struct *vmlist;
25
26static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
27{
28 pte_t *pte;
29
30 pte = pte_offset_kernel(pmd, addr);
31 do {
32 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
33 WARN_ON(!pte_none(ptent) && !pte_present(ptent));
34 } while (pte++, addr += PAGE_SIZE, addr != end);
35}
36
37static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
38 unsigned long end)
39{
40 pmd_t *pmd;
41 unsigned long next;
42
43 pmd = pmd_offset(pud, addr);
44 do {
45 next = pmd_addr_end(addr, end);
46 if (pmd_none_or_clear_bad(pmd))
47 continue;
48 vunmap_pte_range(pmd, addr, next);
49 } while (pmd++, addr = next, addr != end);
50}
51
52static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
53 unsigned long end)
54{
55 pud_t *pud;
56 unsigned long next;
57
58 pud = pud_offset(pgd, addr);
59 do {
60 next = pud_addr_end(addr, end);
61 if (pud_none_or_clear_bad(pud))
62 continue;
63 vunmap_pmd_range(pud, addr, next);
64 } while (pud++, addr = next, addr != end);
65}
66
67void unmap_vm_area(struct vm_struct *area)
68{
69 pgd_t *pgd;
70 unsigned long next;
71 unsigned long addr = (unsigned long) area->addr;
72 unsigned long end = addr + area->size;
73
74 BUG_ON(addr >= end);
75 pgd = pgd_offset_k(addr);
76 flush_cache_vunmap(addr, end);
77 do {
78 next = pgd_addr_end(addr, end);
79 if (pgd_none_or_clear_bad(pgd))
80 continue;
81 vunmap_pud_range(pgd, addr, next);
82 } while (pgd++, addr = next, addr != end);
83 flush_tlb_kernel_range((unsigned long) area->addr, end);
84}
85
86static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
87 unsigned long end, pgprot_t prot, struct page ***pages)
88{
89 pte_t *pte;
90
91 pte = pte_alloc_kernel(&init_mm, pmd, addr);
92 if (!pte)
93 return -ENOMEM;
94 do {
95 struct page *page = **pages;
96 WARN_ON(!pte_none(*pte));
97 if (!page)
98 return -ENOMEM;
99 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
100 (*pages)++;
101 } while (pte++, addr += PAGE_SIZE, addr != end);
102 return 0;
103}
104
105static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
106 unsigned long end, pgprot_t prot, struct page ***pages)
107{
108 pmd_t *pmd;
109 unsigned long next;
110
111 pmd = pmd_alloc(&init_mm, pud, addr);
112 if (!pmd)
113 return -ENOMEM;
114 do {
115 next = pmd_addr_end(addr, end);
116 if (vmap_pte_range(pmd, addr, next, prot, pages))
117 return -ENOMEM;
118 } while (pmd++, addr = next, addr != end);
119 return 0;
120}
121
122static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
123 unsigned long end, pgprot_t prot, struct page ***pages)
124{
125 pud_t *pud;
126 unsigned long next;
127
128 pud = pud_alloc(&init_mm, pgd, addr);
129 if (!pud)
130 return -ENOMEM;
131 do {
132 next = pud_addr_end(addr, end);
133 if (vmap_pmd_range(pud, addr, next, prot, pages))
134 return -ENOMEM;
135 } while (pud++, addr = next, addr != end);
136 return 0;
137}
138
139int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
140{
141 pgd_t *pgd;
142 unsigned long next;
143 unsigned long addr = (unsigned long) area->addr;
144 unsigned long end = addr + area->size - PAGE_SIZE;
145 int err;
146
147 BUG_ON(addr >= end);
148 pgd = pgd_offset_k(addr);
149 spin_lock(&init_mm.page_table_lock);
150 do {
151 next = pgd_addr_end(addr, end);
152 err = vmap_pud_range(pgd, addr, next, prot, pages);
153 if (err)
154 break;
155 } while (pgd++, addr = next, addr != end);
156 spin_unlock(&init_mm.page_table_lock);
157 flush_cache_vmap((unsigned long) area->addr, end);
158 return err;
159}
160
161#define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */
162
163struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
164 unsigned long start, unsigned long end)
165{
166 struct vm_struct **p, *tmp, *area;
167 unsigned long align = 1;
168 unsigned long addr;
169
170 if (flags & VM_IOREMAP) {
171 int bit = fls(size);
172
173 if (bit > IOREMAP_MAX_ORDER)
174 bit = IOREMAP_MAX_ORDER;
175 else if (bit < PAGE_SHIFT)
176 bit = PAGE_SHIFT;
177
178 align = 1ul << bit;
179 }
180 addr = ALIGN(start, align);
181 size = PAGE_ALIGN(size);
182
183 area = kmalloc(sizeof(*area), GFP_KERNEL);
184 if (unlikely(!area))
185 return NULL;
186
187 if (unlikely(!size)) {
188 kfree (area);
189 return NULL;
190 }
191
192 /*
193 * We always allocate a guard page.
194 */
195 size += PAGE_SIZE;
196
197 write_lock(&vmlist_lock);
198 for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) {
199 if ((unsigned long)tmp->addr < addr) {
200 if((unsigned long)tmp->addr + tmp->size >= addr)
201 addr = ALIGN(tmp->size +
202 (unsigned long)tmp->addr, align);
203 continue;
204 }
205 if ((size + addr) < addr)
206 goto out;
207 if (size + addr <= (unsigned long)tmp->addr)
208 goto found;
209 addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
210 if (addr > end - size)
211 goto out;
212 }
213
214found:
215 area->next = *p;
216 *p = area;
217
218 area->flags = flags;
219 area->addr = (void *)addr;
220 area->size = size;
221 area->pages = NULL;
222 area->nr_pages = 0;
223 area->phys_addr = 0;
224 write_unlock(&vmlist_lock);
225
226 return area;
227
228out:
229 write_unlock(&vmlist_lock);
230 kfree(area);
231 if (printk_ratelimit())
232 printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
233 return NULL;
234}
235
236/**
237 * get_vm_area - reserve a contiguous kernel virtual area
238 *
239 * @size: size of the area
240 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
241 *
242 * Search an area of @size in the kernel virtual mapping area,
243 * and reserve it for our purposes. Returns the area descriptor
244 * on success or %NULL on failure.
245 */
246struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
247{
248 return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
249}
250
251/**
252 * remove_vm_area - find and remove a contiguous kernel virtual area
253 *
254 * @addr: base address
255 *
256 * Search for the kernel VM area starting at @addr, and remove it.
257 * This function returns the found VM area, but using it is NOT safe
258 * on SMP machines.
259 */
260struct vm_struct *remove_vm_area(void *addr)
261{
262 struct vm_struct **p, *tmp;
263
264 write_lock(&vmlist_lock);
265 for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
266 if (tmp->addr == addr)
267 goto found;
268 }
269 write_unlock(&vmlist_lock);
270 return NULL;
271
272found:
273 unmap_vm_area(tmp);
274 *p = tmp->next;
275 write_unlock(&vmlist_lock);
276
277 /*
278 * Remove the guard page.
279 */
280 tmp->size -= PAGE_SIZE;
281 return tmp;
282}
283
284void __vunmap(void *addr, int deallocate_pages)
285{
286 struct vm_struct *area;
287
288 if (!addr)
289 return;
290
291 if ((PAGE_SIZE-1) & (unsigned long)addr) {
292 printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
293 WARN_ON(1);
294 return;
295 }
296
297 area = remove_vm_area(addr);
298 if (unlikely(!area)) {
299 printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
300 addr);
301 WARN_ON(1);
302 return;
303 }
304
305 if (deallocate_pages) {
306 int i;
307
308 for (i = 0; i < area->nr_pages; i++) {
309 if (unlikely(!area->pages[i]))
310 BUG();
311 __free_page(area->pages[i]);
312 }
313
314 if (area->nr_pages > PAGE_SIZE/sizeof(struct page *))
315 vfree(area->pages);
316 else
317 kfree(area->pages);
318 }
319
320 kfree(area);
321 return;
322}
323
324/**
325 * vfree - release memory allocated by vmalloc()
326 *
327 * @addr: memory base address
328 *
329 * Free the virtually contiguous memory area starting at @addr, as
330 * obtained from vmalloc(), vmalloc_32() or __vmalloc().
331 *
332 * May not be called in interrupt context.
333 */
void vfree(void *addr)
{
	/* Interrupt context is not allowed here. */
	BUG_ON(in_interrupt());

	/* Drop the mapping and free the backing pages. */
	__vunmap(addr, 1);
}
339
340EXPORT_SYMBOL(vfree);
341
342/**
343 * vunmap - release virtual mapping obtained by vmap()
344 *
345 * @addr: memory base address
346 *
347 * Free the virtually contiguous memory area starting at @addr,
348 * which was created from the page array passed to vmap().
349 *
350 * May not be called in interrupt context.
351 */
void vunmap(void *addr)
{
	/* Interrupt context is not allowed here. */
	BUG_ON(in_interrupt());

	/* Drop the mapping only; the pages are left untouched. */
	__vunmap(addr, 0);
}
357
358EXPORT_SYMBOL(vunmap);
359
360/**
361 * vmap - map an array of pages into virtually contiguous space
362 *
363 * @pages: array of page pointers
364 * @count: number of pages to map
365 * @flags: vm_area->flags
366 * @prot: page protection for the mapping
367 *
368 * Maps @count pages from @pages into contiguous kernel virtual
369 * space.
370 */
371void *vmap(struct page **pages, unsigned int count,
372 unsigned long flags, pgprot_t prot)
373{
374 struct vm_struct *area;
375
376 if (count > num_physpages)
377 return NULL;
378
379 area = get_vm_area((count << PAGE_SHIFT), flags);
380 if (!area)
381 return NULL;
382 if (map_vm_area(area, prot, &pages)) {
383 vunmap(area->addr);
384 return NULL;
385 }
386
387 return area->addr;
388}
389
390EXPORT_SYMBOL(vmap);
391
392void *__vmalloc_area(struct vm_struct *area, unsigned int __nocast gfp_mask, pgprot_t prot)
393{
394 struct page **pages;
395 unsigned int nr_pages, array_size, i;
396
397 nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
398 array_size = (nr_pages * sizeof(struct page *));
399
400 area->nr_pages = nr_pages;
401 /* Please note that the recursion is strictly bounded. */
402 if (array_size > PAGE_SIZE)
403 pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL);
404 else
405 pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM));
406 area->pages = pages;
407 if (!area->pages) {
408 remove_vm_area(area->addr);
409 kfree(area);
410 return NULL;
411 }
412 memset(area->pages, 0, array_size);
413
414 for (i = 0; i < area->nr_pages; i++) {
415 area->pages[i] = alloc_page(gfp_mask);
416 if (unlikely(!area->pages[i])) {
417 /* Successfully allocated i pages, free them in __vunmap() */
418 area->nr_pages = i;
419 goto fail;
420 }
421 }
422
423 if (map_vm_area(area, prot, &pages))
424 goto fail;
425 return area->addr;
426
427fail:
428 vfree(area->addr);
429 return NULL;
430}
431
432/**
433 * __vmalloc - allocate virtually contiguous memory
434 *
435 * @size: allocation size
436 * @gfp_mask: flags for the page level allocator
437 * @prot: protection mask for the allocated pages
438 *
439 * Allocate enough pages to cover @size from the page level
440 * allocator with @gfp_mask flags. Map them into contiguous
441 * kernel virtual space, using a pagetable protection of @prot.
442 */
443void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask, pgprot_t prot)
444{
445 struct vm_struct *area;
446
447 size = PAGE_ALIGN(size);
448 if (!size || (size >> PAGE_SHIFT) > num_physpages)
449 return NULL;
450
451 area = get_vm_area(size, VM_ALLOC);
452 if (!area)
453 return NULL;
454
455 return __vmalloc_area(area, gfp_mask, prot);
456}
457
458EXPORT_SYMBOL(__vmalloc);
459
460/**
461 * vmalloc - allocate virtually contiguous memory
462 *
463 * @size: allocation size
464 *
465 * Allocate enough pages to cover @size from the page level
466 * allocator and map them into contiguous kernel virtual space.
467 *
468 * For tight control over page level allocator and protection flags
469 * use __vmalloc() instead.
470 */
471void *vmalloc(unsigned long size)
472{
473 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); /* highmem pages allowed */
474}
475
476EXPORT_SYMBOL(vmalloc);
477
478/**
479 * vmalloc_exec - allocate virtually contiguous, executable memory
480 *
481 * @size: allocation size
482 *
483 * Kernel-internal function to allocate enough pages to cover @size
484 * from the page level allocator and map them into contiguous and
485 * executable kernel virtual space.
486 *
487 * For tight control over page level allocator and protection flags
488 * use __vmalloc() instead.
489 */
490
491#ifndef PAGE_KERNEL_EXEC /* no separate executable protection defined: use PAGE_KERNEL */
492# define PAGE_KERNEL_EXEC PAGE_KERNEL
493#endif
494
495void *vmalloc_exec(unsigned long size)
496{
497 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
498}
499
500/**
501 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
502 *
503 * @size: allocation size
504 *
505 * Allocate enough 32bit PA addressable pages to cover @size from the
506 * page level allocator and map them into contiguous kernel virtual space.
507 */
508void *vmalloc_32(unsigned long size)
509{
510 return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); /* unlike vmalloc(): no __GFP_HIGHMEM */
511}
512
513EXPORT_SYMBOL(vmalloc_32);
514
/*
 * vread - copy bytes out of the vm areas on vmlist into @buf
 *
 * @buf:   destination buffer
 * @addr:  first address to read
 * @count: maximum number of bytes to produce
 *
 * Walks vmlist under read_lock(&vmlist_lock).  Bytes that lie before the
 * start of the next area are produced as '\0'; bytes inside an area are
 * copied, leaving out the last PAGE_SIZE of each area (the guard page).
 * Returns the number of bytes stored in @buf.
 *
 * NOTE(review): the walk only makes sense if vmlist is sorted by
 * address - confirm against the list insertion code.
 */
515long vread(char *buf, char *addr, unsigned long count)
516{
517 struct vm_struct *tmp;
518 char *vaddr, *buf_start = buf;
519 unsigned long n;
520
521 /* Don't allow overflow */
522 if ((unsigned long) addr + count < count)
523 count = -(unsigned long) addr; /* clamp so that addr + count ends at the top of the address space */
524
525 read_lock(&vmlist_lock);
526 for (tmp = vmlist; tmp; tmp = tmp->next) {
527 vaddr = (char *) tmp->addr;
528 if (addr >= vaddr + tmp->size - PAGE_SIZE) /* addr is past this area's data */
529 continue;
530 while (addr < vaddr) { /* gap in front of the area: zero-fill */
531 if (count == 0)
532 goto finished;
533 *buf = '\0';
534 buf++;
535 addr++;
536 count--;
537 }
538 n = vaddr + tmp->size - PAGE_SIZE - addr; /* bytes left in this area; > 0 because of the check above */
539 do {
540 if (count == 0)
541 goto finished;
542 *buf = *addr;
543 buf++;
544 addr++;
545 count--;
546 } while (--n > 0);
547 }
548finished:
549 read_unlock(&vmlist_lock);
550 return buf - buf_start;
551}
552
/*
 * vwrite - copy bytes from @buf into the vm areas on vmlist
 *
 * @buf:   source buffer
 * @addr:  first address to write
 * @count: maximum number of bytes to consume
 *
 * Mirror image of vread(): walks vmlist under read_lock(&vmlist_lock).
 * Source bytes that correspond to addresses before the start of the next
 * area are skipped without being written; bytes inside an area are
 * stored, leaving out the last PAGE_SIZE of each area (the guard page).
 * Returns the number of bytes of @buf consumed (skipped or written).
 */
553long vwrite(char *buf, char *addr, unsigned long count)
554{
555 struct vm_struct *tmp;
556 char *vaddr, *buf_start = buf;
557 unsigned long n;
558
559 /* Don't allow overflow */
560 if ((unsigned long) addr + count < count)
561 count = -(unsigned long) addr; /* clamp so that addr + count ends at the top of the address space */
562
563 read_lock(&vmlist_lock);
564 for (tmp = vmlist; tmp; tmp = tmp->next) {
565 vaddr = (char *) tmp->addr;
566 if (addr >= vaddr + tmp->size - PAGE_SIZE) /* addr is past this area's data */
567 continue;
568 while (addr < vaddr) { /* gap in front of the area: skip the source bytes */
569 if (count == 0)
570 goto finished;
571 buf++;
572 addr++;
573 count--;
574 }
575 n = vaddr + tmp->size - PAGE_SIZE - addr; /* bytes left in this area; > 0 because of the check above */
576 do {
577 if (count == 0)
578 goto finished;
579 *addr = *buf;
580 buf++;
581 addr++;
582 count--;
583 } while (--n > 0);
584 }
585finished:
586 read_unlock(&vmlist_lock);
587 return buf - buf_start;
588}
diff --git a/mm/vmscan.c b/mm/vmscan.c
new file mode 100644
index 00000000000..4003c0518d2
--- /dev/null
+++ b/mm/vmscan.c
@@ -0,0 +1,1311 @@
1/*
2 * linux/mm/vmscan.c
3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 *
6 * Swap reorganised 29.12.95, Stephen Tweedie.
7 * kswapd added: 7.1.96 sct
8 * Removed kswapd_ctl limits, and swap out as many pages as needed
9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
11 * Multiqueue VM started 5.8.00, Rik van Riel.
12 */
13
14#include <linux/mm.h>
15#include <linux/module.h>
16#include <linux/slab.h>
17#include <linux/kernel_stat.h>
18#include <linux/swap.h>
19#include <linux/pagemap.h>
20#include <linux/init.h>
21#include <linux/highmem.h>
22#include <linux/file.h>
23#include <linux/writeback.h>
24#include <linux/blkdev.h>
25#include <linux/buffer_head.h> /* for try_to_release_page(),
26 buffer_heads_over_limit */
27#include <linux/mm_inline.h>
28#include <linux/pagevec.h>
29#include <linux/backing-dev.h>
30#include <linux/rmap.h>
31#include <linux/topology.h>
32#include <linux/cpu.h>
33#include <linux/cpuset.h>
34#include <linux/notifier.h>
35#include <linux/rwsem.h>
36
37#include <asm/tlbflush.h>
38#include <asm/div64.h>
39
40#include <linux/swapops.h>
41
42/* possible outcome of pageout(); shrink_list() switches on these values */
43typedef enum {
44 /* failed to write page out, page is locked */
45 PAGE_KEEP,
46 /* move page to the active list, page is locked */
47 PAGE_ACTIVATE,
48 /* page has been sent to the disk successfully, page is unlocked */
49 PAGE_SUCCESS,
50 /* page is clean and locked */
51 PAGE_CLEAN,
52} pageout_t;
53
/*
 * Parameters and running totals for one reclaim pass; a pointer to this
 * is handed down from try_to_free_pages() through shrink_caches() and
 * shrink_zone() to the list scanners.
 */
54struct scan_control {
55 /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
56 unsigned long nr_to_scan;
57
58 /* Incremented by the number of inactive pages that were scanned */
59 unsigned long nr_scanned;
60
61 /* Incremented by the number of pages reclaimed */
62 unsigned long nr_reclaimed;
63
64 unsigned long nr_mapped; /* From page_state */
65
66 /* How many pages shrink_cache() should reclaim */
67 int nr_to_reclaim;
68
69 /* Ask shrink_caches, or shrink_zone to scan at this priority */
70 unsigned int priority;
71
72 /* This context's GFP mask */
73 unsigned int gfp_mask;
74
75 int may_writepage; /* when zero, shrink_list() keeps dirty pages unwritten in laptop_mode */
76
77 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
78 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
79 * In this context, it doesn't matter that we scan the
80 * whole list at once. */
81 int swap_cluster_max;
82};
83
84/*
85 * One entry in the list of shrinker callbacks used to apply pressure to
86 * ageable caches.
87 */
88struct shrinker {
89 shrinker_t shrinker; /* the callback itself */
90 struct list_head list; /* link in shrinker_list */
91 int seeks; /* seeks to recreate an obj */
92 long nr; /* objs pending delete */
93};
94
/* The page whose ->lru link is the last entry (->prev) of list head _head. */
95#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
96
/*
 * prefetch_prev_lru_page / prefetchw_prev_lru_page: unless _page is the
 * entry right after list head _base, issue prefetch()/prefetchw() on
 * member _field of the page that precedes _page on the list.  Both expand
 * to nothing when the architecture does not define ARCH_HAS_PREFETCH /
 * ARCH_HAS_PREFETCHW.
 *
 * NOTE(review): _page is used unparenthesized in "_page->lru", so pass a
 * plain identifier (all callers visible here do).
 */
97#ifdef ARCH_HAS_PREFETCH
98#define prefetch_prev_lru_page(_page, _base, _field) \
99 do { \
100 if ((_page)->lru.prev != _base) { \
101 struct page *prev; \
102 \
103 prev = lru_to_page(&(_page->lru)); \
104 prefetch(&prev->_field); \
105 } \
106 } while (0)
107#else
108#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
109#endif
110
111#ifdef ARCH_HAS_PREFETCHW
112#define prefetchw_prev_lru_page(_page, _base, _field) \
113 do { \
114 if ((_page)->lru.prev != _base) { \
115 struct page *prev; \
116 \
117 prev = lru_to_page(&(_page->lru)); \
118 prefetchw(&prev->_field); \
119 } \
120 } while (0)
121#else
122#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
123#endif
124
125/*
126 * From 0 .. 100. Higher means more swappy.
127 */
128int vm_swappiness = 60; /* added into swap_tendency by refill_inactive_zone() */
129static long total_memory; /* divisor for mapped_ratio in refill_inactive_zone() */
130
131static LIST_HEAD(shrinker_list); /* every registered struct shrinker */
132static DECLARE_RWSEM(shrinker_rwsem); /* taken for write to edit shrinker_list, for read to walk it */
133
134/*
135 * Add a shrinker callback to be called from the vm
136 */
137struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
138{
139 struct shrinker *shrinker;
140
141 shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
142 if (shrinker) {
143 shrinker->shrinker = theshrinker;
144 shrinker->seeks = seeks;
145 shrinker->nr = 0;
146 down_write(&shrinker_rwsem);
147 list_add_tail(&shrinker->list, &shrinker_list);
148 up_write(&shrinker_rwsem);
149 }
150 return shrinker;
151}
152EXPORT_SYMBOL(set_shrinker);
153
154/*
155 * Remove one: unlink @shrinker from shrinker_list and free it.
156 */
157void remove_shrinker(struct shrinker *shrinker)
158{
159 down_write(&shrinker_rwsem);
160 list_del(&shrinker->list);
161 up_write(&shrinker_rwsem);
162 kfree(shrinker); /* the pointer must not be used by the caller afterwards */
163}
164EXPORT_SYMBOL(remove_shrinker);
165
166#define SHRINK_BATCH 128
167/*
168 * Call the shrink functions to age shrinkable caches
169 *
170 * Here we assume it costs one seek to replace a lru page and that it also
171 * takes a seek to recreate a cache object. With this in mind we age equal
172 * percentages of the lru and ageable caches. This should balance the seeks
173 * generated by these structures.
174 *
175 * If the vm encountered mapped pages on the LRU it increases the pressure on
176 * slab to avoid swapping.
177 *
178 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
179 *
180 * `lru_pages' represents the number of on-LRU pages in all the zones which
181 * are eligible for the caller's allocation attempt. It is used for balancing
182 * slab reclaim versus page reclaim.
 *
 * Always returns 0.  Nothing is shrunk when shrinker_rwsem cannot be taken
 * for reading without blocking.
183 */
184static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
185 unsigned long lru_pages)
186{
187 struct shrinker *shrinker;
188
189 if (scanned == 0)
190 scanned = SWAP_CLUSTER_MAX;
191
192 if (!down_read_trylock(&shrinker_rwsem))
193 return 0;
194
195 list_for_each_entry(shrinker, &shrinker_list, list) {
196 unsigned long long delta;
197 unsigned long total_scan;
198
199 delta = (4 * scanned) / shrinker->seeks;
200 delta *= (*shrinker->shrinker)(0, gfp_mask); /* presumably a count query when called with 0 - TODO confirm against shrinker_t users */
201 do_div(delta, lru_pages + 1); /* + 1 keeps the divisor non-zero */
202 shrinker->nr += delta;
203 if (shrinker->nr < 0)
204 shrinker->nr = LONG_MAX; /* It wrapped! */
205
206 total_scan = shrinker->nr;
207 shrinker->nr = 0;
208
209 while (total_scan >= SHRINK_BATCH) { /* work in SHRINK_BATCH sized calls */
210 long this_scan = SHRINK_BATCH;
211 int shrink_ret;
212
213 shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
214 if (shrink_ret == -1) /* callback refused: stop batching for this shrinker */
215 break;
216 mod_page_state(slabs_scanned, this_scan);
217 total_scan -= this_scan;
218
219 cond_resched();
220 }
221
222 shrinker->nr += total_scan; /* carry the unprocessed remainder to the next call */
223 }
224 up_read(&shrinker_rwsem);
225 return 0;
226}
227
228/* Called without lock on whether page is mapped, so answer is unstable */
229static inline int page_mapping_inuse(struct page *page)
230{
231 struct address_space *mapping;
232
233 /* Page is in somebody's page tables. */
234 if (page_mapped(page))
235 return 1;
236
237 /* Be more reluctant to reclaim swapcache than pagecache */
238 if (PageSwapCache(page))
239 return 1;
240
241 mapping = page_mapping(page);
242 if (!mapping)
243 return 0;
244
245 /* File is mmap'd by somebody? */
246 return mapping_mapped(mapping);
247}
248
/*
 * True when the page's reference count, not counting one reference if
 * PagePrivate is set, is exactly 2.  shrink_list() describes a count of 2
 * as "pagecache + us".
 */
249static inline int is_page_cache_freeable(struct page *page)
250{
251 return page_count(page) - !!PagePrivate(page) == 2;
252}
253
254static int may_write_to_queue(struct backing_dev_info *bdi)
255{
256 if (current_is_kswapd())
257 return 1;
258 if (current_is_pdflush()) /* This is unlikely, but why not... */
259 return 1;
260 if (!bdi_write_congested(bdi))
261 return 1;
262 if (bdi == current->backing_dev_info)
263 return 1;
264 return 0;
265}
266
267/*
268 * We detected a synchronous write error writing a page out. Probably
269 * -ENOSPC. We need to propagate that into the address_space for a subsequent
270 * fsync(), msync() or close().
271 *
272 * The tricky part is that after writepage we cannot touch the mapping: nothing
273 * prevents it from being freed up. But we have a ref on the page and once
274 * that page is locked, the mapping is pinned.
275 *
276 * We're allowed to run sleeping lock_page() here because we know the caller has
277 * __GFP_FS.
278 */
279static void handle_write_error(struct address_space *mapping,
280 struct page *page, int error)
281{
282 lock_page(page);
283 if (page_mapping(page) == mapping) {
284 if (error == -ENOSPC)
285 set_bit(AS_ENOSPC, &mapping->flags);
286 else
287 set_bit(AS_EIO, &mapping->flags);
288 }
289 unlock_page(page);
290}
291
292/*
293 * pageout is called by shrink_list() for each dirty page. Calls ->writepage().
 *
 * Returns one of the pageout_t values:
 *   PAGE_KEEP     - not freeable, orphaned page whose buffers could not be
 *                   dropped, or writing to the queue is not allowed now
 *   PAGE_ACTIVATE - the mapping has no ->writepage, or ->writepage()
 *                   returned WRITEPAGE_ACTIVATE
 *   PAGE_SUCCESS  - ->writepage() was called
 *   PAGE_CLEAN    - the page turned out not to need writing
294 */
295static pageout_t pageout(struct page *page, struct address_space *mapping)
296{
297 /*
298 * If the page is dirty, only perform writeback if that write
299 * will be non-blocking. To prevent this allocation from being
300 * stalled by pagecache activity. But note that there may be
301 * stalls if we need to run get_block(). We could test
302 * PagePrivate for that.
303 *
304 * If this process is currently in generic_file_write() against
305 * this page's queue, we can perform writeback even if that
306 * will block.
307 *
308 * If the page is swapcache, write it back even if that would
309 * block, for some throttling. This happens by accident, because
310 * swap_backing_dev_info is bust: it doesn't reflect the
311 * congestion state of the swapdevs. Easy to fix, if needed.
312 * See swapfile.c:page_queue_congested().
313 */
314 if (!is_page_cache_freeable(page))
315 return PAGE_KEEP;
316 if (!mapping) {
317 /*
318 * Some data journaling orphaned pages can have
319 * page->mapping == NULL while being dirty with clean buffers.
320 */
321 if (PageDirty(page) && PagePrivate(page)) {
322 if (try_to_free_buffers(page)) {
323 ClearPageDirty(page);
324 printk("%s: orphaned page\n", __FUNCTION__);
325 return PAGE_CLEAN;
326 }
327 }
328 return PAGE_KEEP;
329 }
330 if (mapping->a_ops->writepage == NULL)
331 return PAGE_ACTIVATE;
332 if (!may_write_to_queue(mapping->backing_dev_info))
333 return PAGE_KEEP;
334
335 if (clear_page_dirty_for_io(page)) {
336 int res;
337 struct writeback_control wbc = {
338 .sync_mode = WB_SYNC_NONE,
339 .nr_to_write = SWAP_CLUSTER_MAX,
340 .nonblocking = 1,
341 .for_reclaim = 1,
342 };
343
344 SetPageReclaim(page);
345 res = mapping->a_ops->writepage(page, &wbc);
346 if (res < 0) /* error is recorded in the mapping; PAGE_SUCCESS is still returned below */
347 handle_write_error(mapping, page, res);
348 if (res == WRITEPAGE_ACTIVATE) {
349 ClearPageReclaim(page);
350 return PAGE_ACTIVATE;
351 }
352 if (!PageWriteback(page)) {
353 /* synchronous write or broken a_ops? */
354 ClearPageReclaim(page);
355 }
356
357 return PAGE_SUCCESS;
358 }
359
360 return PAGE_CLEAN; /* the dirty bit was already clear */
361}
362
363/*
364 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
 *
 * Tries to free every page on @page_list.  Pages that could not be freed
 * are spliced back onto @page_list before returning (with PG_active set
 * on those that should be activated).  sc->nr_scanned is advanced for
 * each page that could be locked.  Returns the number of pages freed.
365 */
366static int shrink_list(struct list_head *page_list, struct scan_control *sc)
367{
368 LIST_HEAD(ret_pages); /* pages that survive this pass */
369 struct pagevec freed_pvec; /* batches the freed pages for release */
370 int pgactivate = 0;
371 int reclaimed = 0;
372
373 cond_resched();
374
375 pagevec_init(&freed_pvec, 1);
376 while (!list_empty(page_list)) {
377 struct address_space *mapping;
378 struct page *page;
379 int may_enter_fs;
380 int referenced;
381
382 cond_resched();
383
384 page = lru_to_page(page_list);
385 list_del(&page->lru);
386
387 if (TestSetPageLocked(page)) /* already locked by someone else: leave it alone */
388 goto keep;
389
390 BUG_ON(PageActive(page));
391
392 sc->nr_scanned++;
393 /* Double the slab pressure for mapped and swapcache pages */
394 if (page_mapped(page) || PageSwapCache(page))
395 sc->nr_scanned++;
396
397 if (PageWriteback(page))
398 goto keep_locked;
399
400 referenced = page_referenced(page, 1, sc->priority <= 0);
401 /* In active use or really unfreeable? Activate it. */
402 if (referenced && page_mapping_inuse(page))
403 goto activate_locked;
404
405#ifdef CONFIG_SWAP
406 /*
407 * Anonymous process memory has backing store?
408 * Try to allocate it some swap space here.
409 */
410 if (PageAnon(page) && !PageSwapCache(page)) {
411 if (!add_to_swap(page))
412 goto activate_locked;
413 }
414#endif /* CONFIG_SWAP */
415
416 mapping = page_mapping(page);
417 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
418 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
419
420 /*
421 * The page is mapped into the page tables of one or more
422 * processes. Try to unmap it here.
423 */
424 if (page_mapped(page) && mapping) {
425 switch (try_to_unmap(page)) {
426 case SWAP_FAIL:
427 goto activate_locked;
428 case SWAP_AGAIN:
429 goto keep_locked;
430 case SWAP_SUCCESS:
431 ; /* try to free the page below */
432 }
433 }
434
435 if (PageDirty(page)) {
436 if (referenced)
437 goto keep_locked;
438 if (!may_enter_fs)
439 goto keep_locked;
440 if (laptop_mode && !sc->may_writepage)
441 goto keep_locked;
442
443 /* Page is dirty, try to write it out here */
444 switch(pageout(page, mapping)) {
445 case PAGE_KEEP:
446 goto keep_locked;
447 case PAGE_ACTIVATE:
448 goto activate_locked;
449 case PAGE_SUCCESS:
450 if (PageWriteback(page) || PageDirty(page))
451 goto keep;
452 /*
453 * A synchronous write - probably a ramdisk. Go
454 * ahead and try to reclaim the page.
455 */
456 if (TestSetPageLocked(page))
457 goto keep;
458 if (PageDirty(page) || PageWriteback(page))
459 goto keep_locked;
460 mapping = page_mapping(page); /* re-read now that the page is locked again; falls through */
461 case PAGE_CLEAN:
462 ; /* try to free the page below */
463 }
464 }
465
466 /*
467 * If the page has buffers, try to free the buffer mappings
468 * associated with this page. If we succeed we try to free
469 * the page as well.
470 *
471 * We do this even if the page is PageDirty().
472 * try_to_release_page() does not perform I/O, but it is
473 * possible for a page to have PageDirty set, but it is actually
474 * clean (all its buffers are clean). This happens if the
475 * buffers were written out directly, with submit_bh(). ext3
476 * will do this, as well as the blockdev mapping.
477 * try_to_release_page() will discover that cleanness and will
478 * drop the buffers and mark the page clean - it can be freed.
479 *
480 * Rarely, pages can have buffers and no ->mapping. These are
481 * the pages which were not successfully invalidated in
482 * truncate_complete_page(). We try to drop those buffers here
483 * and if that worked, and the page is no longer mapped into
484 * process address space (page_count == 1) it can be freed.
485 * Otherwise, leave the page on the LRU so it is swappable.
486 */
487 if (PagePrivate(page)) {
488 if (!try_to_release_page(page, sc->gfp_mask))
489 goto activate_locked;
490 if (!mapping && page_count(page) == 1)
491 goto free_it;
492 }
493
494 if (!mapping)
495 goto keep_locked; /* truncate got there first */
496
497 write_lock_irq(&mapping->tree_lock);
498
499 /*
500 * The non-racy check for busy page. It is critical to check
501 * PageDirty _after_ making sure that the page is freeable and
502 * not in use by anybody. (pagecache + us == 2)
503 */
504 if (page_count(page) != 2 || PageDirty(page)) {
505 write_unlock_irq(&mapping->tree_lock);
506 goto keep_locked;
507 }
508
509#ifdef CONFIG_SWAP
510 if (PageSwapCache(page)) {
511 swp_entry_t swap = { .val = page->private };
512 __delete_from_swap_cache(page);
513 write_unlock_irq(&mapping->tree_lock);
514 swap_free(swap);
515 __put_page(page); /* The pagecache ref */
516 goto free_it;
517 }
518#endif /* CONFIG_SWAP */
519
520 __remove_from_page_cache(page);
521 write_unlock_irq(&mapping->tree_lock);
522 __put_page(page); /* The pagecache ref */
523
524free_it:
525 unlock_page(page);
526 reclaimed++;
527 if (!pagevec_add(&freed_pvec, page)) /* pagevec is full: release the batch */
528 __pagevec_release_nonlru(&freed_pvec);
529 continue;
530
531activate_locked:
532 SetPageActive(page);
533 pgactivate++;
534keep_locked:
535 unlock_page(page);
536keep:
537 list_add(&page->lru, &ret_pages);
538 BUG_ON(PageLRU(page));
539 }
540 list_splice(&ret_pages, page_list); /* hand the survivors back to the caller */
541 if (pagevec_count(&freed_pvec))
542 __pagevec_release_nonlru(&freed_pvec);
543 mod_page_state(pgactivate, pgactivate);
544 sc->nr_reclaimed += reclaimed;
545 return reclaimed;
546}
547
548/*
549 * zone->lru_lock is heavily contended. Some of the functions that
550 * shrink the lists perform better by taking out a batch of pages
551 * and working on them outside the LRU lock.
552 *
553 * For pagecache intensive workloads, this function is the hottest
554 * spot in the kernel (apart from copy_*_user functions).
555 *
556 * Appropriate locks must be held before calling this function.
557 *
558 * @nr_to_scan: The number of pages to look through on the list.
559 * @src: The LRU list to pull pages off.
560 * @dst: The temp list to put pages on to.
561 * @scanned: The number of pages that were scanned.
562 *
563 * returns how many pages were moved onto *@dst.
564 */
565static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
566 struct list_head *dst, int *scanned)
567{
568 int nr_taken = 0;
569 struct page *page;
570 int scan = 0;
571
572 while (scan++ < nr_to_scan && !list_empty(src)) {
573 page = lru_to_page(src);
574 prefetchw_prev_lru_page(page, src, flags);
575
576 if (!TestClearPageLRU(page))
577 BUG();
578 list_del(&page->lru);
579 if (get_page_testone(page)) {
580 /*
581 * It is being freed elsewhere
582 */
583 __put_page(page);
584 SetPageLRU(page);
585 list_add(&page->lru, src);
586 continue;
587 } else {
588 list_add(&page->lru, dst);
589 nr_taken++;
590 }
591 }
592
593 *scanned = scan;
594 return nr_taken;
595}
596
597/*
598 * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
 *
 * Repeatedly isolates up to sc->swap_cluster_max pages from the zone's
 * inactive list, runs shrink_list() on them with zone->lru_lock dropped,
 * and puts whatever shrink_list() left over back on the active or
 * inactive list.  Stops once about sc->nr_to_scan pages have been scanned
 * or nothing could be isolated.  sc->nr_to_reclaim is decremented by the
 * number of pages freed.
599 */
600static void shrink_cache(struct zone *zone, struct scan_control *sc)
601{
602 LIST_HEAD(page_list);
603 struct pagevec pvec;
604 int max_scan = sc->nr_to_scan;
605
606 pagevec_init(&pvec, 1);
607
608 lru_add_drain();
609 spin_lock_irq(&zone->lru_lock);
610 while (max_scan > 0) {
611 struct page *page;
612 int nr_taken;
613 int nr_scan;
614 int nr_freed;
615
616 nr_taken = isolate_lru_pages(sc->swap_cluster_max,
617 &zone->inactive_list,
618 &page_list, &nr_scan);
619 zone->nr_inactive -= nr_taken;
620 zone->pages_scanned += nr_scan;
621 spin_unlock_irq(&zone->lru_lock);
622
623 if (nr_taken == 0) /* lru_lock is already released here */
624 goto done;
625
626 max_scan -= nr_scan;
627 if (current_is_kswapd())
628 mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
629 else
630 mod_page_state_zone(zone, pgscan_direct, nr_scan);
631 nr_freed = shrink_list(&page_list, sc);
632 if (current_is_kswapd())
633 mod_page_state(kswapd_steal, nr_freed);
634 mod_page_state_zone(zone, pgsteal, nr_freed);
635 sc->nr_to_reclaim -= nr_freed;
636
637 spin_lock_irq(&zone->lru_lock);
638 /*
639 * Put back any unfreeable pages.
640 */
641 while (!list_empty(&page_list)) {
642 page = lru_to_page(&page_list);
643 if (TestSetPageLRU(page))
644 BUG();
645 list_del(&page->lru);
646 if (PageActive(page)) /* shrink_list() asked for activation */
647 add_page_to_active_list(zone, page);
648 else
649 add_page_to_inactive_list(zone, page);
650 if (!pagevec_add(&pvec, page)) { /* pagevec full: release it without holding lru_lock */
651 spin_unlock_irq(&zone->lru_lock);
652 __pagevec_release(&pvec);
653 spin_lock_irq(&zone->lru_lock);
654 }
655 }
656 }
657 spin_unlock_irq(&zone->lru_lock);
658done:
659 pagevec_release(&pvec);
660}
661
662/*
663 * This moves pages from the active list to the inactive list.
664 *
665 * We move them the other way if the page is referenced by one or more
666 * processes, from rmap.
667 *
668 * If the pages are mostly unmapped, the processing is fast and it is
669 * appropriate to hold zone->lru_lock across the whole operation. But if
670 * the pages are mapped, the processing is slow (page_referenced()) so we
671 * should drop zone->lru_lock around each page. It's impossible to balance
672 * this, so instead we remove the pages from the LRU while processing them.
673 * It is safe to rely on PG_active against the non-LRU pages in here because
674 * nobody will play with that bit on a non-LRU page.
675 *
676 * The downside is that we have to touch page->_count against each page.
677 * But we had to alter page->flags anyway.
 *
 * Scans up to sc->nr_to_scan pages taken from zone->active_list.
678 */
679static void
680refill_inactive_zone(struct zone *zone, struct scan_control *sc)
681{
682 int pgmoved;
683 int pgdeactivate = 0;
684 int pgscanned;
685 int nr_pages = sc->nr_to_scan;
686 LIST_HEAD(l_hold); /* The pages which were snipped off */
687 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
688 LIST_HEAD(l_active); /* Pages to go onto the active_list */
689 struct page *page;
690 struct pagevec pvec;
691 int reclaim_mapped = 0;
692 long mapped_ratio;
693 long distress;
694 long swap_tendency;
695
696 lru_add_drain();
697 spin_lock_irq(&zone->lru_lock);
698 pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
699 &l_hold, &pgscanned);
700 zone->pages_scanned += pgscanned;
701 zone->nr_active -= pgmoved;
702 spin_unlock_irq(&zone->lru_lock);
703
704 /*
705 * `distress' is a measure of how much trouble we're having reclaiming
706 * pages. 0 -> no problems. 100 -> great trouble.
707 */
708 distress = 100 >> zone->prev_priority;
709
710 /*
711 * The point of this algorithm is to decide when to start reclaiming
712 * mapped memory instead of just pagecache. Work out how much memory
713 * is mapped.
714 */
715 mapped_ratio = (sc->nr_mapped * 100) / total_memory;
716
717 /*
718 * Now decide how much we really want to unmap some pages. The mapped
719 * ratio is downgraded - just because there's a lot of mapped memory
720 * doesn't necessarily mean that page reclaim isn't succeeding.
721 *
722 * The distress ratio is important - we don't want to start going oom.
723 *
724 * A 100% value of vm_swappiness overrides this algorithm altogether.
725 */
726 swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
727
728 /*
729 * Now use this metric to decide whether to start moving mapped memory
730 * onto the inactive list.
731 */
732 if (swap_tendency >= 100)
733 reclaim_mapped = 1;
734
735 while (!list_empty(&l_hold)) { /* sort the isolated pages; lru_lock is not held here */
736 cond_resched();
737 page = lru_to_page(&l_hold);
738 list_del(&page->lru);
739 if (page_mapped(page)) {
740 if (!reclaim_mapped ||
741 (total_swap_pages == 0 && PageAnon(page)) ||
742 page_referenced(page, 0, sc->priority <= 0)) {
743 list_add(&page->lru, &l_active);
744 continue;
745 }
746 }
747 list_add(&page->lru, &l_inactive);
748 }
749
750 pagevec_init(&pvec, 1);
751 pgmoved = 0;
752 spin_lock_irq(&zone->lru_lock);
753 while (!list_empty(&l_inactive)) {
754 page = lru_to_page(&l_inactive);
755 prefetchw_prev_lru_page(page, &l_inactive, flags);
756 if (TestSetPageLRU(page))
757 BUG();
758 if (!TestClearPageActive(page))
759 BUG();
760 list_move(&page->lru, &zone->inactive_list);
761 pgmoved++;
762 if (!pagevec_add(&pvec, page)) { /* pagevec full: account and release outside lru_lock */
763 zone->nr_inactive += pgmoved;
764 spin_unlock_irq(&zone->lru_lock);
765 pgdeactivate += pgmoved;
766 pgmoved = 0;
767 if (buffer_heads_over_limit)
768 pagevec_strip(&pvec);
769 __pagevec_release(&pvec);
770 spin_lock_irq(&zone->lru_lock);
771 }
772 }
773 zone->nr_inactive += pgmoved;
774 pgdeactivate += pgmoved;
775 if (buffer_heads_over_limit) {
776 spin_unlock_irq(&zone->lru_lock);
777 pagevec_strip(&pvec);
778 spin_lock_irq(&zone->lru_lock);
779 }
780
781 pgmoved = 0;
782 while (!list_empty(&l_active)) {
783 page = lru_to_page(&l_active);
784 prefetchw_prev_lru_page(page, &l_active, flags);
785 if (TestSetPageLRU(page))
786 BUG();
787 BUG_ON(!PageActive(page));
788 list_move(&page->lru, &zone->active_list);
789 pgmoved++;
790 if (!pagevec_add(&pvec, page)) {
791 zone->nr_active += pgmoved;
792 pgmoved = 0;
793 spin_unlock_irq(&zone->lru_lock);
794 __pagevec_release(&pvec);
795 spin_lock_irq(&zone->lru_lock);
796 }
797 }
798 zone->nr_active += pgmoved;
799 spin_unlock_irq(&zone->lru_lock);
800 pagevec_release(&pvec);
801
802 mod_page_state_zone(zone, pgrefill, pgscanned);
803 mod_page_state(pgdeactivate, pgdeactivate);
804}
805
806/*
807 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
808 */
809static void
810shrink_zone(struct zone *zone, struct scan_control *sc)
811{
812 unsigned long nr_active;
813 unsigned long nr_inactive;
814
815 /*
816 * Add one to `nr_to_scan' just to make sure that the kernel will
817 * slowly sift through the active list.
818 */
819 zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
820 nr_active = zone->nr_scan_active;
821 if (nr_active >= sc->swap_cluster_max)
822 zone->nr_scan_active = 0;
823 else
824 nr_active = 0;
825
826 zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
827 nr_inactive = zone->nr_scan_inactive;
828 if (nr_inactive >= sc->swap_cluster_max)
829 zone->nr_scan_inactive = 0;
830 else
831 nr_inactive = 0;
832
833 sc->nr_to_reclaim = sc->swap_cluster_max;
834
835 while (nr_active || nr_inactive) {
836 if (nr_active) {
837 sc->nr_to_scan = min(nr_active,
838 (unsigned long)sc->swap_cluster_max);
839 nr_active -= sc->nr_to_scan;
840 refill_inactive_zone(zone, sc);
841 }
842
843 if (nr_inactive) {
844 sc->nr_to_scan = min(nr_inactive,
845 (unsigned long)sc->swap_cluster_max);
846 nr_inactive -= sc->nr_to_scan;
847 shrink_cache(zone, sc);
848 if (sc->nr_to_reclaim <= 0)
849 break;
850 }
851 }
852
853 throttle_vm_writeout();
854}
855
856/*
857 * This is the direct reclaim path, for page-allocating processes. We only
858 * try to reclaim pages from zones which will satisfy the caller's allocation
859 * request.
860 *
861 * We reclaim from a zone even if that zone is over pages_high. Because:
862 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
863 * allocation or
864 * b) The zones may be over pages_high but they must go *over* pages_high to
865 * satisfy the `incremental min' zone defense algorithm.
866 *
867 * Returns the number of reclaimed pages.
868 *
869 * If a zone is deemed to be full of pinned pages then just give it a light
870 * scan then give up on it.
871 */
872static void
873shrink_caches(struct zone **zones, struct scan_control *sc)
874{
875 int i;
876
877 for (i = 0; zones[i] != NULL; i++) {
878 struct zone *zone = zones[i];
879
880 if (zone->present_pages == 0)
881 continue;
882
883 if (!cpuset_zone_allowed(zone))
884 continue;
885
886 zone->temp_priority = sc->priority;
887 if (zone->prev_priority > sc->priority)
888 zone->prev_priority = sc->priority;
889
890 if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
891 continue; /* Let kswapd poll it */
892
893 shrink_zone(zone, sc);
894 }
895}
896
897/*
898 * This is the main entry point to direct page reclaim.
899 *
900 * If a full scan of the inactive list fails to free enough memory then we
901 * are "out of memory" and something needs to be killed.
902 *
903 * If the caller is !__GFP_FS then the probability of a failure is reasonably
904 * high - the zone may be full of dirty or under-writeback pages, which this
905 * caller can't do much about. We kick pdflush and take explicit naps in the
906 * hope that some of these pages can be written. But if the allocating task
907 * holds filesystem locks which prevent writeout this might not work, and the
908 * allocation attempt will fail.
909 */
910int try_to_free_pages(struct zone **zones,
911 unsigned int gfp_mask, unsigned int order)
912{
913 int priority;
914 int ret = 0;
915 int total_scanned = 0, total_reclaimed = 0;
916 struct reclaim_state *reclaim_state = current->reclaim_state;
917 struct scan_control sc;
918 unsigned long lru_pages = 0;
919 int i;
920
921 sc.gfp_mask = gfp_mask;
922 sc.may_writepage = 0;
923
924 inc_page_state(allocstall);
925
926 for (i = 0; zones[i] != NULL; i++) {
927 struct zone *zone = zones[i];
928
929 if (!cpuset_zone_allowed(zone))
930 continue;
931
932 zone->temp_priority = DEF_PRIORITY;
933 lru_pages += zone->nr_active + zone->nr_inactive;
934 }
935
936 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
937 sc.nr_mapped = read_page_state(nr_mapped);
938 sc.nr_scanned = 0;
939 sc.nr_reclaimed = 0;
940 sc.priority = priority;
941 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
942 shrink_caches(zones, &sc);
943 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
944 if (reclaim_state) {
945 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
946 reclaim_state->reclaimed_slab = 0;
947 }
948 total_scanned += sc.nr_scanned;
949 total_reclaimed += sc.nr_reclaimed;
950 if (total_reclaimed >= sc.swap_cluster_max) {
951 ret = 1;
952 goto out;
953 }
954
955 /*
956 * Try to write back as many pages as we just scanned. This
957 * tends to cause slow streaming writers to write data to the
958 * disk smoothly, at the dirtying rate, which is nice. But
959 * that's undesirable in laptop mode, where we *want* lumpy
960 * writeout. So in laptop mode, write out the whole world.
961 */
962 if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
963 wakeup_bdflush(laptop_mode ? 0 : total_scanned);
964 sc.may_writepage = 1;
965 }
966
967 /* Take a nap, wait for some writeback to complete */
968 if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
969 blk_congestion_wait(WRITE, HZ/10);
970 }
971out:
972 for (i = 0; zones[i] != 0; i++) {
973 struct zone *zone = zones[i];
974
975 if (!cpuset_zone_allowed(zone))
976 continue;
977
978 zone->prev_priority = zone->temp_priority;
979 }
980 return ret;
981}
982
983/*
984 * For kswapd, balance_pgdat() will work across all this node's zones until
985 * they are all at pages_high.
986 *
987 * If `nr_pages' is non-zero then it is the number of pages which are to be
988 * reclaimed, regardless of the zone occupancies. This is a software suspend
989 * special.
990 *
991 * Returns the number of pages which were actually freed.
992 *
993 * There is special handling here for zones which are full of pinned pages.
994 * This can happen if the pages are all mlocked, or if they are all used by
995 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
996 * What we do is to detect the case where all pages in the zone have been
997 * scanned twice and there has been zero successful reclaim. Mark the zone as
998 * dead and from now on, only perform a short scan. Basically we're polling
999 * the zone for when the problem goes away.
1000 *
1001 * kswapd scans the zones in the highmem->normal->dma direction. It skips
1002 * zones which have free_pages > pages_high, but once a zone is found to have
1003 * free_pages <= pages_high, we scan that zone and the lower zones regardless
1004 * of the number of free pages in the lower zones. This interoperates with
1005 * the page allocator fallback scheme to ensure that aging of pages is balanced
1006 * across the zones.
1007 */
1008static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
1009{
1010 int to_free = nr_pages;
1011 int all_zones_ok;
1012 int priority;
1013 int i;
1014 int total_scanned, total_reclaimed;
1015 struct reclaim_state *reclaim_state = current->reclaim_state;
1016 struct scan_control sc;
1017
1018loop_again:
1019 total_scanned = 0;
1020 total_reclaimed = 0;
1021 sc.gfp_mask = GFP_KERNEL;
1022 sc.may_writepage = 0;
1023 sc.nr_mapped = read_page_state(nr_mapped);
1024
1025 inc_page_state(pageoutrun);
1026
1027 for (i = 0; i < pgdat->nr_zones; i++) {
1028 struct zone *zone = pgdat->node_zones + i;
1029
1030 zone->temp_priority = DEF_PRIORITY;
1031 }
1032
1033 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1034 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
1035 unsigned long lru_pages = 0;
1036
1037 all_zones_ok = 1;
1038
1039 if (nr_pages == 0) {
1040 /*
1041 * Scan in the highmem->dma direction for the highest
1042 * zone which needs scanning
1043 */
1044 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1045 struct zone *zone = pgdat->node_zones + i;
1046
1047 if (zone->present_pages == 0)
1048 continue;
1049
1050 if (zone->all_unreclaimable &&
1051 priority != DEF_PRIORITY)
1052 continue;
1053
1054 if (!zone_watermark_ok(zone, order,
1055 zone->pages_high, 0, 0, 0)) {
1056 end_zone = i;
1057 goto scan;
1058 }
1059 }
1060 goto out;
1061 } else {
1062 end_zone = pgdat->nr_zones - 1;
1063 }
1064scan:
1065 for (i = 0; i <= end_zone; i++) {
1066 struct zone *zone = pgdat->node_zones + i;
1067
1068 lru_pages += zone->nr_active + zone->nr_inactive;
1069 }
1070
1071 /*
1072 * Now scan the zone in the dma->highmem direction, stopping
1073 * at the last zone which needs scanning.
1074 *
1075 * We do this because the page allocator works in the opposite
1076 * direction. This prevents the page allocator from allocating
1077 * pages behind kswapd's direction of progress, which would
1078 * cause too much scanning of the lower zones.
1079 */
1080 for (i = 0; i <= end_zone; i++) {
1081 struct zone *zone = pgdat->node_zones + i;
1082
1083 if (zone->present_pages == 0)
1084 continue;
1085
1086 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1087 continue;
1088
1089 if (nr_pages == 0) { /* Not software suspend */
1090 if (!zone_watermark_ok(zone, order,
1091 zone->pages_high, end_zone, 0, 0))
1092 all_zones_ok = 0;
1093 }
1094 zone->temp_priority = priority;
1095 if (zone->prev_priority > priority)
1096 zone->prev_priority = priority;
1097 sc.nr_scanned = 0;
1098 sc.nr_reclaimed = 0;
1099 sc.priority = priority;
1100 sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
1101 shrink_zone(zone, &sc);
1102 reclaim_state->reclaimed_slab = 0;
1103 shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
1104 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
1105 total_reclaimed += sc.nr_reclaimed;
1106 total_scanned += sc.nr_scanned;
1107 if (zone->all_unreclaimable)
1108 continue;
1109 if (zone->pages_scanned >= (zone->nr_active +
1110 zone->nr_inactive) * 4)
1111 zone->all_unreclaimable = 1;
1112 /*
1113 * If we've done a decent amount of scanning and
1114 * the reclaim ratio is low, start doing writepage
1115 * even in laptop mode
1116 */
1117 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1118 total_scanned > total_reclaimed+total_reclaimed/2)
1119 sc.may_writepage = 1;
1120 }
1121 if (nr_pages && to_free > total_reclaimed)
1122 continue; /* swsusp: need to do more work */
1123 if (all_zones_ok)
1124 break; /* kswapd: all done */
1125 /*
1126 * OK, kswapd is getting into trouble. Take a nap, then take
1127 * another pass across the zones.
1128 */
1129 if (total_scanned && priority < DEF_PRIORITY - 2)
1130 blk_congestion_wait(WRITE, HZ/10);
1131
1132 /*
1133 * We do this so kswapd doesn't build up large priorities for
1134 * example when it is freeing in parallel with allocators. It
1135 * matches the direct reclaim path behaviour in terms of impact
1136 * on zone->*_priority.
1137 */
1138 if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
1139 break;
1140 }
1141out:
1142 for (i = 0; i < pgdat->nr_zones; i++) {
1143 struct zone *zone = pgdat->node_zones + i;
1144
1145 zone->prev_priority = zone->temp_priority;
1146 }
1147 if (!all_zones_ok) {
1148 cond_resched();
1149 goto loop_again;
1150 }
1151
1152 return total_reclaimed;
1153}
1154
1155/*
1156 * The background pageout daemon, started as a kernel thread
1157 * from the init process.
1158 *
1159 * This basically trickles out pages so that we have _some_
1160 * free memory available even if there is no other activity
1161 * that frees anything up. This is needed for things like routing
1162 * etc, where we otherwise might have all activity going on in
1163 * asynchronous contexts that cannot page things out.
1164 *
1165 * If there are applications that are active memory-allocators
1166 * (most normal use), this basically shouldn't matter.
1167 */
1168static int kswapd(void *p)
1169{
1170 unsigned long order;
1171 pg_data_t *pgdat = (pg_data_t*)p;
1172 struct task_struct *tsk = current;
1173 DEFINE_WAIT(wait);
1174 struct reclaim_state reclaim_state = {
1175 .reclaimed_slab = 0,
1176 };
1177 cpumask_t cpumask;
1178
1179 daemonize("kswapd%d", pgdat->node_id);
1180 cpumask = node_to_cpumask(pgdat->node_id);
1181 if (!cpus_empty(cpumask))
1182 set_cpus_allowed(tsk, cpumask);
1183 current->reclaim_state = &reclaim_state;
1184
1185 /*
1186 * Tell the memory management that we're a "memory allocator",
1187 * and that if we need more memory we should get access to it
1188 * regardless (see "__alloc_pages()"). "kswapd" should
1189 * never get caught in the normal page freeing logic.
1190 *
1191 * (Kswapd normally doesn't need memory anyway, but sometimes
1192 * you need a small amount of memory in order to be able to
1193 * page out something else, and this flag essentially protects
1194 * us from recursively trying to free more memory as we're
1195 * trying to free the first piece of memory in the first place).
1196 */
1197 tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
1198
1199 order = 0;
1200 for ( ; ; ) {
1201 unsigned long new_order;
1202 if (current->flags & PF_FREEZE)
1203 refrigerator(PF_FREEZE);
1204
1205 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
1206 new_order = pgdat->kswapd_max_order;
1207 pgdat->kswapd_max_order = 0;
1208 if (order < new_order) {
1209 /*
1210 * Don't sleep if someone wants a larger 'order'
1211 * allocation
1212 */
1213 order = new_order;
1214 } else {
1215 schedule();
1216 order = pgdat->kswapd_max_order;
1217 }
1218 finish_wait(&pgdat->kswapd_wait, &wait);
1219
1220 balance_pgdat(pgdat, 0, order);
1221 }
1222 return 0;
1223}
1224
1225/*
1226 * A zone is low on free memory, so wake its kswapd task to service it.
1227 */
1228void wakeup_kswapd(struct zone *zone, int order)
1229{
1230 pg_data_t *pgdat;
1231
1232 if (zone->present_pages == 0)
1233 return;
1234
1235 pgdat = zone->zone_pgdat;
1236 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0))
1237 return;
1238 if (pgdat->kswapd_max_order < order)
1239 pgdat->kswapd_max_order = order;
1240 if (!cpuset_zone_allowed(zone))
1241 return;
1242 if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
1243 return;
1244 wake_up_interruptible(&zone->zone_pgdat->kswapd_wait);
1245}
1246
#ifdef CONFIG_PM
/*
 * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
 * pages.
 *
 * Nodes are balanced one after another, each being asked for whatever is
 * still outstanding, until the request has been met or the nodes run out.
 * current->reclaim_state points at a local for the duration of the call and
 * is set back to NULL before returning.
 */
int shrink_all_memory(int nr_pages)
{
	struct reclaim_state reclaim_state = {
		.reclaimed_slab = 0,
	};
	pg_data_t *pgdat;
	int remaining = nr_pages;
	int total_freed = 0;

	/* balance_pgdat() dereferences current->reclaim_state */
	current->reclaim_state = &reclaim_state;

	for_each_pgdat(pgdat) {
		int freed = balance_pgdat(pgdat, remaining, 0);

		total_freed += freed;
		remaining -= freed;
		if (remaining <= 0)
			break;
	}

	current->reclaim_state = NULL;
	return total_freed;
}
#endif
1274
#ifdef CONFIG_HOTPLUG_CPU
/*
 * It's optimal to keep kswapds on the same CPUs as their memory, but
 * not required for correctness.  So if the last cpu in a node goes
 * away, we get changed to run anywhere: as the first one comes back,
 * restore their cpu bindings.
 *
 * Only CPU_ONLINE is acted upon; NOTIFY_OK is returned for every action.
 * `nfb' and `hcpu' are not used.
 */
static int __devinit cpu_callback(struct notifier_block *nfb,
				  unsigned long action,
				  void *hcpu)
{
	pg_data_t *pgdat;

	if (action != CPU_ONLINE)
		return NOTIFY_OK;

	for_each_pgdat(pgdat) {
		cpumask_t mask = node_to_cpumask(pgdat->node_id);

		/* One of our CPUs online: restore mask */
		if (any_online_cpu(mask) != NR_CPUS)
			set_cpus_allowed(pgdat->kswapd, mask);
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
1298
1299static int __init kswapd_init(void)
1300{
1301 pg_data_t *pgdat;
1302 swap_setup();
1303 for_each_pgdat(pgdat)
1304 pgdat->kswapd
1305 = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
1306 total_memory = nr_free_pagecache_pages();
1307 hotcpu_notifier(cpu_callback, 0);
1308 return 0;
1309}
1310
1311module_init(kswapd_init)