diff options
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/Makefile | 20 | ||||
| -rw-r--r-- | mm/bootmem.c | 400 | ||||
| -rw-r--r-- | mm/fadvise.c | 111 | ||||
| -rw-r--r-- | mm/filemap.c | 2306 | ||||
| -rw-r--r-- | mm/fremap.c | 256 | ||||
| -rw-r--r-- | mm/highmem.c | 607 | ||||
| -rw-r--r-- | mm/hugetlb.c | 260 | ||||
| -rw-r--r-- | mm/internal.h | 13 | ||||
| -rw-r--r-- | mm/madvise.c | 242 | ||||
| -rw-r--r-- | mm/memory.c | 2165 | ||||
| -rw-r--r-- | mm/mempolicy.c | 1138 | ||||
| -rw-r--r-- | mm/mempool.c | 290 | ||||
| -rw-r--r-- | mm/mincore.c | 191 | ||||
| -rw-r--r-- | mm/mlock.c | 253 | ||||
| -rw-r--r-- | mm/mmap.c | 2082 | ||||
| -rw-r--r-- | mm/mprotect.c | 282 | ||||
| -rw-r--r-- | mm/mremap.c | 426 | ||||
| -rw-r--r-- | mm/msync.c | 236 | ||||
| -rw-r--r-- | mm/nommu.c | 1180 | ||||
| -rw-r--r-- | mm/oom_kill.c | 292 | ||||
| -rw-r--r-- | mm/page-writeback.c | 819 | ||||
| -rw-r--r-- | mm/page_alloc.c | 2220 | ||||
| -rw-r--r-- | mm/page_io.c | 160 | ||||
| -rw-r--r-- | mm/pdflush.c | 228 | ||||
| -rw-r--r-- | mm/prio_tree.c | 207 | ||||
| -rw-r--r-- | mm/readahead.c | 557 | ||||
| -rw-r--r-- | mm/rmap.c | 862 | ||||
| -rw-r--r-- | mm/shmem.c | 2326 | ||||
| -rw-r--r-- | mm/slab.c | 3060 | ||||
| -rw-r--r-- | mm/swap.c | 485 | ||||
| -rw-r--r-- | mm/swap_state.c | 382 | ||||
| -rw-r--r-- | mm/swapfile.c | 1672 | ||||
| -rw-r--r-- | mm/thrash.c | 102 | ||||
| -rw-r--r-- | mm/tiny-shmem.c | 122 | ||||
| -rw-r--r-- | mm/truncate.c | 336 | ||||
| -rw-r--r-- | mm/vmalloc.c | 588 | ||||
| -rw-r--r-- | mm/vmscan.c | 1311 |
37 files changed, 28187 insertions, 0 deletions
diff --git a/mm/Makefile b/mm/Makefile new file mode 100644 index 000000000000..097408064f6a --- /dev/null +++ b/mm/Makefile | |||
| @@ -0,0 +1,20 @@ | |||
#
# Makefile for the linux memory manager.
#

# mmu-y first names the no-MMU implementation.  If $(CONFIG_MMU) expands
# to "y", the second assignment targets the same variable (mmu-y) and
# replaces that value with the MMU object list.
mmu-y			:= nommu.o
mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
			   vmalloc.o

# Unconditional objects, followed by whatever mmu-y ended up holding.
obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
			   page_alloc.o page-writeback.o pdflush.o \
			   readahead.o slab.o swap.o truncate.o vmscan.o \
			   prio_tree.o $(mmu-y)

# Each list below is appended to obj-<value>, where <value> is the
# expansion of the named CONFIG_ option.
obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
obj-$(CONFIG_NUMA)	+= mempolicy.o
obj-$(CONFIG_SHMEM)	+= shmem.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
| 20 | |||
diff --git a/mm/bootmem.c b/mm/bootmem.c new file mode 100644 index 000000000000..260e703850d8 --- /dev/null +++ b/mm/bootmem.c | |||
| @@ -0,0 +1,400 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/bootmem.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1999 Ingo Molnar | ||
| 5 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 | ||
| 6 | * | ||
| 7 | * simple boot-time physical memory area allocator and | ||
| 8 | * free memory collector. It's used to deal with reserved | ||
| 9 | * system memory and memory holes as well. | ||
| 10 | */ | ||
| 11 | |||
| 12 | #include <linux/mm.h> | ||
| 13 | #include <linux/kernel_stat.h> | ||
| 14 | #include <linux/swap.h> | ||
| 15 | #include <linux/interrupt.h> | ||
| 16 | #include <linux/init.h> | ||
| 17 | #include <linux/bootmem.h> | ||
| 18 | #include <linux/mmzone.h> | ||
| 19 | #include <linux/module.h> | ||
| 20 | #include <asm/dma.h> | ||
| 21 | #include <asm/io.h> | ||
| 22 | #include "internal.h" | ||
| 23 | |||
| 24 | /* | ||
| 25 | * Access to this subsystem has to be serialized externally. (this is | ||
| 26 | * true for the boot process anyway) | ||
| 27 | */ | ||
unsigned long max_low_pfn;	/* init_bootmem() stores its `pages' argument here */
unsigned long min_low_pfn;	/* init_bootmem() stores its `start' argument here */
unsigned long max_pfn;		/* not assigned in the part of this file shown here */

EXPORT_SYMBOL(max_pfn);		/* This is exported so
				 * dma_get_required_mask(), which uses
				 * it, can be an inline function */
| 35 | |||
| 36 | /* return the number of _pages_ that will be allocated for the boot bitmap */ | ||
| 37 | unsigned long __init bootmem_bootmap_pages (unsigned long pages) | ||
| 38 | { | ||
| 39 | unsigned long mapsize; | ||
| 40 | |||
| 41 | mapsize = (pages+7)/8; | ||
| 42 | mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; | ||
| 43 | mapsize >>= PAGE_SHIFT; | ||
| 44 | |||
| 45 | return mapsize; | ||
| 46 | } | ||
| 47 | |||
| 48 | /* | ||
| 49 | * Called once to set up the allocator itself. | ||
| 50 | */ | ||
| 51 | static unsigned long __init init_bootmem_core (pg_data_t *pgdat, | ||
| 52 | unsigned long mapstart, unsigned long start, unsigned long end) | ||
| 53 | { | ||
| 54 | bootmem_data_t *bdata = pgdat->bdata; | ||
| 55 | unsigned long mapsize = ((end - start)+7)/8; | ||
| 56 | |||
| 57 | pgdat->pgdat_next = pgdat_list; | ||
| 58 | pgdat_list = pgdat; | ||
| 59 | |||
| 60 | mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); | ||
| 61 | bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); | ||
| 62 | bdata->node_boot_start = (start << PAGE_SHIFT); | ||
| 63 | bdata->node_low_pfn = end; | ||
| 64 | |||
| 65 | /* | ||
| 66 | * Initially all pages are reserved - setup_arch() has to | ||
| 67 | * register free RAM areas explicitly. | ||
| 68 | */ | ||
| 69 | memset(bdata->node_bootmem_map, 0xff, mapsize); | ||
| 70 | |||
| 71 | return mapsize; | ||
| 72 | } | ||
| 73 | |||
| 74 | /* | ||
| 75 | * Marks a particular physical memory range as unallocatable. Usable RAM | ||
| 76 | * might be used for boot-time allocations - or it might get added | ||
| 77 | * to the free page pool later on. | ||
| 78 | */ | ||
| 79 | static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) | ||
| 80 | { | ||
| 81 | unsigned long i; | ||
| 82 | /* | ||
| 83 | * round up, partially reserved pages are considered | ||
| 84 | * fully reserved. | ||
| 85 | */ | ||
| 86 | unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE; | ||
| 87 | unsigned long eidx = (addr + size - bdata->node_boot_start + | ||
| 88 | PAGE_SIZE-1)/PAGE_SIZE; | ||
| 89 | unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; | ||
| 90 | |||
| 91 | BUG_ON(!size); | ||
| 92 | BUG_ON(sidx >= eidx); | ||
| 93 | BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn); | ||
| 94 | BUG_ON(end > bdata->node_low_pfn); | ||
| 95 | |||
| 96 | for (i = sidx; i < eidx; i++) | ||
| 97 | if (test_and_set_bit(i, bdata->node_bootmem_map)) { | ||
| 98 | #ifdef CONFIG_DEBUG_BOOTMEM | ||
| 99 | printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); | ||
| 100 | #endif | ||
| 101 | } | ||
| 102 | } | ||
| 103 | |||
| 104 | static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) | ||
| 105 | { | ||
| 106 | unsigned long i; | ||
| 107 | unsigned long start; | ||
| 108 | /* | ||
| 109 | * round down end of usable mem, partially free pages are | ||
| 110 | * considered reserved. | ||
| 111 | */ | ||
| 112 | unsigned long sidx; | ||
| 113 | unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE; | ||
| 114 | unsigned long end = (addr + size)/PAGE_SIZE; | ||
| 115 | |||
| 116 | BUG_ON(!size); | ||
| 117 | BUG_ON(end > bdata->node_low_pfn); | ||
| 118 | |||
| 119 | if (addr < bdata->last_success) | ||
| 120 | bdata->last_success = addr; | ||
| 121 | |||
| 122 | /* | ||
| 123 | * Round up the beginning of the address. | ||
| 124 | */ | ||
| 125 | start = (addr + PAGE_SIZE-1) / PAGE_SIZE; | ||
| 126 | sidx = start - (bdata->node_boot_start/PAGE_SIZE); | ||
| 127 | |||
| 128 | for (i = sidx; i < eidx; i++) { | ||
| 129 | if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) | ||
| 130 | BUG(); | ||
| 131 | } | ||
| 132 | } | ||
| 133 | |||
| 134 | /* | ||
| 135 | * We 'merge' subsequent allocations to save space. We might 'lose' | ||
| 136 | * some fraction of a page if allocations cannot be satisfied due to | ||
| 137 | * size constraints on boxes where there is physical RAM space | ||
| 138 | * fragmentation - in these cases (mostly large memory boxes) this | ||
| 139 | * is not a problem. | ||
| 140 | * | ||
| 141 | * On low memory boxes we get it right in 100% of the cases. | ||
| 142 | * | ||
| 143 | * alignment has to be a power of 2 value. | ||
| 144 | * | ||
| 145 | * NOTE: This function is _not_ reentrant. | ||
| 146 | */ | ||
| 147 | static void * __init | ||
| 148 | __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | ||
| 149 | unsigned long align, unsigned long goal) | ||
| 150 | { | ||
| 151 | unsigned long offset, remaining_size, areasize, preferred; | ||
| 152 | unsigned long i, start = 0, incr, eidx; | ||
| 153 | void *ret; | ||
| 154 | |||
| 155 | if(!size) { | ||
| 156 | printk("__alloc_bootmem_core(): zero-sized request\n"); | ||
| 157 | BUG(); | ||
| 158 | } | ||
| 159 | BUG_ON(align & (align-1)); | ||
| 160 | |||
| 161 | eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); | ||
| 162 | offset = 0; | ||
| 163 | if (align && | ||
| 164 | (bdata->node_boot_start & (align - 1UL)) != 0) | ||
| 165 | offset = (align - (bdata->node_boot_start & (align - 1UL))); | ||
| 166 | offset >>= PAGE_SHIFT; | ||
| 167 | |||
| 168 | /* | ||
| 169 | * We try to allocate bootmem pages above 'goal' | ||
| 170 | * first, then we try to allocate lower pages. | ||
| 171 | */ | ||
| 172 | if (goal && (goal >= bdata->node_boot_start) && | ||
| 173 | ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) { | ||
| 174 | preferred = goal - bdata->node_boot_start; | ||
| 175 | |||
| 176 | if (bdata->last_success >= preferred) | ||
| 177 | preferred = bdata->last_success; | ||
| 178 | } else | ||
| 179 | preferred = 0; | ||
| 180 | |||
| 181 | preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; | ||
| 182 | preferred += offset; | ||
| 183 | areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; | ||
| 184 | incr = align >> PAGE_SHIFT ? : 1; | ||
| 185 | |||
| 186 | restart_scan: | ||
| 187 | for (i = preferred; i < eidx; i += incr) { | ||
| 188 | unsigned long j; | ||
| 189 | i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); | ||
| 190 | i = ALIGN(i, incr); | ||
| 191 | if (test_bit(i, bdata->node_bootmem_map)) | ||
| 192 | continue; | ||
| 193 | for (j = i + 1; j < i + areasize; ++j) { | ||
| 194 | if (j >= eidx) | ||
| 195 | goto fail_block; | ||
| 196 | if (test_bit (j, bdata->node_bootmem_map)) | ||
| 197 | goto fail_block; | ||
| 198 | } | ||
| 199 | start = i; | ||
| 200 | goto found; | ||
| 201 | fail_block: | ||
| 202 | i = ALIGN(j, incr); | ||
| 203 | } | ||
| 204 | |||
| 205 | if (preferred > offset) { | ||
| 206 | preferred = offset; | ||
| 207 | goto restart_scan; | ||
| 208 | } | ||
| 209 | return NULL; | ||
| 210 | |||
| 211 | found: | ||
| 212 | bdata->last_success = start << PAGE_SHIFT; | ||
| 213 | BUG_ON(start >= eidx); | ||
| 214 | |||
| 215 | /* | ||
| 216 | * Is the next page of the previous allocation-end the start | ||
| 217 | * of this allocation's buffer? If yes then we can 'merge' | ||
| 218 | * the previous partial page with this allocation. | ||
| 219 | */ | ||
| 220 | if (align < PAGE_SIZE && | ||
| 221 | bdata->last_offset && bdata->last_pos+1 == start) { | ||
| 222 | offset = (bdata->last_offset+align-1) & ~(align-1); | ||
| 223 | BUG_ON(offset > PAGE_SIZE); | ||
| 224 | remaining_size = PAGE_SIZE-offset; | ||
| 225 | if (size < remaining_size) { | ||
| 226 | areasize = 0; | ||
| 227 | /* last_pos unchanged */ | ||
| 228 | bdata->last_offset = offset+size; | ||
| 229 | ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + | ||
| 230 | bdata->node_boot_start); | ||
| 231 | } else { | ||
| 232 | remaining_size = size - remaining_size; | ||
| 233 | areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; | ||
| 234 | ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + | ||
| 235 | bdata->node_boot_start); | ||
| 236 | bdata->last_pos = start+areasize-1; | ||
| 237 | bdata->last_offset = remaining_size; | ||
| 238 | } | ||
| 239 | bdata->last_offset &= ~PAGE_MASK; | ||
| 240 | } else { | ||
| 241 | bdata->last_pos = start + areasize - 1; | ||
| 242 | bdata->last_offset = size & ~PAGE_MASK; | ||
| 243 | ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); | ||
| 244 | } | ||
| 245 | |||
| 246 | /* | ||
| 247 | * Reserve the area now: | ||
| 248 | */ | ||
| 249 | for (i = start; i < start+areasize; i++) | ||
| 250 | if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) | ||
| 251 | BUG(); | ||
| 252 | memset(ret, 0, size); | ||
| 253 | return ret; | ||
| 254 | } | ||
| 255 | |||
| 256 | static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | ||
| 257 | { | ||
| 258 | struct page *page; | ||
| 259 | bootmem_data_t *bdata = pgdat->bdata; | ||
| 260 | unsigned long i, count, total = 0; | ||
| 261 | unsigned long idx; | ||
| 262 | unsigned long *map; | ||
| 263 | int gofast = 0; | ||
| 264 | |||
| 265 | BUG_ON(!bdata->node_bootmem_map); | ||
| 266 | |||
| 267 | count = 0; | ||
| 268 | /* first extant page of the node */ | ||
| 269 | page = virt_to_page(phys_to_virt(bdata->node_boot_start)); | ||
| 270 | idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); | ||
| 271 | map = bdata->node_bootmem_map; | ||
| 272 | /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ | ||
| 273 | if (bdata->node_boot_start == 0 || | ||
| 274 | ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG)) | ||
| 275 | gofast = 1; | ||
| 276 | for (i = 0; i < idx; ) { | ||
| 277 | unsigned long v = ~map[i / BITS_PER_LONG]; | ||
| 278 | if (gofast && v == ~0UL) { | ||
| 279 | int j, order; | ||
| 280 | |||
| 281 | count += BITS_PER_LONG; | ||
| 282 | __ClearPageReserved(page); | ||
| 283 | order = ffs(BITS_PER_LONG) - 1; | ||
| 284 | set_page_refs(page, order); | ||
| 285 | for (j = 1; j < BITS_PER_LONG; j++) { | ||
| 286 | if (j + 16 < BITS_PER_LONG) | ||
| 287 | prefetchw(page + j + 16); | ||
| 288 | __ClearPageReserved(page + j); | ||
| 289 | } | ||
| 290 | __free_pages(page, order); | ||
| 291 | i += BITS_PER_LONG; | ||
| 292 | page += BITS_PER_LONG; | ||
| 293 | } else if (v) { | ||
| 294 | unsigned long m; | ||
| 295 | for (m = 1; m && i < idx; m<<=1, page++, i++) { | ||
| 296 | if (v & m) { | ||
| 297 | count++; | ||
| 298 | __ClearPageReserved(page); | ||
| 299 | set_page_refs(page, 0); | ||
| 300 | __free_page(page); | ||
| 301 | } | ||
| 302 | } | ||
| 303 | } else { | ||
| 304 | i+=BITS_PER_LONG; | ||
| 305 | page += BITS_PER_LONG; | ||
| 306 | } | ||
| 307 | } | ||
| 308 | total += count; | ||
| 309 | |||
| 310 | /* | ||
| 311 | * Now free the allocator bitmap itself, it's not | ||
| 312 | * needed anymore: | ||
| 313 | */ | ||
| 314 | page = virt_to_page(bdata->node_bootmem_map); | ||
| 315 | count = 0; | ||
| 316 | for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { | ||
| 317 | count++; | ||
| 318 | __ClearPageReserved(page); | ||
| 319 | set_page_count(page, 1); | ||
| 320 | __free_page(page); | ||
| 321 | } | ||
| 322 | total += count; | ||
| 323 | bdata->node_bootmem_map = NULL; | ||
| 324 | |||
| 325 | return total; | ||
| 326 | } | ||
| 327 | |||
| 328 | unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) | ||
| 329 | { | ||
| 330 | return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); | ||
| 331 | } | ||
| 332 | |||
| 333 | void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) | ||
| 334 | { | ||
| 335 | reserve_bootmem_core(pgdat->bdata, physaddr, size); | ||
| 336 | } | ||
| 337 | |||
| 338 | void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) | ||
| 339 | { | ||
| 340 | free_bootmem_core(pgdat->bdata, physaddr, size); | ||
| 341 | } | ||
| 342 | |||
| 343 | unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) | ||
| 344 | { | ||
| 345 | return(free_all_bootmem_core(pgdat)); | ||
| 346 | } | ||
| 347 | |||
| 348 | unsigned long __init init_bootmem (unsigned long start, unsigned long pages) | ||
| 349 | { | ||
| 350 | max_low_pfn = pages; | ||
| 351 | min_low_pfn = start; | ||
| 352 | return(init_bootmem_core(NODE_DATA(0), start, 0, pages)); | ||
| 353 | } | ||
| 354 | |||
| 355 | #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE | ||
| 356 | void __init reserve_bootmem (unsigned long addr, unsigned long size) | ||
| 357 | { | ||
| 358 | reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size); | ||
| 359 | } | ||
| 360 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ | ||
| 361 | |||
| 362 | void __init free_bootmem (unsigned long addr, unsigned long size) | ||
| 363 | { | ||
| 364 | free_bootmem_core(NODE_DATA(0)->bdata, addr, size); | ||
| 365 | } | ||
| 366 | |||
| 367 | unsigned long __init free_all_bootmem (void) | ||
| 368 | { | ||
| 369 | return(free_all_bootmem_core(NODE_DATA(0))); | ||
| 370 | } | ||
| 371 | |||
| 372 | void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal) | ||
| 373 | { | ||
| 374 | pg_data_t *pgdat = pgdat_list; | ||
| 375 | void *ptr; | ||
| 376 | |||
| 377 | for_each_pgdat(pgdat) | ||
| 378 | if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, | ||
| 379 | align, goal))) | ||
| 380 | return(ptr); | ||
| 381 | |||
| 382 | /* | ||
| 383 | * Whoops, we cannot satisfy the allocation request. | ||
| 384 | */ | ||
| 385 | printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); | ||
| 386 | panic("Out of memory"); | ||
| 387 | return NULL; | ||
| 388 | } | ||
| 389 | |||
| 390 | void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) | ||
| 391 | { | ||
| 392 | void *ptr; | ||
| 393 | |||
| 394 | ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); | ||
| 395 | if (ptr) | ||
| 396 | return (ptr); | ||
| 397 | |||
| 398 | return __alloc_bootmem(size, align, goal); | ||
| 399 | } | ||
| 400 | |||
diff --git a/mm/fadvise.c b/mm/fadvise.c new file mode 100644 index 000000000000..57264d74b8bf --- /dev/null +++ b/mm/fadvise.c | |||
| @@ -0,0 +1,111 @@ | |||
| 1 | /* | ||
| 2 | * mm/fadvise.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 2002, Linus Torvalds | ||
| 5 | * | ||
| 6 | * 11Jan2003 akpm@digeo.com | ||
| 7 | * Initial version. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/kernel.h> | ||
| 11 | #include <linux/file.h> | ||
| 12 | #include <linux/fs.h> | ||
| 13 | #include <linux/mm.h> | ||
| 14 | #include <linux/pagemap.h> | ||
| 15 | #include <linux/backing-dev.h> | ||
| 16 | #include <linux/pagevec.h> | ||
| 17 | #include <linux/fadvise.h> | ||
| 18 | #include <linux/syscalls.h> | ||
| 19 | |||
| 20 | #include <asm/unistd.h> | ||
| 21 | |||
| 22 | /* | ||
| 23 | * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could | ||
| 24 | * deactivate the pages and clear PG_Referenced. | ||
| 25 | */ | ||
| 26 | asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | ||
| 27 | { | ||
| 28 | struct file *file = fget(fd); | ||
| 29 | struct address_space *mapping; | ||
| 30 | struct backing_dev_info *bdi; | ||
| 31 | loff_t endbyte; | ||
| 32 | pgoff_t start_index; | ||
| 33 | pgoff_t end_index; | ||
| 34 | unsigned long nrpages; | ||
| 35 | int ret = 0; | ||
| 36 | |||
| 37 | if (!file) | ||
| 38 | return -EBADF; | ||
| 39 | |||
| 40 | mapping = file->f_mapping; | ||
| 41 | if (!mapping || len < 0) { | ||
| 42 | ret = -EINVAL; | ||
| 43 | goto out; | ||
| 44 | } | ||
| 45 | |||
| 46 | /* Careful about overflows. Len == 0 means "as much as possible" */ | ||
| 47 | endbyte = offset + len; | ||
| 48 | if (!len || endbyte < len) | ||
| 49 | endbyte = -1; | ||
| 50 | |||
| 51 | bdi = mapping->backing_dev_info; | ||
| 52 | |||
| 53 | switch (advice) { | ||
| 54 | case POSIX_FADV_NORMAL: | ||
| 55 | file->f_ra.ra_pages = bdi->ra_pages; | ||
| 56 | break; | ||
| 57 | case POSIX_FADV_RANDOM: | ||
| 58 | file->f_ra.ra_pages = 0; | ||
| 59 | break; | ||
| 60 | case POSIX_FADV_SEQUENTIAL: | ||
| 61 | file->f_ra.ra_pages = bdi->ra_pages * 2; | ||
| 62 | break; | ||
| 63 | case POSIX_FADV_WILLNEED: | ||
| 64 | case POSIX_FADV_NOREUSE: | ||
| 65 | if (!mapping->a_ops->readpage) { | ||
| 66 | ret = -EINVAL; | ||
| 67 | break; | ||
| 68 | } | ||
| 69 | |||
| 70 | /* First and last PARTIAL page! */ | ||
| 71 | start_index = offset >> PAGE_CACHE_SHIFT; | ||
| 72 | end_index = (endbyte-1) >> PAGE_CACHE_SHIFT; | ||
| 73 | |||
| 74 | /* Careful about overflow on the "+1" */ | ||
| 75 | nrpages = end_index - start_index + 1; | ||
| 76 | if (!nrpages) | ||
| 77 | nrpages = ~0UL; | ||
| 78 | |||
| 79 | ret = force_page_cache_readahead(mapping, file, | ||
| 80 | start_index, | ||
| 81 | max_sane_readahead(nrpages)); | ||
| 82 | if (ret > 0) | ||
| 83 | ret = 0; | ||
| 84 | break; | ||
| 85 | case POSIX_FADV_DONTNEED: | ||
| 86 | if (!bdi_write_congested(mapping->backing_dev_info)) | ||
| 87 | filemap_flush(mapping); | ||
| 88 | |||
| 89 | /* First and last FULL page! */ | ||
| 90 | start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; | ||
| 91 | end_index = (endbyte >> PAGE_CACHE_SHIFT); | ||
| 92 | |||
| 93 | if (end_index > start_index) | ||
| 94 | invalidate_mapping_pages(mapping, start_index, end_index-1); | ||
| 95 | break; | ||
| 96 | default: | ||
| 97 | ret = -EINVAL; | ||
| 98 | } | ||
| 99 | out: | ||
| 100 | fput(file); | ||
| 101 | return ret; | ||
| 102 | } | ||
| 103 | |||
| 104 | #ifdef __ARCH_WANT_SYS_FADVISE64 | ||
| 105 | |||
| 106 | asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice) | ||
| 107 | { | ||
| 108 | return sys_fadvise64_64(fd, offset, len, advice); | ||
| 109 | } | ||
| 110 | |||
| 111 | #endif | ||
diff --git a/mm/filemap.c b/mm/filemap.c new file mode 100644 index 000000000000..439b2bea8e34 --- /dev/null +++ b/mm/filemap.c | |||
| @@ -0,0 +1,2306 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/filemap.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1994-1999 Linus Torvalds | ||
| 5 | */ | ||
| 6 | |||
| 7 | /* | ||
| 8 | * This file handles the generic file mmap semantics used by | ||
| 9 | * most "normal" filesystems (but you don't /have/ to use this: | ||
| 10 | * the NFS filesystem used to do this differently, for example) | ||
| 11 | */ | ||
| 12 | #include <linux/config.h> | ||
| 13 | #include <linux/module.h> | ||
| 14 | #include <linux/slab.h> | ||
| 15 | #include <linux/compiler.h> | ||
| 16 | #include <linux/fs.h> | ||
| 17 | #include <linux/aio.h> | ||
| 18 | #include <linux/kernel_stat.h> | ||
| 19 | #include <linux/mm.h> | ||
| 20 | #include <linux/swap.h> | ||
| 21 | #include <linux/mman.h> | ||
| 22 | #include <linux/pagemap.h> | ||
| 23 | #include <linux/file.h> | ||
| 24 | #include <linux/uio.h> | ||
| 25 | #include <linux/hash.h> | ||
| 26 | #include <linux/writeback.h> | ||
| 27 | #include <linux/pagevec.h> | ||
| 28 | #include <linux/blkdev.h> | ||
| 29 | #include <linux/security.h> | ||
| 30 | #include <linux/syscalls.h> | ||
| 31 | /* | ||
| 32 | * This is needed for the following functions: | ||
| 33 | * - try_to_release_page | ||
| 34 | * - block_invalidatepage | ||
| 35 | * - generic_osync_inode | ||
| 36 | * | ||
| 37 | * FIXME: remove all knowledge of the buffer layer from the core VM | ||
| 38 | */ | ||
| 39 | #include <linux/buffer_head.h> /* for generic_osync_inode */ | ||
| 40 | |||
| 41 | #include <asm/uaccess.h> | ||
| 42 | #include <asm/mman.h> | ||
| 43 | |||
| 44 | /* | ||
| 45 | * Shared mappings implemented 30.11.1994. It's not fully working yet, | ||
| 46 | * though. | ||
| 47 | * | ||
| 48 | * Shared mappings now work. 15.8.1995 Bruno. | ||
| 49 | * | ||
| 50 | * finished 'unifying' the page and buffer cache and SMP-threaded the | ||
| 51 | * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> | ||
| 52 | * | ||
| 53 | * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> | ||
| 54 | */ | ||
| 55 | |||
| 56 | /* | ||
| 57 | * Lock ordering: | ||
| 58 | * | ||
| 59 | * ->i_mmap_lock (vmtruncate) | ||
| 60 | * ->private_lock (__free_pte->__set_page_dirty_buffers) | ||
| 61 | * ->swap_list_lock | ||
| 62 | * ->swap_device_lock (exclusive_swap_page, others) | ||
| 63 | * ->mapping->tree_lock | ||
| 64 | * | ||
| 65 | * ->i_sem | ||
| 66 | * ->i_mmap_lock (truncate->unmap_mapping_range) | ||
| 67 | * | ||
| 68 | * ->mmap_sem | ||
| 69 | * ->i_mmap_lock | ||
| 70 | * ->page_table_lock (various places, mainly in mmap.c) | ||
| 71 | * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) | ||
| 72 | * | ||
| 73 | * ->mmap_sem | ||
| 74 | * ->lock_page (access_process_vm) | ||
| 75 | * | ||
| 76 | * ->mmap_sem | ||
| 77 | * ->i_sem (msync) | ||
| 78 | * | ||
| 79 | * ->i_sem | ||
| 80 | * ->i_alloc_sem (various) | ||
| 81 | * | ||
| 82 | * ->inode_lock | ||
| 83 | * ->sb_lock (fs/fs-writeback.c) | ||
| 84 | * ->mapping->tree_lock (__sync_single_inode) | ||
| 85 | * | ||
| 86 | * ->i_mmap_lock | ||
| 87 | * ->anon_vma.lock (vma_adjust) | ||
| 88 | * | ||
| 89 | * ->anon_vma.lock | ||
| 90 | * ->page_table_lock (anon_vma_prepare and various) | ||
| 91 | * | ||
| 92 | * ->page_table_lock | ||
| 93 | * ->swap_device_lock (try_to_unmap_one) | ||
| 94 | * ->private_lock (try_to_unmap_one) | ||
| 95 | * ->tree_lock (try_to_unmap_one) | ||
| 96 | * ->zone.lru_lock (follow_page->mark_page_accessed) | ||
| 97 | * ->private_lock (page_remove_rmap->set_page_dirty) | ||
| 98 | * ->tree_lock (page_remove_rmap->set_page_dirty) | ||
| 99 | * ->inode_lock (page_remove_rmap->set_page_dirty) | ||
| 100 | * ->inode_lock (zap_pte_range->set_page_dirty) | ||
| 101 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | ||
| 102 | * | ||
| 103 | * ->task->proc_lock | ||
| 104 | * ->dcache_lock (proc_pid_lookup) | ||
| 105 | */ | ||
| 106 | |||
| 107 | /* | ||
| 108 | * Remove a page from the page cache and free it. Caller has to make | ||
| 109 | * sure the page is locked and that nobody else uses it - or that usage | ||
| 110 | * is safe. The caller must hold a write_lock on the mapping's tree_lock. | ||
| 111 | */ | ||
| 112 | void __remove_from_page_cache(struct page *page) | ||
| 113 | { | ||
| 114 | struct address_space *mapping = page->mapping; | ||
| 115 | |||
| 116 | radix_tree_delete(&mapping->page_tree, page->index); | ||
| 117 | page->mapping = NULL; | ||
| 118 | mapping->nrpages--; | ||
| 119 | pagecache_acct(-1); | ||
| 120 | } | ||
| 121 | |||
| 122 | void remove_from_page_cache(struct page *page) | ||
| 123 | { | ||
| 124 | struct address_space *mapping = page->mapping; | ||
| 125 | |||
| 126 | if (unlikely(!PageLocked(page))) | ||
| 127 | PAGE_BUG(page); | ||
| 128 | |||
| 129 | write_lock_irq(&mapping->tree_lock); | ||
| 130 | __remove_from_page_cache(page); | ||
| 131 | write_unlock_irq(&mapping->tree_lock); | ||
| 132 | } | ||
| 133 | |||
| 134 | static int sync_page(void *word) | ||
| 135 | { | ||
| 136 | struct address_space *mapping; | ||
| 137 | struct page *page; | ||
| 138 | |||
| 139 | page = container_of((page_flags_t *)word, struct page, flags); | ||
| 140 | |||
| 141 | /* | ||
| 142 | * FIXME, fercrissake. What is this barrier here for? | ||
| 143 | */ | ||
| 144 | smp_mb(); | ||
| 145 | mapping = page_mapping(page); | ||
| 146 | if (mapping && mapping->a_ops && mapping->a_ops->sync_page) | ||
| 147 | mapping->a_ops->sync_page(page); | ||
| 148 | io_schedule(); | ||
| 149 | return 0; | ||
| 150 | } | ||
| 151 | |||
| 152 | /** | ||
| 153 | * filemap_fdatawrite_range - start writeback against all of a mapping's | ||
| 154 | * dirty pages that lie within the byte offsets <start, end> | ||
| 155 | * @mapping: address space structure to write | ||
| 156 | * @start: offset in bytes where the range starts | ||
| 157 | * @end : offset in bytes where the range ends | ||
| 158 | * | ||
| 159 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as | ||
| 160 | * opposed to a regular memory * cleansing writeback. The difference between | ||
| 161 | * these two operations is that if a dirty page/buffer is encountered, it must | ||
| 162 | * be waited upon, and not just skipped over. | ||
| 163 | */ | ||
| 164 | static int __filemap_fdatawrite_range(struct address_space *mapping, | ||
| 165 | loff_t start, loff_t end, int sync_mode) | ||
| 166 | { | ||
| 167 | int ret; | ||
| 168 | struct writeback_control wbc = { | ||
| 169 | .sync_mode = sync_mode, | ||
| 170 | .nr_to_write = mapping->nrpages * 2, | ||
| 171 | .start = start, | ||
| 172 | .end = end, | ||
| 173 | }; | ||
| 174 | |||
| 175 | if (!mapping_cap_writeback_dirty(mapping)) | ||
| 176 | return 0; | ||
| 177 | |||
| 178 | ret = do_writepages(mapping, &wbc); | ||
| 179 | return ret; | ||
| 180 | } | ||
| 181 | |||
| 182 | static inline int __filemap_fdatawrite(struct address_space *mapping, | ||
| 183 | int sync_mode) | ||
| 184 | { | ||
| 185 | return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode); | ||
| 186 | } | ||
| 187 | |||
| 188 | int filemap_fdatawrite(struct address_space *mapping) | ||
| 189 | { | ||
| 190 | return __filemap_fdatawrite(mapping, WB_SYNC_ALL); | ||
| 191 | } | ||
| 192 | EXPORT_SYMBOL(filemap_fdatawrite); | ||
| 193 | |||
| 194 | static int filemap_fdatawrite_range(struct address_space *mapping, | ||
| 195 | loff_t start, loff_t end) | ||
| 196 | { | ||
| 197 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); | ||
| 198 | } | ||
| 199 | |||
| 200 | /* | ||
| 201 | * This is a mostly non-blocking flush. Not suitable for data-integrity | ||
| 202 | * purposes - I/O may not be started against all dirty pages. | ||
| 203 | */ | ||
| 204 | int filemap_flush(struct address_space *mapping) | ||
| 205 | { | ||
| 206 | return __filemap_fdatawrite(mapping, WB_SYNC_NONE); | ||
| 207 | } | ||
| 208 | EXPORT_SYMBOL(filemap_flush); | ||
| 209 | |||
| 210 | /* | ||
| 211 | * Wait for writeback to complete against pages indexed by start->end | ||
| 212 | * inclusive | ||
| 213 | */ | ||
| 214 | static int wait_on_page_writeback_range(struct address_space *mapping, | ||
| 215 | pgoff_t start, pgoff_t end) | ||
| 216 | { | ||
| 217 | struct pagevec pvec; | ||
| 218 | int nr_pages; | ||
| 219 | int ret = 0; | ||
| 220 | pgoff_t index; | ||
| 221 | |||
| 222 | if (end < start) | ||
| 223 | return 0; | ||
| 224 | |||
| 225 | pagevec_init(&pvec, 0); | ||
| 226 | index = start; | ||
| 227 | while ((index <= end) && | ||
| 228 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | ||
| 229 | PAGECACHE_TAG_WRITEBACK, | ||
| 230 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { | ||
| 231 | unsigned i; | ||
| 232 | |||
| 233 | for (i = 0; i < nr_pages; i++) { | ||
| 234 | struct page *page = pvec.pages[i]; | ||
| 235 | |||
| 236 | /* until radix tree lookup accepts end_index */ | ||
| 237 | if (page->index > end) | ||
| 238 | continue; | ||
| 239 | |||
| 240 | wait_on_page_writeback(page); | ||
| 241 | if (PageError(page)) | ||
| 242 | ret = -EIO; | ||
| 243 | } | ||
| 244 | pagevec_release(&pvec); | ||
| 245 | cond_resched(); | ||
| 246 | } | ||
| 247 | |||
| 248 | /* Check for outstanding write errors */ | ||
| 249 | if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) | ||
| 250 | ret = -ENOSPC; | ||
| 251 | if (test_and_clear_bit(AS_EIO, &mapping->flags)) | ||
| 252 | ret = -EIO; | ||
| 253 | |||
| 254 | return ret; | ||
| 255 | } | ||
| 256 | |||
| 257 | /* | ||
| 258 | * Write and wait upon all the pages in the passed range. This is a "data | ||
| 259 | * integrity" operation. It waits upon in-flight writeout before starting and | ||
| 260 | * waiting upon new writeout. If there was an IO error, return it. | ||
| 261 | * | ||
| 262 | * We need to re-take i_sem during the generic_osync_inode list walk because | ||
| 263 | * it is otherwise livelockable. | ||
| 264 | */ | ||
| 265 | int sync_page_range(struct inode *inode, struct address_space *mapping, | ||
| 266 | loff_t pos, size_t count) | ||
| 267 | { | ||
| 268 | pgoff_t start = pos >> PAGE_CACHE_SHIFT; | ||
| 269 | pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; | ||
| 270 | int ret; | ||
| 271 | |||
| 272 | if (!mapping_cap_writeback_dirty(mapping) || !count) | ||
| 273 | return 0; | ||
| 274 | ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); | ||
| 275 | if (ret == 0) { | ||
| 276 | down(&inode->i_sem); | ||
| 277 | ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); | ||
| 278 | up(&inode->i_sem); | ||
| 279 | } | ||
| 280 | if (ret == 0) | ||
| 281 | ret = wait_on_page_writeback_range(mapping, start, end); | ||
| 282 | return ret; | ||
| 283 | } | ||
| 284 | EXPORT_SYMBOL(sync_page_range); | ||
| 285 | |||
| 286 | /* | ||
| 287 | * Note: Holding i_sem across sync_page_range_nolock is not a good idea | ||
| 288 | * as it forces O_SYNC writers to different parts of the same file | ||
| 289 | * to be serialised right until io completion. | ||
| 290 | */ | ||
| 291 | int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, | ||
| 292 | loff_t pos, size_t count) | ||
| 293 | { | ||
| 294 | pgoff_t start = pos >> PAGE_CACHE_SHIFT; | ||
| 295 | pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; | ||
| 296 | int ret; | ||
| 297 | |||
| 298 | if (!mapping_cap_writeback_dirty(mapping) || !count) | ||
| 299 | return 0; | ||
| 300 | ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); | ||
| 301 | if (ret == 0) | ||
| 302 | ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); | ||
| 303 | if (ret == 0) | ||
| 304 | ret = wait_on_page_writeback_range(mapping, start, end); | ||
| 305 | return ret; | ||
| 306 | } | ||
| 307 | EXPORT_SYMBOL(sync_page_range_nolock); | ||
| 308 | |||
| 309 | /** | ||
| 310 | * filemap_fdatawait - walk the list of under-writeback pages of the given | ||
| 311 | * address space and wait for all of them. | ||
| 312 | * | ||
| 313 | * @mapping: address space structure to wait for | ||
| 314 | */ | ||
| 315 | int filemap_fdatawait(struct address_space *mapping) | ||
| 316 | { | ||
| 317 | loff_t i_size = i_size_read(mapping->host); | ||
| 318 | |||
| 319 | if (i_size == 0) | ||
| 320 | return 0; | ||
| 321 | |||
| 322 | return wait_on_page_writeback_range(mapping, 0, | ||
| 323 | (i_size - 1) >> PAGE_CACHE_SHIFT); | ||
| 324 | } | ||
| 325 | EXPORT_SYMBOL(filemap_fdatawait); | ||
| 326 | |||
| 327 | int filemap_write_and_wait(struct address_space *mapping) | ||
| 328 | { | ||
| 329 | int retval = 0; | ||
| 330 | |||
| 331 | if (mapping->nrpages) { | ||
| 332 | retval = filemap_fdatawrite(mapping); | ||
| 333 | if (retval == 0) | ||
| 334 | retval = filemap_fdatawait(mapping); | ||
| 335 | } | ||
| 336 | return retval; | ||
| 337 | } | ||
| 338 | |||
| 339 | int filemap_write_and_wait_range(struct address_space *mapping, | ||
| 340 | loff_t lstart, loff_t lend) | ||
| 341 | { | ||
| 342 | int retval = 0; | ||
| 343 | |||
| 344 | if (mapping->nrpages) { | ||
| 345 | retval = __filemap_fdatawrite_range(mapping, lstart, lend, | ||
| 346 | WB_SYNC_ALL); | ||
| 347 | if (retval == 0) | ||
| 348 | retval = wait_on_page_writeback_range(mapping, | ||
| 349 | lstart >> PAGE_CACHE_SHIFT, | ||
| 350 | lend >> PAGE_CACHE_SHIFT); | ||
| 351 | } | ||
| 352 | return retval; | ||
| 353 | } | ||
| 354 | |||
| 355 | /* | ||
| 356 | * This function is used to add newly allocated pagecache pages: | ||
| 357 | * the page is new, so we can just run SetPageLocked() against it. | ||
| 358 | * The other page state flags were set by rmqueue(). | ||
| 359 | * | ||
| 360 | * This function does not add the page to the LRU. The caller must do that. | ||
| 361 | */ | ||
| 362 | int add_to_page_cache(struct page *page, struct address_space *mapping, | ||
| 363 | pgoff_t offset, int gfp_mask) | ||
| 364 | { | ||
| 365 | int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); | ||
| 366 | |||
| 367 | if (error == 0) { | ||
| 368 | write_lock_irq(&mapping->tree_lock); | ||
| 369 | error = radix_tree_insert(&mapping->page_tree, offset, page); | ||
| 370 | if (!error) { | ||
| 371 | page_cache_get(page); | ||
| 372 | SetPageLocked(page); | ||
| 373 | page->mapping = mapping; | ||
| 374 | page->index = offset; | ||
| 375 | mapping->nrpages++; | ||
| 376 | pagecache_acct(1); | ||
| 377 | } | ||
| 378 | write_unlock_irq(&mapping->tree_lock); | ||
| 379 | radix_tree_preload_end(); | ||
| 380 | } | ||
| 381 | return error; | ||
| 382 | } | ||
| 383 | |||
| 384 | EXPORT_SYMBOL(add_to_page_cache); | ||
| 385 | |||
| 386 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | ||
| 387 | pgoff_t offset, int gfp_mask) | ||
| 388 | { | ||
| 389 | int ret = add_to_page_cache(page, mapping, offset, gfp_mask); | ||
| 390 | if (ret == 0) | ||
| 391 | lru_cache_add(page); | ||
| 392 | return ret; | ||
| 393 | } | ||
| 394 | |||
| 395 | /* | ||
| 396 | * In order to wait for pages to become available there must be | ||
| 397 | * waitqueues associated with pages. By using a hash table of | ||
| 398 | * waitqueues where the bucket discipline is to maintain all | ||
| 399 | * waiters on the same queue and wake all when any of the pages | ||
| 400 | * become available, and for the woken contexts to check to be | ||
| 401 | * sure the appropriate page became available, this saves space | ||
| 402 | * at a cost of "thundering herd" phenomena during rare hash | ||
| 403 | * collisions. | ||
| 404 | */ | ||
| 405 | static wait_queue_head_t *page_waitqueue(struct page *page) | ||
| 406 | { | ||
| 407 | const struct zone *zone = page_zone(page); | ||
| 408 | |||
| 409 | return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; | ||
| 410 | } | ||
| 411 | |||
| 412 | static inline void wake_up_page(struct page *page, int bit) | ||
| 413 | { | ||
| 414 | __wake_up_bit(page_waitqueue(page), &page->flags, bit); | ||
| 415 | } | ||
| 416 | |||
| 417 | void fastcall wait_on_page_bit(struct page *page, int bit_nr) | ||
| 418 | { | ||
| 419 | DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); | ||
| 420 | |||
| 421 | if (test_bit(bit_nr, &page->flags)) | ||
| 422 | __wait_on_bit(page_waitqueue(page), &wait, sync_page, | ||
| 423 | TASK_UNINTERRUPTIBLE); | ||
| 424 | } | ||
| 425 | EXPORT_SYMBOL(wait_on_page_bit); | ||
| 426 | |||
| 427 | /** | ||
| 428 | * unlock_page() - unlock a locked page | ||
| 429 | * | ||
| 430 | * @page: the page | ||
| 431 | * | ||
| 432 | * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). | ||
| 433 | * Also wakes sleepers in wait_on_page_writeback() because the wakeup | ||
| 434 | * mechananism between PageLocked pages and PageWriteback pages is shared. | ||
| 435 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. | ||
| 436 | * | ||
| 437 | * The first mb is necessary to safely close the critical section opened by the | ||
| 438 | * TestSetPageLocked(), the second mb is necessary to enforce ordering between | ||
| 439 | * the clear_bit and the read of the waitqueue (to avoid SMP races with a | ||
| 440 | * parallel wait_on_page_locked()). | ||
| 441 | */ | ||
| 442 | void fastcall unlock_page(struct page *page) | ||
| 443 | { | ||
| 444 | smp_mb__before_clear_bit(); | ||
| 445 | if (!TestClearPageLocked(page)) | ||
| 446 | BUG(); | ||
| 447 | smp_mb__after_clear_bit(); | ||
| 448 | wake_up_page(page, PG_locked); | ||
| 449 | } | ||
| 450 | EXPORT_SYMBOL(unlock_page); | ||
| 451 | |||
| 452 | /* | ||
| 453 | * End writeback against a page. | ||
| 454 | */ | ||
| 455 | void end_page_writeback(struct page *page) | ||
| 456 | { | ||
| 457 | if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { | ||
| 458 | if (!test_clear_page_writeback(page)) | ||
| 459 | BUG(); | ||
| 460 | } | ||
| 461 | smp_mb__after_clear_bit(); | ||
| 462 | wake_up_page(page, PG_writeback); | ||
| 463 | } | ||
| 464 | EXPORT_SYMBOL(end_page_writeback); | ||
| 465 | |||
| 466 | /* | ||
| 467 | * Get a lock on the page, assuming we need to sleep to get it. | ||
| 468 | * | ||
| 469 | * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some | ||
| 470 | * random driver's requestfn sets TASK_RUNNING, we could busywait. However | ||
| 471 | * chances are that on the second loop, the block layer's plug list is empty, | ||
| 472 | * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. | ||
| 473 | */ | ||
| 474 | void fastcall __lock_page(struct page *page) | ||
| 475 | { | ||
| 476 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); | ||
| 477 | |||
| 478 | __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, | ||
| 479 | TASK_UNINTERRUPTIBLE); | ||
| 480 | } | ||
| 481 | EXPORT_SYMBOL(__lock_page); | ||
| 482 | |||
| 483 | /* | ||
| 484 | * a rather lightweight function, finding and getting a reference to a | ||
| 485 | * hashed page atomically. | ||
| 486 | */ | ||
| 487 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) | ||
| 488 | { | ||
| 489 | struct page *page; | ||
| 490 | |||
| 491 | read_lock_irq(&mapping->tree_lock); | ||
| 492 | page = radix_tree_lookup(&mapping->page_tree, offset); | ||
| 493 | if (page) | ||
| 494 | page_cache_get(page); | ||
| 495 | read_unlock_irq(&mapping->tree_lock); | ||
| 496 | return page; | ||
| 497 | } | ||
| 498 | |||
| 499 | EXPORT_SYMBOL(find_get_page); | ||
| 500 | |||
| 501 | /* | ||
| 502 | * Same as above, but trylock it instead of incrementing the count. | ||
| 503 | */ | ||
| 504 | struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) | ||
| 505 | { | ||
| 506 | struct page *page; | ||
| 507 | |||
| 508 | read_lock_irq(&mapping->tree_lock); | ||
| 509 | page = radix_tree_lookup(&mapping->page_tree, offset); | ||
| 510 | if (page && TestSetPageLocked(page)) | ||
| 511 | page = NULL; | ||
| 512 | read_unlock_irq(&mapping->tree_lock); | ||
| 513 | return page; | ||
| 514 | } | ||
| 515 | |||
| 516 | EXPORT_SYMBOL(find_trylock_page); | ||
| 517 | |||
| 518 | /** | ||
| 519 | * find_lock_page - locate, pin and lock a pagecache page | ||
| 520 | * | ||
| 521 | * @mapping - the address_space to search | ||
| 522 | * @offset - the page index | ||
| 523 | * | ||
| 524 | * Locates the desired pagecache page, locks it, increments its reference | ||
| 525 | * count and returns its address. | ||
| 526 | * | ||
| 527 | * Returns zero if the page was not present. find_lock_page() may sleep. | ||
| 528 | */ | ||
| 529 | struct page *find_lock_page(struct address_space *mapping, | ||
| 530 | unsigned long offset) | ||
| 531 | { | ||
| 532 | struct page *page; | ||
| 533 | |||
| 534 | read_lock_irq(&mapping->tree_lock); | ||
| 535 | repeat: | ||
| 536 | page = radix_tree_lookup(&mapping->page_tree, offset); | ||
| 537 | if (page) { | ||
| 538 | page_cache_get(page); | ||
| 539 | if (TestSetPageLocked(page)) { | ||
| 540 | read_unlock_irq(&mapping->tree_lock); | ||
| 541 | lock_page(page); | ||
| 542 | read_lock_irq(&mapping->tree_lock); | ||
| 543 | |||
| 544 | /* Has the page been truncated while we slept? */ | ||
| 545 | if (page->mapping != mapping || page->index != offset) { | ||
| 546 | unlock_page(page); | ||
| 547 | page_cache_release(page); | ||
| 548 | goto repeat; | ||
| 549 | } | ||
| 550 | } | ||
| 551 | } | ||
| 552 | read_unlock_irq(&mapping->tree_lock); | ||
| 553 | return page; | ||
| 554 | } | ||
| 555 | |||
| 556 | EXPORT_SYMBOL(find_lock_page); | ||
| 557 | |||
| 558 | /** | ||
| 559 | * find_or_create_page - locate or add a pagecache page | ||
| 560 | * | ||
| 561 | * @mapping - the page's address_space | ||
| 562 | * @index - the page's index into the mapping | ||
| 563 | * @gfp_mask - page allocation mode | ||
| 564 | * | ||
| 565 | * Locates a page in the pagecache. If the page is not present, a new page | ||
| 566 | * is allocated using @gfp_mask and is added to the pagecache and to the VM's | ||
| 567 | * LRU list. The returned page is locked and has its reference count | ||
| 568 | * incremented. | ||
| 569 | * | ||
| 570 | * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic | ||
| 571 | * allocation! | ||
| 572 | * | ||
| 573 | * find_or_create_page() returns the desired page's address, or zero on | ||
| 574 | * memory exhaustion. | ||
| 575 | */ | ||
| 576 | struct page *find_or_create_page(struct address_space *mapping, | ||
| 577 | unsigned long index, unsigned int gfp_mask) | ||
| 578 | { | ||
| 579 | struct page *page, *cached_page = NULL; | ||
| 580 | int err; | ||
| 581 | repeat: | ||
| 582 | page = find_lock_page(mapping, index); | ||
| 583 | if (!page) { | ||
| 584 | if (!cached_page) { | ||
| 585 | cached_page = alloc_page(gfp_mask); | ||
| 586 | if (!cached_page) | ||
| 587 | return NULL; | ||
| 588 | } | ||
| 589 | err = add_to_page_cache_lru(cached_page, mapping, | ||
| 590 | index, gfp_mask); | ||
| 591 | if (!err) { | ||
| 592 | page = cached_page; | ||
| 593 | cached_page = NULL; | ||
| 594 | } else if (err == -EEXIST) | ||
| 595 | goto repeat; | ||
| 596 | } | ||
| 597 | if (cached_page) | ||
| 598 | page_cache_release(cached_page); | ||
| 599 | return page; | ||
| 600 | } | ||
| 601 | |||
| 602 | EXPORT_SYMBOL(find_or_create_page); | ||
| 603 | |||
| 604 | /** | ||
| 605 | * find_get_pages - gang pagecache lookup | ||
| 606 | * @mapping: The address_space to search | ||
| 607 | * @start: The starting page index | ||
| 608 | * @nr_pages: The maximum number of pages | ||
| 609 | * @pages: Where the resulting pages are placed | ||
| 610 | * | ||
| 611 | * find_get_pages() will search for and return a group of up to | ||
| 612 | * @nr_pages pages in the mapping. The pages are placed at @pages. | ||
| 613 | * find_get_pages() takes a reference against the returned pages. | ||
| 614 | * | ||
| 615 | * The search returns a group of mapping-contiguous pages with ascending | ||
| 616 | * indexes. There may be holes in the indices due to not-present pages. | ||
| 617 | * | ||
| 618 | * find_get_pages() returns the number of pages which were found. | ||
| 619 | */ | ||
| 620 | unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | ||
| 621 | unsigned int nr_pages, struct page **pages) | ||
| 622 | { | ||
| 623 | unsigned int i; | ||
| 624 | unsigned int ret; | ||
| 625 | |||
| 626 | read_lock_irq(&mapping->tree_lock); | ||
| 627 | ret = radix_tree_gang_lookup(&mapping->page_tree, | ||
| 628 | (void **)pages, start, nr_pages); | ||
| 629 | for (i = 0; i < ret; i++) | ||
| 630 | page_cache_get(pages[i]); | ||
| 631 | read_unlock_irq(&mapping->tree_lock); | ||
| 632 | return ret; | ||
| 633 | } | ||
| 634 | |||
| 635 | /* | ||
| 636 | * Like find_get_pages, except we only return pages which are tagged with | ||
| 637 | * `tag'. We update *index to index the next page for the traversal. | ||
| 638 | */ | ||
| 639 | unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | ||
| 640 | int tag, unsigned int nr_pages, struct page **pages) | ||
| 641 | { | ||
| 642 | unsigned int i; | ||
| 643 | unsigned int ret; | ||
| 644 | |||
| 645 | read_lock_irq(&mapping->tree_lock); | ||
| 646 | ret = radix_tree_gang_lookup_tag(&mapping->page_tree, | ||
| 647 | (void **)pages, *index, nr_pages, tag); | ||
| 648 | for (i = 0; i < ret; i++) | ||
| 649 | page_cache_get(pages[i]); | ||
| 650 | if (ret) | ||
| 651 | *index = pages[ret - 1]->index + 1; | ||
| 652 | read_unlock_irq(&mapping->tree_lock); | ||
| 653 | return ret; | ||
| 654 | } | ||
| 655 | |||
| 656 | /* | ||
| 657 | * Same as grab_cache_page, but do not wait if the page is unavailable. | ||
| 658 | * This is intended for speculative data generators, where the data can | ||
| 659 | * be regenerated if the page couldn't be grabbed. This routine should | ||
| 660 | * be safe to call while holding the lock for another page. | ||
| 661 | * | ||
| 662 | * Clear __GFP_FS when allocating the page to avoid recursion into the fs | ||
| 663 | * and deadlock against the caller's locked page. | ||
| 664 | */ | ||
| 665 | struct page * | ||
| 666 | grab_cache_page_nowait(struct address_space *mapping, unsigned long index) | ||
| 667 | { | ||
| 668 | struct page *page = find_get_page(mapping, index); | ||
| 669 | unsigned int gfp_mask; | ||
| 670 | |||
| 671 | if (page) { | ||
| 672 | if (!TestSetPageLocked(page)) | ||
| 673 | return page; | ||
| 674 | page_cache_release(page); | ||
| 675 | return NULL; | ||
| 676 | } | ||
| 677 | gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS; | ||
| 678 | page = alloc_pages(gfp_mask, 0); | ||
| 679 | if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) { | ||
| 680 | page_cache_release(page); | ||
| 681 | page = NULL; | ||
| 682 | } | ||
| 683 | return page; | ||
| 684 | } | ||
| 685 | |||
| 686 | EXPORT_SYMBOL(grab_cache_page_nowait); | ||
| 687 | |||
| 688 | /* | ||
| 689 | * This is a generic file read routine, and uses the | ||
| 690 | * mapping->a_ops->readpage() function for the actual low-level | ||
| 691 | * stuff. | ||
| 692 | * | ||
| 693 | * This is really ugly. But the goto's actually try to clarify some | ||
| 694 | * of the logic when it comes to error handling etc. | ||
| 695 | * | ||
| 696 | * Note the struct file* is only passed for the use of readpage. It may be | ||
| 697 | * NULL. | ||
| | * | ||
| | * @mapping: address space to read from; mapping->host supplies i_size. | ||
| | * @_ra: readahead state; copied into a local on entry and stored back on exit. | ||
| | * @filp: handed to readpage() and page_cache_readahead(); file_accessed() is | ||
| | * called on it at the end when it is non-NULL. | ||
| | * @ppos: in/out file position; on return it points at where the read stopped. | ||
| | * @desc: read descriptor; ->count bounds the request, ->error receives failures. | ||
| | * @actor: called as actor(desc, page, offset, nr) for each up-to-date page and | ||
| | * returns how many bytes it consumed. | ||
| | * | ||
| | * The loop handles one page-cache page per pass: find the page (allocating | ||
| | * and inserting one if it is missing), get it up to date via readpage() if | ||
| | * needed, then hand it to @actor. It stops at end of file, when @actor | ||
| | * consumes less than offered, when desc->count reaches zero, or on error. | ||
| 698 | */ | ||
| 699 | void do_generic_mapping_read(struct address_space *mapping, | ||
| 700 | struct file_ra_state *_ra, | ||
| 701 | struct file *filp, | ||
| 702 | loff_t *ppos, | ||
| 703 | read_descriptor_t *desc, | ||
| 704 | read_actor_t actor) | ||
| 705 | { | ||
| 706 | struct inode *inode = mapping->host; | ||
| 707 | unsigned long index; | ||
| 708 | unsigned long end_index; | ||
| 709 | unsigned long offset; | ||
| 710 | unsigned long last_index; | ||
| 711 | unsigned long next_index; | ||
| 712 | unsigned long prev_index; | ||
| 713 | loff_t isize; | ||
| 714 | struct page *cached_page; | ||
| 715 | int error; | ||
| 716 | struct file_ra_state ra = *_ra; /* private copy; written back at out: */ | ||
| 717 | |||
| 718 | cached_page = NULL; | ||
| 719 | index = *ppos >> PAGE_CACHE_SHIFT; | ||
| 720 | next_index = index; | ||
| 721 | prev_index = ra.prev_page; | ||
| 722 | last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; /* only used to size the readahead request */ | ||
| 723 | offset = *ppos & ~PAGE_CACHE_MASK; /* byte offset inside the current page */ | ||
| 724 | |||
| 725 | isize = i_size_read(inode); | ||
| 726 | if (!isize) | ||
| 727 | goto out; | ||
| 728 | |||
| 729 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; /* index of the page holding the last byte */ | ||
| 730 | for (;;) { | ||
| 731 | struct page *page; | ||
| 732 | unsigned long nr, ret; | ||
| 733 | |||
| 734 | /* nr is the maximum number of bytes to copy from this page */ | ||
| 735 | nr = PAGE_CACHE_SIZE; | ||
| 736 | if (index >= end_index) { | ||
| 737 | if (index > end_index) | ||
| 738 | goto out; | ||
| 739 | nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; | ||
| 740 | if (nr <= offset) { | ||
| 741 | goto out; | ||
| 742 | } | ||
| 743 | } | ||
| 744 | nr = nr - offset; | ||
| 745 | |||
| 746 | cond_resched(); | ||
| 747 | if (index == next_index) /* next_index is what page_cache_readahead() last returned */ | ||
| 748 | next_index = page_cache_readahead(mapping, &ra, filp, | ||
| 749 | index, last_index - index); | ||
| 750 | |||
| 751 | find_page: | ||
| 752 | page = find_get_page(mapping, index); | ||
| 753 | if (unlikely(page == NULL)) { | ||
| 754 | handle_ra_miss(mapping, &ra, index); | ||
| 755 | goto no_cached_page; | ||
| 756 | } | ||
| 757 | if (!PageUptodate(page)) | ||
| 758 | goto page_not_up_to_date; | ||
| 759 | page_ok: | ||
| 760 | |||
| 761 | /* If users can be writing to this page using arbitrary | ||
| 762 | * virtual addresses, take care about potential aliasing | ||
| 763 | * before reading the page on the kernel side. | ||
| 764 | */ | ||
| 765 | if (mapping_writably_mapped(mapping)) | ||
| 766 | flush_dcache_page(page); | ||
| 767 | |||
| 768 | /* | ||
| 769 | * When (part of) the same page is read multiple times | ||
| 770 | * in succession, only mark it as accessed the first time. | ||
| 771 | */ | ||
| 772 | if (prev_index != index) | ||
| 773 | mark_page_accessed(page); | ||
| 774 | prev_index = index; | ||
| 775 | |||
| 776 | /* | ||
| 777 | * Ok, we have the page, and it's up-to-date, so | ||
| 778 | * now we can copy it to user space... | ||
| 779 | * | ||
| 780 | * The actor routine returns how many bytes were actually used.. | ||
| 781 | * NOTE! This may not be the same as how much of a user buffer | ||
| 782 | * we filled up (we may be padding etc), so we can only update | ||
| 783 | * "pos" here (the actor routine has to update the user buffer | ||
| 784 | * pointers and the remaining count). | ||
| 785 | */ | ||
| 786 | ret = actor(desc, page, offset, nr); | ||
| 787 | offset += ret; | ||
| 788 | index += offset >> PAGE_CACHE_SHIFT; /* step to the next page once offset reaches PAGE_CACHE_SIZE */ | ||
| 789 | offset &= ~PAGE_CACHE_MASK; | ||
| 790 | |||
| 791 | page_cache_release(page); | ||
| 792 | if (ret == nr && desc->count) /* everything offered was consumed and more is wanted */ | ||
| 793 | continue; | ||
| 794 | goto out; | ||
| 795 | |||
| 796 | page_not_up_to_date: | ||
| 797 | /* Get exclusive access to the page ... */ | ||
| 798 | lock_page(page); | ||
| 799 | |||
| 800 | /* Did it get unhashed before we got the lock? */ | ||
| 801 | if (!page->mapping) { | ||
| 802 | unlock_page(page); | ||
| 803 | page_cache_release(page); | ||
| 804 | continue; /* retry the same index from the top of the loop */ | ||
| 805 | } | ||
| 806 | |||
| 807 | /* Did somebody else fill it already? */ | ||
| 808 | if (PageUptodate(page)) { | ||
| 809 | unlock_page(page); | ||
| 810 | goto page_ok; | ||
| 811 | } | ||
| 812 | |||
| 813 | readpage: | ||
| 814 | /* Start the actual read. The read will unlock the page. */ | ||
| 815 | error = mapping->a_ops->readpage(filp, page); | ||
| 816 | |||
| 817 | if (unlikely(error)) | ||
| 818 | goto readpage_error; | ||
| 819 | |||
| 820 | if (!PageUptodate(page)) { | ||
| 821 | lock_page(page); /* returns once the lock taken for the read has been dropped */ | ||
| 822 | if (!PageUptodate(page)) { | ||
| 823 | if (page->mapping == NULL) { | ||
| 824 | /* | ||
| 825 | * invalidate_inode_pages got it | ||
| 826 | */ | ||
| 827 | unlock_page(page); | ||
| 828 | page_cache_release(page); | ||
| 829 | goto find_page; | ||
| 830 | } | ||
| 831 | unlock_page(page); | ||
| 832 | error = -EIO; /* still in the mapping yet not up to date: report an I/O error */ | ||
| 833 | goto readpage_error; | ||
| 834 | } | ||
| 835 | unlock_page(page); | ||
| 836 | } | ||
| 837 | |||
| 838 | /* | ||
| 839 | * i_size must be checked after we have done ->readpage. | ||
| 840 | * | ||
| 841 | * Checking i_size after the readpage allows us to calculate | ||
| 842 | * the correct value for "nr", which means the zero-filled | ||
| 843 | * part of the page is not copied back to userspace (unless | ||
| 844 | * another truncate extends the file - this is desired though). | ||
| 845 | */ | ||
| 846 | isize = i_size_read(inode); | ||
| 847 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; | ||
| 848 | if (unlikely(!isize || index > end_index)) { | ||
| 849 | page_cache_release(page); | ||
| 850 | goto out; | ||
| 851 | } | ||
| 852 | |||
| 853 | /* nr is the maximum number of bytes to copy from this page */ | ||
| 854 | nr = PAGE_CACHE_SIZE; | ||
| 855 | if (index == end_index) { | ||
| 856 | nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; | ||
| 857 | if (nr <= offset) { | ||
| 858 | page_cache_release(page); | ||
| 859 | goto out; | ||
| 860 | } | ||
| 861 | } | ||
| 862 | nr = nr - offset; | ||
| 863 | goto page_ok; | ||
| 864 | |||
| 865 | readpage_error: | ||
| 866 | /* UHHUH! A synchronous read error occurred. Report it */ | ||
| 867 | desc->error = error; | ||
| 868 | page_cache_release(page); | ||
| 869 | goto out; | ||
| 870 | |||
| 871 | no_cached_page: | ||
| 872 | /* | ||
| 873 | * Ok, it wasn't cached, so we need to create a new | ||
| 874 | * page.. | ||
| 875 | */ | ||
| 876 | if (!cached_page) { /* a page left over from a lost insertion race is reused */ | ||
| 877 | cached_page = page_cache_alloc_cold(mapping); | ||
| 878 | if (!cached_page) { | ||
| 879 | desc->error = -ENOMEM; | ||
| 880 | goto out; | ||
| 881 | } | ||
| 882 | } | ||
| 883 | error = add_to_page_cache_lru(cached_page, mapping, | ||
| 884 | index, GFP_KERNEL); | ||
| 885 | if (error) { | ||
| 886 | if (error == -EEXIST) /* someone else inserted a page at this index first */ | ||
| 887 | goto find_page; | ||
| 888 | desc->error = error; | ||
| 889 | goto out; | ||
| 890 | } | ||
| 891 | page = cached_page; | ||
| 892 | cached_page = NULL; /* now owned by the page cache; must not be released at out: */ | ||
| 893 | goto readpage; | ||
| 894 | } | ||
| 895 | |||
| 896 | out: | ||
| 897 | *_ra = ra; | ||
| 898 | |||
| 899 | *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; /* report where the read stopped */ | ||
| 900 | if (cached_page) | ||
| 901 | page_cache_release(cached_page); | ||
| 902 | if (filp) | ||
| 903 | file_accessed(filp); | ||
| 904 | } | ||
| 905 | |||
| 906 | EXPORT_SYMBOL(do_generic_mapping_read); | ||
| 907 | |||
| 908 | int file_read_actor(read_descriptor_t *desc, struct page *page, | ||
| 909 | unsigned long offset, unsigned long size) | ||
| 910 | { | ||
| 911 | char *kaddr; | ||
| 912 | unsigned long left, count = desc->count; | ||
| 913 | |||
| 914 | if (size > count) | ||
| 915 | size = count; | ||
| 916 | |||
| 917 | /* | ||
| 918 | * Faults on the destination of a read are common, so do it before | ||
| 919 | * taking the kmap. | ||
| 920 | */ | ||
| 921 | if (!fault_in_pages_writeable(desc->arg.buf, size)) { | ||
| 922 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 923 | left = __copy_to_user_inatomic(desc->arg.buf, | ||
| 924 | kaddr + offset, size); | ||
| 925 | kunmap_atomic(kaddr, KM_USER0); | ||
| 926 | if (left == 0) | ||
| 927 | goto success; | ||
| 928 | } | ||
| 929 | |||
| 930 | /* Do it the slow way */ | ||
| 931 | kaddr = kmap(page); | ||
| 932 | left = __copy_to_user(desc->arg.buf, kaddr + offset, size); | ||
| 933 | kunmap(page); | ||
| 934 | |||
| 935 | if (left) { | ||
| 936 | size -= left; | ||
| 937 | desc->error = -EFAULT; | ||
| 938 | } | ||
| 939 | success: | ||
| 940 | desc->count = count - size; | ||
| 941 | desc->written += size; | ||
| 942 | desc->arg.buf += size; | ||
| 943 | return size; | ||
| 944 | } | ||
| 945 | |||
| 946 | /* | ||
| 947 | * This is the "read()" routine for all filesystems | ||
| 948 | * that can use the page cache directly. | ||
| 949 | */ | ||
| 950 | ssize_t | ||
| 951 | __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | ||
| 952 | unsigned long nr_segs, loff_t *ppos) | ||
| 953 | { | ||
| 954 | struct file *filp = iocb->ki_filp; | ||
| 955 | ssize_t retval; | ||
| 956 | unsigned long seg; | ||
| 957 | size_t count; | ||
| 958 | |||
| 959 | count = 0; | ||
| 960 | for (seg = 0; seg < nr_segs; seg++) { | ||
| 961 | const struct iovec *iv = &iov[seg]; | ||
| 962 | |||
| 963 | /* | ||
| 964 | * If any segment has a negative length, or the cumulative | ||
| 965 | * length ever wraps negative then return -EINVAL. | ||
| 966 | */ | ||
| 967 | count += iv->iov_len; | ||
| 968 | if (unlikely((ssize_t)(count|iv->iov_len) < 0)) | ||
| 969 | return -EINVAL; | ||
| 970 | if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) | ||
| 971 | continue; | ||
| 972 | if (seg == 0) | ||
| 973 | return -EFAULT; | ||
| 974 | nr_segs = seg; | ||
| 975 | count -= iv->iov_len; /* This segment is no good */ | ||
| 976 | break; | ||
| 977 | } | ||
| 978 | |||
| 979 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | ||
| 980 | if (filp->f_flags & O_DIRECT) { | ||
| 981 | loff_t pos = *ppos, size; | ||
| 982 | struct address_space *mapping; | ||
| 983 | struct inode *inode; | ||
| 984 | |||
| 985 | mapping = filp->f_mapping; | ||
| 986 | inode = mapping->host; | ||
| 987 | retval = 0; | ||
| 988 | if (!count) | ||
| 989 | goto out; /* skip atime */ | ||
| 990 | size = i_size_read(inode); | ||
| 991 | if (pos < size) { | ||
| 992 | retval = generic_file_direct_IO(READ, iocb, | ||
| 993 | iov, pos, nr_segs); | ||
| 994 | if (retval >= 0 && !is_sync_kiocb(iocb)) | ||
| 995 | retval = -EIOCBQUEUED; | ||
| 996 | if (retval > 0) | ||
| 997 | *ppos = pos + retval; | ||
| 998 | } | ||
| 999 | file_accessed(filp); | ||
| 1000 | goto out; | ||
| 1001 | } | ||
| 1002 | |||
| 1003 | retval = 0; | ||
| 1004 | if (count) { | ||
| 1005 | for (seg = 0; seg < nr_segs; seg++) { | ||
| 1006 | read_descriptor_t desc; | ||
| 1007 | |||
| 1008 | desc.written = 0; | ||
| 1009 | desc.arg.buf = iov[seg].iov_base; | ||
| 1010 | desc.count = iov[seg].iov_len; | ||
| 1011 | if (desc.count == 0) | ||
| 1012 | continue; | ||
| 1013 | desc.error = 0; | ||
| 1014 | do_generic_file_read(filp,ppos,&desc,file_read_actor); | ||
| 1015 | retval += desc.written; | ||
| 1016 | if (!retval) { | ||
| 1017 | retval = desc.error; | ||
| 1018 | break; | ||
| 1019 | } | ||
| 1020 | } | ||
| 1021 | } | ||
| 1022 | out: | ||
| 1023 | return retval; | ||
| 1024 | } | ||
| 1025 | |||
| 1026 | EXPORT_SYMBOL(__generic_file_aio_read); | ||
| 1027 | |||
| 1028 | ssize_t | ||
| 1029 | generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) | ||
| 1030 | { | ||
| 1031 | struct iovec local_iov = { .iov_base = buf, .iov_len = count }; | ||
| 1032 | |||
| 1033 | BUG_ON(iocb->ki_pos != pos); | ||
| 1034 | return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); | ||
| 1035 | } | ||
| 1036 | |||
| 1037 | EXPORT_SYMBOL(generic_file_aio_read); | ||
| 1038 | |||
| 1039 | ssize_t | ||
| 1040 | generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) | ||
| 1041 | { | ||
| 1042 | struct iovec local_iov = { .iov_base = buf, .iov_len = count }; | ||
| 1043 | struct kiocb kiocb; | ||
| 1044 | ssize_t ret; | ||
| 1045 | |||
| 1046 | init_sync_kiocb(&kiocb, filp); | ||
| 1047 | ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); | ||
| 1048 | if (-EIOCBQUEUED == ret) | ||
| 1049 | ret = wait_on_sync_kiocb(&kiocb); | ||
| 1050 | return ret; | ||
| 1051 | } | ||
| 1052 | |||
| 1053 | EXPORT_SYMBOL(generic_file_read); | ||
| 1054 | |||
| 1055 | int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) | ||
| 1056 | { | ||
| 1057 | ssize_t written; | ||
| 1058 | unsigned long count = desc->count; | ||
| 1059 | struct file *file = desc->arg.data; | ||
| 1060 | |||
| 1061 | if (size > count) | ||
| 1062 | size = count; | ||
| 1063 | |||
| 1064 | written = file->f_op->sendpage(file, page, offset, | ||
| 1065 | size, &file->f_pos, size<count); | ||
| 1066 | if (written < 0) { | ||
| 1067 | desc->error = written; | ||
| 1068 | written = 0; | ||
| 1069 | } | ||
| 1070 | desc->count = count - written; | ||
| 1071 | desc->written += written; | ||
| 1072 | return written; | ||
| 1073 | } | ||
| 1074 | |||
| 1075 | ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, | ||
| 1076 | size_t count, read_actor_t actor, void *target) | ||
| 1077 | { | ||
| 1078 | read_descriptor_t desc; | ||
| 1079 | |||
| 1080 | if (!count) | ||
| 1081 | return 0; | ||
| 1082 | |||
| 1083 | desc.written = 0; | ||
| 1084 | desc.count = count; | ||
| 1085 | desc.arg.data = target; | ||
| 1086 | desc.error = 0; | ||
| 1087 | |||
| 1088 | do_generic_file_read(in_file, ppos, &desc, actor); | ||
| 1089 | if (desc.written) | ||
| 1090 | return desc.written; | ||
| 1091 | return desc.error; | ||
| 1092 | } | ||
| 1093 | |||
| 1094 | EXPORT_SYMBOL(generic_file_sendfile); | ||
| 1095 | |||
| 1096 | static ssize_t | ||
| 1097 | do_readahead(struct address_space *mapping, struct file *filp, | ||
| 1098 | unsigned long index, unsigned long nr) | ||
| 1099 | { | ||
| 1100 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | ||
| 1101 | return -EINVAL; | ||
| 1102 | |||
| 1103 | force_page_cache_readahead(mapping, filp, index, | ||
| 1104 | max_sane_readahead(nr)); | ||
| 1105 | return 0; | ||
| 1106 | } | ||
| 1107 | |||
| 1108 | asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | ||
| 1109 | { | ||
| 1110 | ssize_t ret; | ||
| 1111 | struct file *file; | ||
| 1112 | |||
| 1113 | ret = -EBADF; | ||
| 1114 | file = fget(fd); | ||
| 1115 | if (file) { | ||
| 1116 | if (file->f_mode & FMODE_READ) { | ||
| 1117 | struct address_space *mapping = file->f_mapping; | ||
| 1118 | unsigned long start = offset >> PAGE_CACHE_SHIFT; | ||
| 1119 | unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | ||
| 1120 | unsigned long len = end - start + 1; | ||
| 1121 | ret = do_readahead(mapping, file, start, len); | ||
| 1122 | } | ||
| 1123 | fput(file); | ||
| 1124 | } | ||
| 1125 | return ret; | ||
| 1126 | } | ||
| 1127 | |||
| 1128 | #ifdef CONFIG_MMU | ||
| 1129 | /* | ||
| 1130 | * This adds the requested page to the page cache if it isn't already there, | ||
| 1131 | * and schedules an I/O to read in its contents from disk. | ||
| 1132 | */ | ||
| 1133 | static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); | ||
| 1134 | static int fastcall page_cache_read(struct file * file, unsigned long offset) | ||
| 1135 | { | ||
| 1136 | struct address_space *mapping = file->f_mapping; | ||
| 1137 | struct page *page; | ||
| 1138 | int error; | ||
| 1139 | |||
| 1140 | page = page_cache_alloc_cold(mapping); | ||
| 1141 | if (!page) | ||
| 1142 | return -ENOMEM; | ||
| 1143 | |||
| 1144 | error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); | ||
| 1145 | if (!error) { | ||
| 1146 | error = mapping->a_ops->readpage(file, page); | ||
| 1147 | page_cache_release(page); | ||
| 1148 | return error; | ||
| 1149 | } | ||
| 1150 | |||
| 1151 | /* | ||
| 1152 | * We arrive here in the unlikely event that someone | ||
| 1153 | * raced with us and added our page to the cache first | ||
| 1154 | * or we are out of memory for radix-tree nodes. | ||
| 1155 | */ | ||
| 1156 | page_cache_release(page); | ||
| 1157 | return error == -EEXIST ? 0 : error; | ||
| 1158 | } | ||
| 1159 | |||
/*
 * Once a file's mmap_miss count exceeds its mmap_hit count by more than this
 * many faults, filemap_nopage() stops doing read-around for it.
 */
#define MMAP_LOTSAMISS  (100)

/*
 * filemap_nopage() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * @area:    the vma that faulted; its vm_file supplies the mapping
 * @address: faulting address, converted below to a page-cache index
 * @type:    if non-NULL, receives VM_FAULT_MINOR or VM_FAULT_MAJOR on success
 *
 * Returns the page with a reference held on success, NOPAGE_OOM when
 * page_cache_read() reports -ENOMEM, and NULL on any other failure
 * (including a fault beyond the end of the file from the owning mm).
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
struct page *filemap_nopage(struct vm_area_struct *area,
				unsigned long address, int *type)
{
	int error;
	struct file *file = area->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct file_ra_state *ra = &file->f_ra;
	struct inode *inode = mapping->host;
	struct page *page;
	unsigned long size, pgoff;
	int did_readaround = 0, majmin = VM_FAULT_MINOR;

	/* Page-cache index of the faulting address. */
	pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;

retry_all:
	/* File size in pages, rounded up; re-read on every full retry. */
	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (pgoff >= size)
		goto outside_data_content;

	/* If we don't want any read-ahead, don't bother */
	if (VM_RandomReadHint(area))
		goto no_cached_page;

	/*
	 * The readahead code wants to be told about each and every page
	 * so it can build and shrink its windows appropriately
	 *
	 * For sequential accesses, we use the generic readahead logic.
	 */
	if (VM_SequentialReadHint(area))
		page_cache_readahead(mapping, ra, file, pgoff, 1);

	/*
	 * Do we have something in the page cache already?
	 */
retry_find:
	page = find_get_page(mapping, pgoff);
	if (!page) {
		unsigned long ra_pages;

		/* Sequential mappings report the miss and read just this page. */
		if (VM_SequentialReadHint(area)) {
			handle_ra_miss(mapping, ra, pgoff);
			goto no_cached_page;
		}
		ra->mmap_miss++;

		/*
		 * Do we miss much more than hit in this file? If so,
		 * stop bothering with read-ahead. It will only hurt.
		 */
		if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
			goto no_cached_page;

		/*
		 * To keep the pgmajfault counter straight, we need to
		 * check did_readaround, as this is an inner loop.
		 */
		if (!did_readaround) {
			majmin = VM_FAULT_MAJOR;
			inc_page_state(pgmajfault);
		}
		did_readaround = 1;
		ra_pages = max_sane_readahead(file->f_ra.ra_pages);
		if (ra_pages) {
			pgoff_t start = 0;

			/* Centre the read-around window on pgoff, clamped at 0. */
			if (pgoff > ra_pages / 2)
				start = pgoff - ra_pages / 2;
			do_page_cache_readahead(mapping, file, start, ra_pages);
		}
		page = find_get_page(mapping, pgoff);
		if (!page)
			goto no_cached_page;
	}

	/* Only count a hit when no read-around was needed for this fault. */
	if (!did_readaround)
		ra->mmap_hit++;

	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date.
	 */
	if (!PageUptodate(page))
		goto page_not_uptodate;

success:
	/*
	 * Found the page and have a reference on it.
	 */
	mark_page_accessed(page);
	if (type)
		*type = majmin;
	return page;

outside_data_content:
	/*
	 * An external ptracer can access pages that normally aren't
	 * accessible..
	 *
	 * A fault from the mm that owns the vma fails here; any other
	 * accessor falls through and attempts the read.
	 */
	if (area->vm_mm == current->mm)
		return NULL;
	/* Fall through to the non-read-ahead case */
no_cached_page:
	/*
	 * We're only likely to ever get here if MADV_RANDOM is in
	 * effect.
	 */
	error = page_cache_read(file, pgoff);
	grab_swap_token();

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	if (error == -ENOMEM)
		return NOPAGE_OOM;
	return NULL;

page_not_uptodate:
	/* Waiting for the page contents counts as a major fault (once). */
	if (!did_readaround) {
		majmin = VM_FAULT_MAJOR;
		inc_page_state(pgmajfault);
	}
	lock_page(page);

	/* Did it get unhashed while we waited for it? */
	if (!page->mapping) {
		unlock_page(page);
		page_cache_release(page);
		goto retry_all;
	}

	/* Did somebody else get it up-to-date? */
	if (PageUptodate(page)) {
		unlock_page(page);
		goto success;
	}

	/* First read attempt; a zero return means the read was started. */
	if (!mapping->a_ops->readpage(file, page)) {
		wait_on_page_locked(page);
		if (PageUptodate(page))
			goto success;
	}

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	lock_page(page);

	/* Somebody truncated the page on us? */
	if (!page->mapping) {
		unlock_page(page);
		page_cache_release(page);
		goto retry_all;
	}

	/* Somebody else successfully read it in? */
	if (PageUptodate(page)) {
		unlock_page(page);
		goto success;
	}
	/* Clear the stale error flag before the second and final attempt. */
	ClearPageError(page);
	if (!mapping->a_ops->readpage(file, page)) {
		wait_on_page_locked(page);
		if (PageUptodate(page))
			goto success;
	}

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
	page_cache_release(page);
	return NULL;
}

EXPORT_SYMBOL(filemap_nopage);
| 1359 | |||
/*
 * filemap_getpage - look up (and if necessary read in) one page of a file
 * @file:     file whose mapping is searched
 * @pgoff:    page-cache index wanted
 * @nonblock: when non-zero, return NULL instead of starting a read if the
 *            page is not in the page cache at all
 *
 * Returns the up-to-date page with a reference held, or NULL on failure.
 * Note that @nonblock is only consulted on a cache miss: a page that is
 * present but not yet up-to-date is still locked and read below.
 */
static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
					int nonblock)
{
	struct address_space *mapping = file->f_mapping;
	struct page *page;
	int error;

	/*
	 * Do we have something in the page cache already?
	 */
retry_find:
	page = find_get_page(mapping, pgoff);
	if (!page) {
		if (nonblock)
			return NULL;
		goto no_cached_page;
	}

	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date.
	 */
	if (!PageUptodate(page))
		goto page_not_uptodate;

success:
	/*
	 * Found the page and have a reference on it.
	 */
	mark_page_accessed(page);
	return page;

no_cached_page:
	error = page_cache_read(file, pgoff);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	return NULL;

page_not_uptodate:
	lock_page(page);

	/* Did it get unhashed while we waited for it? */
	if (!page->mapping) {
		unlock_page(page);
		goto err;
	}

	/* Did somebody else get it up-to-date? */
	if (PageUptodate(page)) {
		unlock_page(page);
		goto success;
	}

	/* First read attempt; a zero return means the read was started. */
	if (!mapping->a_ops->readpage(file, page)) {
		wait_on_page_locked(page);
		if (PageUptodate(page))
			goto success;
	}

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	lock_page(page);

	/* Somebody truncated the page on us? */
	if (!page->mapping) {
		unlock_page(page);
		goto err;
	}
	/* Somebody else successfully read it in? */
	if (PageUptodate(page)) {
		unlock_page(page);
		goto success;
	}

	/* Clear the stale error flag before the second and final attempt. */
	ClearPageError(page);
	if (!mapping->a_ops->readpage(file, page)) {
		wait_on_page_locked(page);
		if (PageUptodate(page))
			goto success;
	}

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
err:
	/* Drop the reference obtained from find_get_page(). */
	page_cache_release(page);

	return NULL;
}
| 1466 | |||
| 1467 | int filemap_populate(struct vm_area_struct *vma, unsigned long addr, | ||
| 1468 | unsigned long len, pgprot_t prot, unsigned long pgoff, | ||
| 1469 | int nonblock) | ||
| 1470 | { | ||
| 1471 | struct file *file = vma->vm_file; | ||
| 1472 | struct address_space *mapping = file->f_mapping; | ||
| 1473 | struct inode *inode = mapping->host; | ||
| 1474 | unsigned long size; | ||
| 1475 | struct mm_struct *mm = vma->vm_mm; | ||
| 1476 | struct page *page; | ||
| 1477 | int err; | ||
| 1478 | |||
| 1479 | if (!nonblock) | ||
| 1480 | force_page_cache_readahead(mapping, vma->vm_file, | ||
| 1481 | pgoff, len >> PAGE_CACHE_SHIFT); | ||
| 1482 | |||
| 1483 | repeat: | ||
| 1484 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 1485 | if (pgoff + (len >> PAGE_CACHE_SHIFT) > size) | ||
| 1486 | return -EINVAL; | ||
| 1487 | |||
| 1488 | page = filemap_getpage(file, pgoff, nonblock); | ||
| 1489 | if (!page && !nonblock) | ||
| 1490 | return -ENOMEM; | ||
| 1491 | if (page) { | ||
| 1492 | err = install_page(mm, vma, addr, page, prot); | ||
| 1493 | if (err) { | ||
| 1494 | page_cache_release(page); | ||
| 1495 | return err; | ||
| 1496 | } | ||
| 1497 | } else { | ||
| 1498 | err = install_file_pte(mm, vma, addr, pgoff, prot); | ||
| 1499 | if (err) | ||
| 1500 | return err; | ||
| 1501 | } | ||
| 1502 | |||
| 1503 | len -= PAGE_SIZE; | ||
| 1504 | addr += PAGE_SIZE; | ||
| 1505 | pgoff++; | ||
| 1506 | if (len) | ||
| 1507 | goto repeat; | ||
| 1508 | |||
| 1509 | return 0; | ||
| 1510 | } | ||
| 1511 | |||
| 1512 | struct vm_operations_struct generic_file_vm_ops = { | ||
| 1513 | .nopage = filemap_nopage, | ||
| 1514 | .populate = filemap_populate, | ||
| 1515 | }; | ||
| 1516 | |||
| 1517 | /* This is used for a general mmap of a disk file */ | ||
| 1518 | |||
| 1519 | int generic_file_mmap(struct file * file, struct vm_area_struct * vma) | ||
| 1520 | { | ||
| 1521 | struct address_space *mapping = file->f_mapping; | ||
| 1522 | |||
| 1523 | if (!mapping->a_ops->readpage) | ||
| 1524 | return -ENOEXEC; | ||
| 1525 | file_accessed(file); | ||
| 1526 | vma->vm_ops = &generic_file_vm_ops; | ||
| 1527 | return 0; | ||
| 1528 | } | ||
| 1529 | EXPORT_SYMBOL(filemap_populate); | ||
| 1530 | |||
| 1531 | /* | ||
| 1532 | * This is for filesystems which do not implement ->writepage. | ||
| 1533 | */ | ||
| 1534 | int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) | ||
| 1535 | { | ||
| 1536 | if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) | ||
| 1537 | return -EINVAL; | ||
| 1538 | return generic_file_mmap(file, vma); | ||
| 1539 | } | ||
| 1540 | #else | ||
| 1541 | int generic_file_mmap(struct file * file, struct vm_area_struct * vma) | ||
| 1542 | { | ||
| 1543 | return -ENOSYS; | ||
| 1544 | } | ||
| 1545 | int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) | ||
| 1546 | { | ||
| 1547 | return -ENOSYS; | ||
| 1548 | } | ||
| 1549 | #endif /* CONFIG_MMU */ | ||
| 1550 | |||
| 1551 | EXPORT_SYMBOL(generic_file_mmap); | ||
| 1552 | EXPORT_SYMBOL(generic_file_readonly_mmap); | ||
| 1553 | |||
| 1554 | static inline struct page *__read_cache_page(struct address_space *mapping, | ||
| 1555 | unsigned long index, | ||
| 1556 | int (*filler)(void *,struct page*), | ||
| 1557 | void *data) | ||
| 1558 | { | ||
| 1559 | struct page *page, *cached_page = NULL; | ||
| 1560 | int err; | ||
| 1561 | repeat: | ||
| 1562 | page = find_get_page(mapping, index); | ||
| 1563 | if (!page) { | ||
| 1564 | if (!cached_page) { | ||
| 1565 | cached_page = page_cache_alloc_cold(mapping); | ||
| 1566 | if (!cached_page) | ||
| 1567 | return ERR_PTR(-ENOMEM); | ||
| 1568 | } | ||
| 1569 | err = add_to_page_cache_lru(cached_page, mapping, | ||
| 1570 | index, GFP_KERNEL); | ||
| 1571 | if (err == -EEXIST) | ||
| 1572 | goto repeat; | ||
| 1573 | if (err < 0) { | ||
| 1574 | /* Presumably ENOMEM for radix tree node */ | ||
| 1575 | page_cache_release(cached_page); | ||
| 1576 | return ERR_PTR(err); | ||
| 1577 | } | ||
| 1578 | page = cached_page; | ||
| 1579 | cached_page = NULL; | ||
| 1580 | err = filler(data, page); | ||
| 1581 | if (err < 0) { | ||
| 1582 | page_cache_release(page); | ||
| 1583 | page = ERR_PTR(err); | ||
| 1584 | } | ||
| 1585 | } | ||
| 1586 | if (cached_page) | ||
| 1587 | page_cache_release(cached_page); | ||
| 1588 | return page; | ||
| 1589 | } | ||
| 1590 | |||
| 1591 | /* | ||
| 1592 | * Read into the page cache. If a page already exists, | ||
| 1593 | * and PageUptodate() is not set, try to fill the page. | ||
| 1594 | */ | ||
| 1595 | struct page *read_cache_page(struct address_space *mapping, | ||
| 1596 | unsigned long index, | ||
| 1597 | int (*filler)(void *,struct page*), | ||
| 1598 | void *data) | ||
| 1599 | { | ||
| 1600 | struct page *page; | ||
| 1601 | int err; | ||
| 1602 | |||
| 1603 | retry: | ||
| 1604 | page = __read_cache_page(mapping, index, filler, data); | ||
| 1605 | if (IS_ERR(page)) | ||
| 1606 | goto out; | ||
| 1607 | mark_page_accessed(page); | ||
| 1608 | if (PageUptodate(page)) | ||
| 1609 | goto out; | ||
| 1610 | |||
| 1611 | lock_page(page); | ||
| 1612 | if (!page->mapping) { | ||
| 1613 | unlock_page(page); | ||
| 1614 | page_cache_release(page); | ||
| 1615 | goto retry; | ||
| 1616 | } | ||
| 1617 | if (PageUptodate(page)) { | ||
| 1618 | unlock_page(page); | ||
| 1619 | goto out; | ||
| 1620 | } | ||
| 1621 | err = filler(data, page); | ||
| 1622 | if (err < 0) { | ||
| 1623 | page_cache_release(page); | ||
| 1624 | page = ERR_PTR(err); | ||
| 1625 | } | ||
| 1626 | out: | ||
| 1627 | return page; | ||
| 1628 | } | ||
| 1629 | |||
| 1630 | EXPORT_SYMBOL(read_cache_page); | ||
| 1631 | |||
| 1632 | /* | ||
| 1633 | * If the page was newly created, increment its refcount and add it to the | ||
| 1634 | * caller's lru-buffering pagevec. This function is specifically for | ||
| 1635 | * generic_file_write(). | ||
| 1636 | */ | ||
| 1637 | static inline struct page * | ||
| 1638 | __grab_cache_page(struct address_space *mapping, unsigned long index, | ||
| 1639 | struct page **cached_page, struct pagevec *lru_pvec) | ||
| 1640 | { | ||
| 1641 | int err; | ||
| 1642 | struct page *page; | ||
| 1643 | repeat: | ||
| 1644 | page = find_lock_page(mapping, index); | ||
| 1645 | if (!page) { | ||
| 1646 | if (!*cached_page) { | ||
| 1647 | *cached_page = page_cache_alloc(mapping); | ||
| 1648 | if (!*cached_page) | ||
| 1649 | return NULL; | ||
| 1650 | } | ||
| 1651 | err = add_to_page_cache(*cached_page, mapping, | ||
| 1652 | index, GFP_KERNEL); | ||
| 1653 | if (err == -EEXIST) | ||
| 1654 | goto repeat; | ||
| 1655 | if (err == 0) { | ||
| 1656 | page = *cached_page; | ||
| 1657 | page_cache_get(page); | ||
| 1658 | if (!pagevec_add(lru_pvec, page)) | ||
| 1659 | __pagevec_lru_add(lru_pvec); | ||
| 1660 | *cached_page = NULL; | ||
| 1661 | } | ||
| 1662 | } | ||
| 1663 | return page; | ||
| 1664 | } | ||
| 1665 | |||
| 1666 | /* | ||
| 1667 | * The logic we want is | ||
| 1668 | * | ||
| 1669 | * if suid or (sgid and xgrp) | ||
| 1670 | * remove privs | ||
| 1671 | */ | ||
| 1672 | int remove_suid(struct dentry *dentry) | ||
| 1673 | { | ||
| 1674 | mode_t mode = dentry->d_inode->i_mode; | ||
| 1675 | int kill = 0; | ||
| 1676 | int result = 0; | ||
| 1677 | |||
| 1678 | /* suid always must be killed */ | ||
| 1679 | if (unlikely(mode & S_ISUID)) | ||
| 1680 | kill = ATTR_KILL_SUID; | ||
| 1681 | |||
| 1682 | /* | ||
| 1683 | * sgid without any exec bits is just a mandatory locking mark; leave | ||
| 1684 | * it alone. If some exec bits are set, it's a real sgid; kill it. | ||
| 1685 | */ | ||
| 1686 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) | ||
| 1687 | kill |= ATTR_KILL_SGID; | ||
| 1688 | |||
| 1689 | if (unlikely(kill && !capable(CAP_FSETID))) { | ||
| 1690 | struct iattr newattrs; | ||
| 1691 | |||
| 1692 | newattrs.ia_valid = ATTR_FORCE | kill; | ||
| 1693 | result = notify_change(dentry, &newattrs); | ||
| 1694 | } | ||
| 1695 | return result; | ||
| 1696 | } | ||
| 1697 | EXPORT_SYMBOL(remove_suid); | ||
| 1698 | |||
| 1699 | /* | ||
| 1700 | * Copy as much as we can into the page and return the number of bytes which | ||
| 1701 | * were sucessfully copied. If a fault is encountered then clear the page | ||
| 1702 | * out to (offset+bytes) and return the number of bytes which were copied. | ||
| 1703 | */ | ||
| 1704 | static inline size_t | ||
| 1705 | filemap_copy_from_user(struct page *page, unsigned long offset, | ||
| 1706 | const char __user *buf, unsigned bytes) | ||
| 1707 | { | ||
| 1708 | char *kaddr; | ||
| 1709 | int left; | ||
| 1710 | |||
| 1711 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 1712 | left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); | ||
| 1713 | kunmap_atomic(kaddr, KM_USER0); | ||
| 1714 | |||
| 1715 | if (left != 0) { | ||
| 1716 | /* Do it the slow way */ | ||
| 1717 | kaddr = kmap(page); | ||
| 1718 | left = __copy_from_user(kaddr + offset, buf, bytes); | ||
| 1719 | kunmap(page); | ||
| 1720 | } | ||
| 1721 | return bytes - left; | ||
| 1722 | } | ||
| 1723 | |||
| 1724 | static size_t | ||
| 1725 | __filemap_copy_from_user_iovec(char *vaddr, | ||
| 1726 | const struct iovec *iov, size_t base, size_t bytes) | ||
| 1727 | { | ||
| 1728 | size_t copied = 0, left = 0; | ||
| 1729 | |||
| 1730 | while (bytes) { | ||
| 1731 | char __user *buf = iov->iov_base + base; | ||
| 1732 | int copy = min(bytes, iov->iov_len - base); | ||
| 1733 | |||
| 1734 | base = 0; | ||
| 1735 | left = __copy_from_user_inatomic(vaddr, buf, copy); | ||
| 1736 | copied += copy; | ||
| 1737 | bytes -= copy; | ||
| 1738 | vaddr += copy; | ||
| 1739 | iov++; | ||
| 1740 | |||
| 1741 | if (unlikely(left)) { | ||
| 1742 | /* zero the rest of the target like __copy_from_user */ | ||
| 1743 | if (bytes) | ||
| 1744 | memset(vaddr, 0, bytes); | ||
| 1745 | break; | ||
| 1746 | } | ||
| 1747 | } | ||
| 1748 | return copied - left; | ||
| 1749 | } | ||
| 1750 | |||
| 1751 | /* | ||
| 1752 | * This has the same sideeffects and return value as filemap_copy_from_user(). | ||
| 1753 | * The difference is that on a fault we need to memset the remainder of the | ||
| 1754 | * page (out to offset+bytes), to emulate filemap_copy_from_user()'s | ||
| 1755 | * single-segment behaviour. | ||
| 1756 | */ | ||
| 1757 | static inline size_t | ||
| 1758 | filemap_copy_from_user_iovec(struct page *page, unsigned long offset, | ||
| 1759 | const struct iovec *iov, size_t base, size_t bytes) | ||
| 1760 | { | ||
| 1761 | char *kaddr; | ||
| 1762 | size_t copied; | ||
| 1763 | |||
| 1764 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 1765 | copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, | ||
| 1766 | base, bytes); | ||
| 1767 | kunmap_atomic(kaddr, KM_USER0); | ||
| 1768 | if (copied != bytes) { | ||
| 1769 | kaddr = kmap(page); | ||
| 1770 | copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, | ||
| 1771 | base, bytes); | ||
| 1772 | kunmap(page); | ||
| 1773 | } | ||
| 1774 | return copied; | ||
| 1775 | } | ||
| 1776 | |||
| 1777 | static inline void | ||
| 1778 | filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | ||
| 1779 | { | ||
| 1780 | const struct iovec *iov = *iovp; | ||
| 1781 | size_t base = *basep; | ||
| 1782 | |||
| 1783 | while (bytes) { | ||
| 1784 | int copy = min(bytes, iov->iov_len - base); | ||
| 1785 | |||
| 1786 | bytes -= copy; | ||
| 1787 | base += copy; | ||
| 1788 | if (iov->iov_len == base) { | ||
| 1789 | iov++; | ||
| 1790 | base = 0; | ||
| 1791 | } | ||
| 1792 | } | ||
| 1793 | *iovp = iov; | ||
| 1794 | *basep = base; | ||
| 1795 | } | ||
| 1796 | |||
/*
 * Performs necessary checks before doing a write
 *
 * Can adjust the writing position or the amount of bytes to write.
 * Returns the appropriate error code that the caller should return, or
 * zero if the write should be allowed.
 *
 * @file:  file being written
 * @pos:   in/out write position; moved to i_size for O_APPEND on
 *         non-block files
 * @count: in/out byte count; may be reduced to fit the limits checked below
 * @isblk: non-zero when the target is a block device
 *
 * Errors: -EINVAL for a negative position, a pending file->f_error (which is
 * cleared), -EFBIG (after sending SIGXFSZ) when a size limit is already
 * reached, -EPERM for a read-only block device and -ENOSPC for a write at or
 * beyond the end of a block device.
 */
inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
{
	struct inode *inode = file->f_mapping->host;
	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;

	if (unlikely(*pos < 0))
		return -EINVAL;

	/* Report, and consume, an error recorded on the file earlier. */
	if (unlikely(file->f_error)) {
		int err = file->f_error;
		file->f_error = 0;
		return err;
	}

	if (!isblk) {
		/* FIXME: this is for backwards compatibility with 2.4 */
		if (file->f_flags & O_APPEND)
			*pos = i_size_read(inode);

		/* RLIMIT_FSIZE: fail at the limit, otherwise trim to it. */
		if (limit != RLIM_INFINITY) {
			if (*pos >= limit) {
				send_sig(SIGXFSZ, current, 0);
				return -EFBIG;
			}
			if (*count > limit - (typeof(limit))*pos) {
				*count = limit - (typeof(limit))*pos;
			}
		}
	}

	/*
	 * LFS rule
	 *
	 * Without O_LARGEFILE the write may not extend past MAX_NON_LFS.
	 */
	if (unlikely(*pos + *count > MAX_NON_LFS &&
				!(file->f_flags & O_LARGEFILE))) {
		if (*pos >= MAX_NON_LFS) {
			send_sig(SIGXFSZ, current, 0);
			return -EFBIG;
		}
		if (*count > MAX_NON_LFS - (unsigned long)*pos) {
			*count = MAX_NON_LFS - (unsigned long)*pos;
		}
	}

	/*
	 * Are we about to exceed the fs block limit ?
	 *
	 * If we have written data it becomes a short write.  If we have
	 * exceeded without writing data we send a signal and return EFBIG.
	 * Linus frestrict idea will clean these up nicely..
	 */
	if (likely(!isblk)) {
		if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
			if (*count || *pos > inode->i_sb->s_maxbytes) {
				send_sig(SIGXFSZ, current, 0);
				return -EFBIG;
			}
			/* zero-length writes at ->s_maxbytes are OK */
		}

		if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
			*count = inode->i_sb->s_maxbytes - *pos;
	} else {
		loff_t isize;
		if (bdev_read_only(I_BDEV(inode)))
			return -EPERM;
		isize = i_size_read(inode);
		/* Only a zero-length write exactly at the device end is allowed. */
		if (*pos >= isize) {
			if (*count || *pos > isize)
				return -ENOSPC;
		}

		/* Trim a write that would run off the end of the device. */
		if (*pos + *count > isize)
			*count = isize - *pos;
	}
	return 0;
}
EXPORT_SYMBOL(generic_write_checks);
| 1882 | |||
| 1883 | ssize_t | ||
| 1884 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | ||
| 1885 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, | ||
| 1886 | size_t count, size_t ocount) | ||
| 1887 | { | ||
| 1888 | struct file *file = iocb->ki_filp; | ||
| 1889 | struct address_space *mapping = file->f_mapping; | ||
| 1890 | struct inode *inode = mapping->host; | ||
| 1891 | ssize_t written; | ||
| 1892 | |||
| 1893 | if (count != ocount) | ||
| 1894 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); | ||
| 1895 | |||
| 1896 | written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); | ||
| 1897 | if (written > 0) { | ||
| 1898 | loff_t end = pos + written; | ||
| 1899 | if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { | ||
| 1900 | i_size_write(inode, end); | ||
| 1901 | mark_inode_dirty(inode); | ||
| 1902 | } | ||
| 1903 | *ppos = end; | ||
| 1904 | } | ||
| 1905 | |||
| 1906 | /* | ||
| 1907 | * Sync the fs metadata but not the minor inode changes and | ||
| 1908 | * of course not the data as we did direct DMA for the IO. | ||
| 1909 | * i_sem is held, which protects generic_osync_inode() from | ||
| 1910 | * livelocking. | ||
| 1911 | */ | ||
| 1912 | if (written >= 0 && file->f_flags & O_SYNC) | ||
| 1913 | generic_osync_inode(inode, mapping, OSYNC_METADATA); | ||
| 1914 | if (written == count && !is_sync_kiocb(iocb)) | ||
| 1915 | written = -EIOCBQUEUED; | ||
| 1916 | return written; | ||
| 1917 | } | ||
| 1918 | EXPORT_SYMBOL(generic_file_direct_write); | ||
| 1919 | |||
| 1920 | ssize_t | ||
| 1921 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | ||
| 1922 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | ||
| 1923 | size_t count, ssize_t written) | ||
| 1924 | { | ||
| 1925 | struct file *file = iocb->ki_filp; | ||
| 1926 | struct address_space * mapping = file->f_mapping; | ||
| 1927 | struct address_space_operations *a_ops = mapping->a_ops; | ||
| 1928 | struct inode *inode = mapping->host; | ||
| 1929 | long status = 0; | ||
| 1930 | struct page *page; | ||
| 1931 | struct page *cached_page = NULL; | ||
| 1932 | size_t bytes; | ||
| 1933 | struct pagevec lru_pvec; | ||
| 1934 | const struct iovec *cur_iov = iov; /* current iovec */ | ||
| 1935 | size_t iov_base = 0; /* offset in the current iovec */ | ||
| 1936 | char __user *buf; | ||
| 1937 | |||
| 1938 | pagevec_init(&lru_pvec, 0); | ||
| 1939 | |||
| 1940 | /* | ||
| 1941 | * handle partial DIO write. Adjust cur_iov if needed. | ||
| 1942 | */ | ||
| 1943 | if (likely(nr_segs == 1)) | ||
| 1944 | buf = iov->iov_base + written; | ||
| 1945 | else { | ||
| 1946 | filemap_set_next_iovec(&cur_iov, &iov_base, written); | ||
| 1947 | buf = iov->iov_base + iov_base; | ||
| 1948 | } | ||
| 1949 | |||
| 1950 | do { | ||
| 1951 | unsigned long index; | ||
| 1952 | unsigned long offset; | ||
| 1953 | size_t copied; | ||
| 1954 | |||
| 1955 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | ||
| 1956 | index = pos >> PAGE_CACHE_SHIFT; | ||
| 1957 | bytes = PAGE_CACHE_SIZE - offset; | ||
| 1958 | if (bytes > count) | ||
| 1959 | bytes = count; | ||
| 1960 | |||
| 1961 | /* | ||
| 1962 | * Bring in the user page that we will copy from _first_. | ||
| 1963 | * Otherwise there's a nasty deadlock on copying from the | ||
| 1964 | * same page as we're writing to, without it being marked | ||
| 1965 | * up-to-date. | ||
| 1966 | */ | ||
| 1967 | fault_in_pages_readable(buf, bytes); | ||
| 1968 | |||
| 1969 | page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); | ||
| 1970 | if (!page) { | ||
| 1971 | status = -ENOMEM; | ||
| 1972 | break; | ||
| 1973 | } | ||
| 1974 | |||
| 1975 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | ||
| 1976 | if (unlikely(status)) { | ||
| 1977 | loff_t isize = i_size_read(inode); | ||
| 1978 | /* | ||
| 1979 | * prepare_write() may have instantiated a few blocks | ||
| 1980 | * outside i_size. Trim these off again. | ||
| 1981 | */ | ||
| 1982 | unlock_page(page); | ||
| 1983 | page_cache_release(page); | ||
| 1984 | if (pos + bytes > isize) | ||
| 1985 | vmtruncate(inode, isize); | ||
| 1986 | break; | ||
| 1987 | } | ||
| 1988 | if (likely(nr_segs == 1)) | ||
| 1989 | copied = filemap_copy_from_user(page, offset, | ||
| 1990 | buf, bytes); | ||
| 1991 | else | ||
| 1992 | copied = filemap_copy_from_user_iovec(page, offset, | ||
| 1993 | cur_iov, iov_base, bytes); | ||
| 1994 | flush_dcache_page(page); | ||
| 1995 | status = a_ops->commit_write(file, page, offset, offset+bytes); | ||
| 1996 | if (likely(copied > 0)) { | ||
| 1997 | if (!status) | ||
| 1998 | status = copied; | ||
| 1999 | |||
| 2000 | if (status >= 0) { | ||
| 2001 | written += status; | ||
| 2002 | count -= status; | ||
| 2003 | pos += status; | ||
| 2004 | buf += status; | ||
| 2005 | if (unlikely(nr_segs > 1)) | ||
| 2006 | filemap_set_next_iovec(&cur_iov, | ||
| 2007 | &iov_base, status); | ||
| 2008 | } | ||
| 2009 | } | ||
| 2010 | if (unlikely(copied != bytes)) | ||
| 2011 | if (status >= 0) | ||
| 2012 | status = -EFAULT; | ||
| 2013 | unlock_page(page); | ||
| 2014 | mark_page_accessed(page); | ||
| 2015 | page_cache_release(page); | ||
| 2016 | if (status < 0) | ||
| 2017 | break; | ||
| 2018 | balance_dirty_pages_ratelimited(mapping); | ||
| 2019 | cond_resched(); | ||
| 2020 | } while (count); | ||
| 2021 | *ppos = pos; | ||
| 2022 | |||
| 2023 | if (cached_page) | ||
| 2024 | page_cache_release(cached_page); | ||
| 2025 | |||
| 2026 | /* | ||
| 2027 | * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC | ||
| 2028 | */ | ||
| 2029 | if (likely(status >= 0)) { | ||
| 2030 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
| 2031 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) | ||
| 2032 | status = generic_osync_inode(inode, mapping, | ||
| 2033 | OSYNC_METADATA|OSYNC_DATA); | ||
| 2034 | } | ||
| 2035 | } | ||
| 2036 | |||
| 2037 | /* | ||
| 2038 | * If we get here for O_DIRECT writes then we must have fallen through | ||
| 2039 | * to buffered writes (block instantiation inside i_size). So we sync | ||
| 2040 | * the file data here, to try to honour O_DIRECT expectations. | ||
| 2041 | */ | ||
| 2042 | if (unlikely(file->f_flags & O_DIRECT) && written) | ||
| 2043 | status = filemap_write_and_wait(mapping); | ||
| 2044 | |||
| 2045 | pagevec_lru_add(&lru_pvec); | ||
| 2046 | return written ? written : status; | ||
| 2047 | } | ||
| 2048 | EXPORT_SYMBOL(generic_file_buffered_write); | ||
| 2049 | |||
| 2050 | ssize_t | ||
| 2051 | __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | ||
| 2052 | unsigned long nr_segs, loff_t *ppos) | ||
| 2053 | { | ||
| 2054 | struct file *file = iocb->ki_filp; | ||
| 2055 | struct address_space * mapping = file->f_mapping; | ||
| 2056 | size_t ocount; /* original count */ | ||
| 2057 | size_t count; /* after file limit checks */ | ||
| 2058 | struct inode *inode = mapping->host; | ||
| 2059 | unsigned long seg; | ||
| 2060 | loff_t pos; | ||
| 2061 | ssize_t written; | ||
| 2062 | ssize_t err; | ||
| 2063 | |||
| 2064 | ocount = 0; | ||
| 2065 | for (seg = 0; seg < nr_segs; seg++) { | ||
| 2066 | const struct iovec *iv = &iov[seg]; | ||
| 2067 | |||
| 2068 | /* | ||
| 2069 | * If any segment has a negative length, or the cumulative | ||
| 2070 | * length ever wraps negative then return -EINVAL. | ||
| 2071 | */ | ||
| 2072 | ocount += iv->iov_len; | ||
| 2073 | if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) | ||
| 2074 | return -EINVAL; | ||
| 2075 | if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) | ||
| 2076 | continue; | ||
| 2077 | if (seg == 0) | ||
| 2078 | return -EFAULT; | ||
| 2079 | nr_segs = seg; | ||
| 2080 | ocount -= iv->iov_len; /* This segment is no good */ | ||
| 2081 | break; | ||
| 2082 | } | ||
| 2083 | |||
| 2084 | count = ocount; | ||
| 2085 | pos = *ppos; | ||
| 2086 | |||
| 2087 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | ||
| 2088 | |||
| 2089 | /* We can write back this queue in page reclaim */ | ||
| 2090 | current->backing_dev_info = mapping->backing_dev_info; | ||
| 2091 | written = 0; | ||
| 2092 | |||
| 2093 | err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); | ||
| 2094 | if (err) | ||
| 2095 | goto out; | ||
| 2096 | |||
| 2097 | if (count == 0) | ||
| 2098 | goto out; | ||
| 2099 | |||
| 2100 | err = remove_suid(file->f_dentry); | ||
| 2101 | if (err) | ||
| 2102 | goto out; | ||
| 2103 | |||
| 2104 | inode_update_time(inode, 1); | ||
| 2105 | |||
| 2106 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | ||
| 2107 | if (unlikely(file->f_flags & O_DIRECT)) { | ||
| 2108 | written = generic_file_direct_write(iocb, iov, | ||
| 2109 | &nr_segs, pos, ppos, count, ocount); | ||
| 2110 | if (written < 0 || written == count) | ||
| 2111 | goto out; | ||
| 2112 | /* | ||
| 2113 | * direct-io write to a hole: fall through to buffered I/O | ||
| 2114 | * for completing the rest of the request. | ||
| 2115 | */ | ||
| 2116 | pos += written; | ||
| 2117 | count -= written; | ||
| 2118 | } | ||
| 2119 | |||
| 2120 | written = generic_file_buffered_write(iocb, iov, nr_segs, | ||
| 2121 | pos, ppos, count, written); | ||
| 2122 | out: | ||
| 2123 | current->backing_dev_info = NULL; | ||
| 2124 | return written ? written : err; | ||
| 2125 | } | ||
| 2126 | EXPORT_SYMBOL(generic_file_aio_write_nolock); | ||
| 2127 | |||
| 2128 | ssize_t | ||
| 2129 | generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | ||
| 2130 | unsigned long nr_segs, loff_t *ppos) | ||
| 2131 | { | ||
| 2132 | struct file *file = iocb->ki_filp; | ||
| 2133 | struct address_space *mapping = file->f_mapping; | ||
| 2134 | struct inode *inode = mapping->host; | ||
| 2135 | ssize_t ret; | ||
| 2136 | loff_t pos = *ppos; | ||
| 2137 | |||
| 2138 | ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos); | ||
| 2139 | |||
| 2140 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
| 2141 | int err; | ||
| 2142 | |||
| 2143 | err = sync_page_range_nolock(inode, mapping, pos, ret); | ||
| 2144 | if (err < 0) | ||
| 2145 | ret = err; | ||
| 2146 | } | ||
| 2147 | return ret; | ||
| 2148 | } | ||
| 2149 | |||
| 2150 | ssize_t | ||
| 2151 | __generic_file_write_nolock(struct file *file, const struct iovec *iov, | ||
| 2152 | unsigned long nr_segs, loff_t *ppos) | ||
| 2153 | { | ||
| 2154 | struct kiocb kiocb; | ||
| 2155 | ssize_t ret; | ||
| 2156 | |||
| 2157 | init_sync_kiocb(&kiocb, file); | ||
| 2158 | ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); | ||
| 2159 | if (ret == -EIOCBQUEUED) | ||
| 2160 | ret = wait_on_sync_kiocb(&kiocb); | ||
| 2161 | return ret; | ||
| 2162 | } | ||
| 2163 | |||
| 2164 | ssize_t | ||
| 2165 | generic_file_write_nolock(struct file *file, const struct iovec *iov, | ||
| 2166 | unsigned long nr_segs, loff_t *ppos) | ||
| 2167 | { | ||
| 2168 | struct kiocb kiocb; | ||
| 2169 | ssize_t ret; | ||
| 2170 | |||
| 2171 | init_sync_kiocb(&kiocb, file); | ||
| 2172 | ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); | ||
| 2173 | if (-EIOCBQUEUED == ret) | ||
| 2174 | ret = wait_on_sync_kiocb(&kiocb); | ||
| 2175 | return ret; | ||
| 2176 | } | ||
| 2177 | EXPORT_SYMBOL(generic_file_write_nolock); | ||
| 2178 | |||
| 2179 | ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, | ||
| 2180 | size_t count, loff_t pos) | ||
| 2181 | { | ||
| 2182 | struct file *file = iocb->ki_filp; | ||
| 2183 | struct address_space *mapping = file->f_mapping; | ||
| 2184 | struct inode *inode = mapping->host; | ||
| 2185 | ssize_t ret; | ||
| 2186 | struct iovec local_iov = { .iov_base = (void __user *)buf, | ||
| 2187 | .iov_len = count }; | ||
| 2188 | |||
| 2189 | BUG_ON(iocb->ki_pos != pos); | ||
| 2190 | |||
| 2191 | down(&inode->i_sem); | ||
| 2192 | ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, | ||
| 2193 | &iocb->ki_pos); | ||
| 2194 | up(&inode->i_sem); | ||
| 2195 | |||
| 2196 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
| 2197 | ssize_t err; | ||
| 2198 | |||
| 2199 | err = sync_page_range(inode, mapping, pos, ret); | ||
| 2200 | if (err < 0) | ||
| 2201 | ret = err; | ||
| 2202 | } | ||
| 2203 | return ret; | ||
| 2204 | } | ||
| 2205 | EXPORT_SYMBOL(generic_file_aio_write); | ||
| 2206 | |||
| 2207 | ssize_t generic_file_write(struct file *file, const char __user *buf, | ||
| 2208 | size_t count, loff_t *ppos) | ||
| 2209 | { | ||
| 2210 | struct address_space *mapping = file->f_mapping; | ||
| 2211 | struct inode *inode = mapping->host; | ||
| 2212 | ssize_t ret; | ||
| 2213 | struct iovec local_iov = { .iov_base = (void __user *)buf, | ||
| 2214 | .iov_len = count }; | ||
| 2215 | |||
| 2216 | down(&inode->i_sem); | ||
| 2217 | ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); | ||
| 2218 | up(&inode->i_sem); | ||
| 2219 | |||
| 2220 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
| 2221 | ssize_t err; | ||
| 2222 | |||
| 2223 | err = sync_page_range(inode, mapping, *ppos - ret, ret); | ||
| 2224 | if (err < 0) | ||
| 2225 | ret = err; | ||
| 2226 | } | ||
| 2227 | return ret; | ||
| 2228 | } | ||
| 2229 | EXPORT_SYMBOL(generic_file_write); | ||
| 2230 | |||
| 2231 | ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, | ||
| 2232 | unsigned long nr_segs, loff_t *ppos) | ||
| 2233 | { | ||
| 2234 | struct kiocb kiocb; | ||
| 2235 | ssize_t ret; | ||
| 2236 | |||
| 2237 | init_sync_kiocb(&kiocb, filp); | ||
| 2238 | ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos); | ||
| 2239 | if (-EIOCBQUEUED == ret) | ||
| 2240 | ret = wait_on_sync_kiocb(&kiocb); | ||
| 2241 | return ret; | ||
| 2242 | } | ||
| 2243 | EXPORT_SYMBOL(generic_file_readv); | ||
| 2244 | |||
| 2245 | ssize_t generic_file_writev(struct file *file, const struct iovec *iov, | ||
| 2246 | unsigned long nr_segs, loff_t *ppos) | ||
| 2247 | { | ||
| 2248 | struct address_space *mapping = file->f_mapping; | ||
| 2249 | struct inode *inode = mapping->host; | ||
| 2250 | ssize_t ret; | ||
| 2251 | |||
| 2252 | down(&inode->i_sem); | ||
| 2253 | ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); | ||
| 2254 | up(&inode->i_sem); | ||
| 2255 | |||
| 2256 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
| 2257 | int err; | ||
| 2258 | |||
| 2259 | err = sync_page_range(inode, mapping, *ppos - ret, ret); | ||
| 2260 | if (err < 0) | ||
| 2261 | ret = err; | ||
| 2262 | } | ||
| 2263 | return ret; | ||
| 2264 | } | ||
| 2265 | EXPORT_SYMBOL(generic_file_writev); | ||
| 2266 | |||
| 2267 | /* | ||
| 2268 | * Called under i_sem for writes to S_ISREG files. Returns -EIO if something | ||
| 2269 | * went wrong during pagecache shootdown. | ||
| 2270 | */ | ||
| 2271 | ssize_t | ||
| 2272 | generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | ||
| 2273 | loff_t offset, unsigned long nr_segs) | ||
| 2274 | { | ||
| 2275 | struct file *file = iocb->ki_filp; | ||
| 2276 | struct address_space *mapping = file->f_mapping; | ||
| 2277 | ssize_t retval; | ||
| 2278 | size_t write_len = 0; | ||
| 2279 | |||
| 2280 | /* | ||
| 2281 | * If it's a write, unmap all mmappings of the file up-front. This | ||
| 2282 | * will cause any pte dirty bits to be propagated into the pageframes | ||
| 2283 | * for the subsequent filemap_write_and_wait(). | ||
| 2284 | */ | ||
| 2285 | if (rw == WRITE) { | ||
| 2286 | write_len = iov_length(iov, nr_segs); | ||
| 2287 | if (mapping_mapped(mapping)) | ||
| 2288 | unmap_mapping_range(mapping, offset, write_len, 0); | ||
| 2289 | } | ||
| 2290 | |||
| 2291 | retval = filemap_write_and_wait(mapping); | ||
| 2292 | if (retval == 0) { | ||
| 2293 | retval = mapping->a_ops->direct_IO(rw, iocb, iov, | ||
| 2294 | offset, nr_segs); | ||
| 2295 | if (rw == WRITE && mapping->nrpages) { | ||
| 2296 | pgoff_t end = (offset + write_len - 1) | ||
| 2297 | >> PAGE_CACHE_SHIFT; | ||
| 2298 | int err = invalidate_inode_pages2_range(mapping, | ||
| 2299 | offset >> PAGE_CACHE_SHIFT, end); | ||
| 2300 | if (err) | ||
| 2301 | retval = err; | ||
| 2302 | } | ||
| 2303 | } | ||
| 2304 | return retval; | ||
| 2305 | } | ||
| 2306 | EXPORT_SYMBOL_GPL(generic_file_direct_IO); | ||
diff --git a/mm/fremap.c b/mm/fremap.c new file mode 100644 index 000000000000..3235fb77c133 --- /dev/null +++ b/mm/fremap.c | |||
| @@ -0,0 +1,256 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/fremap.c | ||
| 3 | * | ||
| 4 | * Explicit pagetable population and nonlinear (random) mappings support. | ||
| 5 | * | ||
| 6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/mm.h> | ||
| 10 | #include <linux/swap.h> | ||
| 11 | #include <linux/file.h> | ||
| 12 | #include <linux/mman.h> | ||
| 13 | #include <linux/pagemap.h> | ||
| 14 | #include <linux/swapops.h> | ||
| 15 | #include <linux/rmap.h> | ||
| 16 | #include <linux/module.h> | ||
| 17 | #include <linux/syscalls.h> | ||
| 18 | |||
| 19 | #include <asm/mmu_context.h> | ||
| 20 | #include <asm/cacheflush.h> | ||
| 21 | #include <asm/tlbflush.h> | ||
| 22 | |||
| 23 | static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 24 | unsigned long addr, pte_t *ptep) | ||
| 25 | { | ||
| 26 | pte_t pte = *ptep; | ||
| 27 | |||
| 28 | if (pte_none(pte)) | ||
| 29 | return; | ||
| 30 | if (pte_present(pte)) { | ||
| 31 | unsigned long pfn = pte_pfn(pte); | ||
| 32 | |||
| 33 | flush_cache_page(vma, addr, pfn); | ||
| 34 | pte = ptep_clear_flush(vma, addr, ptep); | ||
| 35 | if (pfn_valid(pfn)) { | ||
| 36 | struct page *page = pfn_to_page(pfn); | ||
| 37 | if (!PageReserved(page)) { | ||
| 38 | if (pte_dirty(pte)) | ||
| 39 | set_page_dirty(page); | ||
| 40 | page_remove_rmap(page); | ||
| 41 | page_cache_release(page); | ||
| 42 | dec_mm_counter(mm, rss); | ||
| 43 | } | ||
| 44 | } | ||
| 45 | } else { | ||
| 46 | if (!pte_file(pte)) | ||
| 47 | free_swap_and_cache(pte_to_swp_entry(pte)); | ||
| 48 | pte_clear(mm, addr, ptep); | ||
| 49 | } | ||
| 50 | } | ||
| 51 | |||
| 52 | /* | ||
| 53 | * Install a file page to a given virtual memory address, release any | ||
| 54 | * previously existing mapping. | ||
| 55 | */ | ||
| 56 | int install_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 57 | unsigned long addr, struct page *page, pgprot_t prot) | ||
| 58 | { | ||
| 59 | struct inode *inode; | ||
| 60 | pgoff_t size; | ||
| 61 | int err = -ENOMEM; | ||
| 62 | pte_t *pte; | ||
| 63 | pmd_t *pmd; | ||
| 64 | pud_t *pud; | ||
| 65 | pgd_t *pgd; | ||
| 66 | pte_t pte_val; | ||
| 67 | |||
| 68 | pgd = pgd_offset(mm, addr); | ||
| 69 | spin_lock(&mm->page_table_lock); | ||
| 70 | |||
| 71 | pud = pud_alloc(mm, pgd, addr); | ||
| 72 | if (!pud) | ||
| 73 | goto err_unlock; | ||
| 74 | |||
| 75 | pmd = pmd_alloc(mm, pud, addr); | ||
| 76 | if (!pmd) | ||
| 77 | goto err_unlock; | ||
| 78 | |||
| 79 | pte = pte_alloc_map(mm, pmd, addr); | ||
| 80 | if (!pte) | ||
| 81 | goto err_unlock; | ||
| 82 | |||
| 83 | /* | ||
| 84 | * This page may have been truncated. Tell the | ||
| 85 | * caller about it. | ||
| 86 | */ | ||
| 87 | err = -EINVAL; | ||
| 88 | inode = vma->vm_file->f_mapping->host; | ||
| 89 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 90 | if (!page->mapping || page->index >= size) | ||
| 91 | goto err_unlock; | ||
| 92 | |||
| 93 | zap_pte(mm, vma, addr, pte); | ||
| 94 | |||
| 95 | inc_mm_counter(mm,rss); | ||
| 96 | flush_icache_page(vma, page); | ||
| 97 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | ||
| 98 | page_add_file_rmap(page); | ||
| 99 | pte_val = *pte; | ||
| 100 | pte_unmap(pte); | ||
| 101 | update_mmu_cache(vma, addr, pte_val); | ||
| 102 | |||
| 103 | err = 0; | ||
| 104 | err_unlock: | ||
| 105 | spin_unlock(&mm->page_table_lock); | ||
| 106 | return err; | ||
| 107 | } | ||
| 108 | EXPORT_SYMBOL(install_page); | ||
| 109 | |||
| 110 | |||
| 111 | /* | ||
| 112 | * Install a file pte to a given virtual memory address, release any | ||
| 113 | * previously existing mapping. | ||
| 114 | */ | ||
| 115 | int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 116 | unsigned long addr, unsigned long pgoff, pgprot_t prot) | ||
| 117 | { | ||
| 118 | int err = -ENOMEM; | ||
| 119 | pte_t *pte; | ||
| 120 | pmd_t *pmd; | ||
| 121 | pud_t *pud; | ||
| 122 | pgd_t *pgd; | ||
| 123 | pte_t pte_val; | ||
| 124 | |||
| 125 | pgd = pgd_offset(mm, addr); | ||
| 126 | spin_lock(&mm->page_table_lock); | ||
| 127 | |||
| 128 | pud = pud_alloc(mm, pgd, addr); | ||
| 129 | if (!pud) | ||
| 130 | goto err_unlock; | ||
| 131 | |||
| 132 | pmd = pmd_alloc(mm, pud, addr); | ||
| 133 | if (!pmd) | ||
| 134 | goto err_unlock; | ||
| 135 | |||
| 136 | pte = pte_alloc_map(mm, pmd, addr); | ||
| 137 | if (!pte) | ||
| 138 | goto err_unlock; | ||
| 139 | |||
| 140 | zap_pte(mm, vma, addr, pte); | ||
| 141 | |||
| 142 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); | ||
| 143 | pte_val = *pte; | ||
| 144 | pte_unmap(pte); | ||
| 145 | update_mmu_cache(vma, addr, pte_val); | ||
| 146 | spin_unlock(&mm->page_table_lock); | ||
| 147 | return 0; | ||
| 148 | |||
| 149 | err_unlock: | ||
| 150 | spin_unlock(&mm->page_table_lock); | ||
| 151 | return err; | ||
| 152 | } | ||
| 153 | |||
| 154 | |||
| 155 | /*** | ||
| 156 | * sys_remap_file_pages - remap arbitrary pages of a shared backing store | ||
| 157 | * file within an existing vma. | ||
| 158 | * @start: start of the remapped virtual memory range | ||
| 159 | * @size: size of the remapped virtual memory range | ||
| 160 | * @prot: new protection bits of the range | ||
| 161 | * @pgoff: to be mapped page of the backing store file | ||
| 162 | * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. | ||
| 163 | * | ||
| 164 | * this syscall works purely via pagetables, so it's the most efficient | ||
| 165 | * way to map the same (large) file into a given virtual window. Unlike | ||
| 166 | * mmap()/mremap() it does not create any new vmas. The new mappings are | ||
| 167 | * also safe across swapout. | ||
| 168 | * | ||
| 169 | * NOTE: the 'prot' parameter right now is ignored, and the vma's default | ||
| 170 | * protection is used. Arbitrary protections might be implemented in the | ||
| 171 | * future. | ||
| 172 | */ | ||
| 173 | asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, | ||
| 174 | unsigned long __prot, unsigned long pgoff, unsigned long flags) | ||
| 175 | { | ||
| 176 | struct mm_struct *mm = current->mm; | ||
| 177 | struct address_space *mapping; | ||
| 178 | unsigned long end = start + size; | ||
| 179 | struct vm_area_struct *vma; | ||
| 180 | int err = -EINVAL; | ||
| 181 | int has_write_lock = 0; | ||
| 182 | |||
| 183 | if (__prot) | ||
| 184 | return err; | ||
| 185 | /* | ||
| 186 | * Sanitize the syscall parameters: | ||
| 187 | */ | ||
| 188 | start = start & PAGE_MASK; | ||
| 189 | size = size & PAGE_MASK; | ||
| 190 | |||
| 191 | /* Does the address range wrap, or is the span zero-sized? */ | ||
| 192 | if (start + size <= start) | ||
| 193 | return err; | ||
| 194 | |||
| 195 | /* Can we represent this offset inside this architecture's pte's? */ | ||
| 196 | #if PTE_FILE_MAX_BITS < BITS_PER_LONG | ||
| 197 | if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) | ||
| 198 | return err; | ||
| 199 | #endif | ||
| 200 | |||
| 201 | /* We need down_write() to change vma->vm_flags. */ | ||
| 202 | down_read(&mm->mmap_sem); | ||
| 203 | retry: | ||
| 204 | vma = find_vma(mm, start); | ||
| 205 | |||
| 206 | /* | ||
| 207 | * Make sure the vma is shared, that it supports prefaulting, | ||
| 208 | * and that the remapped range is valid and fully within | ||
| 209 | * the single existing vma. vm_private_data is used as a | ||
| 210 | * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED | ||
| 211 | * or VM_LOCKED, but VM_LOCKED could be revoked later on). | ||
| 212 | */ | ||
| 213 | if (vma && (vma->vm_flags & VM_SHARED) && | ||
| 214 | (!vma->vm_private_data || | ||
| 215 | (vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) && | ||
| 216 | vma->vm_ops && vma->vm_ops->populate && | ||
| 217 | end > start && start >= vma->vm_start && | ||
| 218 | end <= vma->vm_end) { | ||
| 219 | |||
| 220 | /* Must set VM_NONLINEAR before any pages are populated. */ | ||
| 221 | if (pgoff != linear_page_index(vma, start) && | ||
| 222 | !(vma->vm_flags & VM_NONLINEAR)) { | ||
| 223 | if (!has_write_lock) { | ||
| 224 | up_read(&mm->mmap_sem); | ||
| 225 | down_write(&mm->mmap_sem); | ||
| 226 | has_write_lock = 1; | ||
| 227 | goto retry; | ||
| 228 | } | ||
| 229 | mapping = vma->vm_file->f_mapping; | ||
| 230 | spin_lock(&mapping->i_mmap_lock); | ||
| 231 | flush_dcache_mmap_lock(mapping); | ||
| 232 | vma->vm_flags |= VM_NONLINEAR; | ||
| 233 | vma_prio_tree_remove(vma, &mapping->i_mmap); | ||
| 234 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | ||
| 235 | flush_dcache_mmap_unlock(mapping); | ||
| 236 | spin_unlock(&mapping->i_mmap_lock); | ||
| 237 | } | ||
| 238 | |||
| 239 | err = vma->vm_ops->populate(vma, start, size, | ||
| 240 | vma->vm_page_prot, | ||
| 241 | pgoff, flags & MAP_NONBLOCK); | ||
| 242 | |||
| 243 | /* | ||
| 244 | * We can't clear VM_NONLINEAR because we'd have to do | ||
| 245 | * it after ->populate completes, and that would prevent | ||
| 246 | * downgrading the lock. (Locks can't be upgraded). | ||
| 247 | */ | ||
| 248 | } | ||
| 249 | if (likely(!has_write_lock)) | ||
| 250 | up_read(&mm->mmap_sem); | ||
| 251 | else | ||
| 252 | up_write(&mm->mmap_sem); | ||
| 253 | |||
| 254 | return err; | ||
| 255 | } | ||
| 256 | |||
diff --git a/mm/highmem.c b/mm/highmem.c new file mode 100644 index 000000000000..d01276506b00 --- /dev/null +++ b/mm/highmem.c | |||
| @@ -0,0 +1,607 @@ | |||
| 1 | /* | ||
| 2 | * High memory handling common code and variables. | ||
| 3 | * | ||
| 4 | * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de | ||
| 5 | * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de | ||
| 6 | * | ||
| 7 | * | ||
| 8 | * Redesigned the x86 32-bit VM architecture to deal with | ||
| 9 | * 64-bit physical space. With current x86 CPUs this | ||
| 10 | * means up to 64 Gigabytes physical RAM. | ||
| 11 | * | ||
| 12 | * Rewrote high memory support to move the page cache into | ||
| 13 | * high memory. Implemented permanent (schedulable) kmaps | ||
| 14 | * based on Linus' idea. | ||
| 15 | * | ||
| 16 | * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include <linux/mm.h> | ||
| 20 | #include <linux/module.h> | ||
| 21 | #include <linux/swap.h> | ||
| 22 | #include <linux/bio.h> | ||
| 23 | #include <linux/pagemap.h> | ||
| 24 | #include <linux/mempool.h> | ||
| 25 | #include <linux/blkdev.h> | ||
| 26 | #include <linux/init.h> | ||
| 27 | #include <linux/hash.h> | ||
| 28 | #include <linux/highmem.h> | ||
| 29 | #include <asm/tlbflush.h> | ||
| 30 | |||
| 31 | static mempool_t *page_pool, *isa_page_pool; | ||
| 32 | |||
| 33 | static void *page_pool_alloc(unsigned int __nocast gfp_mask, void *data) | ||
| 34 | { | ||
| 35 | unsigned int gfp = gfp_mask | (unsigned int) (long) data; | ||
| 36 | |||
| 37 | return alloc_page(gfp); | ||
| 38 | } | ||
| 39 | |||
/* mempool free callback: release a page handed out by page_pool_alloc(). */
static void page_pool_free(void *page, void *data)
{
	struct page *pg = page;

	__free_page(pg);
}
| 44 | |||
| 45 | /* | ||
| 46 | * Virtual_count is not a pure "count". | ||
| 47 | * 0 means that it is not mapped, and has not been mapped | ||
| 48 | * since a TLB flush - it is usable. | ||
| 49 | * 1 means that there are no users, but it has been mapped | ||
| 50 | * since the last TLB flush - so we can't use it. | ||
| 51 | * n means that there are (n-1) current users of it. | ||
| 52 | */ | ||
| 53 | #ifdef CONFIG_HIGHMEM | ||
| 54 | static int pkmap_count[LAST_PKMAP]; | ||
| 55 | static unsigned int last_pkmap_nr; | ||
| 56 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); | ||
| 57 | |||
| 58 | pte_t * pkmap_page_table; | ||
| 59 | |||
| 60 | static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); | ||
| 61 | |||
| 62 | static void flush_all_zero_pkmaps(void) | ||
| 63 | { | ||
| 64 | int i; | ||
| 65 | |||
| 66 | flush_cache_kmaps(); | ||
| 67 | |||
| 68 | for (i = 0; i < LAST_PKMAP; i++) { | ||
| 69 | struct page *page; | ||
| 70 | |||
| 71 | /* | ||
| 72 | * zero means we don't have anything to do, | ||
| 73 | * >1 means that it is still in use. Only | ||
| 74 | * a count of 1 means that it is free but | ||
| 75 | * needs to be unmapped | ||
| 76 | */ | ||
| 77 | if (pkmap_count[i] != 1) | ||
| 78 | continue; | ||
| 79 | pkmap_count[i] = 0; | ||
| 80 | |||
| 81 | /* sanity check */ | ||
| 82 | if (pte_none(pkmap_page_table[i])) | ||
| 83 | BUG(); | ||
| 84 | |||
| 85 | /* | ||
| 86 | * Don't need an atomic fetch-and-clear op here; | ||
| 87 | * no-one has the page mapped, and cannot get at | ||
| 88 | * its virtual address (and hence PTE) without first | ||
| 89 | * getting the kmap_lock (which is held here). | ||
| 90 | * So no dangers, even with speculative execution. | ||
| 91 | */ | ||
| 92 | page = pte_page(pkmap_page_table[i]); | ||
| 93 | pte_clear(&init_mm, (unsigned long)page_address(page), | ||
| 94 | &pkmap_page_table[i]); | ||
| 95 | |||
| 96 | set_page_address(page, NULL); | ||
| 97 | } | ||
| 98 | flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); | ||
| 99 | } | ||
| 100 | |||
| 101 | static inline unsigned long map_new_virtual(struct page *page) | ||
| 102 | { | ||
| 103 | unsigned long vaddr; | ||
| 104 | int count; | ||
| 105 | |||
| 106 | start: | ||
| 107 | count = LAST_PKMAP; | ||
| 108 | /* Find an empty entry */ | ||
| 109 | for (;;) { | ||
| 110 | last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; | ||
| 111 | if (!last_pkmap_nr) { | ||
| 112 | flush_all_zero_pkmaps(); | ||
| 113 | count = LAST_PKMAP; | ||
| 114 | } | ||
| 115 | if (!pkmap_count[last_pkmap_nr]) | ||
| 116 | break; /* Found a usable entry */ | ||
| 117 | if (--count) | ||
| 118 | continue; | ||
| 119 | |||
| 120 | /* | ||
| 121 | * Sleep for somebody else to unmap their entries | ||
| 122 | */ | ||
| 123 | { | ||
| 124 | DECLARE_WAITQUEUE(wait, current); | ||
| 125 | |||
| 126 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 127 | add_wait_queue(&pkmap_map_wait, &wait); | ||
| 128 | spin_unlock(&kmap_lock); | ||
| 129 | schedule(); | ||
| 130 | remove_wait_queue(&pkmap_map_wait, &wait); | ||
| 131 | spin_lock(&kmap_lock); | ||
| 132 | |||
| 133 | /* Somebody else might have mapped it while we slept */ | ||
| 134 | if (page_address(page)) | ||
| 135 | return (unsigned long)page_address(page); | ||
| 136 | |||
| 137 | /* Re-start */ | ||
| 138 | goto start; | ||
| 139 | } | ||
| 140 | } | ||
| 141 | vaddr = PKMAP_ADDR(last_pkmap_nr); | ||
| 142 | set_pte_at(&init_mm, vaddr, | ||
| 143 | &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); | ||
| 144 | |||
| 145 | pkmap_count[last_pkmap_nr] = 1; | ||
| 146 | set_page_address(page, (void *)vaddr); | ||
| 147 | |||
| 148 | return vaddr; | ||
| 149 | } | ||
| 150 | |||
| 151 | void fastcall *kmap_high(struct page *page) | ||
| 152 | { | ||
| 153 | unsigned long vaddr; | ||
| 154 | |||
| 155 | /* | ||
| 156 | * For highmem pages, we can't trust "virtual" until | ||
| 157 | * after we have the lock. | ||
| 158 | * | ||
| 159 | * We cannot call this from interrupts, as it may block | ||
| 160 | */ | ||
| 161 | spin_lock(&kmap_lock); | ||
| 162 | vaddr = (unsigned long)page_address(page); | ||
| 163 | if (!vaddr) | ||
| 164 | vaddr = map_new_virtual(page); | ||
| 165 | pkmap_count[PKMAP_NR(vaddr)]++; | ||
| 166 | if (pkmap_count[PKMAP_NR(vaddr)] < 2) | ||
| 167 | BUG(); | ||
| 168 | spin_unlock(&kmap_lock); | ||
| 169 | return (void*) vaddr; | ||
| 170 | } | ||
| 171 | |||
| 172 | EXPORT_SYMBOL(kmap_high); | ||
| 173 | |||
| 174 | void fastcall kunmap_high(struct page *page) | ||
| 175 | { | ||
| 176 | unsigned long vaddr; | ||
| 177 | unsigned long nr; | ||
| 178 | int need_wakeup; | ||
| 179 | |||
| 180 | spin_lock(&kmap_lock); | ||
| 181 | vaddr = (unsigned long)page_address(page); | ||
| 182 | if (!vaddr) | ||
| 183 | BUG(); | ||
| 184 | nr = PKMAP_NR(vaddr); | ||
| 185 | |||
| 186 | /* | ||
| 187 | * A count must never go down to zero | ||
| 188 | * without a TLB flush! | ||
| 189 | */ | ||
| 190 | need_wakeup = 0; | ||
| 191 | switch (--pkmap_count[nr]) { | ||
| 192 | case 0: | ||
| 193 | BUG(); | ||
| 194 | case 1: | ||
| 195 | /* | ||
| 196 | * Avoid an unnecessary wake_up() function call. | ||
| 197 | * The common case is pkmap_count[] == 1, but | ||
| 198 | * no waiters. | ||
| 199 | * The tasks queued in the wait-queue are guarded | ||
| 200 | * by both the lock in the wait-queue-head and by | ||
| 201 | * the kmap_lock. As the kmap_lock is held here, | ||
| 202 | * no need for the wait-queue-head's lock. Simply | ||
| 203 | * test if the queue is empty. | ||
| 204 | */ | ||
| 205 | need_wakeup = waitqueue_active(&pkmap_map_wait); | ||
| 206 | } | ||
| 207 | spin_unlock(&kmap_lock); | ||
| 208 | |||
| 209 | /* do wake-up, if needed, race-free outside of the spin lock */ | ||
| 210 | if (need_wakeup) | ||
| 211 | wake_up(&pkmap_map_wait); | ||
| 212 | } | ||
| 213 | |||
| 214 | EXPORT_SYMBOL(kunmap_high); | ||
| 215 | |||
| 216 | #define POOL_SIZE 64 | ||
| 217 | |||
| 218 | static __init int init_emergency_pool(void) | ||
| 219 | { | ||
| 220 | struct sysinfo i; | ||
| 221 | si_meminfo(&i); | ||
| 222 | si_swapinfo(&i); | ||
| 223 | |||
| 224 | if (!i.totalhigh) | ||
| 225 | return 0; | ||
| 226 | |||
| 227 | page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL); | ||
| 228 | if (!page_pool) | ||
| 229 | BUG(); | ||
| 230 | printk("highmem bounce pool size: %d pages\n", POOL_SIZE); | ||
| 231 | |||
| 232 | return 0; | ||
| 233 | } | ||
| 234 | |||
| 235 | __initcall(init_emergency_pool); | ||
| 236 | |||
| 237 | /* | ||
| 238 | * highmem version, map in to vec | ||
| 239 | */ | ||
| 240 | static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) | ||
| 241 | { | ||
| 242 | unsigned long flags; | ||
| 243 | unsigned char *vto; | ||
| 244 | |||
| 245 | local_irq_save(flags); | ||
| 246 | vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); | ||
| 247 | memcpy(vto + to->bv_offset, vfrom, to->bv_len); | ||
| 248 | kunmap_atomic(vto, KM_BOUNCE_READ); | ||
| 249 | local_irq_restore(flags); | ||
| 250 | } | ||
| 251 | |||
| 252 | #else /* CONFIG_HIGHMEM */ | ||
| 253 | |||
/*
 * Non-highmem version: the destination page is reached through
 * page_address(), so a plain memcpy() suffices.
 */
#define bounce_copy_vec(to, vfrom)	\
	memcpy(page_address((to)->bv_page) + (to)->bv_offset, (vfrom), (to)->bv_len)
| 256 | |||
| 257 | #endif | ||
| 258 | |||
| 259 | #define ISA_POOL_SIZE 16 | ||
| 260 | |||
| 261 | /* | ||
| 262 | * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA | ||
| 263 | * as the max address, so check if the pool has already been created. | ||
| 264 | */ | ||
| 265 | int init_emergency_isa_pool(void) | ||
| 266 | { | ||
| 267 | if (isa_page_pool) | ||
| 268 | return 0; | ||
| 269 | |||
| 270 | isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA); | ||
| 271 | if (!isa_page_pool) | ||
| 272 | BUG(); | ||
| 273 | |||
| 274 | printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); | ||
| 275 | return 0; | ||
| 276 | } | ||
| 277 | |||
| 278 | /* | ||
| 279 | * Simple bounce buffer support for highmem pages. Depending on the | ||
| 280 | * queue gfp mask set, *to may or may not be a highmem page. kmap it | ||
| 281 | * always, it will do the Right Thing | ||
| 282 | */ | ||
| 283 | static void copy_to_high_bio_irq(struct bio *to, struct bio *from) | ||
| 284 | { | ||
| 285 | unsigned char *vfrom; | ||
| 286 | struct bio_vec *tovec, *fromvec; | ||
| 287 | int i; | ||
| 288 | |||
| 289 | __bio_for_each_segment(tovec, to, i, 0) { | ||
| 290 | fromvec = from->bi_io_vec + i; | ||
| 291 | |||
| 292 | /* | ||
| 293 | * not bounced | ||
| 294 | */ | ||
| 295 | if (tovec->bv_page == fromvec->bv_page) | ||
| 296 | continue; | ||
| 297 | |||
| 298 | /* | ||
| 299 | * fromvec->bv_offset and fromvec->bv_len might have been | ||
| 300 | * modified by the block layer, so use the original copy, | ||
| 301 | * bounce_copy_vec already uses tovec->bv_len | ||
| 302 | */ | ||
| 303 | vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; | ||
| 304 | |||
| 305 | flush_dcache_page(tovec->bv_page); | ||
| 306 | bounce_copy_vec(tovec, vfrom); | ||
| 307 | } | ||
| 308 | } | ||
| 309 | |||
| 310 | static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) | ||
| 311 | { | ||
| 312 | struct bio *bio_orig = bio->bi_private; | ||
| 313 | struct bio_vec *bvec, *org_vec; | ||
| 314 | int i; | ||
| 315 | |||
| 316 | if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) | ||
| 317 | set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); | ||
| 318 | |||
| 319 | /* | ||
| 320 | * free up bounce indirect pages used | ||
| 321 | */ | ||
| 322 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
| 323 | org_vec = bio_orig->bi_io_vec + i; | ||
| 324 | if (bvec->bv_page == org_vec->bv_page) | ||
| 325 | continue; | ||
| 326 | |||
| 327 | mempool_free(bvec->bv_page, pool); | ||
| 328 | } | ||
| 329 | |||
| 330 | bio_endio(bio_orig, bio_orig->bi_size, err); | ||
| 331 | bio_put(bio); | ||
| 332 | } | ||
| 333 | |||
| 334 | static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err) | ||
| 335 | { | ||
| 336 | if (bio->bi_size) | ||
| 337 | return 1; | ||
| 338 | |||
| 339 | bounce_end_io(bio, page_pool, err); | ||
| 340 | return 0; | ||
| 341 | } | ||
| 342 | |||
| 343 | static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err) | ||
| 344 | { | ||
| 345 | if (bio->bi_size) | ||
| 346 | return 1; | ||
| 347 | |||
| 348 | bounce_end_io(bio, isa_page_pool, err); | ||
| 349 | return 0; | ||
| 350 | } | ||
| 351 | |||
| 352 | static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) | ||
| 353 | { | ||
| 354 | struct bio *bio_orig = bio->bi_private; | ||
| 355 | |||
| 356 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
| 357 | copy_to_high_bio_irq(bio_orig, bio); | ||
| 358 | |||
| 359 | bounce_end_io(bio, pool, err); | ||
| 360 | } | ||
| 361 | |||
| 362 | static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err) | ||
| 363 | { | ||
| 364 | if (bio->bi_size) | ||
| 365 | return 1; | ||
| 366 | |||
| 367 | __bounce_end_io_read(bio, page_pool, err); | ||
| 368 | return 0; | ||
| 369 | } | ||
| 370 | |||
| 371 | static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err) | ||
| 372 | { | ||
| 373 | if (bio->bi_size) | ||
| 374 | return 1; | ||
| 375 | |||
| 376 | __bounce_end_io_read(bio, isa_page_pool, err); | ||
| 377 | return 0; | ||
| 378 | } | ||
| 379 | |||
| 380 | static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, | ||
| 381 | mempool_t *pool) | ||
| 382 | { | ||
| 383 | struct page *page; | ||
| 384 | struct bio *bio = NULL; | ||
| 385 | int i, rw = bio_data_dir(*bio_orig); | ||
| 386 | struct bio_vec *to, *from; | ||
| 387 | |||
| 388 | bio_for_each_segment(from, *bio_orig, i) { | ||
| 389 | page = from->bv_page; | ||
| 390 | |||
| 391 | /* | ||
| 392 | * is destination page below bounce pfn? | ||
| 393 | */ | ||
| 394 | if (page_to_pfn(page) < q->bounce_pfn) | ||
| 395 | continue; | ||
| 396 | |||
| 397 | /* | ||
| 398 | * irk, bounce it | ||
| 399 | */ | ||
| 400 | if (!bio) | ||
| 401 | bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt); | ||
| 402 | |||
| 403 | to = bio->bi_io_vec + i; | ||
| 404 | |||
| 405 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); | ||
| 406 | to->bv_len = from->bv_len; | ||
| 407 | to->bv_offset = from->bv_offset; | ||
| 408 | |||
| 409 | if (rw == WRITE) { | ||
| 410 | char *vto, *vfrom; | ||
| 411 | |||
| 412 | flush_dcache_page(from->bv_page); | ||
| 413 | vto = page_address(to->bv_page) + to->bv_offset; | ||
| 414 | vfrom = kmap(from->bv_page) + from->bv_offset; | ||
| 415 | memcpy(vto, vfrom, to->bv_len); | ||
| 416 | kunmap(from->bv_page); | ||
| 417 | } | ||
| 418 | } | ||
| 419 | |||
| 420 | /* | ||
| 421 | * no pages bounced | ||
| 422 | */ | ||
| 423 | if (!bio) | ||
| 424 | return; | ||
| 425 | |||
| 426 | /* | ||
| 427 | * at least one page was bounced, fill in possible non-highmem | ||
| 428 | * pages | ||
| 429 | */ | ||
| 430 | __bio_for_each_segment(from, *bio_orig, i, 0) { | ||
| 431 | to = bio_iovec_idx(bio, i); | ||
| 432 | if (!to->bv_page) { | ||
| 433 | to->bv_page = from->bv_page; | ||
| 434 | to->bv_len = from->bv_len; | ||
| 435 | to->bv_offset = from->bv_offset; | ||
| 436 | } | ||
| 437 | } | ||
| 438 | |||
| 439 | bio->bi_bdev = (*bio_orig)->bi_bdev; | ||
| 440 | bio->bi_flags |= (1 << BIO_BOUNCED); | ||
| 441 | bio->bi_sector = (*bio_orig)->bi_sector; | ||
| 442 | bio->bi_rw = (*bio_orig)->bi_rw; | ||
| 443 | |||
| 444 | bio->bi_vcnt = (*bio_orig)->bi_vcnt; | ||
| 445 | bio->bi_idx = (*bio_orig)->bi_idx; | ||
| 446 | bio->bi_size = (*bio_orig)->bi_size; | ||
| 447 | |||
| 448 | if (pool == page_pool) { | ||
| 449 | bio->bi_end_io = bounce_end_io_write; | ||
| 450 | if (rw == READ) | ||
| 451 | bio->bi_end_io = bounce_end_io_read; | ||
| 452 | } else { | ||
| 453 | bio->bi_end_io = bounce_end_io_write_isa; | ||
| 454 | if (rw == READ) | ||
| 455 | bio->bi_end_io = bounce_end_io_read_isa; | ||
| 456 | } | ||
| 457 | |||
| 458 | bio->bi_private = *bio_orig; | ||
| 459 | *bio_orig = bio; | ||
| 460 | } | ||
| 461 | |||
| 462 | void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) | ||
| 463 | { | ||
| 464 | mempool_t *pool; | ||
| 465 | |||
| 466 | /* | ||
| 467 | * for non-isa bounce case, just check if the bounce pfn is equal | ||
| 468 | * to or bigger than the highest pfn in the system -- in that case, | ||
| 469 | * don't waste time iterating over bio segments | ||
| 470 | */ | ||
| 471 | if (!(q->bounce_gfp & GFP_DMA)) { | ||
| 472 | if (q->bounce_pfn >= blk_max_pfn) | ||
| 473 | return; | ||
| 474 | pool = page_pool; | ||
| 475 | } else { | ||
| 476 | BUG_ON(!isa_page_pool); | ||
| 477 | pool = isa_page_pool; | ||
| 478 | } | ||
| 479 | |||
| 480 | /* | ||
| 481 | * slow path | ||
| 482 | */ | ||
| 483 | __blk_queue_bounce(q, bio_orig, pool); | ||
| 484 | } | ||
| 485 | |||
| 486 | EXPORT_SYMBOL(blk_queue_bounce); | ||
| 487 | |||
| 488 | #if defined(HASHED_PAGE_VIRTUAL) | ||
| 489 | |||
| 490 | #define PA_HASH_ORDER 7 | ||
| 491 | |||
| 492 | /* | ||
| 493 | * Describes one page->virtual association | ||
| 494 | */ | ||
struct page_address_map {
	struct page *page;	/* the page this entry describes */
	void *virtual;		/* virtual address recorded for that page */
	struct list_head list;	/* links the entry into a hash bucket or the freelist */
};
| 500 | |||
| 501 | /* | ||
| 502 | * page_address_map freelist, allocated from page_address_maps. | ||
| 503 | */ | ||
static struct list_head page_address_pool;	/* freelist of unused page_address_map entries */
static spinlock_t pool_lock;			/* protects page_address_pool */
| 506 | |||
| 507 | /* | ||
| 508 | * Hash table bucket | ||
| 509 | */ | ||
/* One bucket per hash value; the table has 1 << PA_HASH_ORDER buckets. */
static struct page_address_slot {
	struct list_head lh;			/* List of page_address_maps */
	spinlock_t lock;			/* Protect this bucket's list */
} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
| 514 | |||
| 515 | static struct page_address_slot *page_slot(struct page *page) | ||
| 516 | { | ||
| 517 | return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; | ||
| 518 | } | ||
| 519 | |||
| 520 | void *page_address(struct page *page) | ||
| 521 | { | ||
| 522 | unsigned long flags; | ||
| 523 | void *ret; | ||
| 524 | struct page_address_slot *pas; | ||
| 525 | |||
| 526 | if (!PageHighMem(page)) | ||
| 527 | return lowmem_page_address(page); | ||
| 528 | |||
| 529 | pas = page_slot(page); | ||
| 530 | ret = NULL; | ||
| 531 | spin_lock_irqsave(&pas->lock, flags); | ||
| 532 | if (!list_empty(&pas->lh)) { | ||
| 533 | struct page_address_map *pam; | ||
| 534 | |||
| 535 | list_for_each_entry(pam, &pas->lh, list) { | ||
| 536 | if (pam->page == page) { | ||
| 537 | ret = pam->virtual; | ||
| 538 | goto done; | ||
| 539 | } | ||
| 540 | } | ||
| 541 | } | ||
| 542 | done: | ||
| 543 | spin_unlock_irqrestore(&pas->lock, flags); | ||
| 544 | return ret; | ||
| 545 | } | ||
| 546 | |||
| 547 | EXPORT_SYMBOL(page_address); | ||
| 548 | |||
| 549 | void set_page_address(struct page *page, void *virtual) | ||
| 550 | { | ||
| 551 | unsigned long flags; | ||
| 552 | struct page_address_slot *pas; | ||
| 553 | struct page_address_map *pam; | ||
| 554 | |||
| 555 | BUG_ON(!PageHighMem(page)); | ||
| 556 | |||
| 557 | pas = page_slot(page); | ||
| 558 | if (virtual) { /* Add */ | ||
| 559 | BUG_ON(list_empty(&page_address_pool)); | ||
| 560 | |||
| 561 | spin_lock_irqsave(&pool_lock, flags); | ||
| 562 | pam = list_entry(page_address_pool.next, | ||
| 563 | struct page_address_map, list); | ||
| 564 | list_del(&pam->list); | ||
| 565 | spin_unlock_irqrestore(&pool_lock, flags); | ||
| 566 | |||
| 567 | pam->page = page; | ||
| 568 | pam->virtual = virtual; | ||
| 569 | |||
| 570 | spin_lock_irqsave(&pas->lock, flags); | ||
| 571 | list_add_tail(&pam->list, &pas->lh); | ||
| 572 | spin_unlock_irqrestore(&pas->lock, flags); | ||
| 573 | } else { /* Remove */ | ||
| 574 | spin_lock_irqsave(&pas->lock, flags); | ||
| 575 | list_for_each_entry(pam, &pas->lh, list) { | ||
| 576 | if (pam->page == page) { | ||
| 577 | list_del(&pam->list); | ||
| 578 | spin_unlock_irqrestore(&pas->lock, flags); | ||
| 579 | spin_lock_irqsave(&pool_lock, flags); | ||
| 580 | list_add_tail(&pam->list, &page_address_pool); | ||
| 581 | spin_unlock_irqrestore(&pool_lock, flags); | ||
| 582 | goto done; | ||
| 583 | } | ||
| 584 | } | ||
| 585 | spin_unlock_irqrestore(&pas->lock, flags); | ||
| 586 | } | ||
| 587 | done: | ||
| 588 | return; | ||
| 589 | } | ||
| 590 | |||
/* Backing storage for the freelist entries; page_address_init() links them all in. */
static struct page_address_map page_address_maps[LAST_PKMAP];
| 592 | |||
| 593 | void __init page_address_init(void) | ||
| 594 | { | ||
| 595 | int i; | ||
| 596 | |||
| 597 | INIT_LIST_HEAD(&page_address_pool); | ||
| 598 | for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) | ||
| 599 | list_add(&page_address_maps[i].list, &page_address_pool); | ||
| 600 | for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { | ||
| 601 | INIT_LIST_HEAD(&page_address_htable[i].lh); | ||
| 602 | spin_lock_init(&page_address_htable[i].lock); | ||
| 603 | } | ||
| 604 | spin_lock_init(&pool_lock); | ||
| 605 | } | ||
| 606 | |||
| 607 | #endif /* defined(HASHED_PAGE_VIRTUAL) */ | ||
diff --git a/mm/hugetlb.c b/mm/hugetlb.c new file mode 100644 index 000000000000..4eb5ae3fbe10 --- /dev/null +++ b/mm/hugetlb.c | |||
| @@ -0,0 +1,260 @@ | |||
| 1 | /* | ||
| 2 | * Generic hugetlb support. | ||
| 3 | * (C) William Irwin, April 2004 | ||
| 4 | */ | ||
| 5 | #include <linux/gfp.h> | ||
| 6 | #include <linux/list.h> | ||
| 7 | #include <linux/init.h> | ||
| 8 | #include <linux/module.h> | ||
| 9 | #include <linux/mm.h> | ||
| 10 | #include <linux/hugetlb.h> | ||
| 11 | #include <linux/sysctl.h> | ||
| 12 | #include <linux/highmem.h> | ||
| 13 | #include <linux/nodemask.h> | ||
| 14 | |||
/* Constant bounds; presumably referenced as sysctl min/max values -- confirm at the users. */
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
/* Pool-wide counters: huge pages owned by the pool, and those currently free. */
static unsigned long nr_huge_pages, free_huge_pages;
/* Requested pool size; set from "hugepages=" and by the sysctl handler. */
unsigned long max_huge_pages;
/* Per-node free lists and per-node copies of the two counters above. */
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
/* Held around free-list manipulation in this file. */
static DEFINE_SPINLOCK(hugetlb_lock);
| 22 | |||
| 23 | static void enqueue_huge_page(struct page *page) | ||
| 24 | { | ||
| 25 | int nid = page_to_nid(page); | ||
| 26 | list_add(&page->lru, &hugepage_freelists[nid]); | ||
| 27 | free_huge_pages++; | ||
| 28 | free_huge_pages_node[nid]++; | ||
| 29 | } | ||
| 30 | |||
| 31 | static struct page *dequeue_huge_page(void) | ||
| 32 | { | ||
| 33 | int nid = numa_node_id(); | ||
| 34 | struct page *page = NULL; | ||
| 35 | |||
| 36 | if (list_empty(&hugepage_freelists[nid])) { | ||
| 37 | for (nid = 0; nid < MAX_NUMNODES; ++nid) | ||
| 38 | if (!list_empty(&hugepage_freelists[nid])) | ||
| 39 | break; | ||
| 40 | } | ||
| 41 | if (nid >= 0 && nid < MAX_NUMNODES && | ||
| 42 | !list_empty(&hugepage_freelists[nid])) { | ||
| 43 | page = list_entry(hugepage_freelists[nid].next, | ||
| 44 | struct page, lru); | ||
| 45 | list_del(&page->lru); | ||
| 46 | free_huge_pages--; | ||
| 47 | free_huge_pages_node[nid]--; | ||
| 48 | } | ||
| 49 | return page; | ||
| 50 | } | ||
| 51 | |||
| 52 | static struct page *alloc_fresh_huge_page(void) | ||
| 53 | { | ||
| 54 | static int nid = 0; | ||
| 55 | struct page *page; | ||
| 56 | page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, | ||
| 57 | HUGETLB_PAGE_ORDER); | ||
| 58 | nid = (nid + 1) % num_online_nodes(); | ||
| 59 | if (page) { | ||
| 60 | nr_huge_pages++; | ||
| 61 | nr_huge_pages_node[page_to_nid(page)]++; | ||
| 62 | } | ||
| 63 | return page; | ||
| 64 | } | ||
| 65 | |||
| 66 | void free_huge_page(struct page *page) | ||
| 67 | { | ||
| 68 | BUG_ON(page_count(page)); | ||
| 69 | |||
| 70 | INIT_LIST_HEAD(&page->lru); | ||
| 71 | page[1].mapping = NULL; | ||
| 72 | |||
| 73 | spin_lock(&hugetlb_lock); | ||
| 74 | enqueue_huge_page(page); | ||
| 75 | spin_unlock(&hugetlb_lock); | ||
| 76 | } | ||
| 77 | |||
| 78 | struct page *alloc_huge_page(void) | ||
| 79 | { | ||
| 80 | struct page *page; | ||
| 81 | int i; | ||
| 82 | |||
| 83 | spin_lock(&hugetlb_lock); | ||
| 84 | page = dequeue_huge_page(); | ||
| 85 | if (!page) { | ||
| 86 | spin_unlock(&hugetlb_lock); | ||
| 87 | return NULL; | ||
| 88 | } | ||
| 89 | spin_unlock(&hugetlb_lock); | ||
| 90 | set_page_count(page, 1); | ||
| 91 | page[1].mapping = (void *)free_huge_page; | ||
| 92 | for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) | ||
| 93 | clear_highpage(&page[i]); | ||
| 94 | return page; | ||
| 95 | } | ||
| 96 | |||
| 97 | static int __init hugetlb_init(void) | ||
| 98 | { | ||
| 99 | unsigned long i; | ||
| 100 | struct page *page; | ||
| 101 | |||
| 102 | for (i = 0; i < MAX_NUMNODES; ++i) | ||
| 103 | INIT_LIST_HEAD(&hugepage_freelists[i]); | ||
| 104 | |||
| 105 | for (i = 0; i < max_huge_pages; ++i) { | ||
| 106 | page = alloc_fresh_huge_page(); | ||
| 107 | if (!page) | ||
| 108 | break; | ||
| 109 | spin_lock(&hugetlb_lock); | ||
| 110 | enqueue_huge_page(page); | ||
| 111 | spin_unlock(&hugetlb_lock); | ||
| 112 | } | ||
| 113 | max_huge_pages = free_huge_pages = nr_huge_pages = i; | ||
| 114 | printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); | ||
| 115 | return 0; | ||
| 116 | } | ||
| 117 | module_init(hugetlb_init); | ||
| 118 | |||
| 119 | static int __init hugetlb_setup(char *s) | ||
| 120 | { | ||
| 121 | if (sscanf(s, "%lu", &max_huge_pages) <= 0) | ||
| 122 | max_huge_pages = 0; | ||
| 123 | return 1; | ||
| 124 | } | ||
| 125 | __setup("hugepages=", hugetlb_setup); | ||
| 126 | |||
| 127 | #ifdef CONFIG_SYSCTL | ||
| 128 | static void update_and_free_page(struct page *page) | ||
| 129 | { | ||
| 130 | int i; | ||
| 131 | nr_huge_pages--; | ||
| 132 | nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--; | ||
| 133 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | ||
| 134 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | ||
| 135 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | ||
| 136 | 1 << PG_private | 1<< PG_writeback); | ||
| 137 | set_page_count(&page[i], 0); | ||
| 138 | } | ||
| 139 | set_page_count(page, 1); | ||
| 140 | __free_pages(page, HUGETLB_PAGE_ORDER); | ||
| 141 | } | ||
| 142 | |||
| 143 | #ifdef CONFIG_HIGHMEM | ||
| 144 | static void try_to_free_low(unsigned long count) | ||
| 145 | { | ||
| 146 | int i, nid; | ||
| 147 | for (i = 0; i < MAX_NUMNODES; ++i) { | ||
| 148 | struct page *page, *next; | ||
| 149 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { | ||
| 150 | if (PageHighMem(page)) | ||
| 151 | continue; | ||
| 152 | list_del(&page->lru); | ||
| 153 | update_and_free_page(page); | ||
| 154 | nid = page_zone(page)->zone_pgdat->node_id; | ||
| 155 | free_huge_pages--; | ||
| 156 | free_huge_pages_node[nid]--; | ||
| 157 | if (count >= nr_huge_pages) | ||
| 158 | return; | ||
| 159 | } | ||
| 160 | } | ||
| 161 | } | ||
| 162 | #else | ||
/* Without CONFIG_HIGHMEM there is nothing to prefer: intentionally empty. */
static inline void try_to_free_low(unsigned long count)
{
}
| 166 | #endif | ||
| 167 | |||
| 168 | static unsigned long set_max_huge_pages(unsigned long count) | ||
| 169 | { | ||
| 170 | while (count > nr_huge_pages) { | ||
| 171 | struct page *page = alloc_fresh_huge_page(); | ||
| 172 | if (!page) | ||
| 173 | return nr_huge_pages; | ||
| 174 | spin_lock(&hugetlb_lock); | ||
| 175 | enqueue_huge_page(page); | ||
| 176 | spin_unlock(&hugetlb_lock); | ||
| 177 | } | ||
| 178 | if (count >= nr_huge_pages) | ||
| 179 | return nr_huge_pages; | ||
| 180 | |||
| 181 | spin_lock(&hugetlb_lock); | ||
| 182 | try_to_free_low(count); | ||
| 183 | while (count < nr_huge_pages) { | ||
| 184 | struct page *page = dequeue_huge_page(); | ||
| 185 | if (!page) | ||
| 186 | break; | ||
| 187 | update_and_free_page(page); | ||
| 188 | } | ||
| 189 | spin_unlock(&hugetlb_lock); | ||
| 190 | return nr_huge_pages; | ||
| 191 | } | ||
| 192 | |||
| 193 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | ||
| 194 | struct file *file, void __user *buffer, | ||
| 195 | size_t *length, loff_t *ppos) | ||
| 196 | { | ||
| 197 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | ||
| 198 | max_huge_pages = set_max_huge_pages(max_huge_pages); | ||
| 199 | return 0; | ||
| 200 | } | ||
| 201 | #endif /* CONFIG_SYSCTL */ | ||
| 202 | |||
| 203 | int hugetlb_report_meminfo(char *buf) | ||
| 204 | { | ||
| 205 | return sprintf(buf, | ||
| 206 | "HugePages_Total: %5lu\n" | ||
| 207 | "HugePages_Free: %5lu\n" | ||
| 208 | "Hugepagesize: %5lu kB\n", | ||
| 209 | nr_huge_pages, | ||
| 210 | free_huge_pages, | ||
| 211 | HPAGE_SIZE/1024); | ||
| 212 | } | ||
| 213 | |||
| 214 | int hugetlb_report_node_meminfo(int nid, char *buf) | ||
| 215 | { | ||
| 216 | return sprintf(buf, | ||
| 217 | "Node %d HugePages_Total: %5u\n" | ||
| 218 | "Node %d HugePages_Free: %5u\n", | ||
| 219 | nid, nr_huge_pages_node[nid], | ||
| 220 | nid, free_huge_pages_node[nid]); | ||
| 221 | } | ||
| 222 | |||
| 223 | int is_hugepage_mem_enough(size_t size) | ||
| 224 | { | ||
| 225 | return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; | ||
| 226 | } | ||
| 227 | |||
| 228 | /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ | ||
| 229 | unsigned long hugetlb_total_pages(void) | ||
| 230 | { | ||
| 231 | return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); | ||
| 232 | } | ||
| 233 | EXPORT_SYMBOL(hugetlb_total_pages); | ||
| 234 | |||
| 235 | /* | ||
| 236 | * We cannot handle pagefaults against hugetlb pages at all. They cause | ||
| 237 | * handle_mm_fault() to try to instantiate regular-sized pages in the | ||
| 238 | * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get | ||
| 239 | * this far. | ||
| 240 | */ | ||
| 241 | static struct page *hugetlb_nopage(struct vm_area_struct *vma, | ||
| 242 | unsigned long address, int *unused) | ||
| 243 | { | ||
| 244 | BUG(); | ||
| 245 | return NULL; | ||
| 246 | } | ||
| 247 | |||
/* VMA operations for hugetlb mappings; the only hook set is .nopage, which BUG()s. */
struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};
| 251 | |||
| 252 | void zap_hugepage_range(struct vm_area_struct *vma, | ||
| 253 | unsigned long start, unsigned long length) | ||
| 254 | { | ||
| 255 | struct mm_struct *mm = vma->vm_mm; | ||
| 256 | |||
| 257 | spin_lock(&mm->page_table_lock); | ||
| 258 | unmap_hugepage_range(vma, start, start + length); | ||
| 259 | spin_unlock(&mm->page_table_lock); | ||
| 260 | } | ||
diff --git a/mm/internal.h b/mm/internal.h new file mode 100644 index 000000000000..6bf134e8fb3d --- /dev/null +++ b/mm/internal.h | |||
| @@ -0,0 +1,13 @@ | |||
| 1 | /* internal.h: mm/ internal definitions | ||
| 2 | * | ||
| 3 | * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. | ||
| 4 | * Written by David Howells (dhowells@redhat.com) | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public License | ||
| 8 | * as published by the Free Software Foundation; either version | ||
| 9 | * 2 of the License, or (at your option) any later version. | ||
| 10 | */ | ||
| 11 | |||
/* page_alloc.c */
/* NOTE(review): the exact semantics live in page_alloc.c; judging by the name only, this sets up reference counts for an order-sized allocation -- confirm there. */
extern void set_page_refs(struct page *page, int order);
diff --git a/mm/madvise.c b/mm/madvise.c new file mode 100644 index 000000000000..944b5e52d812 --- /dev/null +++ b/mm/madvise.c | |||
| @@ -0,0 +1,242 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/madvise.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1999 Linus Torvalds | ||
| 5 | * Copyright (C) 2002 Christoph Hellwig | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <linux/mman.h> | ||
| 9 | #include <linux/pagemap.h> | ||
| 10 | #include <linux/syscalls.h> | ||
| 11 | #include <linux/hugetlb.h> | ||
| 12 | |||
| 13 | /* | ||
| 14 | * We can potentially split a vm area into separate | ||
| 15 | * areas, each area with its own behavior. | ||
| 16 | */ | ||
| 17 | static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, | ||
| 18 | unsigned long end, int behavior) | ||
| 19 | { | ||
| 20 | struct mm_struct * mm = vma->vm_mm; | ||
| 21 | int error = 0; | ||
| 22 | |||
| 23 | if (start != vma->vm_start) { | ||
| 24 | error = split_vma(mm, vma, start, 1); | ||
| 25 | if (error) | ||
| 26 | goto out; | ||
| 27 | } | ||
| 28 | |||
| 29 | if (end != vma->vm_end) { | ||
| 30 | error = split_vma(mm, vma, end, 0); | ||
| 31 | if (error) | ||
| 32 | goto out; | ||
| 33 | } | ||
| 34 | |||
| 35 | /* | ||
| 36 | * vm_flags is protected by the mmap_sem held in write mode. | ||
| 37 | */ | ||
| 38 | VM_ClearReadHint(vma); | ||
| 39 | |||
| 40 | switch (behavior) { | ||
| 41 | case MADV_SEQUENTIAL: | ||
| 42 | vma->vm_flags |= VM_SEQ_READ; | ||
| 43 | break; | ||
| 44 | case MADV_RANDOM: | ||
| 45 | vma->vm_flags |= VM_RAND_READ; | ||
| 46 | break; | ||
| 47 | default: | ||
| 48 | break; | ||
| 49 | } | ||
| 50 | |||
| 51 | out: | ||
| 52 | if (error == -ENOMEM) | ||
| 53 | error = -EAGAIN; | ||
| 54 | return error; | ||
| 55 | } | ||
| 56 | |||
| 57 | /* | ||
| 58 | * Schedule all required I/O operations. Do not wait for completion. | ||
| 59 | */ | ||
| 60 | static long madvise_willneed(struct vm_area_struct * vma, | ||
| 61 | unsigned long start, unsigned long end) | ||
| 62 | { | ||
| 63 | struct file *file = vma->vm_file; | ||
| 64 | |||
| 65 | if (!file) | ||
| 66 | return -EBADF; | ||
| 67 | |||
| 68 | start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
| 69 | if (end > vma->vm_end) | ||
| 70 | end = vma->vm_end; | ||
| 71 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
| 72 | |||
| 73 | force_page_cache_readahead(file->f_mapping, | ||
| 74 | file, start, max_sane_readahead(end - start)); | ||
| 75 | return 0; | ||
| 76 | } | ||
| 77 | |||
| 78 | /* | ||
| 79 | * Application no longer needs these pages. If the pages are dirty, | ||
| 80 | * it's OK to just throw them away. The app will be more careful about | ||
| 81 | * data it wants to keep. Be sure to free swap resources too. The | ||
| 82 | * zap_page_range call sets things up for refill_inactive to actually free | ||
| 83 | * these pages later if no one else has touched them in the meantime, | ||
| 84 | * although we could add these pages to a global reuse list for | ||
| 85 | * refill_inactive to pick up before reclaiming other pages. | ||
| 86 | * | ||
| 87 | * NB: This interface discards data rather than pushes it out to swap, | ||
| 88 | * as some implementations do. This has performance implications for | ||
| 89 | * applications like large transactional databases which want to discard | ||
| 90 | * pages in anonymous maps after committing to backing store the data | ||
| 91 | * that was kept in them. There is no reason to write this data out to | ||
| 92 | * the swap area if the application is discarding it. | ||
| 93 | * | ||
| 94 | * An interface that causes the system to free clean pages and flush | ||
| 95 | * dirty pages is already available as msync(MS_INVALIDATE). | ||
| 96 | */ | ||
| 97 | static long madvise_dontneed(struct vm_area_struct * vma, | ||
| 98 | unsigned long start, unsigned long end) | ||
| 99 | { | ||
| 100 | if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) | ||
| 101 | return -EINVAL; | ||
| 102 | |||
| 103 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) { | ||
| 104 | struct zap_details details = { | ||
| 105 | .nonlinear_vma = vma, | ||
| 106 | .last_index = ULONG_MAX, | ||
| 107 | }; | ||
| 108 | zap_page_range(vma, start, end - start, &details); | ||
| 109 | } else | ||
| 110 | zap_page_range(vma, start, end - start, NULL); | ||
| 111 | return 0; | ||
| 112 | } | ||
| 113 | |||
| 114 | static long madvise_vma(struct vm_area_struct * vma, unsigned long start, | ||
| 115 | unsigned long end, int behavior) | ||
| 116 | { | ||
| 117 | long error = -EBADF; | ||
| 118 | |||
| 119 | switch (behavior) { | ||
| 120 | case MADV_NORMAL: | ||
| 121 | case MADV_SEQUENTIAL: | ||
| 122 | case MADV_RANDOM: | ||
| 123 | error = madvise_behavior(vma, start, end, behavior); | ||
| 124 | break; | ||
| 125 | |||
| 126 | case MADV_WILLNEED: | ||
| 127 | error = madvise_willneed(vma, start, end); | ||
| 128 | break; | ||
| 129 | |||
| 130 | case MADV_DONTNEED: | ||
| 131 | error = madvise_dontneed(vma, start, end); | ||
| 132 | break; | ||
| 133 | |||
| 134 | default: | ||
| 135 | error = -EINVAL; | ||
| 136 | break; | ||
| 137 | } | ||
| 138 | |||
| 139 | return error; | ||
| 140 | } | ||
| 141 | |||
| 142 | /* | ||
| 143 | * The madvise(2) system call. | ||
| 144 | * | ||
| 145 | * Applications can use madvise() to advise the kernel how it should | ||
| 146 | * handle paging I/O in this VM area. The idea is to help the kernel | ||
| 147 | * use appropriate read-ahead and caching techniques. The information | ||
| 148 | * provided is advisory only, and can be safely disregarded by the | ||
| 149 | * kernel without affecting the correct operation of the application. | ||
| 150 | * | ||
| 151 | * behavior values: | ||
| 152 | * MADV_NORMAL - the default behavior is to read clusters. This | ||
| 153 | * results in some read-ahead and read-behind. | ||
| 154 | * MADV_RANDOM - the system should read the minimum amount of data | ||
| 155 | * on any access, since it is unlikely that the appli- | ||
| 156 | * cation will need more than what it asks for. | ||
| 157 | * MADV_SEQUENTIAL - pages in the given range will probably be accessed | ||
| 158 | * once, so they can be aggressively read ahead, and | ||
| 159 | * can be freed soon after they are accessed. | ||
| 160 | * MADV_WILLNEED - the application is notifying the system to read | ||
| 161 | * some pages ahead. | ||
| 162 | * MADV_DONTNEED - the application is finished with the given range, | ||
| 163 | * so the kernel can free resources associated with it. | ||
| 164 | * | ||
| 165 | * return values: | ||
| 166 | * zero - success | ||
| 167 | * -EINVAL - start + len < 0, start is not page-aligned, | ||
| 168 | * "behavior" is not a valid value, or application | ||
| 169 | * is attempting to release locked or shared pages. | ||
| 170 | * -ENOMEM - addresses in the specified range are not currently | ||
| 171 | * mapped, or are outside the AS of the process. | ||
| 172 | * -EIO - an I/O error occurred while paging in data. | ||
| 173 | * -EBADF - map exists, but area maps something that isn't a file. | ||
| 174 | * -EAGAIN - a kernel resource was temporarily unavailable. | ||
| 175 | */ | ||
| 176 | asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) | ||
| 177 | { | ||
| 178 | unsigned long end; | ||
| 179 | struct vm_area_struct * vma; | ||
| 180 | int unmapped_error = 0; | ||
| 181 | int error = -EINVAL; | ||
| 182 | size_t len; | ||
| 183 | |||
| 184 | down_write(¤t->mm->mmap_sem); | ||
| 185 | |||
| 186 | if (start & ~PAGE_MASK) | ||
| 187 | goto out; | ||
| 188 | len = (len_in + ~PAGE_MASK) & PAGE_MASK; | ||
| 189 | |||
| 190 | /* Check to see whether len was rounded up from small -ve to zero */ | ||
| 191 | if (len_in && !len) | ||
| 192 | goto out; | ||
| 193 | |||
| 194 | end = start + len; | ||
| 195 | if (end < start) | ||
| 196 | goto out; | ||
| 197 | |||
| 198 | error = 0; | ||
| 199 | if (end == start) | ||
| 200 | goto out; | ||
| 201 | |||
| 202 | /* | ||
| 203 | * If the interval [start,end) covers some unmapped address | ||
| 204 | * ranges, just ignore them, but return -ENOMEM at the end. | ||
| 205 | */ | ||
| 206 | vma = find_vma(current->mm, start); | ||
| 207 | for (;;) { | ||
| 208 | /* Still start < end. */ | ||
| 209 | error = -ENOMEM; | ||
| 210 | if (!vma) | ||
| 211 | goto out; | ||
| 212 | |||
| 213 | /* Here start < vma->vm_end. */ | ||
| 214 | if (start < vma->vm_start) { | ||
| 215 | unmapped_error = -ENOMEM; | ||
| 216 | start = vma->vm_start; | ||
| 217 | } | ||
| 218 | |||
| 219 | /* Here vma->vm_start <= start < vma->vm_end. */ | ||
| 220 | if (end <= vma->vm_end) { | ||
| 221 | if (start < end) { | ||
| 222 | error = madvise_vma(vma, start, end, | ||
| 223 | behavior); | ||
| 224 | if (error) | ||
| 225 | goto out; | ||
| 226 | } | ||
| 227 | error = unmapped_error; | ||
| 228 | goto out; | ||
| 229 | } | ||
| 230 | |||
| 231 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | ||
| 232 | error = madvise_vma(vma, start, vma->vm_end, behavior); | ||
| 233 | if (error) | ||
| 234 | goto out; | ||
| 235 | start = vma->vm_end; | ||
| 236 | vma = vma->vm_next; | ||
| 237 | } | ||
| 238 | |||
| 239 | out: | ||
| 240 | up_write(¤t->mm->mmap_sem); | ||
| 241 | return error; | ||
| 242 | } | ||
diff --git a/mm/memory.c b/mm/memory.c new file mode 100644 index 000000000000..fb6e5deb873a --- /dev/null +++ b/mm/memory.c | |||
| @@ -0,0 +1,2165 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/memory.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | ||
| 5 | */ | ||
| 6 | |||
| 7 | /* | ||
| 8 | * demand-loading started 01.12.91 - seems it is high on the list of | ||
| 9 | * things wanted, and it should be easy to implement. - Linus | ||
| 10 | */ | ||
| 11 | |||
| 12 | /* | ||
| 13 | * Ok, demand-loading was easy, shared pages a little bit tricker. Shared | ||
| 14 | * pages started 02.12.91, seems to work. - Linus. | ||
| 15 | * | ||
| 16 | * Tested sharing by executing about 30 /bin/sh: under the old kernel it | ||
| 17 | * would have taken more than the 6M I have free, but it worked well as | ||
| 18 | * far as I could see. | ||
| 19 | * | ||
| 20 | * Also corrected some "invalidate()"s - I wasn't doing enough of them. | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * Real VM (paging to/from disk) started 18.12.91. Much more work and | ||
| 25 | * thought has to go into this. Oh, well.. | ||
| 26 | * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. | ||
| 27 | * Found it. Everything seems to work now. | ||
| 28 | * 20.12.91 - Ok, making the swap-device changeable like the root. | ||
| 29 | */ | ||
| 30 | |||
| 31 | /* | ||
| 32 | * 05.04.94 - Multi-page memory management added for v1.1. | ||
| 33 | * Idea by Alex Bligh (alex@cconcepts.co.uk) | ||
| 34 | * | ||
| 35 | * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG | ||
| 36 | * (Gerhard.Wichert@pdb.siemens.de) | ||
| 37 | * | ||
| 38 | * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) | ||
| 39 | */ | ||
| 40 | |||
| 41 | #include <linux/kernel_stat.h> | ||
| 42 | #include <linux/mm.h> | ||
| 43 | #include <linux/hugetlb.h> | ||
| 44 | #include <linux/mman.h> | ||
| 45 | #include <linux/swap.h> | ||
| 46 | #include <linux/highmem.h> | ||
| 47 | #include <linux/pagemap.h> | ||
| 48 | #include <linux/rmap.h> | ||
| 49 | #include <linux/module.h> | ||
| 50 | #include <linux/init.h> | ||
| 51 | |||
| 52 | #include <asm/pgalloc.h> | ||
| 53 | #include <asm/uaccess.h> | ||
| 54 | #include <asm/tlb.h> | ||
| 55 | #include <asm/tlbflush.h> | ||
| 56 | #include <asm/pgtable.h> | ||
| 57 | |||
| 58 | #include <linux/swapops.h> | ||
| 59 | #include <linux/elf.h> | ||
| 60 | |||
#ifndef CONFIG_DISCONTIGMEM
/* use the per-pgdat data instead for discontigmem - mbligh */
/* Both globals below exist only when CONFIG_DISCONTIGMEM is not set, and are exported to modules. */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif
| 69 | |||
unsigned long num_physpages;
/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void * high_memory;
unsigned long vmalloc_earlyreserve;

/* All three globals above are exported to modules. */
EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
EXPORT_SYMBOL(vmalloc_earlyreserve);
| 84 | |||
| 85 | /* | ||
| 86 | * If a p?d_bad entry is found while walking page tables, report | ||
| 87 | * the error, before resetting entry to p?d_none. Usually (but | ||
| 88 | * very seldom) called out from the p?d_none_or_clear_bad macros. | ||
| 89 | */ | ||
| 90 | |||
| 91 | void pgd_clear_bad(pgd_t *pgd) | ||
| 92 | { | ||
| 93 | pgd_ERROR(*pgd); | ||
| 94 | pgd_clear(pgd); | ||
| 95 | } | ||
| 96 | |||
| 97 | void pud_clear_bad(pud_t *pud) | ||
| 98 | { | ||
| 99 | pud_ERROR(*pud); | ||
| 100 | pud_clear(pud); | ||
| 101 | } | ||
| 102 | |||
| 103 | void pmd_clear_bad(pmd_t *pmd) | ||
| 104 | { | ||
| 105 | pmd_ERROR(*pmd); | ||
| 106 | pmd_clear(pmd); | ||
| 107 | } | ||
| 108 | |||
| 109 | /* | ||
| 110 | * Note: this doesn't free the actual pages themselves. That | ||
| 111 | * has been handled earlier when unmapping all the memory regions. | ||
| 112 | */ | ||
| 113 | static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | ||
| 114 | unsigned long addr, unsigned long end) | ||
| 115 | { | ||
| 116 | if (!((addr | end) & ~PMD_MASK)) { | ||
| 117 | /* Only free fully aligned ranges */ | ||
| 118 | struct page *page = pmd_page(*pmd); | ||
| 119 | pmd_clear(pmd); | ||
| 120 | dec_page_state(nr_page_table_pages); | ||
| 121 | tlb->mm->nr_ptes--; | ||
| 122 | pte_free_tlb(tlb, page); | ||
| 123 | } | ||
| 124 | } | ||
| 125 | |||
| 126 | static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud, | ||
| 127 | unsigned long addr, unsigned long end) | ||
| 128 | { | ||
| 129 | pmd_t *pmd; | ||
| 130 | unsigned long next; | ||
| 131 | pmd_t *empty_pmd = NULL; | ||
| 132 | |||
| 133 | pmd = pmd_offset(pud, addr); | ||
| 134 | |||
| 135 | /* Only free fully aligned ranges */ | ||
| 136 | if (!((addr | end) & ~PUD_MASK)) | ||
| 137 | empty_pmd = pmd; | ||
| 138 | do { | ||
| 139 | next = pmd_addr_end(addr, end); | ||
| 140 | if (pmd_none_or_clear_bad(pmd)) | ||
| 141 | continue; | ||
| 142 | clear_pte_range(tlb, pmd, addr, next); | ||
| 143 | } while (pmd++, addr = next, addr != end); | ||
| 144 | |||
| 145 | if (empty_pmd) { | ||
| 146 | pud_clear(pud); | ||
| 147 | pmd_free_tlb(tlb, empty_pmd); | ||
| 148 | } | ||
| 149 | } | ||
| 150 | |||
| 151 | static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | ||
| 152 | unsigned long addr, unsigned long end) | ||
| 153 | { | ||
| 154 | pud_t *pud; | ||
| 155 | unsigned long next; | ||
| 156 | pud_t *empty_pud = NULL; | ||
| 157 | |||
| 158 | pud = pud_offset(pgd, addr); | ||
| 159 | |||
| 160 | /* Only free fully aligned ranges */ | ||
| 161 | if (!((addr | end) & ~PGDIR_MASK)) | ||
| 162 | empty_pud = pud; | ||
| 163 | do { | ||
| 164 | next = pud_addr_end(addr, end); | ||
| 165 | if (pud_none_or_clear_bad(pud)) | ||
| 166 | continue; | ||
| 167 | clear_pmd_range(tlb, pud, addr, next); | ||
| 168 | } while (pud++, addr = next, addr != end); | ||
| 169 | |||
| 170 | if (empty_pud) { | ||
| 171 | pgd_clear(pgd); | ||
| 172 | pud_free_tlb(tlb, empty_pud); | ||
| 173 | } | ||
| 174 | } | ||
| 175 | |||
| 176 | /* | ||
| 177 | * This function clears user-level page tables of a process. | ||
| 178 | * Unlike other pagetable walks, some memory layouts might give end 0. | ||
| 179 | * Must be called with pagetable lock held. | ||
| 180 | */ | ||
| 181 | void clear_page_range(struct mmu_gather *tlb, | ||
| 182 | unsigned long addr, unsigned long end) | ||
| 183 | { | ||
| 184 | pgd_t *pgd; | ||
| 185 | unsigned long next; | ||
| 186 | |||
| 187 | pgd = pgd_offset(tlb->mm, addr); | ||
| 188 | do { | ||
| 189 | next = pgd_addr_end(addr, end); | ||
| 190 | if (pgd_none_or_clear_bad(pgd)) | ||
| 191 | continue; | ||
| 192 | clear_pud_range(tlb, pgd, addr, next); | ||
| 193 | } while (pgd++, addr = next, addr != end); | ||
| 194 | } | ||
| 195 | |||
| 196 | pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | ||
| 197 | { | ||
| 198 | if (!pmd_present(*pmd)) { | ||
| 199 | struct page *new; | ||
| 200 | |||
| 201 | spin_unlock(&mm->page_table_lock); | ||
| 202 | new = pte_alloc_one(mm, address); | ||
| 203 | spin_lock(&mm->page_table_lock); | ||
| 204 | if (!new) | ||
| 205 | return NULL; | ||
| 206 | /* | ||
| 207 | * Because we dropped the lock, we should re-check the | ||
| 208 | * entry, as somebody else could have populated it.. | ||
| 209 | */ | ||
| 210 | if (pmd_present(*pmd)) { | ||
| 211 | pte_free(new); | ||
| 212 | goto out; | ||
| 213 | } | ||
| 214 | mm->nr_ptes++; | ||
| 215 | inc_page_state(nr_page_table_pages); | ||
| 216 | pmd_populate(mm, pmd, new); | ||
| 217 | } | ||
| 218 | out: | ||
| 219 | return pte_offset_map(pmd, address); | ||
| 220 | } | ||
| 221 | |||
| 222 | pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | ||
| 223 | { | ||
| 224 | if (!pmd_present(*pmd)) { | ||
| 225 | pte_t *new; | ||
| 226 | |||
| 227 | spin_unlock(&mm->page_table_lock); | ||
| 228 | new = pte_alloc_one_kernel(mm, address); | ||
| 229 | spin_lock(&mm->page_table_lock); | ||
| 230 | if (!new) | ||
| 231 | return NULL; | ||
| 232 | |||
| 233 | /* | ||
| 234 | * Because we dropped the lock, we should re-check the | ||
| 235 | * entry, as somebody else could have populated it.. | ||
| 236 | */ | ||
| 237 | if (pmd_present(*pmd)) { | ||
| 238 | pte_free_kernel(new); | ||
| 239 | goto out; | ||
| 240 | } | ||
| 241 | pmd_populate_kernel(mm, pmd, new); | ||
| 242 | } | ||
| 243 | out: | ||
| 244 | return pte_offset_kernel(pmd, address); | ||
| 245 | } | ||
| 246 | |||
| 247 | /* | ||
| 248 | * copy one vm_area from one task to the other. Assumes the page tables | ||
| 249 | * already present in the new task to be cleared in the whole range | ||
| 250 | * covered by this vma. | ||
| 251 | * | ||
| 252 | * dst->page_table_lock is held on entry and exit, | ||
| 253 | * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). | ||
| 254 | */ | ||
| 255 | |||
| 256 | static inline void | ||
| 257 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | ||
| 258 | pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, | ||
| 259 | unsigned long addr) | ||
| 260 | { | ||
| 261 | pte_t pte = *src_pte; | ||
| 262 | struct page *page; | ||
| 263 | unsigned long pfn; | ||
| 264 | |||
| 265 | /* pte contains position in swap or file, so copy. */ | ||
| 266 | if (unlikely(!pte_present(pte))) { | ||
| 267 | if (!pte_file(pte)) { | ||
| 268 | swap_duplicate(pte_to_swp_entry(pte)); | ||
| 269 | /* make sure dst_mm is on swapoff's mmlist. */ | ||
| 270 | if (unlikely(list_empty(&dst_mm->mmlist))) { | ||
| 271 | spin_lock(&mmlist_lock); | ||
| 272 | list_add(&dst_mm->mmlist, &src_mm->mmlist); | ||
| 273 | spin_unlock(&mmlist_lock); | ||
| 274 | } | ||
| 275 | } | ||
| 276 | set_pte_at(dst_mm, addr, dst_pte, pte); | ||
| 277 | return; | ||
| 278 | } | ||
| 279 | |||
| 280 | pfn = pte_pfn(pte); | ||
| 281 | /* the pte points outside of valid memory, the | ||
| 282 | * mapping is assumed to be good, meaningful | ||
| 283 | * and not mapped via rmap - duplicate the | ||
| 284 | * mapping as is. | ||
| 285 | */ | ||
| 286 | page = NULL; | ||
| 287 | if (pfn_valid(pfn)) | ||
| 288 | page = pfn_to_page(pfn); | ||
| 289 | |||
| 290 | if (!page || PageReserved(page)) { | ||
| 291 | set_pte_at(dst_mm, addr, dst_pte, pte); | ||
| 292 | return; | ||
| 293 | } | ||
| 294 | |||
| 295 | /* | ||
| 296 | * If it's a COW mapping, write protect it both | ||
| 297 | * in the parent and the child | ||
| 298 | */ | ||
| 299 | if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) { | ||
| 300 | ptep_set_wrprotect(src_mm, addr, src_pte); | ||
| 301 | pte = *src_pte; | ||
| 302 | } | ||
| 303 | |||
| 304 | /* | ||
| 305 | * If it's a shared mapping, mark it clean in | ||
| 306 | * the child | ||
| 307 | */ | ||
| 308 | if (vm_flags & VM_SHARED) | ||
| 309 | pte = pte_mkclean(pte); | ||
| 310 | pte = pte_mkold(pte); | ||
| 311 | get_page(page); | ||
| 312 | inc_mm_counter(dst_mm, rss); | ||
| 313 | if (PageAnon(page)) | ||
| 314 | inc_mm_counter(dst_mm, anon_rss); | ||
| 315 | set_pte_at(dst_mm, addr, dst_pte, pte); | ||
| 316 | page_dup_rmap(page); | ||
| 317 | } | ||
| 318 | |||
| 319 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | ||
| 320 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, | ||
| 321 | unsigned long addr, unsigned long end) | ||
| 322 | { | ||
| 323 | pte_t *src_pte, *dst_pte; | ||
| 324 | unsigned long vm_flags = vma->vm_flags; | ||
| 325 | int progress; | ||
| 326 | |||
| 327 | again: | ||
| 328 | dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); | ||
| 329 | if (!dst_pte) | ||
| 330 | return -ENOMEM; | ||
| 331 | src_pte = pte_offset_map_nested(src_pmd, addr); | ||
| 332 | |||
| 333 | progress = 0; | ||
| 334 | spin_lock(&src_mm->page_table_lock); | ||
| 335 | do { | ||
| 336 | /* | ||
| 337 | * We are holding two locks at this point - either of them | ||
| 338 | * could generate latencies in another task on another CPU. | ||
| 339 | */ | ||
| 340 | if (progress >= 32 && (need_resched() || | ||
| 341 | need_lockbreak(&src_mm->page_table_lock) || | ||
| 342 | need_lockbreak(&dst_mm->page_table_lock))) | ||
| 343 | break; | ||
| 344 | if (pte_none(*src_pte)) { | ||
| 345 | progress++; | ||
| 346 | continue; | ||
| 347 | } | ||
| 348 | copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr); | ||
| 349 | progress += 8; | ||
| 350 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); | ||
| 351 | spin_unlock(&src_mm->page_table_lock); | ||
| 352 | |||
| 353 | pte_unmap_nested(src_pte - 1); | ||
| 354 | pte_unmap(dst_pte - 1); | ||
| 355 | cond_resched_lock(&dst_mm->page_table_lock); | ||
| 356 | if (addr != end) | ||
| 357 | goto again; | ||
| 358 | return 0; | ||
| 359 | } | ||
| 360 | |||
| 361 | static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | ||
| 362 | pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, | ||
| 363 | unsigned long addr, unsigned long end) | ||
| 364 | { | ||
| 365 | pmd_t *src_pmd, *dst_pmd; | ||
| 366 | unsigned long next; | ||
| 367 | |||
| 368 | dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); | ||
| 369 | if (!dst_pmd) | ||
| 370 | return -ENOMEM; | ||
| 371 | src_pmd = pmd_offset(src_pud, addr); | ||
| 372 | do { | ||
| 373 | next = pmd_addr_end(addr, end); | ||
| 374 | if (pmd_none_or_clear_bad(src_pmd)) | ||
| 375 | continue; | ||
| 376 | if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, | ||
| 377 | vma, addr, next)) | ||
| 378 | return -ENOMEM; | ||
| 379 | } while (dst_pmd++, src_pmd++, addr = next, addr != end); | ||
| 380 | return 0; | ||
| 381 | } | ||
| 382 | |||
| 383 | static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | ||
| 384 | pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, | ||
| 385 | unsigned long addr, unsigned long end) | ||
| 386 | { | ||
| 387 | pud_t *src_pud, *dst_pud; | ||
| 388 | unsigned long next; | ||
| 389 | |||
| 390 | dst_pud = pud_alloc(dst_mm, dst_pgd, addr); | ||
| 391 | if (!dst_pud) | ||
| 392 | return -ENOMEM; | ||
| 393 | src_pud = pud_offset(src_pgd, addr); | ||
| 394 | do { | ||
| 395 | next = pud_addr_end(addr, end); | ||
| 396 | if (pud_none_or_clear_bad(src_pud)) | ||
| 397 | continue; | ||
| 398 | if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, | ||
| 399 | vma, addr, next)) | ||
| 400 | return -ENOMEM; | ||
| 401 | } while (dst_pud++, src_pud++, addr = next, addr != end); | ||
| 402 | return 0; | ||
| 403 | } | ||
| 404 | |||
| 405 | int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | ||
| 406 | struct vm_area_struct *vma) | ||
| 407 | { | ||
| 408 | pgd_t *src_pgd, *dst_pgd; | ||
| 409 | unsigned long next; | ||
| 410 | unsigned long addr = vma->vm_start; | ||
| 411 | unsigned long end = vma->vm_end; | ||
| 412 | |||
| 413 | if (is_vm_hugetlb_page(vma)) | ||
| 414 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | ||
| 415 | |||
| 416 | dst_pgd = pgd_offset(dst_mm, addr); | ||
| 417 | src_pgd = pgd_offset(src_mm, addr); | ||
| 418 | do { | ||
| 419 | next = pgd_addr_end(addr, end); | ||
| 420 | if (pgd_none_or_clear_bad(src_pgd)) | ||
| 421 | continue; | ||
| 422 | if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, | ||
| 423 | vma, addr, next)) | ||
| 424 | return -ENOMEM; | ||
| 425 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); | ||
| 426 | return 0; | ||
| 427 | } | ||
| 428 | |||
| 429 | static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | ||
| 430 | unsigned long addr, unsigned long end, | ||
| 431 | struct zap_details *details) | ||
| 432 | { | ||
| 433 | pte_t *pte; | ||
| 434 | |||
| 435 | pte = pte_offset_map(pmd, addr); | ||
| 436 | do { | ||
| 437 | pte_t ptent = *pte; | ||
| 438 | if (pte_none(ptent)) | ||
| 439 | continue; | ||
| 440 | if (pte_present(ptent)) { | ||
| 441 | struct page *page = NULL; | ||
| 442 | unsigned long pfn = pte_pfn(ptent); | ||
| 443 | if (pfn_valid(pfn)) { | ||
| 444 | page = pfn_to_page(pfn); | ||
| 445 | if (PageReserved(page)) | ||
| 446 | page = NULL; | ||
| 447 | } | ||
| 448 | if (unlikely(details) && page) { | ||
| 449 | /* | ||
| 450 | * unmap_shared_mapping_pages() wants to | ||
| 451 | * invalidate cache without truncating: | ||
| 452 | * unmap shared but keep private pages. | ||
| 453 | */ | ||
| 454 | if (details->check_mapping && | ||
| 455 | details->check_mapping != page->mapping) | ||
| 456 | continue; | ||
| 457 | /* | ||
| 458 | * Each page->index must be checked when | ||
| 459 | * invalidating or truncating nonlinear. | ||
| 460 | */ | ||
| 461 | if (details->nonlinear_vma && | ||
| 462 | (page->index < details->first_index || | ||
| 463 | page->index > details->last_index)) | ||
| 464 | continue; | ||
| 465 | } | ||
| 466 | ptent = ptep_get_and_clear(tlb->mm, addr, pte); | ||
| 467 | tlb_remove_tlb_entry(tlb, pte, addr); | ||
| 468 | if (unlikely(!page)) | ||
| 469 | continue; | ||
| 470 | if (unlikely(details) && details->nonlinear_vma | ||
| 471 | && linear_page_index(details->nonlinear_vma, | ||
| 472 | addr) != page->index) | ||
| 473 | set_pte_at(tlb->mm, addr, pte, | ||
| 474 | pgoff_to_pte(page->index)); | ||
| 475 | if (pte_dirty(ptent)) | ||
| 476 | set_page_dirty(page); | ||
| 477 | if (PageAnon(page)) | ||
| 478 | dec_mm_counter(tlb->mm, anon_rss); | ||
| 479 | else if (pte_young(ptent)) | ||
| 480 | mark_page_accessed(page); | ||
| 481 | tlb->freed++; | ||
| 482 | page_remove_rmap(page); | ||
| 483 | tlb_remove_page(tlb, page); | ||
| 484 | continue; | ||
| 485 | } | ||
| 486 | /* | ||
| 487 | * If details->check_mapping, we leave swap entries; | ||
| 488 | * if details->nonlinear_vma, we leave file entries. | ||
| 489 | */ | ||
| 490 | if (unlikely(details)) | ||
| 491 | continue; | ||
| 492 | if (!pte_file(ptent)) | ||
| 493 | free_swap_and_cache(pte_to_swp_entry(ptent)); | ||
| 494 | pte_clear(tlb->mm, addr, pte); | ||
| 495 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
| 496 | pte_unmap(pte - 1); | ||
| 497 | } | ||
| 498 | |||
| 499 | static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, | ||
| 500 | unsigned long addr, unsigned long end, | ||
| 501 | struct zap_details *details) | ||
| 502 | { | ||
| 503 | pmd_t *pmd; | ||
| 504 | unsigned long next; | ||
| 505 | |||
| 506 | pmd = pmd_offset(pud, addr); | ||
| 507 | do { | ||
| 508 | next = pmd_addr_end(addr, end); | ||
| 509 | if (pmd_none_or_clear_bad(pmd)) | ||
| 510 | continue; | ||
| 511 | zap_pte_range(tlb, pmd, addr, next, details); | ||
| 512 | } while (pmd++, addr = next, addr != end); | ||
| 513 | } | ||
| 514 | |||
| 515 | static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | ||
| 516 | unsigned long addr, unsigned long end, | ||
| 517 | struct zap_details *details) | ||
| 518 | { | ||
| 519 | pud_t *pud; | ||
| 520 | unsigned long next; | ||
| 521 | |||
| 522 | pud = pud_offset(pgd, addr); | ||
| 523 | do { | ||
| 524 | next = pud_addr_end(addr, end); | ||
| 525 | if (pud_none_or_clear_bad(pud)) | ||
| 526 | continue; | ||
| 527 | zap_pmd_range(tlb, pud, addr, next, details); | ||
| 528 | } while (pud++, addr = next, addr != end); | ||
| 529 | } | ||
| 530 | |||
| 531 | static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | ||
| 532 | unsigned long addr, unsigned long end, | ||
| 533 | struct zap_details *details) | ||
| 534 | { | ||
| 535 | pgd_t *pgd; | ||
| 536 | unsigned long next; | ||
| 537 | |||
| 538 | if (details && !details->check_mapping && !details->nonlinear_vma) | ||
| 539 | details = NULL; | ||
| 540 | |||
| 541 | BUG_ON(addr >= end); | ||
| 542 | tlb_start_vma(tlb, vma); | ||
| 543 | pgd = pgd_offset(vma->vm_mm, addr); | ||
| 544 | do { | ||
| 545 | next = pgd_addr_end(addr, end); | ||
| 546 | if (pgd_none_or_clear_bad(pgd)) | ||
| 547 | continue; | ||
| 548 | zap_pud_range(tlb, pgd, addr, next, details); | ||
| 549 | } while (pgd++, addr = next, addr != end); | ||
| 550 | tlb_end_vma(tlb, vma); | ||
| 551 | } | ||
| 552 | |||
/* Number of bytes unmap_vmas() zaps before it considers breaking the lock */
#ifndef CONFIG_PREEMPT
/* No preempt: go for improved straight-line efficiency */
# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
#else
# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
#endif
| 559 | |||
| 560 | /** | ||
| 561 | * unmap_vmas - unmap a range of memory covered by a list of vma's | ||
| 562 | * @tlbp: address of the caller's struct mmu_gather | ||
| 563 | * @mm: the controlling mm_struct | ||
| 564 | * @vma: the starting vma | ||
| 565 | * @start_addr: virtual address at which to start unmapping | ||
| 566 | * @end_addr: virtual address at which to end unmapping | ||
| 567 | * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here | ||
| 568 | * @details: details of nonlinear truncation or shared cache invalidation | ||
| 569 | * | ||
| 570 | * Returns the number of vma's which were covered by the unmapping. | ||
| 571 | * | ||
| 572 | * Unmap all pages in the vma list. Called under page_table_lock. | ||
| 573 | * | ||
| 574 | * We aim to not hold page_table_lock for too long (for scheduling latency | ||
| 575 | * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to | ||
| 576 | * return the ending mmu_gather to the caller. | ||
| 577 | * | ||
| 578 | * Only addresses between `start' and `end' will be unmapped. | ||
| 579 | * | ||
| 580 | * The VMA list must be sorted in ascending virtual address order. | ||
| 581 | * | ||
| 582 | * unmap_vmas() assumes that the caller will flush the whole unmapped address | ||
| 583 | * range after unmap_vmas() returns. So the only responsibility here is to | ||
| 584 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() | ||
| 585 | * drops the lock and schedules. | ||
| 586 | */ | ||
| 587 | int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, | ||
| 588 | struct vm_area_struct *vma, unsigned long start_addr, | ||
| 589 | unsigned long end_addr, unsigned long *nr_accounted, | ||
| 590 | struct zap_details *details) | ||
| 591 | { | ||
| 592 | unsigned long zap_bytes = ZAP_BLOCK_SIZE; | ||
| 593 | unsigned long tlb_start = 0; /* For tlb_finish_mmu */ | ||
| 594 | int tlb_start_valid = 0; | ||
| 595 | int ret = 0; | ||
| 596 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; | ||
| 597 | int fullmm = tlb_is_full_mm(*tlbp); | ||
| 598 | |||
| 599 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { | ||
| 600 | unsigned long start; | ||
| 601 | unsigned long end; | ||
| 602 | |||
| 603 | start = max(vma->vm_start, start_addr); | ||
| 604 | if (start >= vma->vm_end) | ||
| 605 | continue; | ||
| 606 | end = min(vma->vm_end, end_addr); | ||
| 607 | if (end <= vma->vm_start) | ||
| 608 | continue; | ||
| 609 | |||
| 610 | if (vma->vm_flags & VM_ACCOUNT) | ||
| 611 | *nr_accounted += (end - start) >> PAGE_SHIFT; | ||
| 612 | |||
| 613 | ret++; | ||
| 614 | while (start != end) { | ||
| 615 | unsigned long block; | ||
| 616 | |||
| 617 | if (!tlb_start_valid) { | ||
| 618 | tlb_start = start; | ||
| 619 | tlb_start_valid = 1; | ||
| 620 | } | ||
| 621 | |||
| 622 | if (is_vm_hugetlb_page(vma)) { | ||
| 623 | block = end - start; | ||
| 624 | unmap_hugepage_range(vma, start, end); | ||
| 625 | } else { | ||
| 626 | block = min(zap_bytes, end - start); | ||
| 627 | unmap_page_range(*tlbp, vma, start, | ||
| 628 | start + block, details); | ||
| 629 | } | ||
| 630 | |||
| 631 | start += block; | ||
| 632 | zap_bytes -= block; | ||
| 633 | if ((long)zap_bytes > 0) | ||
| 634 | continue; | ||
| 635 | |||
| 636 | tlb_finish_mmu(*tlbp, tlb_start, start); | ||
| 637 | |||
| 638 | if (need_resched() || | ||
| 639 | need_lockbreak(&mm->page_table_lock) || | ||
| 640 | (i_mmap_lock && need_lockbreak(i_mmap_lock))) { | ||
| 641 | if (i_mmap_lock) { | ||
| 642 | /* must reset count of rss freed */ | ||
| 643 | *tlbp = tlb_gather_mmu(mm, fullmm); | ||
| 644 | details->break_addr = start; | ||
| 645 | goto out; | ||
| 646 | } | ||
| 647 | spin_unlock(&mm->page_table_lock); | ||
| 648 | cond_resched(); | ||
| 649 | spin_lock(&mm->page_table_lock); | ||
| 650 | } | ||
| 651 | |||
| 652 | *tlbp = tlb_gather_mmu(mm, fullmm); | ||
| 653 | tlb_start_valid = 0; | ||
| 654 | zap_bytes = ZAP_BLOCK_SIZE; | ||
| 655 | } | ||
| 656 | } | ||
| 657 | out: | ||
| 658 | return ret; | ||
| 659 | } | ||
| 660 | |||
| 661 | /** | ||
| 662 | * zap_page_range - remove user pages in a given range | ||
| 663 | * @vma: vm_area_struct holding the applicable pages | ||
| 664 | * @address: starting address of pages to zap | ||
| 665 | * @size: number of bytes to zap | ||
| 666 | * @details: details of nonlinear truncation or shared cache invalidation | ||
| 667 | */ | ||
| 668 | void zap_page_range(struct vm_area_struct *vma, unsigned long address, | ||
| 669 | unsigned long size, struct zap_details *details) | ||
| 670 | { | ||
| 671 | struct mm_struct *mm = vma->vm_mm; | ||
| 672 | struct mmu_gather *tlb; | ||
| 673 | unsigned long end = address + size; | ||
| 674 | unsigned long nr_accounted = 0; | ||
| 675 | |||
| 676 | if (is_vm_hugetlb_page(vma)) { | ||
| 677 | zap_hugepage_range(vma, address, size); | ||
| 678 | return; | ||
| 679 | } | ||
| 680 | |||
| 681 | lru_add_drain(); | ||
| 682 | spin_lock(&mm->page_table_lock); | ||
| 683 | tlb = tlb_gather_mmu(mm, 0); | ||
| 684 | unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); | ||
| 685 | tlb_finish_mmu(tlb, address, end); | ||
| 686 | spin_unlock(&mm->page_table_lock); | ||
| 687 | } | ||
| 688 | |||
| 689 | /* | ||
| 690 | * Do a quick page-table lookup for a single page. | ||
| 691 | * mm->page_table_lock must be held. | ||
| 692 | */ | ||
| 693 | static struct page * | ||
| 694 | __follow_page(struct mm_struct *mm, unsigned long address, int read, int write) | ||
| 695 | { | ||
| 696 | pgd_t *pgd; | ||
| 697 | pud_t *pud; | ||
| 698 | pmd_t *pmd; | ||
| 699 | pte_t *ptep, pte; | ||
| 700 | unsigned long pfn; | ||
| 701 | struct page *page; | ||
| 702 | |||
| 703 | page = follow_huge_addr(mm, address, write); | ||
| 704 | if (! IS_ERR(page)) | ||
| 705 | return page; | ||
| 706 | |||
| 707 | pgd = pgd_offset(mm, address); | ||
| 708 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | ||
| 709 | goto out; | ||
| 710 | |||
| 711 | pud = pud_offset(pgd, address); | ||
| 712 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) | ||
| 713 | goto out; | ||
| 714 | |||
| 715 | pmd = pmd_offset(pud, address); | ||
| 716 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | ||
| 717 | goto out; | ||
| 718 | if (pmd_huge(*pmd)) | ||
| 719 | return follow_huge_pmd(mm, address, pmd, write); | ||
| 720 | |||
| 721 | ptep = pte_offset_map(pmd, address); | ||
| 722 | if (!ptep) | ||
| 723 | goto out; | ||
| 724 | |||
| 725 | pte = *ptep; | ||
| 726 | pte_unmap(ptep); | ||
| 727 | if (pte_present(pte)) { | ||
| 728 | if (write && !pte_write(pte)) | ||
| 729 | goto out; | ||
| 730 | if (read && !pte_read(pte)) | ||
| 731 | goto out; | ||
| 732 | pfn = pte_pfn(pte); | ||
| 733 | if (pfn_valid(pfn)) { | ||
| 734 | page = pfn_to_page(pfn); | ||
| 735 | if (write && !pte_dirty(pte) && !PageDirty(page)) | ||
| 736 | set_page_dirty(page); | ||
| 737 | mark_page_accessed(page); | ||
| 738 | return page; | ||
| 739 | } | ||
| 740 | } | ||
| 741 | |||
| 742 | out: | ||
| 743 | return NULL; | ||
| 744 | } | ||
| 745 | |||
/*
 * Look up the page mapped at @address in @mm without requiring read
 * permission on the pte; @write is passed through to __follow_page().
 */
struct page *
follow_page(struct mm_struct *mm, unsigned long address, int write)
{
	const int read = 0;

	return __follow_page(mm, address, read, write);
}
| 751 | |||
| 752 | int | ||
| 753 | check_user_page_readable(struct mm_struct *mm, unsigned long address) | ||
| 754 | { | ||
| 755 | return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL; | ||
| 756 | } | ||
| 757 | |||
| 758 | EXPORT_SYMBOL(check_user_page_readable); | ||
| 759 | |||
| 760 | /* | ||
| 761 | * Given a physical address, is there a useful struct page pointing to | ||
| 762 | * it? This may become more complex in the future if we start dealing | ||
| 763 | * with IO-aperture pages for direct-IO. | ||
| 764 | */ | ||
| 765 | |||
| 766 | static inline struct page *get_page_map(struct page *page) | ||
| 767 | { | ||
| 768 | if (!pfn_valid(page_to_pfn(page))) | ||
| 769 | return NULL; | ||
| 770 | return page; | ||
| 771 | } | ||
| 772 | |||
| 773 | |||
| 774 | static inline int | ||
| 775 | untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, | ||
| 776 | unsigned long address) | ||
| 777 | { | ||
| 778 | pgd_t *pgd; | ||
| 779 | pud_t *pud; | ||
| 780 | pmd_t *pmd; | ||
| 781 | |||
| 782 | /* Check if the vma is for an anonymous mapping. */ | ||
| 783 | if (vma->vm_ops && vma->vm_ops->nopage) | ||
| 784 | return 0; | ||
| 785 | |||
| 786 | /* Check if page directory entry exists. */ | ||
| 787 | pgd = pgd_offset(mm, address); | ||
| 788 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | ||
| 789 | return 1; | ||
| 790 | |||
| 791 | pud = pud_offset(pgd, address); | ||
| 792 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) | ||
| 793 | return 1; | ||
| 794 | |||
| 795 | /* Check if page middle directory entry exists. */ | ||
| 796 | pmd = pmd_offset(pud, address); | ||
| 797 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | ||
| 798 | return 1; | ||
| 799 | |||
| 800 | /* There is a pte slot for 'address' in 'mm'. */ | ||
| 801 | return 0; | ||
| 802 | } | ||
| 803 | |||
| 804 | |||
| 805 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
| 806 | unsigned long start, int len, int write, int force, | ||
| 807 | struct page **pages, struct vm_area_struct **vmas) | ||
| 808 | { | ||
| 809 | int i; | ||
| 810 | unsigned int flags; | ||
| 811 | |||
| 812 | /* | ||
| 813 | * Require read or write permissions. | ||
| 814 | * If 'force' is set, we only require the "MAY" flags. | ||
| 815 | */ | ||
| 816 | flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | ||
| 817 | flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | ||
| 818 | i = 0; | ||
| 819 | |||
| 820 | do { | ||
| 821 | struct vm_area_struct * vma; | ||
| 822 | |||
| 823 | vma = find_extend_vma(mm, start); | ||
| 824 | if (!vma && in_gate_area(tsk, start)) { | ||
| 825 | unsigned long pg = start & PAGE_MASK; | ||
| 826 | struct vm_area_struct *gate_vma = get_gate_vma(tsk); | ||
| 827 | pgd_t *pgd; | ||
| 828 | pud_t *pud; | ||
| 829 | pmd_t *pmd; | ||
| 830 | pte_t *pte; | ||
| 831 | if (write) /* user gate pages are read-only */ | ||
| 832 | return i ? : -EFAULT; | ||
| 833 | if (pg > TASK_SIZE) | ||
| 834 | pgd = pgd_offset_k(pg); | ||
| 835 | else | ||
| 836 | pgd = pgd_offset_gate(mm, pg); | ||
| 837 | BUG_ON(pgd_none(*pgd)); | ||
| 838 | pud = pud_offset(pgd, pg); | ||
| 839 | BUG_ON(pud_none(*pud)); | ||
| 840 | pmd = pmd_offset(pud, pg); | ||
| 841 | BUG_ON(pmd_none(*pmd)); | ||
| 842 | pte = pte_offset_map(pmd, pg); | ||
| 843 | BUG_ON(pte_none(*pte)); | ||
| 844 | if (pages) { | ||
| 845 | pages[i] = pte_page(*pte); | ||
| 846 | get_page(pages[i]); | ||
| 847 | } | ||
| 848 | pte_unmap(pte); | ||
| 849 | if (vmas) | ||
| 850 | vmas[i] = gate_vma; | ||
| 851 | i++; | ||
| 852 | start += PAGE_SIZE; | ||
| 853 | len--; | ||
| 854 | continue; | ||
| 855 | } | ||
| 856 | |||
| 857 | if (!vma || (vma->vm_flags & VM_IO) | ||
| 858 | || !(flags & vma->vm_flags)) | ||
| 859 | return i ? : -EFAULT; | ||
| 860 | |||
| 861 | if (is_vm_hugetlb_page(vma)) { | ||
| 862 | i = follow_hugetlb_page(mm, vma, pages, vmas, | ||
| 863 | &start, &len, i); | ||
| 864 | continue; | ||
| 865 | } | ||
| 866 | spin_lock(&mm->page_table_lock); | ||
| 867 | do { | ||
| 868 | struct page *map; | ||
| 869 | int lookup_write = write; | ||
| 870 | |||
| 871 | cond_resched_lock(&mm->page_table_lock); | ||
| 872 | while (!(map = follow_page(mm, start, lookup_write))) { | ||
| 873 | /* | ||
| 874 | * Shortcut for anonymous pages. We don't want | ||
| 875 | * to force the creation of pages tables for | ||
| 876 | * insanly big anonymously mapped areas that | ||
| 877 | * nobody touched so far. This is important | ||
| 878 | * for doing a core dump for these mappings. | ||
| 879 | */ | ||
| 880 | if (!lookup_write && | ||
| 881 | untouched_anonymous_page(mm,vma,start)) { | ||
| 882 | map = ZERO_PAGE(start); | ||
| 883 | break; | ||
| 884 | } | ||
| 885 | spin_unlock(&mm->page_table_lock); | ||
| 886 | switch (handle_mm_fault(mm,vma,start,write)) { | ||
| 887 | case VM_FAULT_MINOR: | ||
| 888 | tsk->min_flt++; | ||
| 889 | break; | ||
| 890 | case VM_FAULT_MAJOR: | ||
| 891 | tsk->maj_flt++; | ||
| 892 | break; | ||
| 893 | case VM_FAULT_SIGBUS: | ||
| 894 | return i ? i : -EFAULT; | ||
| 895 | case VM_FAULT_OOM: | ||
| 896 | return i ? i : -ENOMEM; | ||
| 897 | default: | ||
| 898 | BUG(); | ||
| 899 | } | ||
| 900 | /* | ||
| 901 | * Now that we have performed a write fault | ||
| 902 | * and surely no longer have a shared page we | ||
| 903 | * shouldn't write, we shouldn't ignore an | ||
| 904 | * unwritable page in the page table if | ||
| 905 | * we are forcing write access. | ||
| 906 | */ | ||
| 907 | lookup_write = write && !force; | ||
| 908 | spin_lock(&mm->page_table_lock); | ||
| 909 | } | ||
| 910 | if (pages) { | ||
| 911 | pages[i] = get_page_map(map); | ||
| 912 | if (!pages[i]) { | ||
| 913 | spin_unlock(&mm->page_table_lock); | ||
| 914 | while (i--) | ||
| 915 | page_cache_release(pages[i]); | ||
| 916 | i = -EFAULT; | ||
| 917 | goto out; | ||
| 918 | } | ||
| 919 | flush_dcache_page(pages[i]); | ||
| 920 | if (!PageReserved(pages[i])) | ||
| 921 | page_cache_get(pages[i]); | ||
| 922 | } | ||
| 923 | if (vmas) | ||
| 924 | vmas[i] = vma; | ||
| 925 | i++; | ||
| 926 | start += PAGE_SIZE; | ||
| 927 | len--; | ||
| 928 | } while(len && start < vma->vm_end); | ||
| 929 | spin_unlock(&mm->page_table_lock); | ||
| 930 | } while(len); | ||
| 931 | out: | ||
| 932 | return i; | ||
| 933 | } | ||
| 934 | |||
| 935 | EXPORT_SYMBOL(get_user_pages); | ||
| 936 | |||
| 937 | static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, | ||
| 938 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
| 939 | { | ||
| 940 | pte_t *pte; | ||
| 941 | |||
| 942 | pte = pte_alloc_map(mm, pmd, addr); | ||
| 943 | if (!pte) | ||
| 944 | return -ENOMEM; | ||
| 945 | do { | ||
| 946 | pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); | ||
| 947 | BUG_ON(!pte_none(*pte)); | ||
| 948 | set_pte_at(mm, addr, pte, zero_pte); | ||
| 949 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
| 950 | pte_unmap(pte - 1); | ||
| 951 | return 0; | ||
| 952 | } | ||
| 953 | |||
| 954 | static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, | ||
| 955 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
| 956 | { | ||
| 957 | pmd_t *pmd; | ||
| 958 | unsigned long next; | ||
| 959 | |||
| 960 | pmd = pmd_alloc(mm, pud, addr); | ||
| 961 | if (!pmd) | ||
| 962 | return -ENOMEM; | ||
| 963 | do { | ||
| 964 | next = pmd_addr_end(addr, end); | ||
| 965 | if (zeromap_pte_range(mm, pmd, addr, next, prot)) | ||
| 966 | return -ENOMEM; | ||
| 967 | } while (pmd++, addr = next, addr != end); | ||
| 968 | return 0; | ||
| 969 | } | ||
| 970 | |||
| 971 | static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, | ||
| 972 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
| 973 | { | ||
| 974 | pud_t *pud; | ||
| 975 | unsigned long next; | ||
| 976 | |||
| 977 | pud = pud_alloc(mm, pgd, addr); | ||
| 978 | if (!pud) | ||
| 979 | return -ENOMEM; | ||
| 980 | do { | ||
| 981 | next = pud_addr_end(addr, end); | ||
| 982 | if (zeromap_pmd_range(mm, pud, addr, next, prot)) | ||
| 983 | return -ENOMEM; | ||
| 984 | } while (pud++, addr = next, addr != end); | ||
| 985 | return 0; | ||
| 986 | } | ||
| 987 | |||
| 988 | int zeromap_page_range(struct vm_area_struct *vma, | ||
| 989 | unsigned long addr, unsigned long size, pgprot_t prot) | ||
| 990 | { | ||
| 991 | pgd_t *pgd; | ||
| 992 | unsigned long next; | ||
| 993 | unsigned long end = addr + size; | ||
| 994 | struct mm_struct *mm = vma->vm_mm; | ||
| 995 | int err; | ||
| 996 | |||
| 997 | BUG_ON(addr >= end); | ||
| 998 | pgd = pgd_offset(mm, addr); | ||
| 999 | flush_cache_range(vma, addr, end); | ||
| 1000 | spin_lock(&mm->page_table_lock); | ||
| 1001 | do { | ||
| 1002 | next = pgd_addr_end(addr, end); | ||
| 1003 | err = zeromap_pud_range(mm, pgd, addr, next, prot); | ||
| 1004 | if (err) | ||
| 1005 | break; | ||
| 1006 | } while (pgd++, addr = next, addr != end); | ||
| 1007 | spin_unlock(&mm->page_table_lock); | ||
| 1008 | return err; | ||
| 1009 | } | ||
| 1010 | |||
| 1011 | /* | ||
| 1012 | * maps a range of physical memory into the requested pages. the old | ||
| 1013 | * mappings are removed. any references to nonexistent pages results | ||
| 1014 | * in null mappings (currently treated as "copy-on-access") | ||
| 1015 | */ | ||
| 1016 | static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, | ||
| 1017 | unsigned long addr, unsigned long end, | ||
| 1018 | unsigned long pfn, pgprot_t prot) | ||
| 1019 | { | ||
| 1020 | pte_t *pte; | ||
| 1021 | |||
| 1022 | pte = pte_alloc_map(mm, pmd, addr); | ||
| 1023 | if (!pte) | ||
| 1024 | return -ENOMEM; | ||
| 1025 | do { | ||
| 1026 | BUG_ON(!pte_none(*pte)); | ||
| 1027 | if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) | ||
| 1028 | set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); | ||
| 1029 | pfn++; | ||
| 1030 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
| 1031 | pte_unmap(pte - 1); | ||
| 1032 | return 0; | ||
| 1033 | } | ||
| 1034 | |||
| 1035 | static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, | ||
| 1036 | unsigned long addr, unsigned long end, | ||
| 1037 | unsigned long pfn, pgprot_t prot) | ||
| 1038 | { | ||
| 1039 | pmd_t *pmd; | ||
| 1040 | unsigned long next; | ||
| 1041 | |||
| 1042 | pfn -= addr >> PAGE_SHIFT; | ||
| 1043 | pmd = pmd_alloc(mm, pud, addr); | ||
| 1044 | if (!pmd) | ||
| 1045 | return -ENOMEM; | ||
| 1046 | do { | ||
| 1047 | next = pmd_addr_end(addr, end); | ||
| 1048 | if (remap_pte_range(mm, pmd, addr, next, | ||
| 1049 | pfn + (addr >> PAGE_SHIFT), prot)) | ||
| 1050 | return -ENOMEM; | ||
| 1051 | } while (pmd++, addr = next, addr != end); | ||
| 1052 | return 0; | ||
| 1053 | } | ||
| 1054 | |||
| 1055 | static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, | ||
| 1056 | unsigned long addr, unsigned long end, | ||
| 1057 | unsigned long pfn, pgprot_t prot) | ||
| 1058 | { | ||
| 1059 | pud_t *pud; | ||
| 1060 | unsigned long next; | ||
| 1061 | |||
| 1062 | pfn -= addr >> PAGE_SHIFT; | ||
| 1063 | pud = pud_alloc(mm, pgd, addr); | ||
| 1064 | if (!pud) | ||
| 1065 | return -ENOMEM; | ||
| 1066 | do { | ||
| 1067 | next = pud_addr_end(addr, end); | ||
| 1068 | if (remap_pmd_range(mm, pud, addr, next, | ||
| 1069 | pfn + (addr >> PAGE_SHIFT), prot)) | ||
| 1070 | return -ENOMEM; | ||
| 1071 | } while (pud++, addr = next, addr != end); | ||
| 1072 | return 0; | ||
| 1073 | } | ||
| 1074 | |||
| 1075 | /* Note: this is only safe if the mm semaphore is held when called. */ | ||
| 1076 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | ||
| 1077 | unsigned long pfn, unsigned long size, pgprot_t prot) | ||
| 1078 | { | ||
| 1079 | pgd_t *pgd; | ||
| 1080 | unsigned long next; | ||
| 1081 | unsigned long end = addr + size; | ||
| 1082 | struct mm_struct *mm = vma->vm_mm; | ||
| 1083 | int err; | ||
| 1084 | |||
| 1085 | /* | ||
| 1086 | * Physically remapped pages are special. Tell the | ||
| 1087 | * rest of the world about it: | ||
| 1088 | * VM_IO tells people not to look at these pages | ||
| 1089 | * (accesses can have side effects). | ||
| 1090 | * VM_RESERVED tells swapout not to try to touch | ||
| 1091 | * this region. | ||
| 1092 | */ | ||
| 1093 | vma->vm_flags |= VM_IO | VM_RESERVED; | ||
| 1094 | |||
| 1095 | BUG_ON(addr >= end); | ||
| 1096 | pfn -= addr >> PAGE_SHIFT; | ||
| 1097 | pgd = pgd_offset(mm, addr); | ||
| 1098 | flush_cache_range(vma, addr, end); | ||
| 1099 | spin_lock(&mm->page_table_lock); | ||
| 1100 | do { | ||
| 1101 | next = pgd_addr_end(addr, end); | ||
| 1102 | err = remap_pud_range(mm, pgd, addr, next, | ||
| 1103 | pfn + (addr >> PAGE_SHIFT), prot); | ||
| 1104 | if (err) | ||
| 1105 | break; | ||
| 1106 | } while (pgd++, addr = next, addr != end); | ||
| 1107 | spin_unlock(&mm->page_table_lock); | ||
| 1108 | return err; | ||
| 1109 | } | ||
| 1110 | EXPORT_SYMBOL(remap_pfn_range); | ||
| 1111 | |||
| 1112 | /* | ||
| 1113 | * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when | ||
| 1114 | * servicing faults for write access. In the normal case, do always want | ||
| 1115 | * pte_mkwrite. But get_user_pages can cause write faults for mappings | ||
| 1116 | * that do not have writing enabled, when used by access_process_vm. | ||
| 1117 | */ | ||
| 1118 | static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | ||
| 1119 | { | ||
| 1120 | if (likely(vma->vm_flags & VM_WRITE)) | ||
| 1121 | pte = pte_mkwrite(pte); | ||
| 1122 | return pte; | ||
| 1123 | } | ||
| 1124 | |||
| 1125 | /* | ||
| 1126 | * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock | ||
| 1127 | */ | ||
| 1128 | static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, | ||
| 1129 | pte_t *page_table) | ||
| 1130 | { | ||
| 1131 | pte_t entry; | ||
| 1132 | |||
| 1133 | entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)), | ||
| 1134 | vma); | ||
| 1135 | ptep_establish(vma, address, page_table, entry); | ||
| 1136 | update_mmu_cache(vma, address, entry); | ||
| 1137 | lazy_mmu_prot_update(entry); | ||
| 1138 | } | ||
| 1139 | |||
| 1140 | /* | ||
| 1141 | * This routine handles present pages, when users try to write | ||
| 1142 | * to a shared page. It is done by copying the page to a new address | ||
| 1143 | * and decrementing the shared-page counter for the old page. | ||
| 1144 | * | ||
| 1145 | * Goto-purists beware: the only reason for goto's here is that it results | ||
| 1146 | * in better assembly code.. The "default" path will see no jumps at all. | ||
| 1147 | * | ||
| 1148 | * Note that this routine assumes that the protection checks have been | ||
| 1149 | * done by the caller (the low-level page fault routine in most cases). | ||
| 1150 | * Thus we can safely just mark it writable once we've done any necessary | ||
| 1151 | * COW. | ||
| 1152 | * | ||
| 1153 | * We also mark the page dirty at this point even though the page will | ||
| 1154 | * change only once the write actually happens. This avoids a few races, | ||
| 1155 | * and potentially makes it more efficient. | ||
| 1156 | * | ||
| 1157 | * We hold the mm semaphore and the page_table_lock on entry and exit | ||
| 1158 | * with the page_table_lock released. | ||
| 1159 | */ | ||
| 1160 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, | ||
| 1161 | unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) | ||
| 1162 | { | ||
| 1163 | struct page *old_page, *new_page; | ||
| 1164 | unsigned long pfn = pte_pfn(pte); | ||
| 1165 | pte_t entry; | ||
| 1166 | |||
| 1167 | if (unlikely(!pfn_valid(pfn))) { | ||
| 1168 | /* | ||
| 1169 | * This should really halt the system so it can be debugged or | ||
| 1170 | * at least the kernel stops what it's doing before it corrupts | ||
| 1171 | * data, but for the moment just pretend this is OOM. | ||
| 1172 | */ | ||
| 1173 | pte_unmap(page_table); | ||
| 1174 | printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", | ||
| 1175 | address); | ||
| 1176 | spin_unlock(&mm->page_table_lock); | ||
| 1177 | return VM_FAULT_OOM; | ||
| 1178 | } | ||
| 1179 | old_page = pfn_to_page(pfn); | ||
| 1180 | |||
| 1181 | if (!TestSetPageLocked(old_page)) { | ||
| 1182 | int reuse = can_share_swap_page(old_page); | ||
| 1183 | unlock_page(old_page); | ||
| 1184 | if (reuse) { | ||
| 1185 | flush_cache_page(vma, address, pfn); | ||
| 1186 | entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), | ||
| 1187 | vma); | ||
| 1188 | ptep_set_access_flags(vma, address, page_table, entry, 1); | ||
| 1189 | update_mmu_cache(vma, address, entry); | ||
| 1190 | lazy_mmu_prot_update(entry); | ||
| 1191 | pte_unmap(page_table); | ||
| 1192 | spin_unlock(&mm->page_table_lock); | ||
| 1193 | return VM_FAULT_MINOR; | ||
| 1194 | } | ||
| 1195 | } | ||
| 1196 | pte_unmap(page_table); | ||
| 1197 | |||
| 1198 | /* | ||
| 1199 | * Ok, we need to copy. Oh, well.. | ||
| 1200 | */ | ||
| 1201 | if (!PageReserved(old_page)) | ||
| 1202 | page_cache_get(old_page); | ||
| 1203 | spin_unlock(&mm->page_table_lock); | ||
| 1204 | |||
| 1205 | if (unlikely(anon_vma_prepare(vma))) | ||
| 1206 | goto no_new_page; | ||
| 1207 | if (old_page == ZERO_PAGE(address)) { | ||
| 1208 | new_page = alloc_zeroed_user_highpage(vma, address); | ||
| 1209 | if (!new_page) | ||
| 1210 | goto no_new_page; | ||
| 1211 | } else { | ||
| 1212 | new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); | ||
| 1213 | if (!new_page) | ||
| 1214 | goto no_new_page; | ||
| 1215 | copy_user_highpage(new_page, old_page, address); | ||
| 1216 | } | ||
| 1217 | /* | ||
| 1218 | * Re-check the pte - we dropped the lock | ||
| 1219 | */ | ||
| 1220 | spin_lock(&mm->page_table_lock); | ||
| 1221 | page_table = pte_offset_map(pmd, address); | ||
| 1222 | if (likely(pte_same(*page_table, pte))) { | ||
| 1223 | if (PageAnon(old_page)) | ||
| 1224 | dec_mm_counter(mm, anon_rss); | ||
| 1225 | if (PageReserved(old_page)) | ||
| 1226 | inc_mm_counter(mm, rss); | ||
| 1227 | else | ||
| 1228 | page_remove_rmap(old_page); | ||
| 1229 | flush_cache_page(vma, address, pfn); | ||
| 1230 | break_cow(vma, new_page, address, page_table); | ||
| 1231 | lru_cache_add_active(new_page); | ||
| 1232 | page_add_anon_rmap(new_page, vma, address); | ||
| 1233 | |||
| 1234 | /* Free the old page.. */ | ||
| 1235 | new_page = old_page; | ||
| 1236 | } | ||
| 1237 | pte_unmap(page_table); | ||
| 1238 | page_cache_release(new_page); | ||
| 1239 | page_cache_release(old_page); | ||
| 1240 | spin_unlock(&mm->page_table_lock); | ||
| 1241 | return VM_FAULT_MINOR; | ||
| 1242 | |||
| 1243 | no_new_page: | ||
| 1244 | page_cache_release(old_page); | ||
| 1245 | return VM_FAULT_OOM; | ||
| 1246 | } | ||
| 1247 | |||
| 1248 | /* | ||
| 1249 | * Helper functions for unmap_mapping_range(). | ||
| 1250 | * | ||
| 1251 | * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ | ||
| 1252 | * | ||
| 1253 | * We have to restart searching the prio_tree whenever we drop the lock, | ||
| 1254 | * since the iterator is only valid while the lock is held, and anyway | ||
| 1255 | * a later vma might be split and reinserted earlier while lock dropped. | ||
| 1256 | * | ||
| 1257 | * The list of nonlinear vmas could be handled more efficiently, using | ||
| 1258 | * a placeholder, but handle it in the same way until a need is shown. | ||
| 1259 | * It is important to search the prio_tree before nonlinear list: a vma | ||
| 1260 | * may become nonlinear and be shifted from prio_tree to nonlinear list | ||
| 1261 | * while the lock is dropped; but never shifted from list to prio_tree. | ||
| 1262 | * | ||
| 1263 | * In order to make forward progress despite restarting the search, | ||
| 1264 | * vm_truncate_count is used to mark a vma as now dealt with, so we can | ||
| 1265 | * quickly skip it next time around. Since the prio_tree search only | ||
| 1266 | * shows us those vmas affected by unmapping the range in question, we | ||
| 1267 | * can't efficiently keep all vmas in step with mapping->truncate_count: | ||
| 1268 | * so instead reset them all whenever it wraps back to 0 (then go to 1). | ||
| 1269 | * mapping->truncate_count and vma->vm_truncate_count are protected by | ||
| 1270 | * i_mmap_lock. | ||
| 1271 | * | ||
| 1272 | * In order to make forward progress despite repeatedly restarting some | ||
| 1273 | * large vma, note the break_addr set by unmap_vmas when it breaks out: | ||
| 1274 | * and restart from that address when we reach that vma again. It might | ||
| 1275 | * have been split or merged, shrunk or extended, but never shifted: so | ||
| 1276 | * restart_addr remains valid so long as it remains in the vma's range. | ||
| 1277 | * unmap_mapping_range forces truncate_count to leap over page-aligned | ||
| 1278 | * values so we can save vma's restart_addr in its truncate_count field. | ||
| 1279 | */ | ||
/* True when the value has no bits set below the page boundary, i.e. is page-aligned */
#define is_restart_addr(truncate_count) (((truncate_count) & ~PAGE_MASK) == 0)
| 1281 | |||
| 1282 | static void reset_vma_truncate_counts(struct address_space *mapping) | ||
| 1283 | { | ||
| 1284 | struct vm_area_struct *vma; | ||
| 1285 | struct prio_tree_iter iter; | ||
| 1286 | |||
| 1287 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) | ||
| 1288 | vma->vm_truncate_count = 0; | ||
| 1289 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | ||
| 1290 | vma->vm_truncate_count = 0; | ||
| 1291 | } | ||
| 1292 | |||
| 1293 | static int unmap_mapping_range_vma(struct vm_area_struct *vma, | ||
| 1294 | unsigned long start_addr, unsigned long end_addr, | ||
| 1295 | struct zap_details *details) | ||
| 1296 | { | ||
| 1297 | unsigned long restart_addr; | ||
| 1298 | int need_break; | ||
| 1299 | |||
| 1300 | again: | ||
| 1301 | restart_addr = vma->vm_truncate_count; | ||
| 1302 | if (is_restart_addr(restart_addr) && start_addr < restart_addr) { | ||
| 1303 | start_addr = restart_addr; | ||
| 1304 | if (start_addr >= end_addr) { | ||
| 1305 | /* Top of vma has been split off since last time */ | ||
| 1306 | vma->vm_truncate_count = details->truncate_count; | ||
| 1307 | return 0; | ||
| 1308 | } | ||
| 1309 | } | ||
| 1310 | |||
| 1311 | details->break_addr = end_addr; | ||
| 1312 | zap_page_range(vma, start_addr, end_addr - start_addr, details); | ||
| 1313 | |||
| 1314 | /* | ||
| 1315 | * We cannot rely on the break test in unmap_vmas: | ||
| 1316 | * on the one hand, we don't want to restart our loop | ||
| 1317 | * just because that broke out for the page_table_lock; | ||
| 1318 | * on the other hand, it does no test when vma is small. | ||
| 1319 | */ | ||
| 1320 | need_break = need_resched() || | ||
| 1321 | need_lockbreak(details->i_mmap_lock); | ||
| 1322 | |||
| 1323 | if (details->break_addr >= end_addr) { | ||
| 1324 | /* We have now completed this vma: mark it so */ | ||
| 1325 | vma->vm_truncate_count = details->truncate_count; | ||
| 1326 | if (!need_break) | ||
| 1327 | return 0; | ||
| 1328 | } else { | ||
| 1329 | /* Note restart_addr in vma's truncate_count field */ | ||
| 1330 | vma->vm_truncate_count = details->break_addr; | ||
| 1331 | if (!need_break) | ||
| 1332 | goto again; | ||
| 1333 | } | ||
| 1334 | |||
| 1335 | spin_unlock(details->i_mmap_lock); | ||
| 1336 | cond_resched(); | ||
| 1337 | spin_lock(details->i_mmap_lock); | ||
| 1338 | return -EINTR; | ||
| 1339 | } | ||
| 1340 | |||
| 1341 | static inline void unmap_mapping_range_tree(struct prio_tree_root *root, | ||
| 1342 | struct zap_details *details) | ||
| 1343 | { | ||
| 1344 | struct vm_area_struct *vma; | ||
| 1345 | struct prio_tree_iter iter; | ||
| 1346 | pgoff_t vba, vea, zba, zea; | ||
| 1347 | |||
| 1348 | restart: | ||
| 1349 | vma_prio_tree_foreach(vma, &iter, root, | ||
| 1350 | details->first_index, details->last_index) { | ||
| 1351 | /* Skip quickly over those we have already dealt with */ | ||
| 1352 | if (vma->vm_truncate_count == details->truncate_count) | ||
| 1353 | continue; | ||
| 1354 | |||
| 1355 | vba = vma->vm_pgoff; | ||
| 1356 | vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; | ||
| 1357 | /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ | ||
| 1358 | zba = details->first_index; | ||
| 1359 | if (zba < vba) | ||
| 1360 | zba = vba; | ||
| 1361 | zea = details->last_index; | ||
| 1362 | if (zea > vea) | ||
| 1363 | zea = vea; | ||
| 1364 | |||
| 1365 | if (unmap_mapping_range_vma(vma, | ||
| 1366 | ((zba - vba) << PAGE_SHIFT) + vma->vm_start, | ||
| 1367 | ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, | ||
| 1368 | details) < 0) | ||
| 1369 | goto restart; | ||
| 1370 | } | ||
| 1371 | } | ||
| 1372 | |||
| 1373 | static inline void unmap_mapping_range_list(struct list_head *head, | ||
| 1374 | struct zap_details *details) | ||
| 1375 | { | ||
| 1376 | struct vm_area_struct *vma; | ||
| 1377 | |||
| 1378 | /* | ||
| 1379 | * In nonlinear VMAs there is no correspondence between virtual address | ||
| 1380 | * offset and file offset. So we must perform an exhaustive search | ||
| 1381 | * across *all* the pages in each nonlinear VMA, not just the pages | ||
| 1382 | * whose virtual address lies outside the file truncation point. | ||
| 1383 | */ | ||
| 1384 | restart: | ||
| 1385 | list_for_each_entry(vma, head, shared.vm_set.list) { | ||
| 1386 | /* Skip quickly over those we have already dealt with */ | ||
| 1387 | if (vma->vm_truncate_count == details->truncate_count) | ||
| 1388 | continue; | ||
| 1389 | details->nonlinear_vma = vma; | ||
| 1390 | if (unmap_mapping_range_vma(vma, vma->vm_start, | ||
| 1391 | vma->vm_end, details) < 0) | ||
| 1392 | goto restart; | ||
| 1393 | } | ||
| 1394 | } | ||
| 1395 | |||
| 1396 | /** | ||
| 1397 | * unmap_mapping_range - unmap the portion of all mmaps | ||
| 1398 | * in the specified address_space corresponding to the specified | ||
| 1399 | * page range in the underlying file. | ||
| 1400 | * @address_space: the address space containing mmaps to be unmapped. | ||
| 1401 | * @holebegin: byte in first page to unmap, relative to the start of | ||
| 1402 | * the underlying file. This will be rounded down to a PAGE_SIZE | ||
| 1403 | * boundary. Note that this is different from vmtruncate(), which | ||
| 1404 | * must keep the partial page. In contrast, we must get rid of | ||
| 1405 | * partial pages. | ||
| 1406 | * @holelen: size of prospective hole in bytes. This will be rounded | ||
| 1407 | * up to a PAGE_SIZE boundary. A holelen of zero truncates to the | ||
| 1408 | * end of the file. | ||
| 1409 | * @even_cows: 1 when truncating a file, unmap even private COWed pages; | ||
| 1410 | * but 0 when invalidating pagecache, don't throw away private data. | ||
| 1411 | */ | ||
| 1412 | void unmap_mapping_range(struct address_space *mapping, | ||
| 1413 | loff_t const holebegin, loff_t const holelen, int even_cows) | ||
| 1414 | { | ||
| 1415 | struct zap_details details; | ||
| 1416 | pgoff_t hba = holebegin >> PAGE_SHIFT; | ||
| 1417 | pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 1418 | |||
| 1419 | /* Check for overflow. */ | ||
| 1420 | if (sizeof(holelen) > sizeof(hlen)) { | ||
| 1421 | long long holeend = | ||
| 1422 | (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 1423 | if (holeend & ~(long long)ULONG_MAX) | ||
| 1424 | hlen = ULONG_MAX - hba + 1; | ||
| 1425 | } | ||
| 1426 | |||
| 1427 | details.check_mapping = even_cows? NULL: mapping; | ||
| 1428 | details.nonlinear_vma = NULL; | ||
| 1429 | details.first_index = hba; | ||
| 1430 | details.last_index = hba + hlen - 1; | ||
| 1431 | if (details.last_index < details.first_index) | ||
| 1432 | details.last_index = ULONG_MAX; | ||
| 1433 | details.i_mmap_lock = &mapping->i_mmap_lock; | ||
| 1434 | |||
| 1435 | spin_lock(&mapping->i_mmap_lock); | ||
| 1436 | |||
| 1437 | /* serialize i_size write against truncate_count write */ | ||
| 1438 | smp_wmb(); | ||
| 1439 | /* Protect against page faults, and endless unmapping loops */ | ||
| 1440 | mapping->truncate_count++; | ||
| 1441 | /* | ||
| 1442 | * For archs where spin_lock has inclusive semantics like ia64 | ||
| 1443 | * this smp_mb() will prevent to read pagetable contents | ||
| 1444 | * before the truncate_count increment is visible to | ||
| 1445 | * other cpus. | ||
| 1446 | */ | ||
| 1447 | smp_mb(); | ||
| 1448 | if (unlikely(is_restart_addr(mapping->truncate_count))) { | ||
| 1449 | if (mapping->truncate_count == 0) | ||
| 1450 | reset_vma_truncate_counts(mapping); | ||
| 1451 | mapping->truncate_count++; | ||
| 1452 | } | ||
| 1453 | details.truncate_count = mapping->truncate_count; | ||
| 1454 | |||
| 1455 | if (unlikely(!prio_tree_empty(&mapping->i_mmap))) | ||
| 1456 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | ||
| 1457 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | ||
| 1458 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | ||
| 1459 | spin_unlock(&mapping->i_mmap_lock); | ||
| 1460 | } | ||
| 1461 | EXPORT_SYMBOL(unmap_mapping_range); | ||
| 1462 | |||
| 1463 | /* | ||
| 1464 | * Handle all mappings that got truncated by a "truncate()" | ||
| 1465 | * system call. | ||
| 1466 | * | ||
| 1467 | * NOTE! We have to be ready to update the memory sharing | ||
| 1468 | * between the file and the memory map for a potential last | ||
| 1469 | * incomplete page. Ugly, but necessary. | ||
| 1470 | */ | ||
| 1471 | int vmtruncate(struct inode * inode, loff_t offset) | ||
| 1472 | { | ||
| 1473 | struct address_space *mapping = inode->i_mapping; | ||
| 1474 | unsigned long limit; | ||
| 1475 | |||
| 1476 | if (inode->i_size < offset) | ||
| 1477 | goto do_expand; | ||
| 1478 | /* | ||
| 1479 | * truncation of in-use swapfiles is disallowed - it would cause | ||
| 1480 | * subsequent swapout to scribble on the now-freed blocks. | ||
| 1481 | */ | ||
| 1482 | if (IS_SWAPFILE(inode)) | ||
| 1483 | goto out_busy; | ||
| 1484 | i_size_write(inode, offset); | ||
| 1485 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | ||
| 1486 | truncate_inode_pages(mapping, offset); | ||
| 1487 | goto out_truncate; | ||
| 1488 | |||
| 1489 | do_expand: | ||
| 1490 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | ||
| 1491 | if (limit != RLIM_INFINITY && offset > limit) | ||
| 1492 | goto out_sig; | ||
| 1493 | if (offset > inode->i_sb->s_maxbytes) | ||
| 1494 | goto out_big; | ||
| 1495 | i_size_write(inode, offset); | ||
| 1496 | |||
| 1497 | out_truncate: | ||
| 1498 | if (inode->i_op && inode->i_op->truncate) | ||
| 1499 | inode->i_op->truncate(inode); | ||
| 1500 | return 0; | ||
| 1501 | out_sig: | ||
| 1502 | send_sig(SIGXFSZ, current, 0); | ||
| 1503 | out_big: | ||
| 1504 | return -EFBIG; | ||
| 1505 | out_busy: | ||
| 1506 | return -ETXTBSY; | ||
| 1507 | } | ||
| 1508 | |||
| 1509 | EXPORT_SYMBOL(vmtruncate); | ||
| 1510 | |||
| 1511 | /* | ||
| 1512 | * Primitive swap readahead code. We simply read an aligned block of | ||
| 1513 | * (1 << page_cluster) entries in the swap area. This method is chosen | ||
| 1514 | * because it doesn't cost us any seek time. We also make sure to queue | ||
| 1515 | * the 'original' request together with the readahead ones... | ||
| 1516 | * | ||
| 1517 | * This has been extended to use the NUMA policies from the mm triggering | ||
| 1518 | * the readahead. | ||
| 1519 | * | ||
| 1520 | * Caller must hold down_read on the vma->vm_mm if vma is not NULL. | ||
| 1521 | */ | ||
| 1522 | void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) | ||
| 1523 | { | ||
| 1524 | #ifdef CONFIG_NUMA | ||
| 1525 | struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL; | ||
| 1526 | #endif | ||
| 1527 | int i, num; | ||
| 1528 | struct page *new_page; | ||
| 1529 | unsigned long offset; | ||
| 1530 | |||
| 1531 | /* | ||
| 1532 | * Get the number of handles we should do readahead io to. | ||
| 1533 | */ | ||
| 1534 | num = valid_swaphandles(entry, &offset); | ||
| 1535 | for (i = 0; i < num; offset++, i++) { | ||
| 1536 | /* Ok, do the async read-ahead now */ | ||
| 1537 | new_page = read_swap_cache_async(swp_entry(swp_type(entry), | ||
| 1538 | offset), vma, addr); | ||
| 1539 | if (!new_page) | ||
| 1540 | break; | ||
| 1541 | page_cache_release(new_page); | ||
| 1542 | #ifdef CONFIG_NUMA | ||
| 1543 | /* | ||
| 1544 | * Find the next applicable VMA for the NUMA policy. | ||
| 1545 | */ | ||
| 1546 | addr += PAGE_SIZE; | ||
| 1547 | if (addr == 0) | ||
| 1548 | vma = NULL; | ||
| 1549 | if (vma) { | ||
| 1550 | if (addr >= vma->vm_end) { | ||
| 1551 | vma = next_vma; | ||
| 1552 | next_vma = vma ? vma->vm_next : NULL; | ||
| 1553 | } | ||
| 1554 | if (vma && addr < vma->vm_start) | ||
| 1555 | vma = NULL; | ||
| 1556 | } else { | ||
| 1557 | if (next_vma && addr >= next_vma->vm_start) { | ||
| 1558 | vma = next_vma; | ||
| 1559 | next_vma = vma->vm_next; | ||
| 1560 | } | ||
| 1561 | } | ||
| 1562 | #endif | ||
| 1563 | } | ||
| 1564 | lru_add_drain(); /* Push any new pages onto the LRU now */ | ||
| 1565 | } | ||
| 1566 | |||
| 1567 | /* | ||
| 1568 | * We hold the mm semaphore and the page_table_lock on entry and | ||
| 1569 | * should release the pagetable lock on exit.. | ||
| 1570 | */ | ||
| 1571 | static int do_swap_page(struct mm_struct * mm, | ||
| 1572 | struct vm_area_struct * vma, unsigned long address, | ||
| 1573 | pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) | ||
| 1574 | { | ||
| 1575 | struct page *page; | ||
| 1576 | swp_entry_t entry = pte_to_swp_entry(orig_pte); | ||
| 1577 | pte_t pte; | ||
| 1578 | int ret = VM_FAULT_MINOR; | ||
| 1579 | |||
| 1580 | pte_unmap(page_table); | ||
| 1581 | spin_unlock(&mm->page_table_lock); | ||
| 1582 | page = lookup_swap_cache(entry); | ||
| 1583 | if (!page) { | ||
| 1584 | swapin_readahead(entry, address, vma); | ||
| 1585 | page = read_swap_cache_async(entry, vma, address); | ||
| 1586 | if (!page) { | ||
| 1587 | /* | ||
| 1588 | * Back out if somebody else faulted in this pte while | ||
| 1589 | * we released the page table lock. | ||
| 1590 | */ | ||
| 1591 | spin_lock(&mm->page_table_lock); | ||
| 1592 | page_table = pte_offset_map(pmd, address); | ||
| 1593 | if (likely(pte_same(*page_table, orig_pte))) | ||
| 1594 | ret = VM_FAULT_OOM; | ||
| 1595 | else | ||
| 1596 | ret = VM_FAULT_MINOR; | ||
| 1597 | pte_unmap(page_table); | ||
| 1598 | spin_unlock(&mm->page_table_lock); | ||
| 1599 | goto out; | ||
| 1600 | } | ||
| 1601 | |||
| 1602 | /* Had to read the page from swap area: Major fault */ | ||
| 1603 | ret = VM_FAULT_MAJOR; | ||
| 1604 | inc_page_state(pgmajfault); | ||
| 1605 | grab_swap_token(); | ||
| 1606 | } | ||
| 1607 | |||
| 1608 | mark_page_accessed(page); | ||
| 1609 | lock_page(page); | ||
| 1610 | |||
| 1611 | /* | ||
| 1612 | * Back out if somebody else faulted in this pte while we | ||
| 1613 | * released the page table lock. | ||
| 1614 | */ | ||
| 1615 | spin_lock(&mm->page_table_lock); | ||
| 1616 | page_table = pte_offset_map(pmd, address); | ||
| 1617 | if (unlikely(!pte_same(*page_table, orig_pte))) { | ||
| 1618 | pte_unmap(page_table); | ||
| 1619 | spin_unlock(&mm->page_table_lock); | ||
| 1620 | unlock_page(page); | ||
| 1621 | page_cache_release(page); | ||
| 1622 | ret = VM_FAULT_MINOR; | ||
| 1623 | goto out; | ||
| 1624 | } | ||
| 1625 | |||
| 1626 | /* The page isn't present yet, go ahead with the fault. */ | ||
| 1627 | |||
| 1628 | swap_free(entry); | ||
| 1629 | if (vm_swap_full()) | ||
| 1630 | remove_exclusive_swap_page(page); | ||
| 1631 | |||
| 1632 | inc_mm_counter(mm, rss); | ||
| 1633 | pte = mk_pte(page, vma->vm_page_prot); | ||
| 1634 | if (write_access && can_share_swap_page(page)) { | ||
| 1635 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | ||
| 1636 | write_access = 0; | ||
| 1637 | } | ||
| 1638 | unlock_page(page); | ||
| 1639 | |||
| 1640 | flush_icache_page(vma, page); | ||
| 1641 | set_pte_at(mm, address, page_table, pte); | ||
| 1642 | page_add_anon_rmap(page, vma, address); | ||
| 1643 | |||
| 1644 | if (write_access) { | ||
| 1645 | if (do_wp_page(mm, vma, address, | ||
| 1646 | page_table, pmd, pte) == VM_FAULT_OOM) | ||
| 1647 | ret = VM_FAULT_OOM; | ||
| 1648 | goto out; | ||
| 1649 | } | ||
| 1650 | |||
| 1651 | /* No need to invalidate - it was non-present before */ | ||
| 1652 | update_mmu_cache(vma, address, pte); | ||
| 1653 | lazy_mmu_prot_update(pte); | ||
| 1654 | pte_unmap(page_table); | ||
| 1655 | spin_unlock(&mm->page_table_lock); | ||
| 1656 | out: | ||
| 1657 | return ret; | ||
| 1658 | } | ||
| 1659 | |||
| 1660 | /* | ||
| 1661 | * We are called with the MM semaphore and page_table_lock | ||
| 1662 | * spinlock held to protect against concurrent faults in | ||
| 1663 | * multithreaded programs. | ||
| 1664 | */ | ||
| 1665 | static int | ||
| 1666 | do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 1667 | pte_t *page_table, pmd_t *pmd, int write_access, | ||
| 1668 | unsigned long addr) | ||
| 1669 | { | ||
| 1670 | pte_t entry; | ||
| 1671 | struct page * page = ZERO_PAGE(addr); | ||
| 1672 | |||
| 1673 | /* Read-only mapping of ZERO_PAGE. */ | ||
| 1674 | entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); | ||
| 1675 | |||
| 1676 | /* ..except if it's a write access */ | ||
| 1677 | if (write_access) { | ||
| 1678 | /* Allocate our own private page. */ | ||
| 1679 | pte_unmap(page_table); | ||
| 1680 | spin_unlock(&mm->page_table_lock); | ||
| 1681 | |||
| 1682 | if (unlikely(anon_vma_prepare(vma))) | ||
| 1683 | goto no_mem; | ||
| 1684 | page = alloc_zeroed_user_highpage(vma, addr); | ||
| 1685 | if (!page) | ||
| 1686 | goto no_mem; | ||
| 1687 | |||
| 1688 | spin_lock(&mm->page_table_lock); | ||
| 1689 | page_table = pte_offset_map(pmd, addr); | ||
| 1690 | |||
| 1691 | if (!pte_none(*page_table)) { | ||
| 1692 | pte_unmap(page_table); | ||
| 1693 | page_cache_release(page); | ||
| 1694 | spin_unlock(&mm->page_table_lock); | ||
| 1695 | goto out; | ||
| 1696 | } | ||
| 1697 | inc_mm_counter(mm, rss); | ||
| 1698 | entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, | ||
| 1699 | vma->vm_page_prot)), | ||
| 1700 | vma); | ||
| 1701 | lru_cache_add_active(page); | ||
| 1702 | SetPageReferenced(page); | ||
| 1703 | page_add_anon_rmap(page, vma, addr); | ||
| 1704 | } | ||
| 1705 | |||
| 1706 | set_pte_at(mm, addr, page_table, entry); | ||
| 1707 | pte_unmap(page_table); | ||
| 1708 | |||
| 1709 | /* No need to invalidate - it was non-present before */ | ||
| 1710 | update_mmu_cache(vma, addr, entry); | ||
| 1711 | lazy_mmu_prot_update(entry); | ||
| 1712 | spin_unlock(&mm->page_table_lock); | ||
| 1713 | out: | ||
| 1714 | return VM_FAULT_MINOR; | ||
| 1715 | no_mem: | ||
| 1716 | return VM_FAULT_OOM; | ||
| 1717 | } | ||
| 1718 | |||
| 1719 | /* | ||
| 1720 | * do_no_page() tries to create a new page mapping. It aggressively | ||
| 1721 | * tries to share with existing pages, but makes a separate copy if | ||
| 1722 | * the "write_access" parameter is true in order to avoid the next | ||
| 1723 | * page fault. | ||
| 1724 | * | ||
| 1725 | * As this is called only for pages that do not currently exist, we | ||
| 1726 | * do not need to flush old virtual caches or the TLB. | ||
| 1727 | * | ||
| 1728 | * This is called with the MM semaphore held and the page table | ||
| 1729 | * spinlock held. Exit with the spinlock released. | ||
| 1730 | */ | ||
| 1731 | static int | ||
| 1732 | do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 1733 | unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) | ||
| 1734 | { | ||
| 1735 | struct page * new_page; | ||
| 1736 | struct address_space *mapping = NULL; | ||
| 1737 | pte_t entry; | ||
| 1738 | unsigned int sequence = 0; | ||
| 1739 | int ret = VM_FAULT_MINOR; | ||
| 1740 | int anon = 0; | ||
| 1741 | |||
| 1742 | if (!vma->vm_ops || !vma->vm_ops->nopage) | ||
| 1743 | return do_anonymous_page(mm, vma, page_table, | ||
| 1744 | pmd, write_access, address); | ||
| 1745 | pte_unmap(page_table); | ||
| 1746 | spin_unlock(&mm->page_table_lock); | ||
| 1747 | |||
| 1748 | if (vma->vm_file) { | ||
| 1749 | mapping = vma->vm_file->f_mapping; | ||
| 1750 | sequence = mapping->truncate_count; | ||
| 1751 | smp_rmb(); /* serializes i_size against truncate_count */ | ||
| 1752 | } | ||
| 1753 | retry: | ||
| 1754 | cond_resched(); | ||
| 1755 | new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); | ||
| 1756 | /* | ||
| 1757 | * No smp_rmb is needed here as long as there's a full | ||
| 1758 | * spin_lock/unlock sequence inside the ->nopage callback | ||
| 1759 | * (for the pagecache lookup) that acts as an implicit | ||
| 1760 | * smp_mb() and prevents the i_size read to happen | ||
| 1761 | * after the next truncate_count read. | ||
| 1762 | */ | ||
| 1763 | |||
| 1764 | /* no page was available -- either SIGBUS or OOM */ | ||
| 1765 | if (new_page == NOPAGE_SIGBUS) | ||
| 1766 | return VM_FAULT_SIGBUS; | ||
| 1767 | if (new_page == NOPAGE_OOM) | ||
| 1768 | return VM_FAULT_OOM; | ||
| 1769 | |||
| 1770 | /* | ||
| 1771 | * Should we do an early C-O-W break? | ||
| 1772 | */ | ||
| 1773 | if (write_access && !(vma->vm_flags & VM_SHARED)) { | ||
| 1774 | struct page *page; | ||
| 1775 | |||
| 1776 | if (unlikely(anon_vma_prepare(vma))) | ||
| 1777 | goto oom; | ||
| 1778 | page = alloc_page_vma(GFP_HIGHUSER, vma, address); | ||
| 1779 | if (!page) | ||
| 1780 | goto oom; | ||
| 1781 | copy_user_highpage(page, new_page, address); | ||
| 1782 | page_cache_release(new_page); | ||
| 1783 | new_page = page; | ||
| 1784 | anon = 1; | ||
| 1785 | } | ||
| 1786 | |||
| 1787 | spin_lock(&mm->page_table_lock); | ||
| 1788 | /* | ||
| 1789 | * For a file-backed vma, someone could have truncated or otherwise | ||
| 1790 | * invalidated this page. If unmap_mapping_range got called, | ||
| 1791 | * retry getting the page. | ||
| 1792 | */ | ||
| 1793 | if (mapping && unlikely(sequence != mapping->truncate_count)) { | ||
| 1794 | sequence = mapping->truncate_count; | ||
| 1795 | spin_unlock(&mm->page_table_lock); | ||
| 1796 | page_cache_release(new_page); | ||
| 1797 | goto retry; | ||
| 1798 | } | ||
| 1799 | page_table = pte_offset_map(pmd, address); | ||
| 1800 | |||
| 1801 | /* | ||
| 1802 | * This silly early PAGE_DIRTY setting removes a race | ||
| 1803 | * due to the bad i386 page protection. But it's valid | ||
| 1804 | * for other architectures too. | ||
| 1805 | * | ||
| 1806 | * Note that if write_access is true, we either now have | ||
| 1807 | * an exclusive copy of the page, or this is a shared mapping, | ||
| 1808 | * so we can make it writable and dirty to avoid having to | ||
| 1809 | * handle that later. | ||
| 1810 | */ | ||
| 1811 | /* Only go through if we didn't race with anybody else... */ | ||
| 1812 | if (pte_none(*page_table)) { | ||
| 1813 | if (!PageReserved(new_page)) | ||
| 1814 | inc_mm_counter(mm, rss); | ||
| 1815 | |||
| 1816 | flush_icache_page(vma, new_page); | ||
| 1817 | entry = mk_pte(new_page, vma->vm_page_prot); | ||
| 1818 | if (write_access) | ||
| 1819 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
| 1820 | set_pte_at(mm, address, page_table, entry); | ||
| 1821 | if (anon) { | ||
| 1822 | lru_cache_add_active(new_page); | ||
| 1823 | page_add_anon_rmap(new_page, vma, address); | ||
| 1824 | } else | ||
| 1825 | page_add_file_rmap(new_page); | ||
| 1826 | pte_unmap(page_table); | ||
| 1827 | } else { | ||
| 1828 | /* One of our sibling threads was faster, back out. */ | ||
| 1829 | pte_unmap(page_table); | ||
| 1830 | page_cache_release(new_page); | ||
| 1831 | spin_unlock(&mm->page_table_lock); | ||
| 1832 | goto out; | ||
| 1833 | } | ||
| 1834 | |||
| 1835 | /* no need to invalidate: a not-present page shouldn't be cached */ | ||
| 1836 | update_mmu_cache(vma, address, entry); | ||
| 1837 | lazy_mmu_prot_update(entry); | ||
| 1838 | spin_unlock(&mm->page_table_lock); | ||
| 1839 | out: | ||
| 1840 | return ret; | ||
| 1841 | oom: | ||
| 1842 | page_cache_release(new_page); | ||
| 1843 | ret = VM_FAULT_OOM; | ||
| 1844 | goto out; | ||
| 1845 | } | ||
| 1846 | |||
| 1847 | /* | ||
| 1848 | * Fault of a previously existing named mapping. Repopulate the pte | ||
| 1849 | * from the encoded file_pte if possible. This enables swappable | ||
| 1850 | * nonlinear vmas. | ||
| 1851 | */ | ||
| 1852 | static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, | ||
| 1853 | unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) | ||
| 1854 | { | ||
| 1855 | unsigned long pgoff; | ||
| 1856 | int err; | ||
| 1857 | |||
| 1858 | BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); | ||
| 1859 | /* | ||
| 1860 | * Fall back to the linear mapping if the fs does not support | ||
| 1861 | * ->populate: | ||
| 1862 | */ | ||
| 1863 | if (!vma->vm_ops || !vma->vm_ops->populate || | ||
| 1864 | (write_access && !(vma->vm_flags & VM_SHARED))) { | ||
| 1865 | pte_clear(mm, address, pte); | ||
| 1866 | return do_no_page(mm, vma, address, write_access, pte, pmd); | ||
| 1867 | } | ||
| 1868 | |||
| 1869 | pgoff = pte_to_pgoff(*pte); | ||
| 1870 | |||
| 1871 | pte_unmap(pte); | ||
| 1872 | spin_unlock(&mm->page_table_lock); | ||
| 1873 | |||
| 1874 | err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); | ||
| 1875 | if (err == -ENOMEM) | ||
| 1876 | return VM_FAULT_OOM; | ||
| 1877 | if (err) | ||
| 1878 | return VM_FAULT_SIGBUS; | ||
| 1879 | return VM_FAULT_MAJOR; | ||
| 1880 | } | ||
| 1881 | |||
| 1882 | /* | ||
| 1883 | * These routines also need to handle stuff like marking pages dirty | ||
| 1884 | * and/or accessed for architectures that don't do it in hardware (most | ||
| 1885 | * RISC architectures). The early dirtying is also good on the i386. | ||
| 1886 | * | ||
| 1887 | * There is also a hook called "update_mmu_cache()" that architectures | ||
| 1888 | * with external mmu caches can use to update those (ie the Sparc or | ||
| 1889 | * PowerPC hashed page tables that act as extended TLBs). | ||
| 1890 | * | ||
| 1891 | * Note the "page_table_lock". It is to protect against kswapd removing | ||
| 1892 | * pages from under us. Note that kswapd only ever _removes_ pages, never | ||
| 1893 | * adds them. As such, once we have noticed that the page is not present, | ||
| 1894 | * we can drop the lock early. | ||
| 1895 | * | ||
| 1896 | * The adding of pages is protected by the MM semaphore (which we hold), | ||
| 1897 | * so we don't need to worry about a page being suddenly been added into | ||
| 1898 | * our VM. | ||
| 1899 | * | ||
| 1900 | * We enter with the pagetable spinlock held, we are supposed to | ||
| 1901 | * release it when done. | ||
| 1902 | */ | ||
| 1903 | static inline int handle_pte_fault(struct mm_struct *mm, | ||
| 1904 | struct vm_area_struct * vma, unsigned long address, | ||
| 1905 | int write_access, pte_t *pte, pmd_t *pmd) | ||
| 1906 | { | ||
| 1907 | pte_t entry; | ||
| 1908 | |||
| 1909 | entry = *pte; | ||
| 1910 | if (!pte_present(entry)) { | ||
| 1911 | /* | ||
| 1912 | * If it truly wasn't present, we know that kswapd | ||
| 1913 | * and the PTE updates will not touch it later. So | ||
| 1914 | * drop the lock. | ||
| 1915 | */ | ||
| 1916 | if (pte_none(entry)) | ||
| 1917 | return do_no_page(mm, vma, address, write_access, pte, pmd); | ||
| 1918 | if (pte_file(entry)) | ||
| 1919 | return do_file_page(mm, vma, address, write_access, pte, pmd); | ||
| 1920 | return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); | ||
| 1921 | } | ||
| 1922 | |||
| 1923 | if (write_access) { | ||
| 1924 | if (!pte_write(entry)) | ||
| 1925 | return do_wp_page(mm, vma, address, pte, pmd, entry); | ||
| 1926 | |||
| 1927 | entry = pte_mkdirty(entry); | ||
| 1928 | } | ||
| 1929 | entry = pte_mkyoung(entry); | ||
| 1930 | ptep_set_access_flags(vma, address, pte, entry, write_access); | ||
| 1931 | update_mmu_cache(vma, address, entry); | ||
| 1932 | lazy_mmu_prot_update(entry); | ||
| 1933 | pte_unmap(pte); | ||
| 1934 | spin_unlock(&mm->page_table_lock); | ||
| 1935 | return VM_FAULT_MINOR; | ||
| 1936 | } | ||
| 1937 | |||
| 1938 | /* | ||
| 1939 | * By the time we get here, we already hold the mm semaphore | ||
| 1940 | */ | ||
| 1941 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, | ||
| 1942 | unsigned long address, int write_access) | ||
| 1943 | { | ||
| 1944 | pgd_t *pgd; | ||
| 1945 | pud_t *pud; | ||
| 1946 | pmd_t *pmd; | ||
| 1947 | pte_t *pte; | ||
| 1948 | |||
| 1949 | __set_current_state(TASK_RUNNING); | ||
| 1950 | |||
| 1951 | inc_page_state(pgfault); | ||
| 1952 | |||
| 1953 | if (is_vm_hugetlb_page(vma)) | ||
| 1954 | return VM_FAULT_SIGBUS; /* mapping truncation does this. */ | ||
| 1955 | |||
| 1956 | /* | ||
| 1957 | * We need the page table lock to synchronize with kswapd | ||
| 1958 | * and the SMP-safe atomic PTE updates. | ||
| 1959 | */ | ||
| 1960 | pgd = pgd_offset(mm, address); | ||
| 1961 | spin_lock(&mm->page_table_lock); | ||
| 1962 | |||
| 1963 | pud = pud_alloc(mm, pgd, address); | ||
| 1964 | if (!pud) | ||
| 1965 | goto oom; | ||
| 1966 | |||
| 1967 | pmd = pmd_alloc(mm, pud, address); | ||
| 1968 | if (!pmd) | ||
| 1969 | goto oom; | ||
| 1970 | |||
| 1971 | pte = pte_alloc_map(mm, pmd, address); | ||
| 1972 | if (!pte) | ||
| 1973 | goto oom; | ||
| 1974 | |||
| 1975 | return handle_pte_fault(mm, vma, address, write_access, pte, pmd); | ||
| 1976 | |||
| 1977 | oom: | ||
| 1978 | spin_unlock(&mm->page_table_lock); | ||
| 1979 | return VM_FAULT_OOM; | ||
| 1980 | } | ||
| 1981 | |||
| 1982 | #ifndef __PAGETABLE_PUD_FOLDED | ||
| 1983 | /* | ||
| 1984 | * Allocate page upper directory. | ||
| 1985 | * | ||
| 1986 | * We've already handled the fast-path in-line, and we own the | ||
| 1987 | * page table lock. | ||
| 1988 | */ | ||
| 1989 | pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) | ||
| 1990 | { | ||
| 1991 | pud_t *new; | ||
| 1992 | |||
| 1993 | spin_unlock(&mm->page_table_lock); | ||
| 1994 | new = pud_alloc_one(mm, address); | ||
| 1995 | spin_lock(&mm->page_table_lock); | ||
| 1996 | if (!new) | ||
| 1997 | return NULL; | ||
| 1998 | |||
| 1999 | /* | ||
| 2000 | * Because we dropped the lock, we should re-check the | ||
| 2001 | * entry, as somebody else could have populated it.. | ||
| 2002 | */ | ||
| 2003 | if (pgd_present(*pgd)) { | ||
| 2004 | pud_free(new); | ||
| 2005 | goto out; | ||
| 2006 | } | ||
| 2007 | pgd_populate(mm, pgd, new); | ||
| 2008 | out: | ||
| 2009 | return pud_offset(pgd, address); | ||
| 2010 | } | ||
| 2011 | #endif /* __PAGETABLE_PUD_FOLDED */ | ||
| 2012 | |||
| 2013 | #ifndef __PAGETABLE_PMD_FOLDED | ||
| 2014 | /* | ||
| 2015 | * Allocate page middle directory. | ||
| 2016 | * | ||
| 2017 | * We've already handled the fast-path in-line, and we own the | ||
| 2018 | * page table lock. | ||
| 2019 | */ | ||
| 2020 | pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | ||
| 2021 | { | ||
| 2022 | pmd_t *new; | ||
| 2023 | |||
| 2024 | spin_unlock(&mm->page_table_lock); | ||
| 2025 | new = pmd_alloc_one(mm, address); | ||
| 2026 | spin_lock(&mm->page_table_lock); | ||
| 2027 | if (!new) | ||
| 2028 | return NULL; | ||
| 2029 | |||
| 2030 | /* | ||
| 2031 | * Because we dropped the lock, we should re-check the | ||
| 2032 | * entry, as somebody else could have populated it.. | ||
| 2033 | */ | ||
| 2034 | #ifndef __ARCH_HAS_4LEVEL_HACK | ||
| 2035 | if (pud_present(*pud)) { | ||
| 2036 | pmd_free(new); | ||
| 2037 | goto out; | ||
| 2038 | } | ||
| 2039 | pud_populate(mm, pud, new); | ||
| 2040 | #else | ||
| 2041 | if (pgd_present(*pud)) { | ||
| 2042 | pmd_free(new); | ||
| 2043 | goto out; | ||
| 2044 | } | ||
| 2045 | pgd_populate(mm, pud, new); | ||
| 2046 | #endif /* __ARCH_HAS_4LEVEL_HACK */ | ||
| 2047 | |||
| 2048 | out: | ||
| 2049 | return pmd_offset(pud, address); | ||
| 2050 | } | ||
| 2051 | #endif /* __PAGETABLE_PMD_FOLDED */ | ||
| 2052 | |||
| 2053 | int make_pages_present(unsigned long addr, unsigned long end) | ||
| 2054 | { | ||
| 2055 | int ret, len, write; | ||
| 2056 | struct vm_area_struct * vma; | ||
| 2057 | |||
| 2058 | vma = find_vma(current->mm, addr); | ||
| 2059 | if (!vma) | ||
| 2060 | return -1; | ||
| 2061 | write = (vma->vm_flags & VM_WRITE) != 0; | ||
| 2062 | if (addr >= end) | ||
| 2063 | BUG(); | ||
| 2064 | if (end > vma->vm_end) | ||
| 2065 | BUG(); | ||
| 2066 | len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; | ||
| 2067 | ret = get_user_pages(current, current->mm, addr, | ||
| 2068 | len, write, 0, NULL, NULL); | ||
| 2069 | if (ret < 0) | ||
| 2070 | return ret; | ||
| 2071 | return ret == len ? 0 : -1; | ||
| 2072 | } | ||
| 2073 | |||
| 2074 | /* | ||
| 2075 | * Map a vmalloc()-space virtual address to the physical page. | ||
| 2076 | */ | ||
| 2077 | struct page * vmalloc_to_page(void * vmalloc_addr) | ||
| 2078 | { | ||
| 2079 | unsigned long addr = (unsigned long) vmalloc_addr; | ||
| 2080 | struct page *page = NULL; | ||
| 2081 | pgd_t *pgd = pgd_offset_k(addr); | ||
| 2082 | pud_t *pud; | ||
| 2083 | pmd_t *pmd; | ||
| 2084 | pte_t *ptep, pte; | ||
| 2085 | |||
| 2086 | if (!pgd_none(*pgd)) { | ||
| 2087 | pud = pud_offset(pgd, addr); | ||
| 2088 | if (!pud_none(*pud)) { | ||
| 2089 | pmd = pmd_offset(pud, addr); | ||
| 2090 | if (!pmd_none(*pmd)) { | ||
| 2091 | ptep = pte_offset_map(pmd, addr); | ||
| 2092 | pte = *ptep; | ||
| 2093 | if (pte_present(pte)) | ||
| 2094 | page = pte_page(pte); | ||
| 2095 | pte_unmap(ptep); | ||
| 2096 | } | ||
| 2097 | } | ||
| 2098 | } | ||
| 2099 | return page; | ||
| 2100 | } | ||
| 2101 | |||
| 2102 | EXPORT_SYMBOL(vmalloc_to_page); | ||
| 2103 | |||
/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 *
 * Thin wrapper: the result of vmalloc_to_page() is passed to
 * page_to_pfn() without any check.
 */
unsigned long vmalloc_to_pfn(void * vmalloc_addr)
{
	struct page *page = vmalloc_to_page(vmalloc_addr);

	return page_to_pfn(page);
}
| 2111 | |||
| 2112 | EXPORT_SYMBOL(vmalloc_to_pfn); | ||
| 2113 | |||
| 2114 | /* | ||
| 2115 | * update_mem_hiwater | ||
| 2116 | * - update per process rss and vm high water data | ||
| 2117 | */ | ||
| 2118 | void update_mem_hiwater(struct task_struct *tsk) | ||
| 2119 | { | ||
| 2120 | if (tsk->mm) { | ||
| 2121 | unsigned long rss = get_mm_counter(tsk->mm, rss); | ||
| 2122 | |||
| 2123 | if (tsk->mm->hiwater_rss < rss) | ||
| 2124 | tsk->mm->hiwater_rss = rss; | ||
| 2125 | if (tsk->mm->hiwater_vm < tsk->mm->total_vm) | ||
| 2126 | tsk->mm->hiwater_vm = tsk->mm->total_vm; | ||
| 2127 | } | ||
| 2128 | } | ||
| 2129 | |||
| 2130 | #if !defined(__HAVE_ARCH_GATE_AREA) | ||
| 2131 | |||
| 2132 | #if defined(AT_SYSINFO_EHDR) | ||
| 2133 | struct vm_area_struct gate_vma; | ||
| 2134 | |||
| 2135 | static int __init gate_vma_init(void) | ||
| 2136 | { | ||
| 2137 | gate_vma.vm_mm = NULL; | ||
| 2138 | gate_vma.vm_start = FIXADDR_USER_START; | ||
| 2139 | gate_vma.vm_end = FIXADDR_USER_END; | ||
| 2140 | gate_vma.vm_page_prot = PAGE_READONLY; | ||
| 2141 | gate_vma.vm_flags = 0; | ||
| 2142 | return 0; | ||
| 2143 | } | ||
| 2144 | __initcall(gate_vma_init); | ||
| 2145 | #endif | ||
| 2146 | |||
/*
 * Return the gate vma, or NULL when AT_SYSINFO_EHDR is not defined.
 * 'tsk' is not used by this generic implementation.
 */
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
	struct vm_area_struct *gate = NULL;

#ifdef AT_SYSINFO_EHDR
	gate = &gate_vma;
#endif
	return gate;
}
| 2155 | |||
/*
 * Return 1 if 'addr' lies in [FIXADDR_USER_START, FIXADDR_USER_END),
 * otherwise 0.  Always 0 when AT_SYSINFO_EHDR is not defined.
 */
int in_gate_area_no_task(unsigned long addr)
{
	int inside = 0;

#ifdef AT_SYSINFO_EHDR
	if (addr >= FIXADDR_USER_START && addr < FIXADDR_USER_END)
		inside = 1;
#endif
	return inside;
}
| 2164 | |||
| 2165 | #endif /* __HAVE_ARCH_GATE_AREA */ | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c new file mode 100644 index 000000000000..a3b44a671cec --- /dev/null +++ b/mm/mempolicy.c | |||
| @@ -0,0 +1,1138 @@ | |||
| 1 | /* | ||
| 2 | * Simple NUMA memory policy for the Linux kernel. | ||
| 3 | * | ||
| 4 | * Copyright 2003,2004 Andi Kleen, SuSE Labs. | ||
| 5 | * Subject to the GNU Public License, version 2. | ||
| 6 | * | ||
| 7 | * NUMA policy allows the user to give hints in which node(s) memory should | ||
| 8 | * be allocated. | ||
| 9 | * | ||
| 10 | * Support four policies per VMA and per process: | ||
| 11 | * | ||
| 12 | * The VMA policy has priority over the process policy for a page fault. | ||
| 13 | * | ||
| 14 | * interleave Allocate memory interleaved over a set of nodes, | ||
| 15 | * with normal fallback if it fails. | ||
| 16 | * For VMA based allocations this interleaves based on the | ||
| 17 | * offset into the backing object or offset into the mapping | ||
| 18 | * for anonymous memory. For process policy a process counter | ||
| 19 | * is used. | ||
| 20 | * bind Only allocate memory on a specific set of nodes, | ||
| 21 | * no fallback. | ||
| 22 | * preferred Try a specific node first before normal fallback. | ||
| 23 | * As a special case node -1 here means do the allocation | ||
| 24 | * on the local CPU. This is normally identical to default, | ||
| 25 | * but useful to set in a VMA when you have a non default | ||
| 26 | * process policy. | ||
| 27 | * default Allocate on the local node first, or when on a VMA | ||
| 28 | * use the process policy. This is what Linux always did | ||
| 29 | * in a NUMA aware kernel and still does by, ahem, default. | ||
| 30 | * | ||
| 31 | * The process policy is applied for most non interrupt memory allocations | ||
| 32 | * in that process' context. Interrupts ignore the policies and always | ||
| 33 | * try to allocate on the local CPU. The VMA policy is only applied for memory | ||
| 34 | * allocations for a VMA in the VM. | ||
| 35 | * | ||
| 36 | * Currently there are a few corner cases in swapping where the policy | ||
| 37 | * is not applied, but the majority should be handled. When process policy | ||
| 38 | * is used it is not remembered over swap outs/swap ins. | ||
| 39 | * | ||
| 40 | * Only the highest zone in the zone hierarchy gets policied. Allocations | ||
| 41 | * requesting a lower zone just use default policy. This implies that | ||
| 42 | * on systems with highmem kernel lowmem allocations don't get policied. | ||
| 43 | * Same with GFP_DMA allocations. | ||
| 44 | * | ||
| 45 | * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between | ||
| 46 | * all users and remembered even when nobody has memory mapped. | ||
| 47 | */ | ||
| 48 | |||
| 49 | /* Notebook: | ||
| 50 | fix mmap readahead to honour policy and enable policy for any page cache | ||
| 51 | object | ||
| 52 | statistics for bigpages | ||
| 53 | global policy for page cache? currently it uses process policy. Requires | ||
| 54 | first item above. | ||
| 55 | handle mremap for shared memory (currently ignored for the policy) | ||
| 56 | grows down? | ||
| 57 | make bind policy root only? It can trigger oom much faster and the | ||
| 58 | kernel is not always grateful with that. | ||
| 59 | could replace all the switch()es with a mempolicy_ops structure. | ||
| 60 | */ | ||
| 61 | |||
| 62 | #include <linux/mempolicy.h> | ||
| 63 | #include <linux/mm.h> | ||
| 64 | #include <linux/highmem.h> | ||
| 65 | #include <linux/hugetlb.h> | ||
| 66 | #include <linux/kernel.h> | ||
| 67 | #include <linux/sched.h> | ||
| 68 | #include <linux/mm.h> | ||
| 69 | #include <linux/nodemask.h> | ||
| 70 | #include <linux/cpuset.h> | ||
| 71 | #include <linux/gfp.h> | ||
| 72 | #include <linux/slab.h> | ||
| 73 | #include <linux/string.h> | ||
| 74 | #include <linux/module.h> | ||
| 75 | #include <linux/interrupt.h> | ||
| 76 | #include <linux/init.h> | ||
| 77 | #include <linux/compat.h> | ||
| 78 | #include <linux/mempolicy.h> | ||
| 79 | #include <asm/tlbflush.h> | ||
| 80 | #include <asm/uaccess.h> | ||
| 81 | |||
| 82 | static kmem_cache_t *policy_cache; | ||
| 83 | static kmem_cache_t *sn_cache; | ||
| 84 | |||
| 85 | #define PDprintk(fmt...) | ||
| 86 | |||
| 87 | /* Highest zone. A specific allocation for a zone below that is not | ||
| 88 | policied. */ | ||
| 89 | static int policy_zone; | ||
| 90 | |||
| 91 | static struct mempolicy default_policy = { | ||
| 92 | .refcnt = ATOMIC_INIT(1), /* never free it */ | ||
| 93 | .policy = MPOL_DEFAULT, | ||
| 94 | }; | ||
| 95 | |||
| 96 | /* Check if all specified nodes are online */ | ||
| 97 | static int nodes_online(unsigned long *nodes) | ||
| 98 | { | ||
| 99 | DECLARE_BITMAP(online2, MAX_NUMNODES); | ||
| 100 | |||
| 101 | bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES); | ||
| 102 | if (bitmap_empty(online2, MAX_NUMNODES)) | ||
| 103 | set_bit(0, online2); | ||
| 104 | if (!bitmap_subset(nodes, online2, MAX_NUMNODES)) | ||
| 105 | return -EINVAL; | ||
| 106 | return 0; | ||
| 107 | } | ||
| 108 | |||
| 109 | /* Do sanity checking on a policy */ | ||
| 110 | static int mpol_check_policy(int mode, unsigned long *nodes) | ||
| 111 | { | ||
| 112 | int empty = bitmap_empty(nodes, MAX_NUMNODES); | ||
| 113 | |||
| 114 | switch (mode) { | ||
| 115 | case MPOL_DEFAULT: | ||
| 116 | if (!empty) | ||
| 117 | return -EINVAL; | ||
| 118 | break; | ||
| 119 | case MPOL_BIND: | ||
| 120 | case MPOL_INTERLEAVE: | ||
| 121 | /* Preferred will only use the first bit, but allow | ||
| 122 | more for now. */ | ||
| 123 | if (empty) | ||
| 124 | return -EINVAL; | ||
| 125 | break; | ||
| 126 | } | ||
| 127 | return nodes_online(nodes); | ||
| 128 | } | ||
| 129 | |||
| 130 | /* Copy a node mask from user space. */ | ||
| 131 | static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, | ||
| 132 | unsigned long maxnode, int mode) | ||
| 133 | { | ||
| 134 | unsigned long k; | ||
| 135 | unsigned long nlongs; | ||
| 136 | unsigned long endmask; | ||
| 137 | |||
| 138 | --maxnode; | ||
| 139 | bitmap_zero(nodes, MAX_NUMNODES); | ||
| 140 | if (maxnode == 0 || !nmask) | ||
| 141 | return 0; | ||
| 142 | |||
| 143 | nlongs = BITS_TO_LONGS(maxnode); | ||
| 144 | if ((maxnode % BITS_PER_LONG) == 0) | ||
| 145 | endmask = ~0UL; | ||
| 146 | else | ||
| 147 | endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; | ||
| 148 | |||
| 149 | /* When the user specified more nodes than supported just check | ||
| 150 | if the non supported part is all zero. */ | ||
| 151 | if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { | ||
| 152 | if (nlongs > PAGE_SIZE/sizeof(long)) | ||
| 153 | return -EINVAL; | ||
| 154 | for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { | ||
| 155 | unsigned long t; | ||
| 156 | if (get_user(t, nmask + k)) | ||
| 157 | return -EFAULT; | ||
| 158 | if (k == nlongs - 1) { | ||
| 159 | if (t & endmask) | ||
| 160 | return -EINVAL; | ||
| 161 | } else if (t) | ||
| 162 | return -EINVAL; | ||
| 163 | } | ||
| 164 | nlongs = BITS_TO_LONGS(MAX_NUMNODES); | ||
| 165 | endmask = ~0UL; | ||
| 166 | } | ||
| 167 | |||
| 168 | if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long))) | ||
| 169 | return -EFAULT; | ||
| 170 | nodes[nlongs-1] &= endmask; | ||
| 171 | /* Update current mems_allowed */ | ||
| 172 | cpuset_update_current_mems_allowed(); | ||
| 173 | /* Ignore nodes not set in current->mems_allowed */ | ||
| 174 | cpuset_restrict_to_mems_allowed(nodes); | ||
| 175 | return mpol_check_policy(mode, nodes); | ||
| 176 | } | ||
| 177 | |||
| 178 | /* Generate a custom zonelist for the BIND policy. */ | ||
| 179 | static struct zonelist *bind_zonelist(unsigned long *nodes) | ||
| 180 | { | ||
| 181 | struct zonelist *zl; | ||
| 182 | int num, max, nd; | ||
| 183 | |||
| 184 | max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); | ||
| 185 | zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); | ||
| 186 | if (!zl) | ||
| 187 | return NULL; | ||
| 188 | num = 0; | ||
| 189 | for (nd = find_first_bit(nodes, MAX_NUMNODES); | ||
| 190 | nd < MAX_NUMNODES; | ||
| 191 | nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) { | ||
| 192 | int k; | ||
| 193 | for (k = MAX_NR_ZONES-1; k >= 0; k--) { | ||
| 194 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; | ||
| 195 | if (!z->present_pages) | ||
| 196 | continue; | ||
| 197 | zl->zones[num++] = z; | ||
| 198 | if (k > policy_zone) | ||
| 199 | policy_zone = k; | ||
| 200 | } | ||
| 201 | } | ||
| 202 | BUG_ON(num >= max); | ||
| 203 | zl->zones[num] = NULL; | ||
| 204 | return zl; | ||
| 205 | } | ||
| 206 | |||
| 207 | /* Create a new policy */ | ||
| 208 | static struct mempolicy *mpol_new(int mode, unsigned long *nodes) | ||
| 209 | { | ||
| 210 | struct mempolicy *policy; | ||
| 211 | |||
| 212 | PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); | ||
| 213 | if (mode == MPOL_DEFAULT) | ||
| 214 | return NULL; | ||
| 215 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); | ||
| 216 | if (!policy) | ||
| 217 | return ERR_PTR(-ENOMEM); | ||
| 218 | atomic_set(&policy->refcnt, 1); | ||
| 219 | switch (mode) { | ||
| 220 | case MPOL_INTERLEAVE: | ||
| 221 | bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); | ||
| 222 | break; | ||
| 223 | case MPOL_PREFERRED: | ||
| 224 | policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); | ||
| 225 | if (policy->v.preferred_node >= MAX_NUMNODES) | ||
| 226 | policy->v.preferred_node = -1; | ||
| 227 | break; | ||
| 228 | case MPOL_BIND: | ||
| 229 | policy->v.zonelist = bind_zonelist(nodes); | ||
| 230 | if (policy->v.zonelist == NULL) { | ||
| 231 | kmem_cache_free(policy_cache, policy); | ||
| 232 | return ERR_PTR(-ENOMEM); | ||
| 233 | } | ||
| 234 | break; | ||
| 235 | } | ||
| 236 | policy->policy = mode; | ||
| 237 | return policy; | ||
| 238 | } | ||
| 239 | |||
| 240 | /* Ensure all existing pages follow the policy. */ | ||
| 241 | static int | ||
| 242 | verify_pages(struct mm_struct *mm, | ||
| 243 | unsigned long addr, unsigned long end, unsigned long *nodes) | ||
| 244 | { | ||
| 245 | while (addr < end) { | ||
| 246 | struct page *p; | ||
| 247 | pte_t *pte; | ||
| 248 | pmd_t *pmd; | ||
| 249 | pud_t *pud; | ||
| 250 | pgd_t *pgd; | ||
| 251 | pgd = pgd_offset(mm, addr); | ||
| 252 | if (pgd_none(*pgd)) { | ||
| 253 | unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK; | ||
| 254 | if (next > addr) | ||
| 255 | break; | ||
| 256 | addr = next; | ||
| 257 | continue; | ||
| 258 | } | ||
| 259 | pud = pud_offset(pgd, addr); | ||
| 260 | if (pud_none(*pud)) { | ||
| 261 | addr = (addr + PUD_SIZE) & PUD_MASK; | ||
| 262 | continue; | ||
| 263 | } | ||
| 264 | pmd = pmd_offset(pud, addr); | ||
| 265 | if (pmd_none(*pmd)) { | ||
| 266 | addr = (addr + PMD_SIZE) & PMD_MASK; | ||
| 267 | continue; | ||
| 268 | } | ||
| 269 | p = NULL; | ||
| 270 | pte = pte_offset_map(pmd, addr); | ||
| 271 | if (pte_present(*pte)) | ||
| 272 | p = pte_page(*pte); | ||
| 273 | pte_unmap(pte); | ||
| 274 | if (p) { | ||
| 275 | unsigned nid = page_to_nid(p); | ||
| 276 | if (!test_bit(nid, nodes)) | ||
| 277 | return -EIO; | ||
| 278 | } | ||
| 279 | addr += PAGE_SIZE; | ||
| 280 | } | ||
| 281 | return 0; | ||
| 282 | } | ||
| 283 | |||
| 284 | /* Step 1: check the range */ | ||
| 285 | static struct vm_area_struct * | ||
| 286 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | ||
| 287 | unsigned long *nodes, unsigned long flags) | ||
| 288 | { | ||
| 289 | int err; | ||
| 290 | struct vm_area_struct *first, *vma, *prev; | ||
| 291 | |||
| 292 | first = find_vma(mm, start); | ||
| 293 | if (!first) | ||
| 294 | return ERR_PTR(-EFAULT); | ||
| 295 | prev = NULL; | ||
| 296 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | ||
| 297 | if (!vma->vm_next && vma->vm_end < end) | ||
| 298 | return ERR_PTR(-EFAULT); | ||
| 299 | if (prev && prev->vm_end < vma->vm_start) | ||
| 300 | return ERR_PTR(-EFAULT); | ||
| 301 | if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { | ||
| 302 | err = verify_pages(vma->vm_mm, | ||
| 303 | vma->vm_start, vma->vm_end, nodes); | ||
| 304 | if (err) { | ||
| 305 | first = ERR_PTR(err); | ||
| 306 | break; | ||
| 307 | } | ||
| 308 | } | ||
| 309 | prev = vma; | ||
| 310 | } | ||
| 311 | return first; | ||
| 312 | } | ||
| 313 | |||
| 314 | /* Apply policy to a single VMA */ | ||
| 315 | static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) | ||
| 316 | { | ||
| 317 | int err = 0; | ||
| 318 | struct mempolicy *old = vma->vm_policy; | ||
| 319 | |||
| 320 | PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
| 321 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
| 322 | vma->vm_ops, vma->vm_file, | ||
| 323 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
| 324 | |||
| 325 | if (vma->vm_ops && vma->vm_ops->set_policy) | ||
| 326 | err = vma->vm_ops->set_policy(vma, new); | ||
| 327 | if (!err) { | ||
| 328 | mpol_get(new); | ||
| 329 | vma->vm_policy = new; | ||
| 330 | mpol_free(old); | ||
| 331 | } | ||
| 332 | return err; | ||
| 333 | } | ||
| 334 | |||
| 335 | /* Step 2: apply policy to a range and do splits. */ | ||
| 336 | static int mbind_range(struct vm_area_struct *vma, unsigned long start, | ||
| 337 | unsigned long end, struct mempolicy *new) | ||
| 338 | { | ||
| 339 | struct vm_area_struct *next; | ||
| 340 | int err; | ||
| 341 | |||
| 342 | err = 0; | ||
| 343 | for (; vma && vma->vm_start < end; vma = next) { | ||
| 344 | next = vma->vm_next; | ||
| 345 | if (vma->vm_start < start) | ||
| 346 | err = split_vma(vma->vm_mm, vma, start, 1); | ||
| 347 | if (!err && vma->vm_end > end) | ||
| 348 | err = split_vma(vma->vm_mm, vma, end, 0); | ||
| 349 | if (!err) | ||
| 350 | err = policy_vma(vma, new); | ||
| 351 | if (err) | ||
| 352 | break; | ||
| 353 | } | ||
| 354 | return err; | ||
| 355 | } | ||
| 356 | |||
| 357 | /* Change policy for a memory range */ | ||
| 358 | asmlinkage long sys_mbind(unsigned long start, unsigned long len, | ||
| 359 | unsigned long mode, | ||
| 360 | unsigned long __user *nmask, unsigned long maxnode, | ||
| 361 | unsigned flags) | ||
| 362 | { | ||
| 363 | struct vm_area_struct *vma; | ||
| 364 | struct mm_struct *mm = current->mm; | ||
| 365 | struct mempolicy *new; | ||
| 366 | unsigned long end; | ||
| 367 | DECLARE_BITMAP(nodes, MAX_NUMNODES); | ||
| 368 | int err; | ||
| 369 | |||
| 370 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) | ||
| 371 | return -EINVAL; | ||
| 372 | if (start & ~PAGE_MASK) | ||
| 373 | return -EINVAL; | ||
| 374 | if (mode == MPOL_DEFAULT) | ||
| 375 | flags &= ~MPOL_MF_STRICT; | ||
| 376 | len = (len + PAGE_SIZE - 1) & PAGE_MASK; | ||
| 377 | end = start + len; | ||
| 378 | if (end < start) | ||
| 379 | return -EINVAL; | ||
| 380 | if (end == start) | ||
| 381 | return 0; | ||
| 382 | |||
| 383 | err = get_nodes(nodes, nmask, maxnode, mode); | ||
| 384 | if (err) | ||
| 385 | return err; | ||
| 386 | |||
| 387 | new = mpol_new(mode, nodes); | ||
| 388 | if (IS_ERR(new)) | ||
| 389 | return PTR_ERR(new); | ||
| 390 | |||
| 391 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | ||
| 392 | mode,nodes[0]); | ||
| 393 | |||
| 394 | down_write(&mm->mmap_sem); | ||
| 395 | vma = check_range(mm, start, end, nodes, flags); | ||
| 396 | err = PTR_ERR(vma); | ||
| 397 | if (!IS_ERR(vma)) | ||
| 398 | err = mbind_range(vma, start, end, new); | ||
| 399 | up_write(&mm->mmap_sem); | ||
| 400 | mpol_free(new); | ||
| 401 | return err; | ||
| 402 | } | ||
| 403 | |||
| 404 | /* Set the process memory policy */ | ||
| 405 | asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | ||
| 406 | unsigned long maxnode) | ||
| 407 | { | ||
| 408 | int err; | ||
| 409 | struct mempolicy *new; | ||
| 410 | DECLARE_BITMAP(nodes, MAX_NUMNODES); | ||
| 411 | |||
| 412 | if (mode > MPOL_MAX) | ||
| 413 | return -EINVAL; | ||
| 414 | err = get_nodes(nodes, nmask, maxnode, mode); | ||
| 415 | if (err) | ||
| 416 | return err; | ||
| 417 | new = mpol_new(mode, nodes); | ||
| 418 | if (IS_ERR(new)) | ||
| 419 | return PTR_ERR(new); | ||
| 420 | mpol_free(current->mempolicy); | ||
| 421 | current->mempolicy = new; | ||
| 422 | if (new && new->policy == MPOL_INTERLEAVE) | ||
| 423 | current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); | ||
| 424 | return 0; | ||
| 425 | } | ||
| 426 | |||
| 427 | /* Fill a zone bitmap for a policy */ | ||
| 428 | static void get_zonemask(struct mempolicy *p, unsigned long *nodes) | ||
| 429 | { | ||
| 430 | int i; | ||
| 431 | |||
| 432 | bitmap_zero(nodes, MAX_NUMNODES); | ||
| 433 | switch (p->policy) { | ||
| 434 | case MPOL_BIND: | ||
| 435 | for (i = 0; p->v.zonelist->zones[i]; i++) | ||
| 436 | __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); | ||
| 437 | break; | ||
| 438 | case MPOL_DEFAULT: | ||
| 439 | break; | ||
| 440 | case MPOL_INTERLEAVE: | ||
| 441 | bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); | ||
| 442 | break; | ||
| 443 | case MPOL_PREFERRED: | ||
| 444 | /* or use current node instead of online map? */ | ||
| 445 | if (p->v.preferred_node < 0) | ||
| 446 | bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES); | ||
| 447 | else | ||
| 448 | __set_bit(p->v.preferred_node, nodes); | ||
| 449 | break; | ||
| 450 | default: | ||
| 451 | BUG(); | ||
| 452 | } | ||
| 453 | } | ||
| 454 | |||
| 455 | static int lookup_node(struct mm_struct *mm, unsigned long addr) | ||
| 456 | { | ||
| 457 | struct page *p; | ||
| 458 | int err; | ||
| 459 | |||
| 460 | err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); | ||
| 461 | if (err >= 0) { | ||
| 462 | err = page_to_nid(p); | ||
| 463 | put_page(p); | ||
| 464 | } | ||
| 465 | return err; | ||
| 466 | } | ||
| 467 | |||
| 468 | /* Copy a kernel node mask to user space */ | ||
| 469 | static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | ||
| 470 | void *nodes, unsigned nbytes) | ||
| 471 | { | ||
| 472 | unsigned long copy = ALIGN(maxnode-1, 64) / 8; | ||
| 473 | |||
| 474 | if (copy > nbytes) { | ||
| 475 | if (copy > PAGE_SIZE) | ||
| 476 | return -EINVAL; | ||
| 477 | if (clear_user((char __user *)mask + nbytes, copy - nbytes)) | ||
| 478 | return -EFAULT; | ||
| 479 | copy = nbytes; | ||
| 480 | } | ||
| 481 | return copy_to_user(mask, nodes, copy) ? -EFAULT : 0; | ||
| 482 | } | ||
| 483 | |||
| 484 | /* Retrieve NUMA policy */ | ||
| 485 | asmlinkage long sys_get_mempolicy(int __user *policy, | ||
| 486 | unsigned long __user *nmask, | ||
| 487 | unsigned long maxnode, | ||
| 488 | unsigned long addr, unsigned long flags) | ||
| 489 | { | ||
| 490 | int err, pval; | ||
| 491 | struct mm_struct *mm = current->mm; | ||
| 492 | struct vm_area_struct *vma = NULL; | ||
| 493 | struct mempolicy *pol = current->mempolicy; | ||
| 494 | |||
| 495 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) | ||
| 496 | return -EINVAL; | ||
| 497 | if (nmask != NULL && maxnode < MAX_NUMNODES) | ||
| 498 | return -EINVAL; | ||
| 499 | if (flags & MPOL_F_ADDR) { | ||
| 500 | down_read(&mm->mmap_sem); | ||
| 501 | vma = find_vma_intersection(mm, addr, addr+1); | ||
| 502 | if (!vma) { | ||
| 503 | up_read(&mm->mmap_sem); | ||
| 504 | return -EFAULT; | ||
| 505 | } | ||
| 506 | if (vma->vm_ops && vma->vm_ops->get_policy) | ||
| 507 | pol = vma->vm_ops->get_policy(vma, addr); | ||
| 508 | else | ||
| 509 | pol = vma->vm_policy; | ||
| 510 | } else if (addr) | ||
| 511 | return -EINVAL; | ||
| 512 | |||
| 513 | if (!pol) | ||
| 514 | pol = &default_policy; | ||
| 515 | |||
| 516 | if (flags & MPOL_F_NODE) { | ||
| 517 | if (flags & MPOL_F_ADDR) { | ||
| 518 | err = lookup_node(mm, addr); | ||
| 519 | if (err < 0) | ||
| 520 | goto out; | ||
| 521 | pval = err; | ||
| 522 | } else if (pol == current->mempolicy && | ||
| 523 | pol->policy == MPOL_INTERLEAVE) { | ||
| 524 | pval = current->il_next; | ||
| 525 | } else { | ||
| 526 | err = -EINVAL; | ||
| 527 | goto out; | ||
| 528 | } | ||
| 529 | } else | ||
| 530 | pval = pol->policy; | ||
| 531 | |||
| 532 | if (vma) { | ||
| 533 | up_read(¤t->mm->mmap_sem); | ||
| 534 | vma = NULL; | ||
| 535 | } | ||
| 536 | |||
| 537 | if (policy && put_user(pval, policy)) | ||
| 538 | return -EFAULT; | ||
| 539 | |||
| 540 | err = 0; | ||
| 541 | if (nmask) { | ||
| 542 | DECLARE_BITMAP(nodes, MAX_NUMNODES); | ||
| 543 | get_zonemask(pol, nodes); | ||
| 544 | err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes)); | ||
| 545 | } | ||
| 546 | |||
| 547 | out: | ||
| 548 | if (vma) | ||
| 549 | up_read(¤t->mm->mmap_sem); | ||
| 550 | return err; | ||
| 551 | } | ||
| 552 | |||
| 553 | #ifdef CONFIG_COMPAT | ||
| 554 | |||
| 555 | asmlinkage long compat_sys_get_mempolicy(int __user *policy, | ||
| 556 | compat_ulong_t __user *nmask, | ||
| 557 | compat_ulong_t maxnode, | ||
| 558 | compat_ulong_t addr, compat_ulong_t flags) | ||
| 559 | { | ||
| 560 | long err; | ||
| 561 | unsigned long __user *nm = NULL; | ||
| 562 | unsigned long nr_bits, alloc_size; | ||
| 563 | DECLARE_BITMAP(bm, MAX_NUMNODES); | ||
| 564 | |||
| 565 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); | ||
| 566 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; | ||
| 567 | |||
| 568 | if (nmask) | ||
| 569 | nm = compat_alloc_user_space(alloc_size); | ||
| 570 | |||
| 571 | err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); | ||
| 572 | |||
| 573 | if (!err && nmask) { | ||
| 574 | err = copy_from_user(bm, nm, alloc_size); | ||
| 575 | /* ensure entire bitmap is zeroed */ | ||
| 576 | err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); | ||
| 577 | err |= compat_put_bitmap(nmask, bm, nr_bits); | ||
| 578 | } | ||
| 579 | |||
| 580 | return err; | ||
| 581 | } | ||
| 582 | |||
| 583 | asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, | ||
| 584 | compat_ulong_t maxnode) | ||
| 585 | { | ||
| 586 | long err = 0; | ||
| 587 | unsigned long __user *nm = NULL; | ||
| 588 | unsigned long nr_bits, alloc_size; | ||
| 589 | DECLARE_BITMAP(bm, MAX_NUMNODES); | ||
| 590 | |||
| 591 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); | ||
| 592 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; | ||
| 593 | |||
| 594 | if (nmask) { | ||
| 595 | err = compat_get_bitmap(bm, nmask, nr_bits); | ||
| 596 | nm = compat_alloc_user_space(alloc_size); | ||
| 597 | err |= copy_to_user(nm, bm, alloc_size); | ||
| 598 | } | ||
| 599 | |||
| 600 | if (err) | ||
| 601 | return -EFAULT; | ||
| 602 | |||
| 603 | return sys_set_mempolicy(mode, nm, nr_bits+1); | ||
| 604 | } | ||
| 605 | |||
| 606 | asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | ||
| 607 | compat_ulong_t mode, compat_ulong_t __user *nmask, | ||
| 608 | compat_ulong_t maxnode, compat_ulong_t flags) | ||
| 609 | { | ||
| 610 | long err = 0; | ||
| 611 | unsigned long __user *nm = NULL; | ||
| 612 | unsigned long nr_bits, alloc_size; | ||
| 613 | DECLARE_BITMAP(bm, MAX_NUMNODES); | ||
| 614 | |||
| 615 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); | ||
| 616 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; | ||
| 617 | |||
| 618 | if (nmask) { | ||
| 619 | err = compat_get_bitmap(bm, nmask, nr_bits); | ||
| 620 | nm = compat_alloc_user_space(alloc_size); | ||
| 621 | err |= copy_to_user(nm, bm, alloc_size); | ||
| 622 | } | ||
| 623 | |||
| 624 | if (err) | ||
| 625 | return -EFAULT; | ||
| 626 | |||
| 627 | return sys_mbind(start, len, mode, nm, nr_bits+1, flags); | ||
| 628 | } | ||
| 629 | |||
| 630 | #endif | ||
| 631 | |||
| 632 | /* Return effective policy for a VMA */ | ||
| 633 | static struct mempolicy * | ||
| 634 | get_vma_policy(struct vm_area_struct *vma, unsigned long addr) | ||
| 635 | { | ||
| 636 | struct mempolicy *pol = current->mempolicy; | ||
| 637 | |||
| 638 | if (vma) { | ||
| 639 | if (vma->vm_ops && vma->vm_ops->get_policy) | ||
| 640 | pol = vma->vm_ops->get_policy(vma, addr); | ||
| 641 | else if (vma->vm_policy && | ||
| 642 | vma->vm_policy->policy != MPOL_DEFAULT) | ||
| 643 | pol = vma->vm_policy; | ||
| 644 | } | ||
| 645 | if (!pol) | ||
| 646 | pol = &default_policy; | ||
| 647 | return pol; | ||
| 648 | } | ||
| 649 | |||
| 650 | /* Return a zonelist representing a mempolicy */ | ||
| 651 | static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy) | ||
| 652 | { | ||
| 653 | int nd; | ||
| 654 | |||
| 655 | switch (policy->policy) { | ||
| 656 | case MPOL_PREFERRED: | ||
| 657 | nd = policy->v.preferred_node; | ||
| 658 | if (nd < 0) | ||
| 659 | nd = numa_node_id(); | ||
| 660 | break; | ||
| 661 | case MPOL_BIND: | ||
| 662 | /* Lower zones don't get a policy applied */ | ||
| 663 | /* Careful: current->mems_allowed might have moved */ | ||
| 664 | if (gfp >= policy_zone) | ||
| 665 | if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) | ||
| 666 | return policy->v.zonelist; | ||
| 667 | /*FALL THROUGH*/ | ||
| 668 | case MPOL_INTERLEAVE: /* should not happen */ | ||
| 669 | case MPOL_DEFAULT: | ||
| 670 | nd = numa_node_id(); | ||
| 671 | break; | ||
| 672 | default: | ||
| 673 | nd = 0; | ||
| 674 | BUG(); | ||
| 675 | } | ||
| 676 | return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK); | ||
| 677 | } | ||
| 678 | |||
| 679 | /* Do dynamic interleaving for a process */ | ||
| 680 | static unsigned interleave_nodes(struct mempolicy *policy) | ||
| 681 | { | ||
| 682 | unsigned nid, next; | ||
| 683 | struct task_struct *me = current; | ||
| 684 | |||
| 685 | nid = me->il_next; | ||
| 686 | BUG_ON(nid >= MAX_NUMNODES); | ||
| 687 | next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid); | ||
| 688 | if (next >= MAX_NUMNODES) | ||
| 689 | next = find_first_bit(policy->v.nodes, MAX_NUMNODES); | ||
| 690 | me->il_next = next; | ||
| 691 | return nid; | ||
| 692 | } | ||
| 693 | |||
| 694 | /* Do static interleaving for a VMA with known offset. */ | ||
| 695 | static unsigned offset_il_node(struct mempolicy *pol, | ||
| 696 | struct vm_area_struct *vma, unsigned long off) | ||
| 697 | { | ||
| 698 | unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); | ||
| 699 | unsigned target = (unsigned)off % nnodes; | ||
| 700 | int c; | ||
| 701 | int nid = -1; | ||
| 702 | |||
| 703 | c = 0; | ||
| 704 | do { | ||
| 705 | nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); | ||
| 706 | c++; | ||
| 707 | } while (c <= target); | ||
| 708 | BUG_ON(nid >= MAX_NUMNODES); | ||
| 709 | BUG_ON(!test_bit(nid, pol->v.nodes)); | ||
| 710 | return nid; | ||
| 711 | } | ||
| 712 | |||
| 713 | /* Allocate a page in interleaved policy. | ||
| 714 | Own path because it needs to do special accounting. */ | ||
| 715 | static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid) | ||
| 716 | { | ||
| 717 | struct zonelist *zl; | ||
| 718 | struct page *page; | ||
| 719 | |||
| 720 | BUG_ON(!node_online(nid)); | ||
| 721 | zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); | ||
| 722 | page = __alloc_pages(gfp, order, zl); | ||
| 723 | if (page && page_zone(page) == zl->zones[0]) { | ||
| 724 | zl->zones[0]->pageset[get_cpu()].interleave_hit++; | ||
| 725 | put_cpu(); | ||
| 726 | } | ||
| 727 | return page; | ||
| 728 | } | ||
| 729 | |||
| 730 | /** | ||
| 731 | * alloc_page_vma - Allocate a page for a VMA. | ||
| 732 | * | ||
| 733 | * @gfp: | ||
| 734 | * %GFP_USER user allocation. | ||
| 735 | * %GFP_KERNEL kernel allocations, | ||
| 736 | * %GFP_HIGHMEM highmem/user allocations, | ||
| 737 | * %GFP_FS allocation should not call back into a file system. | ||
| 738 | * %GFP_ATOMIC don't sleep. | ||
| 739 | * | ||
| 740 | * @vma: Pointer to VMA or NULL if not available. | ||
| 741 | * @addr: Virtual Address of the allocation. Must be inside the VMA. | ||
| 742 | * | ||
| 743 | * This function allocates a page from the kernel page pool and applies | ||
| 744 | * a NUMA policy associated with the VMA or the current process. | ||
| 745 | * When VMA is not NULL caller must hold down_read on the mmap_sem of the | ||
| 746 | * mm_struct of the VMA to prevent it from going away. Should be used for | ||
| 747 | * all allocations for pages that will be mapped into | ||
| 748 | * user space. Returns NULL when no page can be allocated. | ||
| 749 | * | ||
| 750 | * Should be called with the mm_sem of the vma hold. | ||
| 751 | */ | ||
| 752 | struct page * | ||
| 753 | alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr) | ||
| 754 | { | ||
| 755 | struct mempolicy *pol = get_vma_policy(vma, addr); | ||
| 756 | |||
| 757 | cpuset_update_current_mems_allowed(); | ||
| 758 | |||
| 759 | if (unlikely(pol->policy == MPOL_INTERLEAVE)) { | ||
| 760 | unsigned nid; | ||
| 761 | if (vma) { | ||
| 762 | unsigned long off; | ||
| 763 | BUG_ON(addr >= vma->vm_end); | ||
| 764 | BUG_ON(addr < vma->vm_start); | ||
| 765 | off = vma->vm_pgoff; | ||
| 766 | off += (addr - vma->vm_start) >> PAGE_SHIFT; | ||
| 767 | nid = offset_il_node(pol, vma, off); | ||
| 768 | } else { | ||
| 769 | /* fall back to process interleaving */ | ||
| 770 | nid = interleave_nodes(pol); | ||
| 771 | } | ||
| 772 | return alloc_page_interleave(gfp, 0, nid); | ||
| 773 | } | ||
| 774 | return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); | ||
| 775 | } | ||
| 776 | |||
| 777 | /** | ||
| 778 | * alloc_pages_current - Allocate pages. | ||
| 779 | * | ||
| 780 | * @gfp: | ||
| 781 | * %GFP_USER user allocation, | ||
| 782 | * %GFP_KERNEL kernel allocation, | ||
| 783 | * %GFP_HIGHMEM highmem allocation, | ||
| 784 | * %GFP_FS don't call back into a file system. | ||
| 785 | * %GFP_ATOMIC don't sleep. | ||
| 786 | * @order: Power of two of allocation size in pages. 0 is a single page. | ||
| 787 | * | ||
| 788 | * Allocate a page from the kernel page pool. When not in | ||
| 789 | * interrupt context and apply the current process NUMA policy. | ||
| 790 | * Returns NULL when no page can be allocated. | ||
| 791 | * | ||
| 792 | * Don't call cpuset_update_current_mems_allowed() unless | ||
| 793 | * 1) it's ok to take cpuset_sem (can WAIT), and | ||
| 794 | * 2) allocating for current task (not interrupt). | ||
| 795 | */ | ||
| 796 | struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order) | ||
| 797 | { | ||
| 798 | struct mempolicy *pol = current->mempolicy; | ||
| 799 | |||
| 800 | if ((gfp & __GFP_WAIT) && !in_interrupt()) | ||
| 801 | cpuset_update_current_mems_allowed(); | ||
| 802 | if (!pol || in_interrupt()) | ||
| 803 | pol = &default_policy; | ||
| 804 | if (pol->policy == MPOL_INTERLEAVE) | ||
| 805 | return alloc_page_interleave(gfp, order, interleave_nodes(pol)); | ||
| 806 | return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); | ||
| 807 | } | ||
| 808 | EXPORT_SYMBOL(alloc_pages_current); | ||
| 809 | |||
| 810 | /* Slow path of a mempolicy copy */ | ||
| 811 | struct mempolicy *__mpol_copy(struct mempolicy *old) | ||
| 812 | { | ||
| 813 | struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); | ||
| 814 | |||
| 815 | if (!new) | ||
| 816 | return ERR_PTR(-ENOMEM); | ||
| 817 | *new = *old; | ||
| 818 | atomic_set(&new->refcnt, 1); | ||
| 819 | if (new->policy == MPOL_BIND) { | ||
| 820 | int sz = ksize(old->v.zonelist); | ||
| 821 | new->v.zonelist = kmalloc(sz, SLAB_KERNEL); | ||
| 822 | if (!new->v.zonelist) { | ||
| 823 | kmem_cache_free(policy_cache, new); | ||
| 824 | return ERR_PTR(-ENOMEM); | ||
| 825 | } | ||
| 826 | memcpy(new->v.zonelist, old->v.zonelist, sz); | ||
| 827 | } | ||
| 828 | return new; | ||
| 829 | } | ||
| 830 | |||
| 831 | /* Slow path of a mempolicy comparison */ | ||
| 832 | int __mpol_equal(struct mempolicy *a, struct mempolicy *b) | ||
| 833 | { | ||
| 834 | if (!a || !b) | ||
| 835 | return 0; | ||
| 836 | if (a->policy != b->policy) | ||
| 837 | return 0; | ||
| 838 | switch (a->policy) { | ||
| 839 | case MPOL_DEFAULT: | ||
| 840 | return 1; | ||
| 841 | case MPOL_INTERLEAVE: | ||
| 842 | return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); | ||
| 843 | case MPOL_PREFERRED: | ||
| 844 | return a->v.preferred_node == b->v.preferred_node; | ||
| 845 | case MPOL_BIND: { | ||
| 846 | int i; | ||
| 847 | for (i = 0; a->v.zonelist->zones[i]; i++) | ||
| 848 | if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i]) | ||
| 849 | return 0; | ||
| 850 | return b->v.zonelist->zones[i] == NULL; | ||
| 851 | } | ||
| 852 | default: | ||
| 853 | BUG(); | ||
| 854 | return 0; | ||
| 855 | } | ||
| 856 | } | ||
| 857 | |||
| 858 | /* Slow path of a mpol destructor. */ | ||
| 859 | void __mpol_free(struct mempolicy *p) | ||
| 860 | { | ||
| 861 | if (!atomic_dec_and_test(&p->refcnt)) | ||
| 862 | return; | ||
| 863 | if (p->policy == MPOL_BIND) | ||
| 864 | kfree(p->v.zonelist); | ||
| 865 | p->policy = MPOL_DEFAULT; | ||
| 866 | kmem_cache_free(policy_cache, p); | ||
| 867 | } | ||
| 868 | |||
| 869 | /* | ||
| 870 | * Hugetlb policy. Same as above, just works with node numbers instead of | ||
| 871 | * zonelists. | ||
| 872 | */ | ||
| 873 | |||
| 874 | /* Find first node suitable for an allocation */ | ||
| 875 | int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) | ||
| 876 | { | ||
| 877 | struct mempolicy *pol = get_vma_policy(vma, addr); | ||
| 878 | |||
| 879 | switch (pol->policy) { | ||
| 880 | case MPOL_DEFAULT: | ||
| 881 | return numa_node_id(); | ||
| 882 | case MPOL_BIND: | ||
| 883 | return pol->v.zonelist->zones[0]->zone_pgdat->node_id; | ||
| 884 | case MPOL_INTERLEAVE: | ||
| 885 | return interleave_nodes(pol); | ||
| 886 | case MPOL_PREFERRED: | ||
| 887 | return pol->v.preferred_node >= 0 ? | ||
| 888 | pol->v.preferred_node : numa_node_id(); | ||
| 889 | } | ||
| 890 | BUG(); | ||
| 891 | return 0; | ||
| 892 | } | ||
| 893 | |||
| 894 | /* Find secondary valid nodes for an allocation */ | ||
| 895 | int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) | ||
| 896 | { | ||
| 897 | struct mempolicy *pol = get_vma_policy(vma, addr); | ||
| 898 | |||
| 899 | switch (pol->policy) { | ||
| 900 | case MPOL_PREFERRED: | ||
| 901 | case MPOL_DEFAULT: | ||
| 902 | case MPOL_INTERLEAVE: | ||
| 903 | return 1; | ||
| 904 | case MPOL_BIND: { | ||
| 905 | struct zone **z; | ||
| 906 | for (z = pol->v.zonelist->zones; *z; z++) | ||
| 907 | if ((*z)->zone_pgdat->node_id == nid) | ||
| 908 | return 1; | ||
| 909 | return 0; | ||
| 910 | } | ||
| 911 | default: | ||
| 912 | BUG(); | ||
| 913 | return 0; | ||
| 914 | } | ||
| 915 | } | ||
| 916 | |||
| 917 | /* | ||
| 918 | * Shared memory backing store policy support. | ||
| 919 | * | ||
| 920 | * Remember policies even when nobody has shared memory mapped. | ||
| 921 | * The policies are kept in Red-Black tree linked from the inode. | ||
| 922 | * They are protected by the sp->lock spinlock, which should be held | ||
| 923 | * for any accesses to the tree. | ||
| 924 | */ | ||
| 925 | |||
| 926 | /* lookup first element intersecting start-end */ | ||
| 927 | /* Caller holds sp->lock */ | ||
| 928 | static struct sp_node * | ||
| 929 | sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) | ||
| 930 | { | ||
| 931 | struct rb_node *n = sp->root.rb_node; | ||
| 932 | |||
| 933 | while (n) { | ||
| 934 | struct sp_node *p = rb_entry(n, struct sp_node, nd); | ||
| 935 | |||
| 936 | if (start >= p->end) | ||
| 937 | n = n->rb_right; | ||
| 938 | else if (end <= p->start) | ||
| 939 | n = n->rb_left; | ||
| 940 | else | ||
| 941 | break; | ||
| 942 | } | ||
| 943 | if (!n) | ||
| 944 | return NULL; | ||
| 945 | for (;;) { | ||
| 946 | struct sp_node *w = NULL; | ||
| 947 | struct rb_node *prev = rb_prev(n); | ||
| 948 | if (!prev) | ||
| 949 | break; | ||
| 950 | w = rb_entry(prev, struct sp_node, nd); | ||
| 951 | if (w->end <= start) | ||
| 952 | break; | ||
| 953 | n = prev; | ||
| 954 | } | ||
| 955 | return rb_entry(n, struct sp_node, nd); | ||
| 956 | } | ||
| 957 | |||
| 958 | /* Insert a new shared policy into the list. */ | ||
| 959 | /* Caller holds sp->lock */ | ||
| 960 | static void sp_insert(struct shared_policy *sp, struct sp_node *new) | ||
| 961 | { | ||
| 962 | struct rb_node **p = &sp->root.rb_node; | ||
| 963 | struct rb_node *parent = NULL; | ||
| 964 | struct sp_node *nd; | ||
| 965 | |||
| 966 | while (*p) { | ||
| 967 | parent = *p; | ||
| 968 | nd = rb_entry(parent, struct sp_node, nd); | ||
| 969 | if (new->start < nd->start) | ||
| 970 | p = &(*p)->rb_left; | ||
| 971 | else if (new->end > nd->end) | ||
| 972 | p = &(*p)->rb_right; | ||
| 973 | else | ||
| 974 | BUG(); | ||
| 975 | } | ||
| 976 | rb_link_node(&new->nd, parent, p); | ||
| 977 | rb_insert_color(&new->nd, &sp->root); | ||
| 978 | PDprintk("inserting %lx-%lx: %d\n", new->start, new->end, | ||
| 979 | new->policy ? new->policy->policy : 0); | ||
| 980 | } | ||
| 981 | |||
| 982 | /* Find shared policy intersecting idx */ | ||
| 983 | struct mempolicy * | ||
| 984 | mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) | ||
| 985 | { | ||
| 986 | struct mempolicy *pol = NULL; | ||
| 987 | struct sp_node *sn; | ||
| 988 | |||
| 989 | if (!sp->root.rb_node) | ||
| 990 | return NULL; | ||
| 991 | spin_lock(&sp->lock); | ||
| 992 | sn = sp_lookup(sp, idx, idx+1); | ||
| 993 | if (sn) { | ||
| 994 | mpol_get(sn->policy); | ||
| 995 | pol = sn->policy; | ||
| 996 | } | ||
| 997 | spin_unlock(&sp->lock); | ||
| 998 | return pol; | ||
| 999 | } | ||
| 1000 | |||
| 1001 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) | ||
| 1002 | { | ||
| 1003 | PDprintk("deleting %lx-l%x\n", n->start, n->end); | ||
| 1004 | rb_erase(&n->nd, &sp->root); | ||
| 1005 | mpol_free(n->policy); | ||
| 1006 | kmem_cache_free(sn_cache, n); | ||
| 1007 | } | ||
| 1008 | |||
| 1009 | struct sp_node * | ||
| 1010 | sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) | ||
| 1011 | { | ||
| 1012 | struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | ||
| 1013 | |||
| 1014 | if (!n) | ||
| 1015 | return NULL; | ||
| 1016 | n->start = start; | ||
| 1017 | n->end = end; | ||
| 1018 | mpol_get(pol); | ||
| 1019 | n->policy = pol; | ||
| 1020 | return n; | ||
| 1021 | } | ||
| 1022 | |||
| 1023 | /* Replace a policy range. */ | ||
| 1024 | static int shared_policy_replace(struct shared_policy *sp, unsigned long start, | ||
| 1025 | unsigned long end, struct sp_node *new) | ||
| 1026 | { | ||
| 1027 | struct sp_node *n, *new2 = NULL; | ||
| 1028 | |||
| 1029 | restart: | ||
| 1030 | spin_lock(&sp->lock); | ||
| 1031 | n = sp_lookup(sp, start, end); | ||
| 1032 | /* Take care of old policies in the same range. */ | ||
| 1033 | while (n && n->start < end) { | ||
| 1034 | struct rb_node *next = rb_next(&n->nd); | ||
| 1035 | if (n->start >= start) { | ||
| 1036 | if (n->end <= end) | ||
| 1037 | sp_delete(sp, n); | ||
| 1038 | else | ||
| 1039 | n->start = end; | ||
| 1040 | } else { | ||
| 1041 | /* Old policy spanning whole new range. */ | ||
| 1042 | if (n->end > end) { | ||
| 1043 | if (!new2) { | ||
| 1044 | spin_unlock(&sp->lock); | ||
| 1045 | new2 = sp_alloc(end, n->end, n->policy); | ||
| 1046 | if (!new2) | ||
| 1047 | return -ENOMEM; | ||
| 1048 | goto restart; | ||
| 1049 | } | ||
| 1050 | n->end = start; | ||
| 1051 | sp_insert(sp, new2); | ||
| 1052 | new2 = NULL; | ||
| 1053 | break; | ||
| 1054 | } else | ||
| 1055 | n->end = start; | ||
| 1056 | } | ||
| 1057 | if (!next) | ||
| 1058 | break; | ||
| 1059 | n = rb_entry(next, struct sp_node, nd); | ||
| 1060 | } | ||
| 1061 | if (new) | ||
| 1062 | sp_insert(sp, new); | ||
| 1063 | spin_unlock(&sp->lock); | ||
| 1064 | if (new2) { | ||
| 1065 | mpol_free(new2->policy); | ||
| 1066 | kmem_cache_free(sn_cache, new2); | ||
| 1067 | } | ||
| 1068 | return 0; | ||
| 1069 | } | ||
| 1070 | |||
| 1071 | int mpol_set_shared_policy(struct shared_policy *info, | ||
| 1072 | struct vm_area_struct *vma, struct mempolicy *npol) | ||
| 1073 | { | ||
| 1074 | int err; | ||
| 1075 | struct sp_node *new = NULL; | ||
| 1076 | unsigned long sz = vma_pages(vma); | ||
| 1077 | |||
| 1078 | PDprintk("set_shared_policy %lx sz %lu %d %lx\n", | ||
| 1079 | vma->vm_pgoff, | ||
| 1080 | sz, npol? npol->policy : -1, | ||
| 1081 | npol ? npol->v.nodes[0] : -1); | ||
| 1082 | |||
| 1083 | if (npol) { | ||
| 1084 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); | ||
| 1085 | if (!new) | ||
| 1086 | return -ENOMEM; | ||
| 1087 | } | ||
| 1088 | err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); | ||
| 1089 | if (err && new) | ||
| 1090 | kmem_cache_free(sn_cache, new); | ||
| 1091 | return err; | ||
| 1092 | } | ||
| 1093 | |||
| 1094 | /* Free a backing policy store on inode delete. */ | ||
| 1095 | void mpol_free_shared_policy(struct shared_policy *p) | ||
| 1096 | { | ||
| 1097 | struct sp_node *n; | ||
| 1098 | struct rb_node *next; | ||
| 1099 | |||
| 1100 | if (!p->root.rb_node) | ||
| 1101 | return; | ||
| 1102 | spin_lock(&p->lock); | ||
| 1103 | next = rb_first(&p->root); | ||
| 1104 | while (next) { | ||
| 1105 | n = rb_entry(next, struct sp_node, nd); | ||
| 1106 | next = rb_next(&n->nd); | ||
| 1107 | mpol_free(n->policy); | ||
| 1108 | kmem_cache_free(sn_cache, n); | ||
| 1109 | } | ||
| 1110 | spin_unlock(&p->lock); | ||
| 1111 | p->root = RB_ROOT; | ||
| 1112 | } | ||
| 1113 | |||
| 1114 | /* assumes fs == KERNEL_DS */ | ||
| 1115 | void __init numa_policy_init(void) | ||
| 1116 | { | ||
| 1117 | policy_cache = kmem_cache_create("numa_policy", | ||
| 1118 | sizeof(struct mempolicy), | ||
| 1119 | 0, SLAB_PANIC, NULL, NULL); | ||
| 1120 | |||
| 1121 | sn_cache = kmem_cache_create("shared_policy_node", | ||
| 1122 | sizeof(struct sp_node), | ||
| 1123 | 0, SLAB_PANIC, NULL, NULL); | ||
| 1124 | |||
| 1125 | /* Set interleaving policy for system init. This way not all | ||
| 1126 | the data structures allocated at system boot end up in node zero. */ | ||
| 1127 | |||
| 1128 | if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), | ||
| 1129 | MAX_NUMNODES) < 0) | ||
| 1130 | printk("numa_policy_init: interleaving failed\n"); | ||
| 1131 | } | ||
| 1132 | |||
| 1133 | /* Reset policy of current process to default. | ||
| 1134 | * Assumes fs == KERNEL_DS */ | ||
| 1135 | void numa_default_policy(void) | ||
| 1136 | { | ||
| 1137 | sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); | ||
| 1138 | } | ||
diff --git a/mm/mempool.c b/mm/mempool.c new file mode 100644 index 000000000000..b014ffeaa413 --- /dev/null +++ b/mm/mempool.c | |||
| @@ -0,0 +1,290 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/mempool.c | ||
| 3 | * | ||
| 4 | * memory buffer pool support. Such pools are mostly used | ||
| 5 | * for guaranteed, deadlock-free memory allocations during | ||
| 6 | * extreme VM load. | ||
| 7 | * | ||
| 8 | * started by Ingo Molnar, Copyright (C) 2001 | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/mm.h> | ||
| 12 | #include <linux/slab.h> | ||
| 13 | #include <linux/module.h> | ||
| 14 | #include <linux/mempool.h> | ||
| 15 | #include <linux/blkdev.h> | ||
| 16 | #include <linux/writeback.h> | ||
| 17 | |||
| 18 | static void add_element(mempool_t *pool, void *element) | ||
| 19 | { | ||
| 20 | BUG_ON(pool->curr_nr >= pool->min_nr); | ||
| 21 | pool->elements[pool->curr_nr++] = element; | ||
| 22 | } | ||
| 23 | |||
| 24 | static void *remove_element(mempool_t *pool) | ||
| 25 | { | ||
| 26 | BUG_ON(pool->curr_nr <= 0); | ||
| 27 | return pool->elements[--pool->curr_nr]; | ||
| 28 | } | ||
| 29 | |||
| 30 | static void free_pool(mempool_t *pool) | ||
| 31 | { | ||
| 32 | while (pool->curr_nr) { | ||
| 33 | void *element = remove_element(pool); | ||
| 34 | pool->free(element, pool->pool_data); | ||
| 35 | } | ||
| 36 | kfree(pool->elements); | ||
| 37 | kfree(pool); | ||
| 38 | } | ||
| 39 | |||
| 40 | /** | ||
| 41 | * mempool_create - create a memory pool | ||
| 42 | * @min_nr: the minimum number of elements guaranteed to be | ||
| 43 | * allocated for this pool. | ||
| 44 | * @alloc_fn: user-defined element-allocation function. | ||
| 45 | * @free_fn: user-defined element-freeing function. | ||
| 46 | * @pool_data: optional private data available to the user-defined functions. | ||
| 47 | * | ||
| 48 | * this function creates and allocates a guaranteed size, preallocated | ||
| 49 | * memory pool. The pool can be used from the mempool_alloc and mempool_free | ||
| 50 | * functions. This function might sleep. Both the alloc_fn() and the free_fn() | ||
| 51 | * functions might sleep - as long as the mempool_alloc function is not called | ||
| 52 | * from IRQ contexts. | ||
| 53 | */ | ||
| 54 | mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, | ||
| 55 | mempool_free_t *free_fn, void *pool_data) | ||
| 56 | { | ||
| 57 | mempool_t *pool; | ||
| 58 | |||
| 59 | pool = kmalloc(sizeof(*pool), GFP_KERNEL); | ||
| 60 | if (!pool) | ||
| 61 | return NULL; | ||
| 62 | memset(pool, 0, sizeof(*pool)); | ||
| 63 | pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL); | ||
| 64 | if (!pool->elements) { | ||
| 65 | kfree(pool); | ||
| 66 | return NULL; | ||
| 67 | } | ||
| 68 | spin_lock_init(&pool->lock); | ||
| 69 | pool->min_nr = min_nr; | ||
| 70 | pool->pool_data = pool_data; | ||
| 71 | init_waitqueue_head(&pool->wait); | ||
| 72 | pool->alloc = alloc_fn; | ||
| 73 | pool->free = free_fn; | ||
| 74 | |||
| 75 | /* | ||
| 76 | * First pre-allocate the guaranteed number of buffers. | ||
| 77 | */ | ||
| 78 | while (pool->curr_nr < pool->min_nr) { | ||
| 79 | void *element; | ||
| 80 | |||
| 81 | element = pool->alloc(GFP_KERNEL, pool->pool_data); | ||
| 82 | if (unlikely(!element)) { | ||
| 83 | free_pool(pool); | ||
| 84 | return NULL; | ||
| 85 | } | ||
| 86 | add_element(pool, element); | ||
| 87 | } | ||
| 88 | return pool; | ||
| 89 | } | ||
| 90 | EXPORT_SYMBOL(mempool_create); | ||
| 91 | |||
| 92 | /** | ||
| 93 | * mempool_resize - resize an existing memory pool | ||
| 94 | * @pool: pointer to the memory pool which was allocated via | ||
| 95 | * mempool_create(). | ||
| 96 | * @new_min_nr: the new minimum number of elements guaranteed to be | ||
| 97 | * allocated for this pool. | ||
| 98 | * @gfp_mask: the usual allocation bitmask. | ||
| 99 | * | ||
| 100 | * This function shrinks/grows the pool. In the case of growing, | ||
| 101 | * it cannot be guaranteed that the pool will be grown to the new | ||
| 102 | * size immediately, but new mempool_free() calls will refill it. | ||
| 103 | * | ||
| 104 | * Note, the caller must guarantee that no mempool_destroy is called | ||
| 105 | * while this function is running. mempool_alloc() & mempool_free() | ||
| 106 | * might be called (eg. from IRQ contexts) while this function executes. | ||
| 107 | */ | ||
| 108 | int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask) | ||
| 109 | { | ||
| 110 | void *element; | ||
| 111 | void **new_elements; | ||
| 112 | unsigned long flags; | ||
| 113 | |||
| 114 | BUG_ON(new_min_nr <= 0); | ||
| 115 | |||
| 116 | spin_lock_irqsave(&pool->lock, flags); | ||
| 117 | if (new_min_nr <= pool->min_nr) { | ||
| 118 | while (new_min_nr < pool->curr_nr) { | ||
| 119 | element = remove_element(pool); | ||
| 120 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 121 | pool->free(element, pool->pool_data); | ||
| 122 | spin_lock_irqsave(&pool->lock, flags); | ||
| 123 | } | ||
| 124 | pool->min_nr = new_min_nr; | ||
| 125 | goto out_unlock; | ||
| 126 | } | ||
| 127 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 128 | |||
| 129 | /* Grow the pool */ | ||
| 130 | new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); | ||
| 131 | if (!new_elements) | ||
| 132 | return -ENOMEM; | ||
| 133 | |||
| 134 | spin_lock_irqsave(&pool->lock, flags); | ||
| 135 | if (unlikely(new_min_nr <= pool->min_nr)) { | ||
| 136 | /* Raced, other resize will do our work */ | ||
| 137 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 138 | kfree(new_elements); | ||
| 139 | goto out; | ||
| 140 | } | ||
| 141 | memcpy(new_elements, pool->elements, | ||
| 142 | pool->curr_nr * sizeof(*new_elements)); | ||
| 143 | kfree(pool->elements); | ||
| 144 | pool->elements = new_elements; | ||
| 145 | pool->min_nr = new_min_nr; | ||
| 146 | |||
| 147 | while (pool->curr_nr < pool->min_nr) { | ||
| 148 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 149 | element = pool->alloc(gfp_mask, pool->pool_data); | ||
| 150 | if (!element) | ||
| 151 | goto out; | ||
| 152 | spin_lock_irqsave(&pool->lock, flags); | ||
| 153 | if (pool->curr_nr < pool->min_nr) { | ||
| 154 | add_element(pool, element); | ||
| 155 | } else { | ||
| 156 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 157 | pool->free(element, pool->pool_data); /* Raced */ | ||
| 158 | goto out; | ||
| 159 | } | ||
| 160 | } | ||
| 161 | out_unlock: | ||
| 162 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 163 | out: | ||
| 164 | return 0; | ||
| 165 | } | ||
| 166 | EXPORT_SYMBOL(mempool_resize); | ||
| 167 | |||
| 168 | /** | ||
| 169 | * mempool_destroy - deallocate a memory pool | ||
| 170 | * @pool: pointer to the memory pool which was allocated via | ||
| 171 | * mempool_create(). | ||
| 172 | * | ||
| 173 | * this function only sleeps if the free_fn() function sleeps. The caller | ||
| 174 | * has to guarantee that all elements have been returned to the pool (ie: | ||
| 175 | * freed) prior to calling mempool_destroy(). | ||
| 176 | */ | ||
| 177 | void mempool_destroy(mempool_t *pool) | ||
| 178 | { | ||
| 179 | if (pool->curr_nr != pool->min_nr) | ||
| 180 | BUG(); /* There were outstanding elements */ | ||
| 181 | free_pool(pool); | ||
| 182 | } | ||
| 183 | EXPORT_SYMBOL(mempool_destroy); | ||
| 184 | |||
| 185 | /** | ||
| 186 | * mempool_alloc - allocate an element from a specific memory pool | ||
| 187 | * @pool: pointer to the memory pool which was allocated via | ||
| 188 | * mempool_create(). | ||
| 189 | * @gfp_mask: the usual allocation bitmask. | ||
| 190 | * | ||
| 191 | * this function only sleeps if the alloc_fn function sleeps or | ||
| 192 | * returns NULL. Note that due to preallocation, this function | ||
| 193 | * *never* fails when called from process contexts. (it might | ||
| 194 | * fail if called from an IRQ context.) | ||
| 195 | */ | ||
| 196 | void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask) | ||
| 197 | { | ||
| 198 | void *element; | ||
| 199 | unsigned long flags; | ||
| 200 | DEFINE_WAIT(wait); | ||
| 201 | int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO); | ||
| 202 | |||
| 203 | might_sleep_if(gfp_mask & __GFP_WAIT); | ||
| 204 | repeat_alloc: | ||
| 205 | element = pool->alloc(gfp_nowait|__GFP_NOWARN, pool->pool_data); | ||
| 206 | if (likely(element != NULL)) | ||
| 207 | return element; | ||
| 208 | |||
| 209 | /* | ||
| 210 | * If the pool is less than 50% full and we can perform effective | ||
| 211 | * page reclaim then try harder to allocate an element. | ||
| 212 | */ | ||
| 213 | mb(); | ||
| 214 | if ((gfp_mask & __GFP_FS) && (gfp_mask != gfp_nowait) && | ||
| 215 | (pool->curr_nr <= pool->min_nr/2)) { | ||
| 216 | element = pool->alloc(gfp_mask, pool->pool_data); | ||
| 217 | if (likely(element != NULL)) | ||
| 218 | return element; | ||
| 219 | } | ||
| 220 | |||
| 221 | /* | ||
| 222 | * Kick the VM at this point. | ||
| 223 | */ | ||
| 224 | wakeup_bdflush(0); | ||
| 225 | |||
| 226 | spin_lock_irqsave(&pool->lock, flags); | ||
| 227 | if (likely(pool->curr_nr)) { | ||
| 228 | element = remove_element(pool); | ||
| 229 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 230 | return element; | ||
| 231 | } | ||
| 232 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 233 | |||
| 234 | /* We must not sleep in the GFP_ATOMIC case */ | ||
| 235 | if (!(gfp_mask & __GFP_WAIT)) | ||
| 236 | return NULL; | ||
| 237 | |||
| 238 | prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); | ||
| 239 | mb(); | ||
| 240 | if (!pool->curr_nr) | ||
| 241 | io_schedule(); | ||
| 242 | finish_wait(&pool->wait, &wait); | ||
| 243 | |||
| 244 | goto repeat_alloc; | ||
| 245 | } | ||
| 246 | EXPORT_SYMBOL(mempool_alloc); | ||
| 247 | |||
| 248 | /** | ||
| 249 | * mempool_free - return an element to the pool. | ||
| 250 | * @element: pool element pointer. | ||
| 251 | * @pool: pointer to the memory pool which was allocated via | ||
| 252 | * mempool_create(). | ||
| 253 | * | ||
| 254 | * this function only sleeps if the free_fn() function sleeps. | ||
| 255 | */ | ||
| 256 | void mempool_free(void *element, mempool_t *pool) | ||
| 257 | { | ||
| 258 | unsigned long flags; | ||
| 259 | |||
| 260 | mb(); | ||
| 261 | if (pool->curr_nr < pool->min_nr) { | ||
| 262 | spin_lock_irqsave(&pool->lock, flags); | ||
| 263 | if (pool->curr_nr < pool->min_nr) { | ||
| 264 | add_element(pool, element); | ||
| 265 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 266 | wake_up(&pool->wait); | ||
| 267 | return; | ||
| 268 | } | ||
| 269 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 270 | } | ||
| 271 | pool->free(element, pool->pool_data); | ||
| 272 | } | ||
| 273 | EXPORT_SYMBOL(mempool_free); | ||
| 274 | |||
| 275 | /* | ||
| 276 | * A commonly used alloc and free fn. | ||
| 277 | */ | ||
| 278 | void *mempool_alloc_slab(unsigned int __nocast gfp_mask, void *pool_data) | ||
| 279 | { | ||
| 280 | kmem_cache_t *mem = (kmem_cache_t *) pool_data; | ||
| 281 | return kmem_cache_alloc(mem, gfp_mask); | ||
| 282 | } | ||
| 283 | EXPORT_SYMBOL(mempool_alloc_slab); | ||
| 284 | |||
| 285 | void mempool_free_slab(void *element, void *pool_data) | ||
| 286 | { | ||
| 287 | kmem_cache_t *mem = (kmem_cache_t *) pool_data; | ||
| 288 | kmem_cache_free(mem, element); | ||
| 289 | } | ||
| 290 | EXPORT_SYMBOL(mempool_free_slab); | ||
diff --git a/mm/mincore.c b/mm/mincore.c new file mode 100644 index 000000000000..07833dc5829d --- /dev/null +++ b/mm/mincore.c | |||
| @@ -0,0 +1,191 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/mincore.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1994-1999 Linus Torvalds | ||
| 5 | */ | ||
| 6 | |||
| 7 | /* | ||
| 8 | * The mincore() system call. | ||
| 9 | */ | ||
| 10 | #include <linux/slab.h> | ||
| 11 | #include <linux/pagemap.h> | ||
| 12 | #include <linux/mm.h> | ||
| 13 | #include <linux/mman.h> | ||
| 14 | #include <linux/syscalls.h> | ||
| 15 | |||
| 16 | #include <asm/uaccess.h> | ||
| 17 | #include <asm/pgtable.h> | ||
| 18 | |||
| 19 | /* | ||
| 20 | * Later we can get more picky about what "in core" means precisely. | ||
| 21 | * For now, simply check to see if the page is in the page cache, | ||
| 22 | * and is up to date; i.e. that no page-in operation would be required | ||
| 23 | * at this time if an application were to map and access this page. | ||
| 24 | */ | ||
| 25 | static unsigned char mincore_page(struct vm_area_struct * vma, | ||
| 26 | unsigned long pgoff) | ||
| 27 | { | ||
| 28 | unsigned char present = 0; | ||
| 29 | struct address_space * as = vma->vm_file->f_mapping; | ||
| 30 | struct page * page; | ||
| 31 | |||
| 32 | page = find_get_page(as, pgoff); | ||
| 33 | if (page) { | ||
| 34 | present = PageUptodate(page); | ||
| 35 | page_cache_release(page); | ||
| 36 | } | ||
| 37 | |||
| 38 | return present; | ||
| 39 | } | ||
| 40 | |||
| 41 | static long mincore_vma(struct vm_area_struct * vma, | ||
| 42 | unsigned long start, unsigned long end, unsigned char __user * vec) | ||
| 43 | { | ||
| 44 | long error, i, remaining; | ||
| 45 | unsigned char * tmp; | ||
| 46 | |||
| 47 | error = -ENOMEM; | ||
| 48 | if (!vma->vm_file) | ||
| 49 | return error; | ||
| 50 | |||
| 51 | start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
| 52 | if (end > vma->vm_end) | ||
| 53 | end = vma->vm_end; | ||
| 54 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
| 55 | |||
| 56 | error = -EAGAIN; | ||
| 57 | tmp = (unsigned char *) __get_free_page(GFP_KERNEL); | ||
| 58 | if (!tmp) | ||
| 59 | return error; | ||
| 60 | |||
| 61 | /* (end - start) is # of pages, and also # of bytes in "vec */ | ||
| 62 | remaining = (end - start), | ||
| 63 | |||
| 64 | error = 0; | ||
| 65 | for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) { | ||
| 66 | int j = 0; | ||
| 67 | long thispiece = (remaining < PAGE_SIZE) ? | ||
| 68 | remaining : PAGE_SIZE; | ||
| 69 | |||
| 70 | while (j < thispiece) | ||
| 71 | tmp[j++] = mincore_page(vma, start++); | ||
| 72 | |||
| 73 | if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) { | ||
| 74 | error = -EFAULT; | ||
| 75 | break; | ||
| 76 | } | ||
| 77 | } | ||
| 78 | |||
| 79 | free_page((unsigned long) tmp); | ||
| 80 | return error; | ||
| 81 | } | ||
| 82 | |||
| 83 | /* | ||
| 84 | * The mincore(2) system call. | ||
| 85 | * | ||
| 86 | * mincore() returns the memory residency status of the pages in the | ||
| 87 | * current process's address space specified by [addr, addr + len). | ||
| 88 | * The status is returned in a vector of bytes. The least significant | ||
| 89 | * bit of each byte is 1 if the referenced page is in memory, otherwise | ||
| 90 | * it is zero. | ||
| 91 | * | ||
| 92 | * Because the status of a page can change after mincore() checks it | ||
| 93 | * but before it returns to the application, the returned vector may | ||
| 94 | * contain stale information. Only locked pages are guaranteed to | ||
| 95 | * remain in memory. | ||
| 96 | * | ||
| 97 | * return values: | ||
| 98 | * zero - success | ||
| 99 | * -EFAULT - vec points to an illegal address | ||
| 100 | * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE | ||
| 101 | * -ENOMEM - Addresses in the range [addr, addr + len] are | ||
| 102 | * invalid for the address space of this process, or | ||
| 103 | * specify one or more pages which are not currently | ||
| 104 | * mapped | ||
| 105 | * -EAGAIN - A kernel resource was temporarily unavailable. | ||
| 106 | */ | ||
| 107 | asmlinkage long sys_mincore(unsigned long start, size_t len, | ||
| 108 | unsigned char __user * vec) | ||
| 109 | { | ||
| 110 | int index = 0; | ||
| 111 | unsigned long end, limit; | ||
| 112 | struct vm_area_struct * vma; | ||
| 113 | size_t max; | ||
| 114 | int unmapped_error = 0; | ||
| 115 | long error; | ||
| 116 | |||
| 117 | /* check the arguments */ | ||
| 118 | if (start & ~PAGE_CACHE_MASK) | ||
| 119 | goto einval; | ||
| 120 | |||
| 121 | if (start < FIRST_USER_PGD_NR * PGDIR_SIZE) | ||
| 122 | goto enomem; | ||
| 123 | |||
| 124 | limit = TASK_SIZE; | ||
| 125 | if (start >= limit) | ||
| 126 | goto enomem; | ||
| 127 | |||
| 128 | if (!len) | ||
| 129 | return 0; | ||
| 130 | |||
| 131 | max = limit - start; | ||
| 132 | len = PAGE_CACHE_ALIGN(len); | ||
| 133 | if (len > max || !len) | ||
| 134 | goto enomem; | ||
| 135 | |||
| 136 | end = start + len; | ||
| 137 | |||
| 138 | /* check the output buffer whilst holding the lock */ | ||
| 139 | error = -EFAULT; | ||
| 140 | down_read(¤t->mm->mmap_sem); | ||
| 141 | |||
| 142 | if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT)) | ||
| 143 | goto out; | ||
| 144 | |||
| 145 | /* | ||
| 146 | * If the interval [start,end) covers some unmapped address | ||
| 147 | * ranges, just ignore them, but return -ENOMEM at the end. | ||
| 148 | */ | ||
| 149 | error = 0; | ||
| 150 | |||
| 151 | vma = find_vma(current->mm, start); | ||
| 152 | while (vma) { | ||
| 153 | /* Here start < vma->vm_end. */ | ||
| 154 | if (start < vma->vm_start) { | ||
| 155 | unmapped_error = -ENOMEM; | ||
| 156 | start = vma->vm_start; | ||
| 157 | } | ||
| 158 | |||
| 159 | /* Here vma->vm_start <= start < vma->vm_end. */ | ||
| 160 | if (end <= vma->vm_end) { | ||
| 161 | if (start < end) { | ||
| 162 | error = mincore_vma(vma, start, end, | ||
| 163 | &vec[index]); | ||
| 164 | if (error) | ||
| 165 | goto out; | ||
| 166 | } | ||
| 167 | error = unmapped_error; | ||
| 168 | goto out; | ||
| 169 | } | ||
| 170 | |||
| 171 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | ||
| 172 | error = mincore_vma(vma, start, vma->vm_end, &vec[index]); | ||
| 173 | if (error) | ||
| 174 | goto out; | ||
| 175 | index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT; | ||
| 176 | start = vma->vm_end; | ||
| 177 | vma = vma->vm_next; | ||
| 178 | } | ||
| 179 | |||
| 180 | /* we found a hole in the area queried if we arrive here */ | ||
| 181 | error = -ENOMEM; | ||
| 182 | |||
| 183 | out: | ||
| 184 | up_read(¤t->mm->mmap_sem); | ||
| 185 | return error; | ||
| 186 | |||
| 187 | einval: | ||
| 188 | return -EINVAL; | ||
| 189 | enomem: | ||
| 190 | return -ENOMEM; | ||
| 191 | } | ||
diff --git a/mm/mlock.c b/mm/mlock.c new file mode 100644 index 000000000000..4ae3a46ff768 --- /dev/null +++ b/mm/mlock.c | |||
| @@ -0,0 +1,253 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/mlock.c | ||
| 3 | * | ||
| 4 | * (C) Copyright 1995 Linus Torvalds | ||
| 5 | * (C) Copyright 2002 Christoph Hellwig | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <linux/mman.h> | ||
| 9 | #include <linux/mm.h> | ||
| 10 | #include <linux/mempolicy.h> | ||
| 11 | #include <linux/syscalls.h> | ||
| 12 | |||
| 13 | |||
| 14 | static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | ||
| 15 | unsigned long start, unsigned long end, unsigned int newflags) | ||
| 16 | { | ||
| 17 | struct mm_struct * mm = vma->vm_mm; | ||
| 18 | pgoff_t pgoff; | ||
| 19 | int pages; | ||
| 20 | int ret = 0; | ||
| 21 | |||
| 22 | if (newflags == vma->vm_flags) { | ||
| 23 | *prev = vma; | ||
| 24 | goto out; | ||
| 25 | } | ||
| 26 | |||
| 27 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | ||
| 28 | *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, | ||
| 29 | vma->vm_file, pgoff, vma_policy(vma)); | ||
| 30 | if (*prev) { | ||
| 31 | vma = *prev; | ||
| 32 | goto success; | ||
| 33 | } | ||
| 34 | |||
| 35 | *prev = vma; | ||
| 36 | |||
| 37 | if (start != vma->vm_start) { | ||
| 38 | ret = split_vma(mm, vma, start, 1); | ||
| 39 | if (ret) | ||
| 40 | goto out; | ||
| 41 | } | ||
| 42 | |||
| 43 | if (end != vma->vm_end) { | ||
| 44 | ret = split_vma(mm, vma, end, 0); | ||
| 45 | if (ret) | ||
| 46 | goto out; | ||
| 47 | } | ||
| 48 | |||
| 49 | success: | ||
| 50 | /* | ||
| 51 | * vm_flags is protected by the mmap_sem held in write mode. | ||
| 52 | * It's okay if try_to_unmap_one unmaps a page just after we | ||
| 53 | * set VM_LOCKED, make_pages_present below will bring it back. | ||
| 54 | */ | ||
| 55 | vma->vm_flags = newflags; | ||
| 56 | |||
| 57 | /* | ||
| 58 | * Keep track of amount of locked VM. | ||
| 59 | */ | ||
| 60 | pages = (end - start) >> PAGE_SHIFT; | ||
| 61 | if (newflags & VM_LOCKED) { | ||
| 62 | pages = -pages; | ||
| 63 | if (!(newflags & VM_IO)) | ||
| 64 | ret = make_pages_present(start, end); | ||
| 65 | } | ||
| 66 | |||
| 67 | vma->vm_mm->locked_vm -= pages; | ||
| 68 | out: | ||
| 69 | if (ret == -ENOMEM) | ||
| 70 | ret = -EAGAIN; | ||
| 71 | return ret; | ||
| 72 | } | ||
| 73 | |||
| 74 | static int do_mlock(unsigned long start, size_t len, int on) | ||
| 75 | { | ||
| 76 | unsigned long nstart, end, tmp; | ||
| 77 | struct vm_area_struct * vma, * prev; | ||
| 78 | int error; | ||
| 79 | |||
| 80 | len = PAGE_ALIGN(len); | ||
| 81 | end = start + len; | ||
| 82 | if (end < start) | ||
| 83 | return -EINVAL; | ||
| 84 | if (end == start) | ||
| 85 | return 0; | ||
| 86 | vma = find_vma_prev(current->mm, start, &prev); | ||
| 87 | if (!vma || vma->vm_start > start) | ||
| 88 | return -ENOMEM; | ||
| 89 | |||
| 90 | if (start > vma->vm_start) | ||
| 91 | prev = vma; | ||
| 92 | |||
| 93 | for (nstart = start ; ; ) { | ||
| 94 | unsigned int newflags; | ||
| 95 | |||
| 96 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | ||
| 97 | |||
| 98 | newflags = vma->vm_flags | VM_LOCKED; | ||
| 99 | if (!on) | ||
| 100 | newflags &= ~VM_LOCKED; | ||
| 101 | |||
| 102 | tmp = vma->vm_end; | ||
| 103 | if (tmp > end) | ||
| 104 | tmp = end; | ||
| 105 | error = mlock_fixup(vma, &prev, nstart, tmp, newflags); | ||
| 106 | if (error) | ||
| 107 | break; | ||
| 108 | nstart = tmp; | ||
| 109 | if (nstart < prev->vm_end) | ||
| 110 | nstart = prev->vm_end; | ||
| 111 | if (nstart >= end) | ||
| 112 | break; | ||
| 113 | |||
| 114 | vma = prev->vm_next; | ||
| 115 | if (!vma || vma->vm_start != nstart) { | ||
| 116 | error = -ENOMEM; | ||
| 117 | break; | ||
| 118 | } | ||
| 119 | } | ||
| 120 | return error; | ||
| 121 | } | ||
| 122 | |||
| 123 | asmlinkage long sys_mlock(unsigned long start, size_t len) | ||
| 124 | { | ||
| 125 | unsigned long locked; | ||
| 126 | unsigned long lock_limit; | ||
| 127 | int error = -ENOMEM; | ||
| 128 | |||
| 129 | if (!can_do_mlock()) | ||
| 130 | return -EPERM; | ||
| 131 | |||
| 132 | down_write(¤t->mm->mmap_sem); | ||
| 133 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | ||
| 134 | start &= PAGE_MASK; | ||
| 135 | |||
| 136 | locked = len >> PAGE_SHIFT; | ||
| 137 | locked += current->mm->locked_vm; | ||
| 138 | |||
| 139 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | ||
| 140 | lock_limit >>= PAGE_SHIFT; | ||
| 141 | |||
| 142 | /* check against resource limits */ | ||
| 143 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) | ||
| 144 | error = do_mlock(start, len, 1); | ||
| 145 | up_write(¤t->mm->mmap_sem); | ||
| 146 | return error; | ||
| 147 | } | ||
| 148 | |||
| 149 | asmlinkage long sys_munlock(unsigned long start, size_t len) | ||
| 150 | { | ||
| 151 | int ret; | ||
| 152 | |||
| 153 | down_write(¤t->mm->mmap_sem); | ||
| 154 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | ||
| 155 | start &= PAGE_MASK; | ||
| 156 | ret = do_mlock(start, len, 0); | ||
| 157 | up_write(¤t->mm->mmap_sem); | ||
| 158 | return ret; | ||
| 159 | } | ||
| 160 | |||
| 161 | static int do_mlockall(int flags) | ||
| 162 | { | ||
| 163 | struct vm_area_struct * vma, * prev = NULL; | ||
| 164 | unsigned int def_flags = 0; | ||
| 165 | |||
| 166 | if (flags & MCL_FUTURE) | ||
| 167 | def_flags = VM_LOCKED; | ||
| 168 | current->mm->def_flags = def_flags; | ||
| 169 | if (flags == MCL_FUTURE) | ||
| 170 | goto out; | ||
| 171 | |||
| 172 | for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { | ||
| 173 | unsigned int newflags; | ||
| 174 | |||
| 175 | newflags = vma->vm_flags | VM_LOCKED; | ||
| 176 | if (!(flags & MCL_CURRENT)) | ||
| 177 | newflags &= ~VM_LOCKED; | ||
| 178 | |||
| 179 | /* Ignore errors */ | ||
| 180 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); | ||
| 181 | } | ||
| 182 | out: | ||
| 183 | return 0; | ||
| 184 | } | ||
| 185 | |||
| 186 | asmlinkage long sys_mlockall(int flags) | ||
| 187 | { | ||
| 188 | unsigned long lock_limit; | ||
| 189 | int ret = -EINVAL; | ||
| 190 | |||
| 191 | if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) | ||
| 192 | goto out; | ||
| 193 | |||
| 194 | ret = -EPERM; | ||
| 195 | if (!can_do_mlock()) | ||
| 196 | goto out; | ||
| 197 | |||
| 198 | down_write(¤t->mm->mmap_sem); | ||
| 199 | |||
| 200 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | ||
| 201 | lock_limit >>= PAGE_SHIFT; | ||
| 202 | |||
| 203 | ret = -ENOMEM; | ||
| 204 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || | ||
| 205 | capable(CAP_IPC_LOCK)) | ||
| 206 | ret = do_mlockall(flags); | ||
| 207 | up_write(¤t->mm->mmap_sem); | ||
| 208 | out: | ||
| 209 | return ret; | ||
| 210 | } | ||
| 211 | |||
| 212 | asmlinkage long sys_munlockall(void) | ||
| 213 | { | ||
| 214 | int ret; | ||
| 215 | |||
| 216 | down_write(¤t->mm->mmap_sem); | ||
| 217 | ret = do_mlockall(0); | ||
| 218 | up_write(¤t->mm->mmap_sem); | ||
| 219 | return ret; | ||
| 220 | } | ||
| 221 | |||
| 222 | /* | ||
| 223 | * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB | ||
| 224 | * shm segments) get accounted against the user_struct instead. | ||
| 225 | */ | ||
| 226 | static DEFINE_SPINLOCK(shmlock_user_lock); | ||
| 227 | |||
| 228 | int user_shm_lock(size_t size, struct user_struct *user) | ||
| 229 | { | ||
| 230 | unsigned long lock_limit, locked; | ||
| 231 | int allowed = 0; | ||
| 232 | |||
| 233 | locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 234 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | ||
| 235 | lock_limit >>= PAGE_SHIFT; | ||
| 236 | spin_lock(&shmlock_user_lock); | ||
| 237 | if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) | ||
| 238 | goto out; | ||
| 239 | get_uid(user); | ||
| 240 | user->locked_shm += locked; | ||
| 241 | allowed = 1; | ||
| 242 | out: | ||
| 243 | spin_unlock(&shmlock_user_lock); | ||
| 244 | return allowed; | ||
| 245 | } | ||
| 246 | |||
| 247 | void user_shm_unlock(size_t size, struct user_struct *user) | ||
| 248 | { | ||
| 249 | spin_lock(&shmlock_user_lock); | ||
| 250 | user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 251 | spin_unlock(&shmlock_user_lock); | ||
| 252 | free_uid(user); | ||
| 253 | } | ||
diff --git a/mm/mmap.c b/mm/mmap.c new file mode 100644 index 000000000000..a95ebda27446 --- /dev/null +++ b/mm/mmap.c | |||
| @@ -0,0 +1,2082 @@ | |||
| 1 | /* | ||
| 2 | * mm/mmap.c | ||
| 3 | * | ||
| 4 | * Written by obz. | ||
| 5 | * | ||
| 6 | * Address space accounting code <alan@redhat.com> | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/slab.h> | ||
| 10 | #include <linux/mm.h> | ||
| 11 | #include <linux/shm.h> | ||
| 12 | #include <linux/mman.h> | ||
| 13 | #include <linux/pagemap.h> | ||
| 14 | #include <linux/swap.h> | ||
| 15 | #include <linux/syscalls.h> | ||
| 16 | #include <linux/init.h> | ||
| 17 | #include <linux/file.h> | ||
| 18 | #include <linux/fs.h> | ||
| 19 | #include <linux/personality.h> | ||
| 20 | #include <linux/security.h> | ||
| 21 | #include <linux/hugetlb.h> | ||
| 22 | #include <linux/profile.h> | ||
| 23 | #include <linux/module.h> | ||
| 24 | #include <linux/mount.h> | ||
| 25 | #include <linux/mempolicy.h> | ||
| 26 | #include <linux/rmap.h> | ||
| 27 | |||
| 28 | #include <asm/uaccess.h> | ||
| 29 | #include <asm/cacheflush.h> | ||
| 30 | #include <asm/tlb.h> | ||
| 31 | |||
| 32 | /* | ||
| 33 | * WARNING: the debugging will use recursive algorithms so never enable this | ||
| 34 | * unless you know what you are doing. | ||
| 35 | */ | ||
| 36 | #undef DEBUG_MM_RB | ||
| 37 | |||
| 38 | /* description of effects of mapping type and prot in current implementation. | ||
| 39 | * this is due to the limited x86 page protection hardware. The expected | ||
| 40 | * behavior is in parens: | ||
| 41 | * | ||
| 42 | * map_type prot | ||
| 43 | * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC | ||
| 44 | * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes | ||
| 45 | * w: (no) no w: (no) no w: (yes) yes w: (no) no | ||
| 46 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes | ||
| 47 | * | ||
| 48 | * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes | ||
| 49 | * w: (no) no w: (no) no w: (copy) copy w: (no) no | ||
| 50 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes | ||
| 51 | * | ||
| 52 | */ | ||
| 53 | pgprot_t protection_map[16] = { | ||
| 54 | __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, | ||
| 55 | __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 | ||
| 56 | }; | ||
| 57 | |||
| 58 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | ||
| 59 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | ||
| 60 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | ||
| 61 | atomic_t vm_committed_space = ATOMIC_INIT(0); | ||
| 62 | |||
| 63 | /* | ||
| 64 | * Check that a process has enough memory to allocate a new virtual | ||
| 65 | * mapping. 0 means there is enough memory for the allocation to | ||
| 66 | * succeed and -ENOMEM implies there is not. | ||
| 67 | * | ||
| 68 | * We currently support three overcommit policies, which are set via the | ||
| 69 | * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting | ||
| 70 | * | ||
| 71 | * Strict overcommit modes added 2002 Feb 26 by Alan Cox. | ||
| 72 | * Additional code 2002 Jul 20 by Robert Love. | ||
| 73 | * | ||
| 74 | * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. | ||
| 75 | * | ||
| 76 | * Note this is a helper function intended to be used by LSMs which | ||
| 77 | * wish to use this logic. | ||
| 78 | */ | ||
| 79 | int __vm_enough_memory(long pages, int cap_sys_admin) | ||
| 80 | { | ||
| 81 | unsigned long free, allowed; | ||
| 82 | |||
| 83 | vm_acct_memory(pages); | ||
| 84 | |||
| 85 | /* | ||
| 86 | * Sometimes we want to use more memory than we have | ||
| 87 | */ | ||
| 88 | if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) | ||
| 89 | return 0; | ||
| 90 | |||
| 91 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | ||
| 92 | unsigned long n; | ||
| 93 | |||
| 94 | free = get_page_cache_size(); | ||
| 95 | free += nr_swap_pages; | ||
| 96 | |||
| 97 | /* | ||
| 98 | * Any slabs which are created with the | ||
| 99 | * SLAB_RECLAIM_ACCOUNT flag claim to have contents | ||
| 100 | * which are reclaimable, under pressure. The dentry | ||
| 101 | * cache and most inode caches should fall into this | ||
| 102 | */ | ||
| 103 | free += atomic_read(&slab_reclaim_pages); | ||
| 104 | |||
| 105 | /* | ||
| 106 | * Leave the last 3% for root | ||
| 107 | */ | ||
| 108 | if (!cap_sys_admin) | ||
| 109 | free -= free / 32; | ||
| 110 | |||
| 111 | if (free > pages) | ||
| 112 | return 0; | ||
| 113 | |||
| 114 | /* | ||
| 115 | * nr_free_pages() is very expensive on large systems, | ||
| 116 | * only call if we're about to fail. | ||
| 117 | */ | ||
| 118 | n = nr_free_pages(); | ||
| 119 | if (!cap_sys_admin) | ||
| 120 | n -= n / 32; | ||
| 121 | free += n; | ||
| 122 | |||
| 123 | if (free > pages) | ||
| 124 | return 0; | ||
| 125 | vm_unacct_memory(pages); | ||
| 126 | return -ENOMEM; | ||
| 127 | } | ||
| 128 | |||
| 129 | allowed = (totalram_pages - hugetlb_total_pages()) | ||
| 130 | * sysctl_overcommit_ratio / 100; | ||
| 131 | /* | ||
| 132 | * Leave the last 3% for root | ||
| 133 | */ | ||
| 134 | if (!cap_sys_admin) | ||
| 135 | allowed -= allowed / 32; | ||
| 136 | allowed += total_swap_pages; | ||
| 137 | |||
| 138 | /* Don't let a single process grow too big: | ||
| 139 | leave 3% of the size of this process for other processes */ | ||
| 140 | allowed -= current->mm->total_vm / 32; | ||
| 141 | |||
| 142 | if (atomic_read(&vm_committed_space) < allowed) | ||
| 143 | return 0; | ||
| 144 | |||
| 145 | vm_unacct_memory(pages); | ||
| 146 | |||
| 147 | return -ENOMEM; | ||
| 148 | } | ||
| 149 | |||
| 150 | EXPORT_SYMBOL(sysctl_overcommit_memory); | ||
| 151 | EXPORT_SYMBOL(sysctl_overcommit_ratio); | ||
| 152 | EXPORT_SYMBOL(sysctl_max_map_count); | ||
| 153 | EXPORT_SYMBOL(vm_committed_space); | ||
| 154 | EXPORT_SYMBOL(__vm_enough_memory); | ||
| 155 | |||
| 156 | /* | ||
| 157 | * Requires inode->i_mapping->i_mmap_lock | ||
| 158 | */ | ||
| 159 | static void __remove_shared_vm_struct(struct vm_area_struct *vma, | ||
| 160 | struct file *file, struct address_space *mapping) | ||
| 161 | { | ||
| 162 | if (vma->vm_flags & VM_DENYWRITE) | ||
| 163 | atomic_inc(&file->f_dentry->d_inode->i_writecount); | ||
| 164 | if (vma->vm_flags & VM_SHARED) | ||
| 165 | mapping->i_mmap_writable--; | ||
| 166 | |||
| 167 | flush_dcache_mmap_lock(mapping); | ||
| 168 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | ||
| 169 | list_del_init(&vma->shared.vm_set.list); | ||
| 170 | else | ||
| 171 | vma_prio_tree_remove(vma, &mapping->i_mmap); | ||
| 172 | flush_dcache_mmap_unlock(mapping); | ||
| 173 | } | ||
| 174 | |||
| 175 | /* | ||
| 176 | * Remove one vm structure and free it. | ||
| 177 | */ | ||
| 178 | static void remove_vm_struct(struct vm_area_struct *vma) | ||
| 179 | { | ||
| 180 | struct file *file = vma->vm_file; | ||
| 181 | |||
| 182 | might_sleep(); | ||
| 183 | if (file) { | ||
| 184 | struct address_space *mapping = file->f_mapping; | ||
| 185 | spin_lock(&mapping->i_mmap_lock); | ||
| 186 | __remove_shared_vm_struct(vma, file, mapping); | ||
| 187 | spin_unlock(&mapping->i_mmap_lock); | ||
| 188 | } | ||
| 189 | if (vma->vm_ops && vma->vm_ops->close) | ||
| 190 | vma->vm_ops->close(vma); | ||
| 191 | if (file) | ||
| 192 | fput(file); | ||
| 193 | anon_vma_unlink(vma); | ||
| 194 | mpol_free(vma_policy(vma)); | ||
| 195 | kmem_cache_free(vm_area_cachep, vma); | ||
| 196 | } | ||
| 197 | |||
| 198 | /* | ||
| 199 | * sys_brk() for the most part doesn't need the global kernel | ||
| 200 | * lock, except when an application is doing something nasty | ||
| 201 | * like trying to un-brk an area that has already been mapped | ||
| 202 | * to a regular file. in this case, the unmapping will need | ||
| 203 | * to invoke file system routines that need the global lock. | ||
| 204 | */ | ||
| 205 | asmlinkage unsigned long sys_brk(unsigned long brk) | ||
| 206 | { | ||
| 207 | unsigned long rlim, retval; | ||
| 208 | unsigned long newbrk, oldbrk; | ||
| 209 | struct mm_struct *mm = current->mm; | ||
| 210 | |||
| 211 | down_write(&mm->mmap_sem); | ||
| 212 | |||
| 213 | if (brk < mm->end_code) | ||
| 214 | goto out; | ||
| 215 | newbrk = PAGE_ALIGN(brk); | ||
| 216 | oldbrk = PAGE_ALIGN(mm->brk); | ||
| 217 | if (oldbrk == newbrk) | ||
| 218 | goto set_brk; | ||
| 219 | |||
| 220 | /* Always allow shrinking brk. */ | ||
| 221 | if (brk <= mm->brk) { | ||
| 222 | if (!do_munmap(mm, newbrk, oldbrk-newbrk)) | ||
| 223 | goto set_brk; | ||
| 224 | goto out; | ||
| 225 | } | ||
| 226 | |||
| 227 | /* Check against rlimit.. */ | ||
| 228 | rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; | ||
| 229 | if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) | ||
| 230 | goto out; | ||
| 231 | |||
| 232 | /* Check against existing mmap mappings. */ | ||
| 233 | if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) | ||
| 234 | goto out; | ||
| 235 | |||
| 236 | /* Ok, looks good - let it rip. */ | ||
| 237 | if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) | ||
| 238 | goto out; | ||
| 239 | set_brk: | ||
| 240 | mm->brk = brk; | ||
| 241 | out: | ||
| 242 | retval = mm->brk; | ||
| 243 | up_write(&mm->mmap_sem); | ||
| 244 | return retval; | ||
| 245 | } | ||
| 246 | |||
| 247 | #ifdef DEBUG_MM_RB | ||
| 248 | static int browse_rb(struct rb_root *root) | ||
| 249 | { | ||
| 250 | int i = 0, j; | ||
| 251 | struct rb_node *nd, *pn = NULL; | ||
| 252 | unsigned long prev = 0, pend = 0; | ||
| 253 | |||
| 254 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { | ||
| 255 | struct vm_area_struct *vma; | ||
| 256 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | ||
| 257 | if (vma->vm_start < prev) | ||
| 258 | printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; | ||
| 259 | if (vma->vm_start < pend) | ||
| 260 | printk("vm_start %lx pend %lx\n", vma->vm_start, pend); | ||
| 261 | if (vma->vm_start > vma->vm_end) | ||
| 262 | printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); | ||
| 263 | i++; | ||
| 264 | pn = nd; | ||
| 265 | } | ||
| 266 | j = 0; | ||
| 267 | for (nd = pn; nd; nd = rb_prev(nd)) { | ||
| 268 | j++; | ||
| 269 | } | ||
| 270 | if (i != j) | ||
| 271 | printk("backwards %d, forwards %d\n", j, i), i = 0; | ||
| 272 | return i; | ||
| 273 | } | ||
| 274 | |||
| 275 | void validate_mm(struct mm_struct *mm) | ||
| 276 | { | ||
| 277 | int bug = 0; | ||
| 278 | int i = 0; | ||
| 279 | struct vm_area_struct *tmp = mm->mmap; | ||
| 280 | while (tmp) { | ||
| 281 | tmp = tmp->vm_next; | ||
| 282 | i++; | ||
| 283 | } | ||
| 284 | if (i != mm->map_count) | ||
| 285 | printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; | ||
| 286 | i = browse_rb(&mm->mm_rb); | ||
| 287 | if (i != mm->map_count) | ||
| 288 | printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; | ||
| 289 | if (bug) | ||
| 290 | BUG(); | ||
| 291 | } | ||
| 292 | #else | ||
/* Without DEBUG_MM_RB the consistency check compiles away to nothing. */
#define validate_mm(mm) do { } while (0)
| 294 | #endif | ||
| 295 | |||
| 296 | static struct vm_area_struct * | ||
| 297 | find_vma_prepare(struct mm_struct *mm, unsigned long addr, | ||
| 298 | struct vm_area_struct **pprev, struct rb_node ***rb_link, | ||
| 299 | struct rb_node ** rb_parent) | ||
| 300 | { | ||
| 301 | struct vm_area_struct * vma; | ||
| 302 | struct rb_node ** __rb_link, * __rb_parent, * rb_prev; | ||
| 303 | |||
| 304 | __rb_link = &mm->mm_rb.rb_node; | ||
| 305 | rb_prev = __rb_parent = NULL; | ||
| 306 | vma = NULL; | ||
| 307 | |||
| 308 | while (*__rb_link) { | ||
| 309 | struct vm_area_struct *vma_tmp; | ||
| 310 | |||
| 311 | __rb_parent = *__rb_link; | ||
| 312 | vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); | ||
| 313 | |||
| 314 | if (vma_tmp->vm_end > addr) { | ||
| 315 | vma = vma_tmp; | ||
| 316 | if (vma_tmp->vm_start <= addr) | ||
| 317 | return vma; | ||
| 318 | __rb_link = &__rb_parent->rb_left; | ||
| 319 | } else { | ||
| 320 | rb_prev = __rb_parent; | ||
| 321 | __rb_link = &__rb_parent->rb_right; | ||
| 322 | } | ||
| 323 | } | ||
| 324 | |||
| 325 | *pprev = NULL; | ||
| 326 | if (rb_prev) | ||
| 327 | *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); | ||
| 328 | *rb_link = __rb_link; | ||
| 329 | *rb_parent = __rb_parent; | ||
| 330 | return vma; | ||
| 331 | } | ||
| 332 | |||
| 333 | static inline void | ||
| 334 | __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 335 | struct vm_area_struct *prev, struct rb_node *rb_parent) | ||
| 336 | { | ||
| 337 | if (prev) { | ||
| 338 | vma->vm_next = prev->vm_next; | ||
| 339 | prev->vm_next = vma; | ||
| 340 | } else { | ||
| 341 | mm->mmap = vma; | ||
| 342 | if (rb_parent) | ||
| 343 | vma->vm_next = rb_entry(rb_parent, | ||
| 344 | struct vm_area_struct, vm_rb); | ||
| 345 | else | ||
| 346 | vma->vm_next = NULL; | ||
| 347 | } | ||
| 348 | } | ||
| 349 | |||
/*
 * Insert @vma into @mm's vma rbtree: hook its node into the slot
 * @rb_link under @rb_parent, then let rb_insert_color() fix up the tree.
 * @rb_link and @rb_parent are the values find_vma_prepare() hands back.
 */
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
		struct rb_node **rb_link, struct rb_node *rb_parent)
{
	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
}
| 356 | |||
| 357 | static inline void __vma_link_file(struct vm_area_struct *vma) | ||
| 358 | { | ||
| 359 | struct file * file; | ||
| 360 | |||
| 361 | file = vma->vm_file; | ||
| 362 | if (file) { | ||
| 363 | struct address_space *mapping = file->f_mapping; | ||
| 364 | |||
| 365 | if (vma->vm_flags & VM_DENYWRITE) | ||
| 366 | atomic_dec(&file->f_dentry->d_inode->i_writecount); | ||
| 367 | if (vma->vm_flags & VM_SHARED) | ||
| 368 | mapping->i_mmap_writable++; | ||
| 369 | |||
| 370 | flush_dcache_mmap_lock(mapping); | ||
| 371 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | ||
| 372 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | ||
| 373 | else | ||
| 374 | vma_prio_tree_insert(vma, &mapping->i_mmap); | ||
| 375 | flush_dcache_mmap_unlock(mapping); | ||
| 376 | } | ||
| 377 | } | ||
| 378 | |||
/*
 * Link @vma into the three per-mm/anon structures in one go: the vm_next
 * list, the vma rbtree and the anon_vma.  Does not touch the file's
 * prio tree (see __vma_link_file) and does not update mm->map_count;
 * callers in this file do both themselves.
 */
static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
	struct vm_area_struct *prev, struct rb_node **rb_link,
	struct rb_node *rb_parent)
{
	__vma_link_list(mm, vma, prev, rb_parent);
	__vma_link_rb(mm, vma, rb_link, rb_parent);
	__anon_vma_link(vma);
}
| 388 | |||
| 389 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 390 | struct vm_area_struct *prev, struct rb_node **rb_link, | ||
| 391 | struct rb_node *rb_parent) | ||
| 392 | { | ||
| 393 | struct address_space *mapping = NULL; | ||
| 394 | |||
| 395 | if (vma->vm_file) | ||
| 396 | mapping = vma->vm_file->f_mapping; | ||
| 397 | |||
| 398 | if (mapping) { | ||
| 399 | spin_lock(&mapping->i_mmap_lock); | ||
| 400 | vma->vm_truncate_count = mapping->truncate_count; | ||
| 401 | } | ||
| 402 | anon_vma_lock(vma); | ||
| 403 | |||
| 404 | __vma_link(mm, vma, prev, rb_link, rb_parent); | ||
| 405 | __vma_link_file(vma); | ||
| 406 | |||
| 407 | anon_vma_unlock(vma); | ||
| 408 | if (mapping) | ||
| 409 | spin_unlock(&mapping->i_mmap_lock); | ||
| 410 | |||
| 411 | mm->map_count++; | ||
| 412 | validate_mm(mm); | ||
| 413 | } | ||
| 414 | |||
| 415 | /* | ||
| 416 | * Helper for vma_adjust in the split_vma insert case: | ||
| 417 | * insert vm structure into list and rbtree and anon_vma, | ||
| 418 | * but it has already been inserted into prio_tree earlier. | ||
| 419 | */ | ||
| 420 | static void | ||
| 421 | __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | ||
| 422 | { | ||
| 423 | struct vm_area_struct * __vma, * prev; | ||
| 424 | struct rb_node ** rb_link, * rb_parent; | ||
| 425 | |||
| 426 | __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); | ||
| 427 | if (__vma && __vma->vm_start < vma->vm_end) | ||
| 428 | BUG(); | ||
| 429 | __vma_link(mm, vma, prev, rb_link, rb_parent); | ||
| 430 | mm->map_count++; | ||
| 431 | } | ||
| 432 | |||
/*
 * Remove @vma from @mm's vm_next list and rbtree.
 *
 * @prev is dereferenced unconditionally, so it must be the non-NULL
 * list predecessor of @vma.  If the mm's mmap_cache pointed at @vma it
 * is redirected to @prev so it never refers to the unlinked vma.
 * mm->map_count is not touched here.
 */
static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
		struct vm_area_struct *prev)
{
	prev->vm_next = vma->vm_next;
	rb_erase(&vma->vm_rb, &mm->mm_rb);
	if (mm->mmap_cache == vma)
		mm->mmap_cache = prev;
}
| 442 | |||
/*
 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
 * is already present in an i_mmap tree without adjusting the tree.
 * The following helper function should be used when such adjustments
 * are necessary.  The "insert" vma (if any) is to be inserted
 * before we drop the necessary locks.
 *
 * @vma:    the vma whose bounds are being changed
 * @start:  new vm_start for @vma
 * @end:    new vm_end for @vma (clamped to next->vm_end while a
 *          following vma is being swallowed)
 * @pgoff:  new vm_pgoff for @vma
 * @insert: a vma to link into the mm as part of the same operation
 *          (split_vma case), or NULL
 *
 * When @insert is NULL and @vma has a successor, the new @end decides
 * whether that successor is removed entirely (remove_next), has its
 * start shifted (adjust_next), or is left alone.
 */
void vma_adjust(struct vm_area_struct *vma, unsigned long start,
	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *next = vma->vm_next;
	struct vm_area_struct *importer = NULL;	/* vma gaining pages */
	struct address_space *mapping = NULL;
	struct prio_tree_root *root = NULL;
	struct file *file = vma->vm_file;
	struct anon_vma *anon_vma = NULL;
	long adjust_next = 0;	/* pages to shift next's start by (signed) */
	int remove_next = 0;	/* 0, 1, or 2 when the one after goes too */

	if (next && !insert) {
		if (end >= next->vm_end) {
			/*
			 * vma expands, overlapping all the next, and
			 * perhaps the one after too (mprotect case 6).
			 */
again:			remove_next = 1 + (end > next->vm_end);
			end = next->vm_end;
			anon_vma = next->anon_vma;
			importer = vma;
		} else if (end > next->vm_start) {
			/*
			 * vma expands, overlapping part of the next:
			 * mprotect case 5 shifting the boundary up.
			 */
			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
			anon_vma = next->anon_vma;
			importer = vma;
		} else if (end < vma->vm_end) {
			/*
			 * vma shrinks, and !insert tells it's not
			 * split_vma inserting another: so it must be
			 * mprotect case 4 shifting the boundary down.
			 */
			adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
			anon_vma = next->anon_vma;
			importer = next;
		}
	}

	if (file) {
		mapping = file->f_mapping;
		/* Nonlinear vmas are not in the prio tree: leave root NULL. */
		if (!(vma->vm_flags & VM_NONLINEAR))
			root = &mapping->i_mmap;
		spin_lock(&mapping->i_mmap_lock);
		if (importer &&
		    vma->vm_truncate_count != next->vm_truncate_count) {
			/*
			 * unmap_mapping_range might be in progress:
			 * ensure that the expanding vma is rescanned.
			 */
			importer->vm_truncate_count = 0;
		}
		if (insert) {
			insert->vm_truncate_count = vma->vm_truncate_count;
			/*
			 * Put into prio_tree now, so instantiated pages
			 * are visible to arm/parisc __flush_dcache_page
			 * throughout; but we cannot insert into address
			 * space until vma start or end is updated.
			 */
			__vma_link_file(insert);
		}
	}

	/*
	 * When changing only vma->vm_end, we don't really need
	 * anon_vma lock: but is that case worth optimizing out?
	 */
	if (vma->anon_vma)
		anon_vma = vma->anon_vma;
	if (anon_vma) {
		/* Taken after i_mmap_lock; released before it below. */
		spin_lock(&anon_vma->lock);
		/*
		 * Easily overlooked: when mprotect shifts the boundary,
		 * make sure the expanding vma has anon_vma set if the
		 * shrinking vma had, to cover any anon pages imported.
		 */
		if (importer && !importer->anon_vma) {
			importer->anon_vma = anon_vma;
			__anon_vma_link(importer);
		}
	}

	/*
	 * Take the affected vmas out of the prio tree while their
	 * start/end/pgoff change, and reinsert them afterwards.
	 */
	if (root) {
		flush_dcache_mmap_lock(mapping);
		vma_prio_tree_remove(vma, root);
		if (adjust_next)
			vma_prio_tree_remove(next, root);
	}

	vma->vm_start = start;
	vma->vm_end = end;
	vma->vm_pgoff = pgoff;
	if (adjust_next) {
		next->vm_start += adjust_next << PAGE_SHIFT;
		next->vm_pgoff += adjust_next;
	}

	if (root) {
		if (adjust_next)
			vma_prio_tree_insert(next, root);
		vma_prio_tree_insert(vma, root);
		flush_dcache_mmap_unlock(mapping);
	}

	if (remove_next) {
		/*
		 * vma_merge has merged next into vma, and needs
		 * us to remove next before dropping the locks.
		 */
		__vma_unlink(mm, next, vma);
		if (file)
			__remove_shared_vm_struct(next, file, mapping);
		if (next->anon_vma)
			__anon_vma_merge(vma, next);
	} else if (insert) {
		/*
		 * split_vma has split insert from vma, and needs
		 * us to insert it before dropping the locks
		 * (it may either follow vma or precede it).
		 */
		__insert_vm_struct(mm, insert);
	}

	if (anon_vma)
		spin_unlock(&anon_vma->lock);
	if (mapping)
		spin_unlock(&mapping->i_mmap_lock);

	if (remove_next) {
		/* Locks are dropped: now release what the removed vma held. */
		if (file)
			fput(file);
		mm->map_count--;
		mpol_free(vma_policy(next));
		kmem_cache_free(vm_area_cachep, next);
		/*
		 * In mprotect's case 6 (see comments on vma_merge),
		 * we must remove another next too. It would clutter
		 * up the code too much to do both in one go.
		 */
		if (remove_next == 2) {
			next = vma->vm_next;
			goto again;
		}
	}

	validate_mm(mm);
}
| 602 | |||
/*
 * Flags that rule out merging altogether: vma_merge() returns NULL for
 * any request whose vm_flags contains one of these bits.
 *
 * Separately, is_mergeable_vma() below refuses vmas that have a ->close
 * operation: the driver probably needs to release per-vma resources, so
 * we don't attempt to merge those.
 */
#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)
| 608 | |||
| 609 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | ||
| 610 | struct file *file, unsigned long vm_flags) | ||
| 611 | { | ||
| 612 | if (vma->vm_flags != vm_flags) | ||
| 613 | return 0; | ||
| 614 | if (vma->vm_file != file) | ||
| 615 | return 0; | ||
| 616 | if (vma->vm_ops && vma->vm_ops->close) | ||
| 617 | return 0; | ||
| 618 | return 1; | ||
| 619 | } | ||
| 620 | |||
/*
 * Two anon_vma assignments are compatible when at least one of them is
 * still unassigned (NULL) or when both refer to the same anon_vma.
 * Returns 1 if compatible, 0 otherwise.
 */
static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
					struct anon_vma *anon_vma2)
{
	if (!anon_vma1 || !anon_vma2)
		return 1;

	return anon_vma1 == anon_vma2;
}
| 626 | |||
| 627 | /* | ||
| 628 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) | ||
| 629 | * in front of (at a lower virtual address and file offset than) the vma. | ||
| 630 | * | ||
| 631 | * We cannot merge two vmas if they have differently assigned (non-NULL) | ||
| 632 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. | ||
| 633 | * | ||
| 634 | * We don't check here for the merged mmap wrapping around the end of pagecache | ||
| 635 | * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which | ||
| 636 | * wrap, nor mmaps which cover the final page at index -1UL. | ||
| 637 | */ | ||
| 638 | static int | ||
| 639 | can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, | ||
| 640 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) | ||
| 641 | { | ||
| 642 | if (is_mergeable_vma(vma, file, vm_flags) && | ||
| 643 | is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { | ||
| 644 | if (vma->vm_pgoff == vm_pgoff) | ||
| 645 | return 1; | ||
| 646 | } | ||
| 647 | return 0; | ||
| 648 | } | ||
| 649 | |||
| 650 | /* | ||
| 651 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) | ||
| 652 | * beyond (at a higher virtual address and file offset than) the vma. | ||
| 653 | * | ||
| 654 | * We cannot merge two vmas if they have differently assigned (non-NULL) | ||
| 655 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. | ||
| 656 | */ | ||
| 657 | static int | ||
| 658 | can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, | ||
| 659 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) | ||
| 660 | { | ||
| 661 | if (is_mergeable_vma(vma, file, vm_flags) && | ||
| 662 | is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { | ||
| 663 | pgoff_t vm_pglen; | ||
| 664 | vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; | ||
| 665 | if (vma->vm_pgoff + vm_pglen == vm_pgoff) | ||
| 666 | return 1; | ||
| 667 | } | ||
| 668 | return 0; | ||
| 669 | } | ||
| 670 | |||
/*
 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
 * whether that can be merged with its predecessor or its successor.
 * Or both (it neatly fills a hole).
 *
 * In most cases - when called for mmap, brk or mremap - [addr,end) is
 * certain not to be mapped by the time vma_merge is called; but when
 * called for mprotect, it is certain to be already mapped (either at
 * an offset within prev, or at the start of next), and the flags of
 * this area are about to be changed to vm_flags - and the no-change
 * case has already been eliminated.
 *
 * The following mprotect cases have to be considered, where AAAA is
 * the area passed down from mprotect_fixup, never extending beyond one
 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
 *
 *     AAAA             AAAA                AAAA          AAAA
 *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
 *    cannot merge    might become    might become    might become
 *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
 *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
 *    mremap move:                                    PPPPNNNNNNNN 8
 *        AAAA
 *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
 *    might become    case 1 below    case 2 below    case 3 below
 *
 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
 * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
 *
 * Returns the vma that now covers the request (prev when merged with the
 * predecessor, otherwise the vma that followed prev on entry), or NULL
 * if no merge was possible and the caller must set up a vma itself.
 */
struct vm_area_struct *vma_merge(struct mm_struct *mm,
			struct vm_area_struct *prev, unsigned long addr,
			unsigned long end, unsigned long vm_flags,
			struct anon_vma *anon_vma, struct file *file,
			pgoff_t pgoff, struct mempolicy *policy)
{
	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;	/* request length in pages */
	struct vm_area_struct *area, *next;

	/*
	 * We later require that vma->vm_flags == vm_flags,
	 * so this tests vma->vm_flags & VM_SPECIAL, too.
	 */
	if (vm_flags & VM_SPECIAL)
		return NULL;

	if (prev)
		next = prev->vm_next;
	else
		next = mm->mmap;
	/* area: the vma right after prev; next may step one further on. */
	area = next;
	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
		next = next->vm_next;

	/*
	 * Can it merge with the predecessor?
	 */
	if (prev && prev->vm_end == addr &&
			mpol_equal(vma_policy(prev), policy) &&
			can_vma_merge_after(prev, vm_flags,
						anon_vma, file, pgoff)) {
		/*
		 * OK, it can.  Can we now merge in the successor as well?
		 */
		if (next && end == next->vm_start &&
				mpol_equal(policy, vma_policy(next)) &&
				can_vma_merge_before(next, vm_flags,
					anon_vma, file, pgoff+pglen) &&
				is_mergeable_anon_vma(prev->anon_vma,
						      next->anon_vma)) {
							/* cases 1, 6 */
			vma_adjust(prev, prev->vm_start,
				next->vm_end, prev->vm_pgoff, NULL);
		} else					/* cases 2, 5, 7 */
			vma_adjust(prev, prev->vm_start,
				end, prev->vm_pgoff, NULL);
		return prev;
	}

	/*
	 * Can this new request be merged in front of next?
	 */
	if (next && end == next->vm_start &&
			mpol_equal(policy, vma_policy(next)) &&
			can_vma_merge_before(next, vm_flags,
					anon_vma, file, pgoff+pglen)) {
		if (prev && addr < prev->vm_end)	/* case 4 */
			vma_adjust(prev, prev->vm_start,
				addr, prev->vm_pgoff, NULL);
		else					/* cases 3, 8 */
			vma_adjust(area, addr, next->vm_end,
				next->vm_pgoff - pglen, NULL);
		return area;
	}

	return NULL;
}
| 767 | |||
| 768 | /* | ||
| 769 | * find_mergeable_anon_vma is used by anon_vma_prepare, to check | ||
| 770 | * neighbouring vmas for a suitable anon_vma, before it goes off | ||
| 771 | * to allocate a new anon_vma. It checks because a repetitive | ||
| 772 | * sequence of mprotects and faults may otherwise lead to distinct | ||
| 773 | * anon_vmas being allocated, preventing vma merge in subsequent | ||
| 774 | * mprotect. | ||
| 775 | */ | ||
| 776 | struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) | ||
| 777 | { | ||
| 778 | struct vm_area_struct *near; | ||
| 779 | unsigned long vm_flags; | ||
| 780 | |||
| 781 | near = vma->vm_next; | ||
| 782 | if (!near) | ||
| 783 | goto try_prev; | ||
| 784 | |||
| 785 | /* | ||
| 786 | * Since only mprotect tries to remerge vmas, match flags | ||
| 787 | * which might be mprotected into each other later on. | ||
| 788 | * Neither mlock nor madvise tries to remerge at present, | ||
| 789 | * so leave their flags as obstructing a merge. | ||
| 790 | */ | ||
| 791 | vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); | ||
| 792 | vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); | ||
| 793 | |||
| 794 | if (near->anon_vma && vma->vm_end == near->vm_start && | ||
| 795 | mpol_equal(vma_policy(vma), vma_policy(near)) && | ||
| 796 | can_vma_merge_before(near, vm_flags, | ||
| 797 | NULL, vma->vm_file, vma->vm_pgoff + | ||
| 798 | ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) | ||
| 799 | return near->anon_vma; | ||
| 800 | try_prev: | ||
| 801 | /* | ||
| 802 | * It is potentially slow to have to call find_vma_prev here. | ||
| 803 | * But it's only on the first write fault on the vma, not | ||
| 804 | * every time, and we could devise a way to avoid it later | ||
| 805 | * (e.g. stash info in next's anon_vma_node when assigning | ||
| 806 | * an anon_vma, or when trying vma_merge). Another time. | ||
| 807 | */ | ||
| 808 | if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma) | ||
| 809 | BUG(); | ||
| 810 | if (!near) | ||
| 811 | goto none; | ||
| 812 | |||
| 813 | vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); | ||
| 814 | vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); | ||
| 815 | |||
| 816 | if (near->anon_vma && near->vm_end == vma->vm_start && | ||
| 817 | mpol_equal(vma_policy(near), vma_policy(vma)) && | ||
| 818 | can_vma_merge_after(near, vm_flags, | ||
| 819 | NULL, vma->vm_file, vma->vm_pgoff)) | ||
| 820 | return near->anon_vma; | ||
| 821 | none: | ||
| 822 | /* | ||
| 823 | * There's no absolute need to look only at touching neighbours: | ||
| 824 | * we could search further afield for "compatible" anon_vmas. | ||
| 825 | * But it would probably just be a waste of time searching, | ||
| 826 | * or lead to too many vmas hanging off the same anon_vma. | ||
| 827 | * We're trying to allow mprotect remerging later on, | ||
| 828 | * not trying to minimize memory used for anon_vmas. | ||
| 829 | */ | ||
| 830 | return NULL; | ||
| 831 | } | ||
| 832 | |||
| 833 | #ifdef CONFIG_PROC_FS | ||
| 834 | void __vm_stat_account(struct mm_struct *mm, unsigned long flags, | ||
| 835 | struct file *file, long pages) | ||
| 836 | { | ||
| 837 | const unsigned long stack_flags | ||
| 838 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); | ||
| 839 | |||
| 840 | #ifdef CONFIG_HUGETLB | ||
| 841 | if (flags & VM_HUGETLB) { | ||
| 842 | if (!(flags & VM_DONTCOPY)) | ||
| 843 | mm->shared_vm += pages; | ||
| 844 | return; | ||
| 845 | } | ||
| 846 | #endif /* CONFIG_HUGETLB */ | ||
| 847 | |||
| 848 | if (file) { | ||
| 849 | mm->shared_vm += pages; | ||
| 850 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) | ||
| 851 | mm->exec_vm += pages; | ||
| 852 | } else if (flags & stack_flags) | ||
| 853 | mm->stack_vm += pages; | ||
| 854 | if (flags & (VM_RESERVED|VM_IO)) | ||
| 855 | mm->reserved_vm += pages; | ||
| 856 | } | ||
| 857 | #endif /* CONFIG_PROC_FS */ | ||
| 858 | |||
/*
 * Map @len bytes of @file (or anonymous memory when @file is NULL) into
 * the current process at or near @addr, starting @pgoff pages into the
 * file, with protection @prot and mapping @flags.
 *
 * Returns the address of the new mapping on success.  On failure a
 * negative errno value is returned in the unsigned long; such values are
 * not page aligned, which is how they are told apart from addresses.
 *
 * Anything already mapped in the chosen range is unmapped first.
 *
 * The caller must hold down_write(current->mm->mmap_sem).  Note that for
 * MAP_POPULATE the semaphore is dropped and retaken around the call to
 * sys_remap_file_pages().
 */

unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, unsigned long pgoff)
{
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * vma, * prev;
	struct inode *inode;
	unsigned int vm_flags;
	int correct_wcount = 0;		/* set once deny_write_access() succeeded */
	int error;
	struct rb_node ** rb_link, * rb_parent;
	int accountable = 1;		/* cleared for hugepage files */
	unsigned long charged = 0, reqprot = prot;	/* charged: pages to unaccount on error */

	if (file) {
		if (is_file_hugepages(file))
			accountable = 0;

		if (!file->f_op || !file->f_op->mmap)
			return -ENODEV;

		if ((prot & PROT_EXEC) &&
		    (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
			return -EPERM;
	}
	/*
	 * Does the application expect PROT_READ to imply PROT_EXEC?
	 *
	 * (the exception is when the underlying filesystem is noexec
	 *  mounted, in which case we don't add PROT_EXEC.)
	 */
	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
		if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
			prot |= PROT_EXEC;

	if (!len)
		return -EINVAL;

	/* Careful about overflows.. */
	len = PAGE_ALIGN(len);
	if (!len || len > TASK_SIZE)
		return -ENOMEM;

	/* offset overflow? */
	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
		return -EOVERFLOW;

	/* Too many mappings? */
	if (mm->map_count > sysctl_max_map_count)
		return -ENOMEM;

	/* Obtain the address to map to. we verify (or select) it and ensure
	 * that it represents a valid section of the address space.
	 * A result that is not page aligned is an error value: pass it on.
	 */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);
	if (addr & ~PAGE_MASK)
		return addr;

	/* Do simple checking here so the lower-level routines won't have
	 * to. we assume access permissions have been handled by the open
	 * of the memory object, so we don't do any here.
	 */
	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

	if (flags & MAP_LOCKED) {
		if (!can_do_mlock())
			return -EPERM;
		vm_flags |= VM_LOCKED;
	}
	/* mlock MCL_FUTURE? */
	if (vm_flags & VM_LOCKED) {
		unsigned long locked, lock_limit;
		locked = mm->locked_vm << PAGE_SHIFT;
		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
		locked += len;
		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
			return -EAGAIN;
	}

	inode = file ? file->f_dentry->d_inode : NULL;

	if (file) {
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
				return -EACCES;

			/*
			 * Make sure we don't allow writing to an append-only
			 * file..
			 */
			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
				return -EACCES;

			/*
			 * Make sure there are no mandatory locks on the file.
			 */
			if (locks_verify_locked(inode))
				return -EAGAIN;

			vm_flags |= VM_SHARED | VM_MAYSHARE;
			/* File not open for writing: never writable, not VM_SHARED. */
			if (!(file->f_mode & FMODE_WRITE))
				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);

			/* fall through */
		case MAP_PRIVATE:
			if (!(file->f_mode & FMODE_READ))
				return -EACCES;
			break;

		default:
			return -EINVAL;
		}
	} else {
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			vm_flags |= VM_SHARED | VM_MAYSHARE;
			break;
		case MAP_PRIVATE:
			/*
			 * Set pgoff according to addr for anon_vma.
			 */
			pgoff = addr >> PAGE_SHIFT;
			break;
		default:
			return -EINVAL;
		}
	}

	/* reqprot is the caller's prot, before READ_IMPLIES_EXEC was applied. */
	error = security_file_mmap(file, reqprot, prot, flags);
	if (error)
		return error;

	/* Clear old maps */
	error = -ENOMEM;
munmap_back:
	/*
	 * Repeat until nothing overlaps [addr, addr+len): only then has
	 * find_vma_prepare filled in prev, rb_link and rb_parent.
	 */
	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
	if (vma && vma->vm_start < addr + len) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		goto munmap_back;
	}

	/* Check against address space limit. */
	if ((mm->total_vm << PAGE_SHIFT) + len
	    > current->signal->rlim[RLIMIT_AS].rlim_cur)
		return -ENOMEM;

	if (accountable && (!(flags & MAP_NORESERVE) ||
			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
		if (vm_flags & VM_SHARED) {
			/* Check memory availability in shmem_file_setup? */
			vm_flags |= VM_ACCOUNT;
		} else if (vm_flags & VM_WRITE) {
			/*
			 * Private writable mapping: check memory availability
			 */
			charged = len >> PAGE_SHIFT;
			if (security_vm_enough_memory(charged))
				return -ENOMEM;
			vm_flags |= VM_ACCOUNT;
		}
	}

	/*
	 * Can we just expand an old private anonymous mapping?
	 * The VM_SHARED test is necessary because shmem_zero_setup
	 * will create the file object for a shared anonymous map below.
	 */
	if (!file && !(vm_flags & VM_SHARED) &&
	    vma_merge(mm, prev, addr, addr + len, vm_flags,
					NULL, NULL, pgoff, NULL))
		goto out;

	/*
	 * Determine the object being mapped and call the appropriate
	 * specific mapper. the address has already been validated, but
	 * not unmapped, but the maps are removed from the list.
	 */
	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!vma) {
		error = -ENOMEM;
		goto unacct_error;
	}
	memset(vma, 0, sizeof(*vma));

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags;
	/* The low four vm_flags bits index the protection table. */
	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
	vma->vm_pgoff = pgoff;

	if (file) {
		error = -EINVAL;
		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
			goto free_vma;
		if (vm_flags & VM_DENYWRITE) {
			error = deny_write_access(file);
			if (error)
				goto free_vma;
			correct_wcount = 1;
		}
		vma->vm_file = file;
		get_file(file);
		error = file->f_op->mmap(file, vma);
		if (error)
			goto unmap_and_free_vma;
	} else if (vm_flags & VM_SHARED) {
		error = shmem_zero_setup(vma);
		if (error)
			goto free_vma;
	}

	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
	 * that memory reservation must be checked; but that reservation
	 * belongs to shared memory object, not to vma: so now clear it.
	 */
	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
		vma->vm_flags &= ~VM_ACCOUNT;

	/* Can addr have changed??
	 *
	 * Answer: Yes, several device drivers can do it in their
	 *         f_op->mmap method. -DaveM
	 */
	addr = vma->vm_start;
	pgoff = vma->vm_pgoff;
	vm_flags = vma->vm_flags;

	/*
	 * Link the new vma, unless it is file backed and can be merged into
	 * a neighbour, in which case the freshly set up vma is discarded.
	 * Either way the i_writecount decrement from deny_write_access()
	 * is undone here if it was taken.
	 */
	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
		file = vma->vm_file;
		vma_link(mm, vma, prev, rb_link, rb_parent);
		if (correct_wcount)
			atomic_inc(&inode->i_writecount);
	} else {
		if (file) {
			if (correct_wcount)
				atomic_inc(&inode->i_writecount);
			fput(file);
		}
		mpol_free(vma_policy(vma));
		kmem_cache_free(vm_area_cachep, vma);
	}
out:
	/* Success: update the mm's accounting for the new pages. */
	mm->total_vm += len >> PAGE_SHIFT;
	__vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
	if (vm_flags & VM_LOCKED) {
		mm->locked_vm += len >> PAGE_SHIFT;
		make_pages_present(addr, addr + len);
	}
	if (flags & MAP_POPULATE) {
		/* mmap_sem is released for the duration of this call. */
		up_write(&mm->mmap_sem);
		sys_remap_file_pages(addr, len, 0,
					pgoff, flags & MAP_NONBLOCK);
		down_write(&mm->mmap_sem);
	}
	return addr;

	/*
	 * Error unwinding: each label undoes one more step and falls
	 * through to the ones below it.
	 */
unmap_and_free_vma:
	if (correct_wcount)
		atomic_inc(&inode->i_writecount);
	vma->vm_file = NULL;
	fput(file);

	/* Undo any partial mapping done by a device driver. */
	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
free_vma:
	kmem_cache_free(vm_area_cachep, vma);
unacct_error:
	if (charged)
		vm_unacct_memory(charged);
	return error;
}

EXPORT_SYMBOL(do_mmap_pgoff);
| 1142 | |||
| 1143 | /* Get an address range which is currently unmapped. | ||
| 1144 | * For shmat() with addr=0. | ||
| 1145 | * | ||
| 1146 | * Ugly calling convention alert: | ||
| 1147 | * Return value with the low bits set means error value, | ||
| 1148 | * ie | ||
| 1149 | * if (ret & ~PAGE_MASK) | ||
| 1150 | * error = ret; | ||
| 1151 | * | ||
| 1152 | * This function "knows" that -ENOMEM has the bits set. | ||
| 1153 | */ | ||
| 1154 | #ifndef HAVE_ARCH_UNMAPPED_AREA | ||
| 1155 | unsigned long | ||
| 1156 | arch_get_unmapped_area(struct file *filp, unsigned long addr, | ||
| 1157 | unsigned long len, unsigned long pgoff, unsigned long flags) | ||
| 1158 | { | ||
| 1159 | struct mm_struct *mm = current->mm; | ||
| 1160 | struct vm_area_struct *vma; | ||
| 1161 | unsigned long start_addr; | ||
| 1162 | |||
| 1163 | if (len > TASK_SIZE) | ||
| 1164 | return -ENOMEM; | ||
| 1165 | |||
| 1166 | if (addr) { | ||
| 1167 | addr = PAGE_ALIGN(addr); | ||
| 1168 | vma = find_vma(mm, addr); | ||
| 1169 | if (TASK_SIZE - len >= addr && | ||
| 1170 | (!vma || addr + len <= vma->vm_start)) | ||
| 1171 | return addr; | ||
| 1172 | } | ||
| 1173 | start_addr = addr = mm->free_area_cache; | ||
| 1174 | |||
| 1175 | full_search: | ||
| 1176 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
| 1177 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
| 1178 | if (TASK_SIZE - len < addr) { | ||
| 1179 | /* | ||
| 1180 | * Start a new search - just in case we missed | ||
| 1181 | * some holes. | ||
| 1182 | */ | ||
| 1183 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
| 1184 | start_addr = addr = TASK_UNMAPPED_BASE; | ||
| 1185 | goto full_search; | ||
| 1186 | } | ||
| 1187 | return -ENOMEM; | ||
| 1188 | } | ||
| 1189 | if (!vma || addr + len <= vma->vm_start) { | ||
| 1190 | /* | ||
| 1191 | * Remember the place where we stopped the search: | ||
| 1192 | */ | ||
| 1193 | mm->free_area_cache = addr + len; | ||
| 1194 | return addr; | ||
| 1195 | } | ||
| 1196 | addr = vma->vm_end; | ||
| 1197 | } | ||
| 1198 | } | ||
| 1199 | #endif | ||
| 1200 | |||
| 1201 | void arch_unmap_area(struct vm_area_struct *area) | ||
| 1202 | { | ||
| 1203 | /* | ||
| 1204 | * Is this a new hole at the lowest possible address? | ||
| 1205 | */ | ||
| 1206 | if (area->vm_start >= TASK_UNMAPPED_BASE && | ||
| 1207 | area->vm_start < area->vm_mm->free_area_cache) | ||
| 1208 | area->vm_mm->free_area_cache = area->vm_start; | ||
| 1209 | } | ||
| 1210 | |||
| 1211 | /* | ||
| 1212 | * This mmap-allocator allocates new areas top-down from below the | ||
| 1213 | * stack's low limit (the base): | ||
| 1214 | */ | ||
| 1215 | #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN | ||
| 1216 | unsigned long | ||
| 1217 | arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | ||
| 1218 | const unsigned long len, const unsigned long pgoff, | ||
| 1219 | const unsigned long flags) | ||
| 1220 | { | ||
| 1221 | struct vm_area_struct *vma; | ||
| 1222 | struct mm_struct *mm = current->mm; | ||
| 1223 | unsigned long addr = addr0; | ||
| 1224 | |||
| 1225 | /* requested length too big for entire address space */ | ||
| 1226 | if (len > TASK_SIZE) | ||
| 1227 | return -ENOMEM; | ||
| 1228 | |||
| 1229 | /* requesting a specific address */ | ||
| 1230 | if (addr) { | ||
| 1231 | addr = PAGE_ALIGN(addr); | ||
| 1232 | vma = find_vma(mm, addr); | ||
| 1233 | if (TASK_SIZE - len >= addr && | ||
| 1234 | (!vma || addr + len <= vma->vm_start)) | ||
| 1235 | return addr; | ||
| 1236 | } | ||
| 1237 | |||
| 1238 | /* either no address requested or can't fit in requested address hole */ | ||
| 1239 | addr = mm->free_area_cache; | ||
| 1240 | |||
| 1241 | /* make sure it can fit in the remaining address space */ | ||
| 1242 | if (addr >= len) { | ||
| 1243 | vma = find_vma(mm, addr-len); | ||
| 1244 | if (!vma || addr <= vma->vm_start) | ||
| 1245 | /* remember the address as a hint for next time */ | ||
| 1246 | return (mm->free_area_cache = addr-len); | ||
| 1247 | } | ||
| 1248 | |||
| 1249 | addr = mm->mmap_base-len; | ||
| 1250 | |||
| 1251 | do { | ||
| 1252 | /* | ||
| 1253 | * Lookup failure means no vma is above this address, | ||
| 1254 | * else if new region fits below vma->vm_start, | ||
| 1255 | * return with success: | ||
| 1256 | */ | ||
| 1257 | vma = find_vma(mm, addr); | ||
| 1258 | if (!vma || addr+len <= vma->vm_start) | ||
| 1259 | /* remember the address as a hint for next time */ | ||
| 1260 | return (mm->free_area_cache = addr); | ||
| 1261 | |||
| 1262 | /* try just below the current vma->vm_start */ | ||
| 1263 | addr = vma->vm_start-len; | ||
| 1264 | } while (len <= vma->vm_start); | ||
| 1265 | |||
| 1266 | /* | ||
| 1267 | * A failed mmap() very likely causes application failure, | ||
| 1268 | * so fall back to the bottom-up function here. This scenario | ||
| 1269 | * can happen with large stack limits and large mmap() | ||
| 1270 | * allocations. | ||
| 1271 | */ | ||
| 1272 | mm->free_area_cache = TASK_UNMAPPED_BASE; | ||
| 1273 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); | ||
| 1274 | /* | ||
| 1275 | * Restore the topdown base: | ||
| 1276 | */ | ||
| 1277 | mm->free_area_cache = mm->mmap_base; | ||
| 1278 | |||
| 1279 | return addr; | ||
| 1280 | } | ||
| 1281 | #endif | ||
| 1282 | |||
| 1283 | void arch_unmap_area_topdown(struct vm_area_struct *area) | ||
| 1284 | { | ||
| 1285 | /* | ||
| 1286 | * Is this a new hole at the highest possible address? | ||
| 1287 | */ | ||
| 1288 | if (area->vm_end > area->vm_mm->free_area_cache) | ||
| 1289 | area->vm_mm->free_area_cache = area->vm_end; | ||
| 1290 | |||
| 1291 | /* dont allow allocations above current base */ | ||
| 1292 | if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base) | ||
| 1293 | area->vm_mm->free_area_cache = area->vm_mm->mmap_base; | ||
| 1294 | } | ||
| 1295 | |||
| 1296 | unsigned long | ||
| 1297 | get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | ||
| 1298 | unsigned long pgoff, unsigned long flags) | ||
| 1299 | { | ||
| 1300 | if (flags & MAP_FIXED) { | ||
| 1301 | unsigned long ret; | ||
| 1302 | |||
| 1303 | if (addr > TASK_SIZE - len) | ||
| 1304 | return -ENOMEM; | ||
| 1305 | if (addr & ~PAGE_MASK) | ||
| 1306 | return -EINVAL; | ||
| 1307 | if (file && is_file_hugepages(file)) { | ||
| 1308 | /* | ||
| 1309 | * Check if the given range is hugepage aligned, and | ||
| 1310 | * can be made suitable for hugepages. | ||
| 1311 | */ | ||
| 1312 | ret = prepare_hugepage_range(addr, len); | ||
| 1313 | } else { | ||
| 1314 | /* | ||
| 1315 | * Ensure that a normal request is not falling in a | ||
| 1316 | * reserved hugepage range. For some archs like IA-64, | ||
| 1317 | * there is a separate region for hugepages. | ||
| 1318 | */ | ||
| 1319 | ret = is_hugepage_only_range(current->mm, addr, len); | ||
| 1320 | } | ||
| 1321 | if (ret) | ||
| 1322 | return -EINVAL; | ||
| 1323 | return addr; | ||
| 1324 | } | ||
| 1325 | |||
| 1326 | if (file && file->f_op && file->f_op->get_unmapped_area) | ||
| 1327 | return file->f_op->get_unmapped_area(file, addr, len, | ||
| 1328 | pgoff, flags); | ||
| 1329 | |||
| 1330 | return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); | ||
| 1331 | } | ||
| 1332 | |||
| 1333 | EXPORT_SYMBOL(get_unmapped_area); | ||
| 1334 | |||
| 1335 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ | ||
| 1336 | struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) | ||
| 1337 | { | ||
| 1338 | struct vm_area_struct *vma = NULL; | ||
| 1339 | |||
| 1340 | if (mm) { | ||
| 1341 | /* Check the cache first. */ | ||
| 1342 | /* (Cache hit rate is typically around 35%.) */ | ||
| 1343 | vma = mm->mmap_cache; | ||
| 1344 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | ||
| 1345 | struct rb_node * rb_node; | ||
| 1346 | |||
| 1347 | rb_node = mm->mm_rb.rb_node; | ||
| 1348 | vma = NULL; | ||
| 1349 | |||
| 1350 | while (rb_node) { | ||
| 1351 | struct vm_area_struct * vma_tmp; | ||
| 1352 | |||
| 1353 | vma_tmp = rb_entry(rb_node, | ||
| 1354 | struct vm_area_struct, vm_rb); | ||
| 1355 | |||
| 1356 | if (vma_tmp->vm_end > addr) { | ||
| 1357 | vma = vma_tmp; | ||
| 1358 | if (vma_tmp->vm_start <= addr) | ||
| 1359 | break; | ||
| 1360 | rb_node = rb_node->rb_left; | ||
| 1361 | } else | ||
| 1362 | rb_node = rb_node->rb_right; | ||
| 1363 | } | ||
| 1364 | if (vma) | ||
| 1365 | mm->mmap_cache = vma; | ||
| 1366 | } | ||
| 1367 | } | ||
| 1368 | return vma; | ||
| 1369 | } | ||
| 1370 | |||
| 1371 | EXPORT_SYMBOL(find_vma); | ||
| 1372 | |||
| 1373 | /* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ | ||
| 1374 | struct vm_area_struct * | ||
| 1375 | find_vma_prev(struct mm_struct *mm, unsigned long addr, | ||
| 1376 | struct vm_area_struct **pprev) | ||
| 1377 | { | ||
| 1378 | struct vm_area_struct *vma = NULL, *prev = NULL; | ||
| 1379 | struct rb_node * rb_node; | ||
| 1380 | if (!mm) | ||
| 1381 | goto out; | ||
| 1382 | |||
| 1383 | /* Guard against addr being lower than the first VMA */ | ||
| 1384 | vma = mm->mmap; | ||
| 1385 | |||
| 1386 | /* Go through the RB tree quickly. */ | ||
| 1387 | rb_node = mm->mm_rb.rb_node; | ||
| 1388 | |||
| 1389 | while (rb_node) { | ||
| 1390 | struct vm_area_struct *vma_tmp; | ||
| 1391 | vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); | ||
| 1392 | |||
| 1393 | if (addr < vma_tmp->vm_end) { | ||
| 1394 | rb_node = rb_node->rb_left; | ||
| 1395 | } else { | ||
| 1396 | prev = vma_tmp; | ||
| 1397 | if (!prev->vm_next || (addr < prev->vm_next->vm_end)) | ||
| 1398 | break; | ||
| 1399 | rb_node = rb_node->rb_right; | ||
| 1400 | } | ||
| 1401 | } | ||
| 1402 | |||
| 1403 | out: | ||
| 1404 | *pprev = prev; | ||
| 1405 | return prev ? prev->vm_next : vma; | ||
| 1406 | } | ||
| 1407 | |||
| 1408 | /* | ||
| 1409 | * Verify that the stack growth is acceptable and | ||
| 1410 | * update accounting. This is shared with both the | ||
| 1411 | * grow-up and grow-down cases. | ||
| 1412 | */ | ||
| 1413 | static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) | ||
| 1414 | { | ||
| 1415 | struct mm_struct *mm = vma->vm_mm; | ||
| 1416 | struct rlimit *rlim = current->signal->rlim; | ||
| 1417 | |||
| 1418 | /* address space limit tests */ | ||
| 1419 | if (mm->total_vm + grow > rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT) | ||
| 1420 | return -ENOMEM; | ||
| 1421 | |||
| 1422 | /* Stack limit test */ | ||
| 1423 | if (size > rlim[RLIMIT_STACK].rlim_cur) | ||
| 1424 | return -ENOMEM; | ||
| 1425 | |||
| 1426 | /* mlock limit tests */ | ||
| 1427 | if (vma->vm_flags & VM_LOCKED) { | ||
| 1428 | unsigned long locked; | ||
| 1429 | unsigned long limit; | ||
| 1430 | locked = mm->locked_vm + grow; | ||
| 1431 | limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | ||
| 1432 | if (locked > limit && !capable(CAP_IPC_LOCK)) | ||
| 1433 | return -ENOMEM; | ||
| 1434 | } | ||
| 1435 | |||
| 1436 | /* | ||
| 1437 | * Overcommit.. This must be the final test, as it will | ||
| 1438 | * update security statistics. | ||
| 1439 | */ | ||
| 1440 | if (security_vm_enough_memory(grow)) | ||
| 1441 | return -ENOMEM; | ||
| 1442 | |||
| 1443 | /* Ok, everything looks good - let it rip */ | ||
| 1444 | mm->total_vm += grow; | ||
| 1445 | if (vma->vm_flags & VM_LOCKED) | ||
| 1446 | mm->locked_vm += grow; | ||
| 1447 | __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); | ||
| 1448 | return 0; | ||
| 1449 | } | ||
| 1450 | |||
| 1451 | #ifdef CONFIG_STACK_GROWSUP | ||
| 1452 | /* | ||
| 1453 | * vma is the first one with address > vma->vm_end. Have to extend vma. | ||
| 1454 | */ | ||
| 1455 | int expand_stack(struct vm_area_struct * vma, unsigned long address) | ||
| 1456 | { | ||
| 1457 | int error; | ||
| 1458 | |||
| 1459 | if (!(vma->vm_flags & VM_GROWSUP)) | ||
| 1460 | return -EFAULT; | ||
| 1461 | |||
| 1462 | /* | ||
| 1463 | * We must make sure the anon_vma is allocated | ||
| 1464 | * so that the anon_vma locking is not a noop. | ||
| 1465 | */ | ||
| 1466 | if (unlikely(anon_vma_prepare(vma))) | ||
| 1467 | return -ENOMEM; | ||
| 1468 | anon_vma_lock(vma); | ||
| 1469 | |||
| 1470 | /* | ||
| 1471 | * vma->vm_start/vm_end cannot change under us because the caller | ||
| 1472 | * is required to hold the mmap_sem in read mode. We need the | ||
| 1473 | * anon_vma lock to serialize against concurrent expand_stacks. | ||
| 1474 | */ | ||
| 1475 | address += 4 + PAGE_SIZE - 1; | ||
| 1476 | address &= PAGE_MASK; | ||
| 1477 | error = 0; | ||
| 1478 | |||
| 1479 | /* Somebody else might have raced and expanded it already */ | ||
| 1480 | if (address > vma->vm_end) { | ||
| 1481 | unsigned long size, grow; | ||
| 1482 | |||
| 1483 | size = address - vma->vm_start; | ||
| 1484 | grow = (address - vma->vm_end) >> PAGE_SHIFT; | ||
| 1485 | |||
| 1486 | error = acct_stack_growth(vma, size, grow); | ||
| 1487 | if (!error) | ||
| 1488 | vma->vm_end = address; | ||
| 1489 | } | ||
| 1490 | anon_vma_unlock(vma); | ||
| 1491 | return error; | ||
| 1492 | } | ||
| 1493 | |||
| 1494 | struct vm_area_struct * | ||
| 1495 | find_extend_vma(struct mm_struct *mm, unsigned long addr) | ||
| 1496 | { | ||
| 1497 | struct vm_area_struct *vma, *prev; | ||
| 1498 | |||
| 1499 | addr &= PAGE_MASK; | ||
| 1500 | vma = find_vma_prev(mm, addr, &prev); | ||
| 1501 | if (vma && (vma->vm_start <= addr)) | ||
| 1502 | return vma; | ||
| 1503 | if (!prev || expand_stack(prev, addr)) | ||
| 1504 | return NULL; | ||
| 1505 | if (prev->vm_flags & VM_LOCKED) { | ||
| 1506 | make_pages_present(addr, prev->vm_end); | ||
| 1507 | } | ||
| 1508 | return prev; | ||
| 1509 | } | ||
| 1510 | #else | ||
| 1511 | /* | ||
| 1512 | * vma is the first one with address < vma->vm_start. Have to extend vma. | ||
| 1513 | */ | ||
| 1514 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | ||
| 1515 | { | ||
| 1516 | int error; | ||
| 1517 | |||
| 1518 | /* | ||
| 1519 | * We must make sure the anon_vma is allocated | ||
| 1520 | * so that the anon_vma locking is not a noop. | ||
| 1521 | */ | ||
| 1522 | if (unlikely(anon_vma_prepare(vma))) | ||
| 1523 | return -ENOMEM; | ||
| 1524 | anon_vma_lock(vma); | ||
| 1525 | |||
| 1526 | /* | ||
| 1527 | * vma->vm_start/vm_end cannot change under us because the caller | ||
| 1528 | * is required to hold the mmap_sem in read mode. We need the | ||
| 1529 | * anon_vma lock to serialize against concurrent expand_stacks. | ||
| 1530 | */ | ||
| 1531 | address &= PAGE_MASK; | ||
| 1532 | error = 0; | ||
| 1533 | |||
| 1534 | /* Somebody else might have raced and expanded it already */ | ||
| 1535 | if (address < vma->vm_start) { | ||
| 1536 | unsigned long size, grow; | ||
| 1537 | |||
| 1538 | size = vma->vm_end - address; | ||
| 1539 | grow = (vma->vm_start - address) >> PAGE_SHIFT; | ||
| 1540 | |||
| 1541 | error = acct_stack_growth(vma, size, grow); | ||
| 1542 | if (!error) { | ||
| 1543 | vma->vm_start = address; | ||
| 1544 | vma->vm_pgoff -= grow; | ||
| 1545 | } | ||
| 1546 | } | ||
| 1547 | anon_vma_unlock(vma); | ||
| 1548 | return error; | ||
| 1549 | } | ||
| 1550 | |||
| 1551 | struct vm_area_struct * | ||
| 1552 | find_extend_vma(struct mm_struct * mm, unsigned long addr) | ||
| 1553 | { | ||
| 1554 | struct vm_area_struct * vma; | ||
| 1555 | unsigned long start; | ||
| 1556 | |||
| 1557 | addr &= PAGE_MASK; | ||
| 1558 | vma = find_vma(mm,addr); | ||
| 1559 | if (!vma) | ||
| 1560 | return NULL; | ||
| 1561 | if (vma->vm_start <= addr) | ||
| 1562 | return vma; | ||
| 1563 | if (!(vma->vm_flags & VM_GROWSDOWN)) | ||
| 1564 | return NULL; | ||
| 1565 | start = vma->vm_start; | ||
| 1566 | if (expand_stack(vma, addr)) | ||
| 1567 | return NULL; | ||
| 1568 | if (vma->vm_flags & VM_LOCKED) { | ||
| 1569 | make_pages_present(addr, start); | ||
| 1570 | } | ||
| 1571 | return vma; | ||
| 1572 | } | ||
| 1573 | #endif | ||
| 1574 | |||
| 1575 | /* | ||
| 1576 | * Try to free as many page directory entries as we can, | ||
| 1577 | * without having to work very hard at actually scanning | ||
| 1578 | * the page tables themselves. | ||
| 1579 | * | ||
| 1580 | * Right now we try to free page tables if we have a nice | ||
| 1581 | * PGDIR-aligned area that got free'd up. We could be more | ||
| 1582 | * granular if we want to, but this is fast and simple, | ||
| 1583 | * and covers the bad cases. | ||
| 1584 | * | ||
| 1585 | * "prev", if it exists, points to a vma before the one | ||
| 1586 | * we just free'd - but there's no telling how much before. | ||
| 1587 | */ | ||
| 1588 | static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, | ||
| 1589 | unsigned long start, unsigned long end) | ||
| 1590 | { | ||
| 1591 | unsigned long first = start & PGDIR_MASK; | ||
| 1592 | unsigned long last = end + PGDIR_SIZE - 1; | ||
| 1593 | struct mm_struct *mm = tlb->mm; | ||
| 1594 | |||
| 1595 | if (last > MM_VM_SIZE(mm) || last < end) | ||
| 1596 | last = MM_VM_SIZE(mm); | ||
| 1597 | |||
| 1598 | if (!prev) { | ||
| 1599 | prev = mm->mmap; | ||
| 1600 | if (!prev) | ||
| 1601 | goto no_mmaps; | ||
| 1602 | if (prev->vm_end > start) { | ||
| 1603 | if (last > prev->vm_start) | ||
| 1604 | last = prev->vm_start; | ||
| 1605 | goto no_mmaps; | ||
| 1606 | } | ||
| 1607 | } | ||
| 1608 | for (;;) { | ||
| 1609 | struct vm_area_struct *next = prev->vm_next; | ||
| 1610 | |||
| 1611 | if (next) { | ||
| 1612 | if (next->vm_start < start) { | ||
| 1613 | prev = next; | ||
| 1614 | continue; | ||
| 1615 | } | ||
| 1616 | if (last > next->vm_start) | ||
| 1617 | last = next->vm_start; | ||
| 1618 | } | ||
| 1619 | if (prev->vm_end > first) | ||
| 1620 | first = prev->vm_end; | ||
| 1621 | break; | ||
| 1622 | } | ||
| 1623 | no_mmaps: | ||
| 1624 | if (last < first) /* for arches with discontiguous pgd indices */ | ||
| 1625 | return; | ||
| 1626 | if (first < FIRST_USER_PGD_NR * PGDIR_SIZE) | ||
| 1627 | first = FIRST_USER_PGD_NR * PGDIR_SIZE; | ||
| 1628 | /* No point trying to free anything if we're in the same pte page */ | ||
| 1629 | if ((first & PMD_MASK) < (last & PMD_MASK)) { | ||
| 1630 | clear_page_range(tlb, first, last); | ||
| 1631 | flush_tlb_pgtables(mm, first, last); | ||
| 1632 | } | ||
| 1633 | } | ||
| 1634 | |||
| 1635 | /* Normal function to fix up a mapping | ||
| 1636 | * This function is the default for when an area has no specific | ||
| 1637 | * function. This may be used as part of a more specific routine. | ||
| 1638 | * | ||
| 1639 | * By the time this function is called, the area struct has been | ||
| 1640 | * removed from the process mapping list. | ||
| 1641 | */ | ||
| 1642 | static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) | ||
| 1643 | { | ||
| 1644 | size_t len = area->vm_end - area->vm_start; | ||
| 1645 | |||
| 1646 | area->vm_mm->total_vm -= len >> PAGE_SHIFT; | ||
| 1647 | if (area->vm_flags & VM_LOCKED) | ||
| 1648 | area->vm_mm->locked_vm -= len >> PAGE_SHIFT; | ||
| 1649 | vm_stat_unaccount(area); | ||
| 1650 | area->vm_mm->unmap_area(area); | ||
| 1651 | remove_vm_struct(area); | ||
| 1652 | } | ||
| 1653 | |||
| 1654 | /* | ||
| 1655 | * Update the VMA and inode share lists. | ||
| 1656 | * | ||
| 1657 | * Ok - we have the memory areas we should free on the 'free' list, | ||
| 1658 | * so release them, and do the vma updates. | ||
| 1659 | */ | ||
| 1660 | static void unmap_vma_list(struct mm_struct *mm, | ||
| 1661 | struct vm_area_struct *mpnt) | ||
| 1662 | { | ||
| 1663 | do { | ||
| 1664 | struct vm_area_struct *next = mpnt->vm_next; | ||
| 1665 | unmap_vma(mm, mpnt); | ||
| 1666 | mpnt = next; | ||
| 1667 | } while (mpnt != NULL); | ||
| 1668 | validate_mm(mm); | ||
| 1669 | } | ||
| 1670 | |||
| 1671 | /* | ||
| 1672 | * Get rid of page table information in the indicated region. | ||
| 1673 | * | ||
| 1674 | * Called with the page table lock held. | ||
| 1675 | */ | ||
| 1676 | static void unmap_region(struct mm_struct *mm, | ||
| 1677 | struct vm_area_struct *vma, | ||
| 1678 | struct vm_area_struct *prev, | ||
| 1679 | unsigned long start, | ||
| 1680 | unsigned long end) | ||
| 1681 | { | ||
| 1682 | struct mmu_gather *tlb; | ||
| 1683 | unsigned long nr_accounted = 0; | ||
| 1684 | |||
| 1685 | lru_add_drain(); | ||
| 1686 | tlb = tlb_gather_mmu(mm, 0); | ||
| 1687 | unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); | ||
| 1688 | vm_unacct_memory(nr_accounted); | ||
| 1689 | |||
| 1690 | if (is_hugepage_only_range(mm, start, end - start)) | ||
| 1691 | hugetlb_free_pgtables(tlb, prev, start, end); | ||
| 1692 | else | ||
| 1693 | free_pgtables(tlb, prev, start, end); | ||
| 1694 | tlb_finish_mmu(tlb, start, end); | ||
| 1695 | } | ||
| 1696 | |||
| 1697 | /* | ||
| 1698 | * Create a list of vma's touched by the unmap, removing them from the mm's | ||
| 1699 | * vma list as we go.. | ||
| 1700 | */ | ||
| 1701 | static void | ||
| 1702 | detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 1703 | struct vm_area_struct *prev, unsigned long end) | ||
| 1704 | { | ||
| 1705 | struct vm_area_struct **insertion_point; | ||
| 1706 | struct vm_area_struct *tail_vma = NULL; | ||
| 1707 | |||
| 1708 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); | ||
| 1709 | do { | ||
| 1710 | rb_erase(&vma->vm_rb, &mm->mm_rb); | ||
| 1711 | mm->map_count--; | ||
| 1712 | tail_vma = vma; | ||
| 1713 | vma = vma->vm_next; | ||
| 1714 | } while (vma && vma->vm_start < end); | ||
| 1715 | *insertion_point = vma; | ||
| 1716 | tail_vma->vm_next = NULL; | ||
| 1717 | mm->mmap_cache = NULL; /* Kill the cache. */ | ||
| 1718 | } | ||
| 1719 | |||
| 1720 | /* | ||
| 1721 | * Split a vma into two pieces at address 'addr', a new vma is allocated | ||
| 1722 | * either for the first part or the the tail. | ||
| 1723 | */ | ||
| 1724 | int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | ||
| 1725 | unsigned long addr, int new_below) | ||
| 1726 | { | ||
| 1727 | struct mempolicy *pol; | ||
| 1728 | struct vm_area_struct *new; | ||
| 1729 | |||
| 1730 | if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) | ||
| 1731 | return -EINVAL; | ||
| 1732 | |||
| 1733 | if (mm->map_count >= sysctl_max_map_count) | ||
| 1734 | return -ENOMEM; | ||
| 1735 | |||
| 1736 | new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | ||
| 1737 | if (!new) | ||
| 1738 | return -ENOMEM; | ||
| 1739 | |||
| 1740 | /* most fields are the same, copy all, and then fixup */ | ||
| 1741 | *new = *vma; | ||
| 1742 | |||
| 1743 | if (new_below) | ||
| 1744 | new->vm_end = addr; | ||
| 1745 | else { | ||
| 1746 | new->vm_start = addr; | ||
| 1747 | new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); | ||
| 1748 | } | ||
| 1749 | |||
| 1750 | pol = mpol_copy(vma_policy(vma)); | ||
| 1751 | if (IS_ERR(pol)) { | ||
| 1752 | kmem_cache_free(vm_area_cachep, new); | ||
| 1753 | return PTR_ERR(pol); | ||
| 1754 | } | ||
| 1755 | vma_set_policy(new, pol); | ||
| 1756 | |||
| 1757 | if (new->vm_file) | ||
| 1758 | get_file(new->vm_file); | ||
| 1759 | |||
| 1760 | if (new->vm_ops && new->vm_ops->open) | ||
| 1761 | new->vm_ops->open(new); | ||
| 1762 | |||
| 1763 | if (new_below) | ||
| 1764 | vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + | ||
| 1765 | ((addr - new->vm_start) >> PAGE_SHIFT), new); | ||
| 1766 | else | ||
| 1767 | vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); | ||
| 1768 | |||
| 1769 | return 0; | ||
| 1770 | } | ||
| 1771 | |||
| 1772 | /* Munmap is split into 2 main parts -- this part which finds | ||
| 1773 | * what needs doing, and the areas themselves, which do the | ||
| 1774 | * work. This now handles partial unmappings. | ||
| 1775 | * Jeremy Fitzhardinge <jeremy@goop.org> | ||
| 1776 | */ | ||
| 1777 | int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | ||
| 1778 | { | ||
| 1779 | unsigned long end; | ||
| 1780 | struct vm_area_struct *mpnt, *prev, *last; | ||
| 1781 | |||
| 1782 | if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) | ||
| 1783 | return -EINVAL; | ||
| 1784 | |||
| 1785 | if ((len = PAGE_ALIGN(len)) == 0) | ||
| 1786 | return -EINVAL; | ||
| 1787 | |||
| 1788 | /* Find the first overlapping VMA */ | ||
| 1789 | mpnt = find_vma_prev(mm, start, &prev); | ||
| 1790 | if (!mpnt) | ||
| 1791 | return 0; | ||
| 1792 | /* we have start < mpnt->vm_end */ | ||
| 1793 | |||
| 1794 | /* if it doesn't overlap, we have nothing.. */ | ||
| 1795 | end = start + len; | ||
| 1796 | if (mpnt->vm_start >= end) | ||
| 1797 | return 0; | ||
| 1798 | |||
| 1799 | /* | ||
| 1800 | * If we need to split any vma, do it now to save pain later. | ||
| 1801 | * | ||
| 1802 | * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially | ||
| 1803 | * unmapped vm_area_struct will remain in use: so lower split_vma | ||
| 1804 | * places tmp vma above, and higher split_vma places tmp vma below. | ||
| 1805 | */ | ||
| 1806 | if (start > mpnt->vm_start) { | ||
| 1807 | int error = split_vma(mm, mpnt, start, 0); | ||
| 1808 | if (error) | ||
| 1809 | return error; | ||
| 1810 | prev = mpnt; | ||
| 1811 | } | ||
| 1812 | |||
| 1813 | /* Does it split the last one? */ | ||
| 1814 | last = find_vma(mm, end); | ||
| 1815 | if (last && end > last->vm_start) { | ||
| 1816 | int error = split_vma(mm, last, end, 1); | ||
| 1817 | if (error) | ||
| 1818 | return error; | ||
| 1819 | } | ||
| 1820 | mpnt = prev? prev->vm_next: mm->mmap; | ||
| 1821 | |||
| 1822 | /* | ||
| 1823 | * Remove the vma's, and unmap the actual pages | ||
| 1824 | */ | ||
| 1825 | detach_vmas_to_be_unmapped(mm, mpnt, prev, end); | ||
| 1826 | spin_lock(&mm->page_table_lock); | ||
| 1827 | unmap_region(mm, mpnt, prev, start, end); | ||
| 1828 | spin_unlock(&mm->page_table_lock); | ||
| 1829 | |||
| 1830 | /* Fix up all other VM information */ | ||
| 1831 | unmap_vma_list(mm, mpnt); | ||
| 1832 | |||
| 1833 | return 0; | ||
| 1834 | } | ||
| 1835 | |||
| 1836 | EXPORT_SYMBOL(do_munmap); | ||
| 1837 | |||
| 1838 | asmlinkage long sys_munmap(unsigned long addr, size_t len) | ||
| 1839 | { | ||
| 1840 | int ret; | ||
| 1841 | struct mm_struct *mm = current->mm; | ||
| 1842 | |||
| 1843 | profile_munmap(addr); | ||
| 1844 | |||
| 1845 | down_write(&mm->mmap_sem); | ||
| 1846 | ret = do_munmap(mm, addr, len); | ||
| 1847 | up_write(&mm->mmap_sem); | ||
| 1848 | return ret; | ||
| 1849 | } | ||
| 1850 | |||
/*
 * Debug aid (CONFIG_DEBUG_KERNEL only): if a read trylock on mmap_sem
 * succeeds, emit a warning and drop the lock that was just taken.
 */
static inline void verify_mm_writelocked(struct mm_struct *mm)
{
#ifdef CONFIG_DEBUG_KERNEL
	if (likely(!down_read_trylock(&mm->mmap_sem)))
		return;

	WARN_ON(1);
	up_read(&mm->mmap_sem);
#endif
}
| 1860 | |||
| 1861 | /* | ||
| 1862 | * this is really a simplified "do_mmap". it only handles | ||
| 1863 | * anonymous maps. eventually we may be able to do some | ||
| 1864 | * brk-specific accounting here. | ||
| 1865 | */ | ||
| 1866 | unsigned long do_brk(unsigned long addr, unsigned long len) | ||
| 1867 | { | ||
| 1868 | struct mm_struct * mm = current->mm; | ||
| 1869 | struct vm_area_struct * vma, * prev; | ||
| 1870 | unsigned long flags; | ||
| 1871 | struct rb_node ** rb_link, * rb_parent; | ||
| 1872 | pgoff_t pgoff = addr >> PAGE_SHIFT; | ||
| 1873 | |||
| 1874 | len = PAGE_ALIGN(len); | ||
| 1875 | if (!len) | ||
| 1876 | return addr; | ||
| 1877 | |||
| 1878 | if ((addr + len) > TASK_SIZE || (addr + len) < addr) | ||
| 1879 | return -EINVAL; | ||
| 1880 | |||
| 1881 | /* | ||
| 1882 | * mlock MCL_FUTURE? | ||
| 1883 | */ | ||
| 1884 | if (mm->def_flags & VM_LOCKED) { | ||
| 1885 | unsigned long locked, lock_limit; | ||
| 1886 | locked = mm->locked_vm << PAGE_SHIFT; | ||
| 1887 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | ||
| 1888 | locked += len; | ||
| 1889 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
| 1890 | return -EAGAIN; | ||
| 1891 | } | ||
| 1892 | |||
| 1893 | /* | ||
| 1894 | * mm->mmap_sem is required to protect against another thread | ||
| 1895 | * changing the mappings in case we sleep. | ||
| 1896 | */ | ||
| 1897 | verify_mm_writelocked(mm); | ||
| 1898 | |||
| 1899 | /* | ||
| 1900 | * Clear old maps. this also does some error checking for us | ||
| 1901 | */ | ||
| 1902 | munmap_back: | ||
| 1903 | vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | ||
| 1904 | if (vma && vma->vm_start < addr + len) { | ||
| 1905 | if (do_munmap(mm, addr, len)) | ||
| 1906 | return -ENOMEM; | ||
| 1907 | goto munmap_back; | ||
| 1908 | } | ||
| 1909 | |||
| 1910 | /* Check against address space limits *after* clearing old maps... */ | ||
| 1911 | if ((mm->total_vm << PAGE_SHIFT) + len | ||
| 1912 | > current->signal->rlim[RLIMIT_AS].rlim_cur) | ||
| 1913 | return -ENOMEM; | ||
| 1914 | |||
| 1915 | if (mm->map_count > sysctl_max_map_count) | ||
| 1916 | return -ENOMEM; | ||
| 1917 | |||
| 1918 | if (security_vm_enough_memory(len >> PAGE_SHIFT)) | ||
| 1919 | return -ENOMEM; | ||
| 1920 | |||
| 1921 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | ||
| 1922 | |||
| 1923 | /* Can we just expand an old private anonymous mapping? */ | ||
| 1924 | if (vma_merge(mm, prev, addr, addr + len, flags, | ||
| 1925 | NULL, NULL, pgoff, NULL)) | ||
| 1926 | goto out; | ||
| 1927 | |||
| 1928 | /* | ||
| 1929 | * create a vma struct for an anonymous mapping | ||
| 1930 | */ | ||
| 1931 | vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | ||
| 1932 | if (!vma) { | ||
| 1933 | vm_unacct_memory(len >> PAGE_SHIFT); | ||
| 1934 | return -ENOMEM; | ||
| 1935 | } | ||
| 1936 | memset(vma, 0, sizeof(*vma)); | ||
| 1937 | |||
| 1938 | vma->vm_mm = mm; | ||
| 1939 | vma->vm_start = addr; | ||
| 1940 | vma->vm_end = addr + len; | ||
| 1941 | vma->vm_pgoff = pgoff; | ||
| 1942 | vma->vm_flags = flags; | ||
| 1943 | vma->vm_page_prot = protection_map[flags & 0x0f]; | ||
| 1944 | vma_link(mm, vma, prev, rb_link, rb_parent); | ||
| 1945 | out: | ||
| 1946 | mm->total_vm += len >> PAGE_SHIFT; | ||
| 1947 | if (flags & VM_LOCKED) { | ||
| 1948 | mm->locked_vm += len >> PAGE_SHIFT; | ||
| 1949 | make_pages_present(addr, addr + len); | ||
| 1950 | } | ||
| 1951 | return addr; | ||
| 1952 | } | ||
| 1953 | |||
| 1954 | EXPORT_SYMBOL(do_brk); | ||
| 1955 | |||
| 1956 | /* Release all mmaps. */ | ||
| 1957 | void exit_mmap(struct mm_struct *mm) | ||
| 1958 | { | ||
| 1959 | struct mmu_gather *tlb; | ||
| 1960 | struct vm_area_struct *vma; | ||
| 1961 | unsigned long nr_accounted = 0; | ||
| 1962 | |||
| 1963 | lru_add_drain(); | ||
| 1964 | |||
| 1965 | spin_lock(&mm->page_table_lock); | ||
| 1966 | |||
| 1967 | tlb = tlb_gather_mmu(mm, 1); | ||
| 1968 | flush_cache_mm(mm); | ||
| 1969 | /* Use ~0UL here to ensure all VMAs in the mm are unmapped */ | ||
| 1970 | mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0, | ||
| 1971 | ~0UL, &nr_accounted, NULL); | ||
| 1972 | vm_unacct_memory(nr_accounted); | ||
| 1973 | BUG_ON(mm->map_count); /* This is just debugging */ | ||
| 1974 | clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm)); | ||
| 1975 | |||
| 1976 | tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm)); | ||
| 1977 | |||
| 1978 | vma = mm->mmap; | ||
| 1979 | mm->mmap = mm->mmap_cache = NULL; | ||
| 1980 | mm->mm_rb = RB_ROOT; | ||
| 1981 | set_mm_counter(mm, rss, 0); | ||
| 1982 | mm->total_vm = 0; | ||
| 1983 | mm->locked_vm = 0; | ||
| 1984 | |||
| 1985 | spin_unlock(&mm->page_table_lock); | ||
| 1986 | |||
| 1987 | /* | ||
| 1988 | * Walk the list again, actually closing and freeing it | ||
| 1989 | * without holding any MM locks. | ||
| 1990 | */ | ||
| 1991 | while (vma) { | ||
| 1992 | struct vm_area_struct *next = vma->vm_next; | ||
| 1993 | remove_vm_struct(vma); | ||
| 1994 | vma = next; | ||
| 1995 | } | ||
| 1996 | } | ||
| 1997 | |||
| 1998 | /* Insert vm structure into process list sorted by address | ||
| 1999 | * and into the inode's i_mmap tree. If vm_file is non-NULL | ||
| 2000 | * then i_mmap_lock is taken here. | ||
| 2001 | */ | ||
| 2002 | int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | ||
| 2003 | { | ||
| 2004 | struct vm_area_struct * __vma, * prev; | ||
| 2005 | struct rb_node ** rb_link, * rb_parent; | ||
| 2006 | |||
| 2007 | /* | ||
| 2008 | * The vm_pgoff of a purely anonymous vma should be irrelevant | ||
| 2009 | * until its first write fault, when page's anon_vma and index | ||
| 2010 | * are set. But now set the vm_pgoff it will almost certainly | ||
| 2011 | * end up with (unless mremap moves it elsewhere before that | ||
| 2012 | * first wfault), so /proc/pid/maps tells a consistent story. | ||
| 2013 | * | ||
| 2014 | * By setting it to reflect the virtual start address of the | ||
| 2015 | * vma, merges and splits can happen in a seamless way, just | ||
| 2016 | * using the existing file pgoff checks and manipulations. | ||
| 2017 | * Similarly in do_mmap_pgoff and in do_brk. | ||
| 2018 | */ | ||
| 2019 | if (!vma->vm_file) { | ||
| 2020 | BUG_ON(vma->anon_vma); | ||
| 2021 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; | ||
| 2022 | } | ||
| 2023 | __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); | ||
| 2024 | if (__vma && __vma->vm_start < vma->vm_end) | ||
| 2025 | return -ENOMEM; | ||
| 2026 | vma_link(mm, vma, prev, rb_link, rb_parent); | ||
| 2027 | return 0; | ||
| 2028 | } | ||
| 2029 | |||
| 2030 | /* | ||
| 2031 | * Copy the vma structure to a new location in the same mm, | ||
| 2032 | * prior to moving page table entries, to effect an mremap move. | ||
| 2033 | */ | ||
| 2034 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | ||
| 2035 | unsigned long addr, unsigned long len, pgoff_t pgoff) | ||
| 2036 | { | ||
| 2037 | struct vm_area_struct *vma = *vmap; | ||
| 2038 | unsigned long vma_start = vma->vm_start; | ||
| 2039 | struct mm_struct *mm = vma->vm_mm; | ||
| 2040 | struct vm_area_struct *new_vma, *prev; | ||
| 2041 | struct rb_node **rb_link, *rb_parent; | ||
| 2042 | struct mempolicy *pol; | ||
| 2043 | |||
| 2044 | /* | ||
| 2045 | * If anonymous vma has not yet been faulted, update new pgoff | ||
| 2046 | * to match new location, to increase its chance of merging. | ||
| 2047 | */ | ||
| 2048 | if (!vma->vm_file && !vma->anon_vma) | ||
| 2049 | pgoff = addr >> PAGE_SHIFT; | ||
| 2050 | |||
| 2051 | find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | ||
| 2052 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, | ||
| 2053 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); | ||
| 2054 | if (new_vma) { | ||
| 2055 | /* | ||
| 2056 | * Source vma may have been merged into new_vma | ||
| 2057 | */ | ||
| 2058 | if (vma_start >= new_vma->vm_start && | ||
| 2059 | vma_start < new_vma->vm_end) | ||
| 2060 | *vmap = new_vma; | ||
| 2061 | } else { | ||
| 2062 | new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | ||
| 2063 | if (new_vma) { | ||
| 2064 | *new_vma = *vma; | ||
| 2065 | pol = mpol_copy(vma_policy(vma)); | ||
| 2066 | if (IS_ERR(pol)) { | ||
| 2067 | kmem_cache_free(vm_area_cachep, new_vma); | ||
| 2068 | return NULL; | ||
| 2069 | } | ||
| 2070 | vma_set_policy(new_vma, pol); | ||
| 2071 | new_vma->vm_start = addr; | ||
| 2072 | new_vma->vm_end = addr + len; | ||
| 2073 | new_vma->vm_pgoff = pgoff; | ||
| 2074 | if (new_vma->vm_file) | ||
| 2075 | get_file(new_vma->vm_file); | ||
| 2076 | if (new_vma->vm_ops && new_vma->vm_ops->open) | ||
| 2077 | new_vma->vm_ops->open(new_vma); | ||
| 2078 | vma_link(mm, new_vma, prev, rb_link, rb_parent); | ||
| 2079 | } | ||
| 2080 | } | ||
| 2081 | return new_vma; | ||
| 2082 | } | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c new file mode 100644 index 000000000000..e9fbd013ad9a --- /dev/null +++ b/mm/mprotect.c | |||
| @@ -0,0 +1,282 @@ | |||
| 1 | /* | ||
| 2 | * mm/mprotect.c | ||
| 3 | * | ||
| 4 | * (C) Copyright 1994 Linus Torvalds | ||
| 5 | * (C) Copyright 2002 Christoph Hellwig | ||
| 6 | * | ||
| 7 | * Address space accounting code <alan@redhat.com> | ||
| 8 | * (C) Copyright 2002 Red Hat Inc, All Rights Reserved | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/mm.h> | ||
| 12 | #include <linux/hugetlb.h> | ||
| 13 | #include <linux/slab.h> | ||
| 14 | #include <linux/shm.h> | ||
| 15 | #include <linux/mman.h> | ||
| 16 | #include <linux/fs.h> | ||
| 17 | #include <linux/highmem.h> | ||
| 18 | #include <linux/security.h> | ||
| 19 | #include <linux/mempolicy.h> | ||
| 20 | #include <linux/personality.h> | ||
| 21 | #include <linux/syscalls.h> | ||
| 22 | |||
| 23 | #include <asm/uaccess.h> | ||
| 24 | #include <asm/pgtable.h> | ||
| 25 | #include <asm/cacheflush.h> | ||
| 26 | #include <asm/tlbflush.h> | ||
| 27 | |||
| 28 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | ||
| 29 | unsigned long addr, unsigned long end, pgprot_t newprot) | ||
| 30 | { | ||
| 31 | pte_t *pte; | ||
| 32 | |||
| 33 | pte = pte_offset_map(pmd, addr); | ||
| 34 | do { | ||
| 35 | if (pte_present(*pte)) { | ||
| 36 | pte_t ptent; | ||
| 37 | |||
| 38 | /* Avoid an SMP race with hardware updated dirty/clean | ||
| 39 | * bits by wiping the pte and then setting the new pte | ||
| 40 | * into place. | ||
| 41 | */ | ||
| 42 | ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); | ||
| 43 | set_pte_at(mm, addr, pte, ptent); | ||
| 44 | lazy_mmu_prot_update(ptent); | ||
| 45 | } | ||
| 46 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
| 47 | pte_unmap(pte - 1); | ||
| 48 | } | ||
| 49 | |||
| 50 | static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | ||
| 51 | unsigned long addr, unsigned long end, pgprot_t newprot) | ||
| 52 | { | ||
| 53 | pmd_t *pmd; | ||
| 54 | unsigned long next; | ||
| 55 | |||
| 56 | pmd = pmd_offset(pud, addr); | ||
| 57 | do { | ||
| 58 | next = pmd_addr_end(addr, end); | ||
| 59 | if (pmd_none_or_clear_bad(pmd)) | ||
| 60 | continue; | ||
| 61 | change_pte_range(mm, pmd, addr, next, newprot); | ||
| 62 | } while (pmd++, addr = next, addr != end); | ||
| 63 | } | ||
| 64 | |||
| 65 | static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, | ||
| 66 | unsigned long addr, unsigned long end, pgprot_t newprot) | ||
| 67 | { | ||
| 68 | pud_t *pud; | ||
| 69 | unsigned long next; | ||
| 70 | |||
| 71 | pud = pud_offset(pgd, addr); | ||
| 72 | do { | ||
| 73 | next = pud_addr_end(addr, end); | ||
| 74 | if (pud_none_or_clear_bad(pud)) | ||
| 75 | continue; | ||
| 76 | change_pmd_range(mm, pud, addr, next, newprot); | ||
| 77 | } while (pud++, addr = next, addr != end); | ||
| 78 | } | ||
| 79 | |||
| 80 | static void change_protection(struct vm_area_struct *vma, | ||
| 81 | unsigned long addr, unsigned long end, pgprot_t newprot) | ||
| 82 | { | ||
| 83 | struct mm_struct *mm = vma->vm_mm; | ||
| 84 | pgd_t *pgd; | ||
| 85 | unsigned long next; | ||
| 86 | unsigned long start = addr; | ||
| 87 | |||
| 88 | BUG_ON(addr >= end); | ||
| 89 | pgd = pgd_offset(mm, addr); | ||
| 90 | flush_cache_range(vma, addr, end); | ||
| 91 | spin_lock(&mm->page_table_lock); | ||
| 92 | do { | ||
| 93 | next = pgd_addr_end(addr, end); | ||
| 94 | if (pgd_none_or_clear_bad(pgd)) | ||
| 95 | continue; | ||
| 96 | change_pud_range(mm, pgd, addr, next, newprot); | ||
| 97 | } while (pgd++, addr = next, addr != end); | ||
| 98 | flush_tlb_range(vma, start, end); | ||
| 99 | spin_unlock(&mm->page_table_lock); | ||
| 100 | } | ||
| 101 | |||
| 102 | static int | ||
| 103 | mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | ||
| 104 | unsigned long start, unsigned long end, unsigned long newflags) | ||
| 105 | { | ||
| 106 | struct mm_struct *mm = vma->vm_mm; | ||
| 107 | unsigned long oldflags = vma->vm_flags; | ||
| 108 | long nrpages = (end - start) >> PAGE_SHIFT; | ||
| 109 | unsigned long charged = 0; | ||
| 110 | pgprot_t newprot; | ||
| 111 | pgoff_t pgoff; | ||
| 112 | int error; | ||
| 113 | |||
| 114 | if (newflags == oldflags) { | ||
| 115 | *pprev = vma; | ||
| 116 | return 0; | ||
| 117 | } | ||
| 118 | |||
| 119 | /* | ||
| 120 | * If we make a private mapping writable we increase our commit; | ||
| 121 | * but (without finer accounting) cannot reduce our commit if we | ||
| 122 | * make it unwritable again. | ||
| 123 | * | ||
| 124 | * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting | ||
| 125 | * a MAP_NORESERVE private mapping to writable will now reserve. | ||
| 126 | */ | ||
| 127 | if (newflags & VM_WRITE) { | ||
| 128 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { | ||
| 129 | charged = nrpages; | ||
| 130 | if (security_vm_enough_memory(charged)) | ||
| 131 | return -ENOMEM; | ||
| 132 | newflags |= VM_ACCOUNT; | ||
| 133 | } | ||
| 134 | } | ||
| 135 | |||
| 136 | newprot = protection_map[newflags & 0xf]; | ||
| 137 | |||
| 138 | /* | ||
| 139 | * First try to merge with previous and/or next vma. | ||
| 140 | */ | ||
| 141 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | ||
| 142 | *pprev = vma_merge(mm, *pprev, start, end, newflags, | ||
| 143 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); | ||
| 144 | if (*pprev) { | ||
| 145 | vma = *pprev; | ||
| 146 | goto success; | ||
| 147 | } | ||
| 148 | |||
| 149 | *pprev = vma; | ||
| 150 | |||
| 151 | if (start != vma->vm_start) { | ||
| 152 | error = split_vma(mm, vma, start, 1); | ||
| 153 | if (error) | ||
| 154 | goto fail; | ||
| 155 | } | ||
| 156 | |||
| 157 | if (end != vma->vm_end) { | ||
| 158 | error = split_vma(mm, vma, end, 0); | ||
| 159 | if (error) | ||
| 160 | goto fail; | ||
| 161 | } | ||
| 162 | |||
| 163 | success: | ||
| 164 | /* | ||
| 165 | * vm_flags and vm_page_prot are protected by the mmap_sem | ||
| 166 | * held in write mode. | ||
| 167 | */ | ||
| 168 | vma->vm_flags = newflags; | ||
| 169 | vma->vm_page_prot = newprot; | ||
| 170 | change_protection(vma, start, end, newprot); | ||
| 171 | __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | ||
| 172 | __vm_stat_account(mm, newflags, vma->vm_file, nrpages); | ||
| 173 | return 0; | ||
| 174 | |||
| 175 | fail: | ||
| 176 | vm_unacct_memory(charged); | ||
| 177 | return error; | ||
| 178 | } | ||
| 179 | |||
| 180 | asmlinkage long | ||
| 181 | sys_mprotect(unsigned long start, size_t len, unsigned long prot) | ||
| 182 | { | ||
| 183 | unsigned long vm_flags, nstart, end, tmp, reqprot; | ||
| 184 | struct vm_area_struct *vma, *prev; | ||
| 185 | int error = -EINVAL; | ||
| 186 | const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); | ||
| 187 | prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); | ||
| 188 | if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ | ||
| 189 | return -EINVAL; | ||
| 190 | |||
| 191 | if (start & ~PAGE_MASK) | ||
| 192 | return -EINVAL; | ||
| 193 | if (!len) | ||
| 194 | return 0; | ||
| 195 | len = PAGE_ALIGN(len); | ||
| 196 | end = start + len; | ||
| 197 | if (end <= start) | ||
| 198 | return -ENOMEM; | ||
| 199 | if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) | ||
| 200 | return -EINVAL; | ||
| 201 | |||
| 202 | reqprot = prot; | ||
| 203 | /* | ||
| 204 | * Does the application expect PROT_READ to imply PROT_EXEC: | ||
| 205 | */ | ||
| 206 | if (unlikely((prot & PROT_READ) && | ||
| 207 | (current->personality & READ_IMPLIES_EXEC))) | ||
| 208 | prot |= PROT_EXEC; | ||
| 209 | |||
| 210 | vm_flags = calc_vm_prot_bits(prot); | ||
| 211 | |||
| 212 | down_write(¤t->mm->mmap_sem); | ||
| 213 | |||
| 214 | vma = find_vma_prev(current->mm, start, &prev); | ||
| 215 | error = -ENOMEM; | ||
| 216 | if (!vma) | ||
| 217 | goto out; | ||
| 218 | if (unlikely(grows & PROT_GROWSDOWN)) { | ||
| 219 | if (vma->vm_start >= end) | ||
| 220 | goto out; | ||
| 221 | start = vma->vm_start; | ||
| 222 | error = -EINVAL; | ||
| 223 | if (!(vma->vm_flags & VM_GROWSDOWN)) | ||
| 224 | goto out; | ||
| 225 | } | ||
| 226 | else { | ||
| 227 | if (vma->vm_start > start) | ||
| 228 | goto out; | ||
| 229 | if (unlikely(grows & PROT_GROWSUP)) { | ||
| 230 | end = vma->vm_end; | ||
| 231 | error = -EINVAL; | ||
| 232 | if (!(vma->vm_flags & VM_GROWSUP)) | ||
| 233 | goto out; | ||
| 234 | } | ||
| 235 | } | ||
| 236 | if (start > vma->vm_start) | ||
| 237 | prev = vma; | ||
| 238 | |||
| 239 | for (nstart = start ; ; ) { | ||
| 240 | unsigned long newflags; | ||
| 241 | |||
| 242 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | ||
| 243 | |||
| 244 | if (is_vm_hugetlb_page(vma)) { | ||
| 245 | error = -EACCES; | ||
| 246 | goto out; | ||
| 247 | } | ||
| 248 | |||
| 249 | newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); | ||
| 250 | |||
| 251 | if ((newflags & ~(newflags >> 4)) & 0xf) { | ||
| 252 | error = -EACCES; | ||
| 253 | goto out; | ||
| 254 | } | ||
| 255 | |||
| 256 | error = security_file_mprotect(vma, reqprot, prot); | ||
| 257 | if (error) | ||
| 258 | goto out; | ||
| 259 | |||
| 260 | tmp = vma->vm_end; | ||
| 261 | if (tmp > end) | ||
| 262 | tmp = end; | ||
| 263 | error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); | ||
| 264 | if (error) | ||
| 265 | goto out; | ||
| 266 | nstart = tmp; | ||
| 267 | |||
| 268 | if (nstart < prev->vm_end) | ||
| 269 | nstart = prev->vm_end; | ||
| 270 | if (nstart >= end) | ||
| 271 | goto out; | ||
| 272 | |||
| 273 | vma = prev->vm_next; | ||
| 274 | if (!vma || vma->vm_start != nstart) { | ||
| 275 | error = -ENOMEM; | ||
| 276 | goto out; | ||
| 277 | } | ||
| 278 | } | ||
| 279 | out: | ||
| 280 | up_write(¤t->mm->mmap_sem); | ||
| 281 | return error; | ||
| 282 | } | ||
diff --git a/mm/mremap.c b/mm/mremap.c new file mode 100644 index 000000000000..0d1c1b9c7a0a --- /dev/null +++ b/mm/mremap.c | |||
| @@ -0,0 +1,426 @@ | |||
| 1 | /* | ||
| 2 | * mm/mremap.c | ||
| 3 | * | ||
| 4 | * (C) Copyright 1996 Linus Torvalds | ||
| 5 | * | ||
| 6 | * Address space accounting code <alan@redhat.com> | ||
| 7 | * (C) Copyright 2002 Red Hat Inc, All Rights Reserved | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/mm.h> | ||
| 11 | #include <linux/hugetlb.h> | ||
| 12 | #include <linux/slab.h> | ||
| 13 | #include <linux/shm.h> | ||
| 14 | #include <linux/mman.h> | ||
| 15 | #include <linux/swap.h> | ||
| 16 | #include <linux/fs.h> | ||
| 17 | #include <linux/highmem.h> | ||
| 18 | #include <linux/security.h> | ||
| 19 | #include <linux/syscalls.h> | ||
| 20 | |||
| 21 | #include <asm/uaccess.h> | ||
| 22 | #include <asm/cacheflush.h> | ||
| 23 | #include <asm/tlbflush.h> | ||
| 24 | |||
| 25 | static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) | ||
| 26 | { | ||
| 27 | pgd_t *pgd; | ||
| 28 | pud_t *pud; | ||
| 29 | pmd_t *pmd; | ||
| 30 | pte_t *pte = NULL; | ||
| 31 | |||
| 32 | pgd = pgd_offset(mm, addr); | ||
| 33 | if (pgd_none_or_clear_bad(pgd)) | ||
| 34 | goto end; | ||
| 35 | |||
| 36 | pud = pud_offset(pgd, addr); | ||
| 37 | if (pud_none_or_clear_bad(pud)) | ||
| 38 | goto end; | ||
| 39 | |||
| 40 | pmd = pmd_offset(pud, addr); | ||
| 41 | if (pmd_none_or_clear_bad(pmd)) | ||
| 42 | goto end; | ||
| 43 | |||
| 44 | pte = pte_offset_map_nested(pmd, addr); | ||
| 45 | if (pte_none(*pte)) { | ||
| 46 | pte_unmap_nested(pte); | ||
| 47 | pte = NULL; | ||
| 48 | } | ||
| 49 | end: | ||
| 50 | return pte; | ||
| 51 | } | ||
| 52 | |||
| 53 | static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr) | ||
| 54 | { | ||
| 55 | pgd_t *pgd; | ||
| 56 | pud_t *pud; | ||
| 57 | pmd_t *pmd; | ||
| 58 | |||
| 59 | pgd = pgd_offset(mm, addr); | ||
| 60 | if (pgd_none_or_clear_bad(pgd)) | ||
| 61 | return NULL; | ||
| 62 | |||
| 63 | pud = pud_offset(pgd, addr); | ||
| 64 | if (pud_none_or_clear_bad(pud)) | ||
| 65 | return NULL; | ||
| 66 | |||
| 67 | pmd = pmd_offset(pud, addr); | ||
| 68 | if (pmd_none_or_clear_bad(pmd)) | ||
| 69 | return NULL; | ||
| 70 | |||
| 71 | return pte_offset_map(pmd, addr); | ||
| 72 | } | ||
| 73 | |||
| 74 | static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) | ||
| 75 | { | ||
| 76 | pgd_t *pgd; | ||
| 77 | pud_t *pud; | ||
| 78 | pmd_t *pmd; | ||
| 79 | pte_t *pte = NULL; | ||
| 80 | |||
| 81 | pgd = pgd_offset(mm, addr); | ||
| 82 | |||
| 83 | pud = pud_alloc(mm, pgd, addr); | ||
| 84 | if (!pud) | ||
| 85 | return NULL; | ||
| 86 | pmd = pmd_alloc(mm, pud, addr); | ||
| 87 | if (pmd) | ||
| 88 | pte = pte_alloc_map(mm, pmd, addr); | ||
| 89 | return pte; | ||
| 90 | } | ||
| 91 | |||
| 92 | static int | ||
| 93 | move_one_page(struct vm_area_struct *vma, unsigned long old_addr, | ||
| 94 | struct vm_area_struct *new_vma, unsigned long new_addr) | ||
| 95 | { /* Moves the pte for old_addr (in vma) to new_addr (in new_vma). Returns 0, or -ENOMEM when a source pte exists but no destination pte could be obtained; a missing source pte is not an error. */ | ||
| 96 | struct address_space *mapping = NULL; /* set only for file-backed vmas; while NULL, i_mmap_lock is never touched */ | ||
| 97 | struct mm_struct *mm = vma->vm_mm; | ||
| 98 | int error = 0; | ||
| 99 | pte_t *src, *dst; | ||
| 100 | |||
| 101 | if (vma->vm_file) { | ||
| 102 | /* | ||
| 103 | * Subtle point from Rajesh Venkatasubramanian: before | ||
| 104 | * moving file-based ptes, we must lock vmtruncate out, | ||
| 105 | * since it might clean the dst vma before the src vma, | ||
| 106 | * and we propagate stale pages into the dst afterward. | ||
| 107 | */ | ||
| 108 | mapping = vma->vm_file->f_mapping; | ||
| 109 | spin_lock(&mapping->i_mmap_lock); | ||
| 110 | if (new_vma->vm_truncate_count && /* NOTE(review): a nonzero count that differs from the source's is reset to 0, presumably so a truncate in progress rescans new_vma - confirm */ | ||
| 111 | new_vma->vm_truncate_count != vma->vm_truncate_count) | ||
| 112 | new_vma->vm_truncate_count = 0; | ||
| 113 | } | ||
| 114 | spin_lock(&mm->page_table_lock); /* lock order used here: i_mmap_lock (if any) first, then page_table_lock */ | ||
| 115 | |||
| 116 | src = get_one_pte_map_nested(mm, old_addr); /* NULL when nothing is mapped at old_addr: nothing to move */ | ||
| 117 | if (src) { | ||
| 118 | /* | ||
| 119 | * Look to see whether alloc_one_pte_map needs to perform a | ||
| 120 | * memory allocation. If it does then we need to drop the | ||
| 121 | * atomic kmap | ||
| 122 | */ | ||
| 123 | dst = get_one_pte_map(mm, new_addr); /* fast path: destination page tables already exist */ | ||
| 124 | if (unlikely(!dst)) { | ||
| 125 | pte_unmap_nested(src); /* give up the src mapping before allocating */ | ||
| 126 | if (mapping) | ||
| 127 | spin_unlock(&mapping->i_mmap_lock); | ||
| 128 | dst = alloc_one_pte_map(mm, new_addr); /* may still be NULL; turned into -ENOMEM below if src is still mapped */ | ||
| 129 | if (mapping && !spin_trylock(&mapping->i_mmap_lock)) { /* trylock failed: release page_table_lock and retake both locks, i_mmap_lock first */ | ||
| 130 | spin_unlock(&mm->page_table_lock); | ||
| 131 | spin_lock(&mapping->i_mmap_lock); | ||
| 132 | spin_lock(&mm->page_table_lock); | ||
| 133 | } | ||
| 134 | src = get_one_pte_map_nested(mm, old_addr); /* look the source up again after the lock juggling */ | ||
| 135 | } | ||
| 136 | /* | ||
| 137 | * Since alloc_one_pte_map can drop and re-acquire | ||
| 138 | * page_table_lock, we should re-check the src entry... | ||
| 139 | */ | ||
| 140 | if (src) { | ||
| 141 | if (dst) { | ||
| 142 | pte_t pte; | ||
| 143 | pte = ptep_clear_flush(vma, old_addr, src); /* clear the old slot (with flush) and keep its value */ | ||
| 144 | set_pte_at(mm, new_addr, dst, pte); /* install that value at the new address */ | ||
| 145 | } else | ||
| 146 | error = -ENOMEM; | ||
| 147 | pte_unmap_nested(src); | ||
| 148 | } | ||
| 149 | if (dst) | ||
| 150 | pte_unmap(dst); | ||
| 151 | } | ||
| 152 | spin_unlock(&mm->page_table_lock); | ||
| 153 | if (mapping) | ||
| 154 | spin_unlock(&mapping->i_mmap_lock); | ||
| 155 | return error; | ||
| 156 | } | ||
| 157 | |||
| 158 | static unsigned long move_page_tables(struct vm_area_struct *vma, | ||
| 159 | unsigned long old_addr, struct vm_area_struct *new_vma, | ||
| 160 | unsigned long new_addr, unsigned long len) | ||
| 161 | { | ||
| 162 | unsigned long offset; | ||
| 163 | |||
| 164 | flush_cache_range(vma, old_addr, old_addr + len); | ||
| 165 | |||
| 166 | /* | ||
| 167 | * This is not the clever way to do this, but we're taking the | ||
| 168 | * easy way out on the assumption that most remappings will be | ||
| 169 | * only a few pages.. This also makes error recovery easier. | ||
| 170 | */ | ||
| 171 | for (offset = 0; offset < len; offset += PAGE_SIZE) { | ||
| 172 | if (move_one_page(vma, old_addr + offset, | ||
| 173 | new_vma, new_addr + offset) < 0) | ||
| 174 | break; | ||
| 175 | cond_resched(); | ||
| 176 | } | ||
| 177 | return offset; | ||
| 178 | } | ||
| 179 | |||
| 180 | static unsigned long move_vma(struct vm_area_struct *vma, | ||
| 181 | unsigned long old_addr, unsigned long old_len, | ||
| 182 | unsigned long new_len, unsigned long new_addr) | ||
| 183 | { /* Moves old_len bytes at old_addr in vma into a vma of new_len bytes at new_addr, then unmaps the old range. Returns new_addr, or -ENOMEM (as unsigned long) on failure. */ | ||
| 184 | struct mm_struct *mm = vma->vm_mm; | ||
| 185 | struct vm_area_struct *new_vma; | ||
| 186 | unsigned long vm_flags = vma->vm_flags; /* snapshot: vma and its flags are modified below */ | ||
| 187 | unsigned long new_pgoff; | ||
| 188 | unsigned long moved_len; | ||
| 189 | unsigned long excess = 0; /* bytes of vma lying outside the range being unmapped */ | ||
| 190 | int split = 0; /* set when the unmapped range is strictly inside vma */ | ||
| 191 | |||
| 192 | /* | ||
| 193 | * We'd prefer to avoid failure later on in do_munmap: | ||
| 194 | * which may split one vma into three before unmapping. | ||
| 195 | */ | ||
| 196 | if (mm->map_count >= sysctl_max_map_count - 3) | ||
| 197 | return -ENOMEM; | ||
| 198 | |||
| 199 | new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); /* page offset that corresponds to old_addr within vma */ | ||
| 200 | new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); /* copy_vma may redirect vma if the source got merged */ | ||
| 201 | if (!new_vma) | ||
| 202 | return -ENOMEM; | ||
| 203 | |||
| 204 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); | ||
| 205 | if (moved_len < old_len) { | ||
| 206 | /* | ||
| 207 | * On error, move entries back from new area to old, | ||
| 208 | * which will succeed since page tables still there, | ||
| 209 | * and then proceed to unmap new area instead of old. | ||
| 210 | */ | ||
| 211 | move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); | ||
| 212 | vma = new_vma; /* from here on the "old" names describe the new area, which is the one torn down */ | ||
| 213 | old_len = new_len; | ||
| 214 | old_addr = new_addr; | ||
| 215 | new_addr = -ENOMEM; /* becomes the return value */ | ||
| 216 | } | ||
| 217 | |||
| 218 | /* Conceal VM_ACCOUNT so old reservation is not undone */ | ||
| 219 | if (vm_flags & VM_ACCOUNT) { | ||
| 220 | vma->vm_flags &= ~VM_ACCOUNT; | ||
| 221 | excess = vma->vm_end - vma->vm_start - old_len; | ||
| 222 | if (old_addr > vma->vm_start && | ||
| 223 | old_addr + old_len < vma->vm_end) | ||
| 224 | split = 1; | ||
| 225 | } | ||
| 226 | |||
| 227 | if (do_munmap(mm, old_addr, old_len) < 0) { | ||
| 228 | /* OOM: unable to split vma, just get accounts right */ | ||
| 229 | vm_unacct_memory(excess >> PAGE_SHIFT); | ||
| 230 | excess = 0; /* skip the VM_ACCOUNT restore below */ | ||
| 231 | } | ||
| 232 | |||
| 233 | /* Restore VM_ACCOUNT if one or two pieces of vma left */ | ||
| 234 | if (excess) { | ||
| 235 | vma->vm_flags |= VM_ACCOUNT; | ||
| 236 | if (split) | ||
| 237 | vma->vm_next->vm_flags |= VM_ACCOUNT; /* the second leftover piece is taken to be vma's successor */ | ||
| 238 | } | ||
| 239 | |||
| 240 | mm->total_vm += new_len >> PAGE_SHIFT; | ||
| 241 | __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); | ||
| 242 | if (vm_flags & VM_LOCKED) { | ||
| 243 | mm->locked_vm += new_len >> PAGE_SHIFT; | ||
| 244 | if (new_len > old_len) /* only the part beyond old_len is faulted in here */ | ||
| 245 | make_pages_present(new_addr + old_len, | ||
| 246 | new_addr + new_len); | ||
| 247 | } | ||
| 248 | |||
| 249 | return new_addr; | ||
| 250 | } | ||
| 251 | |||
| 252 | /* | ||
| 253 | * Expand (or shrink) an existing mapping, potentially moving it at the | ||
| 254 | * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) | ||
| 255 | * | ||
| 256 | * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise | ||
| 257 | * This option implies MREMAP_MAYMOVE. | ||
| 258 | * Returns the resulting address, or a negative errno value carried in the unsigned long return. */ | ||
| 259 | unsigned long do_mremap(unsigned long addr, | ||
| 260 | unsigned long old_len, unsigned long new_len, | ||
| 261 | unsigned long flags, unsigned long new_addr) | ||
| 262 | { | ||
| 263 | struct vm_area_struct *vma; | ||
| 264 | unsigned long ret = -EINVAL; | ||
| 265 | unsigned long charged = 0; /* pages charged via security_vm_enough_memory; given back at "out" when ret is an error */ | ||
| 266 | |||
| 267 | if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) /* unknown flag bits */ | ||
| 268 | goto out; | ||
| 269 | |||
| 270 | if (addr & ~PAGE_MASK) /* addr must be page aligned */ | ||
| 271 | goto out; | ||
| 272 | |||
| 273 | old_len = PAGE_ALIGN(old_len); | ||
| 274 | new_len = PAGE_ALIGN(new_len); | ||
| 275 | |||
| 276 | /* | ||
| 277 | * We allow a zero old-len as a special case | ||
| 278 | * for DOS-emu "duplicate shm area" thing. But | ||
| 279 | * a zero new-len is nonsensical. | ||
| 280 | */ | ||
| 281 | if (!new_len) | ||
| 282 | goto out; | ||
| 283 | |||
| 284 | /* new_addr is only valid if MREMAP_FIXED is specified */ | ||
| 285 | if (flags & MREMAP_FIXED) { | ||
| 286 | if (new_addr & ~PAGE_MASK) | ||
| 287 | goto out; | ||
| 288 | if (!(flags & MREMAP_MAYMOVE)) | ||
| 289 | goto out; | ||
| 290 | |||
| 291 | if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) /* the target range must fit below TASK_SIZE */ | ||
| 292 | goto out; | ||
| 293 | |||
| 294 | /* Check if the location we're moving into overlaps the | ||
| 295 | * old location at all, and fail if it does. | ||
| 296 | */ | ||
| 297 | if ((new_addr <= addr) && (new_addr+new_len) > addr) | ||
| 298 | goto out; | ||
| 299 | |||
| 300 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | ||
| 301 | goto out; | ||
| 302 | |||
| 303 | ret = do_munmap(current->mm, new_addr, new_len); /* clear whatever is currently mapped at the fixed target */ | ||
| 304 | if (ret) | ||
| 305 | goto out; | ||
| 306 | } | ||
| 307 | |||
| 308 | /* | ||
| 309 | * Always allow a shrinking remap: that just unmaps | ||
| 310 | * the unnecessary pages.. | ||
| 311 | * do_munmap does all the needed commit accounting | ||
| 312 | */ | ||
| 313 | if (old_len >= new_len) { | ||
| 314 | ret = do_munmap(current->mm, addr+new_len, old_len - new_len); | ||
| 315 | if (ret && old_len != new_len) /* a do_munmap error is ignored when the lengths are equal (nothing to trim) */ | ||
| 316 | goto out; | ||
| 317 | ret = addr; | ||
| 318 | if (!(flags & MREMAP_FIXED) || (new_addr == addr)) /* finished unless MREMAP_FIXED asks for a different address */ | ||
| 319 | goto out; | ||
| 320 | old_len = new_len; /* the tail is gone; what remains is a pure relocation */ | ||
| 321 | } | ||
| 322 | |||
| 323 | /* | ||
| 324 | * Ok, we need to grow.. or relocate. | ||
| 325 | */ | ||
| 326 | ret = -EFAULT; | ||
| 327 | vma = find_vma(current->mm, addr); | ||
| 328 | if (!vma || vma->vm_start > addr) /* addr is not inside any vma */ | ||
| 329 | goto out; | ||
| 330 | if (is_vm_hugetlb_page(vma)) { | ||
| 331 | ret = -EINVAL; | ||
| 332 | goto out; | ||
| 333 | } | ||
| 334 | /* We can't remap across vm area boundaries */ | ||
| 335 | if (old_len > vma->vm_end - addr) | ||
| 336 | goto out; | ||
| 337 | if (vma->vm_flags & VM_DONTEXPAND) { | ||
| 338 | if (new_len > old_len) | ||
| 339 | goto out; | ||
| 340 | } | ||
| 341 | if (vma->vm_flags & VM_LOCKED) { | ||
| 342 | unsigned long locked, lock_limit; | ||
| 343 | locked = current->mm->locked_vm << PAGE_SHIFT; | ||
| 344 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | ||
| 345 | locked += new_len - old_len; /* locked bytes if the growth is granted */ | ||
| 346 | ret = -EAGAIN; | ||
| 347 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
| 348 | goto out; | ||
| 349 | } | ||
| 350 | ret = -ENOMEM; | ||
| 351 | if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) /* RLIMIT_AS check on the grown total */ | ||
| 352 | > current->signal->rlim[RLIMIT_AS].rlim_cur) | ||
| 353 | goto out; | ||
| 354 | |||
| 355 | if (vma->vm_flags & VM_ACCOUNT) { | ||
| 356 | charged = (new_len - old_len) >> PAGE_SHIFT; | ||
| 357 | if (security_vm_enough_memory(charged)) | ||
| 358 | goto out_nc; /* charge refused: exit past the vm_unacct_memory at "out" */ | ||
| 359 | } | ||
| 360 | |||
| 361 | /* old_len exactly to the end of the area.. | ||
| 362 | * And we're not relocating the area. | ||
| 363 | */ | ||
| 364 | if (old_len == vma->vm_end - addr && | ||
| 365 | !((flags & MREMAP_FIXED) && (addr != new_addr)) && | ||
| 366 | (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { | ||
| 367 | unsigned long max_addr = TASK_SIZE; /* upper bound for in-place growth: next vma's start, else TASK_SIZE */ | ||
| 368 | if (vma->vm_next) | ||
| 369 | max_addr = vma->vm_next->vm_start; | ||
| 370 | /* can we just expand the current mapping? */ | ||
| 371 | if (max_addr - addr >= new_len) { | ||
| 372 | int pages = (new_len - old_len) >> PAGE_SHIFT; | ||
| 373 | |||
| 374 | vma_adjust(vma, vma->vm_start, | ||
| 375 | addr + new_len, vma->vm_pgoff, NULL); | ||
| 376 | |||
| 377 | current->mm->total_vm += pages; | ||
| 378 | __vm_stat_account(vma->vm_mm, vma->vm_flags, | ||
| 379 | vma->vm_file, pages); | ||
| 380 | if (vma->vm_flags & VM_LOCKED) { | ||
| 381 | current->mm->locked_vm += pages; | ||
| 382 | make_pages_present(addr + old_len, | ||
| 383 | addr + new_len); | ||
| 384 | } | ||
| 385 | ret = addr; | ||
| 386 | goto out; | ||
| 387 | } | ||
| 388 | } | ||
| 389 | |||
| 390 | /* | ||
| 391 | * We weren't able to just expand or shrink the area, | ||
| 392 | * we need to create a new one and move it.. | ||
| 393 | */ | ||
| 394 | ret = -ENOMEM; /* stays the result when MREMAP_MAYMOVE is not set */ | ||
| 395 | if (flags & MREMAP_MAYMOVE) { | ||
| 396 | if (!(flags & MREMAP_FIXED)) { | ||
| 397 | unsigned long map_flags = 0; | ||
| 398 | if (vma->vm_flags & VM_MAYSHARE) | ||
| 399 | map_flags |= MAP_SHARED; | ||
| 400 | |||
| 401 | new_addr = get_unmapped_area(vma->vm_file, 0, new_len, | ||
| 402 | vma->vm_pgoff, map_flags); | ||
| 403 | ret = new_addr; | ||
| 404 | if (new_addr & ~PAGE_MASK) /* a result that is not page aligned is treated as an error code */ | ||
| 405 | goto out; | ||
| 406 | } | ||
| 407 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | ||
| 408 | } | ||
| 409 | out: | ||
| 410 | if (ret & ~PAGE_MASK) /* same convention: a non-aligned ret is an error, so undo the charge */ | ||
| 411 | vm_unacct_memory(charged); | ||
| 412 | out_nc: | ||
| 413 | return ret; | ||
| 414 | } | ||
| 415 | |||
| 416 | asmlinkage unsigned long sys_mremap(unsigned long addr, | ||
| 417 | unsigned long old_len, unsigned long new_len, | ||
| 418 | unsigned long flags, unsigned long new_addr) | ||
| 419 | { | ||
| 420 | unsigned long ret; | ||
| 421 | |||
| 422 | down_write(¤t->mm->mmap_sem); | ||
| 423 | ret = do_mremap(addr, old_len, new_len, flags, new_addr); | ||
| 424 | up_write(¤t->mm->mmap_sem); | ||
| 425 | return ret; | ||
| 426 | } | ||
diff --git a/mm/msync.c b/mm/msync.c new file mode 100644 index 000000000000..090f426bca7d --- /dev/null +++ b/mm/msync.c | |||
| @@ -0,0 +1,236 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/msync.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1994-1999 Linus Torvalds | ||
| 5 | */ | ||
| 6 | |||
| 7 | /* | ||
| 8 | * The msync() system call. | ||
| 9 | */ | ||
| 10 | #include <linux/slab.h> | ||
| 11 | #include <linux/pagemap.h> | ||
| 12 | #include <linux/mm.h> | ||
| 13 | #include <linux/mman.h> | ||
| 14 | #include <linux/hugetlb.h> | ||
| 15 | #include <linux/syscalls.h> | ||
| 16 | |||
| 17 | #include <asm/pgtable.h> | ||
| 18 | #include <asm/tlbflush.h> | ||
| 19 | |||
| 20 | /* | ||
| 21 | * Called with mm->page_table_lock held to protect against other | ||
| 22 | * threads/the swapper from ripping pte's out from under us. | ||
| 23 | */ | ||
| 24 | |||
| 25 | static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | ||
| 26 | unsigned long addr, unsigned long end) | ||
| 27 | { | ||
| 28 | pte_t *pte; | ||
| 29 | |||
| 30 | pte = pte_offset_map(pmd, addr); | ||
| 31 | do { | ||
| 32 | unsigned long pfn; | ||
| 33 | struct page *page; | ||
| 34 | |||
| 35 | if (!pte_present(*pte)) | ||
| 36 | continue; | ||
| 37 | pfn = pte_pfn(*pte); | ||
| 38 | if (!pfn_valid(pfn)) | ||
| 39 | continue; | ||
| 40 | page = pfn_to_page(pfn); | ||
| 41 | if (PageReserved(page)) | ||
| 42 | continue; | ||
| 43 | |||
| 44 | if (ptep_clear_flush_dirty(vma, addr, pte) || | ||
| 45 | page_test_and_clear_dirty(page)) | ||
| 46 | set_page_dirty(page); | ||
| 47 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
| 48 | pte_unmap(pte - 1); | ||
| 49 | } | ||
| 50 | |||
| 51 | static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud, | ||
| 52 | unsigned long addr, unsigned long end) | ||
| 53 | { | ||
| 54 | pmd_t *pmd; | ||
| 55 | unsigned long next; | ||
| 56 | |||
| 57 | pmd = pmd_offset(pud, addr); | ||
| 58 | do { | ||
| 59 | next = pmd_addr_end(addr, end); | ||
| 60 | if (pmd_none_or_clear_bad(pmd)) | ||
| 61 | continue; | ||
| 62 | sync_pte_range(vma, pmd, addr, next); | ||
| 63 | } while (pmd++, addr = next, addr != end); | ||
| 64 | } | ||
| 65 | |||
| 66 | static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | ||
| 67 | unsigned long addr, unsigned long end) | ||
| 68 | { | ||
| 69 | pud_t *pud; | ||
| 70 | unsigned long next; | ||
| 71 | |||
| 72 | pud = pud_offset(pgd, addr); | ||
| 73 | do { | ||
| 74 | next = pud_addr_end(addr, end); | ||
| 75 | if (pud_none_or_clear_bad(pud)) | ||
| 76 | continue; | ||
| 77 | sync_pmd_range(vma, pud, addr, next); | ||
| 78 | } while (pud++, addr = next, addr != end); | ||
| 79 | } | ||
| 80 | |||
| 81 | static void sync_page_range(struct vm_area_struct *vma, | ||
| 82 | unsigned long addr, unsigned long end) | ||
| 83 | { | ||
| 84 | struct mm_struct *mm = vma->vm_mm; | ||
| 85 | pgd_t *pgd; | ||
| 86 | unsigned long next; | ||
| 87 | |||
| 88 | /* For hugepages we can't go walking the page table normally, | ||
| 89 | * but that's ok, hugetlbfs is memory based, so we don't need | ||
| 90 | * to do anything more on an msync() */ | ||
| 91 | if (is_vm_hugetlb_page(vma)) | ||
| 92 | return; | ||
| 93 | |||
| 94 | BUG_ON(addr >= end); | ||
| 95 | pgd = pgd_offset(mm, addr); | ||
| 96 | flush_cache_range(vma, addr, end); | ||
| 97 | spin_lock(&mm->page_table_lock); | ||
| 98 | do { | ||
| 99 | next = pgd_addr_end(addr, end); | ||
| 100 | if (pgd_none_or_clear_bad(pgd)) | ||
| 101 | continue; | ||
| 102 | sync_pud_range(vma, pgd, addr, next); | ||
| 103 | } while (pgd++, addr = next, addr != end); | ||
| 104 | spin_unlock(&mm->page_table_lock); | ||
| 105 | } | ||
| 106 | |||
/*
 * Sync the dirty bits for [addr, end) of @vma.
 *
 * With CONFIG_PREEMPT the range is processed in 64KiB pieces with a
 * cond_resched() after each piece; otherwise it is handed to
 * sync_page_range() in one go.
 */
static inline void filemap_sync(struct vm_area_struct *vma,
				unsigned long addr, unsigned long end)
{
#ifdef CONFIG_PREEMPT
	const size_t chunk = 64 * 1024;	/* bytes */
	unsigned long stop;

	do {
		stop = addr + chunk;
		/* clamp to the end of the range, also if addr + chunk wrapped */
		if (stop > end || stop < addr)
			stop = end;
		sync_page_range(vma, addr, stop);
		cond_resched();
		addr = stop;
	} while (addr != end);
#else
	sync_page_range(vma, addr, end);
#endif
}
| 129 | |||
| 130 | /* | ||
| 131 | * MS_SYNC syncs the entire file - including mappings. | ||
| 132 | * | ||
| 133 | * MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just | ||
| 134 | * marks the relevant pages dirty. The application may now run fsync() to | ||
| 135 | * write out the dirty pages and wait on the writeout and check the result. | ||
| 136 | * Or the application may run fadvise(FADV_DONTNEED) against the fd to start | ||
| 137 | * async writeout immediately. | ||
| 138 | * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to | ||
| 139 | * applications. | ||
| 140 | */ | ||
| 141 | static int msync_interval(struct vm_area_struct *vma, | ||
| 142 | unsigned long addr, unsigned long end, int flags) | ||
| 143 | { | ||
| 144 | int ret = 0; | ||
| 145 | struct file *file = vma->vm_file; | ||
| 146 | |||
| 147 | if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) | ||
| 148 | return -EBUSY; | ||
| 149 | |||
| 150 | if (file && (vma->vm_flags & VM_SHARED)) { | ||
| 151 | filemap_sync(vma, addr, end); | ||
| 152 | |||
| 153 | if (flags & MS_SYNC) { | ||
| 154 | struct address_space *mapping = file->f_mapping; | ||
| 155 | int err; | ||
| 156 | |||
| 157 | ret = filemap_fdatawrite(mapping); | ||
| 158 | if (file->f_op && file->f_op->fsync) { | ||
| 159 | /* | ||
| 160 | * We don't take i_sem here because mmap_sem | ||
| 161 | * is already held. | ||
| 162 | */ | ||
| 163 | err = file->f_op->fsync(file,file->f_dentry,1); | ||
| 164 | if (err && !ret) | ||
| 165 | ret = err; | ||
| 166 | } | ||
| 167 | err = filemap_fdatawait(mapping); | ||
| 168 | if (!ret) | ||
| 169 | ret = err; | ||
| 170 | } | ||
| 171 | } | ||
| 172 | return ret; | ||
| 173 | } | ||
| 174 | |||
| 175 | asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | ||
| 176 | { | ||
| 177 | unsigned long end; | ||
| 178 | struct vm_area_struct *vma; | ||
| 179 | int unmapped_error, error = -EINVAL; | ||
| 180 | |||
| 181 | if (flags & MS_SYNC) | ||
| 182 | current->flags |= PF_SYNCWRITE; | ||
| 183 | |||
| 184 | down_read(¤t->mm->mmap_sem); | ||
| 185 | if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) | ||
| 186 | goto out; | ||
| 187 | if (start & ~PAGE_MASK) | ||
| 188 | goto out; | ||
| 189 | if ((flags & MS_ASYNC) && (flags & MS_SYNC)) | ||
| 190 | goto out; | ||
| 191 | error = -ENOMEM; | ||
| 192 | len = (len + ~PAGE_MASK) & PAGE_MASK; | ||
| 193 | end = start + len; | ||
| 194 | if (end < start) | ||
| 195 | goto out; | ||
| 196 | error = 0; | ||
| 197 | if (end == start) | ||
| 198 | goto out; | ||
| 199 | /* | ||
| 200 | * If the interval [start,end) covers some unmapped address ranges, | ||
| 201 | * just ignore them, but return -ENOMEM at the end. | ||
| 202 | */ | ||
| 203 | vma = find_vma(current->mm, start); | ||
| 204 | unmapped_error = 0; | ||
| 205 | for (;;) { | ||
| 206 | /* Still start < end. */ | ||
| 207 | error = -ENOMEM; | ||
| 208 | if (!vma) | ||
| 209 | goto out; | ||
| 210 | /* Here start < vma->vm_end. */ | ||
| 211 | if (start < vma->vm_start) { | ||
| 212 | unmapped_error = -ENOMEM; | ||
| 213 | start = vma->vm_start; | ||
| 214 | } | ||
| 215 | /* Here vma->vm_start <= start < vma->vm_end. */ | ||
| 216 | if (end <= vma->vm_end) { | ||
| 217 | if (start < end) { | ||
| 218 | error = msync_interval(vma, start, end, flags); | ||
| 219 | if (error) | ||
| 220 | goto out; | ||
| 221 | } | ||
| 222 | error = unmapped_error; | ||
| 223 | goto out; | ||
| 224 | } | ||
| 225 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | ||
| 226 | error = msync_interval(vma, start, vma->vm_end, flags); | ||
| 227 | if (error) | ||
| 228 | goto out; | ||
| 229 | start = vma->vm_end; | ||
| 230 | vma = vma->vm_next; | ||
| 231 | } | ||
| 232 | out: | ||
| 233 | up_read(¤t->mm->mmap_sem); | ||
| 234 | current->flags &= ~PF_SYNCWRITE; | ||
| 235 | return error; | ||
| 236 | } | ||
diff --git a/mm/nommu.c b/mm/nommu.c new file mode 100644 index 000000000000..b293ec1cc4e6 --- /dev/null +++ b/mm/nommu.c | |||
| @@ -0,0 +1,1180 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/nommu.c | ||
| 3 | * | ||
| 4 | * Replacement code for mm functions to support CPUs that don't | ||
| 5 | * have any form of memory management unit (thus no virtual memory). | ||
| 6 | * | ||
| 7 | * See Documentation/nommu-mmap.txt | ||
| 8 | * | ||
| 9 | * Copyright (c) 2004-2005 David Howells <dhowells@redhat.com> | ||
| 10 | * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> | ||
| 11 | * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> | ||
| 12 | * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> | ||
| 13 | */ | ||
| 14 | |||
| 15 | #include <linux/mm.h> | ||
| 16 | #include <linux/mman.h> | ||
| 17 | #include <linux/swap.h> | ||
| 18 | #include <linux/file.h> | ||
| 19 | #include <linux/highmem.h> | ||
| 20 | #include <linux/pagemap.h> | ||
| 21 | #include <linux/slab.h> | ||
| 22 | #include <linux/vmalloc.h> | ||
| 23 | #include <linux/ptrace.h> | ||
| 24 | #include <linux/blkdev.h> | ||
| 25 | #include <linux/backing-dev.h> | ||
| 26 | #include <linux/mount.h> | ||
| 27 | #include <linux/personality.h> | ||
| 28 | #include <linux/security.h> | ||
| 29 | #include <linux/syscalls.h> | ||
| 30 | |||
| 31 | #include <asm/uaccess.h> | ||
| 32 | #include <asm/tlb.h> | ||
| 33 | #include <asm/tlbflush.h> | ||
| 34 | |||
| 35 | void *high_memory; | ||
| 36 | struct page *mem_map; | ||
| 37 | unsigned long max_mapnr; | ||
| 38 | unsigned long num_physpages; | ||
| 39 | unsigned long askedalloc, realalloc; | ||
| 40 | atomic_t vm_committed_space = ATOMIC_INIT(0); | ||
| 41 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | ||
| 42 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | ||
| 43 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | ||
| 44 | int heap_stack_gap = 0; | ||
| 45 | |||
| 46 | EXPORT_SYMBOL(mem_map); | ||
| 47 | EXPORT_SYMBOL(sysctl_max_map_count); | ||
| 48 | EXPORT_SYMBOL(sysctl_overcommit_memory); | ||
| 49 | EXPORT_SYMBOL(sysctl_overcommit_ratio); | ||
| 50 | EXPORT_SYMBOL(vm_committed_space); | ||
| 51 | EXPORT_SYMBOL(__vm_enough_memory); | ||
| 52 | |||
| 53 | /* list of shareable VMAs */ | ||
| 54 | struct rb_root nommu_vma_tree = RB_ROOT; | ||
| 55 | DECLARE_RWSEM(nommu_vma_sem); | ||
| 56 | |||
| 57 | struct vm_operations_struct generic_file_vm_ops = { | ||
| 58 | }; | ||
| 59 | |||
| 60 | /* | ||
| 61 | * Handle all mappings that got truncated by a "truncate()" | ||
| 62 | * system call. | ||
| 63 | * | ||
| 64 | * NOTE! We have to be ready to update the memory sharing | ||
| 65 | * between the file and the memory map for a potential last | ||
| 66 | * incomplete page. Ugly, but necessary. | ||
| 67 | */ | ||
| 68 | int vmtruncate(struct inode *inode, loff_t offset) | ||
| 69 | { | ||
| 70 | struct address_space *mapping = inode->i_mapping; | ||
| 71 | unsigned long limit; | ||
| 72 | |||
| 73 | if (inode->i_size < offset) | ||
| 74 | goto do_expand; | ||
| 75 | i_size_write(inode, offset); | ||
| 76 | |||
| 77 | truncate_inode_pages(mapping, offset); | ||
| 78 | goto out_truncate; | ||
| 79 | |||
| 80 | do_expand: | ||
| 81 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | ||
| 82 | if (limit != RLIM_INFINITY && offset > limit) | ||
| 83 | goto out_sig; | ||
| 84 | if (offset > inode->i_sb->s_maxbytes) | ||
| 85 | goto out; | ||
| 86 | i_size_write(inode, offset); | ||
| 87 | |||
| 88 | out_truncate: | ||
| 89 | if (inode->i_op && inode->i_op->truncate) | ||
| 90 | inode->i_op->truncate(inode); | ||
| 91 | return 0; | ||
| 92 | out_sig: | ||
| 93 | send_sig(SIGXFSZ, current, 0); | ||
| 94 | out: | ||
| 95 | return -EFBIG; | ||
| 96 | } | ||
| 97 | |||
| 98 | EXPORT_SYMBOL(vmtruncate); | ||
| 99 | |||
| 100 | /* | ||
| 101 | * Return the total memory allocated for this pointer, not | ||
| 102 | * just what the caller asked for. | ||
| 103 | * | ||
| 104 | * Doesn't have to be accurate, i.e. may have races. | ||
| 105 | */ | ||
| 106 | unsigned int kobjsize(const void *objp) | ||
| 107 | { | ||
| 108 | struct page *page; | ||
| 109 | |||
| 110 | if (!objp || !((page = virt_to_page(objp)))) | ||
| 111 | return 0; | ||
| 112 | |||
| 113 | if (PageSlab(page)) | ||
| 114 | return ksize(objp); | ||
| 115 | |||
| 116 | BUG_ON(page->index < 0); | ||
| 117 | BUG_ON(page->index >= MAX_ORDER); | ||
| 118 | |||
| 119 | return (PAGE_SIZE << page->index); | ||
| 120 | } | ||
| 121 | |||
| 122 | /* | ||
| 123 | * The nommu dodgy version :-) | ||
| 124 | */ | ||
| 125 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
| 126 | unsigned long start, int len, int write, int force, | ||
| 127 | struct page **pages, struct vm_area_struct **vmas) | ||
| 128 | { | ||
| 129 | int i; | ||
| 130 | static struct vm_area_struct dummy_vma; | ||
| 131 | |||
| 132 | for (i = 0; i < len; i++) { | ||
| 133 | if (pages) { | ||
| 134 | pages[i] = virt_to_page(start); | ||
| 135 | if (pages[i]) | ||
| 136 | page_cache_get(pages[i]); | ||
| 137 | } | ||
| 138 | if (vmas) | ||
| 139 | vmas[i] = &dummy_vma; | ||
| 140 | start += PAGE_SIZE; | ||
| 141 | } | ||
| 142 | return(i); | ||
| 143 | } | ||
| 144 | |||
/* vmalloc area list head and its lock; nothing in this part of the file uses them */
struct vm_struct *vmlist;
DEFINE_RWLOCK(vmlist_lock);
| 147 | |||
/*
 * vfree - release memory obtained from the vmalloc family
 *
 * __vmalloc() in this file is backed by kmalloc(), so freeing is a plain
 * kfree().
 */
void vfree(void *addr)
{
	kfree(addr);
}
| 152 | |||
| 153 | void *__vmalloc(unsigned long size, int gfp_mask, pgprot_t prot) | ||
| 154 | { | ||
| 155 | /* | ||
| 156 | * kmalloc doesn't like __GFP_HIGHMEM for some reason | ||
| 157 | */ | ||
| 158 | return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); | ||
| 159 | } | ||
| 160 | |||
/*
 * Map a vmalloc address to its struct page; here this is simply
 * virt_to_page().
 */
struct page *vmalloc_to_page(void *addr)
{
	struct page *page = virt_to_page(addr);

	return page;
}
| 165 | |||
/*
 * Map a vmalloc address to its page frame number via virt_to_page().
 */
unsigned long vmalloc_to_pfn(void *addr)
{
	struct page *page = virt_to_page(addr);

	return page_to_pfn(page);
}
| 170 | |||
| 171 | |||
/*
 * vread - copy @count bytes from @addr into @buf
 *
 * As in vwrite(), @count is clamped so that neither the source nor the
 * destination range wraps past the top of the address space.
 *
 * Returns the number of bytes copied.
 */
long vread(char *buf, char *addr, unsigned long count)
{
	/* Don't allow overflow */
	if ((unsigned long) buf + count < count)
		count = -(unsigned long) buf;
	if ((unsigned long) addr + count < count)
		count = -(unsigned long) addr;

	memcpy(buf, addr, count);
	return count;
}
| 177 | |||
/*
 * vwrite - copy @count bytes from @buf to @addr
 *
 * @count is reduced if @addr + @count would wrap past the top of the
 * address space.  Returns the number of bytes copied.
 */
long vwrite(char *buf, char *addr, unsigned long count)
{
	unsigned long dst = (unsigned long) addr;

	/* Don't allow overflow */
	if (dst + count < count)
		count = -dst;

	memcpy(addr, buf, count);
	return count;
}
| 187 | |||
| 188 | /* | ||
| 189 | * vmalloc - allocate virtually continguos memory | ||
| 190 | * | ||
| 191 | * @size: allocation size | ||
| 192 | * | ||
| 193 | * Allocate enough pages to cover @size from the page level | ||
| 194 | * allocator and map them into continguos kernel virtual space. | ||
| 195 | * | ||
| 196 | * For tight cotrol over page level allocator and protection flags | ||
| 197 | * use __vmalloc() instead. | ||
| 198 | */ | ||
| 199 | void *vmalloc(unsigned long size) | ||
| 200 | { | ||
| 201 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); | ||
| 202 | } | ||
| 203 | |||
| 204 | /* | ||
| 205 | * vmalloc_32 - allocate virtually continguos memory (32bit addressable) | ||
| 206 | * | ||
| 207 | * @size: allocation size | ||
| 208 | * | ||
| 209 | * Allocate enough 32bit PA addressable pages to cover @size from the | ||
| 210 | * page level allocator and map them into continguos kernel virtual space. | ||
| 211 | */ | ||
| 212 | void *vmalloc_32(unsigned long size) | ||
| 213 | { | ||
| 214 | return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); | ||
| 215 | } | ||
| 216 | |||
| 217 | void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) | ||
| 218 | { | ||
| 219 | BUG(); | ||
| 220 | return NULL; | ||
| 221 | } | ||
| 222 | |||
/*
 * vunmap - not supported here: any call hits BUG().
 */
void vunmap(void *addr)
{
	BUG();
}
| 227 | |||
| 228 | /* | ||
| 229 | * sys_brk() for the most part doesn't need the global kernel | ||
| 230 | * lock, except when an application is doing something nasty | ||
| 231 | * like trying to un-brk an area that has already been mapped | ||
| 232 | * to a regular file. in this case, the unmapping will need | ||
| 233 | * to invoke file system routines that need the global lock. | ||
| 234 | */ | ||
| 235 | asmlinkage unsigned long sys_brk(unsigned long brk) | ||
| 236 | { | ||
| 237 | struct mm_struct *mm = current->mm; | ||
| 238 | |||
| 239 | if (brk < mm->start_brk || brk > mm->context.end_brk) | ||
| 240 | return mm->brk; | ||
| 241 | |||
| 242 | if (mm->brk == brk) | ||
| 243 | return mm->brk; | ||
| 244 | |||
| 245 | /* | ||
| 246 | * Always allow shrinking brk | ||
| 247 | */ | ||
| 248 | if (brk <= mm->brk) { | ||
| 249 | mm->brk = brk; | ||
| 250 | return brk; | ||
| 251 | } | ||
| 252 | |||
| 253 | /* | ||
| 254 | * Ok, looks good - let it rip. | ||
| 255 | */ | ||
| 256 | return mm->brk = brk; | ||
| 257 | } | ||
| 258 | |||
#ifdef DEBUG
/*
 * Debugging aid: print the current process's list of mappings, one
 * "<list entry>: <vma>" pair per entry, plus the allocation size, start
 * address and usage count of each vma that is present.
 */
static void show_process_blocks(void)
{
	struct vm_list_struct *vml;

	printk("Process blocks %d:", current->pid);

	for (vml = &current->mm->context.vmlist; vml; vml = vml->next) {
		printk(" %p: %p", vml, vml->vma);
		if (vml->vma)
			/* kobjsize() returns unsigned int, hence %u */
			printk(" (%u @%lx #%d)",
			       kobjsize((void *) vml->vma->vm_start),
			       vml->vma->vm_start,
			       atomic_read(&vml->vma->vm_usage));
		printk(vml->next ? " ->" : ".\n");
	}
}
#endif /* DEBUG */
| 277 | |||
| 278 | static inline struct vm_area_struct *find_nommu_vma(unsigned long start) | ||
| 279 | { | ||
| 280 | struct vm_area_struct *vma; | ||
| 281 | struct rb_node *n = nommu_vma_tree.rb_node; | ||
| 282 | |||
| 283 | while (n) { | ||
| 284 | vma = rb_entry(n, struct vm_area_struct, vm_rb); | ||
| 285 | |||
| 286 | if (start < vma->vm_start) | ||
| 287 | n = n->rb_left; | ||
| 288 | else if (start > vma->vm_start) | ||
| 289 | n = n->rb_right; | ||
| 290 | else | ||
| 291 | return vma; | ||
| 292 | } | ||
| 293 | |||
| 294 | return NULL; | ||
| 295 | } | ||
| 296 | |||
| 297 | static void add_nommu_vma(struct vm_area_struct *vma) | ||
| 298 | { | ||
| 299 | struct vm_area_struct *pvma; | ||
| 300 | struct address_space *mapping; | ||
| 301 | struct rb_node **p = &nommu_vma_tree.rb_node; | ||
| 302 | struct rb_node *parent = NULL; | ||
| 303 | |||
| 304 | /* add the VMA to the mapping */ | ||
| 305 | if (vma->vm_file) { | ||
| 306 | mapping = vma->vm_file->f_mapping; | ||
| 307 | |||
| 308 | flush_dcache_mmap_lock(mapping); | ||
| 309 | vma_prio_tree_insert(vma, &mapping->i_mmap); | ||
| 310 | flush_dcache_mmap_unlock(mapping); | ||
| 311 | } | ||
| 312 | |||
| 313 | /* add the VMA to the master list */ | ||
| 314 | while (*p) { | ||
| 315 | parent = *p; | ||
| 316 | pvma = rb_entry(parent, struct vm_area_struct, vm_rb); | ||
| 317 | |||
| 318 | if (vma->vm_start < pvma->vm_start) { | ||
| 319 | p = &(*p)->rb_left; | ||
| 320 | } | ||
| 321 | else if (vma->vm_start > pvma->vm_start) { | ||
| 322 | p = &(*p)->rb_right; | ||
| 323 | } | ||
| 324 | else { | ||
| 325 | /* mappings are at the same address - this can only | ||
| 326 | * happen for shared-mem chardevs and shared file | ||
| 327 | * mappings backed by ramfs/tmpfs */ | ||
| 328 | BUG_ON(!(pvma->vm_flags & VM_SHARED)); | ||
| 329 | |||
| 330 | if (vma < pvma) | ||
| 331 | p = &(*p)->rb_left; | ||
| 332 | else if (vma > pvma) | ||
| 333 | p = &(*p)->rb_right; | ||
| 334 | else | ||
| 335 | BUG(); | ||
| 336 | } | ||
| 337 | } | ||
| 338 | |||
| 339 | rb_link_node(&vma->vm_rb, parent, p); | ||
| 340 | rb_insert_color(&vma->vm_rb, &nommu_vma_tree); | ||
| 341 | } | ||
| 342 | |||
| 343 | static void delete_nommu_vma(struct vm_area_struct *vma) | ||
| 344 | { | ||
| 345 | struct address_space *mapping; | ||
| 346 | |||
| 347 | /* remove the VMA from the mapping */ | ||
| 348 | if (vma->vm_file) { | ||
| 349 | mapping = vma->vm_file->f_mapping; | ||
| 350 | |||
| 351 | flush_dcache_mmap_lock(mapping); | ||
| 352 | vma_prio_tree_remove(vma, &mapping->i_mmap); | ||
| 353 | flush_dcache_mmap_unlock(mapping); | ||
| 354 | } | ||
| 355 | |||
| 356 | /* remove from the master list */ | ||
| 357 | rb_erase(&vma->vm_rb, &nommu_vma_tree); | ||
| 358 | } | ||
| 359 | |||
| 360 | /* | ||
| 361 | * determine whether a mapping should be permitted and, if so, what sort of | ||
| 362 | * mapping we're capable of supporting | ||
| 363 | */ | ||
| 364 | static int validate_mmap_request(struct file *file, | ||
| 365 | unsigned long addr, | ||
| 366 | unsigned long len, | ||
| 367 | unsigned long prot, | ||
| 368 | unsigned long flags, | ||
| 369 | unsigned long pgoff, | ||
| 370 | unsigned long *_capabilities) | ||
| 371 | { | ||
| 372 | unsigned long capabilities; | ||
| 373 | unsigned long reqprot = prot; | ||
| 374 | int ret; | ||
| 375 | |||
| 376 | /* do the simple checks first */ | ||
| 377 | if (flags & MAP_FIXED || addr) { | ||
| 378 | printk(KERN_DEBUG | ||
| 379 | "%d: Can't do fixed-address/overlay mmap of RAM\n", | ||
| 380 | current->pid); | ||
| 381 | return -EINVAL; | ||
| 382 | } | ||
| 383 | |||
| 384 | if ((flags & MAP_TYPE) != MAP_PRIVATE && | ||
| 385 | (flags & MAP_TYPE) != MAP_SHARED) | ||
| 386 | return -EINVAL; | ||
| 387 | |||
| 388 | if (PAGE_ALIGN(len) == 0) | ||
| 389 | return addr; | ||
| 390 | |||
| 391 | if (len > TASK_SIZE) | ||
| 392 | return -EINVAL; | ||
| 393 | |||
| 394 | /* offset overflow? */ | ||
| 395 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) | ||
| 396 | return -EINVAL; | ||
| 397 | |||
| 398 | if (file) { | ||
| 399 | /* validate file mapping requests */ | ||
| 400 | struct address_space *mapping; | ||
| 401 | |||
| 402 | /* files must support mmap */ | ||
| 403 | if (!file->f_op || !file->f_op->mmap) | ||
| 404 | return -ENODEV; | ||
| 405 | |||
| 406 | /* work out if what we've got could possibly be shared | ||
| 407 | * - we support chardevs that provide their own "memory" | ||
| 408 | * - we support files/blockdevs that are memory backed | ||
| 409 | */ | ||
| 410 | mapping = file->f_mapping; | ||
| 411 | if (!mapping) | ||
| 412 | mapping = file->f_dentry->d_inode->i_mapping; | ||
| 413 | |||
| 414 | capabilities = 0; | ||
| 415 | if (mapping && mapping->backing_dev_info) | ||
| 416 | capabilities = mapping->backing_dev_info->capabilities; | ||
| 417 | |||
| 418 | if (!capabilities) { | ||
| 419 | /* no explicit capabilities set, so assume some | ||
| 420 | * defaults */ | ||
| 421 | switch (file->f_dentry->d_inode->i_mode & S_IFMT) { | ||
| 422 | case S_IFREG: | ||
| 423 | case S_IFBLK: | ||
| 424 | capabilities = BDI_CAP_MAP_COPY; | ||
| 425 | break; | ||
| 426 | |||
| 427 | case S_IFCHR: | ||
| 428 | capabilities = | ||
| 429 | BDI_CAP_MAP_DIRECT | | ||
| 430 | BDI_CAP_READ_MAP | | ||
| 431 | BDI_CAP_WRITE_MAP; | ||
| 432 | break; | ||
| 433 | |||
| 434 | default: | ||
| 435 | return -EINVAL; | ||
| 436 | } | ||
| 437 | } | ||
| 438 | |||
| 439 | /* eliminate any capabilities that we can't support on this | ||
| 440 | * device */ | ||
| 441 | if (!file->f_op->get_unmapped_area) | ||
| 442 | capabilities &= ~BDI_CAP_MAP_DIRECT; | ||
| 443 | if (!file->f_op->read) | ||
| 444 | capabilities &= ~BDI_CAP_MAP_COPY; | ||
| 445 | |||
| 446 | if (flags & MAP_SHARED) { | ||
| 447 | /* do checks for writing, appending and locking */ | ||
| 448 | if ((prot & PROT_WRITE) && | ||
| 449 | !(file->f_mode & FMODE_WRITE)) | ||
| 450 | return -EACCES; | ||
| 451 | |||
| 452 | if (IS_APPEND(file->f_dentry->d_inode) && | ||
| 453 | (file->f_mode & FMODE_WRITE)) | ||
| 454 | return -EACCES; | ||
| 455 | |||
| 456 | if (locks_verify_locked(file->f_dentry->d_inode)) | ||
| 457 | return -EAGAIN; | ||
| 458 | |||
| 459 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | ||
| 460 | return -ENODEV; | ||
| 461 | |||
| 462 | if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || | ||
| 463 | ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || | ||
| 464 | ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) | ||
| 465 | ) { | ||
| 466 | printk("MAP_SHARED not completely supported on !MMU\n"); | ||
| 467 | return -EINVAL; | ||
| 468 | } | ||
| 469 | |||
| 470 | /* we mustn't privatise shared mappings */ | ||
| 471 | capabilities &= ~BDI_CAP_MAP_COPY; | ||
| 472 | } | ||
| 473 | else { | ||
| 474 | /* we're going to read the file into private memory we | ||
| 475 | * allocate */ | ||
| 476 | if (!(capabilities & BDI_CAP_MAP_COPY)) | ||
| 477 | return -ENODEV; | ||
| 478 | |||
| 479 | /* we don't permit a private writable mapping to be | ||
| 480 | * shared with the backing device */ | ||
| 481 | if (prot & PROT_WRITE) | ||
| 482 | capabilities &= ~BDI_CAP_MAP_DIRECT; | ||
| 483 | } | ||
| 484 | |||
| 485 | /* handle executable mappings and implied executable | ||
| 486 | * mappings */ | ||
| 487 | if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) { | ||
| 488 | if (prot & PROT_EXEC) | ||
| 489 | return -EPERM; | ||
| 490 | } | ||
| 491 | else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { | ||
| 492 | /* handle implication of PROT_EXEC by PROT_READ */ | ||
| 493 | if (current->personality & READ_IMPLIES_EXEC) { | ||
| 494 | if (capabilities & BDI_CAP_EXEC_MAP) | ||
| 495 | prot |= PROT_EXEC; | ||
| 496 | } | ||
| 497 | } | ||
| 498 | else if ((prot & PROT_READ) && | ||
| 499 | (prot & PROT_EXEC) && | ||
| 500 | !(capabilities & BDI_CAP_EXEC_MAP) | ||
| 501 | ) { | ||
| 502 | /* backing file is not executable, try to copy */ | ||
| 503 | capabilities &= ~BDI_CAP_MAP_DIRECT; | ||
| 504 | } | ||
| 505 | } | ||
| 506 | else { | ||
| 507 | /* anonymous mappings are always memory backed and can be | ||
| 508 | * privately mapped | ||
| 509 | */ | ||
| 510 | capabilities = BDI_CAP_MAP_COPY; | ||
| 511 | |||
| 512 | /* handle PROT_EXEC implication by PROT_READ */ | ||
| 513 | if ((prot & PROT_READ) && | ||
| 514 | (current->personality & READ_IMPLIES_EXEC)) | ||
| 515 | prot |= PROT_EXEC; | ||
| 516 | } | ||
| 517 | |||
| 518 | /* allow the security API to have its say */ | ||
| 519 | ret = security_file_mmap(file, reqprot, prot, flags); | ||
| 520 | if (ret < 0) | ||
| 521 | return ret; | ||
| 522 | |||
| 523 | /* looks okay */ | ||
| 524 | *_capabilities = capabilities; | ||
| 525 | return 0; | ||
| 526 | } | ||
| 527 | |||
| 528 | /* | ||
| 529 | * we've determined that we can make the mapping, now translate what we | ||
| 530 | * now know into VMA flags | ||
| 531 | */ | ||
| 532 | static unsigned long determine_vm_flags(struct file *file, | ||
| 533 | unsigned long prot, | ||
| 534 | unsigned long flags, | ||
| 535 | unsigned long capabilities) | ||
| 536 | { | ||
| 537 | unsigned long vm_flags; | ||
| 538 | |||
| 539 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); | ||
| 540 | vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; | ||
| 541 | /* vm_flags |= mm->def_flags; */ | ||
| 542 | |||
| 543 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) { | ||
| 544 | /* attempt to share read-only copies of mapped file chunks */ | ||
| 545 | if (file && !(prot & PROT_WRITE)) | ||
| 546 | vm_flags |= VM_MAYSHARE; | ||
| 547 | } | ||
| 548 | else { | ||
| 549 | /* overlay a shareable mapping on the backing device or inode | ||
| 550 | * if possible - used for chardevs, ramfs/tmpfs/shmfs and | ||
| 551 | * romfs/cramfs */ | ||
| 552 | if (flags & MAP_SHARED) | ||
| 553 | vm_flags |= VM_MAYSHARE | VM_SHARED; | ||
| 554 | else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0) | ||
| 555 | vm_flags |= VM_MAYSHARE; | ||
| 556 | } | ||
| 557 | |||
| 558 | /* refuse to let anyone share private mappings with this process if | ||
| 559 | * it's being traced - otherwise breakpoints set in it may interfere | ||
| 560 | * with another untraced process | ||
| 561 | */ | ||
| 562 | if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) | ||
| 563 | vm_flags &= ~VM_MAYSHARE; | ||
| 564 | |||
| 565 | return vm_flags; | ||
| 566 | } | ||
| 567 | |||
| 568 | /* | ||
| 569 | * set up a shared mapping on a file | ||
| 570 | */ | ||
| 571 | static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) | ||
| 572 | { | ||
| 573 | int ret; | ||
| 574 | |||
| 575 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | ||
| 576 | if (ret != -ENOSYS) | ||
| 577 | return ret; | ||
| 578 | |||
| 579 | /* getting an ENOSYS error indicates that direct mmap isn't | ||
| 580 | * possible (as opposed to tried but failed) so we'll fall | ||
| 581 | * through to making a private copy of the data and mapping | ||
| 582 | * that if we can */ | ||
| 583 | return -ENODEV; | ||
| 584 | } | ||
| 585 | |||
| 586 | /* | ||
| 587 | * set up a private mapping or an anonymous shared mapping | ||
| 588 | */ | ||
| 589 | static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) | ||
| 590 | { | ||
| 591 | void *base; | ||
| 592 | int ret; | ||
| 593 | |||
| 594 | /* invoke the file's mapping function so that it can keep track of | ||
| 595 | * shared mappings on devices or memory | ||
| 596 | * - VM_MAYSHARE will be set if it may attempt to share | ||
| 597 | */ | ||
| 598 | if (vma->vm_file) { | ||
| 599 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | ||
| 600 | if (ret != -ENOSYS) { | ||
| 601 | /* shouldn't return success if we're not sharing */ | ||
| 602 | BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); | ||
| 603 | return ret; /* success or a real error */ | ||
| 604 | } | ||
| 605 | |||
| 606 | /* getting an ENOSYS error indicates that direct mmap isn't | ||
| 607 | * possible (as opposed to tried but failed) so we'll try to | ||
| 608 | * make a private copy of the data and map that instead */ | ||
| 609 | } | ||
| 610 | |||
| 611 | /* allocate some memory to hold the mapping | ||
| 612 | * - note that this may not return a page-aligned address if the object | ||
| 613 | * we're allocating is smaller than a page | ||
| 614 | */ | ||
| 615 | base = kmalloc(len, GFP_KERNEL); | ||
| 616 | if (!base) | ||
| 617 | goto enomem; | ||
| 618 | |||
| 619 | vma->vm_start = (unsigned long) base; | ||
| 620 | vma->vm_end = vma->vm_start + len; | ||
| 621 | vma->vm_flags |= VM_MAPPED_COPY; | ||
| 622 | |||
| 623 | #ifdef WARN_ON_SLACK | ||
| 624 | if (len + WARN_ON_SLACK <= kobjsize(result)) | ||
| 625 | printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", | ||
| 626 | len, current->pid, kobjsize(result) - len); | ||
| 627 | #endif | ||
| 628 | |||
| 629 | if (vma->vm_file) { | ||
| 630 | /* read the contents of a file into the copy */ | ||
| 631 | mm_segment_t old_fs; | ||
| 632 | loff_t fpos; | ||
| 633 | |||
| 634 | fpos = vma->vm_pgoff; | ||
| 635 | fpos <<= PAGE_SHIFT; | ||
| 636 | |||
| 637 | old_fs = get_fs(); | ||
| 638 | set_fs(KERNEL_DS); | ||
| 639 | ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); | ||
| 640 | set_fs(old_fs); | ||
| 641 | |||
| 642 | if (ret < 0) | ||
| 643 | goto error_free; | ||
| 644 | |||
| 645 | /* clear the last little bit */ | ||
| 646 | if (ret < len) | ||
| 647 | memset(base + ret, 0, len - ret); | ||
| 648 | |||
| 649 | } else { | ||
| 650 | /* if it's an anonymous mapping, then just clear it */ | ||
| 651 | memset(base, 0, len); | ||
| 652 | } | ||
| 653 | |||
| 654 | return 0; | ||
| 655 | |||
| 656 | error_free: | ||
| 657 | kfree(base); | ||
| 658 | vma->vm_start = 0; | ||
| 659 | return ret; | ||
| 660 | |||
| 661 | enomem: | ||
| 662 | printk("Allocation of length %lu from process %d failed\n", | ||
| 663 | len, current->pid); | ||
| 664 | show_free_areas(); | ||
| 665 | return -ENOMEM; | ||
| 666 | } | ||
| 667 | |||
| 668 | /* | ||
| 669 | * handle mapping creation for uClinux | ||
| 670 | */ | ||
| 671 | unsigned long do_mmap_pgoff(struct file *file, | ||
| 672 | unsigned long addr, | ||
| 673 | unsigned long len, | ||
| 674 | unsigned long prot, | ||
| 675 | unsigned long flags, | ||
| 676 | unsigned long pgoff) | ||
| 677 | { | ||
| 678 | struct vm_list_struct *vml = NULL; | ||
| 679 | struct vm_area_struct *vma = NULL; | ||
| 680 | struct rb_node *rb; | ||
| 681 | unsigned long capabilities, vm_flags; | ||
| 682 | void *result; | ||
| 683 | int ret; | ||
| 684 | |||
| 685 | /* decide whether we should attempt the mapping, and if so what sort of | ||
| 686 | * mapping */ | ||
| 687 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, | ||
| 688 | &capabilities); | ||
| 689 | if (ret < 0) | ||
| 690 | return ret; | ||
| 691 | |||
| 692 | /* we've determined that we can make the mapping, now translate what we | ||
| 693 | * now know into VMA flags */ | ||
| 694 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); | ||
| 695 | |||
| 696 | /* we're going to need to record the mapping if it works */ | ||
| 697 | vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL); | ||
| 698 | if (!vml) | ||
| 699 | goto error_getting_vml; | ||
| 700 | memset(vml, 0, sizeof(*vml)); | ||
| 701 | |||
| 702 | down_write(&nommu_vma_sem); | ||
| 703 | |||
| 704 | /* if we want to share, we need to check for VMAs created by other | ||
| 705 | * mmap() calls that overlap with our proposed mapping | ||
| 706 | * - we can only share with an exact match on most regular files | ||
| 707 | * - shared mappings on character devices and memory backed files are | ||
| 708 | * permitted to overlap inexactly as far as we are concerned for in | ||
| 709 | * these cases, sharing is handled in the driver or filesystem rather | ||
| 710 | * than here | ||
| 711 | */ | ||
| 712 | if (vm_flags & VM_MAYSHARE) { | ||
| 713 | unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 714 | unsigned long vmpglen; | ||
| 715 | |||
| 716 | for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { | ||
| 717 | vma = rb_entry(rb, struct vm_area_struct, vm_rb); | ||
| 718 | |||
| 719 | if (!(vma->vm_flags & VM_MAYSHARE)) | ||
| 720 | continue; | ||
| 721 | |||
| 722 | /* search for overlapping mappings on the same file */ | ||
| 723 | if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode) | ||
| 724 | continue; | ||
| 725 | |||
| 726 | if (vma->vm_pgoff >= pgoff + pglen) | ||
| 727 | continue; | ||
| 728 | |||
| 729 | vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; | ||
| 730 | vmpglen >>= PAGE_SHIFT; | ||
| 731 | if (pgoff >= vma->vm_pgoff + vmpglen) | ||
| 732 | continue; | ||
| 733 | |||
| 734 | /* handle inexactly overlapping matches between mappings */ | ||
| 735 | if (vma->vm_pgoff != pgoff || vmpglen != pglen) { | ||
| 736 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | ||
| 737 | goto sharing_violation; | ||
| 738 | continue; | ||
| 739 | } | ||
| 740 | |||
| 741 | /* we've found a VMA we can share */ | ||
| 742 | atomic_inc(&vma->vm_usage); | ||
| 743 | |||
| 744 | vml->vma = vma; | ||
| 745 | result = (void *) vma->vm_start; | ||
| 746 | goto shared; | ||
| 747 | } | ||
| 748 | |||
| 749 | vma = NULL; | ||
| 750 | |||
| 751 | /* obtain the address at which to make a shared mapping | ||
| 752 | * - this is the hook for quasi-memory character devices to | ||
| 753 | * tell us the location of a shared mapping | ||
| 754 | */ | ||
| 755 | if (file && file->f_op->get_unmapped_area) { | ||
| 756 | addr = file->f_op->get_unmapped_area(file, addr, len, | ||
| 757 | pgoff, flags); | ||
| 758 | if (IS_ERR((void *) addr)) { | ||
| 759 | ret = addr; | ||
| 760 | if (ret != (unsigned long) -ENOSYS) | ||
| 761 | goto error; | ||
| 762 | |||
| 763 | /* the driver refused to tell us where to site | ||
| 764 | * the mapping so we'll have to attempt to copy | ||
| 765 | * it */ | ||
| 766 | ret = (unsigned long) -ENODEV; | ||
| 767 | if (!(capabilities & BDI_CAP_MAP_COPY)) | ||
| 768 | goto error; | ||
| 769 | |||
| 770 | capabilities &= ~BDI_CAP_MAP_DIRECT; | ||
| 771 | } | ||
| 772 | } | ||
| 773 | } | ||
| 774 | |||
| 775 | /* we're going to need a VMA struct as well */ | ||
| 776 | vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); | ||
| 777 | if (!vma) | ||
| 778 | goto error_getting_vma; | ||
| 779 | |||
| 780 | memset(vma, 0, sizeof(*vma)); | ||
| 781 | INIT_LIST_HEAD(&vma->anon_vma_node); | ||
| 782 | atomic_set(&vma->vm_usage, 1); | ||
| 783 | if (file) | ||
| 784 | get_file(file); | ||
| 785 | vma->vm_file = file; | ||
| 786 | vma->vm_flags = vm_flags; | ||
| 787 | vma->vm_start = addr; | ||
| 788 | vma->vm_end = addr + len; | ||
| 789 | vma->vm_pgoff = pgoff; | ||
| 790 | |||
| 791 | vml->vma = vma; | ||
| 792 | |||
| 793 | /* set up the mapping */ | ||
| 794 | if (file && vma->vm_flags & VM_SHARED) | ||
| 795 | ret = do_mmap_shared_file(vma, len); | ||
| 796 | else | ||
| 797 | ret = do_mmap_private(vma, len); | ||
| 798 | if (ret < 0) | ||
| 799 | goto error; | ||
| 800 | |||
| 801 | /* okay... we have a mapping; now we have to register it */ | ||
| 802 | result = (void *) vma->vm_start; | ||
| 803 | |||
| 804 | if (vma->vm_flags & VM_MAPPED_COPY) { | ||
| 805 | realalloc += kobjsize(result); | ||
| 806 | askedalloc += len; | ||
| 807 | } | ||
| 808 | |||
| 809 | realalloc += kobjsize(vma); | ||
| 810 | askedalloc += sizeof(*vma); | ||
| 811 | |||
| 812 | current->mm->total_vm += len >> PAGE_SHIFT; | ||
| 813 | |||
| 814 | add_nommu_vma(vma); | ||
| 815 | |||
| 816 | shared: | ||
| 817 | realalloc += kobjsize(vml); | ||
| 818 | askedalloc += sizeof(*vml); | ||
| 819 | |||
| 820 | vml->next = current->mm->context.vmlist; | ||
| 821 | current->mm->context.vmlist = vml; | ||
| 822 | |||
| 823 | up_write(&nommu_vma_sem); | ||
| 824 | |||
| 825 | if (prot & PROT_EXEC) | ||
| 826 | flush_icache_range((unsigned long) result, | ||
| 827 | (unsigned long) result + len); | ||
| 828 | |||
| 829 | #ifdef DEBUG | ||
| 830 | printk("do_mmap:\n"); | ||
| 831 | show_process_blocks(); | ||
| 832 | #endif | ||
| 833 | |||
| 834 | return (unsigned long) result; | ||
| 835 | |||
| 836 | error: | ||
| 837 | up_write(&nommu_vma_sem); | ||
| 838 | kfree(vml); | ||
| 839 | if (vma) { | ||
| 840 | fput(vma->vm_file); | ||
| 841 | kfree(vma); | ||
| 842 | } | ||
| 843 | return ret; | ||
| 844 | |||
| 845 | sharing_violation: | ||
| 846 | up_write(&nommu_vma_sem); | ||
| 847 | printk("Attempt to share mismatched mappings\n"); | ||
| 848 | kfree(vml); | ||
| 849 | return -EINVAL; | ||
| 850 | |||
| 851 | error_getting_vma: | ||
| 852 | up_write(&nommu_vma_sem); | ||
| 853 | kfree(vml); | ||
| 854 | printk("Allocation of vml for %lu byte allocation from process %d failed\n", | ||
| 855 | len, current->pid); | ||
| 856 | show_free_areas(); | ||
| 857 | return -ENOMEM; | ||
| 858 | |||
| 859 | error_getting_vml: | ||
| 860 | printk("Allocation of vml for %lu byte allocation from process %d failed\n", | ||
| 861 | len, current->pid); | ||
| 862 | show_free_areas(); | ||
| 863 | return -ENOMEM; | ||
| 864 | } | ||
| 865 | |||
| 866 | /* | ||
| 867 | * handle mapping disposal for uClinux | ||
| 868 | */ | ||
| 869 | static void put_vma(struct vm_area_struct *vma) | ||
| 870 | { | ||
| 871 | if (vma) { | ||
| 872 | down_write(&nommu_vma_sem); | ||
| 873 | |||
| 874 | if (atomic_dec_and_test(&vma->vm_usage)) { | ||
| 875 | delete_nommu_vma(vma); | ||
| 876 | |||
| 877 | if (vma->vm_ops && vma->vm_ops->close) | ||
| 878 | vma->vm_ops->close(vma); | ||
| 879 | |||
| 880 | /* IO memory and memory shared directly out of the pagecache from | ||
| 881 | * ramfs/tmpfs mustn't be released here */ | ||
| 882 | if (vma->vm_flags & VM_MAPPED_COPY) { | ||
| 883 | realalloc -= kobjsize((void *) vma->vm_start); | ||
| 884 | askedalloc -= vma->vm_end - vma->vm_start; | ||
| 885 | kfree((void *) vma->vm_start); | ||
| 886 | } | ||
| 887 | |||
| 888 | realalloc -= kobjsize(vma); | ||
| 889 | askedalloc -= sizeof(*vma); | ||
| 890 | |||
| 891 | if (vma->vm_file) | ||
| 892 | fput(vma->vm_file); | ||
| 893 | kfree(vma); | ||
| 894 | } | ||
| 895 | |||
| 896 | up_write(&nommu_vma_sem); | ||
| 897 | } | ||
| 898 | } | ||
| 899 | |||
| 900 | int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | ||
| 901 | { | ||
| 902 | struct vm_list_struct *vml, **parent; | ||
| 903 | unsigned long end = addr + len; | ||
| 904 | |||
| 905 | #ifdef DEBUG | ||
| 906 | printk("do_munmap:\n"); | ||
| 907 | #endif | ||
| 908 | |||
| 909 | for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) | ||
| 910 | if ((*parent)->vma->vm_start == addr && | ||
| 911 | (*parent)->vma->vm_end == end) | ||
| 912 | goto found; | ||
| 913 | |||
| 914 | printk("munmap of non-mmaped memory by process %d (%s): %p\n", | ||
| 915 | current->pid, current->comm, (void *) addr); | ||
| 916 | return -EINVAL; | ||
| 917 | |||
| 918 | found: | ||
| 919 | vml = *parent; | ||
| 920 | |||
| 921 | put_vma(vml->vma); | ||
| 922 | |||
| 923 | *parent = vml->next; | ||
| 924 | realalloc -= kobjsize(vml); | ||
| 925 | askedalloc -= sizeof(*vml); | ||
| 926 | kfree(vml); | ||
| 927 | mm->total_vm -= len >> PAGE_SHIFT; | ||
| 928 | |||
| 929 | #ifdef DEBUG | ||
| 930 | show_process_blocks(); | ||
| 931 | #endif | ||
| 932 | |||
| 933 | return 0; | ||
| 934 | } | ||
| 935 | |||
| 936 | /* Release all mmaps. */ | ||
| 937 | void exit_mmap(struct mm_struct * mm) | ||
| 938 | { | ||
| 939 | struct vm_list_struct *tmp; | ||
| 940 | |||
| 941 | if (mm) { | ||
| 942 | #ifdef DEBUG | ||
| 943 | printk("Exit_mmap:\n"); | ||
| 944 | #endif | ||
| 945 | |||
| 946 | mm->total_vm = 0; | ||
| 947 | |||
| 948 | while ((tmp = mm->context.vmlist)) { | ||
| 949 | mm->context.vmlist = tmp->next; | ||
| 950 | put_vma(tmp->vma); | ||
| 951 | |||
| 952 | realalloc -= kobjsize(tmp); | ||
| 953 | askedalloc -= sizeof(*tmp); | ||
| 954 | kfree(tmp); | ||
| 955 | } | ||
| 956 | |||
| 957 | #ifdef DEBUG | ||
| 958 | show_process_blocks(); | ||
| 959 | #endif | ||
| 960 | } | ||
| 961 | } | ||
| 962 | |||
| 963 | asmlinkage long sys_munmap(unsigned long addr, size_t len) | ||
| 964 | { | ||
| 965 | int ret; | ||
| 966 | struct mm_struct *mm = current->mm; | ||
| 967 | |||
| 968 | down_write(&mm->mmap_sem); | ||
| 969 | ret = do_munmap(mm, addr, len); | ||
| 970 | up_write(&mm->mmap_sem); | ||
| 971 | return ret; | ||
| 972 | } | ||
| 973 | |||
/*
 * Stub: always fails with -ENOMEM (as an unsigned long); both arguments
 * are ignored.
 */
unsigned long do_brk(unsigned long addr, unsigned long len)
{
	return (unsigned long) -ENOMEM;
}
| 978 | |||
| 979 | /* | ||
| 980 | * Expand (or shrink) an existing mapping, potentially moving it at the | ||
| 981 | * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) | ||
| 982 | * | ||
| 983 | * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise | ||
| 984 | * This option implies MREMAP_MAYMOVE. | ||
| 985 | * | ||
| 986 | * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the | ||
| 987 | * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable | ||
| 988 | */ | ||
| 989 | unsigned long do_mremap(unsigned long addr, | ||
| 990 | unsigned long old_len, unsigned long new_len, | ||
| 991 | unsigned long flags, unsigned long new_addr) | ||
| 992 | { | ||
| 993 | struct vm_list_struct *vml = NULL; | ||
| 994 | |||
| 995 | /* insanity checks first */ | ||
| 996 | if (new_len == 0) | ||
| 997 | return (unsigned long) -EINVAL; | ||
| 998 | |||
| 999 | if (flags & MREMAP_FIXED && new_addr != addr) | ||
| 1000 | return (unsigned long) -EINVAL; | ||
| 1001 | |||
| 1002 | for (vml = current->mm->context.vmlist; vml; vml = vml->next) | ||
| 1003 | if (vml->vma->vm_start == addr) | ||
| 1004 | goto found; | ||
| 1005 | |||
| 1006 | return (unsigned long) -EINVAL; | ||
| 1007 | |||
| 1008 | found: | ||
| 1009 | if (vml->vma->vm_end != vml->vma->vm_start + old_len) | ||
| 1010 | return (unsigned long) -EFAULT; | ||
| 1011 | |||
| 1012 | if (vml->vma->vm_flags & VM_MAYSHARE) | ||
| 1013 | return (unsigned long) -EPERM; | ||
| 1014 | |||
| 1015 | if (new_len > kobjsize((void *) addr)) | ||
| 1016 | return (unsigned long) -ENOMEM; | ||
| 1017 | |||
| 1018 | /* all checks complete - do it */ | ||
| 1019 | vml->vma->vm_end = vml->vma->vm_start + new_len; | ||
| 1020 | |||
| 1021 | askedalloc -= old_len; | ||
| 1022 | askedalloc += new_len; | ||
| 1023 | |||
| 1024 | return vml->vma->vm_start; | ||
| 1025 | } | ||
| 1026 | |||
| 1027 | /* | ||
| 1028 | * Look up the first VMA which satisfies addr < vm_end, NULL if none | ||
| 1029 | */ | ||
| 1030 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | ||
| 1031 | { | ||
| 1032 | struct vm_list_struct *vml; | ||
| 1033 | |||
| 1034 | for (vml = mm->context.vmlist; vml; vml = vml->next) | ||
| 1035 | if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end) | ||
| 1036 | return vml->vma; | ||
| 1037 | |||
| 1038 | return NULL; | ||
| 1039 | } | ||
| 1040 | |||
| 1041 | EXPORT_SYMBOL(find_vma); | ||
| 1042 | |||
/* Stub: never resolves a page; always returns NULL, arguments ignored. */
struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
{
	return NULL;
}
| 1047 | |||
/* Stub: never finds or extends anything; always returns NULL. */
struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
	return NULL;
}
| 1052 | |||
| 1053 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, | ||
| 1054 | unsigned long to, unsigned long size, pgprot_t prot) | ||
| 1055 | { | ||
| 1056 | return -EPERM; | ||
| 1057 | } | ||
| 1058 | |||
/* Stub: intentionally does nothing; both arguments are ignored. */
void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}
| 1062 | |||
/*
 * Stub: never proposes an address; always returns -ENOMEM (as an unsigned
 * long) regardless of the arguments.
 */
unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
	unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return (unsigned long) -ENOMEM;
}
| 1068 | |||
/* Stub: intentionally does nothing; @area is ignored. */
void arch_unmap_area(struct vm_area_struct *area)
{
}
| 1072 | |||
| 1073 | void update_mem_hiwater(struct task_struct *tsk) | ||
| 1074 | { | ||
| 1075 | unsigned long rss = get_mm_counter(tsk->mm, rss); | ||
| 1076 | |||
| 1077 | if (likely(tsk->mm)) { | ||
| 1078 | if (tsk->mm->hiwater_rss < rss) | ||
| 1079 | tsk->mm->hiwater_rss = rss; | ||
| 1080 | if (tsk->mm->hiwater_vm < tsk->mm->total_vm) | ||
| 1081 | tsk->mm->hiwater_vm = tsk->mm->total_vm; | ||
| 1082 | } | ||
| 1083 | } | ||
| 1084 | |||
| 1085 | void unmap_mapping_range(struct address_space *mapping, | ||
| 1086 | loff_t const holebegin, loff_t const holelen, | ||
| 1087 | int even_cows) | ||
| 1088 | { | ||
| 1089 | } | ||
| 1090 | |||
| 1091 | /* | ||
| 1092 | * Check that a process has enough memory to allocate a new virtual | ||
| 1093 | * mapping. 0 means there is enough memory for the allocation to | ||
| 1094 | * succeed and -ENOMEM implies there is not. | ||
| 1095 | * | ||
| 1096 | * We currently support three overcommit policies, which are set via the | ||
| 1097 | * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting | ||
| 1098 | * | ||
| 1099 | * Strict overcommit modes added 2002 Feb 26 by Alan Cox. | ||
| 1100 | * Additional code 2002 Jul 20 by Robert Love. | ||
| 1101 | * | ||
| 1102 | * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. | ||
| 1103 | * | ||
| 1104 | * Note this is a helper function intended to be used by LSMs which | ||
| 1105 | * wish to use this logic. | ||
| 1106 | */ | ||
| 1107 | int __vm_enough_memory(long pages, int cap_sys_admin) | ||
| 1108 | { | ||
| 1109 | unsigned long free, allowed; | ||
| 1110 | |||
| 1111 | vm_acct_memory(pages); | ||
| 1112 | |||
| 1113 | /* | ||
| 1114 | * Sometimes we want to use more memory than we have | ||
| 1115 | */ | ||
| 1116 | if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) | ||
| 1117 | return 0; | ||
| 1118 | |||
| 1119 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | ||
| 1120 | unsigned long n; | ||
| 1121 | |||
| 1122 | free = get_page_cache_size(); | ||
| 1123 | free += nr_swap_pages; | ||
| 1124 | |||
| 1125 | /* | ||
| 1126 | * Any slabs which are created with the | ||
| 1127 | * SLAB_RECLAIM_ACCOUNT flag claim to have contents | ||
| 1128 | * which are reclaimable, under pressure. The dentry | ||
| 1129 | * cache and most inode caches should fall into this | ||
| 1130 | */ | ||
| 1131 | free += atomic_read(&slab_reclaim_pages); | ||
| 1132 | |||
| 1133 | /* | ||
| 1134 | * Leave the last 3% for root | ||
| 1135 | */ | ||
| 1136 | if (!cap_sys_admin) | ||
| 1137 | free -= free / 32; | ||
| 1138 | |||
| 1139 | if (free > pages) | ||
| 1140 | return 0; | ||
| 1141 | |||
| 1142 | /* | ||
| 1143 | * nr_free_pages() is very expensive on large systems, | ||
| 1144 | * only call if we're about to fail. | ||
| 1145 | */ | ||
| 1146 | n = nr_free_pages(); | ||
| 1147 | if (!cap_sys_admin) | ||
| 1148 | n -= n / 32; | ||
| 1149 | free += n; | ||
| 1150 | |||
| 1151 | if (free > pages) | ||
| 1152 | return 0; | ||
| 1153 | vm_unacct_memory(pages); | ||
| 1154 | return -ENOMEM; | ||
| 1155 | } | ||
| 1156 | |||
| 1157 | allowed = totalram_pages * sysctl_overcommit_ratio / 100; | ||
| 1158 | /* | ||
| 1159 | * Leave the last 3% for root | ||
| 1160 | */ | ||
| 1161 | if (!cap_sys_admin) | ||
| 1162 | allowed -= allowed / 32; | ||
| 1163 | allowed += total_swap_pages; | ||
| 1164 | |||
| 1165 | /* Don't let a single process grow too big: | ||
| 1166 | leave 3% of the size of this process for other processes */ | ||
| 1167 | allowed -= current->mm->total_vm / 32; | ||
| 1168 | |||
| 1169 | if (atomic_read(&vm_committed_space) < allowed) | ||
| 1170 | return 0; | ||
| 1171 | |||
| 1172 | vm_unacct_memory(pages); | ||
| 1173 | |||
| 1174 | return -ENOMEM; | ||
| 1175 | } | ||
| 1176 | |||
/* Stub: reports that no address lies in a gate area; always returns 0. */
int in_gate_area_no_task(unsigned long addr)
{
	return 0;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c new file mode 100644 index 000000000000..9595a0f6c4b8 --- /dev/null +++ b/mm/oom_kill.c | |||
| @@ -0,0 +1,292 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/oom_kill.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1998,2000 Rik van Riel | ||
| 5 | * Thanks go out to Claus Fischer for some serious inspiration and | ||
| 6 | * for goading me into coding this file... | ||
| 7 | * | ||
| 8 | * The routines in this file are used to kill a process when | ||
| 9 | * we're seriously out of memory. This gets called from kswapd() | ||
| 10 | * in linux/mm/vmscan.c when we really run out of memory. | ||
| 11 | * | ||
| 12 | * Since we won't call these routines often (on a well-configured | ||
| 13 | * machine) this file will double as a 'coding guide' and a signpost | ||
| 14 | * for newbie kernel hackers. It features several pointers to major | ||
| 15 | * kernel subsystems and hints as to where to find out what things do. | ||
| 16 | */ | ||
| 17 | |||
| 18 | #include <linux/mm.h> | ||
| 19 | #include <linux/sched.h> | ||
| 20 | #include <linux/swap.h> | ||
| 21 | #include <linux/timex.h> | ||
| 22 | #include <linux/jiffies.h> | ||
| 23 | |||
| 24 | /* #define DEBUG */ | ||
| 25 | |||
| 26 | /** | ||
| 27 | * oom_badness - calculate a numeric value for how bad this task has been | ||
| 28 | * @p: task struct of which task we should calculate | ||
| 29 | * @p: current uptime in seconds | ||
| 30 | * | ||
| 31 | * The formula used is relatively simple and documented inline in the | ||
| 32 | * function. The main rationale is that we want to select a good task | ||
| 33 | * to kill when we run out of memory. | ||
| 34 | * | ||
| 35 | * Good in this context means that: | ||
| 36 | * 1) we lose the minimum amount of work done | ||
| 37 | * 2) we recover a large amount of memory | ||
| 38 | * 3) we don't kill anything innocent of eating tons of memory | ||
| 39 | * 4) we want to kill the minimum amount of processes (one) | ||
| 40 | * 5) we try to kill the process the user expects us to kill, this | ||
| 41 | * algorithm has been meticulously tuned to meet the principle | ||
| 42 | * of least surprise ... (be careful when you change it) | ||
| 43 | */ | ||
| 44 | |||
| 45 | unsigned long badness(struct task_struct *p, unsigned long uptime) | ||
| 46 | { | ||
| 47 | unsigned long points, cpu_time, run_time, s; | ||
| 48 | struct list_head *tsk; | ||
| 49 | |||
| 50 | if (!p->mm) | ||
| 51 | return 0; | ||
| 52 | |||
| 53 | /* | ||
| 54 | * The memory size of the process is the basis for the badness. | ||
| 55 | */ | ||
| 56 | points = p->mm->total_vm; | ||
| 57 | |||
| 58 | /* | ||
| 59 | * Processes which fork a lot of child processes are likely | ||
| 60 | * a good choice. We add the vmsize of the childs if they | ||
| 61 | * have an own mm. This prevents forking servers to flood the | ||
| 62 | * machine with an endless amount of childs | ||
| 63 | */ | ||
| 64 | list_for_each(tsk, &p->children) { | ||
| 65 | struct task_struct *chld; | ||
| 66 | chld = list_entry(tsk, struct task_struct, sibling); | ||
| 67 | if (chld->mm != p->mm && chld->mm) | ||
| 68 | points += chld->mm->total_vm; | ||
| 69 | } | ||
| 70 | |||
| 71 | /* | ||
| 72 | * CPU time is in tens of seconds and run time is in thousands | ||
| 73 | * of seconds. There is no particular reason for this other than | ||
| 74 | * that it turned out to work very well in practice. | ||
| 75 | */ | ||
| 76 | cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) | ||
| 77 | >> (SHIFT_HZ + 3); | ||
| 78 | |||
| 79 | if (uptime >= p->start_time.tv_sec) | ||
| 80 | run_time = (uptime - p->start_time.tv_sec) >> 10; | ||
| 81 | else | ||
| 82 | run_time = 0; | ||
| 83 | |||
| 84 | s = int_sqrt(cpu_time); | ||
| 85 | if (s) | ||
| 86 | points /= s; | ||
| 87 | s = int_sqrt(int_sqrt(run_time)); | ||
| 88 | if (s) | ||
| 89 | points /= s; | ||
| 90 | |||
| 91 | /* | ||
| 92 | * Niced processes are most likely less important, so double | ||
| 93 | * their badness points. | ||
| 94 | */ | ||
| 95 | if (task_nice(p) > 0) | ||
| 96 | points *= 2; | ||
| 97 | |||
| 98 | /* | ||
| 99 | * Superuser processes are usually more important, so we make it | ||
| 100 | * less likely that we kill those. | ||
| 101 | */ | ||
| 102 | if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) || | ||
| 103 | p->uid == 0 || p->euid == 0) | ||
| 104 | points /= 4; | ||
| 105 | |||
| 106 | /* | ||
| 107 | * We don't want to kill a process with direct hardware access. | ||
| 108 | * Not only could that mess up the hardware, but usually users | ||
| 109 | * tend to only have this flag set on applications they think | ||
| 110 | * of as important. | ||
| 111 | */ | ||
| 112 | if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) | ||
| 113 | points /= 4; | ||
| 114 | |||
| 115 | /* | ||
| 116 | * Adjust the score by oomkilladj. | ||
| 117 | */ | ||
| 118 | if (p->oomkilladj) { | ||
| 119 | if (p->oomkilladj > 0) | ||
| 120 | points <<= p->oomkilladj; | ||
| 121 | else | ||
| 122 | points >>= -(p->oomkilladj); | ||
| 123 | } | ||
| 124 | |||
| 125 | #ifdef DEBUG | ||
| 126 | printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n", | ||
| 127 | p->pid, p->comm, points); | ||
| 128 | #endif | ||
| 129 | return points; | ||
| 130 | } | ||
| 131 | |||
| 132 | /* | ||
| 133 | * Simple selection loop. We chose the process with the highest | ||
| 134 | * number of 'points'. We expect the caller will lock the tasklist. | ||
| 135 | * | ||
| 136 | * (not docbooked, we don't want this one cluttering up the manual) | ||
| 137 | */ | ||
| 138 | static struct task_struct * select_bad_process(void) | ||
| 139 | { | ||
| 140 | unsigned long maxpoints = 0; | ||
| 141 | struct task_struct *g, *p; | ||
| 142 | struct task_struct *chosen = NULL; | ||
| 143 | struct timespec uptime; | ||
| 144 | |||
| 145 | do_posix_clock_monotonic_gettime(&uptime); | ||
| 146 | do_each_thread(g, p) | ||
| 147 | /* skip the init task with pid == 1 */ | ||
| 148 | if (p->pid > 1) { | ||
| 149 | unsigned long points; | ||
| 150 | |||
| 151 | /* | ||
| 152 | * This is in the process of releasing memory so wait it | ||
| 153 | * to finish before killing some other task by mistake. | ||
| 154 | */ | ||
| 155 | if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) && | ||
| 156 | !(p->flags & PF_DEAD)) | ||
| 157 | return ERR_PTR(-1UL); | ||
| 158 | if (p->flags & PF_SWAPOFF) | ||
| 159 | return p; | ||
| 160 | |||
| 161 | points = badness(p, uptime.tv_sec); | ||
| 162 | if (points > maxpoints || !chosen) { | ||
| 163 | chosen = p; | ||
| 164 | maxpoints = points; | ||
| 165 | } | ||
| 166 | } | ||
| 167 | while_each_thread(g, p); | ||
| 168 | return chosen; | ||
| 169 | } | ||
| 170 | |||
| 171 | /** | ||
| 172 | * We must be careful though to never send SIGKILL a process with | ||
| 173 | * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that | ||
| 174 | * we select a process with CAP_SYS_RAW_IO set). | ||
| 175 | */ | ||
| 176 | static void __oom_kill_task(task_t *p) | ||
| 177 | { | ||
| 178 | if (p->pid == 1) { | ||
| 179 | WARN_ON(1); | ||
| 180 | printk(KERN_WARNING "tried to kill init!\n"); | ||
| 181 | return; | ||
| 182 | } | ||
| 183 | |||
| 184 | task_lock(p); | ||
| 185 | if (!p->mm || p->mm == &init_mm) { | ||
| 186 | WARN_ON(1); | ||
| 187 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | ||
| 188 | task_unlock(p); | ||
| 189 | return; | ||
| 190 | } | ||
| 191 | task_unlock(p); | ||
| 192 | printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm); | ||
| 193 | |||
| 194 | /* | ||
| 195 | * We give our sacrificial lamb high priority and access to | ||
| 196 | * all the memory it needs. That way it should be able to | ||
| 197 | * exit() and clear out its resources quickly... | ||
| 198 | */ | ||
| 199 | p->time_slice = HZ; | ||
| 200 | set_tsk_thread_flag(p, TIF_MEMDIE); | ||
| 201 | |||
| 202 | force_sig(SIGKILL, p); | ||
| 203 | } | ||
| 204 | |||
| 205 | static struct mm_struct *oom_kill_task(task_t *p) | ||
| 206 | { | ||
| 207 | struct mm_struct *mm = get_task_mm(p); | ||
| 208 | task_t * g, * q; | ||
| 209 | |||
| 210 | if (!mm) | ||
| 211 | return NULL; | ||
| 212 | if (mm == &init_mm) { | ||
| 213 | mmput(mm); | ||
| 214 | return NULL; | ||
| 215 | } | ||
| 216 | |||
| 217 | __oom_kill_task(p); | ||
| 218 | /* | ||
| 219 | * kill all processes that share the ->mm (i.e. all threads), | ||
| 220 | * but are in a different thread group | ||
| 221 | */ | ||
| 222 | do_each_thread(g, q) | ||
| 223 | if (q->mm == mm && q->tgid != p->tgid) | ||
| 224 | __oom_kill_task(q); | ||
| 225 | while_each_thread(g, q); | ||
| 226 | |||
| 227 | return mm; | ||
| 228 | } | ||
| 229 | |||
| 230 | static struct mm_struct *oom_kill_process(struct task_struct *p) | ||
| 231 | { | ||
| 232 | struct mm_struct *mm; | ||
| 233 | struct task_struct *c; | ||
| 234 | struct list_head *tsk; | ||
| 235 | |||
| 236 | /* Try to kill a child first */ | ||
| 237 | list_for_each(tsk, &p->children) { | ||
| 238 | c = list_entry(tsk, struct task_struct, sibling); | ||
| 239 | if (c->mm == p->mm) | ||
| 240 | continue; | ||
| 241 | mm = oom_kill_task(c); | ||
| 242 | if (mm) | ||
| 243 | return mm; | ||
| 244 | } | ||
| 245 | return oom_kill_task(p); | ||
| 246 | } | ||
| 247 | |||
| 248 | /** | ||
| 249 | * oom_kill - kill the "best" process when we run out of memory | ||
| 250 | * | ||
| 251 | * If we run out of memory, we have the choice between either | ||
| 252 | * killing a random task (bad), letting the system crash (worse) | ||
| 253 | * OR try to be smart about which process to kill. Note that we | ||
| 254 | * don't have to be perfect here, we just have to be good. | ||
| 255 | */ | ||
| 256 | void out_of_memory(unsigned int __nocast gfp_mask) | ||
| 257 | { | ||
| 258 | struct mm_struct *mm = NULL; | ||
| 259 | task_t * p; | ||
| 260 | |||
| 261 | read_lock(&tasklist_lock); | ||
| 262 | retry: | ||
| 263 | p = select_bad_process(); | ||
| 264 | |||
| 265 | if (PTR_ERR(p) == -1UL) | ||
| 266 | goto out; | ||
| 267 | |||
| 268 | /* Found nothing?!?! Either we hang forever, or we panic. */ | ||
| 269 | if (!p) { | ||
| 270 | read_unlock(&tasklist_lock); | ||
| 271 | show_free_areas(); | ||
| 272 | panic("Out of memory and no killable processes...\n"); | ||
| 273 | } | ||
| 274 | |||
| 275 | printk("oom-killer: gfp_mask=0x%x\n", gfp_mask); | ||
| 276 | show_free_areas(); | ||
| 277 | mm = oom_kill_process(p); | ||
| 278 | if (!mm) | ||
| 279 | goto retry; | ||
| 280 | |||
| 281 | out: | ||
| 282 | read_unlock(&tasklist_lock); | ||
| 283 | if (mm) | ||
| 284 | mmput(mm); | ||
| 285 | |||
| 286 | /* | ||
| 287 | * Give "p" a good chance of killing itself before we | ||
| 288 | * retry to allocate memory. | ||
| 289 | */ | ||
| 290 | __set_current_state(TASK_INTERRUPTIBLE); | ||
| 291 | schedule_timeout(1); | ||
| 292 | } | ||
diff --git a/mm/page-writeback.c b/mm/page-writeback.c new file mode 100644 index 000000000000..6ddd6a29c73b --- /dev/null +++ b/mm/page-writeback.c | |||
| @@ -0,0 +1,819 @@ | |||
| 1 | /* | ||
| 2 | * mm/page-writeback.c. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2002, Linus Torvalds. | ||
| 5 | * | ||
| 6 | * Contains functions related to writing back dirty pages at the | ||
| 7 | * address_space level. | ||
| 8 | * | ||
| 9 | * 10Apr2002 akpm@zip.com.au | ||
| 10 | * Initial version | ||
| 11 | */ | ||
| 12 | |||
| 13 | #include <linux/kernel.h> | ||
| 14 | #include <linux/module.h> | ||
| 15 | #include <linux/spinlock.h> | ||
| 16 | #include <linux/fs.h> | ||
| 17 | #include <linux/mm.h> | ||
| 18 | #include <linux/swap.h> | ||
| 19 | #include <linux/slab.h> | ||
| 20 | #include <linux/pagemap.h> | ||
| 21 | #include <linux/writeback.h> | ||
| 22 | #include <linux/init.h> | ||
| 23 | #include <linux/backing-dev.h> | ||
| 24 | #include <linux/blkdev.h> | ||
| 25 | #include <linux/mpage.h> | ||
| 26 | #include <linux/percpu.h> | ||
| 27 | #include <linux/notifier.h> | ||
| 28 | #include <linux/smp.h> | ||
| 29 | #include <linux/sysctl.h> | ||
| 30 | #include <linux/cpu.h> | ||
| 31 | #include <linux/syscalls.h> | ||
| 32 | |||
/*
 * The maximum number of pages to write out in a single bdflush/kupdate
 * operation.  We do this so we don't hold I_LOCK against an inode for
 * enormous amounts of time, which would block a userspace task which has
 * been forced to throttle against that inode.  Also, the code re-evaluates
 * the dirty state each time it has written this many pages.
 */
#define MAX_WRITEBACK_PAGES	1024

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;

static long total_pages;	/* The total number of pages in the machine. */
static int dirty_exceeded;	/* Dirty mem may be over limit */
| 50 | |||
| 51 | /* | ||
| 52 | * When balance_dirty_pages decides that the caller needs to perform some | ||
| 53 | * non-background writeback, this is how many pages it will attempt to write. | ||
| 54 | * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably | ||
| 55 | * large amounts of I/O are submitted. | ||
| 56 | */ | ||
| 57 | static inline long sync_writeback_pages(void) | ||
| 58 | { | ||
| 59 | return ratelimit_pages + ratelimit_pages / 2; | ||
| 60 | } | ||
| 61 | |||
/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via pdflush) at this percentage
 */
int dirty_background_ratio = 10;

/*
 * The generator of dirty data starts writeback at this percentage
 */
int vm_dirty_ratio = 40;

/*
 * The interval between `kupdate'-style writebacks, in centiseconds
 * (hundredths of a second); the default is 5 seconds
 */
int dirty_writeback_centisecs = 5 * 100;

/*
 * The longest number of centiseconds for which data is allowed to remain
 * dirty; the default is 30 seconds
 */
int dirty_expire_centisecs = 30 * 100;

/*
 * Flag that makes the machine dump writes/reads and block dirtyings.
 */
int block_dump;

/*
 * Flag that puts the machine in "laptop mode".
 */
int laptop_mode;

EXPORT_SYMBOL(laptop_mode);

/* End of sysctl-exported parameters */


/* forward declaration; the definition is not in this part of the file */
static void background_writeout(unsigned long _min_pages);
| 101 | |||
/*
 * Snapshot of the page-state counters that the writeback code works from;
 * filled in by get_writeback_state().
 */
struct writeback_state {
	unsigned long nr_dirty;		/* read_page_state(nr_dirty) */
	unsigned long nr_unstable;	/* read_page_state(nr_unstable) */
	unsigned long nr_mapped;	/* read_page_state(nr_mapped) */
	unsigned long nr_writeback;	/* read_page_state(nr_writeback) */
};
| 109 | |||
| 110 | static void get_writeback_state(struct writeback_state *wbs) | ||
| 111 | { | ||
| 112 | wbs->nr_dirty = read_page_state(nr_dirty); | ||
| 113 | wbs->nr_unstable = read_page_state(nr_unstable); | ||
| 114 | wbs->nr_mapped = read_page_state(nr_mapped); | ||
| 115 | wbs->nr_writeback = read_page_state(nr_writeback); | ||
| 116 | } | ||
| 117 | |||
| 118 | /* | ||
| 119 | * Work out the current dirty-memory clamping and background writeout | ||
| 120 | * thresholds. | ||
| 121 | * | ||
| 122 | * The main aim here is to lower them aggressively if there is a lot of mapped | ||
| 123 | * memory around. To avoid stressing page reclaim with lots of unreclaimable | ||
| 124 | * pages. It is better to clamp down on writers than to start swapping, and | ||
| 125 | * performing lots of scanning. | ||
| 126 | * | ||
| 127 | * We only allow 1/2 of the currently-unmapped memory to be dirtied. | ||
| 128 | * | ||
| 129 | * We don't permit the clamping level to fall below 5% - that is getting rather | ||
| 130 | * excessive. | ||
| 131 | * | ||
| 132 | * We make sure that the background writeout level is below the adjusted | ||
| 133 | * clamping level. | ||
| 134 | */ | ||
| 135 | static void | ||
| 136 | get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, | ||
| 137 | struct address_space *mapping) | ||
| 138 | { | ||
| 139 | int background_ratio; /* Percentages */ | ||
| 140 | int dirty_ratio; | ||
| 141 | int unmapped_ratio; | ||
| 142 | long background; | ||
| 143 | long dirty; | ||
| 144 | unsigned long available_memory = total_pages; | ||
| 145 | struct task_struct *tsk; | ||
| 146 | |||
| 147 | get_writeback_state(wbs); | ||
| 148 | |||
| 149 | #ifdef CONFIG_HIGHMEM | ||
| 150 | /* | ||
| 151 | * If this mapping can only allocate from low memory, | ||
| 152 | * we exclude high memory from our count. | ||
| 153 | */ | ||
| 154 | if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM)) | ||
| 155 | available_memory -= totalhigh_pages; | ||
| 156 | #endif | ||
| 157 | |||
| 158 | |||
| 159 | unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages; | ||
| 160 | |||
| 161 | dirty_ratio = vm_dirty_ratio; | ||
| 162 | if (dirty_ratio > unmapped_ratio / 2) | ||
| 163 | dirty_ratio = unmapped_ratio / 2; | ||
| 164 | |||
| 165 | if (dirty_ratio < 5) | ||
| 166 | dirty_ratio = 5; | ||
| 167 | |||
| 168 | background_ratio = dirty_background_ratio; | ||
| 169 | if (background_ratio >= dirty_ratio) | ||
| 170 | background_ratio = dirty_ratio / 2; | ||
| 171 | |||
| 172 | background = (background_ratio * available_memory) / 100; | ||
| 173 | dirty = (dirty_ratio * available_memory) / 100; | ||
| 174 | tsk = current; | ||
| 175 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { | ||
| 176 | background += background / 4; | ||
| 177 | dirty += dirty / 4; | ||
| 178 | } | ||
| 179 | *pbackground = background; | ||
| 180 | *pdirty = dirty; | ||
| 181 | } | ||
| 182 | |||
| 183 | /* | ||
| 184 | * balance_dirty_pages() must be called by processes which are generating dirty | ||
| 185 | * data. It looks at the number of dirty pages in the machine and will force | ||
| 186 | * the caller to perform writeback if the system is over `vm_dirty_ratio'. | ||
| 187 | * If we're over `background_thresh' then pdflush is woken to perform some | ||
| 188 | * writeout. | ||
| 189 | */ | ||
| 190 | static void balance_dirty_pages(struct address_space *mapping) | ||
| 191 | { | ||
| 192 | struct writeback_state wbs; | ||
| 193 | long nr_reclaimable; | ||
| 194 | long background_thresh; | ||
| 195 | long dirty_thresh; | ||
| 196 | unsigned long pages_written = 0; | ||
| 197 | unsigned long write_chunk = sync_writeback_pages(); | ||
| 198 | |||
| 199 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
| 200 | |||
| 201 | for (;;) { | ||
| 202 | struct writeback_control wbc = { | ||
| 203 | .bdi = bdi, | ||
| 204 | .sync_mode = WB_SYNC_NONE, | ||
| 205 | .older_than_this = NULL, | ||
| 206 | .nr_to_write = write_chunk, | ||
| 207 | }; | ||
| 208 | |||
| 209 | get_dirty_limits(&wbs, &background_thresh, | ||
| 210 | &dirty_thresh, mapping); | ||
| 211 | nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; | ||
| 212 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) | ||
| 213 | break; | ||
| 214 | |||
| 215 | dirty_exceeded = 1; | ||
| 216 | |||
| 217 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | ||
| 218 | * Unstable writes are a feature of certain networked | ||
| 219 | * filesystems (i.e. NFS) in which data may have been | ||
| 220 | * written to the server's write cache, but has not yet | ||
| 221 | * been flushed to permanent storage. | ||
| 222 | */ | ||
| 223 | if (nr_reclaimable) { | ||
| 224 | writeback_inodes(&wbc); | ||
| 225 | get_dirty_limits(&wbs, &background_thresh, | ||
| 226 | &dirty_thresh, mapping); | ||
| 227 | nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; | ||
| 228 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) | ||
| 229 | break; | ||
| 230 | pages_written += write_chunk - wbc.nr_to_write; | ||
| 231 | if (pages_written >= write_chunk) | ||
| 232 | break; /* We've done our duty */ | ||
| 233 | } | ||
| 234 | blk_congestion_wait(WRITE, HZ/10); | ||
| 235 | } | ||
| 236 | |||
| 237 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) | ||
| 238 | dirty_exceeded = 0; | ||
| 239 | |||
| 240 | if (writeback_in_progress(bdi)) | ||
| 241 | return; /* pdflush is already working this queue */ | ||
| 242 | |||
| 243 | /* | ||
| 244 | * In laptop mode, we wait until hitting the higher threshold before | ||
| 245 | * starting background writeout, and then write out all the way down | ||
| 246 | * to the lower threshold. So slow writers cause minimal disk activity. | ||
| 247 | * | ||
| 248 | * In normal mode, we start background writeout at the lower | ||
| 249 | * background_thresh, to keep the amount of dirty memory low. | ||
| 250 | */ | ||
| 251 | if ((laptop_mode && pages_written) || | ||
| 252 | (!laptop_mode && (nr_reclaimable > background_thresh))) | ||
| 253 | pdflush_operation(background_writeout, 0); | ||
| 254 | } | ||
| 255 | |||
| 256 | /** | ||
| 257 | * balance_dirty_pages_ratelimited - balance dirty memory state | ||
| 258 | * @mapping - address_space which was dirtied | ||
| 259 | * | ||
| 260 | * Processes which are dirtying memory should call in here once for each page | ||
| 261 | * which was newly dirtied. The function will periodically check the system's | ||
| 262 | * dirty state and will initiate writeback if needed. | ||
| 263 | * | ||
| 264 | * On really big machines, get_writeback_state is expensive, so try to avoid | ||
| 265 | * calling it too often (ratelimiting). But once we're over the dirty memory | ||
| 266 | * limit we decrease the ratelimiting by a lot, to prevent individual processes | ||
| 267 | * from overshooting the limit by (ratelimit_pages) each. | ||
| 268 | */ | ||
| 269 | void balance_dirty_pages_ratelimited(struct address_space *mapping) | ||
| 270 | { | ||
| 271 | static DEFINE_PER_CPU(int, ratelimits) = 0; | ||
| 272 | long ratelimit; | ||
| 273 | |||
| 274 | ratelimit = ratelimit_pages; | ||
| 275 | if (dirty_exceeded) | ||
| 276 | ratelimit = 8; | ||
| 277 | |||
| 278 | /* | ||
| 279 | * Check the rate limiting. Also, we do not want to throttle real-time | ||
| 280 | * tasks in balance_dirty_pages(). Period. | ||
| 281 | */ | ||
| 282 | if (get_cpu_var(ratelimits)++ >= ratelimit) { | ||
| 283 | __get_cpu_var(ratelimits) = 0; | ||
| 284 | put_cpu_var(ratelimits); | ||
| 285 | balance_dirty_pages(mapping); | ||
| 286 | return; | ||
| 287 | } | ||
| 288 | put_cpu_var(ratelimits); | ||
| 289 | } | ||
| 290 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited); | ||
| 291 | |||
| 292 | void throttle_vm_writeout(void) | ||
| 293 | { | ||
| 294 | struct writeback_state wbs; | ||
| 295 | long background_thresh; | ||
| 296 | long dirty_thresh; | ||
| 297 | |||
| 298 | for ( ; ; ) { | ||
| 299 | get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); | ||
| 300 | |||
| 301 | /* | ||
| 302 | * Boost the allowable dirty threshold a bit for page | ||
| 303 | * allocators so they don't get DoS'ed by heavy writers | ||
| 304 | */ | ||
| 305 | dirty_thresh += dirty_thresh / 10; /* wheeee... */ | ||
| 306 | |||
| 307 | if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh) | ||
| 308 | break; | ||
| 309 | blk_congestion_wait(WRITE, HZ/10); | ||
| 310 | } | ||
| 311 | } | ||
| 312 | |||
| 313 | |||
| 314 | /* | ||
| 315 | * writeback at least _min_pages, and keep writing until the amount of dirty | ||
| 316 | * memory is less than the background threshold, or until we're all clean. | ||
| 317 | */ | ||
| 318 | static void background_writeout(unsigned long _min_pages) | ||
| 319 | { | ||
| 320 | long min_pages = _min_pages; | ||
| 321 | struct writeback_control wbc = { | ||
| 322 | .bdi = NULL, | ||
| 323 | .sync_mode = WB_SYNC_NONE, | ||
| 324 | .older_than_this = NULL, | ||
| 325 | .nr_to_write = 0, | ||
| 326 | .nonblocking = 1, | ||
| 327 | }; | ||
| 328 | |||
| 329 | for ( ; ; ) { | ||
| 330 | struct writeback_state wbs; | ||
| 331 | long background_thresh; | ||
| 332 | long dirty_thresh; | ||
| 333 | |||
| 334 | get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); | ||
| 335 | if (wbs.nr_dirty + wbs.nr_unstable < background_thresh | ||
| 336 | && min_pages <= 0) | ||
| 337 | break; | ||
| 338 | wbc.encountered_congestion = 0; | ||
| 339 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | ||
| 340 | wbc.pages_skipped = 0; | ||
| 341 | writeback_inodes(&wbc); | ||
| 342 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | ||
| 343 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { | ||
| 344 | /* Wrote less than expected */ | ||
| 345 | blk_congestion_wait(WRITE, HZ/10); | ||
| 346 | if (!wbc.encountered_congestion) | ||
| 347 | break; | ||
| 348 | } | ||
| 349 | } | ||
| 350 | } | ||
| 351 | |||
| 352 | /* | ||
| 353 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back | ||
| 354 | * the whole world. Returns 0 if a pdflush thread was dispatched. Returns | ||
| 355 | * -1 if all pdflush threads were busy. | ||
| 356 | */ | ||
| 357 | int wakeup_bdflush(long nr_pages) | ||
| 358 | { | ||
| 359 | if (nr_pages == 0) { | ||
| 360 | struct writeback_state wbs; | ||
| 361 | |||
| 362 | get_writeback_state(&wbs); | ||
| 363 | nr_pages = wbs.nr_dirty + wbs.nr_unstable; | ||
| 364 | } | ||
| 365 | return pdflush_operation(background_writeout, nr_pages); | ||
| 366 | } | ||
| 367 | |||
/* Timer callbacks, defined further down in this file. */
static void wb_timer_fn(unsigned long unused);
static void laptop_timer_fn(unsigned long unused);

/* Timer driving the periodic kupdate-style writeback; fires wb_timer_fn(). */
static struct timer_list wb_timer =
			TIMER_INITIALIZER(wb_timer_fn, 0, 0);
/* Timer driving the delayed laptop-mode flush; fires laptop_timer_fn(). */
static struct timer_list laptop_mode_wb_timer =
			TIMER_INITIALIZER(laptop_timer_fn, 0, 0);
| 375 | |||
| 376 | /* | ||
| 377 | * Periodic writeback of "old" data. | ||
| 378 | * | ||
| 379 | * Define "old": the first time one of an inode's pages is dirtied, we mark the | ||
| 380 | * dirtying-time in the inode's address_space. So this periodic writeback code | ||
| 381 | * just walks the superblock inode list, writing back any inodes which are | ||
| 382 | * older than a specific point in time. | ||
| 383 | * | ||
| 384 | * Try to run once per dirty_writeback_centisecs. But if a writeback event | ||
| 385 | * takes longer than a dirty_writeback_centisecs interval, then leave a | ||
| 386 | * one-second gap. | ||
| 387 | * | ||
| 388 | * older_than_this takes precedence over nr_to_write. So we'll only write back | ||
| 389 | * all dirty pages if they are all attached to "old" mappings. | ||
| 390 | */ | ||
| 391 | static void wb_kupdate(unsigned long arg) | ||
| 392 | { | ||
| 393 | unsigned long oldest_jif; | ||
| 394 | unsigned long start_jif; | ||
| 395 | unsigned long next_jif; | ||
| 396 | long nr_to_write; | ||
| 397 | struct writeback_state wbs; | ||
| 398 | struct writeback_control wbc = { | ||
| 399 | .bdi = NULL, | ||
| 400 | .sync_mode = WB_SYNC_NONE, | ||
| 401 | .older_than_this = &oldest_jif, | ||
| 402 | .nr_to_write = 0, | ||
| 403 | .nonblocking = 1, | ||
| 404 | .for_kupdate = 1, | ||
| 405 | }; | ||
| 406 | |||
| 407 | sync_supers(); | ||
| 408 | |||
| 409 | get_writeback_state(&wbs); | ||
| 410 | oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; | ||
| 411 | start_jif = jiffies; | ||
| 412 | next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; | ||
| 413 | nr_to_write = wbs.nr_dirty + wbs.nr_unstable + | ||
| 414 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | ||
| 415 | while (nr_to_write > 0) { | ||
| 416 | wbc.encountered_congestion = 0; | ||
| 417 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | ||
| 418 | writeback_inodes(&wbc); | ||
| 419 | if (wbc.nr_to_write > 0) { | ||
| 420 | if (wbc.encountered_congestion) | ||
| 421 | blk_congestion_wait(WRITE, HZ/10); | ||
| 422 | else | ||
| 423 | break; /* All the old data is written */ | ||
| 424 | } | ||
| 425 | nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | ||
| 426 | } | ||
| 427 | if (time_before(next_jif, jiffies + HZ)) | ||
| 428 | next_jif = jiffies + HZ; | ||
| 429 | if (dirty_writeback_centisecs) | ||
| 430 | mod_timer(&wb_timer, next_jif); | ||
| 431 | } | ||
| 432 | |||
| 433 | /* | ||
| 434 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs | ||
| 435 | */ | ||
| 436 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, | ||
| 437 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | ||
| 438 | { | ||
| 439 | proc_dointvec(table, write, file, buffer, length, ppos); | ||
| 440 | if (dirty_writeback_centisecs) { | ||
| 441 | mod_timer(&wb_timer, | ||
| 442 | jiffies + (dirty_writeback_centisecs * HZ) / 100); | ||
| 443 | } else { | ||
| 444 | del_timer(&wb_timer); | ||
| 445 | } | ||
| 446 | return 0; | ||
| 447 | } | ||
| 448 | |||
| 449 | static void wb_timer_fn(unsigned long unused) | ||
| 450 | { | ||
| 451 | if (pdflush_operation(wb_kupdate, 0) < 0) | ||
| 452 | mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ | ||
| 453 | } | ||
| 454 | |||
/* pdflush work function for laptop mode: sync everything via sys_sync(). */
static void laptop_flush(unsigned long unused)
{
	(void)sys_sync();
}
| 459 | |||
| 460 | static void laptop_timer_fn(unsigned long unused) | ||
| 461 | { | ||
| 462 | pdflush_operation(laptop_flush, 0); | ||
| 463 | } | ||
| 464 | |||
| 465 | /* | ||
| 466 | * We've spun up the disk and we're in laptop mode: schedule writeback | ||
| 467 | * of all dirty data a few seconds from now. If the flush is already scheduled | ||
| 468 | * then push it back - the user is still using the disk. | ||
| 469 | */ | ||
| 470 | void laptop_io_completion(void) | ||
| 471 | { | ||
| 472 | mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ); | ||
| 473 | } | ||
| 474 | |||
| 475 | /* | ||
| 476 | * We're in laptop mode and we've just synced. The sync's writes will have | ||
| 477 | * caused another writeback to be scheduled by laptop_io_completion. | ||
| 478 | * Nothing needs to be written back anymore, so we unschedule the writeback. | ||
| 479 | */ | ||
| 480 | void laptop_sync_completion(void) | ||
| 481 | { | ||
| 482 | del_timer(&laptop_mode_wb_timer); | ||
| 483 | } | ||
| 484 | |||
| 485 | /* | ||
| 486 | * If ratelimit_pages is too high then we can get into dirty-data overload | ||
| 487 | * if a large number of processes all perform writes at the same time. | ||
| 488 | * If it is too low then SMP machines will call the (expensive) | ||
| 489 | * get_writeback_state too often. | ||
| 490 | * | ||
| 491 | * Here we set ratelimit_pages to a level which ensures that when all CPUs are | ||
| 492 | * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory | ||
| 493 | * thresholds before writeback cuts in. | ||
| 494 | * | ||
| 495 | * But the limit should not be set too high. Because it also controls the | ||
| 496 | * amount of memory which the balance_dirty_pages() caller has to write back. | ||
| 497 | * If this is too large then the caller will block on the IO queue all the | ||
| 498 | * time. So limit it to four megabytes - the balance_dirty_pages() caller | ||
| 499 | * will write six megabyte chunks, max. | ||
| 500 | */ | ||
| 501 | |||
| 502 | static void set_ratelimit(void) | ||
| 503 | { | ||
| 504 | ratelimit_pages = total_pages / (num_online_cpus() * 32); | ||
| 505 | if (ratelimit_pages < 16) | ||
| 506 | ratelimit_pages = 16; | ||
| 507 | if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) | ||
| 508 | ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; | ||
| 509 | } | ||
| 510 | |||
/*
 * Notifier callback: recompute ratelimit_pages.  The event code and payload
 * are ignored; always returns 0.
 */
static int
ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
{
	set_ratelimit();

	return 0;
}
| 517 | |||
/*
 * Notifier block that invokes ratelimit_handler(); it is registered as a CPU
 * notifier by page_writeback_init().
 */
static struct notifier_block ratelimit_nb = {
	.notifier_call	= ratelimit_handler,
	.next		= NULL,
};
| 522 | |||
| 523 | /* | ||
| 524 | * If the machine has a large highmem:lowmem ratio then scale back the default | ||
| 525 | * dirty memory thresholds: allowing too much dirty highmem pins an excessive | ||
| 526 | * number of buffer_heads. | ||
| 527 | */ | ||
| 528 | void __init page_writeback_init(void) | ||
| 529 | { | ||
| 530 | long buffer_pages = nr_free_buffer_pages(); | ||
| 531 | long correction; | ||
| 532 | |||
| 533 | total_pages = nr_free_pagecache_pages(); | ||
| 534 | |||
| 535 | correction = (100 * 4 * buffer_pages) / total_pages; | ||
| 536 | |||
| 537 | if (correction < 100) { | ||
| 538 | dirty_background_ratio *= correction; | ||
| 539 | dirty_background_ratio /= 100; | ||
| 540 | vm_dirty_ratio *= correction; | ||
| 541 | vm_dirty_ratio /= 100; | ||
| 542 | |||
| 543 | if (dirty_background_ratio <= 0) | ||
| 544 | dirty_background_ratio = 1; | ||
| 545 | if (vm_dirty_ratio <= 0) | ||
| 546 | vm_dirty_ratio = 1; | ||
| 547 | } | ||
| 548 | mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100); | ||
| 549 | set_ratelimit(); | ||
| 550 | register_cpu_notifier(&ratelimit_nb); | ||
| 551 | } | ||
| 552 | |||
| 553 | int do_writepages(struct address_space *mapping, struct writeback_control *wbc) | ||
| 554 | { | ||
| 555 | if (wbc->nr_to_write <= 0) | ||
| 556 | return 0; | ||
| 557 | if (mapping->a_ops->writepages) | ||
| 558 | return mapping->a_ops->writepages(mapping, wbc); | ||
| 559 | return generic_writepages(mapping, wbc); | ||
| 560 | } | ||
| 561 | |||
| 562 | /** | ||
| 563 | * write_one_page - write out a single page and optionally wait on I/O | ||
| 564 | * | ||
| 565 | * @page - the page to write | ||
| 566 | * @wait - if true, wait on writeout | ||
| 567 | * | ||
| 568 | * The page must be locked by the caller and will be unlocked upon return. | ||
| 569 | * | ||
| 570 | * write_one_page() returns a negative error code if I/O failed. | ||
| 571 | */ | ||
| 572 | int write_one_page(struct page *page, int wait) | ||
| 573 | { | ||
| 574 | struct address_space *mapping = page->mapping; | ||
| 575 | int ret = 0; | ||
| 576 | struct writeback_control wbc = { | ||
| 577 | .sync_mode = WB_SYNC_ALL, | ||
| 578 | .nr_to_write = 1, | ||
| 579 | }; | ||
| 580 | |||
| 581 | BUG_ON(!PageLocked(page)); | ||
| 582 | |||
| 583 | if (wait) | ||
| 584 | wait_on_page_writeback(page); | ||
| 585 | |||
| 586 | if (clear_page_dirty_for_io(page)) { | ||
| 587 | page_cache_get(page); | ||
| 588 | ret = mapping->a_ops->writepage(page, &wbc); | ||
| 589 | if (ret == 0 && wait) { | ||
| 590 | wait_on_page_writeback(page); | ||
| 591 | if (PageError(page)) | ||
| 592 | ret = -EIO; | ||
| 593 | } | ||
| 594 | page_cache_release(page); | ||
| 595 | } else { | ||
| 596 | unlock_page(page); | ||
| 597 | } | ||
| 598 | return ret; | ||
| 599 | } | ||
| 600 | EXPORT_SYMBOL(write_one_page); | ||
| 601 | |||
| 602 | /* | ||
| 603 | * For address_spaces which do not use buffers. Just tag the page as dirty in | ||
| 604 | * its radix tree. | ||
| 605 | * | ||
| 606 | * This is also used when a single buffer is being dirtied: we want to set the | ||
| 607 | * page dirty in that case, but not all the buffers. This is a "bottom-up" | ||
| 608 | * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. | ||
| 609 | * | ||
| 610 | * Most callers have locked the page, which pins the address_space in memory. | ||
| 611 | * But zap_pte_range() does not lock the page, however in that case the | ||
| 612 | * mapping is pinned by the vma's ->vm_file reference. | ||
| 613 | * | ||
| 614 | * We take care to handle the case where the page was truncated from the | ||
| 615 | * mapping by re-checking page_mapping() insode tree_lock. | ||
| 616 | */ | ||
| 617 | int __set_page_dirty_nobuffers(struct page *page) | ||
| 618 | { | ||
| 619 | int ret = 0; | ||
| 620 | |||
| 621 | if (!TestSetPageDirty(page)) { | ||
| 622 | struct address_space *mapping = page_mapping(page); | ||
| 623 | struct address_space *mapping2; | ||
| 624 | |||
| 625 | if (mapping) { | ||
| 626 | write_lock_irq(&mapping->tree_lock); | ||
| 627 | mapping2 = page_mapping(page); | ||
| 628 | if (mapping2) { /* Race with truncate? */ | ||
| 629 | BUG_ON(mapping2 != mapping); | ||
| 630 | if (mapping_cap_account_dirty(mapping)) | ||
| 631 | inc_page_state(nr_dirty); | ||
| 632 | radix_tree_tag_set(&mapping->page_tree, | ||
| 633 | page_index(page), PAGECACHE_TAG_DIRTY); | ||
| 634 | } | ||
| 635 | write_unlock_irq(&mapping->tree_lock); | ||
| 636 | if (mapping->host) { | ||
| 637 | /* !PageAnon && !swapper_space */ | ||
| 638 | __mark_inode_dirty(mapping->host, | ||
| 639 | I_DIRTY_PAGES); | ||
| 640 | } | ||
| 641 | } | ||
| 642 | } | ||
| 643 | return ret; | ||
| 644 | } | ||
| 645 | EXPORT_SYMBOL(__set_page_dirty_nobuffers); | ||
| 646 | |||
| 647 | /* | ||
| 648 | * When a writepage implementation decides that it doesn't want to write this | ||
| 649 | * page for some reason, it should redirty the locked page via | ||
| 650 | * redirty_page_for_writepage() and it should then unlock the page and return 0 | ||
| 651 | */ | ||
| 652 | int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) | ||
| 653 | { | ||
| 654 | wbc->pages_skipped++; | ||
| 655 | return __set_page_dirty_nobuffers(page); | ||
| 656 | } | ||
| 657 | EXPORT_SYMBOL(redirty_page_for_writepage); | ||
| 658 | |||
| 659 | /* | ||
| 660 | * If the mapping doesn't provide a set_page_dirty a_op, then | ||
| 661 | * just fall through and assume that it wants buffer_heads. | ||
| 662 | */ | ||
| 663 | int fastcall set_page_dirty(struct page *page) | ||
| 664 | { | ||
| 665 | struct address_space *mapping = page_mapping(page); | ||
| 666 | |||
| 667 | if (likely(mapping)) { | ||
| 668 | int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; | ||
| 669 | if (spd) | ||
| 670 | return (*spd)(page); | ||
| 671 | return __set_page_dirty_buffers(page); | ||
| 672 | } | ||
| 673 | if (!PageDirty(page)) | ||
| 674 | SetPageDirty(page); | ||
| 675 | return 0; | ||
| 676 | } | ||
| 677 | EXPORT_SYMBOL(set_page_dirty); | ||
| 678 | |||
/*
 * Locked variant of set_page_dirty(): takes the page lock around the call
 * and returns set_page_dirty()'s result.
 *
 * set_page_dirty() is racy if the caller has no reference against
 * page->mapping->host, and if the page is unlocked.  This is because another
 * CPU could truncate the page off the mapping and then free the mapping.
 *
 * Usually, the page _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In other cases, the page should be locked before running set_page_dirty().
 */
int set_page_dirty_lock(struct page *page)
{
	int rc;

	lock_page(page);
	rc = set_page_dirty(page);
	unlock_page(page);

	return rc;
}
EXPORT_SYMBOL(set_page_dirty_lock);
| 699 | |||
| 700 | /* | ||
| 701 | * Clear a page's dirty flag, while caring for dirty memory accounting. | ||
| 702 | * Returns true if the page was previously dirty. | ||
| 703 | */ | ||
| 704 | int test_clear_page_dirty(struct page *page) | ||
| 705 | { | ||
| 706 | struct address_space *mapping = page_mapping(page); | ||
| 707 | unsigned long flags; | ||
| 708 | |||
| 709 | if (mapping) { | ||
| 710 | write_lock_irqsave(&mapping->tree_lock, flags); | ||
| 711 | if (TestClearPageDirty(page)) { | ||
| 712 | radix_tree_tag_clear(&mapping->page_tree, | ||
| 713 | page_index(page), | ||
| 714 | PAGECACHE_TAG_DIRTY); | ||
| 715 | write_unlock_irqrestore(&mapping->tree_lock, flags); | ||
| 716 | if (mapping_cap_account_dirty(mapping)) | ||
| 717 | dec_page_state(nr_dirty); | ||
| 718 | return 1; | ||
| 719 | } | ||
| 720 | write_unlock_irqrestore(&mapping->tree_lock, flags); | ||
| 721 | return 0; | ||
| 722 | } | ||
| 723 | return TestClearPageDirty(page); | ||
| 724 | } | ||
| 725 | EXPORT_SYMBOL(test_clear_page_dirty); | ||
| 726 | |||
| 727 | /* | ||
| 728 | * Clear a page's dirty flag, while caring for dirty memory accounting. | ||
| 729 | * Returns true if the page was previously dirty. | ||
| 730 | * | ||
| 731 | * This is for preparing to put the page under writeout. We leave the page | ||
| 732 | * tagged as dirty in the radix tree so that a concurrent write-for-sync | ||
| 733 | * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage | ||
| 734 | * implementation will run either set_page_writeback() or set_page_dirty(), | ||
| 735 | * at which stage we bring the page's dirty flag and radix-tree dirty tag | ||
| 736 | * back into sync. | ||
| 737 | * | ||
| 738 | * This incoherency between the page's dirty flag and radix-tree tag is | ||
| 739 | * unfortunate, but it only exists while the page is locked. | ||
| 740 | */ | ||
| 741 | int clear_page_dirty_for_io(struct page *page) | ||
| 742 | { | ||
| 743 | struct address_space *mapping = page_mapping(page); | ||
| 744 | |||
| 745 | if (mapping) { | ||
| 746 | if (TestClearPageDirty(page)) { | ||
| 747 | if (mapping_cap_account_dirty(mapping)) | ||
| 748 | dec_page_state(nr_dirty); | ||
| 749 | return 1; | ||
| 750 | } | ||
| 751 | return 0; | ||
| 752 | } | ||
| 753 | return TestClearPageDirty(page); | ||
| 754 | } | ||
| 755 | EXPORT_SYMBOL(clear_page_dirty_for_io); | ||
| 756 | |||
| 757 | int test_clear_page_writeback(struct page *page) | ||
| 758 | { | ||
| 759 | struct address_space *mapping = page_mapping(page); | ||
| 760 | int ret; | ||
| 761 | |||
| 762 | if (mapping) { | ||
| 763 | unsigned long flags; | ||
| 764 | |||
| 765 | write_lock_irqsave(&mapping->tree_lock, flags); | ||
| 766 | ret = TestClearPageWriteback(page); | ||
| 767 | if (ret) | ||
| 768 | radix_tree_tag_clear(&mapping->page_tree, | ||
| 769 | page_index(page), | ||
| 770 | PAGECACHE_TAG_WRITEBACK); | ||
| 771 | write_unlock_irqrestore(&mapping->tree_lock, flags); | ||
| 772 | } else { | ||
| 773 | ret = TestClearPageWriteback(page); | ||
| 774 | } | ||
| 775 | return ret; | ||
| 776 | } | ||
| 777 | |||
| 778 | int test_set_page_writeback(struct page *page) | ||
| 779 | { | ||
| 780 | struct address_space *mapping = page_mapping(page); | ||
| 781 | int ret; | ||
| 782 | |||
| 783 | if (mapping) { | ||
| 784 | unsigned long flags; | ||
| 785 | |||
| 786 | write_lock_irqsave(&mapping->tree_lock, flags); | ||
| 787 | ret = TestSetPageWriteback(page); | ||
| 788 | if (!ret) | ||
| 789 | radix_tree_tag_set(&mapping->page_tree, | ||
| 790 | page_index(page), | ||
| 791 | PAGECACHE_TAG_WRITEBACK); | ||
| 792 | if (!PageDirty(page)) | ||
| 793 | radix_tree_tag_clear(&mapping->page_tree, | ||
| 794 | page_index(page), | ||
| 795 | PAGECACHE_TAG_DIRTY); | ||
| 796 | write_unlock_irqrestore(&mapping->tree_lock, flags); | ||
| 797 | } else { | ||
| 798 | ret = TestSetPageWriteback(page); | ||
| 799 | } | ||
| 800 | return ret; | ||
| 801 | |||
| 802 | } | ||
| 803 | EXPORT_SYMBOL(test_set_page_writeback); | ||
| 804 | |||
| 805 | /* | ||
| 806 | * Return true if any of the pages in the mapping are marged with the | ||
| 807 | * passed tag. | ||
| 808 | */ | ||
| 809 | int mapping_tagged(struct address_space *mapping, int tag) | ||
| 810 | { | ||
| 811 | unsigned long flags; | ||
| 812 | int ret; | ||
| 813 | |||
| 814 | read_lock_irqsave(&mapping->tree_lock, flags); | ||
| 815 | ret = radix_tree_tagged(&mapping->page_tree, tag); | ||
| 816 | read_unlock_irqrestore(&mapping->tree_lock, flags); | ||
| 817 | return ret; | ||
| 818 | } | ||
| 819 | EXPORT_SYMBOL(mapping_tagged); | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c new file mode 100644 index 000000000000..c73dbbc1cd8f --- /dev/null +++ b/mm/page_alloc.c | |||
| @@ -0,0 +1,2220 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/page_alloc.c | ||
| 3 | * | ||
| 4 | * Manages the free list, the system allocates free pages here. | ||
| 5 | * Note that kmalloc() lives in slab.c | ||
| 6 | * | ||
| 7 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | ||
| 8 | * Swap reorganised 29.12.95, Stephen Tweedie | ||
| 9 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | ||
| 10 | * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 | ||
| 11 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 | ||
| 12 | * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 | ||
| 13 | * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 | ||
| 14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) | ||
| 15 | */ | ||
| 16 | |||
| 17 | #include <linux/config.h> | ||
| 18 | #include <linux/stddef.h> | ||
| 19 | #include <linux/mm.h> | ||
| 20 | #include <linux/swap.h> | ||
| 21 | #include <linux/interrupt.h> | ||
| 22 | #include <linux/pagemap.h> | ||
| 23 | #include <linux/bootmem.h> | ||
| 24 | #include <linux/compiler.h> | ||
| 25 | #include <linux/module.h> | ||
| 26 | #include <linux/suspend.h> | ||
| 27 | #include <linux/pagevec.h> | ||
| 28 | #include <linux/blkdev.h> | ||
| 29 | #include <linux/slab.h> | ||
| 30 | #include <linux/notifier.h> | ||
| 31 | #include <linux/topology.h> | ||
| 32 | #include <linux/sysctl.h> | ||
| 33 | #include <linux/cpu.h> | ||
| 34 | #include <linux/cpuset.h> | ||
| 35 | #include <linux/nodemask.h> | ||
| 36 | #include <linux/vmalloc.h> | ||
| 37 | |||
| 38 | #include <asm/tlbflush.h> | ||
| 39 | #include "internal.h" | ||
| 40 | |||
/*
 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
 * initializer cleaner
 *
 * NOTE(review): the initializer sets only bit 0 of the first word, which
 * presumably marks node 0 as the sole online node at boot - confirm.
 */
nodemask_t node_online_map = { { [0] = 1UL } };
nodemask_t node_possible_map = NODE_MASK_ALL;
struct pglist_data *pgdat_list;
unsigned long totalram_pages;
unsigned long totalhigh_pages;
long nr_swap_pages;

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };

EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);

/*
 * Used by page_zone() to look up the address of the struct zone whose
 * id is encoded in the upper bits of page->flags
 */
struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
EXPORT_SYMBOL(zone_table);

/* Printable zone names; presumably indexed by zone number - confirm. */
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;

/* Only valid during boot: both are placed in __initdata. */
unsigned long __initdata nr_kernel_pages;
unsigned long __initdata nr_all_pages;
| 77 | |||
| 78 | /* | ||
| 79 | * Temporary debugging check for pages not lying within a given zone. | ||
| 80 | */ | ||
| 81 | static int bad_range(struct zone *zone, struct page *page) | ||
| 82 | { | ||
| 83 | if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) | ||
| 84 | return 1; | ||
| 85 | if (page_to_pfn(page) < zone->zone_start_pfn) | ||
| 86 | return 1; | ||
| 87 | #ifdef CONFIG_HOLES_IN_ZONE | ||
| 88 | if (!pfn_valid(page_to_pfn(page))) | ||
| 89 | return 1; | ||
| 90 | #endif | ||
| 91 | if (zone != page_zone(page)) | ||
| 92 | return 1; | ||
| 93 | return 0; | ||
| 94 | } | ||
| 95 | |||
| 96 | static void bad_page(const char *function, struct page *page) | ||
| 97 | { | ||
| 98 | printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", | ||
| 99 | function, current->comm, page); | ||
| 100 | printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | ||
| 101 | (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, | ||
| 102 | page->mapping, page_mapcount(page), page_count(page)); | ||
| 103 | printk(KERN_EMERG "Backtrace:\n"); | ||
| 104 | dump_stack(); | ||
| 105 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); | ||
| 106 | page->flags &= ~(1 << PG_private | | ||
| 107 | 1 << PG_locked | | ||
| 108 | 1 << PG_lru | | ||
| 109 | 1 << PG_active | | ||
| 110 | 1 << PG_dirty | | ||
| 111 | 1 << PG_swapcache | | ||
| 112 | 1 << PG_writeback); | ||
| 113 | set_page_count(page, 0); | ||
| 114 | reset_page_mapcount(page); | ||
| 115 | page->mapping = NULL; | ||
| 116 | tainted |= TAINT_BAD_PAGE; | ||
| 117 | } | ||
| 118 | |||
#ifndef CONFIG_HUGETLB_PAGE
/* Without CONFIG_HUGETLB_PAGE both compound-page hooks expand to nothing. */
#define prep_compound_page(page, order)		do { } while (0)
#define destroy_compound_page(page, order)	do { } while (0)
#else
/*
 * Higher-order pages are called "compound pages".  Layout:
 *
 *  - the first PAGE_SIZE page is the "head page";
 *  - the remaining PAGE_SIZE pages are "tail pages";
 *  - every page (head included) has PG_compound set and its ->private
 *    pointing at the head page;
 *  - the first tail page's ->mapping, if non-zero, holds the address of
 *    the compound page's put_page() function;
 *  - the first tail page's ->index stores the allocation order.  This is
 *    only for debug at present, and it means that zero-order pages may
 *    not be compound.
 */

/* Mark the (1 << order) pages starting at @page as one compound page. */
static void prep_compound_page(struct page *page, unsigned long order)
{
	struct page *cursor;
	struct page *end = page + (1 << order);

	page[1].mapping = NULL;
	page[1].index = order;

	for (cursor = page; cursor < end; cursor++) {
		SetPageCompound(cursor);
		cursor->private = (unsigned long)page;
	}
}

/*
 * Undo prep_compound_page().  Does nothing if the head page is not
 * compound.  Any inconsistency (wrong stored order, a page without
 * PG_compound, a ->private not pointing at the head) is reported through
 * bad_page() against the head page before PG_compound is cleared.
 */
static void destroy_compound_page(struct page *page, unsigned long order)
{
	struct page *cursor;
	struct page *end;

	if (!PageCompound(page))
		return;

	if (page[1].index != order)
		bad_page(__FUNCTION__, page);

	end = page + (1 << order);
	for (cursor = page; cursor < end; cursor++) {
		if (!PageCompound(cursor))
			bad_page(__FUNCTION__, page);
		if (cursor->private != (unsigned long)page)
			bad_page(__FUNCTION__, page);
		ClearPageCompound(cursor);
	}
}
#endif		/* CONFIG_HUGETLB_PAGE */
| 177 | |||
| 178 | /* | ||
| 179 | * Functions for dealing with a page's order in the buddy system. | ||
| 180 | * zone->lock is already acquired when we use these. | ||
| 181 | * So, we don't need atomic page->flags operations here. | ||
| 182 | */ | ||
| 183 | static inline unsigned long page_order(struct page *page) { | ||
| 184 | return page->private; | ||
| 185 | } | ||
| 186 | |||
| 187 | static inline void set_page_order(struct page *page, int order) { | ||
| 188 | page->private = order; | ||
| 189 | __SetPagePrivate(page); | ||
| 190 | } | ||
| 191 | |||
| 192 | static inline void rmv_page_order(struct page *page) | ||
| 193 | { | ||
| 194 | __ClearPagePrivate(page); | ||
| 195 | page->private = 0; | ||
| 196 | } | ||
| 197 | |||
| 198 | /* | ||
| 199 | * Locate the struct page for both the matching buddy in our | ||
| 200 | * pair (buddy1) and the combined O(n+1) page they form (page). | ||
| 201 | * | ||
| 202 | * 1) Any buddy B1 will have an order O twin B2 which satisfies | ||
| 203 | * the following equation: | ||
| 204 | * B2 = B1 ^ (1 << O) | ||
| 205 | * For example, if the starting buddy (buddy2) is #8 its order | ||
| 206 | * 1 buddy is #10: | ||
| 207 | * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 | ||
| 208 | * | ||
| 209 | * 2) Any buddy B will have an order O+1 parent P which | ||
| 210 | * satisfies the following equation: | ||
| 211 | * P = B & ~(1 << O) | ||
| 212 | * | ||
| 213 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER | ||
| 214 | */ | ||
| 215 | static inline struct page * | ||
| 216 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) | ||
| 217 | { | ||
| 218 | unsigned long buddy_idx = page_idx ^ (1 << order); | ||
| 219 | |||
| 220 | return page + (buddy_idx - page_idx); | ||
| 221 | } | ||
| 222 | |||
/*
 * Return the index of the order-(@order + 1) block that contains the
 * order-@order block at @page_idx, i.e. @page_idx with bit @order cleared.
 *
 * The mask is built in unsigned long so that it matches the width of
 * @page_idx instead of being computed as a signed int and sign-extended.
 */
static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);
}
| 228 | |||
/*
 * Check whether @page is a free buddy of the given order.
 *
 * A page and its buddy can be coalesced if
 * (a) the buddy is free,
 * (b) the buddy is on the buddy system, and
 * (c) the page and its buddy have the same order.
 * The page's order is recorded with page->private and PG_private.
 *
 * Returns 1 when PG_private is set, the stored order equals @order, the
 * page is not reserved and its count is zero; returns 0 otherwise.
 */
static inline int page_is_buddy(struct page *page, int order)
{
	if (!PagePrivate(page))
		return 0;
	if (page_order(page) != order)
		return 0;
	if (PageReserved(page))
		return 0;

	return (page_count(page) == 0) ? 1 : 0;
}
| 247 | |||
| 248 | /* | ||
| 249 | * Freeing function for a buddy system allocator. | ||
| 250 | * | ||
| 251 | * The concept of a buddy system is to maintain direct-mapped table | ||
| 252 | * (containing bit values) for memory blocks of various "orders". | ||
| 253 | * The bottom level table contains the map for the smallest allocatable | ||
| 254 | * units of memory (here, pages), and each level above it describes | ||
| 255 | * pairs of units from the levels below, hence, "buddies". | ||
| 256 | * At a high level, all that happens here is marking the table entry | ||
| 257 | * at the bottom level available, and propagating the changes upward | ||
| 258 | * as necessary, plus some accounting needed to play nicely with other | ||
| 259 | * parts of the VM system. | ||
| 260 | * At each level, we keep a list of pages, which are heads of contiguous | ||
| 261 | * free pages of length (1 << order) and marked with PG_private. The page's | ||
| 262 | * order is recorded in the page->private field. | ||
| 263 | * So when we are allocating or freeing one, we can derive the state of the | ||
| 264 | * other. That is, if we allocate a small block, and both were | ||
| 265 | * free, the remainder of the region must be split into blocks. | ||
| 266 | * If a block is freed, and its buddy is also free, then this | ||
| 267 | * triggers coalescing into a block of larger size. | ||
| 268 | * | ||
| 269 | * -- wli | ||
| 270 | */ | ||
| 271 | |||
| 272 | static inline void __free_pages_bulk (struct page *page, | ||
| 273 | struct zone *zone, unsigned int order) | ||
| 274 | { | ||
| 275 | unsigned long page_idx; | ||
| 276 | int order_size = 1 << order; | ||
| 277 | |||
| 278 | if (unlikely(order)) | ||
| 279 | destroy_compound_page(page, order); | ||
| 280 | |||
| 281 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | ||
| 282 | |||
| 283 | BUG_ON(page_idx & (order_size - 1)); | ||
| 284 | BUG_ON(bad_range(zone, page)); | ||
| 285 | |||
| 286 | zone->free_pages += order_size; | ||
| 287 | while (order < MAX_ORDER-1) { | ||
| 288 | unsigned long combined_idx; | ||
| 289 | struct free_area *area; | ||
| 290 | struct page *buddy; | ||
| 291 | |||
| 292 | combined_idx = __find_combined_index(page_idx, order); | ||
| 293 | buddy = __page_find_buddy(page, page_idx, order); | ||
| 294 | |||
| 295 | if (bad_range(zone, buddy)) | ||
| 296 | break; | ||
| 297 | if (!page_is_buddy(buddy, order)) | ||
| 298 | break; /* Move the buddy up one level. */ | ||
| 299 | list_del(&buddy->lru); | ||
| 300 | area = zone->free_area + order; | ||
| 301 | area->nr_free--; | ||
| 302 | rmv_page_order(buddy); | ||
| 303 | page = page + (combined_idx - page_idx); | ||
| 304 | page_idx = combined_idx; | ||
| 305 | order++; | ||
| 306 | } | ||
| 307 | set_page_order(page, order); | ||
| 308 | list_add(&page->lru, &zone->free_area[order].free_list); | ||
| 309 | zone->free_area[order].nr_free++; | ||
| 310 | } | ||
| 311 | |||
| 312 | static inline void free_pages_check(const char *function, struct page *page) | ||
| 313 | { | ||
| 314 | if ( page_mapcount(page) || | ||
| 315 | page->mapping != NULL || | ||
| 316 | page_count(page) != 0 || | ||
| 317 | (page->flags & ( | ||
| 318 | 1 << PG_lru | | ||
| 319 | 1 << PG_private | | ||
| 320 | 1 << PG_locked | | ||
| 321 | 1 << PG_active | | ||
| 322 | 1 << PG_reclaim | | ||
| 323 | 1 << PG_slab | | ||
| 324 | 1 << PG_swapcache | | ||
| 325 | 1 << PG_writeback ))) | ||
| 326 | bad_page(function, page); | ||
| 327 | if (PageDirty(page)) | ||
| 328 | ClearPageDirty(page); | ||
| 329 | } | ||
| 330 | |||
| 331 | /* | ||
| 332 | * Frees a list of pages. | ||
| 333 | * Assumes all pages on list are in same zone, and of same order. | ||
| 334 | * count is the number of pages to free, or 0 for all on the list. | ||
| 335 | * | ||
| 336 | * If the zone was previously in an "all pages pinned" state then look to | ||
| 337 | * see if this freeing clears that state. | ||
| 338 | * | ||
| 339 | * And clear the zone's pages_scanned counter, to hold off the "all pages are | ||
| 340 | * pinned" detection logic. | ||
| 341 | */ | ||
| 342 | static int | ||
| 343 | free_pages_bulk(struct zone *zone, int count, | ||
| 344 | struct list_head *list, unsigned int order) | ||
| 345 | { | ||
| 346 | unsigned long flags; | ||
| 347 | struct page *page = NULL; | ||
| 348 | int ret = 0; | ||
| 349 | |||
| 350 | spin_lock_irqsave(&zone->lock, flags); | ||
| 351 | zone->all_unreclaimable = 0; | ||
| 352 | zone->pages_scanned = 0; | ||
| 353 | while (!list_empty(list) && count--) { | ||
| 354 | page = list_entry(list->prev, struct page, lru); | ||
| 355 | /* have to delete it as __free_pages_bulk list manipulates */ | ||
| 356 | list_del(&page->lru); | ||
| 357 | __free_pages_bulk(page, zone, order); | ||
| 358 | ret++; | ||
| 359 | } | ||
| 360 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 361 | return ret; | ||
| 362 | } | ||
| 363 | |||
| 364 | void __free_pages_ok(struct page *page, unsigned int order) | ||
| 365 | { | ||
| 366 | LIST_HEAD(list); | ||
| 367 | int i; | ||
| 368 | |||
| 369 | arch_free_page(page, order); | ||
| 370 | |||
| 371 | mod_page_state(pgfree, 1 << order); | ||
| 372 | |||
| 373 | #ifndef CONFIG_MMU | ||
| 374 | if (order > 0) | ||
| 375 | for (i = 1 ; i < (1 << order) ; ++i) | ||
| 376 | __put_page(page + i); | ||
| 377 | #endif | ||
| 378 | |||
| 379 | for (i = 0 ; i < (1 << order) ; ++i) | ||
| 380 | free_pages_check(__FUNCTION__, page + i); | ||
| 381 | list_add(&page->lru, &list); | ||
| 382 | kernel_map_pages(page, 1<<order, 0); | ||
| 383 | free_pages_bulk(page_zone(page), 1, &list, order); | ||
| 384 | } | ||
| 385 | |||
| 386 | |||
| 387 | /* | ||
| 388 | * The order of subdivision here is critical for the IO subsystem. | ||
| 389 | * Please do not alter this order without good reasons and regression | ||
| 390 | * testing. Specifically, as large blocks of memory are subdivided, | ||
| 391 | * the order in which smaller blocks are delivered depends on the order | ||
| 392 | * they're subdivided in this function. This is the primary factor | ||
| 393 | * influencing the order in which pages are delivered to the IO | ||
| 394 | * subsystem according to empirical testing, and this is also justified | ||
| 395 | * by considering the behavior of a buddy system containing a single | ||
| 396 | * large block of memory acted on by a series of small allocations. | ||
| 397 | * This behavior is a critical factor in sglist merging's success. | ||
| 398 | * | ||
| 399 | * -- wli | ||
| 400 | */ | ||
| 401 | static inline struct page * | ||
| 402 | expand(struct zone *zone, struct page *page, | ||
| 403 | int low, int high, struct free_area *area) | ||
| 404 | { | ||
| 405 | unsigned long size = 1 << high; | ||
| 406 | |||
| 407 | while (high > low) { | ||
| 408 | area--; | ||
| 409 | high--; | ||
| 410 | size >>= 1; | ||
| 411 | BUG_ON(bad_range(zone, &page[size])); | ||
| 412 | list_add(&page[size].lru, &area->free_list); | ||
| 413 | area->nr_free++; | ||
| 414 | set_page_order(&page[size], high); | ||
| 415 | } | ||
| 416 | return page; | ||
| 417 | } | ||
| 418 | |||
| 419 | void set_page_refs(struct page *page, int order) | ||
| 420 | { | ||
| 421 | #ifdef CONFIG_MMU | ||
| 422 | set_page_count(page, 1); | ||
| 423 | #else | ||
| 424 | int i; | ||
| 425 | |||
| 426 | /* | ||
| 427 | * We need to reference all the pages for this order, otherwise if | ||
| 428 | * anyone accesses one of the pages with (get/put) it will be freed. | ||
| 429 | * - eg: access_process_vm() | ||
| 430 | */ | ||
| 431 | for (i = 0; i < (1 << order); i++) | ||
| 432 | set_page_count(page + i, 1); | ||
| 433 | #endif /* CONFIG_MMU */ | ||
| 434 | } | ||
| 435 | |||
| 436 | /* | ||
| 437 | * This page is about to be returned from the page allocator | ||
| 438 | */ | ||
| 439 | static void prep_new_page(struct page *page, int order) | ||
| 440 | { | ||
| 441 | if (page->mapping || page_mapcount(page) || | ||
| 442 | (page->flags & ( | ||
| 443 | 1 << PG_private | | ||
| 444 | 1 << PG_locked | | ||
| 445 | 1 << PG_lru | | ||
| 446 | 1 << PG_active | | ||
| 447 | 1 << PG_dirty | | ||
| 448 | 1 << PG_reclaim | | ||
| 449 | 1 << PG_swapcache | | ||
| 450 | 1 << PG_writeback ))) | ||
| 451 | bad_page(__FUNCTION__, page); | ||
| 452 | |||
| 453 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | | ||
| 454 | 1 << PG_referenced | 1 << PG_arch_1 | | ||
| 455 | 1 << PG_checked | 1 << PG_mappedtodisk); | ||
| 456 | page->private = 0; | ||
| 457 | set_page_refs(page, order); | ||
| 458 | kernel_map_pages(page, 1 << order, 1); | ||
| 459 | } | ||
| 460 | |||
| 461 | /* | ||
| 462 | * Do the hard work of removing an element from the buddy allocator. | ||
| 463 | * Call me with the zone->lock already held. | ||
| 464 | */ | ||
| 465 | static struct page *__rmqueue(struct zone *zone, unsigned int order) | ||
| 466 | { | ||
| 467 | struct free_area * area; | ||
| 468 | unsigned int current_order; | ||
| 469 | struct page *page; | ||
| 470 | |||
| 471 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { | ||
| 472 | area = zone->free_area + current_order; | ||
| 473 | if (list_empty(&area->free_list)) | ||
| 474 | continue; | ||
| 475 | |||
| 476 | page = list_entry(area->free_list.next, struct page, lru); | ||
| 477 | list_del(&page->lru); | ||
| 478 | rmv_page_order(page); | ||
| 479 | area->nr_free--; | ||
| 480 | zone->free_pages -= 1UL << order; | ||
| 481 | return expand(zone, page, order, current_order, area); | ||
| 482 | } | ||
| 483 | |||
| 484 | return NULL; | ||
| 485 | } | ||
| 486 | |||
| 487 | /* | ||
| 488 | * Obtain a specified number of elements from the buddy allocator, all under | ||
| 489 | * a single hold of the lock, for efficiency. Add them to the supplied list. | ||
| 490 | * Returns the number of new pages which were placed at *list. | ||
| 491 | */ | ||
| 492 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | ||
| 493 | unsigned long count, struct list_head *list) | ||
| 494 | { | ||
| 495 | unsigned long flags; | ||
| 496 | int i; | ||
| 497 | int allocated = 0; | ||
| 498 | struct page *page; | ||
| 499 | |||
| 500 | spin_lock_irqsave(&zone->lock, flags); | ||
| 501 | for (i = 0; i < count; ++i) { | ||
| 502 | page = __rmqueue(zone, order); | ||
| 503 | if (page == NULL) | ||
| 504 | break; | ||
| 505 | allocated++; | ||
| 506 | list_add_tail(&page->lru, list); | ||
| 507 | } | ||
| 508 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 509 | return allocated; | ||
| 510 | } | ||
| 511 | |||
| 512 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) | ||
| 513 | static void __drain_pages(unsigned int cpu) | ||
| 514 | { | ||
| 515 | struct zone *zone; | ||
| 516 | int i; | ||
| 517 | |||
| 518 | for_each_zone(zone) { | ||
| 519 | struct per_cpu_pageset *pset; | ||
| 520 | |||
| 521 | pset = &zone->pageset[cpu]; | ||
| 522 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | ||
| 523 | struct per_cpu_pages *pcp; | ||
| 524 | |||
| 525 | pcp = &pset->pcp[i]; | ||
| 526 | pcp->count -= free_pages_bulk(zone, pcp->count, | ||
| 527 | &pcp->list, 0); | ||
| 528 | } | ||
| 529 | } | ||
| 530 | } | ||
| 531 | #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ | ||
| 532 | |||
| 533 | #ifdef CONFIG_PM | ||
| 534 | |||
| 535 | void mark_free_pages(struct zone *zone) | ||
| 536 | { | ||
| 537 | unsigned long zone_pfn, flags; | ||
| 538 | int order; | ||
| 539 | struct list_head *curr; | ||
| 540 | |||
| 541 | if (!zone->spanned_pages) | ||
| 542 | return; | ||
| 543 | |||
| 544 | spin_lock_irqsave(&zone->lock, flags); | ||
| 545 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
| 546 | ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); | ||
| 547 | |||
| 548 | for (order = MAX_ORDER - 1; order >= 0; --order) | ||
| 549 | list_for_each(curr, &zone->free_area[order].free_list) { | ||
| 550 | unsigned long start_pfn, i; | ||
| 551 | |||
| 552 | start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); | ||
| 553 | |||
| 554 | for (i=0; i < (1<<order); i++) | ||
| 555 | SetPageNosaveFree(pfn_to_page(start_pfn+i)); | ||
| 556 | } | ||
| 557 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 558 | } | ||
| 559 | |||
/*
 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 * Runs __drain_pages() for the current CPU with local interrupts disabled.
 */
void drain_local_pages(void)
{
	unsigned long irq_flags;

	local_irq_save(irq_flags);
	__drain_pages(smp_processor_id());
	local_irq_restore(irq_flags);
}
| 571 | #endif /* CONFIG_PM */ | ||
| 572 | |||
/*
 * Update the per-cpu NUMA counters after an allocation from zone @z that
 * was requested against @zonelist.  Compiles to nothing without
 * CONFIG_NUMA.
 *
 * If @z is on the same node as the first zone of the zonelist, @z gets a
 * numa_hit.  Otherwise @z gets a numa_miss and the first zone gets a
 * numa_foreign.  Independently, @z gets local_node or other_node
 * depending on whether its node is the node returned by numa_node_id().
 */
static void zone_statistics(struct zonelist *zonelist, struct zone *z)
{
#ifdef CONFIG_NUMA
	unsigned long irq_flags;
	int cpu;
	pg_data_t *actual_node = z->zone_pgdat;
	pg_data_t *wanted_node = zonelist->zones[0]->zone_pgdat;
	struct per_cpu_pageset *stats;

	local_irq_save(irq_flags);
	cpu = smp_processor_id();
	stats = &z->pageset[cpu];

	if (actual_node != wanted_node) {
		stats->numa_miss++;
		zonelist->zones[0]->pageset[cpu].numa_foreign++;
	} else {
		stats->numa_hit++;
	}

	if (actual_node != NODE_DATA(numa_node_id()))
		stats->other_node++;
	else
		stats->local_node++;

	local_irq_restore(irq_flags);
#endif
}
| 598 | |||
| 599 | /* | ||
| 600 | * Free a 0-order page | ||
| 601 | */ | ||
| 602 | static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); | ||
| 603 | static void fastcall free_hot_cold_page(struct page *page, int cold) | ||
| 604 | { | ||
| 605 | struct zone *zone = page_zone(page); | ||
| 606 | struct per_cpu_pages *pcp; | ||
| 607 | unsigned long flags; | ||
| 608 | |||
| 609 | arch_free_page(page, 0); | ||
| 610 | |||
| 611 | kernel_map_pages(page, 1, 0); | ||
| 612 | inc_page_state(pgfree); | ||
| 613 | if (PageAnon(page)) | ||
| 614 | page->mapping = NULL; | ||
| 615 | free_pages_check(__FUNCTION__, page); | ||
| 616 | pcp = &zone->pageset[get_cpu()].pcp[cold]; | ||
| 617 | local_irq_save(flags); | ||
| 618 | if (pcp->count >= pcp->high) | ||
| 619 | pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | ||
| 620 | list_add(&page->lru, &pcp->list); | ||
| 621 | pcp->count++; | ||
| 622 | local_irq_restore(flags); | ||
| 623 | put_cpu(); | ||
| 624 | } | ||
| 625 | |||
| 626 | void fastcall free_hot_page(struct page *page) | ||
| 627 | { | ||
| 628 | free_hot_cold_page(page, 0); | ||
| 629 | } | ||
| 630 | |||
| 631 | void fastcall free_cold_page(struct page *page) | ||
| 632 | { | ||
| 633 | free_hot_cold_page(page, 1); | ||
| 634 | } | ||
| 635 | |||
| 636 | static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags) | ||
| 637 | { | ||
| 638 | int i; | ||
| 639 | |||
| 640 | BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); | ||
| 641 | for(i = 0; i < (1 << order); i++) | ||
| 642 | clear_highpage(page + i); | ||
| 643 | } | ||
| 644 | |||
| 645 | /* | ||
| 646 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | ||
| 647 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | ||
| 648 | * or two. | ||
| 649 | */ | ||
| 650 | static struct page * | ||
| 651 | buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags) | ||
| 652 | { | ||
| 653 | unsigned long flags; | ||
| 654 | struct page *page = NULL; | ||
| 655 | int cold = !!(gfp_flags & __GFP_COLD); | ||
| 656 | |||
| 657 | if (order == 0) { | ||
| 658 | struct per_cpu_pages *pcp; | ||
| 659 | |||
| 660 | pcp = &zone->pageset[get_cpu()].pcp[cold]; | ||
| 661 | local_irq_save(flags); | ||
| 662 | if (pcp->count <= pcp->low) | ||
| 663 | pcp->count += rmqueue_bulk(zone, 0, | ||
| 664 | pcp->batch, &pcp->list); | ||
| 665 | if (pcp->count) { | ||
| 666 | page = list_entry(pcp->list.next, struct page, lru); | ||
| 667 | list_del(&page->lru); | ||
| 668 | pcp->count--; | ||
| 669 | } | ||
| 670 | local_irq_restore(flags); | ||
| 671 | put_cpu(); | ||
| 672 | } | ||
| 673 | |||
| 674 | if (page == NULL) { | ||
| 675 | spin_lock_irqsave(&zone->lock, flags); | ||
| 676 | page = __rmqueue(zone, order); | ||
| 677 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 678 | } | ||
| 679 | |||
| 680 | if (page != NULL) { | ||
| 681 | BUG_ON(bad_range(zone, page)); | ||
| 682 | mod_page_state_zone(zone, pgalloc, 1 << order); | ||
| 683 | prep_new_page(page, order); | ||
| 684 | |||
| 685 | if (gfp_flags & __GFP_ZERO) | ||
| 686 | prep_zero_page(page, order, gfp_flags); | ||
| 687 | |||
| 688 | if (order && (gfp_flags & __GFP_COMP)) | ||
| 689 | prep_compound_page(page, order); | ||
| 690 | } | ||
| 691 | return page; | ||
| 692 | } | ||
| 693 | |||
| 694 | /* | ||
| 695 | * Return 1 if free pages are above 'mark'. This takes into account the order | ||
| 696 | * of the allocation. | ||
| 697 | */ | ||
| 698 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | ||
| 699 | int classzone_idx, int can_try_harder, int gfp_high) | ||
| 700 | { | ||
| 701 | /* free_pages my go negative - that's OK */ | ||
| 702 | long min = mark, free_pages = z->free_pages - (1 << order) + 1; | ||
| 703 | int o; | ||
| 704 | |||
| 705 | if (gfp_high) | ||
| 706 | min -= min / 2; | ||
| 707 | if (can_try_harder) | ||
| 708 | min -= min / 4; | ||
| 709 | |||
| 710 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | ||
| 711 | return 0; | ||
| 712 | for (o = 0; o < order; o++) { | ||
| 713 | /* At the next order, this order's pages become unavailable */ | ||
| 714 | free_pages -= z->free_area[o].nr_free << o; | ||
| 715 | |||
| 716 | /* Require fewer higher order pages to be free */ | ||
| 717 | min >>= 1; | ||
| 718 | |||
| 719 | if (free_pages <= min) | ||
| 720 | return 0; | ||
| 721 | } | ||
| 722 | return 1; | ||
| 723 | } | ||
| 724 | |||
/*
 * This is the 'heart' of the zoned buddy allocator.
 *
 * @gfp_mask: allocation flags.
 * @order:    the request is for a block of (1 << order) pages.
 * @zonelist: NULL-terminated list of candidate zones, tried in order.
 *
 * The zonelist is walked several times with progressively weaker
 * requirements:
 *  1. against pages_low, with no reserve dipping;
 *  2. after waking kswapd for every zone, against pages_min, letting
 *     __GFP_HIGH, realtime and !__GFP_WAIT callers go deeper;
 *  3. for PF_MEMALLOC or TIF_MEMDIE tasks outside interrupt context,
 *     with no watermark check at all;
 *  4. for callers that may wait, after direct reclaim through
 *     try_to_free_pages(), possibly followed by out_of_memory() and a
 *     restart, or by blk_congestion_wait() and another reclaim round.
 *
 * Returns the allocated page, or NULL on failure (a rate-limited warning
 * is printed unless __GFP_NOWARN is set).
 */
struct page * fastcall
__alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
		struct zonelist *zonelist)
{
	const int wait = gfp_mask & __GFP_WAIT;
	struct zone **zones, *z;
	struct page *page;
	struct reclaim_state reclaim_state;
	struct task_struct *p = current;
	int i;
	int classzone_idx;
	int do_retry;
	int can_try_harder;
	int did_some_progress;

	might_sleep_if(wait);

	/*
	 * The caller may dip into page reserves a bit more if the caller
	 * cannot run direct reclaim, or if the caller has realtime scheduling
	 * policy
	 */
	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;

	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */

	if (unlikely(zones[0] == NULL)) {
		/* Should this ever happen?? */
		return NULL;
	}

	/* The first zone of the list selects which lowmem_reserve[] slot is used. */
	classzone_idx = zone_idx(zones[0]);

 restart:
	/* Go through the zonelist once, looking for a zone with enough free */
	for (i = 0; (z = zones[i]) != NULL; i++) {

		if (!zone_watermark_ok(z, order, z->pages_low,
				       classzone_idx, 0, 0))
			continue;

		if (!cpuset_zone_allowed(z))
			continue;

		page = buffered_rmqueue(z, order, gfp_mask);
		if (page)
			goto got_pg;
	}

	/* Nothing passed the pages_low check: wake kswapd for every zone. */
	for (i = 0; (z = zones[i]) != NULL; i++)
		wakeup_kswapd(z, order);

	/*
	 * Go through the zonelist again. Let __GFP_HIGH and allocations
	 * coming from realtime tasks to go deeper into reserves
	 *
	 * This is the last chance, in general, before the goto nopage.
	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
	 */
	for (i = 0; (z = zones[i]) != NULL; i++) {
		if (!zone_watermark_ok(z, order, z->pages_min,
				       classzone_idx, can_try_harder,
				       gfp_mask & __GFP_HIGH))
			continue;

		if (wait && !cpuset_zone_allowed(z))
			continue;

		page = buffered_rmqueue(z, order, gfp_mask);
		if (page)
			goto got_pg;
	}

	/* This allocation should allow future memory freeing. */
	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) {
		/* go through the zonelist yet again, ignoring mins */
		for (i = 0; (z = zones[i]) != NULL; i++) {
			if (!cpuset_zone_allowed(z))
				continue;
			page = buffered_rmqueue(z, order, gfp_mask);
			if (page)
				goto got_pg;
		}
		goto nopage;
	}

	/* Atomic allocations - we can't balance anything */
	if (!wait)
		goto nopage;

rebalance:
	cond_resched();

	/* We now go into synchronous reclaim */
	p->flags |= PF_MEMALLOC;
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	did_some_progress = try_to_free_pages(zones, gfp_mask, order);

	p->reclaim_state = NULL;
	p->flags &= ~PF_MEMALLOC;

	cond_resched();

	if (likely(did_some_progress)) {
		/*
		 * Reclaim made progress: go through the zonelist once more
		 * with the same pages_min watermark and reserve-dipping rules
		 * as the post-kswapd pass above.  wait is known to be set
		 * here, so the cpuset check is unconditional.
		 */
		for (i = 0; (z = zones[i]) != NULL; i++) {
			if (!zone_watermark_ok(z, order, z->pages_min,
					       classzone_idx, can_try_harder,
					       gfp_mask & __GFP_HIGH))
				continue;

			if (!cpuset_zone_allowed(z))
				continue;

			page = buffered_rmqueue(z, order, gfp_mask);
			if (page)
				goto got_pg;
		}
	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
		/*
		 * Go through the zonelist yet one more time, keep
		 * very high watermark here, this is only to catch
		 * a parallel oom killing, we must fail if we're still
		 * under heavy pressure.
		 */
		for (i = 0; (z = zones[i]) != NULL; i++) {
			if (!zone_watermark_ok(z, order, z->pages_high,
					       classzone_idx, 0, 0))
				continue;

			if (!cpuset_zone_allowed(z))
				continue;

			page = buffered_rmqueue(z, order, gfp_mask);
			if (page)
				goto got_pg;
		}

		/* Still nothing: invoke the OOM killer and start over. */
		out_of_memory(gfp_mask);
		goto restart;
	}

	/*
	 * Don't let big-order allocations loop unless the caller explicitly
	 * requests that.  Wait for some write requests to complete then retry.
	 *
	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
	 * <= 3, but that may not be true in other implementations.
	 */
	do_retry = 0;
	if (!(gfp_mask & __GFP_NORETRY)) {
		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
			do_retry = 1;
		if (gfp_mask & __GFP_NOFAIL)
			do_retry = 1;
	}
	if (do_retry) {
		blk_congestion_wait(WRITE, HZ/50);
		goto rebalance;
	}

nopage:
	/* Failure path: warn (rate-limited) unless the caller asked not to. */
	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
		printk(KERN_WARNING "%s: page allocation failure."
			" order:%d, mode:0x%x\n",
			p->comm, order, gfp_mask);
		dump_stack();
	}
	return NULL;
got_pg:
	/* z is the zone the page finally came from. */
	zone_statistics(zonelist, z);
	return page;
}

EXPORT_SYMBOL(__alloc_pages);
| 910 | |||
| 911 | /* | ||
| 912 | * Common helper functions. | ||
| 913 | */ | ||
| 914 | fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned int order) | ||
| 915 | { | ||
| 916 | struct page * page; | ||
| 917 | page = alloc_pages(gfp_mask, order); | ||
| 918 | if (!page) | ||
| 919 | return 0; | ||
| 920 | return (unsigned long) page_address(page); | ||
| 921 | } | ||
| 922 | |||
| 923 | EXPORT_SYMBOL(__get_free_pages); | ||
| 924 | |||
| 925 | fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask) | ||
| 926 | { | ||
| 927 | struct page * page; | ||
| 928 | |||
| 929 | /* | ||
| 930 | * get_zeroed_page() returns a 32-bit address, which cannot represent | ||
| 931 | * a highmem page | ||
| 932 | */ | ||
| 933 | BUG_ON(gfp_mask & __GFP_HIGHMEM); | ||
| 934 | |||
| 935 | page = alloc_pages(gfp_mask | __GFP_ZERO, 0); | ||
| 936 | if (page) | ||
| 937 | return (unsigned long) page_address(page); | ||
| 938 | return 0; | ||
| 939 | } | ||
| 940 | |||
| 941 | EXPORT_SYMBOL(get_zeroed_page); | ||
| 942 | |||
| 943 | void __pagevec_free(struct pagevec *pvec) | ||
| 944 | { | ||
| 945 | int i = pagevec_count(pvec); | ||
| 946 | |||
| 947 | while (--i >= 0) | ||
| 948 | free_hot_cold_page(pvec->pages[i], pvec->cold); | ||
| 949 | } | ||
| 950 | |||
| 951 | fastcall void __free_pages(struct page *page, unsigned int order) | ||
| 952 | { | ||
| 953 | if (!PageReserved(page) && put_page_testzero(page)) { | ||
| 954 | if (order == 0) | ||
| 955 | free_hot_page(page); | ||
| 956 | else | ||
| 957 | __free_pages_ok(page, order); | ||
| 958 | } | ||
| 959 | } | ||
| 960 | |||
| 961 | EXPORT_SYMBOL(__free_pages); | ||
| 962 | |||
| 963 | fastcall void free_pages(unsigned long addr, unsigned int order) | ||
| 964 | { | ||
| 965 | if (addr != 0) { | ||
| 966 | BUG_ON(!virt_addr_valid((void *)addr)); | ||
| 967 | __free_pages(virt_to_page((void *)addr), order); | ||
| 968 | } | ||
| 969 | } | ||
| 970 | |||
| 971 | EXPORT_SYMBOL(free_pages); | ||
| 972 | |||
| 973 | /* | ||
| 974 | * Total amount of free (allocatable) RAM: | ||
| 975 | */ | ||
| 976 | unsigned int nr_free_pages(void) | ||
| 977 | { | ||
| 978 | unsigned int sum = 0; | ||
| 979 | struct zone *zone; | ||
| 980 | |||
| 981 | for_each_zone(zone) | ||
| 982 | sum += zone->free_pages; | ||
| 983 | |||
| 984 | return sum; | ||
| 985 | } | ||
| 986 | |||
| 987 | EXPORT_SYMBOL(nr_free_pages); | ||
| 988 | |||
| 989 | #ifdef CONFIG_NUMA | ||
| 990 | unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) | ||
| 991 | { | ||
| 992 | unsigned int i, sum = 0; | ||
| 993 | |||
| 994 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 995 | sum += pgdat->node_zones[i].free_pages; | ||
| 996 | |||
| 997 | return sum; | ||
| 998 | } | ||
| 999 | #endif | ||
| 1000 | |||
| 1001 | static unsigned int nr_free_zone_pages(int offset) | ||
| 1002 | { | ||
| 1003 | pg_data_t *pgdat; | ||
| 1004 | unsigned int sum = 0; | ||
| 1005 | |||
| 1006 | for_each_pgdat(pgdat) { | ||
| 1007 | struct zonelist *zonelist = pgdat->node_zonelists + offset; | ||
| 1008 | struct zone **zonep = zonelist->zones; | ||
| 1009 | struct zone *zone; | ||
| 1010 | |||
| 1011 | for (zone = *zonep++; zone; zone = *zonep++) { | ||
| 1012 | unsigned long size = zone->present_pages; | ||
| 1013 | unsigned long high = zone->pages_high; | ||
| 1014 | if (size > high) | ||
| 1015 | sum += size - high; | ||
| 1016 | } | ||
| 1017 | } | ||
| 1018 | |||
| 1019 | return sum; | ||
| 1020 | } | ||
| 1021 | |||
| 1022 | /* | ||
| 1023 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL | ||
| 1024 | */ | ||
| 1025 | unsigned int nr_free_buffer_pages(void) | ||
| 1026 | { | ||
| 1027 | return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK); | ||
| 1028 | } | ||
| 1029 | |||
| 1030 | /* | ||
| 1031 | * Amount of free RAM allocatable within all zones | ||
| 1032 | */ | ||
| 1033 | unsigned int nr_free_pagecache_pages(void) | ||
| 1034 | { | ||
| 1035 | return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK); | ||
| 1036 | } | ||
| 1037 | |||
| 1038 | #ifdef CONFIG_HIGHMEM | ||
| 1039 | unsigned int nr_free_highpages (void) | ||
| 1040 | { | ||
| 1041 | pg_data_t *pgdat; | ||
| 1042 | unsigned int pages = 0; | ||
| 1043 | |||
| 1044 | for_each_pgdat(pgdat) | ||
| 1045 | pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; | ||
| 1046 | |||
| 1047 | return pages; | ||
| 1048 | } | ||
| 1049 | #endif | ||
| 1050 | |||
#ifdef CONFIG_NUMA
/* Print a "Node <id> " prefix (no newline) for the node owning @zone. */
static void show_node(struct zone *zone)
{
	struct pglist_data *owner = zone->zone_pgdat;

	printk("Node %d ", owner->node_id);
}
#else
/* Without NUMA there is no node prefix to print. */
#define show_node(zone)	do { } while (0)
#endif
| 1059 | |||
/*
 * Per-CPU page_state counters.  The accessors below accumulate them
 * across CPUs, so any result is unavoidably approximate - it can change
 * during and after the summation.
 */
static DEFINE_PER_CPU(struct page_state, page_states) = {0};

/* Global pagecache page counter, plus a per-CPU companion on SMP builds. */
atomic_t nr_pagecache = ATOMIC_INIT(0);
EXPORT_SYMBOL(nr_pagecache);
#ifdef CONFIG_SMP
DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
#endif
| 1072 | |||
| 1073 | void __get_page_state(struct page_state *ret, int nr) | ||
| 1074 | { | ||
| 1075 | int cpu = 0; | ||
| 1076 | |||
| 1077 | memset(ret, 0, sizeof(*ret)); | ||
| 1078 | |||
| 1079 | cpu = first_cpu(cpu_online_map); | ||
| 1080 | while (cpu < NR_CPUS) { | ||
| 1081 | unsigned long *in, *out, off; | ||
| 1082 | |||
| 1083 | in = (unsigned long *)&per_cpu(page_states, cpu); | ||
| 1084 | |||
| 1085 | cpu = next_cpu(cpu, cpu_online_map); | ||
| 1086 | |||
| 1087 | if (cpu < NR_CPUS) | ||
| 1088 | prefetch(&per_cpu(page_states, cpu)); | ||
| 1089 | |||
| 1090 | out = (unsigned long *)ret; | ||
| 1091 | for (off = 0; off < nr; off++) | ||
| 1092 | *out++ += *in++; | ||
| 1093 | } | ||
| 1094 | } | ||
| 1095 | |||
| 1096 | void get_page_state(struct page_state *ret) | ||
| 1097 | { | ||
| 1098 | int nr; | ||
| 1099 | |||
| 1100 | nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); | ||
| 1101 | nr /= sizeof(unsigned long); | ||
| 1102 | |||
| 1103 | __get_page_state(ret, nr + 1); | ||
| 1104 | } | ||
| 1105 | |||
| 1106 | void get_full_page_state(struct page_state *ret) | ||
| 1107 | { | ||
| 1108 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); | ||
| 1109 | } | ||
| 1110 | |||
| 1111 | unsigned long __read_page_state(unsigned offset) | ||
| 1112 | { | ||
| 1113 | unsigned long ret = 0; | ||
| 1114 | int cpu; | ||
| 1115 | |||
| 1116 | for_each_online_cpu(cpu) { | ||
| 1117 | unsigned long in; | ||
| 1118 | |||
| 1119 | in = (unsigned long)&per_cpu(page_states, cpu) + offset; | ||
| 1120 | ret += *((unsigned long *)in); | ||
| 1121 | } | ||
| 1122 | return ret; | ||
| 1123 | } | ||
| 1124 | |||
| 1125 | void __mod_page_state(unsigned offset, unsigned long delta) | ||
| 1126 | { | ||
| 1127 | unsigned long flags; | ||
| 1128 | void* ptr; | ||
| 1129 | |||
| 1130 | local_irq_save(flags); | ||
| 1131 | ptr = &__get_cpu_var(page_states); | ||
| 1132 | *(unsigned long*)(ptr + offset) += delta; | ||
| 1133 | local_irq_restore(flags); | ||
| 1134 | } | ||
| 1135 | |||
| 1136 | EXPORT_SYMBOL(__mod_page_state); | ||
| 1137 | |||
| 1138 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | ||
| 1139 | unsigned long *free, struct pglist_data *pgdat) | ||
| 1140 | { | ||
| 1141 | struct zone *zones = pgdat->node_zones; | ||
| 1142 | int i; | ||
| 1143 | |||
| 1144 | *active = 0; | ||
| 1145 | *inactive = 0; | ||
| 1146 | *free = 0; | ||
| 1147 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
| 1148 | *active += zones[i].nr_active; | ||
| 1149 | *inactive += zones[i].nr_inactive; | ||
| 1150 | *free += zones[i].free_pages; | ||
| 1151 | } | ||
| 1152 | } | ||
| 1153 | |||
| 1154 | void get_zone_counts(unsigned long *active, | ||
| 1155 | unsigned long *inactive, unsigned long *free) | ||
| 1156 | { | ||
| 1157 | struct pglist_data *pgdat; | ||
| 1158 | |||
| 1159 | *active = 0; | ||
| 1160 | *inactive = 0; | ||
| 1161 | *free = 0; | ||
| 1162 | for_each_pgdat(pgdat) { | ||
| 1163 | unsigned long l, m, n; | ||
| 1164 | __get_zone_counts(&l, &m, &n, pgdat); | ||
| 1165 | *active += l; | ||
| 1166 | *inactive += m; | ||
| 1167 | *free += n; | ||
| 1168 | } | ||
| 1169 | } | ||
| 1170 | |||
| 1171 | void si_meminfo(struct sysinfo *val) | ||
| 1172 | { | ||
| 1173 | val->totalram = totalram_pages; | ||
| 1174 | val->sharedram = 0; | ||
| 1175 | val->freeram = nr_free_pages(); | ||
| 1176 | val->bufferram = nr_blockdev_pages(); | ||
| 1177 | #ifdef CONFIG_HIGHMEM | ||
| 1178 | val->totalhigh = totalhigh_pages; | ||
| 1179 | val->freehigh = nr_free_highpages(); | ||
| 1180 | #else | ||
| 1181 | val->totalhigh = 0; | ||
| 1182 | val->freehigh = 0; | ||
| 1183 | #endif | ||
| 1184 | val->mem_unit = PAGE_SIZE; | ||
| 1185 | } | ||
| 1186 | |||
| 1187 | EXPORT_SYMBOL(si_meminfo); | ||
| 1188 | |||
| 1189 | #ifdef CONFIG_NUMA | ||
| 1190 | void si_meminfo_node(struct sysinfo *val, int nid) | ||
| 1191 | { | ||
| 1192 | pg_data_t *pgdat = NODE_DATA(nid); | ||
| 1193 | |||
| 1194 | val->totalram = pgdat->node_present_pages; | ||
| 1195 | val->freeram = nr_free_pages_pgdat(pgdat); | ||
| 1196 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; | ||
| 1197 | val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; | ||
| 1198 | val->mem_unit = PAGE_SIZE; | ||
| 1199 | } | ||
| 1200 | #endif | ||
| 1201 | |||
| 1202 | #define K(x) ((x) << (PAGE_SHIFT-10)) | ||
| 1203 | |||
| 1204 | /* | ||
| 1205 | * Show free area list (used inside shift_scroll-lock stuff) | ||
| 1206 | * We also calculate the percentage fragmentation. We do this by counting the | ||
| 1207 | * memory on each free list with the exception of the first item on the list. | ||
| 1208 | */ | ||
| 1209 | void show_free_areas(void) | ||
| 1210 | { | ||
| 1211 | struct page_state ps; | ||
| 1212 | int cpu, temperature; | ||
| 1213 | unsigned long active; | ||
| 1214 | unsigned long inactive; | ||
| 1215 | unsigned long free; | ||
| 1216 | struct zone *zone; | ||
| 1217 | |||
| 1218 | for_each_zone(zone) { | ||
| 1219 | show_node(zone); | ||
| 1220 | printk("%s per-cpu:", zone->name); | ||
| 1221 | |||
| 1222 | if (!zone->present_pages) { | ||
| 1223 | printk(" empty\n"); | ||
| 1224 | continue; | ||
| 1225 | } else | ||
| 1226 | printk("\n"); | ||
| 1227 | |||
| 1228 | for (cpu = 0; cpu < NR_CPUS; ++cpu) { | ||
| 1229 | struct per_cpu_pageset *pageset; | ||
| 1230 | |||
| 1231 | if (!cpu_possible(cpu)) | ||
| 1232 | continue; | ||
| 1233 | |||
| 1234 | pageset = zone->pageset + cpu; | ||
| 1235 | |||
| 1236 | for (temperature = 0; temperature < 2; temperature++) | ||
| 1237 | printk("cpu %d %s: low %d, high %d, batch %d\n", | ||
| 1238 | cpu, | ||
| 1239 | temperature ? "cold" : "hot", | ||
| 1240 | pageset->pcp[temperature].low, | ||
| 1241 | pageset->pcp[temperature].high, | ||
| 1242 | pageset->pcp[temperature].batch); | ||
| 1243 | } | ||
| 1244 | } | ||
| 1245 | |||
| 1246 | get_page_state(&ps); | ||
| 1247 | get_zone_counts(&active, &inactive, &free); | ||
| 1248 | |||
| 1249 | printk("\nFree pages: %11ukB (%ukB HighMem)\n", | ||
| 1250 | K(nr_free_pages()), | ||
| 1251 | K(nr_free_highpages())); | ||
| 1252 | |||
| 1253 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " | ||
| 1254 | "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", | ||
| 1255 | active, | ||
| 1256 | inactive, | ||
| 1257 | ps.nr_dirty, | ||
| 1258 | ps.nr_writeback, | ||
| 1259 | ps.nr_unstable, | ||
| 1260 | nr_free_pages(), | ||
| 1261 | ps.nr_slab, | ||
| 1262 | ps.nr_mapped, | ||
| 1263 | ps.nr_page_table_pages); | ||
| 1264 | |||
| 1265 | for_each_zone(zone) { | ||
| 1266 | int i; | ||
| 1267 | |||
| 1268 | show_node(zone); | ||
| 1269 | printk("%s" | ||
| 1270 | " free:%lukB" | ||
| 1271 | " min:%lukB" | ||
| 1272 | " low:%lukB" | ||
| 1273 | " high:%lukB" | ||
| 1274 | " active:%lukB" | ||
| 1275 | " inactive:%lukB" | ||
| 1276 | " present:%lukB" | ||
| 1277 | " pages_scanned:%lu" | ||
| 1278 | " all_unreclaimable? %s" | ||
| 1279 | "\n", | ||
| 1280 | zone->name, | ||
| 1281 | K(zone->free_pages), | ||
| 1282 | K(zone->pages_min), | ||
| 1283 | K(zone->pages_low), | ||
| 1284 | K(zone->pages_high), | ||
| 1285 | K(zone->nr_active), | ||
| 1286 | K(zone->nr_inactive), | ||
| 1287 | K(zone->present_pages), | ||
| 1288 | zone->pages_scanned, | ||
| 1289 | (zone->all_unreclaimable ? "yes" : "no") | ||
| 1290 | ); | ||
| 1291 | printk("lowmem_reserve[]:"); | ||
| 1292 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 1293 | printk(" %lu", zone->lowmem_reserve[i]); | ||
| 1294 | printk("\n"); | ||
| 1295 | } | ||
| 1296 | |||
| 1297 | for_each_zone(zone) { | ||
| 1298 | unsigned long nr, flags, order, total = 0; | ||
| 1299 | |||
| 1300 | show_node(zone); | ||
| 1301 | printk("%s: ", zone->name); | ||
| 1302 | if (!zone->present_pages) { | ||
| 1303 | printk("empty\n"); | ||
| 1304 | continue; | ||
| 1305 | } | ||
| 1306 | |||
| 1307 | spin_lock_irqsave(&zone->lock, flags); | ||
| 1308 | for (order = 0; order < MAX_ORDER; order++) { | ||
| 1309 | nr = zone->free_area[order].nr_free; | ||
| 1310 | total += nr << order; | ||
| 1311 | printk("%lu*%lukB ", nr, K(1UL) << order); | ||
| 1312 | } | ||
| 1313 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 1314 | printk("= %lukB\n", K(total)); | ||
| 1315 | } | ||
| 1316 | |||
| 1317 | show_swap_cache_info(); | ||
| 1318 | } | ||
| 1319 | |||
| 1320 | /* | ||
| 1321 | * Builds allocation fallback zone lists. | ||
| 1322 | */ | ||
| 1323 | static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) | ||
| 1324 | { | ||
| 1325 | switch (k) { | ||
| 1326 | struct zone *zone; | ||
| 1327 | default: | ||
| 1328 | BUG(); | ||
| 1329 | case ZONE_HIGHMEM: | ||
| 1330 | zone = pgdat->node_zones + ZONE_HIGHMEM; | ||
| 1331 | if (zone->present_pages) { | ||
| 1332 | #ifndef CONFIG_HIGHMEM | ||
| 1333 | BUG(); | ||
| 1334 | #endif | ||
| 1335 | zonelist->zones[j++] = zone; | ||
| 1336 | } | ||
| 1337 | case ZONE_NORMAL: | ||
| 1338 | zone = pgdat->node_zones + ZONE_NORMAL; | ||
| 1339 | if (zone->present_pages) | ||
| 1340 | zonelist->zones[j++] = zone; | ||
| 1341 | case ZONE_DMA: | ||
| 1342 | zone = pgdat->node_zones + ZONE_DMA; | ||
| 1343 | if (zone->present_pages) | ||
| 1344 | zonelist->zones[j++] = zone; | ||
| 1345 | } | ||
| 1346 | |||
| 1347 | return j; | ||
| 1348 | } | ||
| 1349 | |||
| 1350 | #ifdef CONFIG_NUMA | ||
| 1351 | #define MAX_NODE_LOAD (num_online_nodes()) | ||
| 1352 | static int __initdata node_load[MAX_NUMNODES]; | ||
| 1353 | /** | ||
| 1354 | * find_next_best_node - find the next node that should appear in a given | ||
| 1355 | * node's fallback list | ||
| 1356 | * @node: node whose fallback list we're appending | ||
| 1357 | * @used_node_mask: nodemask_t of already used nodes | ||
| 1358 | * | ||
| 1359 | * We use a number of factors to determine which is the next node that should | ||
| 1360 | * appear on a given node's fallback list. The node should not have appeared | ||
| 1361 | * already in @node's fallback list, and it should be the next closest node | ||
| 1362 | * according to the distance array (which contains arbitrary distance values | ||
| 1363 | * from each node to each node in the system), and should also prefer nodes | ||
| 1364 | * with no CPUs, since presumably they'll have very little allocation pressure | ||
| 1365 | * on them otherwise. | ||
| 1366 | * It returns -1 if no node is found. | ||
| 1367 | */ | ||
| 1368 | static int __init find_next_best_node(int node, nodemask_t *used_node_mask) | ||
| 1369 | { | ||
| 1370 | int i, n, val; | ||
| 1371 | int min_val = INT_MAX; | ||
| 1372 | int best_node = -1; | ||
| 1373 | |||
| 1374 | for_each_online_node(i) { | ||
| 1375 | cpumask_t tmp; | ||
| 1376 | |||
| 1377 | /* Start from local node */ | ||
| 1378 | n = (node+i) % num_online_nodes(); | ||
| 1379 | |||
| 1380 | /* Don't want a node to appear more than once */ | ||
| 1381 | if (node_isset(n, *used_node_mask)) | ||
| 1382 | continue; | ||
| 1383 | |||
| 1384 | /* Use the local node if we haven't already */ | ||
| 1385 | if (!node_isset(node, *used_node_mask)) { | ||
| 1386 | best_node = node; | ||
| 1387 | break; | ||
| 1388 | } | ||
| 1389 | |||
| 1390 | /* Use the distance array to find the distance */ | ||
| 1391 | val = node_distance(node, n); | ||
| 1392 | |||
| 1393 | /* Give preference to headless and unused nodes */ | ||
| 1394 | tmp = node_to_cpumask(n); | ||
| 1395 | if (!cpus_empty(tmp)) | ||
| 1396 | val += PENALTY_FOR_NODE_WITH_CPUS; | ||
| 1397 | |||
| 1398 | /* Slight preference for less loaded node */ | ||
| 1399 | val *= (MAX_NODE_LOAD*MAX_NUMNODES); | ||
| 1400 | val += node_load[n]; | ||
| 1401 | |||
| 1402 | if (val < min_val) { | ||
| 1403 | min_val = val; | ||
| 1404 | best_node = n; | ||
| 1405 | } | ||
| 1406 | } | ||
| 1407 | |||
| 1408 | if (best_node >= 0) | ||
| 1409 | node_set(best_node, *used_node_mask); | ||
| 1410 | |||
| 1411 | return best_node; | ||
| 1412 | } | ||
| 1413 | |||
| 1414 | static void __init build_zonelists(pg_data_t *pgdat) | ||
| 1415 | { | ||
| 1416 | int i, j, k, node, local_node; | ||
| 1417 | int prev_node, load; | ||
| 1418 | struct zonelist *zonelist; | ||
| 1419 | nodemask_t used_mask; | ||
| 1420 | |||
| 1421 | /* initialize zonelists */ | ||
| 1422 | for (i = 0; i < GFP_ZONETYPES; i++) { | ||
| 1423 | zonelist = pgdat->node_zonelists + i; | ||
| 1424 | zonelist->zones[0] = NULL; | ||
| 1425 | } | ||
| 1426 | |||
| 1427 | /* NUMA-aware ordering of nodes */ | ||
| 1428 | local_node = pgdat->node_id; | ||
| 1429 | load = num_online_nodes(); | ||
| 1430 | prev_node = local_node; | ||
| 1431 | nodes_clear(used_mask); | ||
| 1432 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | ||
| 1433 | /* | ||
| 1434 | * We don't want to pressure a particular node. | ||
| 1435 | * So adding penalty to the first node in same | ||
| 1436 | * distance group to make it round-robin. | ||
| 1437 | */ | ||
| 1438 | if (node_distance(local_node, node) != | ||
| 1439 | node_distance(local_node, prev_node)) | ||
| 1440 | node_load[node] += load; | ||
| 1441 | prev_node = node; | ||
| 1442 | load--; | ||
| 1443 | for (i = 0; i < GFP_ZONETYPES; i++) { | ||
| 1444 | zonelist = pgdat->node_zonelists + i; | ||
| 1445 | for (j = 0; zonelist->zones[j] != NULL; j++); | ||
| 1446 | |||
| 1447 | k = ZONE_NORMAL; | ||
| 1448 | if (i & __GFP_HIGHMEM) | ||
| 1449 | k = ZONE_HIGHMEM; | ||
| 1450 | if (i & __GFP_DMA) | ||
| 1451 | k = ZONE_DMA; | ||
| 1452 | |||
| 1453 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); | ||
| 1454 | zonelist->zones[j] = NULL; | ||
| 1455 | } | ||
| 1456 | } | ||
| 1457 | } | ||
| 1458 | |||
| 1459 | #else /* CONFIG_NUMA */ | ||
| 1460 | |||
| 1461 | static void __init build_zonelists(pg_data_t *pgdat) | ||
| 1462 | { | ||
| 1463 | int i, j, k, node, local_node; | ||
| 1464 | |||
| 1465 | local_node = pgdat->node_id; | ||
| 1466 | for (i = 0; i < GFP_ZONETYPES; i++) { | ||
| 1467 | struct zonelist *zonelist; | ||
| 1468 | |||
| 1469 | zonelist = pgdat->node_zonelists + i; | ||
| 1470 | |||
| 1471 | j = 0; | ||
| 1472 | k = ZONE_NORMAL; | ||
| 1473 | if (i & __GFP_HIGHMEM) | ||
| 1474 | k = ZONE_HIGHMEM; | ||
| 1475 | if (i & __GFP_DMA) | ||
| 1476 | k = ZONE_DMA; | ||
| 1477 | |||
| 1478 | j = build_zonelists_node(pgdat, zonelist, j, k); | ||
| 1479 | /* | ||
| 1480 | * Now we build the zonelist so that it contains the zones | ||
| 1481 | * of all the other nodes. | ||
| 1482 | * We don't want to pressure a particular node, so when | ||
| 1483 | * building the zones for node N, we make sure that the | ||
| 1484 | * zones coming right after the local ones are those from | ||
| 1485 | * node N+1 (modulo N) | ||
| 1486 | */ | ||
| 1487 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { | ||
| 1488 | if (!node_online(node)) | ||
| 1489 | continue; | ||
| 1490 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); | ||
| 1491 | } | ||
| 1492 | for (node = 0; node < local_node; node++) { | ||
| 1493 | if (!node_online(node)) | ||
| 1494 | continue; | ||
| 1495 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); | ||
| 1496 | } | ||
| 1497 | |||
| 1498 | zonelist->zones[j] = NULL; | ||
| 1499 | } | ||
| 1500 | } | ||
| 1501 | |||
| 1502 | #endif /* CONFIG_NUMA */ | ||
| 1503 | |||
| 1504 | void __init build_all_zonelists(void) | ||
| 1505 | { | ||
| 1506 | int i; | ||
| 1507 | |||
| 1508 | for_each_online_node(i) | ||
| 1509 | build_zonelists(NODE_DATA(i)); | ||
| 1510 | printk("Built %i zonelists\n", num_online_nodes()); | ||
| 1511 | cpuset_init_current_mems_allowed(); | ||
| 1512 | } | ||
| 1513 | |||
/*
 * Helper functions to size the waitqueue hash table.
 * Essentially these want to choose hash table sizes sufficiently
 * large so that collisions trying to wait on pages are rare.
 * But in fact, the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200. So this is even
 * conservative, even though it seems large.
 *
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 * waitqueues, i.e. the size of the waitq table given the number of pages.
 */
#define PAGES_PER_WAITQUEUE	256

/*
 * Number of waitqueues for a zone of @pages pages: the smallest power of
 * two that is >= pages / PAGES_PER_WAITQUEUE, clamped to [4, 4096].
 */
static inline unsigned long wait_table_size(unsigned long pages)
{
	unsigned long wanted = pages / PAGES_PER_WAITQUEUE;
	unsigned long size;

	for (size = 1; size < wanted; size <<= 1)
		;

	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	if (size > 4096UL)
		size = 4096UL;
	if (size < 4UL)
		size = 4UL;

	return size;
}
| 1545 | |||
/*
 * This is an integer logarithm so that shifts can be used later
 * to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
 *
 * The result is ffz() applied to the complement of @size, i.e. the
 * position of the lowest set bit of @size.
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	unsigned long inverted = ~size;

	return ffz(inverted);
}
| 1555 | |||
/* Round x up to the next multiple of sizeof(long). */
#define LONG_ALIGN(x)	(((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))
| 1557 | |||
| 1558 | static void __init calculate_zone_totalpages(struct pglist_data *pgdat, | ||
| 1559 | unsigned long *zones_size, unsigned long *zholes_size) | ||
| 1560 | { | ||
| 1561 | unsigned long realtotalpages, totalpages = 0; | ||
| 1562 | int i; | ||
| 1563 | |||
| 1564 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 1565 | totalpages += zones_size[i]; | ||
| 1566 | pgdat->node_spanned_pages = totalpages; | ||
| 1567 | |||
| 1568 | realtotalpages = totalpages; | ||
| 1569 | if (zholes_size) | ||
| 1570 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 1571 | realtotalpages -= zholes_size[i]; | ||
| 1572 | pgdat->node_present_pages = realtotalpages; | ||
| 1573 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); | ||
| 1574 | } | ||
| 1575 | |||
| 1576 | |||
| 1577 | /* | ||
| 1578 | * Initially all pages are reserved - free ones are freed | ||
| 1579 | * up by free_all_bootmem() once the early boot process is | ||
| 1580 | * done. Non-atomic initialization, single-pass. | ||
| 1581 | */ | ||
| 1582 | void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, | ||
| 1583 | unsigned long start_pfn) | ||
| 1584 | { | ||
| 1585 | struct page *start = pfn_to_page(start_pfn); | ||
| 1586 | struct page *page; | ||
| 1587 | |||
| 1588 | for (page = start; page < (start + size); page++) { | ||
| 1589 | set_page_zone(page, NODEZONE(nid, zone)); | ||
| 1590 | set_page_count(page, 0); | ||
| 1591 | reset_page_mapcount(page); | ||
| 1592 | SetPageReserved(page); | ||
| 1593 | INIT_LIST_HEAD(&page->lru); | ||
| 1594 | #ifdef WANT_PAGE_VIRTUAL | ||
| 1595 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ | ||
| 1596 | if (!is_highmem_idx(zone)) | ||
| 1597 | set_page_address(page, __va(start_pfn << PAGE_SHIFT)); | ||
| 1598 | #endif | ||
| 1599 | start_pfn++; | ||
| 1600 | } | ||
| 1601 | } | ||
| 1602 | |||
| 1603 | void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | ||
| 1604 | unsigned long size) | ||
| 1605 | { | ||
| 1606 | int order; | ||
| 1607 | for (order = 0; order < MAX_ORDER ; order++) { | ||
| 1608 | INIT_LIST_HEAD(&zone->free_area[order].free_list); | ||
| 1609 | zone->free_area[order].nr_free = 0; | ||
| 1610 | } | ||
| 1611 | } | ||
| 1612 | |||
| 1613 | #ifndef __HAVE_ARCH_MEMMAP_INIT | ||
| 1614 | #define memmap_init(size, nid, zone, start_pfn) \ | ||
| 1615 | memmap_init_zone((size), (nid), (zone), (start_pfn)) | ||
| 1616 | #endif | ||
| 1617 | |||
| 1618 | /* | ||
| 1619 | * Set up the zone data structures: | ||
| 1620 | * - mark all pages reserved | ||
| 1621 | * - mark all memory queues empty | ||
| 1622 | * - clear the memory bitmaps | ||
| 1623 | */ | ||
| 1624 | static void __init free_area_init_core(struct pglist_data *pgdat, | ||
| 1625 | unsigned long *zones_size, unsigned long *zholes_size) | ||
| 1626 | { | ||
| 1627 | unsigned long i, j; | ||
| 1628 | const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); | ||
| 1629 | int cpu, nid = pgdat->node_id; | ||
| 1630 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | ||
| 1631 | |||
| 1632 | pgdat->nr_zones = 0; | ||
| 1633 | init_waitqueue_head(&pgdat->kswapd_wait); | ||
| 1634 | pgdat->kswapd_max_order = 0; | ||
| 1635 | |||
| 1636 | for (j = 0; j < MAX_NR_ZONES; j++) { | ||
| 1637 | struct zone *zone = pgdat->node_zones + j; | ||
| 1638 | unsigned long size, realsize; | ||
| 1639 | unsigned long batch; | ||
| 1640 | |||
| 1641 | zone_table[NODEZONE(nid, j)] = zone; | ||
| 1642 | realsize = size = zones_size[j]; | ||
| 1643 | if (zholes_size) | ||
| 1644 | realsize -= zholes_size[j]; | ||
| 1645 | |||
| 1646 | if (j == ZONE_DMA || j == ZONE_NORMAL) | ||
| 1647 | nr_kernel_pages += realsize; | ||
| 1648 | nr_all_pages += realsize; | ||
| 1649 | |||
| 1650 | zone->spanned_pages = size; | ||
| 1651 | zone->present_pages = realsize; | ||
| 1652 | zone->name = zone_names[j]; | ||
| 1653 | spin_lock_init(&zone->lock); | ||
| 1654 | spin_lock_init(&zone->lru_lock); | ||
| 1655 | zone->zone_pgdat = pgdat; | ||
| 1656 | zone->free_pages = 0; | ||
| 1657 | |||
| 1658 | zone->temp_priority = zone->prev_priority = DEF_PRIORITY; | ||
| 1659 | |||
| 1660 | /* | ||
| 1661 | * The per-cpu-pages pools are set to around 1000th of the | ||
| 1662 | * size of the zone. But no more than 1/4 of a meg - there's | ||
| 1663 | * no point in going beyond the size of L2 cache. | ||
| 1664 | * | ||
| 1665 | * OK, so we don't know how big the cache is. So guess. | ||
| 1666 | */ | ||
| 1667 | batch = zone->present_pages / 1024; | ||
| 1668 | if (batch * PAGE_SIZE > 256 * 1024) | ||
| 1669 | batch = (256 * 1024) / PAGE_SIZE; | ||
| 1670 | batch /= 4; /* We effectively *= 4 below */ | ||
| 1671 | if (batch < 1) | ||
| 1672 | batch = 1; | ||
| 1673 | |||
| 1674 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
| 1675 | struct per_cpu_pages *pcp; | ||
| 1676 | |||
| 1677 | pcp = &zone->pageset[cpu].pcp[0]; /* hot */ | ||
| 1678 | pcp->count = 0; | ||
| 1679 | pcp->low = 2 * batch; | ||
| 1680 | pcp->high = 6 * batch; | ||
| 1681 | pcp->batch = 1 * batch; | ||
| 1682 | INIT_LIST_HEAD(&pcp->list); | ||
| 1683 | |||
| 1684 | pcp = &zone->pageset[cpu].pcp[1]; /* cold */ | ||
| 1685 | pcp->count = 0; | ||
| 1686 | pcp->low = 0; | ||
| 1687 | pcp->high = 2 * batch; | ||
| 1688 | pcp->batch = 1 * batch; | ||
| 1689 | INIT_LIST_HEAD(&pcp->list); | ||
| 1690 | } | ||
| 1691 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | ||
| 1692 | zone_names[j], realsize, batch); | ||
| 1693 | INIT_LIST_HEAD(&zone->active_list); | ||
| 1694 | INIT_LIST_HEAD(&zone->inactive_list); | ||
| 1695 | zone->nr_scan_active = 0; | ||
| 1696 | zone->nr_scan_inactive = 0; | ||
| 1697 | zone->nr_active = 0; | ||
| 1698 | zone->nr_inactive = 0; | ||
| 1699 | if (!size) | ||
| 1700 | continue; | ||
| 1701 | |||
| 1702 | /* | ||
| 1703 | * The per-page waitqueue mechanism uses hashed waitqueues | ||
| 1704 | * per zone. | ||
| 1705 | */ | ||
| 1706 | zone->wait_table_size = wait_table_size(size); | ||
| 1707 | zone->wait_table_bits = | ||
| 1708 | wait_table_bits(zone->wait_table_size); | ||
| 1709 | zone->wait_table = (wait_queue_head_t *) | ||
| 1710 | alloc_bootmem_node(pgdat, zone->wait_table_size | ||
| 1711 | * sizeof(wait_queue_head_t)); | ||
| 1712 | |||
| 1713 | for(i = 0; i < zone->wait_table_size; ++i) | ||
| 1714 | init_waitqueue_head(zone->wait_table + i); | ||
| 1715 | |||
| 1716 | pgdat->nr_zones = j+1; | ||
| 1717 | |||
| 1718 | zone->zone_mem_map = pfn_to_page(zone_start_pfn); | ||
| 1719 | zone->zone_start_pfn = zone_start_pfn; | ||
| 1720 | |||
| 1721 | if ((zone_start_pfn) & (zone_required_alignment-1)) | ||
| 1722 | printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n"); | ||
| 1723 | |||
| 1724 | memmap_init(size, nid, j, zone_start_pfn); | ||
| 1725 | |||
| 1726 | zone_start_pfn += size; | ||
| 1727 | |||
| 1728 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); | ||
| 1729 | } | ||
| 1730 | } | ||
| 1731 | |||
| 1732 | static void __init alloc_node_mem_map(struct pglist_data *pgdat) | ||
| 1733 | { | ||
| 1734 | unsigned long size; | ||
| 1735 | |||
| 1736 | /* Skip empty nodes */ | ||
| 1737 | if (!pgdat->node_spanned_pages) | ||
| 1738 | return; | ||
| 1739 | |||
| 1740 | /* ia64 gets its own node_mem_map, before this, without bootmem */ | ||
| 1741 | if (!pgdat->node_mem_map) { | ||
| 1742 | size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); | ||
| 1743 | pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); | ||
| 1744 | } | ||
| 1745 | #ifndef CONFIG_DISCONTIGMEM | ||
| 1746 | /* | ||
| 1747 | * With no DISCONTIG, the global mem_map is just set as node 0's | ||
| 1748 | */ | ||
| 1749 | if (pgdat == NODE_DATA(0)) | ||
| 1750 | mem_map = NODE_DATA(0)->node_mem_map; | ||
| 1751 | #endif | ||
| 1752 | } | ||
| 1753 | |||
| 1754 | void __init free_area_init_node(int nid, struct pglist_data *pgdat, | ||
| 1755 | unsigned long *zones_size, unsigned long node_start_pfn, | ||
| 1756 | unsigned long *zholes_size) | ||
| 1757 | { | ||
| 1758 | pgdat->node_id = nid; | ||
| 1759 | pgdat->node_start_pfn = node_start_pfn; | ||
| 1760 | calculate_zone_totalpages(pgdat, zones_size, zholes_size); | ||
| 1761 | |||
| 1762 | alloc_node_mem_map(pgdat); | ||
| 1763 | |||
| 1764 | free_area_init_core(pgdat, zones_size, zholes_size); | ||
| 1765 | } | ||
| 1766 | |||
| 1767 | #ifndef CONFIG_DISCONTIGMEM | ||
| 1768 | static bootmem_data_t contig_bootmem_data; | ||
| 1769 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; | ||
| 1770 | |||
| 1771 | EXPORT_SYMBOL(contig_page_data); | ||
| 1772 | |||
| 1773 | void __init free_area_init(unsigned long *zones_size) | ||
| 1774 | { | ||
| 1775 | free_area_init_node(0, &contig_page_data, zones_size, | ||
| 1776 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | ||
| 1777 | } | ||
| 1778 | #endif | ||
| 1779 | |||
| 1780 | #ifdef CONFIG_PROC_FS | ||
| 1781 | |||
| 1782 | #include <linux/seq_file.h> | ||
| 1783 | |||
| 1784 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
| 1785 | { | ||
| 1786 | pg_data_t *pgdat; | ||
| 1787 | loff_t node = *pos; | ||
| 1788 | |||
| 1789 | for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) | ||
| 1790 | --node; | ||
| 1791 | |||
| 1792 | return pgdat; | ||
| 1793 | } | ||
| 1794 | |||
| 1795 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
| 1796 | { | ||
| 1797 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 1798 | |||
| 1799 | (*pos)++; | ||
| 1800 | return pgdat->pgdat_next; | ||
| 1801 | } | ||
| 1802 | |||
/* seq_file ->stop: the walk holds no state, so there is nothing to release. */
static void frag_stop(struct seq_file *m, void *arg)
{
	/* intentionally empty */
}
| 1806 | |||
| 1807 | /* | ||
| 1808 | * This walks the free areas for each zone. | ||
| 1809 | */ | ||
| 1810 | static int frag_show(struct seq_file *m, void *arg) | ||
| 1811 | { | ||
| 1812 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 1813 | struct zone *zone; | ||
| 1814 | struct zone *node_zones = pgdat->node_zones; | ||
| 1815 | unsigned long flags; | ||
| 1816 | int order; | ||
| 1817 | |||
| 1818 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
| 1819 | if (!zone->present_pages) | ||
| 1820 | continue; | ||
| 1821 | |||
| 1822 | spin_lock_irqsave(&zone->lock, flags); | ||
| 1823 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
| 1824 | for (order = 0; order < MAX_ORDER; ++order) | ||
| 1825 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
| 1826 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 1827 | seq_putc(m, '\n'); | ||
| 1828 | } | ||
| 1829 | return 0; | ||
| 1830 | } | ||
| 1831 | |||
/*
 * seq_file iterator over the pgdat list: one record per node, each record
 * rendered by frag_show() as one line per populated zone.
 */
struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};
| 1838 | |||
/*
 * Labels printed by vmstat_show().
 *
 * vmstat_start()/vmstat_next() treat the struct page_state snapshot as an
 * array of unsigned long and vmstat_show() uses the element index to pick
 * the label, so entry N here names the N'th unsigned long of that struct.
 * The order of this table must therefore follow the struct layout, and
 * ARRAY_SIZE(vmstat_text) bounds how many counters are reported.
 */
static char *vmstat_text[] = {
	"nr_dirty",
	"nr_writeback",
	"nr_unstable",
	"nr_page_table_pages",
	"nr_mapped",
	"nr_slab",

	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",
	"pgalloc_high",

	"pgalloc_normal",
	"pgalloc_dma",
	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",
	"pgrefill_high",
	"pgrefill_normal",
	"pgrefill_dma",

	"pgsteal_high",
	"pgsteal_normal",
	"pgsteal_dma",
	"pgscan_kswapd_high",
	"pgscan_kswapd_normal",

	"pgscan_kswapd_dma",
	"pgscan_direct_high",
	"pgscan_direct_normal",
	"pgscan_direct_dma",
	"pginodesteal",

	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
};
| 1885 | |||
| 1886 | static void *vmstat_start(struct seq_file *m, loff_t *pos) | ||
| 1887 | { | ||
| 1888 | struct page_state *ps; | ||
| 1889 | |||
| 1890 | if (*pos >= ARRAY_SIZE(vmstat_text)) | ||
| 1891 | return NULL; | ||
| 1892 | |||
| 1893 | ps = kmalloc(sizeof(*ps), GFP_KERNEL); | ||
| 1894 | m->private = ps; | ||
| 1895 | if (!ps) | ||
| 1896 | return ERR_PTR(-ENOMEM); | ||
| 1897 | get_full_page_state(ps); | ||
| 1898 | ps->pgpgin /= 2; /* sectors -> kbytes */ | ||
| 1899 | ps->pgpgout /= 2; | ||
| 1900 | return (unsigned long *)ps + *pos; | ||
| 1901 | } | ||
| 1902 | |||
| 1903 | static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) | ||
| 1904 | { | ||
| 1905 | (*pos)++; | ||
| 1906 | if (*pos >= ARRAY_SIZE(vmstat_text)) | ||
| 1907 | return NULL; | ||
| 1908 | return (unsigned long *)m->private + *pos; | ||
| 1909 | } | ||
| 1910 | |||
| 1911 | static int vmstat_show(struct seq_file *m, void *arg) | ||
| 1912 | { | ||
| 1913 | unsigned long *l = arg; | ||
| 1914 | unsigned long off = l - (unsigned long *)m->private; | ||
| 1915 | |||
| 1916 | seq_printf(m, "%s %lu\n", vmstat_text[off], *l); | ||
| 1917 | return 0; | ||
| 1918 | } | ||
| 1919 | |||
| 1920 | static void vmstat_stop(struct seq_file *m, void *arg) | ||
| 1921 | { | ||
| 1922 | kfree(m->private); | ||
| 1923 | m->private = NULL; | ||
| 1924 | } | ||
| 1925 | |||
/*
 * seq_file iterator over the page state snapshot: one record per entry of
 * vmstat_text[], each printed by vmstat_show() as "<label> <value>".
 */
struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};
| 1932 | |||
| 1933 | #endif /* CONFIG_PROC_FS */ | ||
| 1934 | |||
#ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU notifier callback.  Only CPU_DEAD is acted upon; for that event the
 * state the dead cpu left behind is folded into global or local state:
 *
 *  - its nr_pagecache_local delta is added to nr_pagecache and zeroed,
 *  - __drain_pages() is called for it,
 *  - each unsigned long of its page_states is added to the running cpu's
 *    page_states and then zeroed.
 *
 * The last two steps run with local interrupts disabled.  NOTIFY_OK is
 * returned for every action.
 */
static int page_alloc_cpu_notify(struct notifier_block *self,
				 unsigned long action, void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	long *pagecache_delta;
	unsigned long *ours, *theirs;
	int i;

	if (action != CPU_DEAD)
		return NOTIFY_OK;

	/* Drain local pagecache count. */
	pagecache_delta = &per_cpu(nr_pagecache_local, cpu);
	atomic_add(*pagecache_delta, &nr_pagecache);
	*pagecache_delta = 0;

	local_irq_disable();
	__drain_pages(cpu);

	/* Add dead cpu's page_states to our own. */
	ours = (unsigned long *)&__get_cpu_var(page_states);
	theirs = (unsigned long *)&per_cpu(page_states, cpu);
	for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); i++) {
		ours[i] += theirs[i];
		theirs[i] = 0;
	}

	local_irq_enable();

	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
| 1968 | |||
/*
 * page_alloc_init - init-time hook of the page allocator.
 *
 * Its only job is to hand page_alloc_cpu_notify() to hotcpu_notifier(),
 * with 0 as the second argument.
 *
 * NOTE(review): page_alloc_cpu_notify() is only defined under
 * CONFIG_HOTPLUG_CPU; presumably hotcpu_notifier() discards its arguments
 * when that option is off - confirm against its definition.
 */
void __init page_alloc_init(void)
{
	hotcpu_notifier(page_alloc_cpu_notify, 0);
}
| 1973 | |||
| 1974 | /* | ||
| 1975 | * setup_per_zone_lowmem_reserve - called whenever | ||
| 1976 | * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone | ||
| 1977 | * has a correct pages reserved value, so an adequate number of | ||
| 1978 | * pages are left in the zone after a successful __alloc_pages(). | ||
| 1979 | */ | ||
| 1980 | static void setup_per_zone_lowmem_reserve(void) | ||
| 1981 | { | ||
| 1982 | struct pglist_data *pgdat; | ||
| 1983 | int j, idx; | ||
| 1984 | |||
| 1985 | for_each_pgdat(pgdat) { | ||
| 1986 | for (j = 0; j < MAX_NR_ZONES; j++) { | ||
| 1987 | struct zone *zone = pgdat->node_zones + j; | ||
| 1988 | unsigned long present_pages = zone->present_pages; | ||
| 1989 | |||
| 1990 | zone->lowmem_reserve[j] = 0; | ||
| 1991 | |||
| 1992 | for (idx = j-1; idx >= 0; idx--) { | ||
| 1993 | struct zone *lower_zone; | ||
| 1994 | |||
| 1995 | if (sysctl_lowmem_reserve_ratio[idx] < 1) | ||
| 1996 | sysctl_lowmem_reserve_ratio[idx] = 1; | ||
| 1997 | |||
| 1998 | lower_zone = pgdat->node_zones + idx; | ||
| 1999 | lower_zone->lowmem_reserve[j] = present_pages / | ||
| 2000 | sysctl_lowmem_reserve_ratio[idx]; | ||
| 2001 | present_pages += lower_zone->present_pages; | ||
| 2002 | } | ||
| 2003 | } | ||
| 2004 | } | ||
| 2005 | } | ||
| 2006 | |||
| 2007 | /* | ||
| 2008 | * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures | ||
| 2009 | * that the pages_{min,low,high} values for each zone are set correctly | ||
| 2010 | * with respect to min_free_kbytes. | ||
| 2011 | */ | ||
| 2012 | static void setup_per_zone_pages_min(void) | ||
| 2013 | { | ||
| 2014 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | ||
| 2015 | unsigned long lowmem_pages = 0; | ||
| 2016 | struct zone *zone; | ||
| 2017 | unsigned long flags; | ||
| 2018 | |||
| 2019 | /* Calculate total number of !ZONE_HIGHMEM pages */ | ||
| 2020 | for_each_zone(zone) { | ||
| 2021 | if (!is_highmem(zone)) | ||
| 2022 | lowmem_pages += zone->present_pages; | ||
| 2023 | } | ||
| 2024 | |||
| 2025 | for_each_zone(zone) { | ||
| 2026 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
| 2027 | if (is_highmem(zone)) { | ||
| 2028 | /* | ||
| 2029 | * Often, highmem doesn't need to reserve any pages. | ||
| 2030 | * But the pages_min/low/high values are also used for | ||
| 2031 | * batching up page reclaim activity so we need a | ||
| 2032 | * decent value here. | ||
| 2033 | */ | ||
| 2034 | int min_pages; | ||
| 2035 | |||
| 2036 | min_pages = zone->present_pages / 1024; | ||
| 2037 | if (min_pages < SWAP_CLUSTER_MAX) | ||
| 2038 | min_pages = SWAP_CLUSTER_MAX; | ||
| 2039 | if (min_pages > 128) | ||
| 2040 | min_pages = 128; | ||
| 2041 | zone->pages_min = min_pages; | ||
| 2042 | } else { | ||
| 2043 | /* if it's a lowmem zone, reserve a number of pages | ||
| 2044 | * proportionate to the zone's size. | ||
| 2045 | */ | ||
| 2046 | zone->pages_min = (pages_min * zone->present_pages) / | ||
| 2047 | lowmem_pages; | ||
| 2048 | } | ||
| 2049 | |||
| 2050 | /* | ||
| 2051 | * When interpreting these watermarks, just keep in mind that: | ||
| 2052 | * zone->pages_min == (zone->pages_min * 4) / 4; | ||
| 2053 | */ | ||
| 2054 | zone->pages_low = (zone->pages_min * 5) / 4; | ||
| 2055 | zone->pages_high = (zone->pages_min * 6) / 4; | ||
| 2056 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
| 2057 | } | ||
| 2058 | } | ||
| 2059 | |||
| 2060 | /* | ||
| 2061 | * Initialise min_free_kbytes. | ||
| 2062 | * | ||
| 2063 | * For small machines we want it small (128k min). For large machines | ||
| 2064 | * we want it large (64MB max). But it is not linear, because network | ||
| 2065 | * bandwidth does not increase linearly with machine size. We use | ||
| 2066 | * | ||
| 2067 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: | ||
| 2068 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) | ||
| 2069 | * | ||
| 2070 | * which yields | ||
| 2071 | * | ||
| 2072 | * 16MB: 512k | ||
| 2073 | * 32MB: 724k | ||
| 2074 | * 64MB: 1024k | ||
| 2075 | * 128MB: 1448k | ||
| 2076 | * 256MB: 2048k | ||
| 2077 | * 512MB: 2896k | ||
| 2078 | * 1024MB: 4096k | ||
| 2079 | * 2048MB: 5792k | ||
| 2080 | * 4096MB: 8192k | ||
| 2081 | * 8192MB: 11584k | ||
| 2082 | * 16384MB: 16384k | ||
| 2083 | */ | ||
| 2084 | static int __init init_per_zone_pages_min(void) | ||
| 2085 | { | ||
| 2086 | unsigned long lowmem_kbytes; | ||
| 2087 | |||
| 2088 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); | ||
| 2089 | |||
| 2090 | min_free_kbytes = int_sqrt(lowmem_kbytes * 16); | ||
| 2091 | if (min_free_kbytes < 128) | ||
| 2092 | min_free_kbytes = 128; | ||
| 2093 | if (min_free_kbytes > 65536) | ||
| 2094 | min_free_kbytes = 65536; | ||
| 2095 | setup_per_zone_pages_min(); | ||
| 2096 | setup_per_zone_lowmem_reserve(); | ||
| 2097 | return 0; | ||
| 2098 | } | ||
| 2099 | module_init(init_per_zone_pages_min) | ||
| 2100 | |||
| 2101 | /* | ||
| 2102 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so | ||
| 2103 | * that we can call two helper functions whenever min_free_kbytes | ||
| 2104 | * changes. | ||
| 2105 | */ | ||
| 2106 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | ||
| 2107 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | ||
| 2108 | { | ||
| 2109 | proc_dointvec(table, write, file, buffer, length, ppos); | ||
| 2110 | setup_per_zone_pages_min(); | ||
| 2111 | return 0; | ||
| 2112 | } | ||
| 2113 | |||
| 2114 | /* | ||
| 2115 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around | ||
| 2116 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() | ||
| 2117 | * whenever sysctl_lowmem_reserve_ratio changes. | ||
| 2118 | * | ||
| 2119 | * The reserve ratio obviously has absolutely no relation with the | ||
| 2120 | * pages_min watermarks. The lowmem reserve ratio can only make sense | ||
| 2121 | * if in function of the boot time zone sizes. | ||
| 2122 | */ | ||
| 2123 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | ||
| 2124 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | ||
| 2125 | { | ||
| 2126 | proc_dointvec_minmax(table, write, file, buffer, length, ppos); | ||
| 2127 | setup_per_zone_lowmem_reserve(); | ||
| 2128 | return 0; | ||
| 2129 | } | ||
| 2130 | |||
| 2131 | __initdata int hashdist = HASHDIST_DEFAULT; | ||
| 2132 | |||
| 2133 | #ifdef CONFIG_NUMA | ||
| 2134 | static int __init set_hashdist(char *str) | ||
| 2135 | { | ||
| 2136 | if (!str) | ||
| 2137 | return 0; | ||
| 2138 | hashdist = simple_strtoul(str, &str, 0); | ||
| 2139 | return 1; | ||
| 2140 | } | ||
| 2141 | __setup("hashdist=", set_hashdist); | ||
| 2142 | #endif | ||
| 2143 | |||
/*
 * allocate a large system hash table from bootmem
 * - it is assumed that the hash table must contain an exact power-of-2
 *   quantity of entries
 * - limit is the number of hash buckets, not the total allocation size
 *
 * @tablename:   name used in the boot message and in the panic message
 * @bucketsize:  size of one bucket in bytes
 * @numentries:  requested number of buckets; 0 means "size it from the
 *               amount of memory"
 * @scale:       with @numentries == 0, aim for one bucket per 2^scale
 *               bytes of memory
 * @flags:       HASH_HIGHMEM sizes from nr_all_pages instead of
 *               nr_kernel_pages; HASH_EARLY allocates from bootmem
 * @_hash_shift: if non-NULL, receives log2 of the final bucket count
 * @_hash_mask:  if non-NULL, receives the final bucket count minus one
 * @limit:       upper bound on the bucket count; 0 selects a default
 *               derived from 1/16 of total memory
 *
 * Returns the table.  Panics if no allocation succeeds.
 */
void *__init alloc_large_system_hash(const char *tablename,
				     unsigned long bucketsize,
				     unsigned long numentries,
				     int scale,
				     int flags,
				     unsigned int *_hash_shift,
				     unsigned int *_hash_mask,
				     unsigned long limit)
{
	unsigned long long max = limit;
	unsigned long log2qty, size;
	void *table = NULL;

	/* allow the kernel cmdline to have a say */
	if (!numentries) {
		/* round applicable memory size up to nearest megabyte */
		numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
		numentries >>= 20 - PAGE_SHIFT;
		numentries <<= 20 - PAGE_SHIFT;

		/* limit to 1 bucket per 2^scale bytes of low memory */
		if (scale > PAGE_SHIFT)
			numentries >>= (scale - PAGE_SHIFT);
		else
			numentries <<= (PAGE_SHIFT - scale);
	}
	/*
	 * rounded up to nearest power of 2 in size
	 *
	 * NOTE(review): assuming long_log2() is floor(log2), a value that is
	 * already a power of two is doubled by this expression - confirm
	 * whether that is intended.
	 */
	numentries = 1UL << (long_log2(numentries) + 1);

	/* limit allocation size to 1/16 total memory by default */
	if (max == 0) {
		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
		/* convert the byte budget into a bucket count */
		do_div(max, bucketsize);
	}

	if (numentries > max)
		numentries = max;

	/*
	 * From here on the table size is 1 << log2qty buckets, so a clamped
	 * numentries that is not a power of two is rounded down.
	 */
	log2qty = long_log2(numentries);

	/*
	 * Try to allocate; on failure halve the bucket count and retry, for
	 * as long as the table is larger than one page and log2qty stays
	 * non-zero after the decrement.
	 */
	do {
		size = bucketsize << log2qty;
		if (flags & HASH_EARLY)
			table = alloc_bootmem(size);
		else if (hashdist)
			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
		else {
			unsigned long order;
			/* smallest page order whose span covers size bytes */
			for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
				;
			table = (void*) __get_free_pages(GFP_ATOMIC, order);
		}
	} while (!table && size > PAGE_SIZE && --log2qty);

	if (!table)
		panic("Failed to allocate %s hash table\n", tablename);

	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
	       tablename,
	       (1U << log2qty),
	       long_log2(size) - PAGE_SHIFT,
	       size);

	if (_hash_shift)
		*_hash_shift = log2qty;
	if (_hash_mask)
		*_hash_mask = (1 << log2qty) - 1;

	return table;
}
diff --git a/mm/page_io.c b/mm/page_io.c new file mode 100644 index 000000000000..667c76df1ec2 --- /dev/null +++ b/mm/page_io.c | |||
| @@ -0,0 +1,160 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/page_io.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | ||
| 5 | * | ||
| 6 | * Swap reorganised 29.12.95, | ||
| 7 | * Asynchronous swapping added 30.12.95. Stephen Tweedie | ||
| 8 | * Removed race in async swapping. 14.4.1996. Bruno Haible | ||
| 9 | * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie | ||
| 10 | * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman | ||
| 11 | */ | ||
| 12 | |||
| 13 | #include <linux/mm.h> | ||
| 14 | #include <linux/kernel_stat.h> | ||
| 15 | #include <linux/pagemap.h> | ||
| 16 | #include <linux/swap.h> | ||
| 17 | #include <linux/bio.h> | ||
| 18 | #include <linux/swapops.h> | ||
| 19 | #include <linux/writeback.h> | ||
| 20 | #include <asm/pgtable.h> | ||
| 21 | |||
| 22 | static struct bio *get_swap_bio(unsigned int __nocast gfp_flags, pgoff_t index, | ||
| 23 | struct page *page, bio_end_io_t end_io) | ||
| 24 | { | ||
| 25 | struct bio *bio; | ||
| 26 | |||
| 27 | bio = bio_alloc(gfp_flags, 1); | ||
| 28 | if (bio) { | ||
| 29 | struct swap_info_struct *sis; | ||
| 30 | swp_entry_t entry = { .val = index, }; | ||
| 31 | |||
| 32 | sis = get_swap_info_struct(swp_type(entry)); | ||
| 33 | bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * | ||
| 34 | (PAGE_SIZE >> 9); | ||
| 35 | bio->bi_bdev = sis->bdev; | ||
| 36 | bio->bi_io_vec[0].bv_page = page; | ||
| 37 | bio->bi_io_vec[0].bv_len = PAGE_SIZE; | ||
| 38 | bio->bi_io_vec[0].bv_offset = 0; | ||
| 39 | bio->bi_vcnt = 1; | ||
| 40 | bio->bi_idx = 0; | ||
| 41 | bio->bi_size = PAGE_SIZE; | ||
| 42 | bio->bi_end_io = end_io; | ||
| 43 | } | ||
| 44 | return bio; | ||
| 45 | } | ||
| 46 | |||
| 47 | static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err) | ||
| 48 | { | ||
| 49 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 50 | struct page *page = bio->bi_io_vec[0].bv_page; | ||
| 51 | |||
| 52 | if (bio->bi_size) | ||
| 53 | return 1; | ||
| 54 | |||
| 55 | if (!uptodate) | ||
| 56 | SetPageError(page); | ||
| 57 | end_page_writeback(page); | ||
| 58 | bio_put(bio); | ||
| 59 | return 0; | ||
| 60 | } | ||
| 61 | |||
| 62 | static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) | ||
| 63 | { | ||
| 64 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 65 | struct page *page = bio->bi_io_vec[0].bv_page; | ||
| 66 | |||
| 67 | if (bio->bi_size) | ||
| 68 | return 1; | ||
| 69 | |||
| 70 | if (!uptodate) { | ||
| 71 | SetPageError(page); | ||
| 72 | ClearPageUptodate(page); | ||
| 73 | } else { | ||
| 74 | SetPageUptodate(page); | ||
| 75 | } | ||
| 76 | unlock_page(page); | ||
| 77 | bio_put(bio); | ||
| 78 | return 0; | ||
| 79 | } | ||
| 80 | |||
| 81 | /* | ||
| 82 | * We may have stale swap cache pages in memory: notice | ||
| 83 | * them here and get rid of the unnecessary final write. | ||
| 84 | */ | ||
| 85 | int swap_writepage(struct page *page, struct writeback_control *wbc) | ||
| 86 | { | ||
| 87 | struct bio *bio; | ||
| 88 | int ret = 0, rw = WRITE; | ||
| 89 | |||
| 90 | if (remove_exclusive_swap_page(page)) { | ||
| 91 | unlock_page(page); | ||
| 92 | goto out; | ||
| 93 | } | ||
| 94 | bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write); | ||
| 95 | if (bio == NULL) { | ||
| 96 | set_page_dirty(page); | ||
| 97 | unlock_page(page); | ||
| 98 | ret = -ENOMEM; | ||
| 99 | goto out; | ||
| 100 | } | ||
| 101 | if (wbc->sync_mode == WB_SYNC_ALL) | ||
| 102 | rw |= (1 << BIO_RW_SYNC); | ||
| 103 | inc_page_state(pswpout); | ||
| 104 | set_page_writeback(page); | ||
| 105 | unlock_page(page); | ||
| 106 | submit_bio(rw, bio); | ||
| 107 | out: | ||
| 108 | return ret; | ||
| 109 | } | ||
| 110 | |||
| 111 | int swap_readpage(struct file *file, struct page *page) | ||
| 112 | { | ||
| 113 | struct bio *bio; | ||
| 114 | int ret = 0; | ||
| 115 | |||
| 116 | BUG_ON(!PageLocked(page)); | ||
| 117 | ClearPageUptodate(page); | ||
| 118 | bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read); | ||
| 119 | if (bio == NULL) { | ||
| 120 | unlock_page(page); | ||
| 121 | ret = -ENOMEM; | ||
| 122 | goto out; | ||
| 123 | } | ||
| 124 | inc_page_state(pswpin); | ||
| 125 | submit_bio(READ, bio); | ||
| 126 | out: | ||
| 127 | return ret; | ||
| 128 | } | ||
| 129 | |||
#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_PM_DISK)
/*
 * A scruffy utility function to read or write an arbitrary swap page
 * and wait on the I/O.  The caller must have a ref on the page.
 *
 * The page is locked here and unlocked by the completion handler, which
 * is what wait_on_page_locked() then waits for.  end_swap_bio_read() is
 * used even for writes, because it happens to do what we want.
 *
 * Returns 0 on success, -ENOMEM if no bio could be allocated, and -EIO if
 * the page ends up not uptodate or flagged with an error.
 */
int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
{
	struct bio *bio;

	lock_page(page);

	bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read);
	if (!bio) {
		unlock_page(page);
		return -ENOMEM;
	}

	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
	wait_on_page_locked(page);

	if (PageUptodate(page) && !PageError(page))
		return 0;

	return -EIO;
}
#endif
diff --git a/mm/pdflush.c b/mm/pdflush.c new file mode 100644 index 000000000000..38ce279cc8cd --- /dev/null +++ b/mm/pdflush.c | |||
| @@ -0,0 +1,228 @@ | |||
| 1 | /* | ||
| 2 | * mm/pdflush.c - worker threads for writing back filesystem data | ||
| 3 | * | ||
| 4 | * Copyright (C) 2002, Linus Torvalds. | ||
| 5 | * | ||
| 6 | * 09Apr2002 akpm@zip.com.au | ||
| 7 | * Initial version | ||
| 8 | * 29Feb2004 kaos@sgi.com | ||
| 9 | * Move worker thread creation to kthread to avoid chewing | ||
| 10 | * up stack space with nested calls to kernel_thread. | ||
| 11 | */ | ||
| 12 | |||
| 13 | #include <linux/sched.h> | ||
| 14 | #include <linux/list.h> | ||
| 15 | #include <linux/signal.h> | ||
| 16 | #include <linux/spinlock.h> | ||
| 17 | #include <linux/gfp.h> | ||
| 18 | #include <linux/init.h> | ||
| 19 | #include <linux/module.h> | ||
| 20 | #include <linux/fs.h> // Needed by writeback.h | ||
| 21 | #include <linux/writeback.h> // Prototypes pdflush_operation() | ||
| 22 | #include <linux/kthread.h> | ||
| 23 | |||
| 24 | |||
| 25 | /* | ||
| 26 | * Minimum and maximum number of pdflush instances | ||
| 27 | */ | ||
| 28 | #define MIN_PDFLUSH_THREADS 2 | ||
| 29 | #define MAX_PDFLUSH_THREADS 8 | ||
| 30 | |||
| 31 | static void start_one_pdflush_thread(void); | ||
| 32 | |||
| 33 | |||
| 34 | /* | ||
| 35 | * The pdflush threads are worker threads for writing back dirty data. | ||
| 36 | * Ideally, we'd like one thread per active disk spindle. But the disk | ||
| 37 | * topology is very hard to divine at this level. Instead, we take | ||
| 38 | * care in various places to prevent more than one pdflush thread from | ||
| 39 | * performing writeback against a single filesystem. pdflush threads | ||
| 40 | * have the PF_FLUSHER flag set in current->flags to aid in this. | ||
| 41 | */ | ||
| 42 | |||
| 43 | /* | ||
| 44 | * All the pdflush threads. Protected by pdflush_lock | ||
| 45 | */ | ||
| 46 | static LIST_HEAD(pdflush_list); | ||
| 47 | static DEFINE_SPINLOCK(pdflush_lock); | ||
| 48 | |||
| 49 | /* | ||
| 50 | * The count of currently-running pdflush threads. Protected | ||
| 51 | * by pdflush_lock. | ||
| 52 | * | ||
| 53 | * Readable by sysctl, but not writable. Published to userspace at | ||
| 54 | * /proc/sys/vm/nr_pdflush_threads. | ||
| 55 | */ | ||
| 56 | int nr_pdflush_threads = 0; | ||
| 57 | |||
| 58 | /* | ||
| 59 | * The time at which the pdflush thread pool last went empty | ||
| 60 | */ | ||
| 61 | static unsigned long last_empty_jifs; | ||
| 62 | |||
| 63 | /* | ||
| 64 | * The pdflush thread. | ||
| 65 | * | ||
| 66 | * Thread pool management algorithm: | ||
| 67 | * | ||
| 68 | * - The minimum and maximum number of pdflush instances are bound | ||
| 69 | * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. | ||
| 70 | * | ||
| 71 | * - If there have been no idle pdflush instances for 1 second, create | ||
| 72 | * a new one. | ||
| 73 | * | ||
| 74 | * - If the least-recently-went-to-sleep pdflush thread has been asleep | ||
| 75 | * for more than one second, terminate a thread. | ||
| 76 | */ | ||
| 77 | |||
/*
 * A structure for passing work to a pdflush thread.  Also for passing
 * state information between pdflush threads.  Protected by pdflush_lock.
 *
 * One instance exists per pdflush thread; it lives on that thread's stack
 * (see pdflush()) and is linked on pdflush_list while the thread is idle.
 */
struct pdflush_work {
	struct task_struct *who;	/* The thread; woken by pdflush_operation() */
	void (*fn)(unsigned long);	/* A callback function; NULL while no work is pending */
	unsigned long arg0;		/* An argument to the callback */
	struct list_head list;		/* On pdflush_list, when idle */
	unsigned long when_i_went_to_sleep;	/* jiffies value stamped when the thread went idle */
};
| 89 | |||
/*
 * Body of one pdflush worker thread.
 *
 * The thread marks itself PF_FLUSHER, counts itself in nr_pdflush_threads
 * and then loops: park on pdflush_list, sleep, run the callback that
 * pdflush_operation() stored in @my_work, and apply the pool policy
 * (possibly start another thread, possibly exit).
 *
 * Locking: the top of the loop is always entered with pdflush_lock held,
 * which is why every "continue" below is preceded by taking the lock or
 * happens while it is still held.
 *
 * Returns 0 when the thread decides to exit.
 */
static int __pdflush(struct pdflush_work *my_work)
{
	current->flags |= PF_FLUSHER;
	my_work->fn = NULL;
	my_work->who = current;
	INIT_LIST_HEAD(&my_work->list);

	spin_lock_irq(&pdflush_lock);
	nr_pdflush_threads++;
	for ( ; ; ) {
		struct pdflush_work *pdf;

		/* Go idle: put ourselves at the head of the idle list. */
		set_current_state(TASK_INTERRUPTIBLE);
		list_move(&my_work->list, &pdflush_list);
		my_work->when_i_went_to_sleep = jiffies;
		spin_unlock_irq(&pdflush_lock);

		schedule();
		if (try_to_freeze(PF_FREEZE)) {
			/* retake the lock the loop top expects, then go idle again */
			spin_lock_irq(&pdflush_lock);
			continue;
		}

		spin_lock_irq(&pdflush_lock);
		/*
		 * pdflush_operation() unlinks us before waking us, so still
		 * being on a list means the wakeup came from elsewhere.
		 */
		if (!list_empty(&my_work->list)) {
			printk("pdflush: bogus wakeup!\n");
			my_work->fn = NULL;
			continue;
		}
		if (my_work->fn == NULL) {
			printk("pdflush: NULL work function\n");
			continue;
		}
		spin_unlock_irq(&pdflush_lock);

		/* Run the work item with the lock dropped. */
		(*my_work->fn)(my_work->arg0);

		/*
		 * Thread creation: For how long have there been zero
		 * available threads?
		 */
		if (jiffies - last_empty_jifs > 1 * HZ) {
			/* unlocked list_empty() test is OK here */
			if (list_empty(&pdflush_list)) {
				/* unlocked test is OK here */
				if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
					start_one_pdflush_thread();
			}
		}

		spin_lock_irq(&pdflush_lock);
		my_work->fn = NULL;

		/*
		 * Thread destruction: For how long has the sleepiest
		 * thread slept?
		 */
		if (list_empty(&pdflush_list))
			continue;
		if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
			continue;
		/* the tail of the list is the thread that has idled longest */
		pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
		if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) {
			/* Limit exit rate */
			pdf->when_i_went_to_sleep = jiffies;
			break;					/* exeunt */
		}
	}
	/* the loop is only left with pdflush_lock held */
	nr_pdflush_threads--;
	spin_unlock_irq(&pdflush_lock);
	return 0;
}
| 162 | |||
| 163 | /* | ||
| 164 | * Of course, my_work wants to be just a local in __pdflush(). It is | ||
| 165 | * separated out in this manner to hopefully prevent the compiler from | ||
| 166 | * performing unfortunate optimisations against the auto variables. Because | ||
| 167 | * these are visible to other tasks and CPUs. (No problem has actually | ||
| 168 | * been observed. This is just paranoia). | ||
| 169 | */ | ||
| 170 | static int pdflush(void *dummy) | ||
| 171 | { | ||
| 172 | struct pdflush_work my_work; | ||
| 173 | |||
| 174 | /* | ||
| 175 | * pdflush can spend a lot of time doing encryption via dm-crypt. We | ||
| 176 | * don't want to do that at keventd's priority. | ||
| 177 | */ | ||
| 178 | set_user_nice(current, 0); | ||
| 179 | return __pdflush(&my_work); | ||
| 180 | } | ||
| 181 | |||
| 182 | /* | ||
| 183 | * Attempt to wake up a pdflush thread, and get it to do some work for you. | ||
| 184 | * Returns zero if it indeed managed to find a worker thread, and passed your | ||
| 185 | * payload to it. | ||
| 186 | */ | ||
| 187 | int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) | ||
| 188 | { | ||
| 189 | unsigned long flags; | ||
| 190 | int ret = 0; | ||
| 191 | |||
| 192 | if (fn == NULL) | ||
| 193 | BUG(); /* Hard to diagnose if it's deferred */ | ||
| 194 | |||
| 195 | spin_lock_irqsave(&pdflush_lock, flags); | ||
| 196 | if (list_empty(&pdflush_list)) { | ||
| 197 | spin_unlock_irqrestore(&pdflush_lock, flags); | ||
| 198 | ret = -1; | ||
| 199 | } else { | ||
| 200 | struct pdflush_work *pdf; | ||
| 201 | |||
| 202 | pdf = list_entry(pdflush_list.next, struct pdflush_work, list); | ||
| 203 | list_del_init(&pdf->list); | ||
| 204 | if (list_empty(&pdflush_list)) | ||
| 205 | last_empty_jifs = jiffies; | ||
| 206 | pdf->fn = fn; | ||
| 207 | pdf->arg0 = arg0; | ||
| 208 | wake_up_process(pdf->who); | ||
| 209 | spin_unlock_irqrestore(&pdflush_lock, flags); | ||
| 210 | } | ||
| 211 | return ret; | ||
| 212 | } | ||
| 213 | |||
/*
 * Spawn one more worker running pdflush(), named "pdflush".
 *
 * The result of kthread_run() is not checked, so a failure to create the
 * thread goes unnoticed here.
 */
static void start_one_pdflush_thread(void)
{
	kthread_run(pdflush, NULL, "pdflush");
}
| 218 | |||
| 219 | static int __init pdflush_init(void) | ||
| 220 | { | ||
| 221 | int i; | ||
| 222 | |||
| 223 | for (i = 0; i < MIN_PDFLUSH_THREADS; i++) | ||
| 224 | start_one_pdflush_thread(); | ||
| 225 | return 0; | ||
| 226 | } | ||
| 227 | |||
| 228 | module_init(pdflush_init); | ||
diff --git a/mm/prio_tree.c b/mm/prio_tree.c new file mode 100644 index 000000000000..b4e76c25f953 --- /dev/null +++ b/mm/prio_tree.c | |||
| @@ -0,0 +1,207 @@ | |||
| 1 | /* | ||
| 2 | * mm/prio_tree.c - priority search tree for mapping->i_mmap | ||
| 3 | * | ||
| 4 | * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu> | ||
| 5 | * | ||
| 6 | * This file is released under the GPL v2. | ||
| 7 | * | ||
| 8 | * Based on the radix priority search tree proposed by Edward M. McCreight | ||
| 9 | * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 | ||
| 10 | * | ||
| 11 | * 02Feb2004 Initial version | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/mm.h> | ||
| 15 | #include <linux/prio_tree.h> | ||
| 16 | |||
| 17 | /* | ||
| 18 | * See lib/prio_tree.c for details on the general radix priority search tree | ||
| 19 | * code. | ||
| 20 | */ | ||
| 21 | |||
| 22 | /* | ||
| 23 | * The following #defines are mirrored from lib/prio_tree.c. They're only used | ||
| 24 | * for debugging, and should be removed (along with the debugging code using | ||
| 25 | * them) when switching also VMAs to the regular prio_tree code. | ||
| 26 | */ | ||
| 27 | |||
/* Start of the range a vma covers: its vm_pgoff. */
#define RADIX_INDEX(vma)  ((vma)->vm_pgoff)
/* Length of the vma: its byte span shifted down by PAGE_SHIFT. */
#define VMA_SIZE(vma)	  (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
/*
 * End of the range a vma covers: vm_pgoff plus its length, minus one.
 * avoid overflow - presumably the reason 1 is subtracted from the size
 * before the addition rather than from the sum afterwards.
 */
#define HEAP_INDEX(vma)	  ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
| 32 | |||
| 33 | /* | ||
| 34 | * Radix priority search tree for address_space->i_mmap | ||
| 35 | * | ||
| 36 | * For each vma that maps a unique set of file pages i.e., unique [radix_index, | ||
| 37 | * heap_index] value, we have a corresponding priority search tree node. If | ||
| 38 | * multiple vmas have identical [radix_index, heap_index] value, then one of | ||
| 39 | * them is used as a tree node and others are stored in a vm_set list. The tree | ||
| 40 | * node points to the first vma (head) of the list using vm_set.head. | ||
| 41 | * | ||
| 42 | * prio_tree_root | ||
| 43 | * | | ||
| 44 | * A vm_set.head | ||
| 45 | * / \ / | ||
| 46 | * L R -> H-I-J-K-M-N-O-P-Q-S | ||
| 47 | * ^ ^ <-- vm_set.list --> | ||
| 48 | * tree nodes | ||
| 49 | * | ||
| 50 | * We need some way to identify whether a vma is a tree node, head of a vm_set | ||
| 51 | * list, or just a member of a vm_set list. We cannot use vm_flags to store | ||
| 52 | * such information. The reason is, in the above figure, it is possible that | ||
| 53 | * vm_flags' of R and H are covered by the different mmap_sems. When R is | ||
| 54 | * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold | ||
| 55 | * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. | ||
| 56 | * That's why some trick involving shared.vm_set.parent is used for identifying | ||
| 57 | * tree nodes and list head nodes. | ||
| 58 | * | ||
| 59 | * vma radix priority search tree node rules: | ||
| 60 | * | ||
| 61 | * vma->shared.vm_set.parent != NULL ==> a tree node | ||
| 62 | * vma->shared.vm_set.head != NULL ==> list of others mapping same range | ||
| 63 | * vma->shared.vm_set.head == NULL ==> no others map the same range | ||
| 64 | * | ||
| 65 | * vma->shared.vm_set.parent == NULL | ||
| 66 | * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range | ||
| 67 | * vma->shared.vm_set.head == NULL ==> a list node | ||
| 68 | */ | ||
| 69 | |||
| 70 | /* | ||
| 71 | * Add a new vma known to map the same set of pages as the old vma: | ||
| 72 | * useful for fork's dup_mmap as well as vma_prio_tree_insert below. | ||
| 73 | * Note that it just happens to work correctly on i_mmap_nonlinear too. | ||
| 74 | */ | ||
| 75 | void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) | ||
| 76 | { | ||
| 77 | /* Leave these BUG_ONs till prio_tree patch stabilizes */ | ||
| 78 | BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); | ||
| 79 | BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); | ||
| 80 | |||
| 81 | vma->shared.vm_set.head = NULL; | ||
| 82 | vma->shared.vm_set.parent = NULL; | ||
| 83 | |||
| 84 | if (!old->shared.vm_set.parent) | ||
| 85 | list_add(&vma->shared.vm_set.list, | ||
| 86 | &old->shared.vm_set.list); | ||
| 87 | else if (old->shared.vm_set.head) | ||
| 88 | list_add_tail(&vma->shared.vm_set.list, | ||
| 89 | &old->shared.vm_set.head->shared.vm_set.list); | ||
| 90 | else { | ||
| 91 | INIT_LIST_HEAD(&vma->shared.vm_set.list); | ||
| 92 | vma->shared.vm_set.head = old; | ||
| 93 | old->shared.vm_set.head = vma; | ||
| 94 | } | ||
| 95 | } | ||
| 96 | |||
| 97 | void vma_prio_tree_insert(struct vm_area_struct *vma, | ||
| 98 | struct prio_tree_root *root) | ||
| 99 | { | ||
| 100 | struct prio_tree_node *ptr; | ||
| 101 | struct vm_area_struct *old; | ||
| 102 | |||
| 103 | vma->shared.vm_set.head = NULL; | ||
| 104 | |||
| 105 | ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); | ||
| 106 | if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { | ||
| 107 | old = prio_tree_entry(ptr, struct vm_area_struct, | ||
| 108 | shared.prio_tree_node); | ||
| 109 | vma_prio_tree_add(vma, old); | ||
| 110 | } | ||
| 111 | } | ||
| 112 | |||
| 113 | void vma_prio_tree_remove(struct vm_area_struct *vma, | ||
| 114 | struct prio_tree_root *root) | ||
| 115 | { | ||
| 116 | struct vm_area_struct *node, *head, *new_head; | ||
| 117 | |||
| 118 | if (!vma->shared.vm_set.head) { | ||
| 119 | if (!vma->shared.vm_set.parent) | ||
| 120 | list_del_init(&vma->shared.vm_set.list); | ||
| 121 | else | ||
| 122 | raw_prio_tree_remove(root, &vma->shared.prio_tree_node); | ||
| 123 | } else { | ||
| 124 | /* Leave this BUG_ON till prio_tree patch stabilizes */ | ||
| 125 | BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); | ||
| 126 | if (vma->shared.vm_set.parent) { | ||
| 127 | head = vma->shared.vm_set.head; | ||
| 128 | if (!list_empty(&head->shared.vm_set.list)) { | ||
| 129 | new_head = list_entry( | ||
| 130 | head->shared.vm_set.list.next, | ||
| 131 | struct vm_area_struct, | ||
| 132 | shared.vm_set.list); | ||
| 133 | list_del_init(&head->shared.vm_set.list); | ||
| 134 | } else | ||
| 135 | new_head = NULL; | ||
| 136 | |||
| 137 | raw_prio_tree_replace(root, &vma->shared.prio_tree_node, | ||
| 138 | &head->shared.prio_tree_node); | ||
| 139 | head->shared.vm_set.head = new_head; | ||
| 140 | if (new_head) | ||
| 141 | new_head->shared.vm_set.head = head; | ||
| 142 | |||
| 143 | } else { | ||
| 144 | node = vma->shared.vm_set.head; | ||
| 145 | if (!list_empty(&vma->shared.vm_set.list)) { | ||
| 146 | new_head = list_entry( | ||
| 147 | vma->shared.vm_set.list.next, | ||
| 148 | struct vm_area_struct, | ||
| 149 | shared.vm_set.list); | ||
| 150 | list_del_init(&vma->shared.vm_set.list); | ||
| 151 | node->shared.vm_set.head = new_head; | ||
| 152 | new_head->shared.vm_set.head = node; | ||
| 153 | } else | ||
| 154 | node->shared.vm_set.head = NULL; | ||
| 155 | } | ||
| 156 | } | ||
| 157 | } | ||
| 158 | |||
| 159 | /* | ||
| 160 | * Helper function to enumerate vmas that map a given file page or a set of | ||
| 161 | * contiguous file pages. The function returns vmas that at least map a single | ||
| 162 | * page in the given range of contiguous file pages. | ||
| 163 | */ | ||
| 164 | struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, | ||
| 165 | struct prio_tree_iter *iter) | ||
| 166 | { | ||
| 167 | struct prio_tree_node *ptr; | ||
| 168 | struct vm_area_struct *next; | ||
| 169 | |||
| 170 | if (!vma) { | ||
| 171 | /* | ||
| 172 | * First call is with NULL vma | ||
| 173 | */ | ||
| 174 | ptr = prio_tree_next(iter); | ||
| 175 | if (ptr) { | ||
| 176 | next = prio_tree_entry(ptr, struct vm_area_struct, | ||
| 177 | shared.prio_tree_node); | ||
| 178 | prefetch(next->shared.vm_set.head); | ||
| 179 | return next; | ||
| 180 | } else | ||
| 181 | return NULL; | ||
| 182 | } | ||
| 183 | |||
| 184 | if (vma->shared.vm_set.parent) { | ||
| 185 | if (vma->shared.vm_set.head) { | ||
| 186 | next = vma->shared.vm_set.head; | ||
| 187 | prefetch(next->shared.vm_set.list.next); | ||
| 188 | return next; | ||
| 189 | } | ||
| 190 | } else { | ||
| 191 | next = list_entry(vma->shared.vm_set.list.next, | ||
| 192 | struct vm_area_struct, shared.vm_set.list); | ||
| 193 | if (!next->shared.vm_set.head) { | ||
| 194 | prefetch(next->shared.vm_set.list.next); | ||
| 195 | return next; | ||
| 196 | } | ||
| 197 | } | ||
| 198 | |||
| 199 | ptr = prio_tree_next(iter); | ||
| 200 | if (ptr) { | ||
| 201 | next = prio_tree_entry(ptr, struct vm_area_struct, | ||
| 202 | shared.prio_tree_node); | ||
| 203 | prefetch(next->shared.vm_set.head); | ||
| 204 | return next; | ||
| 205 | } else | ||
| 206 | return NULL; | ||
| 207 | } | ||
diff --git a/mm/readahead.c b/mm/readahead.c new file mode 100644 index 000000000000..b840e7c6ea74 --- /dev/null +++ b/mm/readahead.c | |||
| @@ -0,0 +1,557 @@ | |||
| 1 | /* | ||
| 2 | * mm/readahead.c - address_space-level file readahead. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2002, Linus Torvalds | ||
| 5 | * | ||
| 6 | * 09Apr2002 akpm@zip.com.au | ||
| 7 | * Initial version. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/kernel.h> | ||
| 11 | #include <linux/fs.h> | ||
| 12 | #include <linux/mm.h> | ||
| 13 | #include <linux/module.h> | ||
| 14 | #include <linux/blkdev.h> | ||
| 15 | #include <linux/backing-dev.h> | ||
| 16 | #include <linux/pagevec.h> | ||
| 17 | |||
/*
 * No-op unplug callback: both arguments are ignored.  It is installed as
 * the .unplug_io_fn of default_backing_dev_info below.
 */
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}
EXPORT_SYMBOL(default_unplug_io_fn);
| 22 | |||
/*
 * Default backing_dev_info: readahead limit derived from VM_MAX_READAHEAD
 * and a do-nothing unplug function.
 */
struct backing_dev_info default_backing_dev_info = {
	/* VM_MAX_READAHEAD (presumably in kilobytes) converted to pages */
	.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
	.state = 0,
	.capabilities = BDI_CAP_MAP_COPY,
	.unplug_io_fn = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
| 30 | |||
| 31 | /* | ||
| 32 | * Initialise a struct file's readahead state. Assumes that the caller has | ||
| 33 | * memset *ra to zero. | ||
| 34 | */ | ||
| 35 | void | ||
| 36 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) | ||
| 37 | { | ||
| 38 | ra->ra_pages = mapping->backing_dev_info->ra_pages; | ||
| 39 | ra->prev_page = -1; | ||
| 40 | } | ||
| 41 | |||
/*
 * Return max readahead size for this inode in number-of-pages.
 * This is simply the ra_pages value stored in the readahead state.
 */
static inline unsigned long get_max_readahead(struct file_ra_state *ra)
{
	return ra->ra_pages;
}
| 49 | |||
/*
 * Return the minimum readahead size in pages.  @ra is not consulted: the
 * value is the constant VM_MIN_READAHEAD (presumably in kilobytes) converted
 * to page-cache pages.
 */
static inline unsigned long get_min_readahead(struct file_ra_state *ra)
{
	return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
}
| 54 | |||
| 55 | static inline void ra_off(struct file_ra_state *ra) | ||
| 56 | { | ||
| 57 | ra->start = 0; | ||
| 58 | ra->flags = 0; | ||
| 59 | ra->size = 0; | ||
| 60 | ra->ahead_start = 0; | ||
| 61 | ra->ahead_size = 0; | ||
| 62 | return; | ||
| 63 | } | ||
| 64 | |||
/*
 * Compute the initial window size.  The request size is first rounded up to
 * a power of two, then:
 *   - small  (<= max / 64): the rounded size is squared;
 *   - medium (<= max / 4):  the window is max / 4;
 *   - large:                the window is max.
 * For a 128k (32 page) max ra this gives
 * 1-8 page = 32k initial, > 8 page = 128k initial
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long rounded = roundup_pow_of_two(size);

	if (rounded <= max / 64)
		return rounded * rounded;
	if (rounded <= max / 4)
		return max / 4;
	return max;
}
| 83 | |||
| 84 | /* | ||
| 85 | * Set the new window size, this is called only when I/O is to be submitted, | ||
| 86 | * not for each call to readahead. If a cache miss occured, reduce next I/O | ||
| 87 | * size, else increase depending on how close to max we are. | ||
| 88 | */ | ||
| 89 | static inline unsigned long get_next_ra_size(struct file_ra_state *ra) | ||
| 90 | { | ||
| 91 | unsigned long max = get_max_readahead(ra); | ||
| 92 | unsigned long min = get_min_readahead(ra); | ||
| 93 | unsigned long cur = ra->size; | ||
| 94 | unsigned long newsize; | ||
| 95 | |||
| 96 | if (ra->flags & RA_FLAG_MISS) { | ||
| 97 | ra->flags &= ~RA_FLAG_MISS; | ||
| 98 | newsize = max((cur - 2), min); | ||
| 99 | } else if (cur < max / 16) { | ||
| 100 | newsize = 4 * cur; | ||
| 101 | } else { | ||
| 102 | newsize = 2 * cur; | ||
| 103 | } | ||
| 104 | return min(newsize, max); | ||
| 105 | } | ||
| 106 | |||
/* Return the struct page linked (through its lru member) at the tail of @head. */
#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
| 108 | |||
| 109 | /** | ||
| 110 | * read_cache_pages - populate an address space with some pages, and | ||
| 111 | * start reads against them. | ||
| 112 | * @mapping: the address_space | ||
| 113 | * @pages: The address of a list_head which contains the target pages. These | ||
| 114 | * pages have their ->index populated and are otherwise uninitialised. | ||
| 115 | * @filler: callback routine for filling a single page. | ||
| 116 | * @data: private data for the callback routine. | ||
| 117 | * | ||
| 118 | * Hides the details of the LRU cache etc from the filesystems. | ||
| 119 | */ | ||
| 120 | int read_cache_pages(struct address_space *mapping, struct list_head *pages, | ||
| 121 | int (*filler)(void *, struct page *), void *data) | ||
| 122 | { | ||
| 123 | struct page *page; | ||
| 124 | struct pagevec lru_pvec; | ||
| 125 | int ret = 0; | ||
| 126 | |||
| 127 | pagevec_init(&lru_pvec, 0); | ||
| 128 | |||
| 129 | while (!list_empty(pages)) { | ||
| 130 | page = list_to_page(pages); | ||
| 131 | list_del(&page->lru); | ||
| 132 | if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { | ||
| 133 | page_cache_release(page); | ||
| 134 | continue; | ||
| 135 | } | ||
| 136 | ret = filler(data, page); | ||
| 137 | if (!pagevec_add(&lru_pvec, page)) | ||
| 138 | __pagevec_lru_add(&lru_pvec); | ||
| 139 | if (ret) { | ||
| 140 | while (!list_empty(pages)) { | ||
| 141 | struct page *victim; | ||
| 142 | |||
| 143 | victim = list_to_page(pages); | ||
| 144 | list_del(&victim->lru); | ||
| 145 | page_cache_release(victim); | ||
| 146 | } | ||
| 147 | break; | ||
| 148 | } | ||
| 149 | } | ||
| 150 | pagevec_lru_add(&lru_pvec); | ||
| 151 | return ret; | ||
| 152 | } | ||
| 153 | |||
| 154 | EXPORT_SYMBOL(read_cache_pages); | ||
| 155 | |||
| 156 | static int read_pages(struct address_space *mapping, struct file *filp, | ||
| 157 | struct list_head *pages, unsigned nr_pages) | ||
| 158 | { | ||
| 159 | unsigned page_idx; | ||
| 160 | struct pagevec lru_pvec; | ||
| 161 | int ret = 0; | ||
| 162 | |||
| 163 | if (mapping->a_ops->readpages) { | ||
| 164 | ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); | ||
| 165 | goto out; | ||
| 166 | } | ||
| 167 | |||
| 168 | pagevec_init(&lru_pvec, 0); | ||
| 169 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { | ||
| 170 | struct page *page = list_to_page(pages); | ||
| 171 | list_del(&page->lru); | ||
| 172 | if (!add_to_page_cache(page, mapping, | ||
| 173 | page->index, GFP_KERNEL)) { | ||
| 174 | mapping->a_ops->readpage(filp, page); | ||
| 175 | if (!pagevec_add(&lru_pvec, page)) | ||
| 176 | __pagevec_lru_add(&lru_pvec); | ||
| 177 | } else { | ||
| 178 | page_cache_release(page); | ||
| 179 | } | ||
| 180 | } | ||
| 181 | pagevec_lru_add(&lru_pvec); | ||
| 182 | out: | ||
| 183 | return ret; | ||
| 184 | } | ||
| 185 | |||
| 186 | /* | ||
| 187 | * Readahead design. | ||
| 188 | * | ||
| 189 | * The fields in struct file_ra_state represent the most-recently-executed | ||
| 190 | * readahead attempt: | ||
| 191 | * | ||
| 192 | * start: Page index at which we started the readahead | ||
| 193 | * size: Number of pages in that read | ||
| 194 | * Together, these form the "current window". | ||
| 195 | * Together, start and size represent the `readahead window'. | ||
| 196 | * prev_page: The page which the readahead algorithm most-recently inspected. | ||
| 197 | * It is mainly used to detect sequential file reading. | ||
| 198 | * If page_cache_readahead sees that it is again being called for | ||
| 199 | * a page which it just looked at, it can return immediately without | ||
| 200 | * making any state changes. | ||
| 201 | * ahead_start, | ||
| 202 | * ahead_size: Together, these form the "ahead window". | ||
| 203 | * ra_pages: The externally controlled max readahead for this fd. | ||
| 204 | * | ||
| 205 | * When readahead is in the off state (size == 0), readahead is disabled. | ||
| 206 | * In this state, prev_page is used to detect the resumption of sequential I/O. | ||
| 207 | * | ||
| 208 | * The readahead code manages two windows - the "current" and the "ahead" | ||
| 209 | * windows. The intent is that while the application is walking the pages | ||
| 210 | * in the current window, I/O is underway on the ahead window. When the | ||
| 211 | * current window is fully traversed, it is replaced by the ahead window | ||
| 212 | * and the ahead window is invalidated. When this copying happens, the | ||
| 213 | * new current window's pages are probably still locked. So | ||
| 214 | * we submit a new batch of I/O immediately, creating a new ahead window. | ||
| 215 | * | ||
| 216 | * So: | ||
| 217 | * | ||
| 218 | * ----|----------------|----------------|----- | ||
| 219 | * ^start ^start+size | ||
| 220 | * ^ahead_start ^ahead_start+ahead_size | ||
| 221 | * | ||
| 222 | * ^ When this page is read, we submit I/O for the | ||
| 223 | * ahead window. | ||
| 224 | * | ||
| 225 | * A `readahead hit' occurs when a read request is made against a page which is | ||
| 226 | * the next sequential page. Ahead window calculations are done only when it | ||
| 227 | * is time to submit a new IO. The code ramps up the size aggressively at first, | ||
| 228 | * but slows down as it approaches max_readahead. | ||
| 229 | * | ||
| 230 | * Any seek/random IO will result in readahead being turned off. It will resume | ||
| 231 | * at the first sequential access. | ||
| 232 | * | ||
| 233 | * There is a special-case: if the first page which the application tries to | ||
| 234 | * read happens to be the first page of the file, it is assumed that a linear | ||
| 235 | * read is about to happen and the window is immediately set to the initial size | ||
| 236 | * based on I/O request size and the max_readahead. | ||
| 237 | * | ||
| 238 | * This function is to be called for every read request, rather than when | ||
| 239 | * it is time to perform readahead. It is called only once for the entire I/O | ||
| 240 | * regardless of size unless readahead is unable to start enough I/O to satisfy | ||
| 241 | * the request (I/O request > max_readahead). | ||
| 242 | */ | ||
| 243 | |||
| 244 | /* | ||
| 245 | * do_page_cache_readahead actually reads a chunk of disk. It allocates all | ||
| 246 | * the pages first, then submits them all for I/O. This avoids the very bad | ||
| 247 | * behaviour which would occur if page allocations are causing VM writeback. | ||
| 248 | * We really don't want to intermingle reads and writes like that. | ||
| 249 | * | ||
| 250 | * Returns the number of pages requested, or the maximum amount of I/O allowed. | ||
| 251 | * | ||
| 252 | * do_page_cache_readahead() returns -1 if it encountered request queue | ||
| 253 | * congestion. | ||
| 254 | */ | ||
/*
 * Worker for the readahead entry points: allocate pages for those offsets in
 * [offset, offset + nr_to_read) that are not in the page cache yet and lie
 * within the file, then pass them to read_pages().
 *
 * Returns the number of pages that were allocated and submitted; 0 when the
 * file is empty or nothing needed to be read.
 */
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			unsigned long offset, unsigned long nr_to_read)
{
	struct inode *inode = mapping->host;
	struct page *page;
	unsigned long end_index;	/* The last page we want to read */
	LIST_HEAD(page_pool);
	int page_idx;
	int ret = 0;
	loff_t isize = i_size_read(inode);

	/* Empty file: nothing to read ahead. */
	if (isize == 0)
		goto out;

	/* Index of the page holding the last byte of the file. */
	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

	/*
	 * Preallocate as many pages as we will need.
	 */
	read_lock_irq(&mapping->tree_lock);
	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
		unsigned long page_offset = offset + page_idx;

		/* Never read beyond the end of the file. */
		if (page_offset > end_index)
			break;

		/* Already in the page cache: skip this offset. */
		page = radix_tree_lookup(&mapping->page_tree, page_offset);
		if (page)
			continue;

		/*
		 * The tree lock is released around the allocation
		 * (presumably because the allocation may sleep) and
		 * re-taken before the loop continues or exits.
		 */
		read_unlock_irq(&mapping->tree_lock);
		page = page_cache_alloc_cold(mapping);
		read_lock_irq(&mapping->tree_lock);
		if (!page)
			break;
		page->index = page_offset;
		list_add(&page->lru, &page_pool);
		ret++;
	}
	read_unlock_irq(&mapping->tree_lock);

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	if (ret)
		read_pages(mapping, filp, &page_pool, ret);
	/* read_pages() must have consumed every page queued above. */
	BUG_ON(!list_empty(&page_pool));
out:
	return ret;
}
| 308 | |||
| 309 | /* | ||
| 310 | * Chunk the readahead into 2 megabyte units, so that we don't pin too much | ||
| 311 | * memory at once. | ||
| 312 | */ | ||
| 313 | int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | ||
| 314 | unsigned long offset, unsigned long nr_to_read) | ||
| 315 | { | ||
| 316 | int ret = 0; | ||
| 317 | |||
| 318 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) | ||
| 319 | return -EINVAL; | ||
| 320 | |||
| 321 | while (nr_to_read) { | ||
| 322 | int err; | ||
| 323 | |||
| 324 | unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE; | ||
| 325 | |||
| 326 | if (this_chunk > nr_to_read) | ||
| 327 | this_chunk = nr_to_read; | ||
| 328 | err = __do_page_cache_readahead(mapping, filp, | ||
| 329 | offset, this_chunk); | ||
| 330 | if (err < 0) { | ||
| 331 | ret = err; | ||
| 332 | break; | ||
| 333 | } | ||
| 334 | ret += err; | ||
| 335 | offset += this_chunk; | ||
| 336 | nr_to_read -= this_chunk; | ||
| 337 | } | ||
| 338 | return ret; | ||
| 339 | } | ||
| 340 | |||
| 341 | /* | ||
| 342 | * Check how effective readahead is being. If the amount of started IO is | ||
| 343 | * less than expected then the file is partly or fully in pagecache and | ||
| 344 | * readahead isn't helping. | ||
| 345 | * | ||
| 346 | */ | ||
| 347 | static inline int check_ra_success(struct file_ra_state *ra, | ||
| 348 | unsigned long nr_to_read, unsigned long actual) | ||
| 349 | { | ||
| 350 | if (actual == 0) { | ||
| 351 | ra->cache_hit += nr_to_read; | ||
| 352 | if (ra->cache_hit >= VM_MAX_CACHE_HIT) { | ||
| 353 | ra_off(ra); | ||
| 354 | ra->flags |= RA_FLAG_INCACHE; | ||
| 355 | return 0; | ||
| 356 | } | ||
| 357 | } else { | ||
| 358 | ra->cache_hit=0; | ||
| 359 | } | ||
| 360 | return 1; | ||
| 361 | } | ||
| 362 | |||
| 363 | /* | ||
| 364 | * This version skips the IO if the queue is read-congested, and will tell the | ||
| 365 | * block layer to abandon the readahead if request allocation would block. | ||
| 366 | * | ||
| 367 | * force_page_cache_readahead() will ignore queue congestion and will block on | ||
| 368 | * request queues. | ||
| 369 | */ | ||
| 370 | int do_page_cache_readahead(struct address_space *mapping, struct file *filp, | ||
| 371 | unsigned long offset, unsigned long nr_to_read) | ||
| 372 | { | ||
| 373 | if (bdi_read_congested(mapping->backing_dev_info)) | ||
| 374 | return -1; | ||
| 375 | |||
| 376 | return __do_page_cache_readahead(mapping, filp, offset, nr_to_read); | ||
| 377 | } | ||
| 378 | |||
| 379 | /* | ||
| 380 | * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' | ||
| 381 | * is set wait till the read completes. Otherwise attempt to read without | ||
| 382 | * blocking. | ||
| 383 | * Returns 1 meaning 'success' if read is succesfull without switching off | ||
| 384 | * readhaead mode. Otherwise return failure. | ||
| 385 | */ | ||
| 386 | static int | ||
| 387 | blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, | ||
| 388 | unsigned long offset, unsigned long nr_to_read, | ||
| 389 | struct file_ra_state *ra, int block) | ||
| 390 | { | ||
| 391 | int actual; | ||
| 392 | |||
| 393 | if (!block && bdi_read_congested(mapping->backing_dev_info)) | ||
| 394 | return 0; | ||
| 395 | |||
| 396 | actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read); | ||
| 397 | |||
| 398 | return check_ra_success(ra, nr_to_read, actual); | ||
| 399 | } | ||
| 400 | |||
| 401 | static int make_ahead_window(struct address_space *mapping, struct file *filp, | ||
| 402 | struct file_ra_state *ra, int force) | ||
| 403 | { | ||
| 404 | int block, ret; | ||
| 405 | |||
| 406 | ra->ahead_size = get_next_ra_size(ra); | ||
| 407 | ra->ahead_start = ra->start + ra->size; | ||
| 408 | |||
| 409 | block = force || (ra->prev_page >= ra->ahead_start); | ||
| 410 | ret = blockable_page_cache_readahead(mapping, filp, | ||
| 411 | ra->ahead_start, ra->ahead_size, ra, block); | ||
| 412 | |||
| 413 | if (!ret && !force) { | ||
| 414 | /* A read failure in blocking mode, implies pages are | ||
| 415 | * all cached. So we can safely assume we have taken | ||
| 416 | * care of all the pages requested in this call. | ||
| 417 | * A read failure in non-blocking mode, implies we are | ||
| 418 | * reading more pages than requested in this call. So | ||
| 419 | * we safely assume we have taken care of all the pages | ||
| 420 | * requested in this call. | ||
| 421 | * | ||
| 422 | * Just reset the ahead window in case we failed due to | ||
| 423 | * congestion. The ahead window will any way be closed | ||
| 424 | * in case we failed due to excessive page cache hits. | ||
| 425 | */ | ||
| 426 | ra->ahead_start = 0; | ||
| 427 | ra->ahead_size = 0; | ||
| 428 | } | ||
| 429 | |||
| 430 | return ret; | ||
| 431 | } | ||
| 432 | |||
/*
 * page_cache_readahead is the main function.  It performs the adaptive
 * readahead window size management and submits the readahead I/O.
 *
 * @mapping:  address space being read
 * @ra:       readahead state, updated in place
 * @filp:     file passed through to the read routines
 * @offset:   index of the first page of the read request
 * @req_size: size of the read request, in pages
 *
 * Returns ra->prev_page + 1 as it stands when the function finishes.
 */
unsigned long
page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
			struct file *filp, unsigned long offset,
			unsigned long req_size)
{
	unsigned long max, newsize;
	int sequential;

	/*
	 * We avoid doing extra work and bogusly perturbing the readahead
	 * window expansion logic: a request that starts on the page recorded
	 * in prev_page is shortened by that page, and moved forward by one
	 * page if anything is left of it.
	 */
	if (offset == ra->prev_page && --req_size)
		++offset;

	/* Note that prev_page == -1 if it is a first read */
	sequential = (offset == ra->prev_page + 1);
	ra->prev_page = offset;

	max = get_max_readahead(ra);
	newsize = min(req_size, max);

	/* No readahead or sub-page sized read or file already in cache */
	if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
		goto out;

	/* prev_page now names the last page covered by this (clamped) request */
	ra->prev_page += newsize - 1;

	/*
	 * Special case - first read at start of file. We'll assume it's
	 * a whole-file read and grow the window fast.  Or detect first
	 * sequential access
	 */
	if (sequential && ra->size == 0) {
		ra->size = get_init_ra_size(newsize, max);
		ra->start = offset;
		if (!blockable_page_cache_readahead(mapping, filp, offset,
							 ra->size, ra, 1))
			goto out;

		/*
		 * If the request size is larger than our max readahead, we
		 * at least want to be sure that we get 2 IOs in flight and
		 * we know that we will definitely need the new I/O.
		 * Once we do this, subsequent calls should be able to overlap
		 * IOs, thus preventing stalls.  So issue the ahead window
		 * immediately.
		 */
		if (req_size >= max)
			make_ahead_window(mapping, filp, ra, 1);

		goto out;
	}

	/*
	 * Now handle the random case:
	 * partial page reads and first access were handled above,
	 * so this must be the next page otherwise it is random
	 */
	if (!sequential) {
		ra_off(ra);
		blockable_page_cache_readahead(mapping, filp, offset,
				 newsize, ra, 1);
		goto out;
	}

	/*
	 * If we get here we are doing sequential IO and this was not the first
	 * occurrence (ie we have an existing window)
	 */

	if (ra->ahead_start == 0) {	 /* no ahead window yet */
		if (!make_ahead_window(mapping, filp, ra, 0))
			goto out;
	}
	/*
	 * Already have an ahead window, check if we crossed into it.
	 * If so, shift windows and issue a new ahead window.
	 * Only return the #pages that are in the current window, so that
	 * we get called back on the first page of the ahead window which
	 * will allow us to submit more IO.
	 */
	if (ra->prev_page >= ra->ahead_start) {
		ra->start = ra->ahead_start;
		ra->size = ra->ahead_size;
		make_ahead_window(mapping, filp, ra, 0);
	}

out:
	return ra->prev_page + 1;
}
| 528 | |||
| 529 | /* | ||
| 530 | * handle_ra_miss() is called when it is known that a page which should have | ||
| 531 | * been present in the pagecache (we just did some readahead there) was in fact | ||
| 532 | * not found. This will happen if it was evicted by the VM (readahead | ||
| 533 | * thrashing) | ||
| 534 | * | ||
| 535 | * Turn on the cache miss flag in the RA struct, this will cause the RA code | ||
| 536 | * to reduce the RA size on the next read. | ||
| 537 | */ | ||
| 538 | void handle_ra_miss(struct address_space *mapping, | ||
| 539 | struct file_ra_state *ra, pgoff_t offset) | ||
| 540 | { | ||
| 541 | ra->flags |= RA_FLAG_MISS; | ||
| 542 | ra->flags &= ~RA_FLAG_INCACHE; | ||
| 543 | } | ||
| 544 | |||
/*
 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
 * sensible upper limit: @nr, capped at half of the inactive plus free page
 * counts reported by __get_zone_counts() for the current node.
 */
unsigned long max_sane_readahead(unsigned long nr)
{
	unsigned long active, inactive, free;
	unsigned long limit;

	__get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id()));
	limit = (inactive + free) / 2;
	return min(nr, limit);
}
diff --git a/mm/rmap.c b/mm/rmap.c new file mode 100644 index 000000000000..884d6d1928bc --- /dev/null +++ b/mm/rmap.c | |||
| @@ -0,0 +1,862 @@ | |||
| 1 | /* | ||
| 2 | * mm/rmap.c - physical to virtual reverse mappings | ||
| 3 | * | ||
| 4 | * Copyright 2001, Rik van Riel <riel@conectiva.com.br> | ||
| 5 | * Released under the General Public License (GPL). | ||
| 6 | * | ||
| 7 | * Simple, low overhead reverse mapping scheme. | ||
| 8 | * Please try to keep this thing as modular as possible. | ||
| 9 | * | ||
| 10 | * Provides methods for unmapping each kind of mapped page: | ||
| 11 | * the anon methods track anonymous pages, and | ||
| 12 | * the file methods track pages belonging to an inode. | ||
| 13 | * | ||
| 14 | * Original design by Rik van Riel <riel@conectiva.com.br> 2001 | ||
| 15 | * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 | ||
| 16 | * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 | ||
| 17 | * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 | ||
| 18 | */ | ||
| 19 | |||
| 20 | /* | ||
| 21 | * Lock ordering in mm: | ||
| 22 | * | ||
| 23 | * inode->i_sem (while writing or truncating, not reading or faulting) | ||
| 24 | * inode->i_alloc_sem | ||
| 25 | * | ||
| 26 | * When a page fault occurs in writing from user to file, down_read | ||
| 27 | * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within | ||
| 28 | * down_read of mmap_sem; i_sem and down_write of mmap_sem are never | ||
| 29 | * taken together; in truncation, i_sem is taken outermost. | ||
| 30 | * | ||
| 31 | * mm->mmap_sem | ||
| 32 | * page->flags PG_locked (lock_page) | ||
| 33 | * mapping->i_mmap_lock | ||
| 34 | * anon_vma->lock | ||
| 35 | * mm->page_table_lock | ||
| 36 | * zone->lru_lock (in mark_page_accessed) | ||
| 37 | * swap_list_lock (in swap_free etc's swap_info_get) | ||
| 38 | * mmlist_lock (in mmput, drain_mmlist and others) | ||
| 39 | * swap_device_lock (in swap_duplicate, swap_info_get) | ||
| 40 | * mapping->private_lock (in __set_page_dirty_buffers) | ||
| 41 | * inode_lock (in set_page_dirty's __mark_inode_dirty) | ||
| 42 | * sb_lock (within inode_lock in fs/fs-writeback.c) | ||
| 43 | * mapping->tree_lock (widely used, in set_page_dirty, | ||
| 44 | * in arch-dependent flush_dcache_mmap_lock, | ||
| 45 | * within inode_lock in __sync_single_inode) | ||
| 46 | */ | ||
| 47 | |||
| 48 | #include <linux/mm.h> | ||
| 49 | #include <linux/pagemap.h> | ||
| 50 | #include <linux/swap.h> | ||
| 51 | #include <linux/swapops.h> | ||
| 52 | #include <linux/slab.h> | ||
| 53 | #include <linux/init.h> | ||
| 54 | #include <linux/rmap.h> | ||
| 55 | #include <linux/rcupdate.h> | ||
| 56 | |||
| 57 | #include <asm/tlbflush.h> | ||
| 58 | |||
| 59 | //#define RMAP_DEBUG /* can be enabled only for debugging */ | ||
| 60 | |||
/* NOTE(review): presumably the slab cache that anon_vma objects are allocated from - confirm. */
kmem_cache_t *anon_vma_cachep;
| 62 | |||
/*
 * Debugging aid, compiled to nothing unless RMAP_DEBUG is defined: walk the
 * anon_vma list of @find_vma and BUG if the list is implausibly long
 * (more than 100000 entries) or does not contain @find_vma itself.
 */
static inline void validate_anon_vma(struct vm_area_struct *find_vma)
{
#ifdef RMAP_DEBUG
	struct anon_vma *anon_vma = find_vma->anon_vma;
	struct vm_area_struct *cursor;
	unsigned int visited = 0;
	int seen = 0;

	list_for_each_entry(cursor, &anon_vma->head, anon_vma_node) {
		visited++;
		BUG_ON(visited > 100000);
		seen |= (cursor == find_vma);
	}
	BUG_ON(!seen);
#endif
}
| 80 | |||
| 81 | /* This must be called under the mmap_sem. Ensures vma has an anon_vma; returns 0, or -ENOMEM if one could not be allocated. */ | ||
| 82 | int anon_vma_prepare(struct vm_area_struct *vma) | ||
| 83 | { | ||
| 84 | struct anon_vma *anon_vma = vma->anon_vma; | ||
| 85 | |||
| 86 | might_sleep(); | ||
| 87 | if (unlikely(!anon_vma)) { | ||
| 88 | struct mm_struct *mm = vma->vm_mm; | ||
| 89 | struct anon_vma *allocated, *locked; | ||
| 90 | |||
| 91 | anon_vma = find_mergeable_anon_vma(vma); /* reuse an existing anon_vma if one is offered */ | ||
| 92 | if (anon_vma) { | ||
| 93 | allocated = NULL; | ||
| 94 | locked = anon_vma; | ||
| 95 | spin_lock(&locked->lock); /* existing anon_vma: its list is changed under its lock */ | ||
| 96 | } else { | ||
| 97 | anon_vma = anon_vma_alloc(); | ||
| 98 | if (unlikely(!anon_vma)) | ||
| 99 | return -ENOMEM; | ||
| 100 | allocated = anon_vma; | ||
| 101 | locked = NULL; /* freshly allocated: not locked here */ | ||
| 102 | } | ||
| 103 | |||
| 104 | /* page_table_lock to protect against threads */ | ||
| 105 | spin_lock(&mm->page_table_lock); | ||
| 106 | if (likely(!vma->anon_vma)) { /* recheck under the lock */ | ||
| 107 | vma->anon_vma = anon_vma; | ||
| 108 | list_add(&vma->anon_vma_node, &anon_vma->head); | ||
| 109 | allocated = NULL; /* now in use: must not be freed below */ | ||
| 110 | } | ||
| 111 | spin_unlock(&mm->page_table_lock); | ||
| 112 | |||
| 113 | if (locked) | ||
| 114 | spin_unlock(&locked->lock); | ||
| 115 | if (unlikely(allocated)) /* vma got an anon_vma meanwhile: drop our unused allocation */ | ||
| 116 | anon_vma_free(allocated); | ||
| 117 | } | ||
| 118 | return 0; | ||
| 119 | } | ||
| 120 | |||
| 121 | void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) /* takes no lock itself */ | ||
| 122 | { | ||
| 123 | BUG_ON(vma->anon_vma != next->anon_vma); /* both vmas must already share one anon_vma */ | ||
| 124 | list_del(&next->anon_vma_node); /* unlink next from that anon_vma's list */ | ||
| 125 | } | ||
| 126 | |||
| 127 | void __anon_vma_link(struct vm_area_struct *vma) | ||
| 128 | { | ||
| 129 | struct anon_vma *av = vma->anon_vma; | ||
| 130 | |||
| 131 | if (!av) /* vma has no anon_vma: nothing to link; no lock is taken here */ | ||
| 132 | return; | ||
| 133 | list_add(&vma->anon_vma_node, &av->head); | ||
| 134 | validate_anon_vma(vma); | ||
| 135 | } | ||
| 136 | |||
| 137 | void anon_vma_link(struct vm_area_struct *vma) | ||
| 138 | { | ||
| 139 | struct anon_vma *av = vma->anon_vma; | ||
| 140 | |||
| 141 | if (!av) /* vma has no anon_vma: nothing to link */ | ||
| 142 | return; | ||
| 143 | spin_lock(&av->lock); /* the vma list is modified under the anon_vma lock */ | ||
| 144 | list_add(&vma->anon_vma_node, &av->head); | ||
| 145 | validate_anon_vma(vma); | ||
| 146 | spin_unlock(&av->lock); | ||
| 147 | } | ||
| 148 | |||
| 149 | void anon_vma_unlink(struct vm_area_struct *vma) /* remove vma from its anon_vma's list, freeing the anon_vma if that empties it */ | ||
| 150 | { | ||
| 151 | struct anon_vma *anon_vma = vma->anon_vma; | ||
| 152 | int empty; | ||
| 153 | |||
| 154 | if (!anon_vma) | ||
| 155 | return; | ||
| 156 | |||
| 157 | spin_lock(&anon_vma->lock); | ||
| 158 | validate_anon_vma(vma); | ||
| 159 | list_del(&vma->anon_vma_node); | ||
| 160 | |||
| 161 | /* We must garbage collect the anon_vma if it's empty */ | ||
| 162 | empty = list_empty(&anon_vma->head); /* sampled while the lock is still held */ | ||
| 163 | spin_unlock(&anon_vma->lock); | ||
| 164 | |||
| 165 | if (empty) | ||
| 166 | anon_vma_free(anon_vma); /* freed only after the lock has been dropped */ | ||
| 167 | } | ||
| 168 | |||
| 169 | static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) /* constructor passed to kmem_cache_create() in anon_vma_init() */ | ||
| 170 | { | ||
| 171 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == /* act only when CONSTRUCTOR is set and VERIFY is not */ | ||
| 172 | SLAB_CTOR_CONSTRUCTOR) { | ||
| 173 | struct anon_vma *anon_vma = data; | ||
| 174 | |||
| 175 | spin_lock_init(&anon_vma->lock); | ||
| 176 | INIT_LIST_HEAD(&anon_vma->head); /* starts with an empty vma list */ | ||
| 177 | } | ||
| 178 | } | ||
| 179 | |||
| 180 | void __init anon_vma_init(void) /* creates the "anon_vma" slab cache and stores it in anon_vma_cachep */ | ||
| 181 | { | ||
| 182 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), | ||
| 183 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); /* SLAB_DESTROY_BY_RCU: page_lock_anon_vma depends on RCU, see its comment */ | ||
| 184 | } | ||
| 185 | |||
| 186 | /* | ||
| 187 | * Getting a lock on a stable anon_vma from a page off the LRU is tricky: | ||
| 188 | * page_lock_anon_vma relies on RCU to guard against the races. Returns the anon_vma with its lock held, or NULL if page is not a mapped anon page. | ||
| 189 | */ | ||
| 190 | static struct anon_vma *page_lock_anon_vma(struct page *page) | ||
| 191 | { | ||
| 192 | struct anon_vma *anon_vma = NULL; | ||
| 193 | unsigned long anon_mapping; | ||
| 194 | |||
| 195 | rcu_read_lock(); | ||
| 196 | anon_mapping = (unsigned long) page->mapping; /* read once; tested and decoded below */ | ||
| 197 | if (!(anon_mapping & PAGE_MAPPING_ANON)) /* not tagged as an anon mapping */ | ||
| 198 | goto out; | ||
| 199 | if (!page_mapped(page)) | ||
| 200 | goto out; | ||
| 201 | |||
| 202 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); /* strip the tag to recover the pointer */ | ||
| 203 | spin_lock(&anon_vma->lock); /* caller must unlock */ | ||
| 204 | out: | ||
| 205 | rcu_read_unlock(); | ||
| 206 | return anon_vma; | ||
| 207 | } | ||
| 208 | |||
| 209 | /* | ||
| 210 | * At what user virtual address is page expected in vma? Returns -EFAULT if that address falls outside the vma. | ||
| 211 | */ | ||
| 212 | static inline unsigned long | ||
| 213 | vma_address(struct page *page, struct vm_area_struct *vma) | ||
| 214 | { | ||
| 215 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); /* page->index rescaled from PAGE_CACHE_SIZE to PAGE_SIZE units */ | ||
| 216 | unsigned long address; | ||
| 217 | |||
| 218 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | ||
| 219 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { | ||
| 220 | /* page should be within any vma from prio_tree_next */ | ||
| 221 | BUG_ON(!PageAnon(page)); /* so only anon pages may legitimately miss */ | ||
| 222 | return -EFAULT; | ||
| 223 | } | ||
| 224 | return address; | ||
| 225 | } | ||
| 226 | |||
| 227 | /* | ||
| 228 | * At what user virtual address is page expected in vma? checking that the | ||
| 229 | * page matches the vma: currently only used by unuse_process, on anon pages. Returns -EFAULT on any mismatch. | ||
| 230 | */ | ||
| 231 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | ||
| 232 | { | ||
| 233 | if (PageAnon(page)) { | ||
| 234 | if ((void *)vma->anon_vma != /* anon page: must belong to this vma's anon_vma */ | ||
| 235 | (void *)page->mapping - PAGE_MAPPING_ANON) | ||
| 236 | return -EFAULT; | ||
| 237 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | ||
| 238 | if (vma->vm_file->f_mapping != page->mapping) /* file page: vma must map the same address_space */ | ||
| 239 | return -EFAULT; | ||
| 240 | } else /* no mapping at all, or a nonlinear vma */ | ||
| 241 | return -EFAULT; | ||
| 242 | return vma_address(page, vma); | ||
| 243 | } | ||
| 244 | |||
| 245 | /* | ||
| 246 | * Subfunctions of page_referenced: page_referenced_one is called for each vma | ||
| 247 | * by page_referenced_anon or page_referenced_file; returns the references found in that vma and decrements *mapcount when the pte maps page. | ||
| 248 | */ | ||
| 249 | static int page_referenced_one(struct page *page, | ||
| 250 | struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token) | ||
| 251 | { | ||
| 252 | struct mm_struct *mm = vma->vm_mm; | ||
| 253 | unsigned long address; | ||
| 254 | pgd_t *pgd; | ||
| 255 | pud_t *pud; | ||
| 256 | pmd_t *pmd; | ||
| 257 | pte_t *pte; | ||
| 258 | int referenced = 0; | ||
| 259 | |||
| 260 | if (!get_mm_counter(mm, rss)) /* rss counter is zero: skip the page table walk */ | ||
| 261 | goto out; | ||
| 262 | address = vma_address(page, vma); | ||
| 263 | if (address == -EFAULT) | ||
| 264 | goto out; | ||
| 265 | |||
| 266 | spin_lock(&mm->page_table_lock); | ||
| 267 | |||
| 268 | pgd = pgd_offset(mm, address); | ||
| 269 | if (!pgd_present(*pgd)) | ||
| 270 | goto out_unlock; | ||
| 271 | |||
| 272 | pud = pud_offset(pgd, address); | ||
| 273 | if (!pud_present(*pud)) | ||
| 274 | goto out_unlock; | ||
| 275 | |||
| 276 | pmd = pmd_offset(pud, address); | ||
| 277 | if (!pmd_present(*pmd)) | ||
| 278 | goto out_unlock; | ||
| 279 | |||
| 280 | pte = pte_offset_map(pmd, address); | ||
| 281 | if (!pte_present(*pte)) | ||
| 282 | goto out_unmap; | ||
| 283 | |||
| 284 | if (page_to_pfn(page) != pte_pfn(*pte)) /* the pte maps some other page */ | ||
| 285 | goto out_unmap; | ||
| 286 | |||
| 287 | if (ptep_clear_flush_young(vma, address, pte)) | ||
| 288 | referenced++; | ||
| 289 | |||
| 290 | if (mm != current->mm && !ignore_token && has_swap_token(mm)) /* another mm holding the swap token also counts as a reference */ | ||
| 291 | referenced++; | ||
| 292 | |||
| 293 | (*mapcount)--; /* one mapping of this page has been visited */ | ||
| 294 | |||
| 295 | out_unmap: | ||
| 296 | pte_unmap(pte); | ||
| 297 | out_unlock: | ||
| 298 | spin_unlock(&mm->page_table_lock); | ||
| 299 | out: | ||
| 300 | return referenced; | ||
| 301 | } | ||
| 302 | |||
| 303 | static int page_referenced_anon(struct page *page, int ignore_token) /* sums page_referenced_one() over the vmas on page's anon_vma list */ | ||
| 304 | { | ||
| 305 | unsigned int mapcount; | ||
| 306 | struct anon_vma *anon_vma; | ||
| 307 | struct vm_area_struct *vma; | ||
| 308 | int referenced = 0; | ||
| 309 | |||
| 310 | anon_vma = page_lock_anon_vma(page); | ||
| 311 | if (!anon_vma) /* not a mapped anon page: nothing to scan */ | ||
| 312 | return referenced; | ||
| 313 | |||
| 314 | mapcount = page_mapcount(page); | ||
| 315 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
| 316 | referenced += page_referenced_one(page, vma, &mapcount, | ||
| 317 | ignore_token); | ||
| 318 | if (!mapcount) /* every counted mapping has been visited */ | ||
| 319 | break; | ||
| 320 | } | ||
| 321 | spin_unlock(&anon_vma->lock); /* taken by page_lock_anon_vma() */ | ||
| 322 | return referenced; | ||
| 323 | } | ||
| 324 | |||
| 325 | /** | ||
| 326 | * page_referenced_file - referenced check for object-based rmap | ||
| 327 | * @page: the page we're checking references on. | ||
| 328 | * @ignore_token: passed through to page_referenced_one | ||
| 329 | * For an object-based mapped page, find all the places it is mapped and | ||
| 330 | * check/clear the referenced flag. This is done by following the page->mapping | ||
| 331 | * pointer, then walking the chain of vmas it holds. It returns the number | ||
| 332 | * of references it found. | ||
| 333 | * | ||
| 334 | * This function is only called from page_referenced for object-based pages. | ||
| 335 | */ | ||
| 336 | static int page_referenced_file(struct page *page, int ignore_token) | ||
| 337 | { | ||
| 338 | unsigned int mapcount; | ||
| 339 | struct address_space *mapping = page->mapping; | ||
| 340 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 341 | struct vm_area_struct *vma; | ||
| 342 | struct prio_tree_iter iter; | ||
| 343 | int referenced = 0; | ||
| 344 | |||
| 345 | /* | ||
| 346 | * The caller's checks on page->mapping and !PageAnon have made | ||
| 347 | * sure that this is a file page: the check for page->mapping | ||
| 348 | * excludes the case just before it gets set on an anon page. | ||
| 349 | */ | ||
| 350 | BUG_ON(PageAnon(page)); | ||
| 351 | |||
| 352 | /* | ||
| 353 | * The page lock not only makes sure that page->mapping cannot | ||
| 354 | * suddenly be NULLified by truncation, it makes sure that the | ||
| 355 | * structure at mapping cannot be freed and reused yet, | ||
| 356 | * so we can safely take mapping->i_mmap_lock. | ||
| 357 | */ | ||
| 358 | BUG_ON(!PageLocked(page)); | ||
| 359 | |||
| 360 | spin_lock(&mapping->i_mmap_lock); | ||
| 361 | |||
| 362 | /* | ||
| 363 | * i_mmap_lock does not stabilize mapcount at all, but mapcount | ||
| 364 | * is more likely to be accurate if we note it after spinning. | ||
| 365 | */ | ||
| 366 | mapcount = page_mapcount(page); | ||
| 367 | |||
| 368 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
| 369 | if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) /* vma is both locked and may-share: count it as referenced and stop */ | ||
| 370 | == (VM_LOCKED|VM_MAYSHARE)) { | ||
| 371 | referenced++; | ||
| 372 | break; | ||
| 373 | } | ||
| 374 | referenced += page_referenced_one(page, vma, &mapcount, | ||
| 375 | ignore_token); | ||
| 376 | if (!mapcount) /* every counted mapping has been visited */ | ||
| 377 | break; | ||
| 378 | } | ||
| 379 | |||
| 380 | spin_unlock(&mapping->i_mmap_lock); | ||
| 381 | return referenced; | ||
| 382 | } | ||
| 383 | |||
| 384 | /** | ||
| 385 | * page_referenced - test if the page was referenced | ||
| 386 | * @page: the page to test | ||
| 387 | * @is_locked: caller holds lock on the page | ||
| 388 | * @ignore_token: if non-zero, holding the swap token is not counted as a reference | ||
| 389 | * Quick test_and_clear_referenced for all mappings to a page, | ||
| 390 | * returns the number of references found (page flags plus ptes which referenced the page). | ||
| 391 | */ | ||
| 392 | int page_referenced(struct page *page, int is_locked, int ignore_token) | ||
| 393 | { | ||
| 394 | int referenced = 0; | ||
| 395 | |||
| 396 | if (!swap_token_default_timeout) | ||
| 397 | ignore_token = 1; /* zero timeout: behave as if the caller asked to ignore the token */ | ||
| 398 | |||
| 399 | if (page_test_and_clear_young(page)) | ||
| 400 | referenced++; | ||
| 401 | |||
| 402 | if (TestClearPageReferenced(page)) | ||
| 403 | referenced++; | ||
| 404 | |||
| 405 | if (page_mapped(page) && page->mapping) { | ||
| 406 | if (PageAnon(page)) | ||
| 407 | referenced += page_referenced_anon(page, ignore_token); | ||
| 408 | else if (is_locked) | ||
| 409 | referenced += page_referenced_file(page, ignore_token); | ||
| 410 | else if (TestSetPageLocked(page)) /* presumably already locked by someone else: just assume referenced */ | ||
| 411 | referenced++; | ||
| 412 | else { | ||
| 413 | if (page->mapping) /* recheck now that we hold the page lock */ | ||
| 414 | referenced += page_referenced_file(page, | ||
| 415 | ignore_token); | ||
| 416 | unlock_page(page); /* drop the lock taken by TestSetPageLocked above */ | ||
| 417 | } | ||
| 418 | } | ||
| 419 | return referenced; | ||
| 420 | } | ||
| 421 | |||
| 422 | /** | ||
| 423 | * page_add_anon_rmap - add pte mapping to an anonymous page | ||
| 424 | * @page: the page to add the mapping to | ||
| 425 | * @vma: the vm area in which the mapping is added | ||
| 426 | * @address: the user virtual address mapped | ||
| 427 | * | ||
| 428 | * The caller needs to hold the mm->page_table_lock. Also increments the mm's anon_rss counter. | ||
| 429 | */ | ||
| 430 | void page_add_anon_rmap(struct page *page, | ||
| 431 | struct vm_area_struct *vma, unsigned long address) | ||
| 432 | { | ||
| 433 | struct anon_vma *anon_vma = vma->anon_vma; | ||
| 434 | pgoff_t index; | ||
| 435 | |||
| 436 | BUG_ON(PageReserved(page)); | ||
| 437 | BUG_ON(!anon_vma); | ||
| 438 | |||
| 439 | inc_mm_counter(vma->vm_mm, anon_rss); | ||
| 440 | |||
| 441 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; /* tag the pointer; page_lock_anon_vma tests and strips this tag */ | ||
| 442 | index = (address - vma->vm_start) >> PAGE_SHIFT; /* page offset of address within the vma ... */ | ||
| 443 | index += vma->vm_pgoff; /* ... plus the vma's own starting offset */ | ||
| 444 | index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; /* rescaled to PAGE_CACHE_SIZE units, the inverse of vma_address() */ | ||
| 445 | |||
| 446 | if (atomic_inc_and_test(&page->_mapcount)) { /* presumably the first mapping of this page (_mapcount reaching zero) */ | ||
| 447 | page->index = index; | ||
| 448 | page->mapping = (struct address_space *) anon_vma; | ||
| 449 | inc_page_state(nr_mapped); | ||
| 450 | } | ||
| 451 | /* else checking page index and mapping is racy */ | ||
| 452 | } | ||
| 453 | |||
| 454 | /** | ||
| 455 | * page_add_file_rmap - account one more pte mapping of a file page | ||
| 456 | * @page: the file page that has just been mapped | ||
| 457 | * | ||
| 458 | * Must be called with the mm->page_table_lock held. | ||
| 459 | */ | ||
| 460 | void page_add_file_rmap(struct page *page) | ||
| 461 | { | ||
| 462 | BUG_ON(PageAnon(page)); | ||
| 463 | |||
| 464 | /* Invalid-pfn and reserved pages are skipped; otherwise bump _mapcount. */ | ||
| 465 | if (pfn_valid(page_to_pfn(page)) && !PageReserved(page) && | ||
| 466 | atomic_inc_and_test(&page->_mapcount)) | ||
| 467 | inc_page_state(nr_mapped); | ||
| 468 | } | ||
| 469 | |||
| 470 | /** | ||
| 471 | * page_remove_rmap - take down pte mapping from a page | ||
| 472 | * @page: page to remove mapping from | ||
| 473 | * | ||
| 474 | * Caller needs to hold the mm->page_table_lock. | ||
| 475 | */ | ||
| 476 | void page_remove_rmap(struct page *page) | ||
| 477 | { | ||
| 478 | BUG_ON(PageReserved(page)); | ||
| 479 | |||
| 480 | if (atomic_add_negative(-1, &page->_mapcount)) { /* presumably the last mapping has just gone (_mapcount went negative) */ | ||
| 481 | BUG_ON(page_mapcount(page) < 0); | ||
| 482 | /* | ||
| 483 | * It would be tidy to reset the PageAnon mapping here, | ||
| 484 | * but that might overwrite a racing page_add_anon_rmap | ||
| 485 | * which increments mapcount after us but sets mapping | ||
| 486 | * before us: so leave the reset to free_hot_cold_page, | ||
| 487 | * and remember that it's only reliable while mapped. | ||
| 488 | * Leaving it set also helps swapoff to reinstate ptes | ||
| 489 | * faster for those pages still in swapcache. | ||
| 490 | */ | ||
| 491 | if (page_test_and_clear_dirty(page)) /* carry a dirty indication over to the page itself */ | ||
| 492 | set_page_dirty(page); | ||
| 493 | dec_page_state(nr_mapped); /* pairs with inc_page_state(nr_mapped) in the page_add_*_rmap functions */ | ||
| 494 | } | ||
| 495 | } | ||
| 496 | |||
| 497 | /* | ||
| 498 | * Subfunctions of try_to_unmap: try_to_unmap_one is called for each vma by | ||
| 499 | * try_to_unmap_anon or try_to_unmap_file; returns SWAP_AGAIN, or SWAP_FAIL if the page must stay mapped. | ||
| 500 | */ | ||
| 501 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | ||
| 502 | { | ||
| 503 | struct mm_struct *mm = vma->vm_mm; | ||
| 504 | unsigned long address; | ||
| 505 | pgd_t *pgd; | ||
| 506 | pud_t *pud; | ||
| 507 | pmd_t *pmd; | ||
| 508 | pte_t *pte; | ||
| 509 | pte_t pteval; | ||
| 510 | int ret = SWAP_AGAIN; | ||
| 511 | |||
| 512 | if (!get_mm_counter(mm, rss)) /* rss counter is zero: skip the page table walk */ | ||
| 513 | goto out; | ||
| 514 | address = vma_address(page, vma); | ||
| 515 | if (address == -EFAULT) | ||
| 516 | goto out; | ||
| 517 | |||
| 518 | /* | ||
| 519 | * We need the page_table_lock to protect us from page faults, | ||
| 520 | * munmap, fork, etc... | ||
| 521 | */ | ||
| 522 | spin_lock(&mm->page_table_lock); | ||
| 523 | |||
| 524 | pgd = pgd_offset(mm, address); | ||
| 525 | if (!pgd_present(*pgd)) | ||
| 526 | goto out_unlock; | ||
| 527 | |||
| 528 | pud = pud_offset(pgd, address); | ||
| 529 | if (!pud_present(*pud)) | ||
| 530 | goto out_unlock; | ||
| 531 | |||
| 532 | pmd = pmd_offset(pud, address); | ||
| 533 | if (!pmd_present(*pmd)) | ||
| 534 | goto out_unlock; | ||
| 535 | |||
| 536 | pte = pte_offset_map(pmd, address); | ||
| 537 | if (!pte_present(*pte)) | ||
| 538 | goto out_unmap; | ||
| 539 | |||
| 540 | if (page_to_pfn(page) != pte_pfn(*pte)) /* the pte maps some other page */ | ||
| 541 | goto out_unmap; | ||
| 542 | |||
| 543 | /* | ||
| 544 | * If the page is mlock()d, we cannot swap it out. | ||
| 545 | * If it's recently referenced (perhaps page_referenced | ||
| 546 | * skipped over this mm) then we should reactivate it. | ||
| 547 | */ | ||
| 548 | if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || | ||
| 549 | ptep_clear_flush_young(vma, address, pte)) { | ||
| 550 | ret = SWAP_FAIL; | ||
| 551 | goto out_unmap; | ||
| 552 | } | ||
| 553 | |||
| 554 | /* | ||
| 555 | * Don't pull an anonymous page out from under get_user_pages. | ||
| 556 | * GUP carefully breaks COW and raises page count (while holding | ||
| 557 | * page_table_lock, as we have here) to make sure that the page | ||
| 558 | * cannot be freed. If we unmap that page here, a user write | ||
| 559 | * access to the virtual address will bring back the page, but | ||
| 560 | * its raised count will (ironically) be taken to mean it's not | ||
| 561 | * an exclusive swap page, do_wp_page will replace it by a copy | ||
| 562 | * page, and the user never gets to see the data GUP was holding | ||
| 563 | * the original page for. | ||
| 564 | * | ||
| 565 | * This test is also useful for when swapoff (unuse_process) has | ||
| 566 | * to drop page lock: its reference to the page stops existing | ||
| 567 | * ptes from being unmapped, so swapoff can make progress. | ||
| 568 | */ | ||
| 569 | if (PageSwapCache(page) && | ||
| 570 | page_count(page) != page_mapcount(page) + 2) { | ||
| 571 | ret = SWAP_FAIL; | ||
| 572 | goto out_unmap; | ||
| 573 | } | ||
| 574 | |||
| 575 | /* Nuke the page table entry. */ | ||
| 576 | flush_cache_page(vma, address, page_to_pfn(page)); | ||
| 577 | pteval = ptep_clear_flush(vma, address, pte); | ||
| 578 | |||
| 579 | /* Move the dirty bit to the physical page now the pte is gone. */ | ||
| 580 | if (pte_dirty(pteval)) | ||
| 581 | set_page_dirty(page); | ||
| 582 | |||
| 583 | if (PageAnon(page)) { | ||
| 584 | swp_entry_t entry = { .val = page->private }; | ||
| 585 | /* | ||
| 586 | * Store the swap location in the pte. | ||
| 587 | * See handle_pte_fault() ... | ||
| 588 | */ | ||
| 589 | BUG_ON(!PageSwapCache(page)); | ||
| 590 | swap_duplicate(entry); | ||
| 591 | if (list_empty(&mm->mmlist)) { | ||
| 592 | spin_lock(&mmlist_lock); | ||
| 593 | list_add(&mm->mmlist, &init_mm.mmlist); | ||
| 594 | spin_unlock(&mmlist_lock); | ||
| 595 | } | ||
| 596 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | ||
| 597 | BUG_ON(pte_file(*pte)); | ||
| 598 | dec_mm_counter(mm, anon_rss); | ||
| 599 | } | ||
| 600 | |||
| 601 | dec_mm_counter(mm, rss); /* the mapping is gone, so rss must drop, as try_to_unmap_cluster does */ | ||
| 602 | page_remove_rmap(page); | ||
| 603 | page_cache_release(page); /* drop the page reference this mapping held */ | ||
| 604 | |||
| 605 | out_unmap: | ||
| 606 | pte_unmap(pte); | ||
| 607 | out_unlock: | ||
| 608 | spin_unlock(&mm->page_table_lock); | ||
| 609 | out: | ||
| 610 | return ret; | ||
| 611 | } | ||
| 612 | |||
| 613 | /* | ||
| 614 | * objrmap doesn't work for nonlinear VMAs because the assumption that | ||
| 615 | * offset-into-file correlates with offset-into-virtual-addresses does not hold. | ||
| 616 | * Consequently, given a particular page and its ->index, we cannot locate the | ||
| 617 | * ptes which are mapping that page without an exhaustive linear search. | ||
| 618 | * | ||
| 619 | * So what this code does is a mini "virtual scan" of each nonlinear VMA which | ||
| 620 | * maps the file to which the target page belongs. The ->vm_private_data field | ||
| 621 | * holds the current cursor into that scan. Successive searches will circulate | ||
| 622 | * around the vma's virtual address space. | ||
| 623 | * | ||
| 624 | * So as more replacement pressure is applied to the pages in a nonlinear VMA, | ||
| 625 | * more scanning pressure is placed against them as well. Eventually pages | ||
| 626 | * will become fully unmapped and are eligible for eviction. | ||
| 627 | * | ||
| 628 | * For very sparsely populated VMAs this is a little inefficient - chances are | ||
| 629 | * there won't be many ptes located within the scan cluster. In this case | ||
| 630 | * maybe we could scan further - to the end of the pte page, perhaps. | ||
| 631 | */ | ||
| 632 | #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) /* bytes scanned per step: 32 pages, capped at PMD_SIZE */ | ||
| 633 | #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) /* rounds an address down to a cluster boundary */ | ||
| 634 | |||
| 635 | static void try_to_unmap_cluster(unsigned long cursor, | ||
| 636 | unsigned int *mapcount, struct vm_area_struct *vma) | ||
| 637 | { /* scans one CLUSTER_SIZE window of vma at byte offset cursor, unmapping present ptes that were not recently referenced */ | ||
| 638 | struct mm_struct *mm = vma->vm_mm; | ||
| 639 | pgd_t *pgd; | ||
| 640 | pud_t *pud; | ||
| 641 | pmd_t *pmd; | ||
| 642 | pte_t *pte; | ||
| 643 | pte_t pteval; | ||
| 644 | struct page *page; | ||
| 645 | unsigned long address; | ||
| 646 | unsigned long end; | ||
| 647 | unsigned long pfn; | ||
| 648 | |||
| 649 | /* | ||
| 650 | * We need the page_table_lock to protect us from page faults, | ||
| 651 | * munmap, fork, etc... | ||
| 652 | */ | ||
| 653 | spin_lock(&mm->page_table_lock); | ||
| 654 | |||
| 655 | address = (vma->vm_start + cursor) & CLUSTER_MASK; /* round down to the cluster boundary */ | ||
| 656 | end = address + CLUSTER_SIZE; | ||
| 657 | if (address < vma->vm_start) /* clamp the window to the vma at both ends */ | ||
| 658 | address = vma->vm_start; | ||
| 659 | if (end > vma->vm_end) | ||
| 660 | end = vma->vm_end; | ||
| 661 | |||
| 662 | pgd = pgd_offset(mm, address); | ||
| 663 | if (!pgd_present(*pgd)) | ||
| 664 | goto out_unlock; | ||
| 665 | |||
| 666 | pud = pud_offset(pgd, address); | ||
| 667 | if (!pud_present(*pud)) | ||
| 668 | goto out_unlock; | ||
| 669 | |||
| 670 | pmd = pmd_offset(pud, address); | ||
| 671 | if (!pmd_present(*pmd)) | ||
| 672 | goto out_unlock; | ||
| 673 | |||
| 674 | for (pte = pte_offset_map(pmd, address); | ||
| 675 | address < end; pte++, address += PAGE_SIZE) { | ||
| 676 | |||
| 677 | if (!pte_present(*pte)) | ||
| 678 | continue; | ||
| 679 | |||
| 680 | pfn = pte_pfn(*pte); | ||
| 681 | if (!pfn_valid(pfn)) | ||
| 682 | continue; | ||
| 683 | |||
| 684 | page = pfn_to_page(pfn); | ||
| 685 | BUG_ON(PageAnon(page)); /* an anon page is not expected in a nonlinear file vma */ | ||
| 686 | if (PageReserved(page)) | ||
| 687 | continue; | ||
| 688 | |||
| 689 | if (ptep_clear_flush_young(vma, address, pte)) /* recently referenced: leave it mapped */ | ||
| 690 | continue; | ||
| 691 | |||
| 692 | /* Nuke the page table entry. */ | ||
| 693 | flush_cache_page(vma, address, pfn); | ||
| 694 | pteval = ptep_clear_flush(vma, address, pte); | ||
| 695 | |||
| 696 | /* If nonlinear, store the file page offset in the pte. */ | ||
| 697 | if (page->index != linear_page_index(vma, address)) | ||
| 698 | set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); | ||
| 699 | |||
| 700 | /* Move the dirty bit to the physical page now the pte is gone. */ | ||
| 701 | if (pte_dirty(pteval)) | ||
| 702 | set_page_dirty(page); | ||
| 703 | |||
| 704 | page_remove_rmap(page); | ||
| 705 | page_cache_release(page); | ||
| 706 | dec_mm_counter(mm, rss); | ||
| 707 | (*mapcount)--; /* decremented for any page unmapped here, so the caller's count may go below zero */ | ||
| 708 | } | ||
| 709 | |||
| 710 | pte_unmap(pte); /* NOTE(review): pte has advanced past the last entry scanned - confirm pte_unmap tolerates that */ | ||
| 711 | |||
| 712 | out_unlock: | ||
| 713 | spin_unlock(&mm->page_table_lock); | ||
| 714 | } | ||
| 715 | |||
| 716 | static int try_to_unmap_anon(struct page *page) /* runs try_to_unmap_one() over the vmas on page's anon_vma list */ | ||
| 717 | { | ||
| 718 | struct anon_vma *anon_vma; | ||
| 719 | struct vm_area_struct *vma; | ||
| 720 | int ret = SWAP_AGAIN; | ||
| 721 | |||
| 722 | anon_vma = page_lock_anon_vma(page); | ||
| 723 | if (!anon_vma) /* not a mapped anon page: nothing to unmap */ | ||
| 724 | return ret; | ||
| 725 | |||
| 726 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
| 727 | ret = try_to_unmap_one(page, vma); | ||
| 728 | if (ret == SWAP_FAIL || !page_mapped(page)) /* stop on a hard failure or once no mapping is left */ | ||
| 729 | break; | ||
| 730 | } | ||
| 731 | spin_unlock(&anon_vma->lock); /* taken by page_lock_anon_vma() */ | ||
| 732 | return ret; | ||
| 733 | } | ||
| 734 | |||
| 735 | /** | ||
| 736 | * try_to_unmap_file - unmap file page using the object-based rmap method | ||
| 737 | * @page: the page to unmap | ||
| 738 | * | ||
| 739 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
| 740 | * contained in the address_space struct it points to. | ||
| 741 | * | ||
| 742 | * This function is only called from try_to_unmap for object-based pages. | ||
| 743 | */ | ||
| 744 | static int try_to_unmap_file(struct page *page) | ||
| 745 | { | ||
| 746 | struct address_space *mapping = page->mapping; | ||
| 747 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 748 | struct vm_area_struct *vma; | ||
| 749 | struct prio_tree_iter iter; | ||
| 750 | int ret = SWAP_AGAIN; | ||
| 751 | unsigned long cursor; | ||
| 752 | unsigned long max_nl_cursor = 0; /* furthest scan cursor among the nonlinear vmas */ | ||
| 753 | unsigned long max_nl_size = 0; /* size in bytes of the largest nonlinear vma */ | ||
| 754 | unsigned int mapcount; | ||
| 755 | |||
| 756 | spin_lock(&mapping->i_mmap_lock); | ||
| 757 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
| 758 | ret = try_to_unmap_one(page, vma); | ||
| 759 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
| 760 | goto out; | ||
| 761 | } | ||
| 762 | |||
| 763 | if (list_empty(&mapping->i_mmap_nonlinear)) | ||
| 764 | goto out; | ||
| 765 | |||
| 766 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | ||
| 767 | shared.vm_set.list) { | ||
| 768 | if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) | ||
| 769 | continue; | ||
| 770 | cursor = (unsigned long) vma->vm_private_data; | ||
| 771 | if (cursor > max_nl_cursor) | ||
| 772 | max_nl_cursor = cursor; | ||
| 773 | cursor = vma->vm_end - vma->vm_start; /* cursor reused to hold this vma's size */ | ||
| 774 | if (cursor > max_nl_size) | ||
| 775 | max_nl_size = cursor; | ||
| 776 | } | ||
| 777 | |||
| 778 | if (max_nl_size == 0) { /* all nonlinear vmas are locked or reserved */ | ||
| 779 | ret = SWAP_FAIL; | ||
| 780 | goto out; | ||
| 781 | } | ||
| 782 | |||
| 783 | /* | ||
| 784 | * We don't try to search for this page in the nonlinear vmas, | ||
| 785 | * and page_referenced wouldn't have found it anyway. Instead | ||
| 786 | * just walk the nonlinear vmas trying to age and unmap some. | ||
| 787 | * The mapcount of the page we came in with is irrelevant, | ||
| 788 | * but even so use it as a guide to how hard we should try? | ||
| 789 | */ | ||
| 790 | mapcount = page_mapcount(page); | ||
| 791 | if (!mapcount) | ||
| 792 | goto out; | ||
| 793 | cond_resched_lock(&mapping->i_mmap_lock); | ||
| 794 | |||
| 795 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; /* round up to a whole number of clusters */ | ||
| 796 | if (max_nl_cursor == 0) | ||
| 797 | max_nl_cursor = CLUSTER_SIZE; | ||
| 798 | |||
| 799 | do { | ||
| 800 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | ||
| 801 | shared.vm_set.list) { | ||
| 802 | if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) | ||
| 803 | continue; | ||
| 804 | cursor = (unsigned long) vma->vm_private_data; | ||
| 805 | while (get_mm_counter(vma->vm_mm, rss) && | ||
| 806 | cursor < max_nl_cursor && | ||
| 807 | cursor < vma->vm_end - vma->vm_start) { | ||
| 808 | try_to_unmap_cluster(cursor, &mapcount, vma); | ||
| 809 | cursor += CLUSTER_SIZE; | ||
| 810 | vma->vm_private_data = (void *) cursor; /* remember how far this vma has been scanned */ | ||
| 811 | if ((int)mapcount <= 0) /* signed test: the unsigned count may have been decremented below zero */ | ||
| 812 | goto out; | ||
| 813 | } | ||
| 814 | vma->vm_private_data = (void *) max_nl_cursor; | ||
| 815 | } | ||
| 816 | cond_resched_lock(&mapping->i_mmap_lock); | ||
| 817 | max_nl_cursor += CLUSTER_SIZE; | ||
| 818 | } while (max_nl_cursor <= max_nl_size); | ||
| 819 | |||
| 820 | /* | ||
| 821 | * Don't loop forever (perhaps all the remaining pages are | ||
| 822 | * in locked vmas). Reset cursor on all unreserved nonlinear | ||
| 823 | * vmas, now forgetting on which ones it had fallen behind. | ||
| 824 | */ | ||
| 825 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | ||
| 826 | shared.vm_set.list) { | ||
| 827 | if (!(vma->vm_flags & VM_RESERVED)) | ||
| 828 | vma->vm_private_data = NULL; | ||
| 829 | } | ||
| 830 | out: | ||
| 831 | spin_unlock(&mapping->i_mmap_lock); | ||
| 832 | return ret; | ||
| 833 | } | ||
| 834 | |||
| 835 | /** | ||
| 836 | * try_to_unmap - strip every page table mapping from a page | ||
| 837 | * @page: the page whose mappings should be removed | ||
| 838 | * | ||
| 839 | * Used on the pageout path to drop all the page table entries that map | ||
| 840 | * @page. The caller must hold the page lock (checked with BUG_ON). | ||
| 841 | * Possible results: | ||
| 842 | * | ||
| 843 | * SWAP_SUCCESS - no mapping of the page remains | ||
| 844 | * SWAP_AGAIN - a mapping was missed, retry later | ||
| 845 | * SWAP_FAIL - the page cannot be swapped out | ||
| 846 | */ | ||
| 847 | int try_to_unmap(struct page *page) | ||
| 848 | { | ||
| 849 | int status; | ||
| 850 | |||
| 851 | BUG_ON(PageReserved(page)); | ||
| 852 | BUG_ON(!PageLocked(page)); | ||
| 853 | |||
| 854 | status = PageAnon(page) ? try_to_unmap_anon(page) | ||
| 855 | : try_to_unmap_file(page); | ||
| 856 | |||
| 857 | /* Whatever the walk reported, an unmapped page means success. */ | ||
| 858 | if (!page_mapped(page)) | ||
| 859 | return SWAP_SUCCESS; | ||
| 860 | |||
| 861 | return status; | ||
| 862 | } | ||
diff --git a/mm/shmem.c b/mm/shmem.c new file mode 100644 index 000000000000..61574b81d979 --- /dev/null +++ b/mm/shmem.c | |||
| @@ -0,0 +1,2326 @@ | |||
| 1 | /* | ||
| 2 | * Resizable virtual memory filesystem for Linux. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2000 Linus Torvalds. | ||
| 5 | * 2000 Transmeta Corp. | ||
| 6 | * 2000-2001 Christoph Rohland | ||
| 7 | * 2000-2001 SAP AG | ||
| 8 | * 2002 Red Hat Inc. | ||
| 9 | * Copyright (C) 2002-2004 Hugh Dickins. | ||
| 10 | * Copyright (C) 2002-2004 VERITAS Software Corporation. | ||
| 11 | * Copyright (C) 2004 Andi Kleen, SuSE Labs | ||
| 12 | * | ||
| 13 | * Extended attribute support for tmpfs: | ||
| 14 | * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> | ||
| 15 | * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> | ||
| 16 | * | ||
| 17 | * This file is released under the GPL. | ||
| 18 | */ | ||
| 19 | |||
| 20 | /* | ||
| 21 | * This virtual memory filesystem is heavily based on the ramfs. It | ||
| 22 | * extends ramfs by the ability to use swap and honor resource limits | ||
| 23 | * which makes it a completely usable filesystem. | ||
| 24 | */ | ||
| 25 | |||
| 26 | #include <linux/config.h> | ||
| 27 | #include <linux/module.h> | ||
| 28 | #include <linux/init.h> | ||
| 29 | #include <linux/devfs_fs_kernel.h> | ||
| 30 | #include <linux/fs.h> | ||
| 31 | #include <linux/mm.h> | ||
| 32 | #include <linux/mman.h> | ||
| 33 | #include <linux/file.h> | ||
| 34 | #include <linux/swap.h> | ||
| 35 | #include <linux/pagemap.h> | ||
| 36 | #include <linux/string.h> | ||
| 37 | #include <linux/slab.h> | ||
| 38 | #include <linux/backing-dev.h> | ||
| 39 | #include <linux/shmem_fs.h> | ||
| 40 | #include <linux/mount.h> | ||
| 41 | #include <linux/writeback.h> | ||
| 42 | #include <linux/vfs.h> | ||
| 43 | #include <linux/blkdev.h> | ||
| 44 | #include <linux/security.h> | ||
| 45 | #include <linux/swapops.h> | ||
| 46 | #include <linux/mempolicy.h> | ||
| 47 | #include <linux/namei.h> | ||
| 48 | #include <linux/xattr.h> | ||
| 49 | #include <asm/uaccess.h> | ||
| 50 | #include <asm/div64.h> | ||
| 51 | #include <asm/pgtable.h> | ||
| 52 | |||
| 53 | /* This magic number is used in glibc for posix shared memory */ | ||
| 54 | #define TMPFS_MAGIC 0x01021994 | ||
| 55 | |||
| 56 | #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) /* unsigned-long-sized slots in one page cache page */ | ||
| 57 | #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) /* slots reachable through a page of pages */ | ||
| 58 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) /* 512-byte blocks in one page cache page */ | ||
| 59 | |||
| 60 | #define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) /* page index limit derived from the entry counts above */ | ||
| 61 | #define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) /* SHMEM_MAX_INDEX pages, in bytes */ | ||
| 62 | |||
| 63 | #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) /* size rounded up to a page cache page, expressed in PAGE_SIZE pages */ | ||
| 64 | |||
| 65 | /* info->flags needs VM_flags to handle pagein/truncate races efficiently */ | ||
| 66 | #define SHMEM_PAGEIN VM_READ | ||
| 67 | #define SHMEM_TRUNCATE VM_WRITE | ||
| 68 | |||
| 69 | /* Definition to limit shmem_truncate's steps between cond_rescheds */ | ||
| 70 | #define LATENCY_LIMIT 64 | ||
| 71 | |||
| 72 | /* Pretend that each entry is of this size in directory's i_size */ | ||
| 73 | #define BOGO_DIRENT_SIZE 20 | ||
| 74 | |||
| 75 | /* Keep swapped page count in private field of indirect struct page */ | ||
| 76 | #define nr_swapped private | ||
/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
enum sgp_type {
	SGP_QUICK,	/* don't try more than file page cache lookup */
	SGP_READ,	/* don't exceed i_size, don't allocate page */
	SGP_CACHE,	/* don't exceed i_size, may allocate page */
	SGP_WRITE,	/* may exceed i_size, may allocate page */
};

/* Forward declaration: used by shmem_notify_change before the definition */
static int shmem_getpage(struct inode *inode, unsigned long idx,
			 struct page **pagep, enum sgp_type sgp, int *type);
| 88 | |||
| 89 | static inline struct page *shmem_dir_alloc(unsigned int gfp_mask) | ||
| 90 | { | ||
| 91 | /* | ||
| 92 | * The above definition of ENTRIES_PER_PAGE, and the use of | ||
| 93 | * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: | ||
| 94 | * might be reconsidered if it ever diverges from PAGE_SIZE. | ||
| 95 | */ | ||
| 96 | return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT); | ||
| 97 | } | ||
| 98 | |||
| 99 | static inline void shmem_dir_free(struct page *page) | ||
| 100 | { | ||
| 101 | __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); | ||
| 102 | } | ||
| 103 | |||
| 104 | static struct page **shmem_dir_map(struct page *page) | ||
| 105 | { | ||
| 106 | return (struct page **)kmap_atomic(page, KM_USER0); | ||
| 107 | } | ||
| 108 | |||
| 109 | static inline void shmem_dir_unmap(struct page **dir) | ||
| 110 | { | ||
| 111 | kunmap_atomic(dir, KM_USER0); | ||
| 112 | } | ||
| 113 | |||
| 114 | static swp_entry_t *shmem_swp_map(struct page *page) | ||
| 115 | { | ||
| 116 | return (swp_entry_t *)kmap_atomic(page, KM_USER1); | ||
| 117 | } | ||
| 118 | |||
| 119 | static inline void shmem_swp_balance_unmap(void) | ||
| 120 | { | ||
| 121 | /* | ||
| 122 | * When passing a pointer to an i_direct entry, to code which | ||
| 123 | * also handles indirect entries and so will shmem_swp_unmap, | ||
| 124 | * we must arrange for the preempt count to remain in balance. | ||
| 125 | * What kmap_atomic of a lowmem page does depends on config | ||
| 126 | * and architecture, so pretend to kmap_atomic some lowmem page. | ||
| 127 | */ | ||
| 128 | (void) kmap_atomic(ZERO_PAGE(0), KM_USER1); | ||
| 129 | } | ||
| 130 | |||
| 131 | static inline void shmem_swp_unmap(swp_entry_t *entry) | ||
| 132 | { | ||
| 133 | kunmap_atomic(entry, KM_USER1); | ||
| 134 | } | ||
| 135 | |||
| 136 | static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) | ||
| 137 | { | ||
| 138 | return sb->s_fs_info; | ||
| 139 | } | ||
| 140 | |||
| 141 | /* | ||
| 142 | * shmem_file_setup pre-accounts the whole fixed size of a VM object, | ||
| 143 | * for shared memory and for shared anonymous (/dev/zero) mappings | ||
| 144 | * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), | ||
| 145 | * consistent with the pre-accounting of private mappings ... | ||
| 146 | */ | ||
| 147 | static inline int shmem_acct_size(unsigned long flags, loff_t size) | ||
| 148 | { | ||
| 149 | return (flags & VM_ACCOUNT)? | ||
| 150 | security_vm_enough_memory(VM_ACCT(size)): 0; | ||
| 151 | } | ||
| 152 | |||
| 153 | static inline void shmem_unacct_size(unsigned long flags, loff_t size) | ||
| 154 | { | ||
| 155 | if (flags & VM_ACCOUNT) | ||
| 156 | vm_unacct_memory(VM_ACCT(size)); | ||
| 157 | } | ||
| 158 | |||
| 159 | /* | ||
| 160 | * ... whereas tmpfs objects are accounted incrementally as | ||
| 161 | * pages are allocated, in order to allow huge sparse files. | ||
| 162 | * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, | ||
| 163 | * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. | ||
| 164 | */ | ||
| 165 | static inline int shmem_acct_block(unsigned long flags) | ||
| 166 | { | ||
| 167 | return (flags & VM_ACCOUNT)? | ||
| 168 | 0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE)); | ||
| 169 | } | ||
| 170 | |||
| 171 | static inline void shmem_unacct_blocks(unsigned long flags, long pages) | ||
| 172 | { | ||
| 173 | if (!(flags & VM_ACCOUNT)) | ||
| 174 | vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); | ||
| 175 | } | ||
| 176 | |||
/* Operation tables: declared here, defined later in the file */
static struct super_operations shmem_ops;
static struct address_space_operations shmem_aops;
static struct file_operations shmem_file_operations;
static struct inode_operations shmem_inode_operations;
static struct inode_operations shmem_dir_inode_operations;
static struct inode_operations shmem_special_inode_operations;
static struct vm_operations_struct shmem_vm_ops;

/* Backing device description shared by shmem mappings */
static struct backing_dev_info shmem_backing_dev_info = {
	.ra_pages	= 0,	/* No readahead */
	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
	.unplug_io_fn	= default_unplug_io_fn,
};

/*
 * List of shmem inodes (linked through info->swaplist) that
 * shmem_writepage has moved pages to swap for; shmem_unuse walks it.
 * Protected by shmem_swaplist_lock.
 */
static LIST_HEAD(shmem_swaplist);
static DEFINE_SPINLOCK(shmem_swaplist_lock);
| 193 | |||
| 194 | static void shmem_free_blocks(struct inode *inode, long pages) | ||
| 195 | { | ||
| 196 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
| 197 | if (sbinfo) { | ||
| 198 | spin_lock(&sbinfo->stat_lock); | ||
| 199 | sbinfo->free_blocks += pages; | ||
| 200 | inode->i_blocks -= pages*BLOCKS_PER_PAGE; | ||
| 201 | spin_unlock(&sbinfo->stat_lock); | ||
| 202 | } | ||
| 203 | } | ||
| 204 | |||
| 205 | /* | ||
| 206 | * shmem_recalc_inode - recalculate the size of an inode | ||
| 207 | * | ||
| 208 | * @inode: inode to recalc | ||
| 209 | * | ||
| 210 | * We have to calculate the free blocks since the mm can drop | ||
| 211 | * undirtied hole pages behind our back. | ||
| 212 | * | ||
| 213 | * But normally info->alloced == inode->i_mapping->nrpages + info->swapped | ||
| 214 | * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) | ||
| 215 | * | ||
| 216 | * It has to be called with the spinlock held. | ||
| 217 | */ | ||
| 218 | static void shmem_recalc_inode(struct inode *inode) | ||
| 219 | { | ||
| 220 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
| 221 | long freed; | ||
| 222 | |||
| 223 | freed = info->alloced - info->swapped - inode->i_mapping->nrpages; | ||
| 224 | if (freed > 0) { | ||
| 225 | info->alloced -= freed; | ||
| 226 | shmem_unacct_blocks(info->flags, freed); | ||
| 227 | shmem_free_blocks(inode, freed); | ||
| 228 | } | ||
| 229 | } | ||
| 230 | |||
/*
 * shmem_swp_entry - find the swap vector position in the info structure
 *
 * @info:  info structure for the inode
 * @index: index of the page to find
 * @page:  optional page to add to the structure. Has to be preset to
 *         all zeros
 *
 * If there is no space allocated yet it will return NULL when
 * page is NULL, else it will use the page for the needed block,
 * setting it to NULL on return to indicate that it has been used.
 * Note that NULL is also returned after the supplied page has been
 * consumed for a directory level: the caller is expected to retry,
 * offering another page.
 *
 * A non-NULL return leaves a KM_USER1 atomic mapping (or its balancing
 * stand-in for direct entries) in place, to be dropped by the caller
 * with shmem_swp_unmap.
 *
 * The swap vector is organized the following way:
 *
 * There are SHMEM_NR_DIRECT entries directly stored in the
 * shmem_inode_info structure. So small files do not need an additional
 * allocation.
 *
 * For pages with index >= SHMEM_NR_DIRECT there is the pointer
 * i_indirect which points to a page which holds in the first half
 * doubly indirect blocks, in the second half triple indirect blocks:
 *
 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
 * following layout (for SHMEM_NR_DIRECT == 16):
 *
 * i_indirect -> dir --> 16-19
 *            |      +-> 20-23
 *            |
 *            +-->dir2 --> 24-27
 *            |        +-> 28-31
 *            |        +-> 32-35
 *            |        +-> 36-39
 *            |
 *            +-->dir3 --> 40-43
 *                     +-> 44-47
 *                     +-> 48-51
 *                     +-> 52-55
 */
static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
{
	unsigned long offset;
	struct page **dir;
	struct page *subdir;

	/* Direct entry: no mapping needed, but keep kmap accounting balanced */
	if (index < SHMEM_NR_DIRECT) {
		shmem_swp_balance_unmap();
		return info->i_direct+index;
	}
	/* No top-level directory yet: install the caller's page if offered */
	if (!info->i_indirect) {
		if (page) {
			info->i_indirect = *page;
			*page = NULL;
		}
		return NULL;			/* need another page */
	}

	/* offset: slot within the leaf page; index: which leaf page */
	index -= SHMEM_NR_DIRECT;
	offset = index % ENTRIES_PER_PAGE;
	index /= ENTRIES_PER_PAGE;
	dir = shmem_dir_map(info->i_indirect);

	/* Second half of the top directory: descend through a middle directory */
	if (index >= ENTRIES_PER_PAGE/2) {
		index -= ENTRIES_PER_PAGE/2;
		dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
		index %= ENTRIES_PER_PAGE;
		subdir = *dir;
		if (!subdir) {
			if (page) {
				*dir = *page;
				*page = NULL;
			}
			shmem_dir_unmap(dir);
			return NULL;		/* need another page */
		}
		shmem_dir_unmap(dir);
		dir = shmem_dir_map(subdir);
	}

	/* dir now maps the directory whose slot 'index' holds the leaf page */
	dir += index;
	subdir = *dir;
	if (!subdir) {
		if (!page || !(subdir = *page)) {
			shmem_dir_unmap(dir);
			return NULL;		/* need a page */
		}
		*dir = subdir;
		*page = NULL;
	}
	shmem_dir_unmap(dir);
	return shmem_swp_map(subdir) + offset;
}
| 322 | |||
| 323 | static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) | ||
| 324 | { | ||
| 325 | long incdec = value? 1: -1; | ||
| 326 | |||
| 327 | entry->val = value; | ||
| 328 | info->swapped += incdec; | ||
| 329 | if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) | ||
| 330 | kmap_atomic_to_page(entry)->nr_swapped += incdec; | ||
| 331 | } | ||
| 332 | |||
/*
 * shmem_swp_alloc - get the position of the swap entry for the page.
 *                   If it does not exist allocate the entry.
 *
 * @info:	info structure for the inode
 * @index:	index of the page to find
 * @sgp:	check and recheck i_size? skip allocation?
 *
 * Returns a mapped swap entry pointer (to be released by the caller
 * with shmem_swp_unmap), or an ERR_PTR: -EINVAL when @index lies at or
 * beyond i_size and @sgp is not SGP_WRITE, -ENOSPC when the superblock
 * has no spare block, -ENOMEM when an index page cannot be allocated.
 *
 * info->lock is dropped and retaken around each index page allocation,
 * so this is presumably entered with info->lock held - confirm against
 * the callers.
 */
static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
{
	struct inode *inode = &info->vfs_inode;
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct page *page = NULL;
	swp_entry_t *entry;

	if (sgp != SGP_WRITE &&
	    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
		return ERR_PTR(-EINVAL);

	/* Each pass offers shmem_swp_entry one fresh page for a missing level */
	while (!(entry = shmem_swp_entry(info, index, &page))) {
		/* Readers never allocate: hand back an entry in the zero page */
		if (sgp == SGP_READ)
			return shmem_swp_map(ZERO_PAGE(0));
		/*
		 * Test free_blocks against 1 not 0, since we have 1 data
		 * page (and perhaps indirect index pages) yet to allocate:
		 * a waste to allocate index if we cannot allocate data.
		 */
		if (sbinfo) {
			spin_lock(&sbinfo->stat_lock);
			if (sbinfo->free_blocks <= 1) {
				spin_unlock(&sbinfo->stat_lock);
				return ERR_PTR(-ENOSPC);
			}
			sbinfo->free_blocks--;
			inode->i_blocks += BLOCKS_PER_PAGE;
			spin_unlock(&sbinfo->stat_lock);
		}

		/* Allocate the index page with info->lock released */
		spin_unlock(&info->lock);
		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
		if (page) {
			page->nr_swapped = 0;
		}
		spin_lock(&info->lock);

		if (!page) {
			shmem_free_blocks(inode, 1);
			return ERR_PTR(-ENOMEM);
		}
		/* Recheck i_size: it may have changed while the lock was dropped */
		if (sgp != SGP_WRITE &&
		    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
			entry = ERR_PTR(-EINVAL);
			break;
		}
		if (info->next_index <= index)
			info->next_index = index + 1;
	}
	if (page) {
		/* another task gave its page, or truncated the file */
		shmem_free_blocks(inode, 1);
		shmem_dir_free(page);
	}
	if (info->next_index <= index && !IS_ERR(entry))
		info->next_index = index + 1;
	return entry;
}
| 399 | |||
| 400 | /* | ||
| 401 | * shmem_free_swp - free some swap entries in a directory | ||
| 402 | * | ||
| 403 | * @dir: pointer to the directory | ||
| 404 | * @edir: pointer after last entry of the directory | ||
| 405 | */ | ||
| 406 | static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir) | ||
| 407 | { | ||
| 408 | swp_entry_t *ptr; | ||
| 409 | int freed = 0; | ||
| 410 | |||
| 411 | for (ptr = dir; ptr < edir; ptr++) { | ||
| 412 | if (ptr->val) { | ||
| 413 | free_swap_and_cache(*ptr); | ||
| 414 | *ptr = (swp_entry_t){0}; | ||
| 415 | freed++; | ||
| 416 | } | ||
| 417 | } | ||
| 418 | return freed; | ||
| 419 | } | ||
| 420 | |||
| 421 | static int shmem_map_and_free_swp(struct page *subdir, | ||
| 422 | int offset, int limit, struct page ***dir) | ||
| 423 | { | ||
| 424 | swp_entry_t *ptr; | ||
| 425 | int freed = 0; | ||
| 426 | |||
| 427 | ptr = shmem_swp_map(subdir); | ||
| 428 | for (; offset < limit; offset += LATENCY_LIMIT) { | ||
| 429 | int size = limit - offset; | ||
| 430 | if (size > LATENCY_LIMIT) | ||
| 431 | size = LATENCY_LIMIT; | ||
| 432 | freed += shmem_free_swp(ptr+offset, ptr+offset+size); | ||
| 433 | if (need_resched()) { | ||
| 434 | shmem_swp_unmap(ptr); | ||
| 435 | if (*dir) { | ||
| 436 | shmem_dir_unmap(*dir); | ||
| 437 | *dir = NULL; | ||
| 438 | } | ||
| 439 | cond_resched(); | ||
| 440 | ptr = shmem_swp_map(subdir); | ||
| 441 | } | ||
| 442 | } | ||
| 443 | shmem_swp_unmap(ptr); | ||
| 444 | return freed; | ||
| 445 | } | ||
| 446 | |||
| 447 | static void shmem_free_pages(struct list_head *next) | ||
| 448 | { | ||
| 449 | struct page *page; | ||
| 450 | int freed = 0; | ||
| 451 | |||
| 452 | do { | ||
| 453 | page = container_of(next, struct page, lru); | ||
| 454 | next = next->next; | ||
| 455 | shmem_dir_free(page); | ||
| 456 | freed++; | ||
| 457 | if (freed >= LATENCY_LIMIT) { | ||
| 458 | cond_resched(); | ||
| 459 | freed = 0; | ||
| 460 | } | ||
| 461 | } while (next); | ||
| 462 | } | ||
| 463 | |||
/*
 * shmem_truncate - drop swap entries and index pages beyond i_size
 *
 * Frees the swap entries for page indices from i_size (rounded up to a
 * page) to the old info->next_index, unhooks every swap vector index
 * page that becomes wholly unused, and finally corrects the block
 * accounting.  SHMEM_TRUNCATE is set in info->flags while this runs.
 */
static void shmem_truncate(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long idx;
	unsigned long size;
	unsigned long limit;
	unsigned long stage;
	unsigned long diroff;
	struct page **dir;
	struct page *topdir;
	struct page *middir;
	struct page *subdir;
	swp_entry_t *ptr;
	LIST_HEAD(pages_to_free);
	long nr_pages_to_free = 0;
	long nr_swaps_freed = 0;
	int offset;
	int freed;

	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	/* First page index lying wholly beyond the new size */
	idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (idx >= info->next_index)
		return;

	spin_lock(&info->lock);
	info->flags |= SHMEM_TRUNCATE;
	limit = info->next_index;
	info->next_index = idx;
	topdir = info->i_indirect;
	/* Nothing indirect survives: detach the top directory right away */
	if (topdir && idx <= SHMEM_NR_DIRECT) {
		info->i_indirect = NULL;
		nr_pages_to_free++;
		list_add(&topdir->lru, &pages_to_free);
	}
	spin_unlock(&info->lock);

	/* Free the affected direct entries */
	if (info->swapped && idx < SHMEM_NR_DIRECT) {
		ptr = info->i_direct;
		size = limit;
		if (size > SHMEM_NR_DIRECT)
			size = SHMEM_NR_DIRECT;
		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
	}
	if (!topdir)
		goto done2;

	/* From here on idx and limit are relative to the indirect area */
	BUG_ON(limit <= SHMEM_NR_DIRECT);
	limit -= SHMEM_NR_DIRECT;
	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
	offset = idx % ENTRIES_PER_PAGE;
	idx -= offset;

	/* Position dir/diroff on the directory slot covering idx */
	dir = shmem_dir_map(topdir);
	stage = ENTRIES_PER_PAGEPAGE/2;
	if (idx < ENTRIES_PER_PAGEPAGE/2) {
		middir = topdir;
		diroff = idx/ENTRIES_PER_PAGE;
	} else {
		dir += ENTRIES_PER_PAGE/2;
		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
		while (stage <= idx)
			stage += ENTRIES_PER_PAGEPAGE;
		middir = *dir;
		if (*dir) {
			diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
			/* Cut falls on the middle directory's first slot: free it whole */
			if (!diroff && !offset) {
				*dir = NULL;
				nr_pages_to_free++;
				list_add(&middir->lru, &pages_to_free);
			}
			shmem_dir_unmap(dir);
			dir = shmem_dir_map(middir);
		} else {
			/* No middle directory here: resume at the next stage boundary */
			diroff = 0;
			offset = 0;
			idx = stage;
		}
	}

	/* Walk one leaf page of swap entries per iteration */
	for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
		if (unlikely(idx == stage)) {
			/* Crossed into the next middle directory: find the next present one */
			shmem_dir_unmap(dir);
			dir = shmem_dir_map(topdir) +
			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
			while (!*dir) {
				dir++;
				idx += ENTRIES_PER_PAGEPAGE;
				if (idx >= limit)
					goto done1;
			}
			stage = idx + ENTRIES_PER_PAGEPAGE;
			middir = *dir;
			*dir = NULL;
			nr_pages_to_free++;
			list_add(&middir->lru, &pages_to_free);
			shmem_dir_unmap(dir);
			cond_resched();
			dir = shmem_dir_map(middir);
			diroff = 0;
		}
		subdir = dir[diroff];
		if (subdir && subdir->nr_swapped) {
			size = limit - idx;
			if (size > ENTRIES_PER_PAGE)
				size = ENTRIES_PER_PAGE;
			freed = shmem_map_and_free_swp(subdir,
						offset, size, &dir);
			/* The helper drops our directory mapping if it rescheduled */
			if (!dir)
				dir = shmem_dir_map(middir);
			nr_swaps_freed += freed;
			/* Lock only for a partially kept page, which stays in the vector */
			if (offset)
				spin_lock(&info->lock);
			subdir->nr_swapped -= freed;
			if (offset)
				spin_unlock(&info->lock);
			/* Only the 'offset' entries kept before the cut may remain */
			BUG_ON(subdir->nr_swapped > offset);
		}
		if (offset)
			offset = 0;
		else if (subdir) {
			/* Whole leaf page is beyond the cut: unhook it for freeing */
			dir[diroff] = NULL;
			nr_pages_to_free++;
			list_add(&subdir->lru, &pages_to_free);
		}
	}
done1:
	shmem_dir_unmap(dir);
done2:
	if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
		/*
		 * Call truncate_inode_pages again: racing shmem_unuse_inode
		 * may have swizzled a page in from swap since vmtruncate or
		 * generic_delete_inode did it, before we lowered next_index.
		 * Also, though shmem_getpage checks i_size before adding to
		 * cache, no recheck after: so fix the narrow window there too.
		 */
		truncate_inode_pages(inode->i_mapping, inode->i_size);
	}

	/* Publish the results and fix up the accounting under info->lock */
	spin_lock(&info->lock);
	info->flags &= ~SHMEM_TRUNCATE;
	info->swapped -= nr_swaps_freed;
	if (nr_pages_to_free)
		shmem_free_blocks(inode, nr_pages_to_free);
	shmem_recalc_inode(inode);
	spin_unlock(&info->lock);

	/*
	 * Empty swap vector directory pages to be freed?
	 */
	if (!list_empty(&pages_to_free)) {
		/* NULL-terminate the chain as shmem_free_pages expects */
		pages_to_free.prev->next = NULL;
		shmem_free_pages(pages_to_free.next);
	}
}
| 620 | |||
| 621 | static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | ||
| 622 | { | ||
| 623 | struct inode *inode = dentry->d_inode; | ||
| 624 | struct page *page = NULL; | ||
| 625 | int error; | ||
| 626 | |||
| 627 | if (attr->ia_valid & ATTR_SIZE) { | ||
| 628 | if (attr->ia_size < inode->i_size) { | ||
| 629 | /* | ||
| 630 | * If truncating down to a partial page, then | ||
| 631 | * if that page is already allocated, hold it | ||
| 632 | * in memory until the truncation is over, so | ||
| 633 | * truncate_partial_page cannnot miss it were | ||
| 634 | * it assigned to swap. | ||
| 635 | */ | ||
| 636 | if (attr->ia_size & (PAGE_CACHE_SIZE-1)) { | ||
| 637 | (void) shmem_getpage(inode, | ||
| 638 | attr->ia_size>>PAGE_CACHE_SHIFT, | ||
| 639 | &page, SGP_READ, NULL); | ||
| 640 | } | ||
| 641 | /* | ||
| 642 | * Reset SHMEM_PAGEIN flag so that shmem_truncate can | ||
| 643 | * detect if any pages might have been added to cache | ||
| 644 | * after truncate_inode_pages. But we needn't bother | ||
| 645 | * if it's being fully truncated to zero-length: the | ||
| 646 | * nrpages check is efficient enough in that case. | ||
| 647 | */ | ||
| 648 | if (attr->ia_size) { | ||
| 649 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
| 650 | spin_lock(&info->lock); | ||
| 651 | info->flags &= ~SHMEM_PAGEIN; | ||
| 652 | spin_unlock(&info->lock); | ||
| 653 | } | ||
| 654 | } | ||
| 655 | } | ||
| 656 | |||
| 657 | error = inode_change_ok(inode, attr); | ||
| 658 | if (!error) | ||
| 659 | error = inode_setattr(inode, attr); | ||
| 660 | if (page) | ||
| 661 | page_cache_release(page); | ||
| 662 | return error; | ||
| 663 | } | ||
| 664 | |||
| 665 | static void shmem_delete_inode(struct inode *inode) | ||
| 666 | { | ||
| 667 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
| 668 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
| 669 | |||
| 670 | if (inode->i_op->truncate == shmem_truncate) { | ||
| 671 | shmem_unacct_size(info->flags, inode->i_size); | ||
| 672 | inode->i_size = 0; | ||
| 673 | shmem_truncate(inode); | ||
| 674 | if (!list_empty(&info->swaplist)) { | ||
| 675 | spin_lock(&shmem_swaplist_lock); | ||
| 676 | list_del_init(&info->swaplist); | ||
| 677 | spin_unlock(&shmem_swaplist_lock); | ||
| 678 | } | ||
| 679 | } | ||
| 680 | if (sbinfo) { | ||
| 681 | BUG_ON(inode->i_blocks); | ||
| 682 | spin_lock(&sbinfo->stat_lock); | ||
| 683 | sbinfo->free_inodes++; | ||
| 684 | spin_unlock(&sbinfo->stat_lock); | ||
| 685 | } | ||
| 686 | clear_inode(inode); | ||
| 687 | } | ||
| 688 | |||
| 689 | static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) | ||
| 690 | { | ||
| 691 | swp_entry_t *ptr; | ||
| 692 | |||
| 693 | for (ptr = dir; ptr < edir; ptr++) { | ||
| 694 | if (ptr->val == entry.val) | ||
| 695 | return ptr - dir; | ||
| 696 | } | ||
| 697 | return -1; | ||
| 698 | } | ||
| 699 | |||
/*
 * Search one inode's swap vector for @entry.  When it is found, try to
 * move @page from the swap cache into the inode's page cache at the
 * matching index, clearing the swap vector slot on success.
 *
 * Returns 1 if the entry was found in this inode (the swap count is
 * dropped even when the page could not be moved), 0 if not found.
 */
static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
{
	struct inode *inode;
	unsigned long idx;
	unsigned long size;
	unsigned long limit;
	unsigned long stage;
	struct page **dir;
	struct page *subdir;
	swp_entry_t *ptr;
	int offset;

	/* First look through the direct entries */
	idx = 0;
	ptr = info->i_direct;
	spin_lock(&info->lock);
	limit = info->next_index;
	size = limit;
	if (size > SHMEM_NR_DIRECT)
		size = SHMEM_NR_DIRECT;
	offset = shmem_find_swp(entry, ptr, ptr+size);
	if (offset >= 0) {
		/* balances the shmem_swp_unmap done at 'found' */
		shmem_swp_balance_unmap();
		goto found;
	}
	if (!info->i_indirect)
		goto lost2;

	dir = shmem_dir_map(info->i_indirect);
	stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;

	/* Then walk the indirect area, one leaf page of entries per step */
	for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
		if (unlikely(idx == stage)) {
			/* Step into the next present middle directory */
			shmem_dir_unmap(dir-1);
			dir = shmem_dir_map(info->i_indirect) +
			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
			while (!*dir) {
				dir++;
				idx += ENTRIES_PER_PAGEPAGE;
				if (idx >= limit)
					goto lost1;
			}
			stage = idx + ENTRIES_PER_PAGEPAGE;
			subdir = *dir;
			shmem_dir_unmap(dir);
			dir = shmem_dir_map(subdir);
		}
		subdir = *dir;
		/* Only leaf pages that currently hold swap entries are searched */
		if (subdir && subdir->nr_swapped) {
			ptr = shmem_swp_map(subdir);
			size = limit - idx;
			if (size > ENTRIES_PER_PAGE)
				size = ENTRIES_PER_PAGE;
			offset = shmem_find_swp(entry, ptr, ptr+size);
			if (offset >= 0) {
				shmem_dir_unmap(dir);
				goto found;
			}
			shmem_swp_unmap(ptr);
		}
	}
lost1:
	shmem_dir_unmap(dir-1);
lost2:
	spin_unlock(&info->lock);
	return 0;
found:
	/* ptr maps the page (or i_direct) holding the entry at 'offset' */
	idx += offset;
	inode = &info->vfs_inode;
	if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
		info->flags |= SHMEM_PAGEIN;
		shmem_swp_set(info, ptr + offset, 0);
	}
	shmem_swp_unmap(ptr);
	spin_unlock(&info->lock);
	/*
	 * Decrement swap count even when the entry is left behind:
	 * try_to_unuse will skip over mms, then reincrement count.
	 */
	swap_free(entry);
	return 1;
}
| 781 | |||
| 782 | /* | ||
| 783 | * shmem_unuse() search for an eventually swapped out shmem page. | ||
| 784 | */ | ||
| 785 | int shmem_unuse(swp_entry_t entry, struct page *page) | ||
| 786 | { | ||
| 787 | struct list_head *p, *next; | ||
| 788 | struct shmem_inode_info *info; | ||
| 789 | int found = 0; | ||
| 790 | |||
| 791 | spin_lock(&shmem_swaplist_lock); | ||
| 792 | list_for_each_safe(p, next, &shmem_swaplist) { | ||
| 793 | info = list_entry(p, struct shmem_inode_info, swaplist); | ||
| 794 | if (!info->swapped) | ||
| 795 | list_del_init(&info->swaplist); | ||
| 796 | else if (shmem_unuse_inode(info, entry, page)) { | ||
| 797 | /* move head to start search for next from here */ | ||
| 798 | list_move_tail(&shmem_swaplist, &info->swaplist); | ||
| 799 | found = 1; | ||
| 800 | break; | ||
| 801 | } | ||
| 802 | } | ||
| 803 | spin_unlock(&shmem_swaplist_lock); | ||
| 804 | return found; | ||
| 805 | } | ||
| 806 | |||
/*
 * Move the page from the page cache to the swap cache.
 *
 * On success the swap entry is recorded in the inode's swap vector, the
 * inode is put on shmem_swaplist if not already there, the page is
 * unlocked and 0 is returned.  Otherwise (inode flagged VM_LOCKED, no
 * swap page available, page beyond next_index during a truncate, or
 * move_to_swap_cache failing) the page is redirtied and
 * WRITEPAGE_ACTIVATE is returned with the page still locked.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct shmem_inode_info *info;
	swp_entry_t *entry, swap;
	struct address_space *mapping;
	unsigned long index;
	struct inode *inode;

	BUG_ON(!PageLocked(page));
	BUG_ON(page_mapped(page));

	mapping = page->mapping;
	index = page->index;
	inode = mapping->host;
	info = SHMEM_I(inode);
	if (info->flags & VM_LOCKED)
		goto redirty;
	swap = get_swap_page();
	if (!swap.val)
		goto redirty;

	spin_lock(&info->lock);
	shmem_recalc_inode(inode);
	if (index >= info->next_index) {
		/* Only a truncate in progress may leave a page beyond next_index */
		BUG_ON(!(info->flags & SHMEM_TRUNCATE));
		goto unlock;
	}
	/* The slot must already exist and must not yet hold a swap entry */
	entry = shmem_swp_entry(info, index, NULL);
	BUG_ON(!entry);
	BUG_ON(entry->val);

	if (move_to_swap_cache(page, swap) == 0) {
		shmem_swp_set(info, entry, swap.val);
		shmem_swp_unmap(entry);
		spin_unlock(&info->lock);
		if (list_empty(&info->swaplist)) {
			spin_lock(&shmem_swaplist_lock);
			/* move instead of add in case we're racing */
			list_move_tail(&info->swaplist, &shmem_swaplist);
			spin_unlock(&shmem_swaplist_lock);
		}
		unlock_page(page);
		return 0;
	}

	/* Failure unwind: each label undoes one more step */
	shmem_swp_unmap(entry);
unlock:
	spin_unlock(&info->lock);
	swap_free(swap);
redirty:
	set_page_dirty(page);
	return WRITEPAGE_ACTIVATE;	/* Return with the page locked */
}
| 863 | |||
| 864 | #ifdef CONFIG_NUMA | ||
| 865 | static struct page *shmem_swapin_async(struct shared_policy *p, | ||
| 866 | swp_entry_t entry, unsigned long idx) | ||
| 867 | { | ||
| 868 | struct page *page; | ||
| 869 | struct vm_area_struct pvma; | ||
| 870 | |||
| 871 | /* Create a pseudo vma that just contains the policy */ | ||
| 872 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | ||
| 873 | pvma.vm_end = PAGE_SIZE; | ||
| 874 | pvma.vm_pgoff = idx; | ||
| 875 | pvma.vm_policy = mpol_shared_policy_lookup(p, idx); | ||
| 876 | page = read_swap_cache_async(entry, &pvma, 0); | ||
| 877 | mpol_free(pvma.vm_policy); | ||
| 878 | return page; | ||
| 879 | } | ||
| 880 | |||
| 881 | struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry, | ||
| 882 | unsigned long idx) | ||
| 883 | { | ||
| 884 | struct shared_policy *p = &info->policy; | ||
| 885 | int i, num; | ||
| 886 | struct page *page; | ||
| 887 | unsigned long offset; | ||
| 888 | |||
| 889 | num = valid_swaphandles(entry, &offset); | ||
| 890 | for (i = 0; i < num; offset++, i++) { | ||
| 891 | page = shmem_swapin_async(p, | ||
| 892 | swp_entry(swp_type(entry), offset), idx); | ||
| 893 | if (!page) | ||
| 894 | break; | ||
| 895 | page_cache_release(page); | ||
| 896 | } | ||
| 897 | lru_add_drain(); /* Push any new pages onto the LRU now */ | ||
| 898 | return shmem_swapin_async(p, entry, idx); | ||
| 899 | } | ||
| 900 | |||
| 901 | static struct page * | ||
| 902 | shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info, | ||
| 903 | unsigned long idx) | ||
| 904 | { | ||
| 905 | struct vm_area_struct pvma; | ||
| 906 | struct page *page; | ||
| 907 | |||
| 908 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | ||
| 909 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); | ||
| 910 | pvma.vm_pgoff = idx; | ||
| 911 | pvma.vm_end = PAGE_SIZE; | ||
| 912 | page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0); | ||
| 913 | mpol_free(pvma.vm_policy); | ||
| 914 | return page; | ||
| 915 | } | ||
| 916 | #else | ||
| 917 | static inline struct page * | ||
| 918 | shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx) | ||
| 919 | { | ||
| 920 | swapin_readahead(entry, 0, NULL); | ||
| 921 | return read_swap_cache_async(entry, NULL, 0); | ||
| 922 | } | ||
| 923 | |||
| 924 | static inline struct page * | ||
| 925 | shmem_alloc_page(unsigned int __nocast gfp,struct shmem_inode_info *info, | ||
| 926 | unsigned long idx) | ||
| 927 | { | ||
| 928 | return alloc_page(gfp | __GFP_ZERO); | ||
| 929 | } | ||
| 930 | #endif | ||
| 931 | |||
| 932 | /* | ||
| 933 | * shmem_getpage - either get the page from swap or allocate a new one | ||
| 934 | * | ||
| 935 | * If we allocate a new one we do not mark it dirty. That's up to the | ||
| 936 | * vm. If we swap it in we mark it dirty since we also free the swap | ||
| 937 | * entry since a page cannot live in both the swap and page cache | ||
| | * | ||
| | * On success returns 0 and stores the page in *pagep.  A page that was | ||
| | * looked up or allocated here is unlocked before returning; a page that | ||
| | * the caller passed in through *pagep is not unlocked here.  *pagep can | ||
| | * stay NULL with a 0 return: for SGP_QUICK when no uptodate page was | ||
| | * found in the page cache, and for SGP_READ when there is neither a | ||
| | * page nor a swap entry at idx. | ||
| | * | ||
| | * Errors produced in this function: -EFBIG (idx >= SHMEM_MAX_INDEX), | ||
| | * -EIO (swap page found not uptodate), -ENOSPC (block accounting | ||
| | * refused), -ENOMEM (page allocation or swapin failed), or the error | ||
| | * returned by shmem_swp_alloc().  If type is non-NULL and holds | ||
| | * VM_FAULT_MINOR, it is raised to VM_FAULT_MAJOR before swap I/O. | ||
| 938 | */ | ||
| 939 | static int shmem_getpage(struct inode *inode, unsigned long idx, | ||
| 940 | struct page **pagep, enum sgp_type sgp, int *type) | ||
| 941 | { | ||
| 942 | struct address_space *mapping = inode->i_mapping; | ||
| 943 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
| 944 | struct shmem_sb_info *sbinfo; | ||
| 945 | struct page *filepage = *pagep; | ||
| 946 | struct page *swappage; | ||
| 947 | swp_entry_t *entry; | ||
| 948 | swp_entry_t swap; | ||
| 949 | int error; | ||
| 950 | |||
| 951 | if (idx >= SHMEM_MAX_INDEX) | ||
| 952 | return -EFBIG; | ||
| 953 | /* | ||
| 954 | * Normally, filepage is NULL on entry, and either found | ||
| 955 | * uptodate immediately, or allocated and zeroed, or read | ||
| 956 | * in under swappage, which is then assigned to filepage. | ||
| 957 | * But shmem_prepare_write passes in a locked filepage, | ||
| 958 | * which may be found not uptodate by other callers too, | ||
| 959 | * and may need to be copied from the swappage read in. | ||
| 960 | */ | ||
| 961 | repeat: | ||
| 962 | if (!filepage) | ||
| 963 | filepage = find_lock_page(mapping, idx); | ||
| 964 | if (filepage && PageUptodate(filepage)) | ||
| 965 | goto done; | ||
| 966 | error = 0; /* SGP_QUICK leaves through "failed" below with error still 0 */ | ||
| 967 | if (sgp == SGP_QUICK) | ||
| 968 | goto failed; | ||
| 969 | |||
| 970 | spin_lock(&info->lock); /* held while the swap entry for idx is looked up and examined */ | ||
| 971 | shmem_recalc_inode(inode); | ||
| 972 | entry = shmem_swp_alloc(info, idx, sgp); | ||
| 973 | if (IS_ERR(entry)) { | ||
| 974 | spin_unlock(&info->lock); | ||
| 975 | error = PTR_ERR(entry); | ||
| 976 | goto failed; | ||
| 977 | } | ||
| 978 | swap = *entry; | ||
| 979 | |||
| 980 | if (swap.val) { | ||
| 981 | /* Look it up and read it in.. */ | ||
| 982 | swappage = lookup_swap_cache(swap); | ||
| 983 | if (!swappage) { | ||
| 984 | shmem_swp_unmap(entry); | ||
| 985 | spin_unlock(&info->lock); /* lock is dropped before the swap read */ | ||
| 986 | /* here we actually do the io */ | ||
| 987 | if (type && *type == VM_FAULT_MINOR) { | ||
| 988 | inc_page_state(pgmajfault); | ||
| 989 | *type = VM_FAULT_MAJOR; | ||
| 990 | } | ||
| 991 | swappage = shmem_swapin(info, swap, idx); | ||
| 992 | if (!swappage) { | ||
| 993 | spin_lock(&info->lock); | ||
| 994 | entry = shmem_swp_alloc(info, idx, sgp); | ||
| 995 | if (IS_ERR(entry)) | ||
| 996 | error = PTR_ERR(entry); | ||
| 997 | else { | ||
| 998 | if (entry->val == swap.val) /* entry unchanged: report the failed swapin as -ENOMEM */ | ||
| 999 | error = -ENOMEM; | ||
| 1000 | shmem_swp_unmap(entry); | ||
| 1001 | } | ||
| 1002 | spin_unlock(&info->lock); | ||
| 1003 | if (error) | ||
| 1004 | goto failed; | ||
| 1005 | goto repeat; /* entry changed meanwhile: start over */ | ||
| 1006 | } | ||
| 1007 | wait_on_page_locked(swappage); | ||
| 1008 | page_cache_release(swappage); | ||
| 1009 | goto repeat; /* retry from the top now that the page has been unlocked */ | ||
| 1010 | } | ||
| 1011 | |||
| 1012 | /* We have to do this with page locked to prevent races */ | ||
| 1013 | if (TestSetPageLocked(swappage)) { | ||
| 1014 | shmem_swp_unmap(entry); | ||
| 1015 | spin_unlock(&info->lock); | ||
| 1016 | wait_on_page_locked(swappage); | ||
| 1017 | page_cache_release(swappage); | ||
| 1018 | goto repeat; | ||
| 1019 | } | ||
| 1020 | if (PageWriteback(swappage)) { | ||
| 1021 | shmem_swp_unmap(entry); | ||
| 1022 | spin_unlock(&info->lock); | ||
| 1023 | wait_on_page_writeback(swappage); | ||
| 1024 | unlock_page(swappage); | ||
| 1025 | page_cache_release(swappage); | ||
| 1026 | goto repeat; | ||
| 1027 | } | ||
| 1028 | if (!PageUptodate(swappage)) { | ||
| 1029 | shmem_swp_unmap(entry); | ||
| 1030 | spin_unlock(&info->lock); | ||
| 1031 | unlock_page(swappage); | ||
| 1032 | page_cache_release(swappage); | ||
| 1033 | error = -EIO; | ||
| 1034 | goto failed; | ||
| 1035 | } | ||
| 1036 | |||
| 1037 | if (filepage) { /* a filepage already exists: copy the swap page's contents into it */ | ||
| 1038 | shmem_swp_set(info, entry, 0); | ||
| 1039 | shmem_swp_unmap(entry); | ||
| 1040 | delete_from_swap_cache(swappage); | ||
| 1041 | spin_unlock(&info->lock); | ||
| 1042 | copy_highpage(filepage, swappage); | ||
| 1043 | unlock_page(swappage); | ||
| 1044 | page_cache_release(swappage); | ||
| 1045 | flush_dcache_page(filepage); | ||
| 1046 | SetPageUptodate(filepage); | ||
| 1047 | set_page_dirty(filepage); | ||
| 1048 | swap_free(swap); | ||
| 1049 | } else if (!(error = move_from_swap_cache( /* no filepage: the swap page itself becomes the filepage */ | ||
| 1050 | swappage, idx, mapping))) { | ||
| 1051 | info->flags |= SHMEM_PAGEIN; | ||
| 1052 | shmem_swp_set(info, entry, 0); | ||
| 1053 | shmem_swp_unmap(entry); | ||
| 1054 | spin_unlock(&info->lock); | ||
| 1055 | filepage = swappage; | ||
| 1056 | swap_free(swap); | ||
| 1057 | } else { /* move_from_swap_cache() failed: release everything and retry */ | ||
| 1058 | shmem_swp_unmap(entry); | ||
| 1059 | spin_unlock(&info->lock); | ||
| 1060 | unlock_page(swappage); | ||
| 1061 | page_cache_release(swappage); | ||
| 1062 | if (error == -ENOMEM) { | ||
| 1063 | /* let kswapd refresh zone for GFP_ATOMICs */ | ||
| 1064 | blk_congestion_wait(WRITE, HZ/50); | ||
| 1065 | } | ||
| 1066 | goto repeat; | ||
| 1067 | } | ||
| 1068 | } else if (sgp == SGP_READ && !filepage) { /* no swap entry and read-only request: nothing is allocated */ | ||
| 1069 | shmem_swp_unmap(entry); | ||
| 1070 | filepage = find_get_page(mapping, idx); | ||
| 1071 | if (filepage && | ||
| 1072 | (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { | ||
| 1073 | spin_unlock(&info->lock); | ||
| 1074 | wait_on_page_locked(filepage); | ||
| 1075 | page_cache_release(filepage); | ||
| 1076 | filepage = NULL; | ||
| 1077 | goto repeat; | ||
| 1078 | } | ||
| 1079 | spin_unlock(&info->lock); | ||
| 1080 | } else { /* no swap entry: account one block and set up the page */ | ||
| 1081 | shmem_swp_unmap(entry); | ||
| 1082 | sbinfo = SHMEM_SB(inode->i_sb); | ||
| 1083 | if (sbinfo) { | ||
| 1084 | spin_lock(&sbinfo->stat_lock); | ||
| 1085 | if (sbinfo->free_blocks == 0 || | ||
| 1086 | shmem_acct_block(info->flags)) { | ||
| 1087 | spin_unlock(&sbinfo->stat_lock); | ||
| 1088 | spin_unlock(&info->lock); | ||
| 1089 | error = -ENOSPC; | ||
| 1090 | goto failed; | ||
| 1091 | } | ||
| 1092 | sbinfo->free_blocks--; | ||
| 1093 | inode->i_blocks += BLOCKS_PER_PAGE; | ||
| 1094 | spin_unlock(&sbinfo->stat_lock); | ||
| 1095 | } else if (shmem_acct_block(info->flags)) { | ||
| 1096 | spin_unlock(&info->lock); | ||
| 1097 | error = -ENOSPC; | ||
| 1098 | goto failed; | ||
| 1099 | } | ||
| 1100 | |||
| 1101 | if (!filepage) { | ||
| 1102 | spin_unlock(&info->lock); /* info->lock is not held across the page allocation */ | ||
| 1103 | filepage = shmem_alloc_page(mapping_gfp_mask(mapping), | ||
| 1104 | info, | ||
| 1105 | idx); | ||
| 1106 | if (!filepage) { | ||
| 1107 | shmem_unacct_blocks(info->flags, 1); /* undo the accounting done above */ | ||
| 1108 | shmem_free_blocks(inode, 1); | ||
| 1109 | error = -ENOMEM; | ||
| 1110 | goto failed; | ||
| 1111 | } | ||
| 1112 | |||
| 1113 | spin_lock(&info->lock); | ||
| 1114 | entry = shmem_swp_alloc(info, idx, sgp); | ||
| 1115 | if (IS_ERR(entry)) | ||
| 1116 | error = PTR_ERR(entry); | ||
| 1117 | else { | ||
| 1118 | swap = *entry; | ||
| 1119 | shmem_swp_unmap(entry); | ||
| 1120 | } | ||
| 1121 | if (error || swap.val || 0 != add_to_page_cache_lru( /* re-check after retaking the lock; on any of these, back out */ | ||
| 1122 | filepage, mapping, idx, GFP_ATOMIC)) { | ||
| 1123 | spin_unlock(&info->lock); | ||
| 1124 | page_cache_release(filepage); | ||
| 1125 | shmem_unacct_blocks(info->flags, 1); | ||
| 1126 | shmem_free_blocks(inode, 1); | ||
| 1127 | filepage = NULL; | ||
| 1128 | if (error) | ||
| 1129 | goto failed; | ||
| 1130 | goto repeat; | ||
| 1131 | } | ||
| 1132 | info->flags |= SHMEM_PAGEIN; | ||
| 1133 | } | ||
| 1134 | |||
| 1135 | info->alloced++; | ||
| 1136 | spin_unlock(&info->lock); | ||
| 1137 | flush_dcache_page(filepage); | ||
| 1138 | SetPageUptodate(filepage); | ||
| 1139 | } | ||
| 1140 | done: | ||
| 1141 | if (*pagep != filepage) { /* page was obtained here rather than passed in: unlock and hand it back */ | ||
| 1142 | unlock_page(filepage); | ||
| 1143 | *pagep = filepage; | ||
| 1144 | } | ||
| 1145 | return 0; | ||
| 1146 | |||
| 1147 | failed: | ||
| 1148 | if (*pagep != filepage) { /* drop a page obtained here; a caller-supplied page is left alone */ | ||
| 1149 | unlock_page(filepage); | ||
| 1150 | page_cache_release(filepage); | ||
| 1151 | } | ||
| 1152 | return error; | ||
| 1153 | } | ||
| 1154 | |||
| 1155 | struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) | ||
| 1156 | { | ||
| 1157 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | ||
| 1158 | struct page *page = NULL; | ||
| 1159 | unsigned long idx; | ||
| 1160 | int error; | ||
| 1161 | |||
| 1162 | idx = (address - vma->vm_start) >> PAGE_SHIFT; | ||
| 1163 | idx += vma->vm_pgoff; | ||
| 1164 | idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; | ||
| 1165 | if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
| 1166 | return NOPAGE_SIGBUS; | ||
| 1167 | |||
| 1168 | error = shmem_getpage(inode, idx, &page, SGP_CACHE, type); | ||
| 1169 | if (error) | ||
| 1170 | return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS; | ||
| 1171 | |||
| 1172 | mark_page_accessed(page); | ||
| 1173 | return page; | ||
| 1174 | } | ||
| 1175 | |||
| 1176 | static int shmem_populate(struct vm_area_struct *vma, | ||
| 1177 | unsigned long addr, unsigned long len, | ||
| 1178 | pgprot_t prot, unsigned long pgoff, int nonblock) | ||
| 1179 | { | ||
| 1180 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | ||
| 1181 | struct mm_struct *mm = vma->vm_mm; | ||
| 1182 | enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; | ||
| 1183 | unsigned long size; | ||
| 1184 | |||
| 1185 | size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 1186 | if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size) | ||
| 1187 | return -EINVAL; | ||
| 1188 | |||
| 1189 | while ((long) len > 0) { | ||
| 1190 | struct page *page = NULL; | ||
| 1191 | int err; | ||
| 1192 | /* | ||
| 1193 | * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE | ||
| 1194 | */ | ||
| 1195 | err = shmem_getpage(inode, pgoff, &page, sgp, NULL); | ||
| 1196 | if (err) | ||
| 1197 | return err; | ||
| 1198 | if (page) { | ||
| 1199 | mark_page_accessed(page); | ||
| 1200 | err = install_page(mm, vma, addr, page, prot); | ||
| 1201 | if (err) { | ||
| 1202 | page_cache_release(page); | ||
| 1203 | return err; | ||
| 1204 | } | ||
| 1205 | } else if (nonblock) { | ||
| 1206 | err = install_file_pte(mm, vma, addr, pgoff, prot); | ||
| 1207 | if (err) | ||
| 1208 | return err; | ||
| 1209 | } | ||
| 1210 | |||
| 1211 | len -= PAGE_SIZE; | ||
| 1212 | addr += PAGE_SIZE; | ||
| 1213 | pgoff++; | ||
| 1214 | } | ||
| 1215 | return 0; | ||
| 1216 | } | ||
| 1217 | |||
| 1218 | #ifdef CONFIG_NUMA | ||
| 1219 | int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | ||
| 1220 | { | ||
| 1221 | struct inode *i = vma->vm_file->f_dentry->d_inode; | ||
| 1222 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); | ||
| 1223 | } | ||
| 1224 | |||
| 1225 | struct mempolicy * | ||
| 1226 | shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) | ||
| 1227 | { | ||
| 1228 | struct inode *i = vma->vm_file->f_dentry->d_inode; | ||
| 1229 | unsigned long idx; | ||
| 1230 | |||
| 1231 | idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
| 1232 | return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); | ||
| 1233 | } | ||
| 1234 | #endif | ||
| 1235 | |||
| 1236 | int shmem_lock(struct file *file, int lock, struct user_struct *user) | ||
| 1237 | { | ||
| 1238 | struct inode *inode = file->f_dentry->d_inode; | ||
| 1239 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
| 1240 | int retval = -ENOMEM; | ||
| 1241 | |||
| 1242 | spin_lock(&info->lock); | ||
| 1243 | if (lock && !(info->flags & VM_LOCKED)) { | ||
| 1244 | if (!user_shm_lock(inode->i_size, user)) | ||
| 1245 | goto out_nomem; | ||
| 1246 | info->flags |= VM_LOCKED; | ||
| 1247 | } | ||
| 1248 | if (!lock && (info->flags & VM_LOCKED) && user) { | ||
| 1249 | user_shm_unlock(inode->i_size, user); | ||
| 1250 | info->flags &= ~VM_LOCKED; | ||
| 1251 | } | ||
| 1252 | retval = 0; | ||
| 1253 | out_nomem: | ||
| 1254 | spin_unlock(&info->lock); | ||
| 1255 | return retval; | ||
| 1256 | } | ||
| 1257 | |||
| 1258 | static int shmem_mmap(struct file *file, struct vm_area_struct *vma) | ||
| 1259 | { | ||
| 1260 | file_accessed(file); | ||
| 1261 | vma->vm_ops = &shmem_vm_ops; | ||
| 1262 | return 0; | ||
| 1263 | } | ||
| 1264 | |||
| 1265 | static struct inode * | ||
| 1266 | shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | ||
| 1267 | { | ||
| 1268 | struct inode *inode; | ||
| 1269 | struct shmem_inode_info *info; | ||
| 1270 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | ||
| 1271 | |||
| 1272 | if (sbinfo) { | ||
| 1273 | spin_lock(&sbinfo->stat_lock); | ||
| 1274 | if (!sbinfo->free_inodes) { | ||
| 1275 | spin_unlock(&sbinfo->stat_lock); | ||
| 1276 | return NULL; | ||
| 1277 | } | ||
| 1278 | sbinfo->free_inodes--; | ||
| 1279 | spin_unlock(&sbinfo->stat_lock); | ||
| 1280 | } | ||
| 1281 | |||
| 1282 | inode = new_inode(sb); | ||
| 1283 | if (inode) { | ||
| 1284 | inode->i_mode = mode; | ||
| 1285 | inode->i_uid = current->fsuid; | ||
| 1286 | inode->i_gid = current->fsgid; | ||
| 1287 | inode->i_blksize = PAGE_CACHE_SIZE; | ||
| 1288 | inode->i_blocks = 0; | ||
| 1289 | inode->i_mapping->a_ops = &shmem_aops; | ||
| 1290 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; | ||
| 1291 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
| 1292 | info = SHMEM_I(inode); | ||
| 1293 | memset(info, 0, (char *)inode - (char *)info); | ||
| 1294 | spin_lock_init(&info->lock); | ||
| 1295 | INIT_LIST_HEAD(&info->swaplist); | ||
| 1296 | |||
| 1297 | switch (mode & S_IFMT) { | ||
| 1298 | default: | ||
| 1299 | inode->i_op = &shmem_special_inode_operations; | ||
| 1300 | init_special_inode(inode, mode, dev); | ||
| 1301 | break; | ||
| 1302 | case S_IFREG: | ||
| 1303 | inode->i_op = &shmem_inode_operations; | ||
| 1304 | inode->i_fop = &shmem_file_operations; | ||
| 1305 | mpol_shared_policy_init(&info->policy); | ||
| 1306 | break; | ||
| 1307 | case S_IFDIR: | ||
| 1308 | inode->i_nlink++; | ||
| 1309 | /* Some things misbehave if size == 0 on a directory */ | ||
| 1310 | inode->i_size = 2 * BOGO_DIRENT_SIZE; | ||
| 1311 | inode->i_op = &shmem_dir_inode_operations; | ||
| 1312 | inode->i_fop = &simple_dir_operations; | ||
| 1313 | break; | ||
| 1314 | case S_IFLNK: | ||
| 1315 | /* | ||
| 1316 | * Must not load anything in the rbtree, | ||
| 1317 | * mpol_free_shared_policy will not be called. | ||
| 1318 | */ | ||
| 1319 | mpol_shared_policy_init(&info->policy); | ||
| 1320 | break; | ||
| 1321 | } | ||
| 1322 | } else if (sbinfo) { | ||
| 1323 | spin_lock(&sbinfo->stat_lock); | ||
| 1324 | sbinfo->free_inodes++; | ||
| 1325 | spin_unlock(&sbinfo->stat_lock); | ||
| 1326 | } | ||
| 1327 | return inode; | ||
| 1328 | } | ||
| 1329 | |||
| 1330 | #ifdef CONFIG_TMPFS | ||
| 1331 | |||
| 1332 | static int shmem_set_size(struct shmem_sb_info *sbinfo, | ||
| 1333 | unsigned long max_blocks, unsigned long max_inodes) | ||
| 1334 | { | ||
| 1335 | int error; | ||
| 1336 | unsigned long blocks, inodes; | ||
| 1337 | |||
| 1338 | spin_lock(&sbinfo->stat_lock); | ||
| 1339 | blocks = sbinfo->max_blocks - sbinfo->free_blocks; | ||
| 1340 | inodes = sbinfo->max_inodes - sbinfo->free_inodes; | ||
| 1341 | error = -EINVAL; | ||
| 1342 | if (max_blocks < blocks) | ||
| 1343 | goto out; | ||
| 1344 | if (max_inodes < inodes) | ||
| 1345 | goto out; | ||
| 1346 | error = 0; | ||
| 1347 | sbinfo->max_blocks = max_blocks; | ||
| 1348 | sbinfo->free_blocks = max_blocks - blocks; | ||
| 1349 | sbinfo->max_inodes = max_inodes; | ||
| 1350 | sbinfo->free_inodes = max_inodes - inodes; | ||
| 1351 | out: | ||
| 1352 | spin_unlock(&sbinfo->stat_lock); | ||
| 1353 | return error; | ||
| 1354 | } | ||
| 1355 | |||
| 1356 | static struct inode_operations shmem_symlink_inode_operations; | ||
| 1357 | static struct inode_operations shmem_symlink_inline_operations; | ||
| 1358 | |||
| 1359 | /* | ||
| 1360 | * Normally tmpfs makes no use of shmem_prepare_write, but it | ||
| 1361 | * lets a tmpfs file be used read-write below the loop driver. | ||
| 1362 | */ | ||
| 1363 | static int | ||
| 1364 | shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) | ||
| 1365 | { | ||
| 1366 | struct inode *inode = page->mapping->host; | ||
| 1367 | return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL); | ||
| 1368 | } | ||
| 1369 | |||
| 1370 | static ssize_t | ||
| 1371 | shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) | ||
| 1372 | { | ||
| 1373 | struct inode *inode = file->f_dentry->d_inode; | ||
| 1374 | loff_t pos; | ||
| 1375 | unsigned long written; | ||
| 1376 | ssize_t err; | ||
| 1377 | |||
| 1378 | if ((ssize_t) count < 0) | ||
| 1379 | return -EINVAL; | ||
| 1380 | |||
| 1381 | if (!access_ok(VERIFY_READ, buf, count)) | ||
| 1382 | return -EFAULT; | ||
| 1383 | |||
| 1384 | down(&inode->i_sem); | ||
| 1385 | |||
| 1386 | pos = *ppos; | ||
| 1387 | written = 0; | ||
| 1388 | |||
| 1389 | err = generic_write_checks(file, &pos, &count, 0); | ||
| 1390 | if (err || !count) | ||
| 1391 | goto out; | ||
| 1392 | |||
| 1393 | err = remove_suid(file->f_dentry); | ||
| 1394 | if (err) | ||
| 1395 | goto out; | ||
| 1396 | |||
| 1397 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | ||
| 1398 | |||
| 1399 | do { | ||
| 1400 | struct page *page = NULL; | ||
| 1401 | unsigned long bytes, index, offset; | ||
| 1402 | char *kaddr; | ||
| 1403 | int left; | ||
| 1404 | |||
| 1405 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | ||
| 1406 | index = pos >> PAGE_CACHE_SHIFT; | ||
| 1407 | bytes = PAGE_CACHE_SIZE - offset; | ||
| 1408 | if (bytes > count) | ||
| 1409 | bytes = count; | ||
| 1410 | |||
| 1411 | /* | ||
| 1412 | * We don't hold page lock across copy from user - | ||
| 1413 | * what would it guard against? - so no deadlock here. | ||
| 1414 | * But it still may be a good idea to prefault below. | ||
| 1415 | */ | ||
| 1416 | |||
| 1417 | err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL); | ||
| 1418 | if (err) | ||
| 1419 | break; | ||
| 1420 | |||
| 1421 | left = bytes; | ||
| 1422 | if (PageHighMem(page)) { | ||
| 1423 | volatile unsigned char dummy; | ||
| 1424 | __get_user(dummy, buf); | ||
| 1425 | __get_user(dummy, buf + bytes - 1); | ||
| 1426 | |||
| 1427 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 1428 | left = __copy_from_user_inatomic(kaddr + offset, | ||
| 1429 | buf, bytes); | ||
| 1430 | kunmap_atomic(kaddr, KM_USER0); | ||
| 1431 | } | ||
| 1432 | if (left) { | ||
| 1433 | kaddr = kmap(page); | ||
| 1434 | left = __copy_from_user(kaddr + offset, buf, bytes); | ||
| 1435 | kunmap(page); | ||
| 1436 | } | ||
| 1437 | |||
| 1438 | written += bytes; | ||
| 1439 | count -= bytes; | ||
| 1440 | pos += bytes; | ||
| 1441 | buf += bytes; | ||
| 1442 | if (pos > inode->i_size) | ||
| 1443 | i_size_write(inode, pos); | ||
| 1444 | |||
| 1445 | flush_dcache_page(page); | ||
| 1446 | set_page_dirty(page); | ||
| 1447 | mark_page_accessed(page); | ||
| 1448 | page_cache_release(page); | ||
| 1449 | |||
| 1450 | if (left) { | ||
| 1451 | pos -= left; | ||
| 1452 | written -= left; | ||
| 1453 | err = -EFAULT; | ||
| 1454 | break; | ||
| 1455 | } | ||
| 1456 | |||
| 1457 | /* | ||
| 1458 | * Our dirty pages are not counted in nr_dirty, | ||
| 1459 | * and we do not attempt to balance dirty pages. | ||
| 1460 | */ | ||
| 1461 | |||
| 1462 | cond_resched(); | ||
| 1463 | } while (count); | ||
| 1464 | |||
| 1465 | *ppos = pos; | ||
| 1466 | if (written) | ||
| 1467 | err = written; | ||
| 1468 | out: | ||
| 1469 | up(&inode->i_sem); | ||
| 1470 | return err; | ||
| 1471 | } | ||
| 1472 | |||
| 1473 | static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) | ||
| 1474 | { | ||
| 1475 | struct inode *inode = filp->f_dentry->d_inode; | ||
| 1476 | struct address_space *mapping = inode->i_mapping; | ||
| 1477 | unsigned long index, offset; | ||
| 1478 | |||
| 1479 | index = *ppos >> PAGE_CACHE_SHIFT; | ||
| 1480 | offset = *ppos & ~PAGE_CACHE_MASK; | ||
| 1481 | |||
| 1482 | for (;;) { | ||
| 1483 | struct page *page = NULL; | ||
| 1484 | unsigned long end_index, nr, ret; | ||
| 1485 | loff_t i_size = i_size_read(inode); | ||
| 1486 | |||
| 1487 | end_index = i_size >> PAGE_CACHE_SHIFT; | ||
| 1488 | if (index > end_index) | ||
| 1489 | break; | ||
| 1490 | if (index == end_index) { | ||
| 1491 | nr = i_size & ~PAGE_CACHE_MASK; | ||
| 1492 | if (nr <= offset) | ||
| 1493 | break; | ||
| 1494 | } | ||
| 1495 | |||
| 1496 | desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL); | ||
| 1497 | if (desc->error) { | ||
| 1498 | if (desc->error == -EINVAL) | ||
| 1499 | desc->error = 0; | ||
| 1500 | break; | ||
| 1501 | } | ||
| 1502 | |||
| 1503 | /* | ||
| 1504 | * We must evaluate after, since reads (unlike writes) | ||
| 1505 | * are called without i_sem protection against truncate | ||
| 1506 | */ | ||
| 1507 | nr = PAGE_CACHE_SIZE; | ||
| 1508 | i_size = i_size_read(inode); | ||
| 1509 | end_index = i_size >> PAGE_CACHE_SHIFT; | ||
| 1510 | if (index == end_index) { | ||
| 1511 | nr = i_size & ~PAGE_CACHE_MASK; | ||
| 1512 | if (nr <= offset) { | ||
| 1513 | if (page) | ||
| 1514 | page_cache_release(page); | ||
| 1515 | break; | ||
| 1516 | } | ||
| 1517 | } | ||
| 1518 | nr -= offset; | ||
| 1519 | |||
| 1520 | if (page) { | ||
| 1521 | /* | ||
| 1522 | * If users can be writing to this page using arbitrary | ||
| 1523 | * virtual addresses, take care about potential aliasing | ||
| 1524 | * before reading the page on the kernel side. | ||
| 1525 | */ | ||
| 1526 | if (mapping_writably_mapped(mapping)) | ||
| 1527 | flush_dcache_page(page); | ||
| 1528 | /* | ||
| 1529 | * Mark the page accessed if we read the beginning. | ||
| 1530 | */ | ||
| 1531 | if (!offset) | ||
| 1532 | mark_page_accessed(page); | ||
| 1533 | } else | ||
| 1534 | page = ZERO_PAGE(0); | ||
| 1535 | |||
| 1536 | /* | ||
| 1537 | * Ok, we have the page, and it's up-to-date, so | ||
| 1538 | * now we can copy it to user space... | ||
| 1539 | * | ||
| 1540 | * The actor routine returns how many bytes were actually used.. | ||
| 1541 | * NOTE! This may not be the same as how much of a user buffer | ||
| 1542 | * we filled up (we may be padding etc), so we can only update | ||
| 1543 | * "pos" here (the actor routine has to update the user buffer | ||
| 1544 | * pointers and the remaining count). | ||
| 1545 | */ | ||
| 1546 | ret = actor(desc, page, offset, nr); | ||
| 1547 | offset += ret; | ||
| 1548 | index += offset >> PAGE_CACHE_SHIFT; | ||
| 1549 | offset &= ~PAGE_CACHE_MASK; | ||
| 1550 | |||
| 1551 | page_cache_release(page); | ||
| 1552 | if (ret != nr || !desc->count) | ||
| 1553 | break; | ||
| 1554 | |||
| 1555 | cond_resched(); | ||
| 1556 | } | ||
| 1557 | |||
| 1558 | *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; | ||
| 1559 | file_accessed(filp); | ||
| 1560 | } | ||
| 1561 | |||
| 1562 | static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) | ||
| 1563 | { | ||
| 1564 | read_descriptor_t desc; | ||
| 1565 | |||
| 1566 | if ((ssize_t) count < 0) | ||
| 1567 | return -EINVAL; | ||
| 1568 | if (!access_ok(VERIFY_WRITE, buf, count)) | ||
| 1569 | return -EFAULT; | ||
| 1570 | if (!count) | ||
| 1571 | return 0; | ||
| 1572 | |||
| 1573 | desc.written = 0; | ||
| 1574 | desc.count = count; | ||
| 1575 | desc.arg.buf = buf; | ||
| 1576 | desc.error = 0; | ||
| 1577 | |||
| 1578 | do_shmem_file_read(filp, ppos, &desc, file_read_actor); | ||
| 1579 | if (desc.written) | ||
| 1580 | return desc.written; | ||
| 1581 | return desc.error; | ||
| 1582 | } | ||
| 1583 | |||
| 1584 | static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos, | ||
| 1585 | size_t count, read_actor_t actor, void *target) | ||
| 1586 | { | ||
| 1587 | read_descriptor_t desc; | ||
| 1588 | |||
| 1589 | if (!count) | ||
| 1590 | return 0; | ||
| 1591 | |||
| 1592 | desc.written = 0; | ||
| 1593 | desc.count = count; | ||
| 1594 | desc.arg.data = target; | ||
| 1595 | desc.error = 0; | ||
| 1596 | |||
| 1597 | do_shmem_file_read(in_file, ppos, &desc, actor); | ||
| 1598 | if (desc.written) | ||
| 1599 | return desc.written; | ||
| 1600 | return desc.error; | ||
| 1601 | } | ||
| 1602 | |||
| 1603 | static int shmem_statfs(struct super_block *sb, struct kstatfs *buf) | ||
| 1604 | { | ||
| 1605 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | ||
| 1606 | |||
| 1607 | buf->f_type = TMPFS_MAGIC; | ||
| 1608 | buf->f_bsize = PAGE_CACHE_SIZE; | ||
| 1609 | buf->f_namelen = NAME_MAX; | ||
| 1610 | if (sbinfo) { | ||
| 1611 | spin_lock(&sbinfo->stat_lock); | ||
| 1612 | buf->f_blocks = sbinfo->max_blocks; | ||
| 1613 | buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; | ||
| 1614 | buf->f_files = sbinfo->max_inodes; | ||
| 1615 | buf->f_ffree = sbinfo->free_inodes; | ||
| 1616 | spin_unlock(&sbinfo->stat_lock); | ||
| 1617 | } | ||
| 1618 | /* else leave those fields 0 like simple_statfs */ | ||
| 1619 | return 0; | ||
| 1620 | } | ||
| 1621 | |||
| 1622 | /* | ||
| 1623 | * File creation. Allocate an inode, and we're done.. | ||
| 1624 | */ | ||
| 1625 | static int | ||
| 1626 | shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | ||
| 1627 | { | ||
| 1628 | struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev); | ||
| 1629 | int error = -ENOSPC; | ||
| 1630 | |||
| 1631 | if (inode) { | ||
| 1632 | if (dir->i_mode & S_ISGID) { | ||
| 1633 | inode->i_gid = dir->i_gid; | ||
| 1634 | if (S_ISDIR(mode)) | ||
| 1635 | inode->i_mode |= S_ISGID; | ||
| 1636 | } | ||
| 1637 | dir->i_size += BOGO_DIRENT_SIZE; | ||
| 1638 | dir->i_ctime = dir->i_mtime = CURRENT_TIME; | ||
| 1639 | d_instantiate(dentry, inode); | ||
| 1640 | dget(dentry); /* Extra count - pin the dentry in core */ | ||
| 1641 | error = 0; | ||
| 1642 | } | ||
| 1643 | return error; | ||
| 1644 | } | ||
| 1645 | |||
| 1646 | static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode) | ||
| 1647 | { | ||
| 1648 | int error; | ||
| 1649 | |||
| 1650 | if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) | ||
| 1651 | return error; | ||
| 1652 | dir->i_nlink++; | ||
| 1653 | return 0; | ||
| 1654 | } | ||
| 1655 | |||
| 1656 | static int shmem_create(struct inode *dir, struct dentry *dentry, int mode, | ||
| 1657 | struct nameidata *nd) | ||
| 1658 | { | ||
| 1659 | return shmem_mknod(dir, dentry, mode | S_IFREG, 0); | ||
| 1660 | } | ||
| 1661 | |||
| 1662 | /* | ||
| 1663 | * Link a file.. | ||
| 1664 | */ | ||
| 1665 | static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) | ||
| 1666 | { | ||
| 1667 | struct inode *inode = old_dentry->d_inode; | ||
| 1668 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
| 1669 | |||
| 1670 | /* | ||
| 1671 | * No ordinary (disk based) filesystem counts links as inodes; | ||
| 1672 | * but each new link needs a new dentry, pinning lowmem, and | ||
| 1673 | * tmpfs dentries cannot be pruned until they are unlinked. | ||
| 1674 | */ | ||
| 1675 | if (sbinfo) { | ||
| 1676 | spin_lock(&sbinfo->stat_lock); | ||
| 1677 | if (!sbinfo->free_inodes) { | ||
| 1678 | spin_unlock(&sbinfo->stat_lock); | ||
| 1679 | return -ENOSPC; | ||
| 1680 | } | ||
| 1681 | sbinfo->free_inodes--; | ||
| 1682 | spin_unlock(&sbinfo->stat_lock); | ||
| 1683 | } | ||
| 1684 | |||
| 1685 | dir->i_size += BOGO_DIRENT_SIZE; | ||
| 1686 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; | ||
| 1687 | inode->i_nlink++; | ||
| 1688 | atomic_inc(&inode->i_count); /* New dentry reference */ | ||
| 1689 | dget(dentry); /* Extra pinning count for the created dentry */ | ||
| 1690 | d_instantiate(dentry, inode); | ||
| 1691 | return 0; | ||
| 1692 | } | ||
| 1693 | |||
| 1694 | static int shmem_unlink(struct inode *dir, struct dentry *dentry) | ||
| 1695 | { | ||
| 1696 | struct inode *inode = dentry->d_inode; | ||
| 1697 | |||
| 1698 | if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) { | ||
| 1699 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
| 1700 | if (sbinfo) { | ||
| 1701 | spin_lock(&sbinfo->stat_lock); | ||
| 1702 | sbinfo->free_inodes++; | ||
| 1703 | spin_unlock(&sbinfo->stat_lock); | ||
| 1704 | } | ||
| 1705 | } | ||
| 1706 | |||
| 1707 | dir->i_size -= BOGO_DIRENT_SIZE; | ||
| 1708 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; | ||
| 1709 | inode->i_nlink--; | ||
| 1710 | dput(dentry); /* Undo the count from "create" - this does all the work */ | ||
| 1711 | return 0; | ||
| 1712 | } | ||
| 1713 | |||
| 1714 | static int shmem_rmdir(struct inode *dir, struct dentry *dentry) | ||
| 1715 | { | ||
| 1716 | if (!simple_empty(dentry)) | ||
| 1717 | return -ENOTEMPTY; | ||
| 1718 | |||
| 1719 | dir->i_nlink--; | ||
| 1720 | return shmem_unlink(dir, dentry); | ||
| 1721 | } | ||
| 1722 | |||
| 1723 | /* | ||
| 1724 | * The VFS layer already does all the dentry stuff for rename, | ||
| 1725 | * we just have to decrement the usage count for the target if | ||
| 1726 | * it exists so that the VFS layer correctly free's it when it | ||
| 1727 | * gets overwritten. | ||
| 1728 | */ | ||
| 1729 | static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) | ||
| 1730 | { | ||
| 1731 | struct inode *inode = old_dentry->d_inode; | ||
| 1732 | int they_are_dirs = S_ISDIR(inode->i_mode); | ||
| 1733 | |||
| 1734 | if (!simple_empty(new_dentry)) | ||
| 1735 | return -ENOTEMPTY; | ||
| 1736 | |||
| 1737 | if (new_dentry->d_inode) { | ||
| 1738 | (void) shmem_unlink(new_dir, new_dentry); | ||
| 1739 | if (they_are_dirs) | ||
| 1740 | old_dir->i_nlink--; | ||
| 1741 | } else if (they_are_dirs) { | ||
| 1742 | old_dir->i_nlink--; | ||
| 1743 | new_dir->i_nlink++; | ||
| 1744 | } | ||
| 1745 | |||
| 1746 | old_dir->i_size -= BOGO_DIRENT_SIZE; | ||
| 1747 | new_dir->i_size += BOGO_DIRENT_SIZE; | ||
| 1748 | old_dir->i_ctime = old_dir->i_mtime = | ||
| 1749 | new_dir->i_ctime = new_dir->i_mtime = | ||
| 1750 | inode->i_ctime = CURRENT_TIME; | ||
| 1751 | return 0; | ||
| 1752 | } | ||
| 1753 | |||
| 1754 | static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) | ||
| 1755 | { | ||
| 1756 | int error; | ||
| 1757 | int len; | ||
| 1758 | struct inode *inode; | ||
| 1759 | struct page *page = NULL; | ||
| 1760 | char *kaddr; | ||
| 1761 | struct shmem_inode_info *info; | ||
| 1762 | |||
| 1763 | len = strlen(symname) + 1; | ||
| 1764 | if (len > PAGE_CACHE_SIZE) | ||
| 1765 | return -ENAMETOOLONG; | ||
| 1766 | |||
| 1767 | inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); | ||
| 1768 | if (!inode) | ||
| 1769 | return -ENOSPC; | ||
| 1770 | |||
| 1771 | info = SHMEM_I(inode); | ||
| 1772 | inode->i_size = len-1; | ||
| 1773 | if (len <= (char *)inode - (char *)info) { | ||
| 1774 | /* do it inline */ | ||
| 1775 | memcpy(info, symname, len); | ||
| 1776 | inode->i_op = &shmem_symlink_inline_operations; | ||
| 1777 | } else { | ||
| 1778 | error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); | ||
| 1779 | if (error) { | ||
| 1780 | iput(inode); | ||
| 1781 | return error; | ||
| 1782 | } | ||
| 1783 | inode->i_op = &shmem_symlink_inode_operations; | ||
| 1784 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 1785 | memcpy(kaddr, symname, len); | ||
| 1786 | kunmap_atomic(kaddr, KM_USER0); | ||
| 1787 | set_page_dirty(page); | ||
| 1788 | page_cache_release(page); | ||
| 1789 | } | ||
| 1790 | if (dir->i_mode & S_ISGID) | ||
| 1791 | inode->i_gid = dir->i_gid; | ||
| 1792 | dir->i_size += BOGO_DIRENT_SIZE; | ||
| 1793 | dir->i_ctime = dir->i_mtime = CURRENT_TIME; | ||
| 1794 | d_instantiate(dentry, inode); | ||
| 1795 | dget(dentry); | ||
| 1796 | return 0; | ||
| 1797 | } | ||
| 1798 | |||
| 1799 | static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) | ||
| 1800 | { | ||
| 1801 | nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode)); | ||
| 1802 | return 0; | ||
| 1803 | } | ||
| 1804 | |||
| 1805 | static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd) | ||
| 1806 | { | ||
| 1807 | struct page *page = NULL; | ||
| 1808 | int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); | ||
| 1809 | nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); | ||
| 1810 | return 0; | ||
| 1811 | } | ||
| 1812 | |||
| 1813 | static void shmem_put_link(struct dentry *dentry, struct nameidata *nd) | ||
| 1814 | { | ||
| 1815 | if (!IS_ERR(nd_get_link(nd))) { | ||
| 1816 | struct page *page; | ||
| 1817 | |||
| 1818 | page = find_get_page(dentry->d_inode->i_mapping, 0); | ||
| 1819 | if (!page) | ||
| 1820 | BUG(); | ||
| 1821 | kunmap(page); | ||
| 1822 | mark_page_accessed(page); | ||
| 1823 | page_cache_release(page); | ||
| 1824 | page_cache_release(page); | ||
| 1825 | } | ||
| 1826 | } | ||
| 1827 | |||
| 1828 | static struct inode_operations shmem_symlink_inline_operations = { | ||
| 1829 | .readlink = generic_readlink, | ||
| 1830 | .follow_link = shmem_follow_link_inline, | ||
| 1831 | #ifdef CONFIG_TMPFS_XATTR | ||
| 1832 | .setxattr = generic_setxattr, | ||
| 1833 | .getxattr = generic_getxattr, | ||
| 1834 | .listxattr = generic_listxattr, | ||
| 1835 | .removexattr = generic_removexattr, | ||
| 1836 | #endif | ||
| 1837 | }; | ||
| 1838 | |||
| 1839 | static struct inode_operations shmem_symlink_inode_operations = { | ||
| 1840 | .truncate = shmem_truncate, | ||
| 1841 | .readlink = generic_readlink, | ||
| 1842 | .follow_link = shmem_follow_link, | ||
| 1843 | .put_link = shmem_put_link, | ||
| 1844 | #ifdef CONFIG_TMPFS_XATTR | ||
| 1845 | .setxattr = generic_setxattr, | ||
| 1846 | .getxattr = generic_getxattr, | ||
| 1847 | .listxattr = generic_listxattr, | ||
| 1848 | .removexattr = generic_removexattr, | ||
| 1849 | #endif | ||
| 1850 | }; | ||
| 1851 | |||
| 1852 | static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes) | ||
| 1853 | { | ||
| 1854 | char *this_char, *value, *rest; | ||
| 1855 | |||
| 1856 | while ((this_char = strsep(&options, ",")) != NULL) { | ||
| 1857 | if (!*this_char) | ||
| 1858 | continue; | ||
| 1859 | if ((value = strchr(this_char,'=')) != NULL) { | ||
| 1860 | *value++ = 0; | ||
| 1861 | } else { | ||
| 1862 | printk(KERN_ERR | ||
| 1863 | "tmpfs: No value for mount option '%s'\n", | ||
| 1864 | this_char); | ||
| 1865 | return 1; | ||
| 1866 | } | ||
| 1867 | |||
| 1868 | if (!strcmp(this_char,"size")) { | ||
| 1869 | unsigned long long size; | ||
| 1870 | size = memparse(value,&rest); | ||
| 1871 | if (*rest == '%') { | ||
| 1872 | size <<= PAGE_SHIFT; | ||
| 1873 | size *= totalram_pages; | ||
| 1874 | do_div(size, 100); | ||
| 1875 | rest++; | ||
| 1876 | } | ||
| 1877 | if (*rest) | ||
| 1878 | goto bad_val; | ||
| 1879 | *blocks = size >> PAGE_CACHE_SHIFT; | ||
| 1880 | } else if (!strcmp(this_char,"nr_blocks")) { | ||
| 1881 | *blocks = memparse(value,&rest); | ||
| 1882 | if (*rest) | ||
| 1883 | goto bad_val; | ||
| 1884 | } else if (!strcmp(this_char,"nr_inodes")) { | ||
| 1885 | *inodes = memparse(value,&rest); | ||
| 1886 | if (*rest) | ||
| 1887 | goto bad_val; | ||
| 1888 | } else if (!strcmp(this_char,"mode")) { | ||
| 1889 | if (!mode) | ||
| 1890 | continue; | ||
| 1891 | *mode = simple_strtoul(value,&rest,8); | ||
| 1892 | if (*rest) | ||
| 1893 | goto bad_val; | ||
| 1894 | } else if (!strcmp(this_char,"uid")) { | ||
| 1895 | if (!uid) | ||
| 1896 | continue; | ||
| 1897 | *uid = simple_strtoul(value,&rest,0); | ||
| 1898 | if (*rest) | ||
| 1899 | goto bad_val; | ||
| 1900 | } else if (!strcmp(this_char,"gid")) { | ||
| 1901 | if (!gid) | ||
| 1902 | continue; | ||
| 1903 | *gid = simple_strtoul(value,&rest,0); | ||
| 1904 | if (*rest) | ||
| 1905 | goto bad_val; | ||
| 1906 | } else { | ||
| 1907 | printk(KERN_ERR "tmpfs: Bad mount option %s\n", | ||
| 1908 | this_char); | ||
| 1909 | return 1; | ||
| 1910 | } | ||
| 1911 | } | ||
| 1912 | return 0; | ||
| 1913 | |||
| 1914 | bad_val: | ||
| 1915 | printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", | ||
| 1916 | value, this_char); | ||
| 1917 | return 1; | ||
| 1918 | |||
| 1919 | } | ||
| 1920 | |||
| 1921 | static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | ||
| 1922 | { | ||
| 1923 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | ||
| 1924 | unsigned long max_blocks = 0; | ||
| 1925 | unsigned long max_inodes = 0; | ||
| 1926 | |||
| 1927 | if (sbinfo) { | ||
| 1928 | max_blocks = sbinfo->max_blocks; | ||
| 1929 | max_inodes = sbinfo->max_inodes; | ||
| 1930 | } | ||
| 1931 | if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes)) | ||
| 1932 | return -EINVAL; | ||
| 1933 | /* Keep it simple: disallow limited <-> unlimited remount */ | ||
| 1934 | if ((max_blocks || max_inodes) == !sbinfo) | ||
| 1935 | return -EINVAL; | ||
| 1936 | /* But allow the pointless unlimited -> unlimited remount */ | ||
| 1937 | if (!sbinfo) | ||
| 1938 | return 0; | ||
| 1939 | return shmem_set_size(sbinfo, max_blocks, max_inodes); | ||
| 1940 | } | ||
| 1941 | #endif | ||
| 1942 | |||
| 1943 | static void shmem_put_super(struct super_block *sb) | ||
| 1944 | { | ||
| 1945 | kfree(sb->s_fs_info); | ||
| 1946 | sb->s_fs_info = NULL; | ||
| 1947 | } | ||
| 1948 | |||
/*
 * Without xattr support the handler table degenerates to NULL; with it,
 * the table is defined further down and only declared here.
 */
#ifndef CONFIG_TMPFS_XATTR
#define shmem_xattr_handlers NULL
#else
static struct xattr_handler *shmem_xattr_handlers[];
#endif
| 1954 | |||
| 1955 | static int shmem_fill_super(struct super_block *sb, | ||
| 1956 | void *data, int silent) | ||
| 1957 | { | ||
| 1958 | struct inode *inode; | ||
| 1959 | struct dentry *root; | ||
| 1960 | int mode = S_IRWXUGO | S_ISVTX; | ||
| 1961 | uid_t uid = current->fsuid; | ||
| 1962 | gid_t gid = current->fsgid; | ||
| 1963 | int err = -ENOMEM; | ||
| 1964 | |||
| 1965 | #ifdef CONFIG_TMPFS | ||
| 1966 | unsigned long blocks = 0; | ||
| 1967 | unsigned long inodes = 0; | ||
| 1968 | |||
| 1969 | /* | ||
| 1970 | * Per default we only allow half of the physical ram per | ||
| 1971 | * tmpfs instance, limiting inodes to one per page of lowmem; | ||
| 1972 | * but the internal instance is left unlimited. | ||
| 1973 | */ | ||
| 1974 | if (!(sb->s_flags & MS_NOUSER)) { | ||
| 1975 | blocks = totalram_pages / 2; | ||
| 1976 | inodes = totalram_pages - totalhigh_pages; | ||
| 1977 | if (inodes > blocks) | ||
| 1978 | inodes = blocks; | ||
| 1979 | |||
| 1980 | if (shmem_parse_options(data, &mode, | ||
| 1981 | &uid, &gid, &blocks, &inodes)) | ||
| 1982 | return -EINVAL; | ||
| 1983 | } | ||
| 1984 | |||
| 1985 | if (blocks || inodes) { | ||
| 1986 | struct shmem_sb_info *sbinfo; | ||
| 1987 | sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL); | ||
| 1988 | if (!sbinfo) | ||
| 1989 | return -ENOMEM; | ||
| 1990 | sb->s_fs_info = sbinfo; | ||
| 1991 | spin_lock_init(&sbinfo->stat_lock); | ||
| 1992 | sbinfo->max_blocks = blocks; | ||
| 1993 | sbinfo->free_blocks = blocks; | ||
| 1994 | sbinfo->max_inodes = inodes; | ||
| 1995 | sbinfo->free_inodes = inodes; | ||
| 1996 | } | ||
| 1997 | sb->s_xattr = shmem_xattr_handlers; | ||
| 1998 | #else | ||
| 1999 | sb->s_flags |= MS_NOUSER; | ||
| 2000 | #endif | ||
| 2001 | |||
| 2002 | sb->s_maxbytes = SHMEM_MAX_BYTES; | ||
| 2003 | sb->s_blocksize = PAGE_CACHE_SIZE; | ||
| 2004 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; | ||
| 2005 | sb->s_magic = TMPFS_MAGIC; | ||
| 2006 | sb->s_op = &shmem_ops; | ||
| 2007 | inode = shmem_get_inode(sb, S_IFDIR | mode, 0); | ||
| 2008 | if (!inode) | ||
| 2009 | goto failed; | ||
| 2010 | inode->i_uid = uid; | ||
| 2011 | inode->i_gid = gid; | ||
| 2012 | root = d_alloc_root(inode); | ||
| 2013 | if (!root) | ||
| 2014 | goto failed_iput; | ||
| 2015 | sb->s_root = root; | ||
| 2016 | return 0; | ||
| 2017 | |||
| 2018 | failed_iput: | ||
| 2019 | iput(inode); | ||
| 2020 | failed: | ||
| 2021 | shmem_put_super(sb); | ||
| 2022 | return err; | ||
| 2023 | } | ||
| 2024 | |||
| 2025 | static kmem_cache_t *shmem_inode_cachep; | ||
| 2026 | |||
| 2027 | static struct inode *shmem_alloc_inode(struct super_block *sb) | ||
| 2028 | { | ||
| 2029 | struct shmem_inode_info *p; | ||
| 2030 | p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL); | ||
| 2031 | if (!p) | ||
| 2032 | return NULL; | ||
| 2033 | return &p->vfs_inode; | ||
| 2034 | } | ||
| 2035 | |||
| 2036 | static void shmem_destroy_inode(struct inode *inode) | ||
| 2037 | { | ||
| 2038 | if ((inode->i_mode & S_IFMT) == S_IFREG) { | ||
| 2039 | /* only struct inode is valid if it's an inline symlink */ | ||
| 2040 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); | ||
| 2041 | } | ||
| 2042 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | ||
| 2043 | } | ||
| 2044 | |||
| 2045 | static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) | ||
| 2046 | { | ||
| 2047 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | ||
| 2048 | |||
| 2049 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | ||
| 2050 | SLAB_CTOR_CONSTRUCTOR) { | ||
| 2051 | inode_init_once(&p->vfs_inode); | ||
| 2052 | } | ||
| 2053 | } | ||
| 2054 | |||
| 2055 | static int init_inodecache(void) | ||
| 2056 | { | ||
| 2057 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", | ||
| 2058 | sizeof(struct shmem_inode_info), | ||
| 2059 | 0, 0, init_once, NULL); | ||
| 2060 | if (shmem_inode_cachep == NULL) | ||
| 2061 | return -ENOMEM; | ||
| 2062 | return 0; | ||
| 2063 | } | ||
| 2064 | |||
| 2065 | static void destroy_inodecache(void) | ||
| 2066 | { | ||
| 2067 | if (kmem_cache_destroy(shmem_inode_cachep)) | ||
| 2068 | printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n"); | ||
| 2069 | } | ||
| 2070 | |||
| 2071 | static struct address_space_operations shmem_aops = { | ||
| 2072 | .writepage = shmem_writepage, | ||
| 2073 | .set_page_dirty = __set_page_dirty_nobuffers, | ||
| 2074 | #ifdef CONFIG_TMPFS | ||
| 2075 | .prepare_write = shmem_prepare_write, | ||
| 2076 | .commit_write = simple_commit_write, | ||
| 2077 | #endif | ||
| 2078 | }; | ||
| 2079 | |||
| 2080 | static struct file_operations shmem_file_operations = { | ||
| 2081 | .mmap = shmem_mmap, | ||
| 2082 | #ifdef CONFIG_TMPFS | ||
| 2083 | .llseek = generic_file_llseek, | ||
| 2084 | .read = shmem_file_read, | ||
| 2085 | .write = shmem_file_write, | ||
| 2086 | .fsync = simple_sync_file, | ||
| 2087 | .sendfile = shmem_file_sendfile, | ||
| 2088 | #endif | ||
| 2089 | }; | ||
| 2090 | |||
| 2091 | static struct inode_operations shmem_inode_operations = { | ||
| 2092 | .truncate = shmem_truncate, | ||
| 2093 | .setattr = shmem_notify_change, | ||
| 2094 | #ifdef CONFIG_TMPFS_XATTR | ||
| 2095 | .setxattr = generic_setxattr, | ||
| 2096 | .getxattr = generic_getxattr, | ||
| 2097 | .listxattr = generic_listxattr, | ||
| 2098 | .removexattr = generic_removexattr, | ||
| 2099 | #endif | ||
| 2100 | }; | ||
| 2101 | |||
| 2102 | static struct inode_operations shmem_dir_inode_operations = { | ||
| 2103 | #ifdef CONFIG_TMPFS | ||
| 2104 | .create = shmem_create, | ||
| 2105 | .lookup = simple_lookup, | ||
| 2106 | .link = shmem_link, | ||
| 2107 | .unlink = shmem_unlink, | ||
| 2108 | .symlink = shmem_symlink, | ||
| 2109 | .mkdir = shmem_mkdir, | ||
| 2110 | .rmdir = shmem_rmdir, | ||
| 2111 | .mknod = shmem_mknod, | ||
| 2112 | .rename = shmem_rename, | ||
| 2113 | #ifdef CONFIG_TMPFS_XATTR | ||
| 2114 | .setxattr = generic_setxattr, | ||
| 2115 | .getxattr = generic_getxattr, | ||
| 2116 | .listxattr = generic_listxattr, | ||
| 2117 | .removexattr = generic_removexattr, | ||
| 2118 | #endif | ||
| 2119 | #endif | ||
| 2120 | }; | ||
| 2121 | |||
| 2122 | static struct inode_operations shmem_special_inode_operations = { | ||
| 2123 | #ifdef CONFIG_TMPFS_XATTR | ||
| 2124 | .setxattr = generic_setxattr, | ||
| 2125 | .getxattr = generic_getxattr, | ||
| 2126 | .listxattr = generic_listxattr, | ||
| 2127 | .removexattr = generic_removexattr, | ||
| 2128 | #endif | ||
| 2129 | }; | ||
| 2130 | |||
| 2131 | static struct super_operations shmem_ops = { | ||
| 2132 | .alloc_inode = shmem_alloc_inode, | ||
| 2133 | .destroy_inode = shmem_destroy_inode, | ||
| 2134 | #ifdef CONFIG_TMPFS | ||
| 2135 | .statfs = shmem_statfs, | ||
| 2136 | .remount_fs = shmem_remount_fs, | ||
| 2137 | #endif | ||
| 2138 | .delete_inode = shmem_delete_inode, | ||
| 2139 | .drop_inode = generic_delete_inode, | ||
| 2140 | .put_super = shmem_put_super, | ||
| 2141 | }; | ||
| 2142 | |||
| 2143 | static struct vm_operations_struct shmem_vm_ops = { | ||
| 2144 | .nopage = shmem_nopage, | ||
| 2145 | .populate = shmem_populate, | ||
| 2146 | #ifdef CONFIG_NUMA | ||
| 2147 | .set_policy = shmem_set_policy, | ||
| 2148 | .get_policy = shmem_get_policy, | ||
| 2149 | #endif | ||
| 2150 | }; | ||
| 2151 | |||
| 2152 | |||
| 2153 | #ifdef CONFIG_TMPFS_SECURITY | ||
| 2154 | |||
| 2155 | static size_t shmem_xattr_security_list(struct inode *inode, char *list, size_t list_len, | ||
| 2156 | const char *name, size_t name_len) | ||
| 2157 | { | ||
| 2158 | return security_inode_listsecurity(inode, list, list_len); | ||
| 2159 | } | ||
| 2160 | |||
| 2161 | static int shmem_xattr_security_get(struct inode *inode, const char *name, void *buffer, size_t size) | ||
| 2162 | { | ||
| 2163 | if (strcmp(name, "") == 0) | ||
| 2164 | return -EINVAL; | ||
| 2165 | return security_inode_getsecurity(inode, name, buffer, size); | ||
| 2166 | } | ||
| 2167 | |||
| 2168 | static int shmem_xattr_security_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) | ||
| 2169 | { | ||
| 2170 | if (strcmp(name, "") == 0) | ||
| 2171 | return -EINVAL; | ||
| 2172 | return security_inode_setsecurity(inode, name, value, size, flags); | ||
| 2173 | } | ||
| 2174 | |||
| 2175 | static struct xattr_handler shmem_xattr_security_handler = { | ||
| 2176 | .prefix = XATTR_SECURITY_PREFIX, | ||
| 2177 | .list = shmem_xattr_security_list, | ||
| 2178 | .get = shmem_xattr_security_get, | ||
| 2179 | .set = shmem_xattr_security_set, | ||
| 2180 | }; | ||
| 2181 | |||
| 2182 | #endif /* CONFIG_TMPFS_SECURITY */ | ||
| 2183 | |||
| 2184 | #ifdef CONFIG_TMPFS_XATTR | ||
| 2185 | |||
| 2186 | static struct xattr_handler *shmem_xattr_handlers[] = { | ||
| 2187 | #ifdef CONFIG_TMPFS_SECURITY | ||
| 2188 | &shmem_xattr_security_handler, | ||
| 2189 | #endif | ||
| 2190 | NULL | ||
| 2191 | }; | ||
| 2192 | |||
| 2193 | #endif /* CONFIG_TMPFS_XATTR */ | ||
| 2194 | |||
| 2195 | static struct super_block *shmem_get_sb(struct file_system_type *fs_type, | ||
| 2196 | int flags, const char *dev_name, void *data) | ||
| 2197 | { | ||
| 2198 | return get_sb_nodev(fs_type, flags, data, shmem_fill_super); | ||
| 2199 | } | ||
| 2200 | |||
| 2201 | static struct file_system_type tmpfs_fs_type = { | ||
| 2202 | .owner = THIS_MODULE, | ||
| 2203 | .name = "tmpfs", | ||
| 2204 | .get_sb = shmem_get_sb, | ||
| 2205 | .kill_sb = kill_litter_super, | ||
| 2206 | }; | ||
| 2207 | static struct vfsmount *shm_mnt; | ||
| 2208 | |||
| 2209 | static int __init init_tmpfs(void) | ||
| 2210 | { | ||
| 2211 | int error; | ||
| 2212 | |||
| 2213 | error = init_inodecache(); | ||
| 2214 | if (error) | ||
| 2215 | goto out3; | ||
| 2216 | |||
| 2217 | error = register_filesystem(&tmpfs_fs_type); | ||
| 2218 | if (error) { | ||
| 2219 | printk(KERN_ERR "Could not register tmpfs\n"); | ||
| 2220 | goto out2; | ||
| 2221 | } | ||
| 2222 | #ifdef CONFIG_TMPFS | ||
| 2223 | devfs_mk_dir("shm"); | ||
| 2224 | #endif | ||
| 2225 | shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER, | ||
| 2226 | tmpfs_fs_type.name, NULL); | ||
| 2227 | if (IS_ERR(shm_mnt)) { | ||
| 2228 | error = PTR_ERR(shm_mnt); | ||
| 2229 | printk(KERN_ERR "Could not kern_mount tmpfs\n"); | ||
| 2230 | goto out1; | ||
| 2231 | } | ||
| 2232 | return 0; | ||
| 2233 | |||
| 2234 | out1: | ||
| 2235 | unregister_filesystem(&tmpfs_fs_type); | ||
| 2236 | out2: | ||
| 2237 | destroy_inodecache(); | ||
| 2238 | out3: | ||
| 2239 | shm_mnt = ERR_PTR(error); | ||
| 2240 | return error; | ||
| 2241 | } | ||
| 2242 | module_init(init_tmpfs) | ||
| 2243 | |||
| 2244 | /* | ||
| 2245 | * shmem_file_setup - get an unlinked file living in tmpfs | ||
| 2246 | * | ||
| 2247 | * @name: name for dentry (to be seen in /proc/<pid>/maps | ||
| 2248 | * @size: size to be set for the file | ||
| 2249 | * | ||
| 2250 | */ | ||
| 2251 | struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | ||
| 2252 | { | ||
| 2253 | int error; | ||
| 2254 | struct file *file; | ||
| 2255 | struct inode *inode; | ||
| 2256 | struct dentry *dentry, *root; | ||
| 2257 | struct qstr this; | ||
| 2258 | |||
| 2259 | if (IS_ERR(shm_mnt)) | ||
| 2260 | return (void *)shm_mnt; | ||
| 2261 | |||
| 2262 | if (size < 0 || size > SHMEM_MAX_BYTES) | ||
| 2263 | return ERR_PTR(-EINVAL); | ||
| 2264 | |||
| 2265 | if (shmem_acct_size(flags, size)) | ||
| 2266 | return ERR_PTR(-ENOMEM); | ||
| 2267 | |||
| 2268 | error = -ENOMEM; | ||
| 2269 | this.name = name; | ||
| 2270 | this.len = strlen(name); | ||
| 2271 | this.hash = 0; /* will go */ | ||
| 2272 | root = shm_mnt->mnt_root; | ||
| 2273 | dentry = d_alloc(root, &this); | ||
| 2274 | if (!dentry) | ||
| 2275 | goto put_memory; | ||
| 2276 | |||
| 2277 | error = -ENFILE; | ||
| 2278 | file = get_empty_filp(); | ||
| 2279 | if (!file) | ||
| 2280 | goto put_dentry; | ||
| 2281 | |||
| 2282 | error = -ENOSPC; | ||
| 2283 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); | ||
| 2284 | if (!inode) | ||
| 2285 | goto close_file; | ||
| 2286 | |||
| 2287 | SHMEM_I(inode)->flags = flags & VM_ACCOUNT; | ||
| 2288 | d_instantiate(dentry, inode); | ||
| 2289 | inode->i_size = size; | ||
| 2290 | inode->i_nlink = 0; /* It is unlinked */ | ||
| 2291 | file->f_vfsmnt = mntget(shm_mnt); | ||
| 2292 | file->f_dentry = dentry; | ||
| 2293 | file->f_mapping = inode->i_mapping; | ||
| 2294 | file->f_op = &shmem_file_operations; | ||
| 2295 | file->f_mode = FMODE_WRITE | FMODE_READ; | ||
| 2296 | return file; | ||
| 2297 | |||
| 2298 | close_file: | ||
| 2299 | put_filp(file); | ||
| 2300 | put_dentry: | ||
| 2301 | dput(dentry); | ||
| 2302 | put_memory: | ||
| 2303 | shmem_unacct_size(flags, size); | ||
| 2304 | return ERR_PTR(error); | ||
| 2305 | } | ||
| 2306 | |||
| 2307 | /* | ||
| 2308 | * shmem_zero_setup - setup a shared anonymous mapping | ||
| 2309 | * | ||
| 2310 | * @vma: the vma to be mmapped is prepared by do_mmap_pgoff | ||
| 2311 | */ | ||
| 2312 | int shmem_zero_setup(struct vm_area_struct *vma) | ||
| 2313 | { | ||
| 2314 | struct file *file; | ||
| 2315 | loff_t size = vma->vm_end - vma->vm_start; | ||
| 2316 | |||
| 2317 | file = shmem_file_setup("dev/zero", size, vma->vm_flags); | ||
| 2318 | if (IS_ERR(file)) | ||
| 2319 | return PTR_ERR(file); | ||
| 2320 | |||
| 2321 | if (vma->vm_file) | ||
| 2322 | fput(vma->vm_file); | ||
| 2323 | vma->vm_file = file; | ||
| 2324 | vma->vm_ops = &shmem_vm_ops; | ||
| 2325 | return 0; | ||
| 2326 | } | ||
diff --git a/mm/slab.c b/mm/slab.c new file mode 100644 index 000000000000..ec660d85ddd7 --- /dev/null +++ b/mm/slab.c | |||
| @@ -0,0 +1,3060 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/slab.c | ||
| 3 | * Written by Mark Hemment, 1996/97. | ||
| 4 | * (markhe@nextd.demon.co.uk) | ||
| 5 | * | ||
| 6 | * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli | ||
| 7 | * | ||
| 8 | * Major cleanup, different bufctl logic, per-cpu arrays | ||
| 9 | * (c) 2000 Manfred Spraul | ||
| 10 | * | ||
| 11 | * Cleanup, make the head arrays unconditional, preparation for NUMA | ||
| 12 | * (c) 2002 Manfred Spraul | ||
| 13 | * | ||
| 14 | * An implementation of the Slab Allocator as described in outline in; | ||
| 15 | * UNIX Internals: The New Frontiers by Uresh Vahalia | ||
| 16 | * Pub: Prentice Hall ISBN 0-13-101908-2 | ||
| 17 | * or with a little more detail in; | ||
| 18 | * The Slab Allocator: An Object-Caching Kernel Memory Allocator | ||
| 19 | * Jeff Bonwick (Sun Microsystems). | ||
| 20 | * Presented at: USENIX Summer 1994 Technical Conference | ||
| 21 | * | ||
| 22 | * The memory is organized in caches, one cache for each object type. | ||
| 23 | * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) | ||
| 24 | * Each cache consists of many slabs (they are small (usually one | ||
| 25 | * page long) and always contiguous), and each slab contains multiple | ||
| 26 | * initialized objects. | ||
| 27 | * | ||
| 28 | * This means, that your constructor is used only for newly allocated | ||
| 29 | * slabs and you must pass objects with the same initializations to | ||
| 30 | * kmem_cache_free. | ||
| 31 | * | ||
| 32 | * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, | ||
| 33 | * normal). If you need a special memory type, then you must create a new | ||
| 34 | * cache for that memory type. | ||
| 35 | * | ||
| 36 | * In order to reduce fragmentation, the slabs are sorted in 3 groups: | ||
| 37 | * full slabs with 0 free objects | ||
| 38 | * partial slabs | ||
| 39 | * empty slabs with no allocated objects | ||
| 40 | * | ||
| 41 | * If partial slabs exist, then new allocations come from these slabs, | ||
| 42 | * otherwise from empty slabs or new slabs are allocated. | ||
| 43 | * | ||
| 44 | * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache | ||
| 45 | * during kmem_cache_destroy(). The caller must prevent concurrent allocs. | ||
| 46 | * | ||
| 47 | * Each cache has a short per-cpu head array, most allocs | ||
| 48 | * and frees go into that array, and if that array overflows, then 1/2 | ||
| 49 | * of the entries in the array are given back into the global cache. | ||
| 50 | * The head array is strictly LIFO and should improve the cache hit rates. | ||
| 51 | * On SMP, it additionally reduces the spinlock operations. | ||
| 52 | * | ||
| 53 | * The c_cpuarray may not be read with enabled local interrupts - | ||
| 54 | * it's changed with a smp_call_function(). | ||
| 55 | * | ||
| 56 | * SMP synchronization: | ||
| 57 | * constructors and destructors are called without any locking. | ||
| 58 | * Several members in kmem_cache_t and struct slab never change, they | ||
| 59 | * are accessed without any locking. | ||
| 60 | * The per-cpu arrays are never accessed from the wrong cpu, no locking, | ||
| 61 | * and local interrupts are disabled so slab code is preempt-safe. | ||
| 62 | * The non-constant members are protected with a per-cache irq spinlock. | ||
| 63 | * | ||
| 64 | * Many thanks to Mark Hemment, who wrote another per-cpu slab patch | ||
| 65 | * in 2000 - many ideas in the current implementation are derived from | ||
| 66 | * his patch. | ||
| 67 | * | ||
| 68 | * Further notes from the original documentation: | ||
| 69 | * | ||
| 70 | * 11 April '97. Started multi-threading - markhe | ||
| 71 | * The global cache-chain is protected by the semaphore 'cache_chain_sem'. | ||
| 72 | * The sem is only needed when accessing/extending the cache-chain, which | ||
| 73 | * can never happen inside an interrupt (kmem_cache_create(), | ||
| 74 | * kmem_cache_shrink() and kmem_cache_reap()). | ||
| 75 | * | ||
| 76 | * At present, each engine can be growing a cache. This should be blocked. | ||
| 77 | * | ||
| 78 | */ | ||
| 79 | |||
| 80 | #include <linux/config.h> | ||
| 81 | #include <linux/slab.h> | ||
| 82 | #include <linux/mm.h> | ||
| 83 | #include <linux/swap.h> | ||
| 84 | #include <linux/cache.h> | ||
| 85 | #include <linux/interrupt.h> | ||
| 86 | #include <linux/init.h> | ||
| 87 | #include <linux/compiler.h> | ||
| 88 | #include <linux/seq_file.h> | ||
| 89 | #include <linux/notifier.h> | ||
| 90 | #include <linux/kallsyms.h> | ||
| 91 | #include <linux/cpu.h> | ||
| 92 | #include <linux/sysctl.h> | ||
| 93 | #include <linux/module.h> | ||
| 94 | #include <linux/rcupdate.h> | ||
| 95 | |||
| 96 | #include <asm/uaccess.h> | ||
| 97 | #include <asm/cacheflush.h> | ||
| 98 | #include <asm/tlbflush.h> | ||
| 99 | #include <asm/page.h> | ||
| 100 | |||
| 101 | /* | ||
| 102 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, | ||
| 103 | * SLAB_RED_ZONE & SLAB_POISON. | ||
| 104 | * 0 for faster, smaller code (especially in the critical paths). | ||
| 105 | * | ||
| 106 | * STATS - 1 to collect stats for /proc/slabinfo. | ||
| 107 | * 0 for faster, smaller code (especially in the critical paths). | ||
| 108 | * | ||
| 109 | * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) | ||
| 110 | */ | ||
| 111 | |||
| 112 | #ifdef CONFIG_DEBUG_SLAB | ||
| 113 | #define DEBUG 1 | ||
| 114 | #define STATS 1 | ||
| 115 | #define FORCED_DEBUG 1 | ||
| 116 | #else | ||
| 117 | #define DEBUG 0 | ||
| 118 | #define STATS 0 | ||
| 119 | #define FORCED_DEBUG 0 | ||
| 120 | #endif | ||
| 121 | |||
| 122 | |||
| 123 | /* Shouldn't this be in a header file somewhere? */ | ||
| 124 | #define BYTES_PER_WORD sizeof(void *) | ||
| 125 | |||
| 126 | #ifndef cache_line_size | ||
| 127 | #define cache_line_size() L1_CACHE_BYTES | ||
| 128 | #endif | ||
| 129 | |||
| 130 | #ifndef ARCH_KMALLOC_MINALIGN | ||
| 131 | /* | ||
| 132 | * Enforce a minimum alignment for the kmalloc caches. | ||
| 133 | * Usually, the kmalloc caches are cache_line_size() aligned, except when | ||
| 134 | * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. | ||
| 135 | * Some archs want to perform DMA into kmalloc caches and need a guaranteed | ||
| 136 | * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that. | ||
| 137 | * Note that this flag disables some debug features. | ||
| 138 | */ | ||
| 139 | #define ARCH_KMALLOC_MINALIGN 0 | ||
| 140 | #endif | ||
| 141 | |||
| 142 | #ifndef ARCH_SLAB_MINALIGN | ||
| 143 | /* | ||
| 144 | * Enforce a minimum alignment for all caches. | ||
| 145 | * Intended for archs that get misalignment faults even for BYTES_PER_WORD | ||
| 146 | * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. | ||
| 147 | * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables | ||
| 148 | * some debug features. | ||
| 149 | */ | ||
| 150 | #define ARCH_SLAB_MINALIGN 0 | ||
| 151 | #endif | ||
| 152 | |||
| 153 | #ifndef ARCH_KMALLOC_FLAGS | ||
| 154 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN | ||
| 155 | #endif | ||
| 156 | |||
| 157 | /* Legal flag mask for kmem_cache_create(). */ | ||
| 158 | #if DEBUG | ||
| 159 | # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ | ||
| 160 | SLAB_POISON | SLAB_HWCACHE_ALIGN | \ | ||
| 161 | SLAB_NO_REAP | SLAB_CACHE_DMA | \ | ||
| 162 | SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ | ||
| 163 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | ||
| 164 | SLAB_DESTROY_BY_RCU) | ||
| 165 | #else | ||
| 166 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ | ||
| 167 | SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ | ||
| 168 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | ||
| 169 | SLAB_DESTROY_BY_RCU) | ||
| 170 | #endif | ||
| 171 | |||
| 172 | /* | ||
| 173 | * kmem_bufctl_t: | ||
| 174 | * | ||
| 175 | * Bufctl's are used for linking objs within a slab | ||
| 176 | * linked offsets. | ||
| 177 | * | ||
| 178 | * This implementation relies on "struct page" for locating the cache & | ||
| 179 | * slab an object belongs to. | ||
| 180 | * This allows the bufctl structure to be small (one int), but limits | ||
| 181 | * the number of objects a slab (not a cache) can contain when off-slab | ||
| 182 | * bufctls are used. The limit is the size of the largest general cache | ||
| 183 | * that does not use off-slab slabs. | ||
| 184 | * For 32bit archs with 4 kB pages, this is 56. | ||
| 185 | * This is not serious, as it is only for large objects, when it is unwise | ||
| 186 | * to have too many per slab. | ||
| 187 | * Note: This limit can be raised by introducing a general cache whose size | ||
| 188 | * is less than 512 (PAGE_SIZE<<3), but greater than 256. | ||
| 189 | */ | ||
| 190 | |||
| 191 | #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) | ||
| 192 | #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) | ||
| 193 | #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) | ||
| 194 | |||
| 195 | /* Max number of objs-per-slab for caches which use off-slab slabs. | ||
| 196 | * Needed to avoid a possible looping condition in cache_grow(). | ||
| 197 | */ | ||
| 198 | static unsigned long offslab_limit; | ||
| 199 | |||
| 200 | /* | ||
| 201 | * struct slab | ||
| 202 | * | ||
| 203 | * Manages the objs in a slab. Placed either at the beginning of mem allocated | ||
| 204 | * for a slab, or allocated from a general cache. | ||
| 205 | * Slabs are chained into three lists: fully used, partial, fully free slabs. | ||
| 206 | */ | ||
| 207 | struct slab { /* per-slab management record; struct slab_rcu is assumed to overlay it, so keep the layout */ | ||
| 208 | struct list_head list; /* presumably the link into the cache's full/partial/free slab list - confirm */ | ||
| 209 | unsigned long colouroff; /* NOTE(review): looks like this slab's colour offset - confirm against cache_grow() */ | ||
| 210 | void *s_mem; /* including colour offset */ | ||
| 211 | unsigned int inuse; /* num of objs active in slab */ | ||
| 212 | kmem_bufctl_t free; /* presumably the bufctl index of the first free object - TODO confirm */ | ||
| 213 | }; | ||
| 214 | |||
| 215 | /* | ||
| 216 | * struct slab_rcu | ||
| 217 | * | ||
| 218 | * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to | ||
| 219 | * arrange for kmem_freepages to be called via RCU. This is useful if | ||
| 220 | * we need to approach a kernel structure obliquely, from its address | ||
| 221 | * obtained without the usual locking. We can lock the structure to | ||
| 222 | * stabilize it and check it's still at the given address, only if we | ||
| 223 | * can be sure that the memory has not been meanwhile reused for some | ||
| 224 | * other kind of object (which our subsystem's lock might corrupt). | ||
| 225 | * | ||
| 226 | * rcu_read_lock before reading the address, then rcu_read_unlock after | ||
| 227 | * taking the spinlock within the structure expected at that address. | ||
| 228 | * | ||
| 229 | * We assume struct slab_rcu can overlay struct slab when destroying. | ||
| 230 | */ | ||
| 231 | struct slab_rcu { /* used by slab_destroy on SLAB_DESTROY_BY_RCU caches; overlays struct slab */ | ||
| 232 | struct rcu_head head; /* RCU callback head used to defer the page free */ | ||
| 233 | kmem_cache_t *cachep; /* presumably the cache the slab belonged to - confirm */ | ||
| 234 | void *addr; /* presumably the address handed to kmem_freepages - confirm */ | ||
| 235 | }; | ||
| 236 | |||
| 237 | /* | ||
| 238 | * struct array_cache | ||
| 239 | * | ||
| 240 | * Per cpu structures | ||
| 241 | * Purpose: | ||
| 242 | * - LIFO ordering, to hand out cache-warm objects from _alloc | ||
| 243 | * - reduce the number of linked list operations | ||
| 244 | * - reduce spinlock operations | ||
| 245 | * | ||
| 246 | * The limit is stored in the per-cpu structure to reduce the data cache | ||
| 247 | * footprint. | ||
| 248 | * | ||
| 249 | */ | ||
| 250 | struct array_cache { /* per-cpu LIFO head array header */ | ||
| 251 | unsigned int avail; /* presumably the number of object pointers currently held - confirm */ | ||
| 252 | unsigned int limit; /* kept per-cpu to reduce the data cache footprint (see comment above) */ | ||
| 253 | unsigned int batchcount; /* NOTE(review): looks like the transfer batch size to/from the global lists - confirm */ | ||
| 254 | unsigned int touched; /* NOTE(review): looks like a recent-use flag - confirm against its users */ | ||
| 255 | }; | ||
| 256 | |||
| 257 | /* bootstrap: The caches do not work without cpuarrays anymore, | ||
| 258 | * but the cpuarrays are allocated from the generic caches... | ||
| 259 | */ | ||
| 260 | #define BOOT_CPUCACHE_ENTRIES 1 | ||
| 261 | struct arraycache_init { /* bootstrap head array: header plus a fixed BOOT_CPUCACHE_ENTRIES-slot array */ | ||
| 262 | struct array_cache cache; /* header; placed first, directly before the entry slots */ | ||
| 263 | void * entries[BOOT_CPUCACHE_ENTRIES]; /* object pointer slots */ | ||
| 264 | }; | ||
| 265 | |||
| 266 | /* | ||
| 267 | * The slab lists of all objects. | ||
| 268 | * Hopefully reduce the internal fragmentation | ||
| 269 | * NUMA: The spinlock could be moved from the kmem_cache_t | ||
| 270 | * into this structure, too. Figure out what causes | ||
| 271 | * fewer cross-node spinlock operations. | ||
| 272 | */ | ||
| 273 | struct kmem_list3 { | ||
| 274 | struct list_head slabs_partial; /* partial list first, better asm code */ | ||
| 275 | struct list_head slabs_full; | ||
| 276 | struct list_head slabs_free; | ||
| 277 | unsigned long free_objects; | ||
| 278 | int free_touched; | ||
| 279 | unsigned long next_reap; | ||
| 280 | struct array_cache *shared; | ||
| 281 | }; | ||
| 282 | |||
| 283 | #define LIST3_INIT(parent) \ | ||
| 284 | { \ | ||
| 285 | .slabs_full = LIST_HEAD_INIT(parent.slabs_full), \ | ||
| 286 | .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \ | ||
| 287 | .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \ | ||
| 288 | } | ||
| 289 | #define list3_data(cachep) \ | ||
| 290 | (&(cachep)->lists) | ||
| 291 | |||
| 292 | /* NUMA: per-node */ | ||
| 293 | #define list3_data_ptr(cachep, ptr) \ | ||
| 294 | list3_data(cachep) | ||
| 295 | |||
| 296 | /* | ||
| 297 | * kmem_cache_t | ||
| 298 | * | ||
| 299 | * manages a cache. | ||
| 300 | */ | ||
| 301 | |||
| 302 | struct kmem_cache_s { | ||
| 303 | /* 1) per-cpu data, touched during every alloc/free */ | ||
| 304 | struct array_cache *array[NR_CPUS]; | ||
| 305 | unsigned int batchcount; | ||
| 306 | unsigned int limit; | ||
| 307 | /* 2) touched by every alloc & free from the backend */ | ||
| 308 | struct kmem_list3 lists; | ||
| 309 | /* NUMA: kmem_3list_t *nodelists[MAX_NUMNODES] */ | ||
| 310 | unsigned int objsize; | ||
| 311 | unsigned int flags; /* constant flags */ | ||
| 312 | unsigned int num; /* # of objs per slab */ | ||
| 313 | unsigned int free_limit; /* upper limit of objects in the lists */ | ||
| 314 | spinlock_t spinlock; | ||
| 315 | |||
| 316 | /* 3) cache_grow/shrink */ | ||
| 317 | /* order of pgs per slab (2^n) */ | ||
| 318 | unsigned int gfporder; | ||
| 319 | |||
| 320 | /* force GFP flags, e.g. GFP_DMA */ | ||
| 321 | unsigned int gfpflags; | ||
| 322 | |||
| 323 | size_t colour; /* cache colouring range */ | ||
| 324 | unsigned int colour_off; /* colour offset */ | ||
| 325 | unsigned int colour_next; /* cache colouring */ | ||
| 326 | kmem_cache_t *slabp_cache; | ||
| 327 | unsigned int slab_size; | ||
| 328 | unsigned int dflags; /* dynamic flags */ | ||
| 329 | |||
| 330 | /* constructor func */ | ||
| 331 | void (*ctor)(void *, kmem_cache_t *, unsigned long); | ||
| 332 | |||
| 333 | /* de-constructor func */ | ||
| 334 | void (*dtor)(void *, kmem_cache_t *, unsigned long); | ||
| 335 | |||
| 336 | /* 4) cache creation/removal */ | ||
| 337 | const char *name; | ||
| 338 | struct list_head next; | ||
| 339 | |||
| 340 | /* 5) statistics */ | ||
| 341 | #if STATS | ||
| 342 | unsigned long num_active; | ||
| 343 | unsigned long num_allocations; | ||
| 344 | unsigned long high_mark; | ||
| 345 | unsigned long grown; | ||
| 346 | unsigned long reaped; | ||
| 347 | unsigned long errors; | ||
| 348 | unsigned long max_freeable; | ||
| 349 | unsigned long node_allocs; | ||
| 350 | atomic_t allochit; | ||
| 351 | atomic_t allocmiss; | ||
| 352 | atomic_t freehit; | ||
| 353 | atomic_t freemiss; | ||
| 354 | #endif | ||
| 355 | #if DEBUG | ||
| 356 | int dbghead; | ||
| 357 | int reallen; | ||
| 358 | #endif | ||
| 359 | }; | ||
| 360 | |||
| 361 | #define CFLGS_OFF_SLAB (0x80000000UL) | ||
| 362 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) | ||
| 363 | |||
| 364 | #define BATCHREFILL_LIMIT 16 | ||
| 365 | /* Optimization question: fewer reaps means less | ||
| 366 | * probability for unnecessary cpucache drain/refill cycles. | ||
| 367 | * | ||
| 368 | * OTOH the cpuarrays can contain lots of objects, | ||
| 369 | * which could lock up otherwise freeable slabs. | ||
| 370 | */ | ||
| 371 | #define REAPTIMEOUT_CPUC (2*HZ) | ||
| 372 | #define REAPTIMEOUT_LIST3 (4*HZ) | ||
| 373 | |||
/*
 * Statistics hooks.
 *
 * With STATS enabled each hook updates the matching counter in the
 * kmem_cache_t it is given; otherwise every hook expands to an empty
 * statement so that call sites need no #if of their own.
 */
#if STATS
#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
#define	STATS_INC_GROWN(x)	((x)->grown++)
#define	STATS_INC_REAPED(x)	((x)->reaped++)
/* Raise high_mark to num_active whenever num_active has exceeded it. */
#define	STATS_SET_HIGH(x)	do { if ((x)->num_active > (x)->high_mark) \
					(x)->high_mark = (x)->num_active; \
				} while (0)
#define	STATS_INC_ERR(x)	((x)->errors++)
#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
/*
 * Raise max_freeable to i when i is larger.  The argument is parenthesised
 * so that an arbitrary expression (not just a plain variable) is compared
 * and stored as a single value.
 */
#define	STATS_SET_FREEABLE(x, i) \
				do { if ((x)->max_freeable < (i)) \
					(x)->max_freeable = (i); \
				} while (0)

#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
#else
#define	STATS_INC_ACTIVE(x)	do { } while (0)
#define	STATS_DEC_ACTIVE(x)	do { } while (0)
#define	STATS_INC_ALLOCED(x)	do { } while (0)
#define	STATS_INC_GROWN(x)	do { } while (0)
#define	STATS_INC_REAPED(x)	do { } while (0)
#define	STATS_SET_HIGH(x)	do { } while (0)
#define	STATS_INC_ERR(x)	do { } while (0)
#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
#define	STATS_SET_FREEABLE(x, i) \
				do { } while (0)

#define STATS_INC_ALLOCHIT(x)	do { } while (0)
#define STATS_INC_ALLOCMISS(x)	do { } while (0)
#define STATS_INC_FREEHIT(x)	do { } while (0)
#define STATS_INC_FREEMISS(x)	do { } while (0)
#endif
| 411 | |||
#if DEBUG
/*
 * Magic numbers for object red zoning.
 * Placed in the first word before and the first word after an object.
 */
#define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
#define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */

/* ...and for poisoning */
#define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
#define POISON_FREE	0x6b	/* for use-after-free poisoning */
#define	POISON_END	0xa5	/* end-byte of poisoning */

/* memory layout of objects:
 * 0		: objp
 * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that
 * 		the end of an object is aligned with the end of the real
 * 		allocation. Catches writes behind the end of the allocation.
 * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1:
 * 		redzone word.
 * cachep->dbghead: The real object.
 * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
 */

/* Offset from the start of the slot (objp) to the real object. */
static int obj_dbghead(kmem_cache_t *cachep)
{
	return cachep->dbghead;
}

/* Length of the real object, as recorded in cachep->reallen. */
static int obj_reallen(kmem_cache_t *cachep)
{
	return cachep->reallen;
}

/*
 * Address of the redzone word directly in front of the real object.
 * Only valid for caches created with SLAB_RED_ZONE.
 */
static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD);
}

/*
 * Address of the trailing redzone word.  It is the last word of the slot,
 * or the second to last one when SLAB_STORE_USER claims the last word.
 * Only valid for caches created with SLAB_RED_ZONE.
 */
static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	if (cachep->flags & SLAB_STORE_USER)
		return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD);
	return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD);
}

/*
 * Address of the "last caller" word, i.e. the last word of the slot.
 * Only valid for caches created with SLAB_STORE_USER.
 */
static void **dbg_userword(kmem_cache_t *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
	return (void**)(objp+cachep->objsize-BYTES_PER_WORD);
}

#else

/*
 * Without DEBUG the slot is the object: no head offset, the real length is
 * the object size, and asking for a debug word is a bug.
 */
#define obj_dbghead(x)			0
/* The parameter is parenthesised so that any pointer expression may be passed. */
#define obj_reallen(cachep)		((cachep)->objsize)
#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long *)NULL;})
#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long *)NULL;})
#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})

#endif
| 474 | |||
| 475 | /* | ||
| 476 | * Maximum size of an obj (in 2^order pages) | ||
| 477 | * and absolute limit for the gfp order. | ||
| 478 | */ | ||
| 479 | #if defined(CONFIG_LARGE_ALLOCS) | ||
| 480 | #define MAX_OBJ_ORDER 13 /* up to 32Mb */ | ||
| 481 | #define MAX_GFP_ORDER 13 /* up to 32Mb */ | ||
| 482 | #elif defined(CONFIG_MMU) | ||
| 483 | #define MAX_OBJ_ORDER 5 /* 32 pages */ | ||
| 484 | #define MAX_GFP_ORDER 5 /* 32 pages */ | ||
| 485 | #else | ||
| 486 | #define MAX_OBJ_ORDER 8 /* up to 1Mb */ | ||
| 487 | #define MAX_GFP_ORDER 8 /* up to 1Mb */ | ||
| 488 | #endif | ||
| 489 | |||
| 490 | /* | ||
| 491 | * Do not go above this order unless 0 objects fit into the slab. | ||
| 492 | */ | ||
| 493 | #define BREAK_GFP_ORDER_HI 1 | ||
| 494 | #define BREAK_GFP_ORDER_LO 0 | ||
| 495 | static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; | ||
| 496 | |||
| 497 | /* Macros for storing/retrieving the cachep and or slab from the | ||
| 498 | * global 'mem_map'. These are used to find the slab an obj belongs to. | ||
| 499 | * With kfree(), these are used to find the cache which an obj belongs to. | ||
| 500 | */ | ||
| 501 | #define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x)) | ||
| 502 | #define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next) | ||
| 503 | #define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x)) | ||
| 504 | #define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev) | ||
| 505 | |||
| 506 | /* These are the default caches for kmalloc. Custom caches can have other sizes. */ | ||
| 507 | struct cache_sizes malloc_sizes[] = { | ||
| 508 | #define CACHE(x) { .cs_size = (x) }, | ||
| 509 | #include <linux/kmalloc_sizes.h> | ||
| 510 | CACHE(ULONG_MAX) | ||
| 511 | #undef CACHE | ||
| 512 | }; | ||
| 513 | EXPORT_SYMBOL(malloc_sizes); | ||
| 514 | |||
| 515 | /* Must match cache_sizes above. Out of line to keep cache footprint low. */ | ||
| 516 | struct cache_names { | ||
| 517 | char *name; | ||
| 518 | char *name_dma; | ||
| 519 | }; | ||
| 520 | |||
| 521 | static struct cache_names __initdata cache_names[] = { | ||
| 522 | #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, | ||
| 523 | #include <linux/kmalloc_sizes.h> | ||
| 524 | { NULL, } | ||
| 525 | #undef CACHE | ||
| 526 | }; | ||
| 527 | |||
| 528 | static struct arraycache_init initarray_cache __initdata = | ||
| 529 | { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | ||
| 530 | static struct arraycache_init initarray_generic = | ||
| 531 | { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | ||
| 532 | |||
| 533 | /* internal cache of cache description objs */ | ||
| 534 | static kmem_cache_t cache_cache = { | ||
| 535 | .lists = LIST3_INIT(cache_cache.lists), | ||
| 536 | .batchcount = 1, | ||
| 537 | .limit = BOOT_CPUCACHE_ENTRIES, | ||
| 538 | .objsize = sizeof(kmem_cache_t), | ||
| 539 | .flags = SLAB_NO_REAP, | ||
| 540 | .spinlock = SPIN_LOCK_UNLOCKED, | ||
| 541 | .name = "kmem_cache", | ||
| 542 | #if DEBUG | ||
| 543 | .reallen = sizeof(kmem_cache_t), | ||
| 544 | #endif | ||
| 545 | }; | ||
| 546 | |||
| 547 | /* Guard access to the cache-chain. */ | ||
| 548 | static struct semaphore cache_chain_sem; | ||
| 549 | static struct list_head cache_chain; | ||
| 550 | |||
| 551 | /* | ||
| 552 | * vm_enough_memory() looks at this to determine how many | ||
| 553 | * slab-allocated pages are possibly freeable under pressure | ||
| 554 | * | ||
| 555 | * SLAB_RECLAIM_ACCOUNT turns this on per-slab | ||
| 556 | */ | ||
| 557 | atomic_t slab_reclaim_pages; | ||
| 558 | EXPORT_SYMBOL(slab_reclaim_pages); | ||
| 559 | |||
| 560 | /* | ||
| 561 | * chicken and egg problem: delay the per-cpu array allocation | ||
| 562 | * until the general caches are up. | ||
| 563 | */ | ||
| 564 | static enum { | ||
| 565 | NONE, | ||
| 566 | PARTIAL, | ||
| 567 | FULL | ||
| 568 | } g_cpucache_up; | ||
| 569 | |||
| 570 | static DEFINE_PER_CPU(struct work_struct, reap_work); | ||
| 571 | |||
| 572 | static void free_block(kmem_cache_t* cachep, void** objpp, int len); | ||
| 573 | static void enable_cpucache (kmem_cache_t *cachep); | ||
| 574 | static void cache_reap (void *unused); | ||
| 575 | |||
| 576 | static inline void **ac_entry(struct array_cache *ac) | ||
| 577 | { | ||
| 578 | return (void**)(ac+1); | ||
| 579 | } | ||
| 580 | |||
| 581 | static inline struct array_cache *ac_data(kmem_cache_t *cachep) | ||
| 582 | { | ||
| 583 | return cachep->array[smp_processor_id()]; | ||
| 584 | } | ||
| 585 | |||
| 586 | static inline kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags) | ||
| 587 | { | ||
| 588 | struct cache_sizes *csizep = malloc_sizes; | ||
| 589 | |||
| 590 | #if DEBUG | ||
| 591 | /* This happens if someone tries to call | ||
| 592 | * kmem_cache_create(), or __kmalloc(), before | ||
| 593 | * the generic caches are initialized. | ||
| 594 | */ | ||
| 595 | BUG_ON(csizep->cs_cachep == NULL); | ||
| 596 | #endif | ||
| 597 | while (size > csizep->cs_size) | ||
| 598 | csizep++; | ||
| 599 | |||
| 600 | /* | ||
| 601 | * Really subtile: The last entry with cs->cs_size==ULONG_MAX | ||
| 602 | * has cs_{dma,}cachep==NULL. Thus no special case | ||
| 603 | * for large kmalloc calls required. | ||
| 604 | */ | ||
| 605 | if (unlikely(gfpflags & GFP_DMA)) | ||
| 606 | return csizep->cs_dmacachep; | ||
| 607 | return csizep->cs_cachep; | ||
| 608 | } | ||
| 609 | |||
| 610 | /* Cal the num objs, wastage, and bytes left over for a given slab size. */ | ||
| 611 | static void cache_estimate(unsigned long gfporder, size_t size, size_t align, | ||
| 612 | int flags, size_t *left_over, unsigned int *num) | ||
| 613 | { | ||
| 614 | int i; | ||
| 615 | size_t wastage = PAGE_SIZE<<gfporder; | ||
| 616 | size_t extra = 0; | ||
| 617 | size_t base = 0; | ||
| 618 | |||
| 619 | if (!(flags & CFLGS_OFF_SLAB)) { | ||
| 620 | base = sizeof(struct slab); | ||
| 621 | extra = sizeof(kmem_bufctl_t); | ||
| 622 | } | ||
| 623 | i = 0; | ||
| 624 | while (i*size + ALIGN(base+i*extra, align) <= wastage) | ||
| 625 | i++; | ||
| 626 | if (i > 0) | ||
| 627 | i--; | ||
| 628 | |||
| 629 | if (i > SLAB_LIMIT) | ||
| 630 | i = SLAB_LIMIT; | ||
| 631 | |||
| 632 | *num = i; | ||
| 633 | wastage -= i*size; | ||
| 634 | wastage -= ALIGN(base+i*extra, align); | ||
| 635 | *left_over = wastage; | ||
| 636 | } | ||
| 637 | |||
| 638 | #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) | ||
| 639 | |||
| 640 | static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) | ||
| 641 | { | ||
| 642 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", | ||
| 643 | function, cachep->name, msg); | ||
| 644 | dump_stack(); | ||
| 645 | } | ||
| 646 | |||
| 647 | /* | ||
| 648 | * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz | ||
| 649 | * via the workqueue/eventd. | ||
| 650 | * Add the CPU number into the expiration time to minimize the possibility of | ||
| 651 | * the CPUs getting into lockstep and contending for the global cache chain | ||
| 652 | * lock. | ||
| 653 | */ | ||
| 654 | static void __devinit start_cpu_timer(int cpu) | ||
| 655 | { | ||
| 656 | struct work_struct *reap_work = &per_cpu(reap_work, cpu); | ||
| 657 | |||
| 658 | /* | ||
| 659 | * When this gets called from do_initcalls via cpucache_init(), | ||
| 660 | * init_workqueues() has already run, so keventd will be setup | ||
| 661 | * at that time. | ||
| 662 | */ | ||
| 663 | if (keventd_up() && reap_work->func == NULL) { | ||
| 664 | INIT_WORK(reap_work, cache_reap, NULL); | ||
| 665 | schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); | ||
| 666 | } | ||
| 667 | } | ||
| 668 | |||
| 669 | static struct array_cache *alloc_arraycache(int cpu, int entries, | ||
| 670 | int batchcount) | ||
| 671 | { | ||
| 672 | int memsize = sizeof(void*)*entries+sizeof(struct array_cache); | ||
| 673 | struct array_cache *nc = NULL; | ||
| 674 | |||
| 675 | if (cpu != -1) { | ||
| 676 | kmem_cache_t *cachep; | ||
| 677 | cachep = kmem_find_general_cachep(memsize, GFP_KERNEL); | ||
| 678 | if (cachep) | ||
| 679 | nc = kmem_cache_alloc_node(cachep, cpu_to_node(cpu)); | ||
| 680 | } | ||
| 681 | if (!nc) | ||
| 682 | nc = kmalloc(memsize, GFP_KERNEL); | ||
| 683 | if (nc) { | ||
| 684 | nc->avail = 0; | ||
| 685 | nc->limit = entries; | ||
| 686 | nc->batchcount = batchcount; | ||
| 687 | nc->touched = 0; | ||
| 688 | } | ||
| 689 | return nc; | ||
| 690 | } | ||
| 691 | |||
| 692 | static int __devinit cpuup_callback(struct notifier_block *nfb, | ||
| 693 | unsigned long action, void *hcpu) | ||
| 694 | { | ||
| 695 | long cpu = (long)hcpu; | ||
| 696 | kmem_cache_t* cachep; | ||
| 697 | |||
| 698 | switch (action) { | ||
| 699 | case CPU_UP_PREPARE: | ||
| 700 | down(&cache_chain_sem); | ||
| 701 | list_for_each_entry(cachep, &cache_chain, next) { | ||
| 702 | struct array_cache *nc; | ||
| 703 | |||
| 704 | nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount); | ||
| 705 | if (!nc) | ||
| 706 | goto bad; | ||
| 707 | |||
| 708 | spin_lock_irq(&cachep->spinlock); | ||
| 709 | cachep->array[cpu] = nc; | ||
| 710 | cachep->free_limit = (1+num_online_cpus())*cachep->batchcount | ||
| 711 | + cachep->num; | ||
| 712 | spin_unlock_irq(&cachep->spinlock); | ||
| 713 | |||
| 714 | } | ||
| 715 | up(&cache_chain_sem); | ||
| 716 | break; | ||
| 717 | case CPU_ONLINE: | ||
| 718 | start_cpu_timer(cpu); | ||
| 719 | break; | ||
| 720 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 721 | case CPU_DEAD: | ||
| 722 | /* fall thru */ | ||
| 723 | case CPU_UP_CANCELED: | ||
| 724 | down(&cache_chain_sem); | ||
| 725 | |||
| 726 | list_for_each_entry(cachep, &cache_chain, next) { | ||
| 727 | struct array_cache *nc; | ||
| 728 | |||
| 729 | spin_lock_irq(&cachep->spinlock); | ||
| 730 | /* cpu is dead; no one can alloc from it. */ | ||
| 731 | nc = cachep->array[cpu]; | ||
| 732 | cachep->array[cpu] = NULL; | ||
| 733 | cachep->free_limit -= cachep->batchcount; | ||
| 734 | free_block(cachep, ac_entry(nc), nc->avail); | ||
| 735 | spin_unlock_irq(&cachep->spinlock); | ||
| 736 | kfree(nc); | ||
| 737 | } | ||
| 738 | up(&cache_chain_sem); | ||
| 739 | break; | ||
| 740 | #endif | ||
| 741 | } | ||
| 742 | return NOTIFY_OK; | ||
| 743 | bad: | ||
| 744 | up(&cache_chain_sem); | ||
| 745 | return NOTIFY_BAD; | ||
| 746 | } | ||
| 747 | |||
| 748 | static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; | ||
| 749 | |||
| 750 | /* Initialisation. | ||
| 751 | * Called after the gfp() functions have been enabled, and before smp_init(). | ||
| 752 | */ | ||
| 753 | void __init kmem_cache_init(void) | ||
| 754 | { | ||
| 755 | size_t left_over; | ||
| 756 | struct cache_sizes *sizes; | ||
| 757 | struct cache_names *names; | ||
| 758 | |||
| 759 | /* | ||
| 760 | * Fragmentation resistance on low memory - only use bigger | ||
| 761 | * page orders on machines with more than 32MB of memory. | ||
| 762 | */ | ||
| 763 | if (num_physpages > (32 << 20) >> PAGE_SHIFT) | ||
| 764 | slab_break_gfp_order = BREAK_GFP_ORDER_HI; | ||
| 765 | |||
| 766 | |||
| 767 | /* Bootstrap is tricky, because several objects are allocated | ||
| 768 | * from caches that do not exist yet: | ||
| 769 | * 1) initialize the cache_cache cache: it contains the kmem_cache_t | ||
| 770 | * structures of all caches, except cache_cache itself: cache_cache | ||
| 771 | * is statically allocated. | ||
| 772 | * Initially an __init data area is used for the head array, it's | ||
| 773 | * replaced with a kmalloc allocated array at the end of the bootstrap. | ||
| 774 | * 2) Create the first kmalloc cache. | ||
| 775 | * The kmem_cache_t for the new cache is allocated normally. An __init | ||
| 776 | * data area is used for the head array. | ||
| 777 | * 3) Create the remaining kmalloc caches, with minimally sized head arrays. | ||
| 778 | * 4) Replace the __init data head arrays for cache_cache and the first | ||
| 779 | * kmalloc cache with kmalloc allocated arrays. | ||
| 780 | * 5) Resize the head arrays of the kmalloc caches to their final sizes. | ||
| 781 | */ | ||
| 782 | |||
| 783 | /* 1) create the cache_cache */ | ||
| 784 | init_MUTEX(&cache_chain_sem); | ||
| 785 | INIT_LIST_HEAD(&cache_chain); | ||
| 786 | list_add(&cache_cache.next, &cache_chain); | ||
| 787 | cache_cache.colour_off = cache_line_size(); | ||
| 788 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; | ||
| 789 | |||
| 790 | cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); | ||
| 791 | |||
| 792 | cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, | ||
| 793 | &left_over, &cache_cache.num); | ||
| 794 | if (!cache_cache.num) | ||
| 795 | BUG(); | ||
| 796 | |||
| 797 | cache_cache.colour = left_over/cache_cache.colour_off; | ||
| 798 | cache_cache.colour_next = 0; | ||
| 799 | cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + | ||
| 800 | sizeof(struct slab), cache_line_size()); | ||
| 801 | |||
| 802 | /* 2+3) create the kmalloc caches */ | ||
| 803 | sizes = malloc_sizes; | ||
| 804 | names = cache_names; | ||
| 805 | |||
| 806 | while (sizes->cs_size != ULONG_MAX) { | ||
| 807 | /* For performance, all the general caches are L1 aligned. | ||
| 808 | * This should be particularly beneficial on SMP boxes, as it | ||
| 809 | * eliminates "false sharing". | ||
| 810 | * Note for systems short on memory removing the alignment will | ||
| 811 | * allow tighter packing of the smaller caches. */ | ||
| 812 | sizes->cs_cachep = kmem_cache_create(names->name, | ||
| 813 | sizes->cs_size, ARCH_KMALLOC_MINALIGN, | ||
| 814 | (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); | ||
| 815 | |||
| 816 | /* Inc off-slab bufctl limit until the ceiling is hit. */ | ||
| 817 | if (!(OFF_SLAB(sizes->cs_cachep))) { | ||
| 818 | offslab_limit = sizes->cs_size-sizeof(struct slab); | ||
| 819 | offslab_limit /= sizeof(kmem_bufctl_t); | ||
| 820 | } | ||
| 821 | |||
| 822 | sizes->cs_dmacachep = kmem_cache_create(names->name_dma, | ||
| 823 | sizes->cs_size, ARCH_KMALLOC_MINALIGN, | ||
| 824 | (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), | ||
| 825 | NULL, NULL); | ||
| 826 | |||
| 827 | sizes++; | ||
| 828 | names++; | ||
| 829 | } | ||
| 830 | /* 4) Replace the bootstrap head arrays */ | ||
| 831 | { | ||
| 832 | void * ptr; | ||
| 833 | |||
| 834 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | ||
| 835 | local_irq_disable(); | ||
| 836 | BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); | ||
| 837 | memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init)); | ||
| 838 | cache_cache.array[smp_processor_id()] = ptr; | ||
| 839 | local_irq_enable(); | ||
| 840 | |||
| 841 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | ||
| 842 | local_irq_disable(); | ||
| 843 | BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache); | ||
| 844 | memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep), | ||
| 845 | sizeof(struct arraycache_init)); | ||
| 846 | malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr; | ||
| 847 | local_irq_enable(); | ||
| 848 | } | ||
| 849 | |||
| 850 | /* 5) resize the head arrays to their final sizes */ | ||
| 851 | { | ||
| 852 | kmem_cache_t *cachep; | ||
| 853 | down(&cache_chain_sem); | ||
| 854 | list_for_each_entry(cachep, &cache_chain, next) | ||
| 855 | enable_cpucache(cachep); | ||
| 856 | up(&cache_chain_sem); | ||
| 857 | } | ||
| 858 | |||
| 859 | /* Done! */ | ||
| 860 | g_cpucache_up = FULL; | ||
| 861 | |||
| 862 | /* Register a cpu startup notifier callback | ||
| 863 | * that initializes ac_data for all new cpus | ||
| 864 | */ | ||
| 865 | register_cpu_notifier(&cpucache_notifier); | ||
| 866 | |||
| 867 | |||
| 868 | /* The reap timers are started later, with a module init call: | ||
| 869 | * That part of the kernel is not yet operational. | ||
| 870 | */ | ||
| 871 | } | ||
| 872 | |||
| 873 | static int __init cpucache_init(void) | ||
| 874 | { | ||
| 875 | int cpu; | ||
| 876 | |||
| 877 | /* | ||
| 878 | * Register the timers that return unneeded | ||
| 879 | * pages to gfp. | ||
| 880 | */ | ||
| 881 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
| 882 | if (cpu_online(cpu)) | ||
| 883 | start_cpu_timer(cpu); | ||
| 884 | } | ||
| 885 | |||
| 886 | return 0; | ||
| 887 | } | ||
| 888 | |||
| 889 | __initcall(cpucache_init); | ||
| 890 | |||
| 891 | /* | ||
| 892 | * Interface to system's page allocator. No need to hold the cache-lock. | ||
| 893 | * | ||
| 894 | * If we requested dmaable memory, we will get it. Even if we | ||
| 895 | * did not request dmaable memory, we might get it, but that | ||
| 896 | * would be relatively rare and ignorable. | ||
| 897 | */ | ||
| 898 | static void *kmem_getpages(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid) | ||
| 899 | { | ||
| 900 | struct page *page; | ||
| 901 | void *addr; | ||
| 902 | int i; | ||
| 903 | |||
| 904 | flags |= cachep->gfpflags; | ||
| 905 | if (likely(nodeid == -1)) { | ||
| 906 | page = alloc_pages(flags, cachep->gfporder); | ||
| 907 | } else { | ||
| 908 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | ||
| 909 | } | ||
| 910 | if (!page) | ||
| 911 | return NULL; | ||
| 912 | addr = page_address(page); | ||
| 913 | |||
| 914 | i = (1 << cachep->gfporder); | ||
| 915 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | ||
| 916 | atomic_add(i, &slab_reclaim_pages); | ||
| 917 | add_page_state(nr_slab, i); | ||
| 918 | while (i--) { | ||
| 919 | SetPageSlab(page); | ||
| 920 | page++; | ||
| 921 | } | ||
| 922 | return addr; | ||
| 923 | } | ||
| 924 | |||
| 925 | /* | ||
| 926 | * Interface to system's page release. | ||
| 927 | */ | ||
| 928 | static void kmem_freepages(kmem_cache_t *cachep, void *addr) | ||
| 929 | { | ||
| 930 | unsigned long i = (1<<cachep->gfporder); | ||
| 931 | struct page *page = virt_to_page(addr); | ||
| 932 | const unsigned long nr_freed = i; | ||
| 933 | |||
| 934 | while (i--) { | ||
| 935 | if (!TestClearPageSlab(page)) | ||
| 936 | BUG(); | ||
| 937 | page++; | ||
| 938 | } | ||
| 939 | sub_page_state(nr_slab, nr_freed); | ||
| 940 | if (current->reclaim_state) | ||
| 941 | current->reclaim_state->reclaimed_slab += nr_freed; | ||
| 942 | free_pages((unsigned long)addr, cachep->gfporder); | ||
| 943 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | ||
| 944 | atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages); | ||
| 945 | } | ||
| 946 | |||
| 947 | static void kmem_rcu_free(struct rcu_head *head) | ||
| 948 | { | ||
| 949 | struct slab_rcu *slab_rcu = (struct slab_rcu *) head; | ||
| 950 | kmem_cache_t *cachep = slab_rcu->cachep; | ||
| 951 | |||
| 952 | kmem_freepages(cachep, slab_rcu->addr); | ||
| 953 | if (OFF_SLAB(cachep)) | ||
| 954 | kmem_cache_free(cachep->slabp_cache, slab_rcu); | ||
| 955 | } | ||
| 956 | |||
| 957 | #if DEBUG | ||
| 958 | |||
#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * Record allocation context inside the real object at @addr: a start
 * marker, @caller, the current CPU id, then every word found on the stack
 * (scanning upwards from &caller) that is a kernel text address, and
 * finally an end marker.  Objects shorter than five words are left alone.
 */
static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
				unsigned long caller)
{
	int room = obj_reallen(cachep);
	unsigned long *out;
	unsigned long *sp;

	out = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)];

	if (room < 5*sizeof(unsigned long))
		return;

	*out++ = 0x12345678;
	*out++ = caller;
	*out++ = smp_processor_id();
	room -= 3*sizeof(unsigned long);

	for (sp = &caller; !kstack_end(sp); ) {
		unsigned long word = *sp++;

		if (!kernel_text_address(word))
			continue;
		*out++ = word;
		room -= sizeof(unsigned long);
		/* Keep the last word free for the end marker. */
		if (room <= sizeof(unsigned long))
			break;
	}
	*out = 0x87654321;
}
#endif
| 992 | |||
| 993 | static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) | ||
| 994 | { | ||
| 995 | int size = obj_reallen(cachep); | ||
| 996 | addr = &((char*)addr)[obj_dbghead(cachep)]; | ||
| 997 | |||
| 998 | memset(addr, val, size); | ||
| 999 | *(unsigned char *)(addr+size-1) = POISON_END; | ||
| 1000 | } | ||
| 1001 | |||
| 1002 | static void dump_line(char *data, int offset, int limit) | ||
| 1003 | { | ||
| 1004 | int i; | ||
| 1005 | printk(KERN_ERR "%03x:", offset); | ||
| 1006 | for (i=0;i<limit;i++) { | ||
| 1007 | printk(" %02x", (unsigned char)data[offset+i]); | ||
| 1008 | } | ||
| 1009 | printk("\n"); | ||
| 1010 | } | ||
| 1011 | #endif | ||
| 1012 | |||
| 1013 | #if DEBUG | ||
| 1014 | |||
| 1015 | static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) | ||
| 1016 | { | ||
| 1017 | int i, size; | ||
| 1018 | char *realobj; | ||
| 1019 | |||
| 1020 | if (cachep->flags & SLAB_RED_ZONE) { | ||
| 1021 | printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", | ||
| 1022 | *dbg_redzone1(cachep, objp), | ||
| 1023 | *dbg_redzone2(cachep, objp)); | ||
| 1024 | } | ||
| 1025 | |||
| 1026 | if (cachep->flags & SLAB_STORE_USER) { | ||
| 1027 | printk(KERN_ERR "Last user: [<%p>]", | ||
| 1028 | *dbg_userword(cachep, objp)); | ||
| 1029 | print_symbol("(%s)", | ||
| 1030 | (unsigned long)*dbg_userword(cachep, objp)); | ||
| 1031 | printk("\n"); | ||
| 1032 | } | ||
| 1033 | realobj = (char*)objp+obj_dbghead(cachep); | ||
| 1034 | size = obj_reallen(cachep); | ||
| 1035 | for (i=0; i<size && lines;i+=16, lines--) { | ||
| 1036 | int limit; | ||
| 1037 | limit = 16; | ||
| 1038 | if (i+limit > size) | ||
| 1039 | limit = size-i; | ||
| 1040 | dump_line(realobj, i, limit); | ||
| 1041 | } | ||
| 1042 | } | ||
| 1043 | |||
| 1044 | static void check_poison_obj(kmem_cache_t *cachep, void *objp) | ||
| 1045 | { | ||
| 1046 | char *realobj; | ||
| 1047 | int size, i; | ||
| 1048 | int lines = 0; | ||
| 1049 | |||
| 1050 | realobj = (char*)objp+obj_dbghead(cachep); | ||
| 1051 | size = obj_reallen(cachep); | ||
| 1052 | |||
| 1053 | for (i=0;i<size;i++) { | ||
| 1054 | char exp = POISON_FREE; | ||
| 1055 | if (i == size-1) | ||
| 1056 | exp = POISON_END; | ||
| 1057 | if (realobj[i] != exp) { | ||
| 1058 | int limit; | ||
| 1059 | /* Mismatch ! */ | ||
| 1060 | /* Print header */ | ||
| 1061 | if (lines == 0) { | ||
| 1062 | printk(KERN_ERR "Slab corruption: start=%p, len=%d\n", | ||
| 1063 | realobj, size); | ||
| 1064 | print_objinfo(cachep, objp, 0); | ||
| 1065 | } | ||
| 1066 | /* Hexdump the affected line */ | ||
| 1067 | i = (i/16)*16; | ||
| 1068 | limit = 16; | ||
| 1069 | if (i+limit > size) | ||
| 1070 | limit = size-i; | ||
| 1071 | dump_line(realobj, i, limit); | ||
| 1072 | i += 16; | ||
| 1073 | lines++; | ||
| 1074 | /* Limit to 5 lines */ | ||
| 1075 | if (lines > 5) | ||
| 1076 | break; | ||
| 1077 | } | ||
| 1078 | } | ||
| 1079 | if (lines != 0) { | ||
| 1080 | /* Print some data about the neighboring objects, if they | ||
| 1081 | * exist: | ||
| 1082 | */ | ||
| 1083 | struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp)); | ||
| 1084 | int objnr; | ||
| 1085 | |||
| 1086 | objnr = (objp-slabp->s_mem)/cachep->objsize; | ||
| 1087 | if (objnr) { | ||
| 1088 | objp = slabp->s_mem+(objnr-1)*cachep->objsize; | ||
| 1089 | realobj = (char*)objp+obj_dbghead(cachep); | ||
| 1090 | printk(KERN_ERR "Prev obj: start=%p, len=%d\n", | ||
| 1091 | realobj, size); | ||
| 1092 | print_objinfo(cachep, objp, 2); | ||
| 1093 | } | ||
| 1094 | if (objnr+1 < cachep->num) { | ||
| 1095 | objp = slabp->s_mem+(objnr+1)*cachep->objsize; | ||
| 1096 | realobj = (char*)objp+obj_dbghead(cachep); | ||
| 1097 | printk(KERN_ERR "Next obj: start=%p, len=%d\n", | ||
| 1098 | realobj, size); | ||
| 1099 | print_objinfo(cachep, objp, 2); | ||
| 1100 | } | ||
| 1101 | } | ||
| 1102 | } | ||
| 1103 | #endif | ||
| 1104 | |||
| 1105 | /* Destroy all the objs in a slab, and release the mem back to the system. | ||
| 1106 | * Before calling the slab must have been unlinked from the cache. | ||
| 1107 | * The cache-lock is not held/needed. | ||
| 1108 | */ | ||
| 1109 | static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) | ||
| 1110 | { | ||
| 1111 | void *addr = slabp->s_mem - slabp->colouroff; | ||
| 1112 | |||
| 1113 | #if DEBUG | ||
| 1114 | int i; | ||
| 1115 | for (i = 0; i < cachep->num; i++) { | ||
| 1116 | void *objp = slabp->s_mem + cachep->objsize * i; | ||
| 1117 | |||
| 1118 | if (cachep->flags & SLAB_POISON) { | ||
| 1119 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 1120 | if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) | ||
| 1121 | kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); | ||
| 1122 | else | ||
| 1123 | check_poison_obj(cachep, objp); | ||
| 1124 | #else | ||
| 1125 | check_poison_obj(cachep, objp); | ||
| 1126 | #endif | ||
| 1127 | } | ||
| 1128 | if (cachep->flags & SLAB_RED_ZONE) { | ||
| 1129 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) | ||
| 1130 | slab_error(cachep, "start of a freed object " | ||
| 1131 | "was overwritten"); | ||
| 1132 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | ||
| 1133 | slab_error(cachep, "end of a freed object " | ||
| 1134 | "was overwritten"); | ||
| 1135 | } | ||
| 1136 | if (cachep->dtor && !(cachep->flags & SLAB_POISON)) | ||
| 1137 | (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); | ||
| 1138 | } | ||
| 1139 | #else | ||
| 1140 | if (cachep->dtor) { | ||
| 1141 | int i; | ||
| 1142 | for (i = 0; i < cachep->num; i++) { | ||
| 1143 | void* objp = slabp->s_mem+cachep->objsize*i; | ||
| 1144 | (cachep->dtor)(objp, cachep, 0); | ||
| 1145 | } | ||
| 1146 | } | ||
| 1147 | #endif | ||
| 1148 | |||
| 1149 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { | ||
| 1150 | struct slab_rcu *slab_rcu; | ||
| 1151 | |||
| 1152 | slab_rcu = (struct slab_rcu *) slabp; | ||
| 1153 | slab_rcu->cachep = cachep; | ||
| 1154 | slab_rcu->addr = addr; | ||
| 1155 | call_rcu(&slab_rcu->head, kmem_rcu_free); | ||
| 1156 | } else { | ||
| 1157 | kmem_freepages(cachep, addr); | ||
| 1158 | if (OFF_SLAB(cachep)) | ||
| 1159 | kmem_cache_free(cachep->slabp_cache, slabp); | ||
| 1160 | } | ||
| 1161 | } | ||
| 1162 | |||
| 1163 | /** | ||
| 1164 | * kmem_cache_create - Create a cache. | ||
| 1165 | * @name: A string which is used in /proc/slabinfo to identify this cache. | ||
| 1166 | * @size: The size of objects to be created in this cache. | ||
| 1167 | * @align: The required alignment for the objects. | ||
| 1168 | * @flags: SLAB flags | ||
| 1169 | * @ctor: A constructor for the objects. | ||
| 1170 | * @dtor: A destructor for the objects. | ||
| 1171 | * | ||
| 1172 | * Returns a ptr to the cache on success, NULL on failure. | ||
| 1173 | * Cannot be called within a int, but can be interrupted. | ||
| 1174 | * The @ctor is run when new pages are allocated by the cache | ||
| 1175 | * and the @dtor is run before the pages are handed back. | ||
| 1176 | * | ||
| 1177 | * @name must be valid until the cache is destroyed. This implies that | ||
| 1178 | * the module calling this has to destroy the cache before getting | ||
| 1179 | * unloaded. | ||
| 1180 | * | ||
| 1181 | * The flags are | ||
| 1182 | * | ||
| 1183 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) | ||
| 1184 | * to catch references to uninitialised memory. | ||
| 1185 | * | ||
| 1186 | * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check | ||
| 1187 | * for buffer overruns. | ||
| 1188 | * | ||
| 1189 | * %SLAB_NO_REAP - Don't automatically reap this cache when we're under | ||
| 1190 | * memory pressure. | ||
| 1191 | * | ||
| 1192 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware | ||
| 1193 | * cacheline. This can be beneficial if you're counting cycles as closely | ||
| 1194 | * as davem. | ||
| 1195 | */ | ||
| 1196 | kmem_cache_t * | ||
| 1197 | kmem_cache_create (const char *name, size_t size, size_t align, | ||
| 1198 | unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), | ||
| 1199 | void (*dtor)(void*, kmem_cache_t *, unsigned long)) | ||
| 1200 | { | ||
| 1201 | size_t left_over, slab_size, ralign; | ||
| 1202 | kmem_cache_t *cachep = NULL; | ||
| 1203 | |||
| 1204 | /* | ||
| 1205 | * Sanity checks... these are all serious usage bugs. | ||
| 1206 | */ | ||
| 1207 | if ((!name) || | ||
| 1208 | in_interrupt() || | ||
| 1209 | (size < BYTES_PER_WORD) || | ||
| 1210 | (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) || | ||
| 1211 | (dtor && !ctor)) { | ||
| 1212 | printk(KERN_ERR "%s: Early error in slab %s\n", | ||
| 1213 | __FUNCTION__, name); | ||
| 1214 | BUG(); | ||
| 1215 | } | ||
| 1216 | |||
| 1217 | #if DEBUG | ||
| 1218 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | ||
| 1219 | if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { | ||
| 1220 | /* No constructor, but inital state check requested */ | ||
| 1221 | printk(KERN_ERR "%s: No con, but init state check " | ||
| 1222 | "requested - %s\n", __FUNCTION__, name); | ||
| 1223 | flags &= ~SLAB_DEBUG_INITIAL; | ||
| 1224 | } | ||
| 1225 | |||
| 1226 | #if FORCED_DEBUG | ||
| 1227 | /* | ||
| 1228 | * Enable redzoning and last user accounting, except for caches with | ||
| 1229 | * large objects, if the increased size would increase the object size | ||
| 1230 | * above the next power of two: caches with object sizes just above a | ||
| 1231 | * power of two have a significant amount of internal fragmentation. | ||
| 1232 | */ | ||
| 1233 | if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) | ||
| 1234 | flags |= SLAB_RED_ZONE|SLAB_STORE_USER; | ||
| 1235 | if (!(flags & SLAB_DESTROY_BY_RCU)) | ||
| 1236 | flags |= SLAB_POISON; | ||
| 1237 | #endif | ||
| 1238 | if (flags & SLAB_DESTROY_BY_RCU) | ||
| 1239 | BUG_ON(flags & SLAB_POISON); | ||
| 1240 | #endif | ||
| 1241 | if (flags & SLAB_DESTROY_BY_RCU) | ||
| 1242 | BUG_ON(dtor); | ||
| 1243 | |||
| 1244 | /* | ||
| 1245 | * Always checks flags, a caller might be expecting debug | ||
| 1246 | * support which isn't available. | ||
| 1247 | */ | ||
| 1248 | if (flags & ~CREATE_MASK) | ||
| 1249 | BUG(); | ||
| 1250 | |||
| 1251 | /* Check that size is in terms of words. This is needed to avoid | ||
| 1252 | * unaligned accesses for some archs when redzoning is used, and makes | ||
| 1253 | * sure any on-slab bufctl's are also correctly aligned. | ||
| 1254 | */ | ||
| 1255 | if (size & (BYTES_PER_WORD-1)) { | ||
| 1256 | size += (BYTES_PER_WORD-1); | ||
| 1257 | size &= ~(BYTES_PER_WORD-1); | ||
| 1258 | } | ||
| 1259 | |||
| 1260 | /* calculate out the final buffer alignment: */ | ||
| 1261 | /* 1) arch recommendation: can be overridden for debug */ | ||
| 1262 | if (flags & SLAB_HWCACHE_ALIGN) { | ||
| 1263 | /* Default alignment: as specified by the arch code. | ||
| 1264 | * Except if an object is really small, then squeeze multiple | ||
| 1265 | * objects into one cacheline. | ||
| 1266 | */ | ||
| 1267 | ralign = cache_line_size(); | ||
| 1268 | while (size <= ralign/2) | ||
| 1269 | ralign /= 2; | ||
| 1270 | } else { | ||
| 1271 | ralign = BYTES_PER_WORD; | ||
| 1272 | } | ||
| 1273 | /* 2) arch mandated alignment: disables debug if necessary */ | ||
| 1274 | if (ralign < ARCH_SLAB_MINALIGN) { | ||
| 1275 | ralign = ARCH_SLAB_MINALIGN; | ||
| 1276 | if (ralign > BYTES_PER_WORD) | ||
| 1277 | flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); | ||
| 1278 | } | ||
| 1279 | /* 3) caller mandated alignment: disables debug if necessary */ | ||
| 1280 | if (ralign < align) { | ||
| 1281 | ralign = align; | ||
| 1282 | if (ralign > BYTES_PER_WORD) | ||
| 1283 | flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); | ||
| 1284 | } | ||
| 1285 | /* 4) Store it. Note that the debug code below can reduce | ||
| 1286 | * the alignment to BYTES_PER_WORD. | ||
| 1287 | */ | ||
| 1288 | align = ralign; | ||
| 1289 | |||
| 1290 | /* Get cache's description obj. */ | ||
| 1291 | cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); | ||
| 1292 | if (!cachep) | ||
| 1293 | goto opps; | ||
| 1294 | memset(cachep, 0, sizeof(kmem_cache_t)); | ||
| 1295 | |||
| 1296 | #if DEBUG | ||
| 1297 | cachep->reallen = size; | ||
| 1298 | |||
| 1299 | if (flags & SLAB_RED_ZONE) { | ||
| 1300 | /* redzoning only works with word aligned caches */ | ||
| 1301 | align = BYTES_PER_WORD; | ||
| 1302 | |||
| 1303 | /* add space for red zone words */ | ||
| 1304 | cachep->dbghead += BYTES_PER_WORD; | ||
| 1305 | size += 2*BYTES_PER_WORD; | ||
| 1306 | } | ||
| 1307 | if (flags & SLAB_STORE_USER) { | ||
| 1308 | /* user store requires word alignment and | ||
| 1309 | * one word storage behind the end of the real | ||
| 1310 | * object. | ||
| 1311 | */ | ||
| 1312 | align = BYTES_PER_WORD; | ||
| 1313 | size += BYTES_PER_WORD; | ||
| 1314 | } | ||
| 1315 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | ||
| 1316 | if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { | ||
| 1317 | cachep->dbghead += PAGE_SIZE - size; | ||
| 1318 | size = PAGE_SIZE; | ||
| 1319 | } | ||
| 1320 | #endif | ||
| 1321 | #endif | ||
| 1322 | |||
| 1323 | /* Determine if the slab management is 'on' or 'off' slab. */ | ||
| 1324 | if (size >= (PAGE_SIZE>>3)) | ||
| 1325 | /* | ||
| 1326 | * Size is large, assume best to place the slab management obj | ||
| 1327 | * off-slab (should allow better packing of objs). | ||
| 1328 | */ | ||
| 1329 | flags |= CFLGS_OFF_SLAB; | ||
| 1330 | |||
| 1331 | size = ALIGN(size, align); | ||
| 1332 | |||
| 1333 | if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) { | ||
| 1334 | /* | ||
| 1335 | * A VFS-reclaimable slab tends to have most allocations | ||
| 1336 | * as GFP_NOFS and we really don't want to have to be allocating | ||
| 1337 | * higher-order pages when we are unable to shrink dcache. | ||
| 1338 | */ | ||
| 1339 | cachep->gfporder = 0; | ||
| 1340 | cache_estimate(cachep->gfporder, size, align, flags, | ||
| 1341 | &left_over, &cachep->num); | ||
| 1342 | } else { | ||
| 1343 | /* | ||
| 1344 | * Calculate size (in pages) of slabs, and the num of objs per | ||
| 1345 | * slab. This could be made much more intelligent. For now, | ||
| 1346 | * try to avoid using high page-orders for slabs. When the | ||
| 1347 | * gfp() funcs are more friendly towards high-order requests, | ||
| 1348 | * this should be changed. | ||
| 1349 | */ | ||
| 1350 | do { | ||
| 1351 | unsigned int break_flag = 0; | ||
| 1352 | cal_wastage: | ||
| 1353 | cache_estimate(cachep->gfporder, size, align, flags, | ||
| 1354 | &left_over, &cachep->num); | ||
| 1355 | if (break_flag) | ||
| 1356 | break; | ||
| 1357 | if (cachep->gfporder >= MAX_GFP_ORDER) | ||
| 1358 | break; | ||
| 1359 | if (!cachep->num) | ||
| 1360 | goto next; | ||
| 1361 | if (flags & CFLGS_OFF_SLAB && | ||
| 1362 | cachep->num > offslab_limit) { | ||
| 1363 | /* This num of objs will cause problems. */ | ||
| 1364 | cachep->gfporder--; | ||
| 1365 | break_flag++; | ||
| 1366 | goto cal_wastage; | ||
| 1367 | } | ||
| 1368 | |||
| 1369 | /* | ||
| 1370 | * Large num of objs is good, but v. large slabs are | ||
| 1371 | * currently bad for the gfp()s. | ||
| 1372 | */ | ||
| 1373 | if (cachep->gfporder >= slab_break_gfp_order) | ||
| 1374 | break; | ||
| 1375 | |||
| 1376 | if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder)) | ||
| 1377 | break; /* Acceptable internal fragmentation. */ | ||
| 1378 | next: | ||
| 1379 | cachep->gfporder++; | ||
| 1380 | } while (1); | ||
| 1381 | } | ||
| 1382 | |||
| 1383 | if (!cachep->num) { | ||
| 1384 | printk("kmem_cache_create: couldn't create cache %s.\n", name); | ||
| 1385 | kmem_cache_free(&cache_cache, cachep); | ||
| 1386 | cachep = NULL; | ||
| 1387 | goto opps; | ||
| 1388 | } | ||
| 1389 | slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) | ||
| 1390 | + sizeof(struct slab), align); | ||
| 1391 | |||
| 1392 | /* | ||
| 1393 | * If the slab has been placed off-slab, and we have enough space then | ||
| 1394 | * move it on-slab. This is at the expense of any extra colouring. | ||
| 1395 | */ | ||
| 1396 | if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { | ||
| 1397 | flags &= ~CFLGS_OFF_SLAB; | ||
| 1398 | left_over -= slab_size; | ||
| 1399 | } | ||
| 1400 | |||
| 1401 | if (flags & CFLGS_OFF_SLAB) { | ||
| 1402 | /* really off slab. No need for manual alignment */ | ||
| 1403 | slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); | ||
| 1404 | } | ||
| 1405 | |||
| 1406 | cachep->colour_off = cache_line_size(); | ||
| 1407 | /* Offset must be a multiple of the alignment. */ | ||
| 1408 | if (cachep->colour_off < align) | ||
| 1409 | cachep->colour_off = align; | ||
| 1410 | cachep->colour = left_over/cachep->colour_off; | ||
| 1411 | cachep->slab_size = slab_size; | ||
| 1412 | cachep->flags = flags; | ||
| 1413 | cachep->gfpflags = 0; | ||
| 1414 | if (flags & SLAB_CACHE_DMA) | ||
| 1415 | cachep->gfpflags |= GFP_DMA; | ||
| 1416 | spin_lock_init(&cachep->spinlock); | ||
| 1417 | cachep->objsize = size; | ||
| 1418 | /* NUMA */ | ||
| 1419 | INIT_LIST_HEAD(&cachep->lists.slabs_full); | ||
| 1420 | INIT_LIST_HEAD(&cachep->lists.slabs_partial); | ||
| 1421 | INIT_LIST_HEAD(&cachep->lists.slabs_free); | ||
| 1422 | |||
| 1423 | if (flags & CFLGS_OFF_SLAB) | ||
| 1424 | cachep->slabp_cache = kmem_find_general_cachep(slab_size,0); | ||
| 1425 | cachep->ctor = ctor; | ||
| 1426 | cachep->dtor = dtor; | ||
| 1427 | cachep->name = name; | ||
| 1428 | |||
| 1429 | /* Don't let CPUs to come and go */ | ||
| 1430 | lock_cpu_hotplug(); | ||
| 1431 | |||
| 1432 | if (g_cpucache_up == FULL) { | ||
| 1433 | enable_cpucache(cachep); | ||
| 1434 | } else { | ||
| 1435 | if (g_cpucache_up == NONE) { | ||
| 1436 | /* Note: the first kmem_cache_create must create | ||
| 1437 | * the cache that's used by kmalloc(24), otherwise | ||
| 1438 | * the creation of further caches will BUG(). | ||
| 1439 | */ | ||
| 1440 | cachep->array[smp_processor_id()] = &initarray_generic.cache; | ||
| 1441 | g_cpucache_up = PARTIAL; | ||
| 1442 | } else { | ||
| 1443 | cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL); | ||
| 1444 | } | ||
| 1445 | BUG_ON(!ac_data(cachep)); | ||
| 1446 | ac_data(cachep)->avail = 0; | ||
| 1447 | ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; | ||
| 1448 | ac_data(cachep)->batchcount = 1; | ||
| 1449 | ac_data(cachep)->touched = 0; | ||
| 1450 | cachep->batchcount = 1; | ||
| 1451 | cachep->limit = BOOT_CPUCACHE_ENTRIES; | ||
| 1452 | cachep->free_limit = (1+num_online_cpus())*cachep->batchcount | ||
| 1453 | + cachep->num; | ||
| 1454 | } | ||
| 1455 | |||
| 1456 | cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 + | ||
| 1457 | ((unsigned long)cachep)%REAPTIMEOUT_LIST3; | ||
| 1458 | |||
| 1459 | /* Need the semaphore to access the chain. */ | ||
| 1460 | down(&cache_chain_sem); | ||
| 1461 | { | ||
| 1462 | struct list_head *p; | ||
| 1463 | mm_segment_t old_fs; | ||
| 1464 | |||
| 1465 | old_fs = get_fs(); | ||
| 1466 | set_fs(KERNEL_DS); | ||
| 1467 | list_for_each(p, &cache_chain) { | ||
| 1468 | kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); | ||
| 1469 | char tmp; | ||
| 1470 | /* This happens when the module gets unloaded and doesn't | ||
| 1471 | destroy its slab cache and noone else reuses the vmalloc | ||
| 1472 | area of the module. Print a warning. */ | ||
| 1473 | if (__get_user(tmp,pc->name)) { | ||
| 1474 | printk("SLAB: cache with size %d has lost its name\n", | ||
| 1475 | pc->objsize); | ||
| 1476 | continue; | ||
| 1477 | } | ||
| 1478 | if (!strcmp(pc->name,name)) { | ||
| 1479 | printk("kmem_cache_create: duplicate cache %s\n",name); | ||
| 1480 | up(&cache_chain_sem); | ||
| 1481 | unlock_cpu_hotplug(); | ||
| 1482 | BUG(); | ||
| 1483 | } | ||
| 1484 | } | ||
| 1485 | set_fs(old_fs); | ||
| 1486 | } | ||
| 1487 | |||
| 1488 | /* cache setup completed, link it into the list */ | ||
| 1489 | list_add(&cachep->next, &cache_chain); | ||
| 1490 | up(&cache_chain_sem); | ||
| 1491 | unlock_cpu_hotplug(); | ||
| 1492 | opps: | ||
| 1493 | if (!cachep && (flags & SLAB_PANIC)) | ||
| 1494 | panic("kmem_cache_create(): failed to create slab `%s'\n", | ||
| 1495 | name); | ||
| 1496 | return cachep; | ||
| 1497 | } | ||
| 1498 | EXPORT_SYMBOL(kmem_cache_create); | ||
| 1499 | |||
#if DEBUG
/* BUG() unless local interrupts are disabled. */
static void check_irq_off(void)
{
	BUG_ON(!irqs_disabled());
}

/* BUG() unless local interrupts are enabled. */
static void check_irq_on(void)
{
	BUG_ON(irqs_disabled());
}

/*
 * On SMP: BUG() unless interrupts are off and the cache spinlock is
 * already taken (i.e. a trylock on it fails).
 */
static void check_spinlock_acquired(kmem_cache_t *cachep)
{
#ifdef CONFIG_SMP
	check_irq_off();
	if (spin_trylock(&cachep->spinlock))
		BUG();
#endif
}
#else
/* Non-debug builds: the checks compile away to nothing. */
#define check_irq_off()	do { } while(0)
#define check_irq_on()	do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
#endif
| 1523 | |||
/*
 * Run func(arg) on the local CPU (with interrupts disabled) and then, via
 * smp_call_function(), on the other CPUs.  Must be entered with
 * interrupts enabled; preemption is disabled for the duration.  A failing
 * smp_call_function() is treated as a BUG().
 */
static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
{
	int failed;

	check_irq_on();
	preempt_disable();

	/* Local CPU first. */
	local_irq_disable();
	func(arg);
	local_irq_enable();

	/* Then everybody else. */
	failed = smp_call_function(func, arg, 1, 1);
	if (failed)
		BUG();

	preempt_enable();
}
| 1541 | |||
| 1542 | static void drain_array_locked(kmem_cache_t* cachep, | ||
| 1543 | struct array_cache *ac, int force); | ||
| 1544 | |||
| 1545 | static void do_drain(void *arg) | ||
| 1546 | { | ||
| 1547 | kmem_cache_t *cachep = (kmem_cache_t*)arg; | ||
| 1548 | struct array_cache *ac; | ||
| 1549 | |||
| 1550 | check_irq_off(); | ||
| 1551 | ac = ac_data(cachep); | ||
| 1552 | spin_lock(&cachep->spinlock); | ||
| 1553 | free_block(cachep, &ac_entry(ac)[0], ac->avail); | ||
| 1554 | spin_unlock(&cachep->spinlock); | ||
| 1555 | ac->avail = 0; | ||
| 1556 | } | ||
| 1557 | |||
| 1558 | static void drain_cpu_caches(kmem_cache_t *cachep) | ||
| 1559 | { | ||
| 1560 | smp_call_function_all_cpus(do_drain, cachep); | ||
| 1561 | check_irq_on(); | ||
| 1562 | spin_lock_irq(&cachep->spinlock); | ||
| 1563 | if (cachep->lists.shared) | ||
| 1564 | drain_array_locked(cachep, cachep->lists.shared, 1); | ||
| 1565 | spin_unlock_irq(&cachep->spinlock); | ||
| 1566 | } | ||
| 1567 | |||
| 1568 | |||
| 1569 | /* NUMA shrink all list3s */ | ||
| 1570 | static int __cache_shrink(kmem_cache_t *cachep) | ||
| 1571 | { | ||
| 1572 | struct slab *slabp; | ||
| 1573 | int ret; | ||
| 1574 | |||
| 1575 | drain_cpu_caches(cachep); | ||
| 1576 | |||
| 1577 | check_irq_on(); | ||
| 1578 | spin_lock_irq(&cachep->spinlock); | ||
| 1579 | |||
| 1580 | for(;;) { | ||
| 1581 | struct list_head *p; | ||
| 1582 | |||
| 1583 | p = cachep->lists.slabs_free.prev; | ||
| 1584 | if (p == &cachep->lists.slabs_free) | ||
| 1585 | break; | ||
| 1586 | |||
| 1587 | slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list); | ||
| 1588 | #if DEBUG | ||
| 1589 | if (slabp->inuse) | ||
| 1590 | BUG(); | ||
| 1591 | #endif | ||
| 1592 | list_del(&slabp->list); | ||
| 1593 | |||
| 1594 | cachep->lists.free_objects -= cachep->num; | ||
| 1595 | spin_unlock_irq(&cachep->spinlock); | ||
| 1596 | slab_destroy(cachep, slabp); | ||
| 1597 | spin_lock_irq(&cachep->spinlock); | ||
| 1598 | } | ||
| 1599 | ret = !list_empty(&cachep->lists.slabs_full) || | ||
| 1600 | !list_empty(&cachep->lists.slabs_partial); | ||
| 1601 | spin_unlock_irq(&cachep->spinlock); | ||
| 1602 | return ret; | ||
| 1603 | } | ||
| 1604 | |||
| 1605 | /** | ||
| 1606 | * kmem_cache_shrink - Shrink a cache. | ||
| 1607 | * @cachep: The cache to shrink. | ||
| 1608 | * | ||
| 1609 | * Releases as many slabs as possible for a cache. | ||
| 1610 | * To help debugging, a zero exit status indicates all slabs were released. | ||
| 1611 | */ | ||
| 1612 | int kmem_cache_shrink(kmem_cache_t *cachep) | ||
| 1613 | { | ||
| 1614 | if (!cachep || in_interrupt()) | ||
| 1615 | BUG(); | ||
| 1616 | |||
| 1617 | return __cache_shrink(cachep); | ||
| 1618 | } | ||
| 1619 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
| 1620 | |||
| 1621 | /** | ||
| 1622 | * kmem_cache_destroy - delete a cache | ||
| 1623 | * @cachep: the cache to destroy | ||
| 1624 | * | ||
| 1625 | * Remove a kmem_cache_t object from the slab cache. | ||
| 1626 | * Returns 0 on success. | ||
| 1627 | * | ||
| 1628 | * It is expected this function will be called by a module when it is | ||
| 1629 | * unloaded. This will remove the cache completely, and avoid a duplicate | ||
| 1630 | * cache being allocated each time a module is loaded and unloaded, if the | ||
| 1631 | * module doesn't have persistent in-kernel storage across loads and unloads. | ||
| 1632 | * | ||
| 1633 | * The cache must be empty before calling this function. | ||
| 1634 | * | ||
| 1635 | * The caller must guarantee that noone will allocate memory from the cache | ||
| 1636 | * during the kmem_cache_destroy(). | ||
| 1637 | */ | ||
| 1638 | int kmem_cache_destroy(kmem_cache_t * cachep) | ||
| 1639 | { | ||
| 1640 | int i; | ||
| 1641 | |||
| 1642 | if (!cachep || in_interrupt()) | ||
| 1643 | BUG(); | ||
| 1644 | |||
| 1645 | /* Don't let CPUs to come and go */ | ||
| 1646 | lock_cpu_hotplug(); | ||
| 1647 | |||
| 1648 | /* Find the cache in the chain of caches. */ | ||
| 1649 | down(&cache_chain_sem); | ||
| 1650 | /* | ||
| 1651 | * the chain is never empty, cache_cache is never destroyed | ||
| 1652 | */ | ||
| 1653 | list_del(&cachep->next); | ||
| 1654 | up(&cache_chain_sem); | ||
| 1655 | |||
| 1656 | if (__cache_shrink(cachep)) { | ||
| 1657 | slab_error(cachep, "Can't free all objects"); | ||
| 1658 | down(&cache_chain_sem); | ||
| 1659 | list_add(&cachep->next,&cache_chain); | ||
| 1660 | up(&cache_chain_sem); | ||
| 1661 | unlock_cpu_hotplug(); | ||
| 1662 | return 1; | ||
| 1663 | } | ||
| 1664 | |||
| 1665 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) | ||
| 1666 | synchronize_kernel(); | ||
| 1667 | |||
| 1668 | /* no cpu_online check required here since we clear the percpu | ||
| 1669 | * array on cpu offline and set this to NULL. | ||
| 1670 | */ | ||
| 1671 | for (i = 0; i < NR_CPUS; i++) | ||
| 1672 | kfree(cachep->array[i]); | ||
| 1673 | |||
| 1674 | /* NUMA: free the list3 structures */ | ||
| 1675 | kfree(cachep->lists.shared); | ||
| 1676 | cachep->lists.shared = NULL; | ||
| 1677 | kmem_cache_free(&cache_cache, cachep); | ||
| 1678 | |||
| 1679 | unlock_cpu_hotplug(); | ||
| 1680 | |||
| 1681 | return 0; | ||
| 1682 | } | ||
| 1683 | EXPORT_SYMBOL(kmem_cache_destroy); | ||
| 1684 | |||
| 1685 | /* Get the memory for a slab management obj. */ | ||
| 1686 | static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, | ||
| 1687 | void *objp, int colour_off, unsigned int __nocast local_flags) | ||
| 1688 | { | ||
| 1689 | struct slab *slabp; | ||
| 1690 | |||
| 1691 | if (OFF_SLAB(cachep)) { | ||
| 1692 | /* Slab management obj is off-slab. */ | ||
| 1693 | slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); | ||
| 1694 | if (!slabp) | ||
| 1695 | return NULL; | ||
| 1696 | } else { | ||
| 1697 | slabp = objp+colour_off; | ||
| 1698 | colour_off += cachep->slab_size; | ||
| 1699 | } | ||
| 1700 | slabp->inuse = 0; | ||
| 1701 | slabp->colouroff = colour_off; | ||
| 1702 | slabp->s_mem = objp+colour_off; | ||
| 1703 | |||
| 1704 | return slabp; | ||
| 1705 | } | ||
| 1706 | |||
| 1707 | static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) | ||
| 1708 | { | ||
| 1709 | return (kmem_bufctl_t *)(slabp+1); | ||
| 1710 | } | ||
| 1711 | |||
| 1712 | static void cache_init_objs(kmem_cache_t *cachep, | ||
| 1713 | struct slab *slabp, unsigned long ctor_flags) | ||
| 1714 | { | ||
| 1715 | int i; | ||
| 1716 | |||
| 1717 | for (i = 0; i < cachep->num; i++) { | ||
| 1718 | void* objp = slabp->s_mem+cachep->objsize*i; | ||
| 1719 | #if DEBUG | ||
| 1720 | /* need to poison the objs? */ | ||
| 1721 | if (cachep->flags & SLAB_POISON) | ||
| 1722 | poison_obj(cachep, objp, POISON_FREE); | ||
| 1723 | if (cachep->flags & SLAB_STORE_USER) | ||
| 1724 | *dbg_userword(cachep, objp) = NULL; | ||
| 1725 | |||
| 1726 | if (cachep->flags & SLAB_RED_ZONE) { | ||
| 1727 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; | ||
| 1728 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | ||
| 1729 | } | ||
| 1730 | /* | ||
| 1731 | * Constructors are not allowed to allocate memory from | ||
| 1732 | * the same cache which they are a constructor for. | ||
| 1733 | * Otherwise, deadlock. They must also be threaded. | ||
| 1734 | */ | ||
| 1735 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | ||
| 1736 | cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); | ||
| 1737 | |||
| 1738 | if (cachep->flags & SLAB_RED_ZONE) { | ||
| 1739 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | ||
| 1740 | slab_error(cachep, "constructor overwrote the" | ||
| 1741 | " end of an object"); | ||
| 1742 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) | ||
| 1743 | slab_error(cachep, "constructor overwrote the" | ||
| 1744 | " start of an object"); | ||
| 1745 | } | ||
| 1746 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) | ||
| 1747 | kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); | ||
| 1748 | #else | ||
| 1749 | if (cachep->ctor) | ||
| 1750 | cachep->ctor(objp, cachep, ctor_flags); | ||
| 1751 | #endif | ||
| 1752 | slab_bufctl(slabp)[i] = i+1; | ||
| 1753 | } | ||
| 1754 | slab_bufctl(slabp)[i-1] = BUFCTL_END; | ||
| 1755 | slabp->free = 0; | ||
| 1756 | } | ||
| 1757 | |||
| 1758 | static void kmem_flagcheck(kmem_cache_t *cachep, unsigned int flags) | ||
| 1759 | { | ||
| 1760 | if (flags & SLAB_DMA) { | ||
| 1761 | if (!(cachep->gfpflags & GFP_DMA)) | ||
| 1762 | BUG(); | ||
| 1763 | } else { | ||
| 1764 | if (cachep->gfpflags & GFP_DMA) | ||
| 1765 | BUG(); | ||
| 1766 | } | ||
| 1767 | } | ||
| 1768 | |||
| 1769 | static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) | ||
| 1770 | { | ||
| 1771 | int i; | ||
| 1772 | struct page *page; | ||
| 1773 | |||
| 1774 | /* Nasty!!!!!! I hope this is OK. */ | ||
| 1775 | i = 1 << cachep->gfporder; | ||
| 1776 | page = virt_to_page(objp); | ||
| 1777 | do { | ||
| 1778 | SET_PAGE_CACHE(page, cachep); | ||
| 1779 | SET_PAGE_SLAB(page, slabp); | ||
| 1780 | page++; | ||
| 1781 | } while (--i); | ||
| 1782 | } | ||
| 1783 | |||
| 1784 | /* | ||
| 1785 | * Grow (by 1) the number of slabs within a cache. This is called by | ||
| 1786 | * kmem_cache_alloc() when there are no active objs left in a cache. | ||
| 1787 | */ | ||
| 1788 | static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid) | ||
| 1789 | { | ||
| 1790 | struct slab *slabp; | ||
| 1791 | void *objp; | ||
| 1792 | size_t offset; | ||
| 1793 | unsigned int local_flags; | ||
| 1794 | unsigned long ctor_flags; | ||
| 1795 | |||
| 1796 | /* Be lazy and only check for valid flags here, | ||
| 1797 | * keeping it out of the critical path in kmem_cache_alloc(). | ||
| 1798 | */ | ||
| 1799 | if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) | ||
| 1800 | BUG(); | ||
| 1801 | if (flags & SLAB_NO_GROW) | ||
| 1802 | return 0; | ||
| 1803 | |||
| 1804 | ctor_flags = SLAB_CTOR_CONSTRUCTOR; | ||
| 1805 | local_flags = (flags & SLAB_LEVEL_MASK); | ||
| 1806 | if (!(local_flags & __GFP_WAIT)) | ||
| 1807 | /* | ||
| 1808 | * Not allowed to sleep. Need to tell a constructor about | ||
| 1809 | * this - it might need to know... | ||
| 1810 | */ | ||
| 1811 | ctor_flags |= SLAB_CTOR_ATOMIC; | ||
| 1812 | |||
| 1813 | /* About to mess with non-constant members - lock. */ | ||
| 1814 | check_irq_off(); | ||
| 1815 | spin_lock(&cachep->spinlock); | ||
| 1816 | |||
| 1817 | /* Get colour for the slab, and cal the next value. */ | ||
| 1818 | offset = cachep->colour_next; | ||
| 1819 | cachep->colour_next++; | ||
| 1820 | if (cachep->colour_next >= cachep->colour) | ||
| 1821 | cachep->colour_next = 0; | ||
| 1822 | offset *= cachep->colour_off; | ||
| 1823 | |||
| 1824 | spin_unlock(&cachep->spinlock); | ||
| 1825 | |||
| 1826 | if (local_flags & __GFP_WAIT) | ||
| 1827 | local_irq_enable(); | ||
| 1828 | |||
| 1829 | /* | ||
| 1830 | * The test for missing atomic flag is performed here, rather than | ||
| 1831 | * the more obvious place, simply to reduce the critical path length | ||
| 1832 | * in kmem_cache_alloc(). If a caller is seriously mis-behaving they | ||
| 1833 | * will eventually be caught here (where it matters). | ||
| 1834 | */ | ||
| 1835 | kmem_flagcheck(cachep, flags); | ||
| 1836 | |||
| 1837 | |||
| 1838 | /* Get mem for the objs. */ | ||
| 1839 | if (!(objp = kmem_getpages(cachep, flags, nodeid))) | ||
| 1840 | goto failed; | ||
| 1841 | |||
| 1842 | /* Get slab management. */ | ||
| 1843 | if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) | ||
| 1844 | goto opps1; | ||
| 1845 | |||
| 1846 | set_slab_attr(cachep, slabp, objp); | ||
| 1847 | |||
| 1848 | cache_init_objs(cachep, slabp, ctor_flags); | ||
| 1849 | |||
| 1850 | if (local_flags & __GFP_WAIT) | ||
| 1851 | local_irq_disable(); | ||
| 1852 | check_irq_off(); | ||
| 1853 | spin_lock(&cachep->spinlock); | ||
| 1854 | |||
| 1855 | /* Make slab active. */ | ||
| 1856 | list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free)); | ||
| 1857 | STATS_INC_GROWN(cachep); | ||
| 1858 | list3_data(cachep)->free_objects += cachep->num; | ||
| 1859 | spin_unlock(&cachep->spinlock); | ||
| 1860 | return 1; | ||
| 1861 | opps1: | ||
| 1862 | kmem_freepages(cachep, objp); | ||
| 1863 | failed: | ||
| 1864 | if (local_flags & __GFP_WAIT) | ||
| 1865 | local_irq_disable(); | ||
| 1866 | return 0; | ||
| 1867 | } | ||
| 1868 | |||
| 1869 | #if DEBUG | ||
| 1870 | |||
| 1871 | /* | ||
| 1872 | * Perform extra freeing checks: | ||
| 1873 | * - detect bad pointers. | ||
| 1874 | * - POISON/RED_ZONE checking | ||
| 1875 | * - destructor calls, for caches with POISON+dtor | ||
| 1876 | */ | ||
| 1877 | static void kfree_debugcheck(const void *objp) | ||
| 1878 | { | ||
| 1879 | struct page *page; | ||
| 1880 | |||
| 1881 | if (!virt_addr_valid(objp)) { | ||
| 1882 | printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", | ||
| 1883 | (unsigned long)objp); | ||
| 1884 | BUG(); | ||
| 1885 | } | ||
| 1886 | page = virt_to_page(objp); | ||
| 1887 | if (!PageSlab(page)) { | ||
| 1888 | printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); | ||
| 1889 | BUG(); | ||
| 1890 | } | ||
| 1891 | } | ||
| 1892 | |||
| 1893 | static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, | ||
| 1894 | void *caller) | ||
| 1895 | { | ||
| 1896 | struct page *page; | ||
| 1897 | unsigned int objnr; | ||
| 1898 | struct slab *slabp; | ||
| 1899 | |||
| 1900 | objp -= obj_dbghead(cachep); | ||
| 1901 | kfree_debugcheck(objp); | ||
| 1902 | page = virt_to_page(objp); | ||
| 1903 | |||
| 1904 | if (GET_PAGE_CACHE(page) != cachep) { | ||
| 1905 | printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", | ||
| 1906 | GET_PAGE_CACHE(page),cachep); | ||
| 1907 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); | ||
| 1908 | printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name); | ||
| 1909 | WARN_ON(1); | ||
| 1910 | } | ||
| 1911 | slabp = GET_PAGE_SLAB(page); | ||
| 1912 | |||
| 1913 | if (cachep->flags & SLAB_RED_ZONE) { | ||
| 1914 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { | ||
| 1915 | slab_error(cachep, "double free, or memory outside" | ||
| 1916 | " object was overwritten"); | ||
| 1917 | printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | ||
| 1918 | objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); | ||
| 1919 | } | ||
| 1920 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; | ||
| 1921 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | ||
| 1922 | } | ||
| 1923 | if (cachep->flags & SLAB_STORE_USER) | ||
| 1924 | *dbg_userword(cachep, objp) = caller; | ||
| 1925 | |||
| 1926 | objnr = (objp-slabp->s_mem)/cachep->objsize; | ||
| 1927 | |||
| 1928 | BUG_ON(objnr >= cachep->num); | ||
| 1929 | BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); | ||
| 1930 | |||
| 1931 | if (cachep->flags & SLAB_DEBUG_INITIAL) { | ||
| 1932 | /* Need to call the slab's constructor so the | ||
| 1933 | * caller can perform a verify of its state (debugging). | ||
| 1934 | * Called without the cache-lock held. | ||
| 1935 | */ | ||
| 1936 | cachep->ctor(objp+obj_dbghead(cachep), | ||
| 1937 | cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); | ||
| 1938 | } | ||
| 1939 | if (cachep->flags & SLAB_POISON && cachep->dtor) { | ||
| 1940 | /* we want to cache poison the object, | ||
| 1941 | * call the destruction callback | ||
| 1942 | */ | ||
| 1943 | cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); | ||
| 1944 | } | ||
| 1945 | if (cachep->flags & SLAB_POISON) { | ||
| 1946 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 1947 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { | ||
| 1948 | store_stackinfo(cachep, objp, (unsigned long)caller); | ||
| 1949 | kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); | ||
| 1950 | } else { | ||
| 1951 | poison_obj(cachep, objp, POISON_FREE); | ||
| 1952 | } | ||
| 1953 | #else | ||
| 1954 | poison_obj(cachep, objp, POISON_FREE); | ||
| 1955 | #endif | ||
| 1956 | } | ||
| 1957 | return objp; | ||
| 1958 | } | ||
| 1959 | |||
| 1960 | static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) | ||
| 1961 | { | ||
| 1962 | kmem_bufctl_t i; | ||
| 1963 | int entries = 0; | ||
| 1964 | |||
| 1965 | check_spinlock_acquired(cachep); | ||
| 1966 | /* Check slab's freelist to see if this obj is there. */ | ||
| 1967 | for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { | ||
| 1968 | entries++; | ||
| 1969 | if (entries > cachep->num || i >= cachep->num) | ||
| 1970 | goto bad; | ||
| 1971 | } | ||
| 1972 | if (entries != cachep->num - slabp->inuse) { | ||
| 1973 | bad: | ||
| 1974 | printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", | ||
| 1975 | cachep->name, cachep->num, slabp, slabp->inuse); | ||
| 1976 | for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) { | ||
| 1977 | if ((i%16)==0) | ||
| 1978 | printk("\n%03x:", i); | ||
| 1979 | printk(" %02x", ((unsigned char*)slabp)[i]); | ||
| 1980 | } | ||
| 1981 | printk("\n"); | ||
| 1982 | BUG(); | ||
| 1983 | } | ||
| 1984 | } | ||
| 1985 | #else | ||
/*
 * Non-DEBUG builds: the freeing checks compile away.  kfree_debugcheck()
 * and check_slabp() become empty statements, and cache_free_debugcheck()
 * evaluates to the object pointer it was given.
 */
#define kfree_debugcheck(x) do { } while(0)
#define cache_free_debugcheck(x,objp,z) (objp)
#define check_slabp(x,y) do { } while(0)
| 1989 | #endif | ||
| 1990 | |||
/*
 * Slow path of object allocation: the current cpu's array cache is empty.
 *
 * Refills the array cache under the cache spinlock, first from the shared
 * array (if one exists and holds objects), otherwise from the partial and
 * then the free slab lists.  If that yields nothing, the cache is grown
 * via cache_grow() and the refill is retried.
 *
 * Must be called with local interrupts disabled (check_irq_off()).
 * Returns one object taken from the refilled array cache, or NULL when
 * growing the cache failed and no objects became available.
 */
static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags)
{
	int batchcount;
	struct kmem_list3 *l3;
	struct array_cache *ac;

	check_irq_off();
	ac = ac_data(cachep);
retry:
	batchcount = ac->batchcount;
	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
		/* if there was little recent activity on this
		 * cache, then perform only a partial refill.
		 * Otherwise we could generate refill bouncing.
		 */
		batchcount = BATCHREFILL_LIMIT;
	}
	l3 = list3_data(cachep);

	/* The refill path is only legal for an empty array cache. */
	BUG_ON(ac->avail > 0);
	spin_lock(&cachep->spinlock);
	if (l3->shared) {
		struct array_cache *shared_array = l3->shared;
		if (shared_array->avail) {
			/* Take up to batchcount pointers off the top of the
			 * shared array and copy them into the cpu array.
			 */
			if (batchcount > shared_array->avail)
				batchcount = shared_array->avail;
			shared_array->avail -= batchcount;
			ac->avail = batchcount;
			memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail],
					sizeof(void*)*batchcount);
			shared_array->touched = 1;
			/* skips the free_objects adjustment at must_grow */
			goto alloc_done;
		}
	}
	while (batchcount > 0) {
		struct list_head *entry;
		struct slab *slabp;
		/* Get slab alloc is to come from. */
		entry = l3->slabs_partial.next;
		if (entry == &l3->slabs_partial) {
			/* No partial slab: fall back to a completely free one
			 * and remember that the free list was used.
			 */
			l3->free_touched = 1;
			entry = l3->slabs_free.next;
			if (entry == &l3->slabs_free)
				goto must_grow;
		}

		slabp = list_entry(entry, struct slab, list);
		check_slabp(cachep, slabp);
		check_spinlock_acquired(cachep);
		/* Pull objects until the slab is exhausted or the batch is full. */
		while (slabp->inuse < cachep->num && batchcount--) {
			kmem_bufctl_t next;
			STATS_INC_ALLOCED(cachep);
			STATS_INC_ACTIVE(cachep);
			STATS_SET_HIGH(cachep);

			/* get obj pointer */
			ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize;

			slabp->inuse++;
			next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
			/* Mark the entry so that free_block() can detect a double free. */
			slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
#endif
			slabp->free = next;
		}
		check_slabp(cachep, slabp);

		/* move slabp to correct slabp list: */
		list_del(&slabp->list);
		if (slabp->free == BUFCTL_END)
			list_add(&slabp->list, &l3->slabs_full);
		else
			list_add(&slabp->list, &l3->slabs_partial);
	}

must_grow:
	/* Account for everything that was moved from slabs into the array. */
	l3->free_objects -= ac->avail;
alloc_done:
	spin_unlock(&cachep->spinlock);

	if (unlikely(!ac->avail)) {
		int x;
		x = cache_grow(cachep, flags, -1);

		// cache_grow can reenable interrupts, then ac could change.
		ac = ac_data(cachep);
		if (!x && ac->avail == 0) // no objects in sight? abort
			return NULL;

		/* Still empty after a successful grow: refill from the new
		 * slab.  A non-empty array here means something else refilled
		 * it while interrupts were enabled.
		 */
		if (!ac->avail) // objects refilled by interrupt?
			goto retry;
	}
	ac->touched = 1;
	return ac_entry(ac)[--ac->avail];
}
| 2086 | |||
| 2087 | static inline void | ||
| 2088 | cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags) | ||
| 2089 | { | ||
| 2090 | might_sleep_if(flags & __GFP_WAIT); | ||
| 2091 | #if DEBUG | ||
| 2092 | kmem_flagcheck(cachep, flags); | ||
| 2093 | #endif | ||
| 2094 | } | ||
| 2095 | |||
| 2096 | #if DEBUG | ||
| 2097 | static void * | ||
| 2098 | cache_alloc_debugcheck_after(kmem_cache_t *cachep, | ||
| 2099 | unsigned long flags, void *objp, void *caller) | ||
| 2100 | { | ||
| 2101 | if (!objp) | ||
| 2102 | return objp; | ||
| 2103 | if (cachep->flags & SLAB_POISON) { | ||
| 2104 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 2105 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) | ||
| 2106 | kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); | ||
| 2107 | else | ||
| 2108 | check_poison_obj(cachep, objp); | ||
| 2109 | #else | ||
| 2110 | check_poison_obj(cachep, objp); | ||
| 2111 | #endif | ||
| 2112 | poison_obj(cachep, objp, POISON_INUSE); | ||
| 2113 | } | ||
| 2114 | if (cachep->flags & SLAB_STORE_USER) | ||
| 2115 | *dbg_userword(cachep, objp) = caller; | ||
| 2116 | |||
| 2117 | if (cachep->flags & SLAB_RED_ZONE) { | ||
| 2118 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { | ||
| 2119 | slab_error(cachep, "double free, or memory outside" | ||
| 2120 | " object was overwritten"); | ||
| 2121 | printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | ||
| 2122 | objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); | ||
| 2123 | } | ||
| 2124 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; | ||
| 2125 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; | ||
| 2126 | } | ||
| 2127 | objp += obj_dbghead(cachep); | ||
| 2128 | if (cachep->ctor && cachep->flags & SLAB_POISON) { | ||
| 2129 | unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; | ||
| 2130 | |||
| 2131 | if (!(flags & __GFP_WAIT)) | ||
| 2132 | ctor_flags |= SLAB_CTOR_ATOMIC; | ||
| 2133 | |||
| 2134 | cachep->ctor(objp, cachep, ctor_flags); | ||
| 2135 | } | ||
| 2136 | return objp; | ||
| 2137 | } | ||
| 2138 | #else | ||
/* Non-DEBUG builds: no post-allocation checks, the object pointer passes through. */
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
| 2140 | #endif | ||
| 2141 | |||
| 2142 | |||
| 2143 | static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags) | ||
| 2144 | { | ||
| 2145 | unsigned long save_flags; | ||
| 2146 | void* objp; | ||
| 2147 | struct array_cache *ac; | ||
| 2148 | |||
| 2149 | cache_alloc_debugcheck_before(cachep, flags); | ||
| 2150 | |||
| 2151 | local_irq_save(save_flags); | ||
| 2152 | ac = ac_data(cachep); | ||
| 2153 | if (likely(ac->avail)) { | ||
| 2154 | STATS_INC_ALLOCHIT(cachep); | ||
| 2155 | ac->touched = 1; | ||
| 2156 | objp = ac_entry(ac)[--ac->avail]; | ||
| 2157 | } else { | ||
| 2158 | STATS_INC_ALLOCMISS(cachep); | ||
| 2159 | objp = cache_alloc_refill(cachep, flags); | ||
| 2160 | } | ||
| 2161 | local_irq_restore(save_flags); | ||
| 2162 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0)); | ||
| 2163 | return objp; | ||
| 2164 | } | ||
| 2165 | |||
| 2166 | /* | ||
| 2167 | * NUMA: different approach needed if the spinlock is moved into | ||
| 2168 | * the l3 structure | ||
| 2169 | */ | ||
| 2170 | |||
/*
 * Return @nr_objects objects, given as an array of pointers in @objpp, to
 * the slabs they came from.
 *
 * Each object is pushed onto the head of its slab's bufctl free chain and
 * the slab is re-queued on the list matching its new state.  A slab that
 * becomes completely unused is destroyed when the cache already holds more
 * than free_limit free objects, otherwise it goes onto the free list.
 * Must be called with the cache spinlock held.
 */
static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
{
	int i;

	check_spinlock_acquired(cachep);

	/* NUMA: move add into loop */
	cachep->lists.free_objects += nr_objects;

	for (i = 0; i < nr_objects; i++) {
		void *objp = objpp[i];
		struct slab *slabp;
		unsigned int objnr;

		/* Find the owning slab and take it off whatever list it is on. */
		slabp = GET_PAGE_SLAB(virt_to_page(objp));
		list_del(&slabp->list);
		objnr = (objp - slabp->s_mem) / cachep->objsize;
		check_slabp(cachep, slabp);
#if DEBUG
		/* DEBUG builds tag the bufctl entry of an allocated object
		 * with BUFCTL_FREE; anything else means the object is
		 * already linked into the free chain.
		 */
		if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
			printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n",
						cachep->name, objp);
			BUG();
		}
#endif
		/* Push the object onto the head of the slab's free chain. */
		slab_bufctl(slabp)[objnr] = slabp->free;
		slabp->free = objnr;
		STATS_DEC_ACTIVE(cachep);
		slabp->inuse--;
		check_slabp(cachep, slabp);

		/* fixup slab chains */
		if (slabp->inuse == 0) {
			if (cachep->lists.free_objects > cachep->free_limit) {
				/* Too many free objects cached: give the
				 * whole slab back.
				 */
				cachep->lists.free_objects -= cachep->num;
				slab_destroy(cachep, slabp);
			} else {
				list_add(&slabp->list,
				&list3_data_ptr(cachep, objp)->slabs_free);
			}
		} else {
			/* Unconditionally move a slab to the end of the
			 * partial list on free - maximum time for the
			 * other objects to be freed, too.
			 */
			list_add_tail(&slabp->list,
				&list3_data_ptr(cachep, objp)->slabs_partial);
		}
	}
}
| 2221 | |||
| 2222 | static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) | ||
| 2223 | { | ||
| 2224 | int batchcount; | ||
| 2225 | |||
| 2226 | batchcount = ac->batchcount; | ||
| 2227 | #if DEBUG | ||
| 2228 | BUG_ON(!batchcount || batchcount > ac->avail); | ||
| 2229 | #endif | ||
| 2230 | check_irq_off(); | ||
| 2231 | spin_lock(&cachep->spinlock); | ||
| 2232 | if (cachep->lists.shared) { | ||
| 2233 | struct array_cache *shared_array = cachep->lists.shared; | ||
| 2234 | int max = shared_array->limit-shared_array->avail; | ||
| 2235 | if (max) { | ||
| 2236 | if (batchcount > max) | ||
| 2237 | batchcount = max; | ||
| 2238 | memcpy(&ac_entry(shared_array)[shared_array->avail], | ||
| 2239 | &ac_entry(ac)[0], | ||
| 2240 | sizeof(void*)*batchcount); | ||
| 2241 | shared_array->avail += batchcount; | ||
| 2242 | goto free_done; | ||
| 2243 | } | ||
| 2244 | } | ||
| 2245 | |||
| 2246 | free_block(cachep, &ac_entry(ac)[0], batchcount); | ||
| 2247 | free_done: | ||
| 2248 | #if STATS | ||
| 2249 | { | ||
| 2250 | int i = 0; | ||
| 2251 | struct list_head *p; | ||
| 2252 | |||
| 2253 | p = list3_data(cachep)->slabs_free.next; | ||
| 2254 | while (p != &(list3_data(cachep)->slabs_free)) { | ||
| 2255 | struct slab *slabp; | ||
| 2256 | |||
| 2257 | slabp = list_entry(p, struct slab, list); | ||
| 2258 | BUG_ON(slabp->inuse); | ||
| 2259 | |||
| 2260 | i++; | ||
| 2261 | p = p->next; | ||
| 2262 | } | ||
| 2263 | STATS_SET_FREEABLE(cachep, i); | ||
| 2264 | } | ||
| 2265 | #endif | ||
| 2266 | spin_unlock(&cachep->spinlock); | ||
| 2267 | ac->avail -= batchcount; | ||
| 2268 | memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount], | ||
| 2269 | sizeof(void*)*ac->avail); | ||
| 2270 | } | ||
| 2271 | |||
| 2272 | /* | ||
| 2273 | * __cache_free | ||
| 2274 | * Release an obj back to its cache. If the obj has a constructed | ||
| 2275 | * state, it must be in this state _before_ it is released. | ||
| 2276 | * | ||
| 2277 | * Called with disabled ints. | ||
| 2278 | */ | ||
| 2279 | static inline void __cache_free(kmem_cache_t *cachep, void *objp) | ||
| 2280 | { | ||
| 2281 | struct array_cache *ac = ac_data(cachep); | ||
| 2282 | |||
| 2283 | check_irq_off(); | ||
| 2284 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); | ||
| 2285 | |||
| 2286 | if (likely(ac->avail < ac->limit)) { | ||
| 2287 | STATS_INC_FREEHIT(cachep); | ||
| 2288 | ac_entry(ac)[ac->avail++] = objp; | ||
| 2289 | return; | ||
| 2290 | } else { | ||
| 2291 | STATS_INC_FREEMISS(cachep); | ||
| 2292 | cache_flusharray(cachep, ac); | ||
| 2293 | ac_entry(ac)[ac->avail++] = objp; | ||
| 2294 | } | ||
| 2295 | } | ||
| 2296 | |||
| 2297 | /** | ||
| 2298 | * kmem_cache_alloc - Allocate an object | ||
| 2299 | * @cachep: The cache to allocate from. | ||
| 2300 | * @flags: See kmalloc(). | ||
| 2301 | * | ||
| 2302 | * Allocate an object from this cache. The flags are only relevant | ||
| 2303 | * if the cache has no available objects. | ||
| 2304 | */ | ||
| 2305 | void *kmem_cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags) | ||
| 2306 | { | ||
| 2307 | return __cache_alloc(cachep, flags); | ||
| 2308 | } | ||
| 2309 | EXPORT_SYMBOL(kmem_cache_alloc); | ||
| 2310 | |||
| 2311 | /** | ||
| 2312 | * kmem_ptr_validate - check if an untrusted pointer might | ||
| 2313 | * be a slab entry. | ||
| 2314 | * @cachep: the cache we're checking against | ||
| 2315 | * @ptr: pointer to validate | ||
| 2316 | * | ||
| 2317 | * This verifies that the untrusted pointer looks sane: | ||
| 2318 | * it is _not_ a guarantee that the pointer is actually | ||
| 2319 | * part of the slab cache in question, but it at least | ||
| 2320 | * validates that the pointer can be dereferenced and | ||
| 2321 | * looks half-way sane. | ||
| 2322 | * | ||
| 2323 | * Currently only used for dentry validation. | ||
| 2324 | */ | ||
| 2325 | int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) | ||
| 2326 | { | ||
| 2327 | unsigned long addr = (unsigned long) ptr; | ||
| 2328 | unsigned long min_addr = PAGE_OFFSET; | ||
| 2329 | unsigned long align_mask = BYTES_PER_WORD-1; | ||
| 2330 | unsigned long size = cachep->objsize; | ||
| 2331 | struct page *page; | ||
| 2332 | |||
| 2333 | if (unlikely(addr < min_addr)) | ||
| 2334 | goto out; | ||
| 2335 | if (unlikely(addr > (unsigned long)high_memory - size)) | ||
| 2336 | goto out; | ||
| 2337 | if (unlikely(addr & align_mask)) | ||
| 2338 | goto out; | ||
| 2339 | if (unlikely(!kern_addr_valid(addr))) | ||
| 2340 | goto out; | ||
| 2341 | if (unlikely(!kern_addr_valid(addr + size - 1))) | ||
| 2342 | goto out; | ||
| 2343 | page = virt_to_page(ptr); | ||
| 2344 | if (unlikely(!PageSlab(page))) | ||
| 2345 | goto out; | ||
| 2346 | if (unlikely(GET_PAGE_CACHE(page) != cachep)) | ||
| 2347 | goto out; | ||
| 2348 | return 1; | ||
| 2349 | out: | ||
| 2350 | return 0; | ||
| 2351 | } | ||
| 2352 | |||
| 2353 | #ifdef CONFIG_NUMA | ||
| 2354 | /** | ||
| 2355 | * kmem_cache_alloc_node - Allocate an object on the specified node | ||
| 2356 | * @cachep: The cache to allocate from. | ||
| 2357 | * @flags: See kmalloc(). | ||
| 2358 | * @nodeid: node number of the target node. | ||
| 2359 | * | ||
| 2360 | * Identical to kmem_cache_alloc, except that this function is slow | ||
| 2361 | * and can sleep. And it will allocate memory on the given node, which | ||
| 2362 | * can improve the performance for cpu bound structures. | ||
| 2363 | */ | ||
void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid)
{
	int loop;
	void *objp;
	struct slab *slabp;
	kmem_bufctl_t next;

	/*
	 * Each round searches the partial and free lists for a slab on the
	 * requested node and, failing that, grows the cache on that node
	 * and tries again.  From the fourth round on (loop > 2) the node
	 * check is bypassed and the first slab found is accepted.
	 */
	for (loop = 0;;loop++) {
		struct list_head *q;

		objp = NULL;
		check_irq_on();
		spin_lock_irq(&cachep->spinlock);
		/* walk through all partial and empty slab and find one
		 * from the right node */
		list_for_each(q,&cachep->lists.slabs_partial) {
			slabp = list_entry(q, struct slab, list);

			if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
					loop > 2)
				goto got_slabp;
		}
		list_for_each(q, &cachep->lists.slabs_free) {
			slabp = list_entry(q, struct slab, list);

			if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
					loop > 2)
				goto got_slabp;
		}
		spin_unlock_irq(&cachep->spinlock);

		/* Nothing suitable: add a slab on the target node.  The call
		 * to cache_grow() is bracketed by an interrupt disable/enable
		 * pair.
		 */
		local_irq_disable();
		if (!cache_grow(cachep, GFP_KERNEL, nodeid)) {
			local_irq_enable();
			return NULL;
		}
		local_irq_enable();
	}
got_slabp:
	/* Reached by goto from inside the locked search above, so the
	 * cache spinlock is still held and interrupts are off.
	 */
	/* found one: allocate object */
	check_slabp(cachep, slabp);
	check_spinlock_acquired(cachep);

	STATS_INC_ALLOCED(cachep);
	STATS_INC_ACTIVE(cachep);
	STATS_SET_HIGH(cachep);
	STATS_INC_NODEALLOCS(cachep);

	/* Take the object at the head of the slab's free chain. */
	objp = slabp->s_mem + slabp->free*cachep->objsize;

	slabp->inuse++;
	next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
	/* Mark the entry so that free_block() can detect a double free. */
	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
#endif
	slabp->free = next;
	check_slabp(cachep, slabp);

	/* move slabp to correct slabp list: */
	list_del(&slabp->list);
	if (slabp->free == BUFCTL_END)
		list_add(&slabp->list, &cachep->lists.slabs_full);
	else
		list_add(&slabp->list, &cachep->lists.slabs_partial);

	list3_data(cachep)->free_objects--;
	spin_unlock_irq(&cachep->spinlock);

	/* Debug post-processing uses GFP_KERNEL, matching the cache_grow() call above. */
	objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp,
					__builtin_return_address(0));
	return objp;
}
| 2436 | EXPORT_SYMBOL(kmem_cache_alloc_node); | ||
| 2437 | |||
| 2438 | #endif | ||
| 2439 | |||
| 2440 | /** | ||
| 2441 | * kmalloc - allocate memory | ||
| 2442 | * @size: how many bytes of memory are required. | ||
| 2443 | * @flags: the type of memory to allocate. | ||
| 2444 | * | ||
| 2445 | * kmalloc is the normal method of allocating memory | ||
| 2446 | * in the kernel. | ||
| 2447 | * | ||
| 2448 | * The @flags argument may be one of: | ||
| 2449 | * | ||
| 2450 | * %GFP_USER - Allocate memory on behalf of user. May sleep. | ||
| 2451 | * | ||
| 2452 | * %GFP_KERNEL - Allocate normal kernel ram. May sleep. | ||
| 2453 | * | ||
| 2454 | * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. | ||
| 2455 | * | ||
| 2456 | * Additionally, the %GFP_DMA flag may be set to indicate the memory | ||
| 2457 | * must be suitable for DMA. This can mean different things on different | ||
| 2458 | * platforms. For example, on i386, it means that the memory must come | ||
| 2459 | * from the first 16MB. | ||
| 2460 | */ | ||
| 2461 | void *__kmalloc(size_t size, unsigned int __nocast flags) | ||
| 2462 | { | ||
| 2463 | kmem_cache_t *cachep; | ||
| 2464 | |||
| 2465 | cachep = kmem_find_general_cachep(size, flags); | ||
| 2466 | if (unlikely(cachep == NULL)) | ||
| 2467 | return NULL; | ||
| 2468 | return __cache_alloc(cachep, flags); | ||
| 2469 | } | ||
| 2470 | EXPORT_SYMBOL(__kmalloc); | ||
| 2471 | |||
| 2472 | #ifdef CONFIG_SMP | ||
| 2473 | /** | ||
| 2474 | * __alloc_percpu - allocate one copy of the object for every present | ||
| 2475 | * cpu in the system, zeroing them. | ||
| 2476 | * Objects should be dereferenced using the per_cpu_ptr macro only. | ||
| 2477 | * | ||
| 2478 | * @size: how many bytes of memory are required. | ||
| 2479 | * @align: the alignment, which can't be greater than SMP_CACHE_BYTES. | ||
| 2480 | */ | ||
| 2481 | void *__alloc_percpu(size_t size, size_t align) | ||
| 2482 | { | ||
| 2483 | int i; | ||
| 2484 | struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); | ||
| 2485 | |||
| 2486 | if (!pdata) | ||
| 2487 | return NULL; | ||
| 2488 | |||
| 2489 | for (i = 0; i < NR_CPUS; i++) { | ||
| 2490 | if (!cpu_possible(i)) | ||
| 2491 | continue; | ||
| 2492 | pdata->ptrs[i] = kmem_cache_alloc_node( | ||
| 2493 | kmem_find_general_cachep(size, GFP_KERNEL), | ||
| 2494 | cpu_to_node(i)); | ||
| 2495 | |||
| 2496 | if (!pdata->ptrs[i]) | ||
| 2497 | goto unwind_oom; | ||
| 2498 | memset(pdata->ptrs[i], 0, size); | ||
| 2499 | } | ||
| 2500 | |||
| 2501 | /* Catch derefs w/o wrappers */ | ||
| 2502 | return (void *) (~(unsigned long) pdata); | ||
| 2503 | |||
| 2504 | unwind_oom: | ||
| 2505 | while (--i >= 0) { | ||
| 2506 | if (!cpu_possible(i)) | ||
| 2507 | continue; | ||
| 2508 | kfree(pdata->ptrs[i]); | ||
| 2509 | } | ||
| 2510 | kfree(pdata); | ||
| 2511 | return NULL; | ||
| 2512 | } | ||
| 2513 | EXPORT_SYMBOL(__alloc_percpu); | ||
| 2514 | #endif | ||
| 2515 | |||
| 2516 | /** | ||
| 2517 | * kmem_cache_free - Deallocate an object | ||
| 2518 | * @cachep: The cache the allocation was from. | ||
| 2519 | * @objp: The previously allocated object. | ||
| 2520 | * | ||
| 2521 | * Free an object which was previously allocated from this | ||
| 2522 | * cache. | ||
| 2523 | */ | ||
| 2524 | void kmem_cache_free(kmem_cache_t *cachep, void *objp) | ||
| 2525 | { | ||
| 2526 | unsigned long flags; | ||
| 2527 | |||
| 2528 | local_irq_save(flags); | ||
| 2529 | __cache_free(cachep, objp); | ||
| 2530 | local_irq_restore(flags); | ||
| 2531 | } | ||
| 2532 | EXPORT_SYMBOL(kmem_cache_free); | ||
| 2533 | |||
| 2534 | /** | ||
| 2535 | * kcalloc - allocate memory for an array. The memory is set to zero. | ||
| 2536 | * @n: number of elements. | ||
| 2537 | * @size: element size. | ||
| 2538 | * @flags: the type of memory to allocate. | ||
| 2539 | */ | ||
| 2540 | void *kcalloc(size_t n, size_t size, unsigned int __nocast flags) | ||
| 2541 | { | ||
| 2542 | void *ret = NULL; | ||
| 2543 | |||
| 2544 | if (n != 0 && size > INT_MAX / n) | ||
| 2545 | return ret; | ||
| 2546 | |||
| 2547 | ret = kmalloc(n * size, flags); | ||
| 2548 | if (ret) | ||
| 2549 | memset(ret, 0, n * size); | ||
| 2550 | return ret; | ||
| 2551 | } | ||
| 2552 | EXPORT_SYMBOL(kcalloc); | ||
| 2553 | |||
| 2554 | /** | ||
| 2555 | * kfree - free previously allocated memory | ||
| 2556 | * @objp: pointer returned by kmalloc. | ||
| 2557 | * | ||
| 2558 | * Don't free memory not originally allocated by kmalloc() | ||
| 2559 | * or you will run into trouble. | ||
| 2560 | */ | ||
| 2561 | void kfree(const void *objp) | ||
| 2562 | { | ||
| 2563 | kmem_cache_t *c; | ||
| 2564 | unsigned long flags; | ||
| 2565 | |||
| 2566 | if (unlikely(!objp)) | ||
| 2567 | return; | ||
| 2568 | local_irq_save(flags); | ||
| 2569 | kfree_debugcheck(objp); | ||
| 2570 | c = GET_PAGE_CACHE(virt_to_page(objp)); | ||
| 2571 | __cache_free(c, (void*)objp); | ||
| 2572 | local_irq_restore(flags); | ||
| 2573 | } | ||
| 2574 | EXPORT_SYMBOL(kfree); | ||
| 2575 | |||
| 2576 | #ifdef CONFIG_SMP | ||
| 2577 | /** | ||
| 2578 | * free_percpu - free previously allocated percpu memory | ||
| 2579 | * @objp: pointer returned by alloc_percpu. | ||
| 2580 | * | ||
| 2581 | * Don't free memory not originally allocated by alloc_percpu() | ||
| 2582 | * The complemented objp is to check for that. | ||
| 2583 | */ | ||
| 2584 | void | ||
| 2585 | free_percpu(const void *objp) | ||
| 2586 | { | ||
| 2587 | int i; | ||
| 2588 | struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); | ||
| 2589 | |||
| 2590 | for (i = 0; i < NR_CPUS; i++) { | ||
| 2591 | if (!cpu_possible(i)) | ||
| 2592 | continue; | ||
| 2593 | kfree(p->ptrs[i]); | ||
| 2594 | } | ||
| 2595 | kfree(p); | ||
| 2596 | } | ||
| 2597 | EXPORT_SYMBOL(free_percpu); | ||
| 2598 | #endif | ||
| 2599 | |||
| 2600 | unsigned int kmem_cache_size(kmem_cache_t *cachep) | ||
| 2601 | { | ||
| 2602 | return obj_reallen(cachep); | ||
| 2603 | } | ||
| 2604 | EXPORT_SYMBOL(kmem_cache_size); | ||
| 2605 | |||
/*
 * Argument block for do_ccupdate_local(), which swaps a cache's per-cpu
 * array for a replacement on the cpu it runs on.
 */
struct ccupdate_struct {
	kmem_cache_t *cachep;	/* cache whose per-cpu arrays are exchanged */
	/* in: replacement array per cpu; out: the array that it displaced */
	struct array_cache *new[NR_CPUS];
};
| 2610 | |||
| 2611 | static void do_ccupdate_local(void *info) | ||
| 2612 | { | ||
| 2613 | struct ccupdate_struct *new = (struct ccupdate_struct *)info; | ||
| 2614 | struct array_cache *old; | ||
| 2615 | |||
| 2616 | check_irq_off(); | ||
| 2617 | old = ac_data(new->cachep); | ||
| 2618 | |||
| 2619 | new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; | ||
| 2620 | new->new[smp_processor_id()] = old; | ||
| 2621 | } | ||
| 2622 | |||
| 2623 | |||
| 2624 | static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, | ||
| 2625 | int shared) | ||
| 2626 | { | ||
| 2627 | struct ccupdate_struct new; | ||
| 2628 | struct array_cache *new_shared; | ||
| 2629 | int i; | ||
| 2630 | |||
| 2631 | memset(&new.new,0,sizeof(new.new)); | ||
| 2632 | for (i = 0; i < NR_CPUS; i++) { | ||
| 2633 | if (cpu_online(i)) { | ||
| 2634 | new.new[i] = alloc_arraycache(i, limit, batchcount); | ||
| 2635 | if (!new.new[i]) { | ||
| 2636 | for (i--; i >= 0; i--) kfree(new.new[i]); | ||
| 2637 | return -ENOMEM; | ||
| 2638 | } | ||
| 2639 | } else { | ||
| 2640 | new.new[i] = NULL; | ||
| 2641 | } | ||
| 2642 | } | ||
| 2643 | new.cachep = cachep; | ||
| 2644 | |||
| 2645 | smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); | ||
| 2646 | |||
| 2647 | check_irq_on(); | ||
| 2648 | spin_lock_irq(&cachep->spinlock); | ||
| 2649 | cachep->batchcount = batchcount; | ||
| 2650 | cachep->limit = limit; | ||
| 2651 | cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num; | ||
| 2652 | spin_unlock_irq(&cachep->spinlock); | ||
| 2653 | |||
| 2654 | for (i = 0; i < NR_CPUS; i++) { | ||
| 2655 | struct array_cache *ccold = new.new[i]; | ||
| 2656 | if (!ccold) | ||
| 2657 | continue; | ||
| 2658 | spin_lock_irq(&cachep->spinlock); | ||
| 2659 | free_block(cachep, ac_entry(ccold), ccold->avail); | ||
| 2660 | spin_unlock_irq(&cachep->spinlock); | ||
| 2661 | kfree(ccold); | ||
| 2662 | } | ||
| 2663 | new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d); | ||
| 2664 | if (new_shared) { | ||
| 2665 | struct array_cache *old; | ||
| 2666 | |||
| 2667 | spin_lock_irq(&cachep->spinlock); | ||
| 2668 | old = cachep->lists.shared; | ||
| 2669 | cachep->lists.shared = new_shared; | ||
| 2670 | if (old) | ||
| 2671 | free_block(cachep, ac_entry(old), old->avail); | ||
| 2672 | spin_unlock_irq(&cachep->spinlock); | ||
| 2673 | kfree(old); | ||
| 2674 | } | ||
| 2675 | |||
| 2676 | return 0; | ||
| 2677 | } | ||
| 2678 | |||
| 2679 | |||
| 2680 | static void enable_cpucache(kmem_cache_t *cachep) | ||
| 2681 | { | ||
| 2682 | int err; | ||
| 2683 | int limit, shared; | ||
| 2684 | |||
| 2685 | /* The head array serves three purposes: | ||
| 2686 | * - create a LIFO ordering, i.e. return objects that are cache-warm | ||
| 2687 | * - reduce the number of spinlock operations. | ||
| 2688 | * - reduce the number of linked list operations on the slab and | ||
| 2689 | * bufctl chains: array operations are cheaper. | ||
| 2690 | * The numbers are guessed, we should auto-tune as described by | ||
| 2691 | * Bonwick. | ||
| 2692 | */ | ||
| 2693 | if (cachep->objsize > 131072) | ||
| 2694 | limit = 1; | ||
| 2695 | else if (cachep->objsize > PAGE_SIZE) | ||
| 2696 | limit = 8; | ||
| 2697 | else if (cachep->objsize > 1024) | ||
| 2698 | limit = 24; | ||
| 2699 | else if (cachep->objsize > 256) | ||
| 2700 | limit = 54; | ||
| 2701 | else | ||
| 2702 | limit = 120; | ||
| 2703 | |||
| 2704 | /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound | ||
| 2705 | * allocation behaviour: Most allocs on one cpu, most free operations | ||
| 2706 | * on another cpu. For these cases, an efficient object passing between | ||
| 2707 | * cpus is necessary. This is provided by a shared array. The array | ||
| 2708 | * replaces Bonwick's magazine layer. | ||
| 2709 | * On uniprocessor, it's functionally equivalent (but less efficient) | ||
| 2710 | * to a larger limit. Thus disabled by default. | ||
| 2711 | */ | ||
| 2712 | shared = 0; | ||
| 2713 | #ifdef CONFIG_SMP | ||
| 2714 | if (cachep->objsize <= PAGE_SIZE) | ||
| 2715 | shared = 8; | ||
| 2716 | #endif | ||
| 2717 | |||
| 2718 | #if DEBUG | ||
| 2719 | /* With debugging enabled, large batchcount lead to excessively | ||
| 2720 | * long periods with disabled local interrupts. Limit the | ||
| 2721 | * batchcount | ||
| 2722 | */ | ||
| 2723 | if (limit > 32) | ||
| 2724 | limit = 32; | ||
| 2725 | #endif | ||
| 2726 | err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); | ||
| 2727 | if (err) | ||
| 2728 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", | ||
| 2729 | cachep->name, -err); | ||
| 2730 | } | ||
| 2731 | |||
| 2732 | static void drain_array_locked(kmem_cache_t *cachep, | ||
| 2733 | struct array_cache *ac, int force) | ||
| 2734 | { | ||
| 2735 | int tofree; | ||
| 2736 | |||
| 2737 | check_spinlock_acquired(cachep); | ||
| 2738 | if (ac->touched && !force) { | ||
| 2739 | ac->touched = 0; | ||
| 2740 | } else if (ac->avail) { | ||
| 2741 | tofree = force ? ac->avail : (ac->limit+4)/5; | ||
| 2742 | if (tofree > ac->avail) { | ||
| 2743 | tofree = (ac->avail+1)/2; | ||
| 2744 | } | ||
| 2745 | free_block(cachep, ac_entry(ac), tofree); | ||
| 2746 | ac->avail -= tofree; | ||
| 2747 | memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree], | ||
| 2748 | sizeof(void*)*ac->avail); | ||
| 2749 | } | ||
| 2750 | } | ||
| 2751 | |||
| 2752 | /** | ||
| 2753 | * cache_reap - Reclaim memory from caches. | ||
| 2754 | * @unused: not referenced in the function body | ||
| 2755 | * Called from workqueue/eventd every few seconds. | ||
| 2756 | * Purpose: | ||
| 2757 | * - clear the per-cpu caches for this CPU. | ||
| 2758 | * - return freeable pages to the main free memory pool. | ||
| 2759 | * Caches flagged SLAB_NO_REAP are skipped; reap_work is re-armed on every exit path. | ||
| 2760 | * If we cannot acquire the cache chain semaphore then just give up - we'll | ||
| 2761 | * try again on the next iteration. | ||
| 2762 | */ | ||
| 2763 | static void cache_reap(void *unused) | ||
| 2764 | { | ||
| 2765 | struct list_head *walk; | ||
| 2766 | |||
| 2767 | if (down_trylock(&cache_chain_sem)) { | ||
| 2768 | /* Give up. Setup the next iteration. */ | ||
| 2769 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); | ||
| 2770 | return; | ||
| 2771 | } | ||
| 2772 | |||
| 2773 | list_for_each(walk, &cache_chain) { | ||
| 2774 | kmem_cache_t *searchp; | ||
| 2775 | struct list_head* p; | ||
| 2776 | int tofree; | ||
| 2777 | struct slab *slabp; | ||
| 2778 | |||
| 2779 | searchp = list_entry(walk, kmem_cache_t, next); | ||
| 2780 | |||
| 2781 | if (searchp->flags & SLAB_NO_REAP) | ||
| 2782 | goto next; | ||
| 2783 | |||
| 2784 | check_irq_on(); | ||
| 2785 | |||
| 2786 | spin_lock_irq(&searchp->spinlock); | ||
| 2787 | |||
| 2788 | drain_array_locked(searchp, ac_data(searchp), 0); /* unforced drain, done on every pass */ | ||
| 2789 | |||
| 2790 | if(time_after(searchp->lists.next_reap, jiffies)) /* list reap not due yet */ | ||
| 2791 | goto next_unlock; | ||
| 2792 | |||
| 2793 | searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3; | ||
| 2794 | |||
| 2795 | if (searchp->lists.shared) | ||
| 2796 | drain_array_locked(searchp, searchp->lists.shared, 0); | ||
| 2797 | |||
| 2798 | if (searchp->lists.free_touched) { /* flag set: clear it and free no slabs this round */ | ||
| 2799 | searchp->lists.free_touched = 0; | ||
| 2800 | goto next_unlock; | ||
| 2801 | } | ||
| 2802 | |||
| 2803 | tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num); /* free_limit/(5*num), rounded up: max slabs destroyed below */ | ||
| 2804 | do { | ||
| 2805 | p = list3_data(searchp)->slabs_free.next; | ||
| 2806 | if (p == &(list3_data(searchp)->slabs_free)) /* free list is empty */ | ||
| 2807 | break; | ||
| 2808 | |||
| 2809 | slabp = list_entry(p, struct slab, list); | ||
| 2810 | BUG_ON(slabp->inuse); | ||
| 2811 | list_del(&slabp->list); | ||
| 2812 | STATS_INC_REAPED(searchp); | ||
| 2813 | |||
| 2814 | /* Safe to drop the lock. The slab is no longer | ||
| 2815 | * linked to the cache. | ||
| 2816 | * searchp cannot disappear, we hold | ||
| 2817 | * cache_chain_sem | ||
| 2818 | */ | ||
| 2819 | searchp->lists.free_objects -= searchp->num; | ||
| 2820 | spin_unlock_irq(&searchp->spinlock); | ||
| 2821 | slab_destroy(searchp, slabp); | ||
| 2822 | spin_lock_irq(&searchp->spinlock); | ||
| 2823 | } while(--tofree > 0); | ||
| 2824 | next_unlock: | ||
| 2825 | spin_unlock_irq(&searchp->spinlock); | ||
| 2826 | next: | ||
| 2827 | cond_resched(); | ||
| 2828 | } | ||
| 2829 | check_irq_on(); | ||
| 2830 | up(&cache_chain_sem); | ||
| 2831 | /* Setup the next iteration */ | ||
| 2832 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); | ||
| 2833 | } | ||
| 2834 | |||
| 2835 | #ifdef CONFIG_PROC_FS | ||
| 2836 | |||
| 2837 | static void *s_start(struct seq_file *m, loff_t *pos) | ||
| 2838 | { | ||
| 2839 | loff_t n = *pos; | ||
| 2840 | struct list_head *p; | ||
| 2841 | |||
| 2842 | down(&cache_chain_sem); | ||
| 2843 | if (!n) { | ||
| 2844 | /* | ||
| 2845 | * Output format version, so at least we can change it | ||
| 2846 | * without _too_ many complaints. | ||
| 2847 | */ | ||
| 2848 | #if STATS | ||
| 2849 | seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); | ||
| 2850 | #else | ||
| 2851 | seq_puts(m, "slabinfo - version: 2.1\n"); | ||
| 2852 | #endif | ||
| 2853 | seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); | ||
| 2854 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); | ||
| 2855 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | ||
| 2856 | #if STATS | ||
| 2857 | seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" | ||
| 2858 | " <error> <maxfreeable> <freelimit> <nodeallocs>"); | ||
| 2859 | seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); | ||
| 2860 | #endif | ||
| 2861 | seq_putc(m, '\n'); | ||
| 2862 | } | ||
| 2863 | p = cache_chain.next; | ||
| 2864 | while (n--) { | ||
| 2865 | p = p->next; | ||
| 2866 | if (p == &cache_chain) | ||
| 2867 | return NULL; | ||
| 2868 | } | ||
| 2869 | return list_entry(p, kmem_cache_t, next); | ||
| 2870 | } | ||
| 2871 | |||
| 2872 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | ||
| 2873 | { | ||
| 2874 | kmem_cache_t *cachep = p; | ||
| 2875 | ++*pos; | ||
| 2876 | return cachep->next.next == &cache_chain ? NULL | ||
| 2877 | : list_entry(cachep->next.next, kmem_cache_t, next); | ||
| 2878 | } | ||
| 2879 | |||
| 2880 | static void s_stop(struct seq_file *m, void *p) | ||
| 2881 | { | ||
| 2882 | up(&cache_chain_sem); | ||
| 2883 | } | ||
| 2884 | |||
| 2885 | static int s_show(struct seq_file *m, void *p) | ||
| 2886 | { | ||
| 2887 | kmem_cache_t *cachep = p; | ||
| 2888 | struct list_head *q; | ||
| 2889 | struct slab *slabp; | ||
| 2890 | unsigned long active_objs; | ||
| 2891 | unsigned long num_objs; | ||
| 2892 | unsigned long active_slabs = 0; | ||
| 2893 | unsigned long num_slabs; | ||
| 2894 | const char *name; | ||
| 2895 | char *error = NULL; | ||
| 2896 | |||
| 2897 | check_irq_on(); | ||
| 2898 | spin_lock_irq(&cachep->spinlock); | ||
| 2899 | active_objs = 0; | ||
| 2900 | num_slabs = 0; | ||
| 2901 | list_for_each(q,&cachep->lists.slabs_full) { | ||
| 2902 | slabp = list_entry(q, struct slab, list); | ||
| 2903 | if (slabp->inuse != cachep->num && !error) | ||
| 2904 | error = "slabs_full accounting error"; | ||
| 2905 | active_objs += cachep->num; | ||
| 2906 | active_slabs++; | ||
| 2907 | } | ||
| 2908 | list_for_each(q,&cachep->lists.slabs_partial) { | ||
| 2909 | slabp = list_entry(q, struct slab, list); | ||
| 2910 | if (slabp->inuse == cachep->num && !error) | ||
| 2911 | error = "slabs_partial inuse accounting error"; | ||
| 2912 | if (!slabp->inuse && !error) | ||
| 2913 | error = "slabs_partial/inuse accounting error"; | ||
| 2914 | active_objs += slabp->inuse; | ||
| 2915 | active_slabs++; | ||
| 2916 | } | ||
| 2917 | list_for_each(q,&cachep->lists.slabs_free) { | ||
| 2918 | slabp = list_entry(q, struct slab, list); | ||
| 2919 | if (slabp->inuse && !error) | ||
| 2920 | error = "slabs_free/inuse accounting error"; | ||
| 2921 | num_slabs++; | ||
| 2922 | } | ||
| 2923 | num_slabs+=active_slabs; | ||
| 2924 | num_objs = num_slabs*cachep->num; | ||
| 2925 | if (num_objs - active_objs != cachep->lists.free_objects && !error) | ||
| 2926 | error = "free_objects accounting error"; | ||
| 2927 | |||
| 2928 | name = cachep->name; | ||
| 2929 | if (error) | ||
| 2930 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); | ||
| 2931 | |||
| 2932 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", | ||
| 2933 | name, active_objs, num_objs, cachep->objsize, | ||
| 2934 | cachep->num, (1<<cachep->gfporder)); | ||
| 2935 | seq_printf(m, " : tunables %4u %4u %4u", | ||
| 2936 | cachep->limit, cachep->batchcount, | ||
| 2937 | cachep->lists.shared->limit/cachep->batchcount); | ||
| 2938 | seq_printf(m, " : slabdata %6lu %6lu %6u", | ||
| 2939 | active_slabs, num_slabs, cachep->lists.shared->avail); | ||
| 2940 | #if STATS | ||
| 2941 | { /* list3 stats */ | ||
| 2942 | unsigned long high = cachep->high_mark; | ||
| 2943 | unsigned long allocs = cachep->num_allocations; | ||
| 2944 | unsigned long grown = cachep->grown; | ||
| 2945 | unsigned long reaped = cachep->reaped; | ||
| 2946 | unsigned long errors = cachep->errors; | ||
| 2947 | unsigned long max_freeable = cachep->max_freeable; | ||
| 2948 | unsigned long free_limit = cachep->free_limit; | ||
| 2949 | unsigned long node_allocs = cachep->node_allocs; | ||
| 2950 | |||
| 2951 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu", | ||
| 2952 | allocs, high, grown, reaped, errors, | ||
| 2953 | max_freeable, free_limit, node_allocs); | ||
| 2954 | } | ||
| 2955 | /* cpu stats */ | ||
| 2956 | { | ||
| 2957 | unsigned long allochit = atomic_read(&cachep->allochit); | ||
| 2958 | unsigned long allocmiss = atomic_read(&cachep->allocmiss); | ||
| 2959 | unsigned long freehit = atomic_read(&cachep->freehit); | ||
| 2960 | unsigned long freemiss = atomic_read(&cachep->freemiss); | ||
| 2961 | |||
| 2962 | seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", | ||
| 2963 | allochit, allocmiss, freehit, freemiss); | ||
| 2964 | } | ||
| 2965 | #endif | ||
| 2966 | seq_putc(m, '\n'); | ||
| 2967 | spin_unlock_irq(&cachep->spinlock); | ||
| 2968 | return 0; | ||
| 2969 | } | ||
| 2970 | |||
| 2971 | /* | ||
| 2972 | * slabinfo_op - iterator that generates /proc/slabinfo | ||
| 2973 | * | ||
| 2974 | * Output layout (matches the header printed by s_start): | ||
| 2975 | * cache-name | ||
| 2976 | * num-active-objs | ||
| 2977 | * total-objs | ||
| 2978 | * object size | ||
| 2979 | * objects-per-slab | ||
| 2980 | * num-pages-per-slab | ||
| 2981 | * " : tunables" limit, batchcount, shared factor | ||
| 2982 | * " : slabdata" num-active-slabs, total-slabs, shared-avail (+ globalstat/cpustat if STATS) | ||
| 2983 | */ | ||
| 2984 | |||
| 2985 | struct seq_operations slabinfo_op = { | ||
| 2986 | .start = s_start, | ||
| 2987 | .next = s_next, | ||
| 2988 | .stop = s_stop, | ||
| 2989 | .show = s_show, | ||
| 2990 | }; | ||
| 2991 | |||
| 2992 | #define MAX_SLABINFO_WRITE 128 | ||
| 2993 | /** | ||
| 2994 | * slabinfo_write - Tuning for the slab allocator | ||
| 2995 | * @file: unused | ||
| 2996 | * @buffer: user buffer | ||
| 2997 | * @count: data length | ||
| 2998 | * @ppos: unused | ||
| 2999 | */ | ||
| 3000 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, | ||
| 3001 | size_t count, loff_t *ppos) | ||
| 3002 | { | ||
| 3003 | char kbuf[MAX_SLABINFO_WRITE+1], *tmp; | ||
| 3004 | int limit, batchcount, shared, res; | ||
| 3005 | struct list_head *p; | ||
| 3006 | |||
| 3007 | if (count > MAX_SLABINFO_WRITE) | ||
| 3008 | return -EINVAL; | ||
| 3009 | if (copy_from_user(&kbuf, buffer, count)) | ||
| 3010 | return -EFAULT; | ||
| 3011 | kbuf[MAX_SLABINFO_WRITE] = '\0'; | ||
| 3012 | |||
| 3013 | tmp = strchr(kbuf, ' '); | ||
| 3014 | if (!tmp) | ||
| 3015 | return -EINVAL; | ||
| 3016 | *tmp = '\0'; | ||
| 3017 | tmp++; | ||
| 3018 | if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) | ||
| 3019 | return -EINVAL; | ||
| 3020 | |||
| 3021 | /* Find the cache in the chain of caches. */ | ||
| 3022 | down(&cache_chain_sem); | ||
| 3023 | res = -EINVAL; | ||
| 3024 | list_for_each(p,&cache_chain) { | ||
| 3025 | kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); | ||
| 3026 | |||
| 3027 | if (!strcmp(cachep->name, kbuf)) { | ||
| 3028 | if (limit < 1 || | ||
| 3029 | batchcount < 1 || | ||
| 3030 | batchcount > limit || | ||
| 3031 | shared < 0) { | ||
| 3032 | res = -EINVAL; | ||
| 3033 | } else { | ||
| 3034 | res = do_tune_cpucache(cachep, limit, batchcount, shared); | ||
| 3035 | } | ||
| 3036 | break; | ||
| 3037 | } | ||
| 3038 | } | ||
| 3039 | up(&cache_chain_sem); | ||
| 3040 | if (res >= 0) | ||
| 3041 | res = count; | ||
| 3042 | return res; | ||
| 3043 | } | ||
| 3044 | #endif | ||
| 3045 | |||
| 3046 | unsigned int ksize(const void *objp) | ||
| 3047 | { | ||
| 3048 | kmem_cache_t *c; | ||
| 3049 | unsigned long flags; | ||
| 3050 | unsigned int size = 0; | ||
| 3051 | |||
| 3052 | if (likely(objp != NULL)) { | ||
| 3053 | local_irq_save(flags); | ||
| 3054 | c = GET_PAGE_CACHE(virt_to_page(objp)); | ||
| 3055 | size = kmem_cache_size(c); | ||
| 3056 | local_irq_restore(flags); | ||
| 3057 | } | ||
| 3058 | |||
| 3059 | return size; | ||
| 3060 | } | ||
diff --git a/mm/swap.c b/mm/swap.c new file mode 100644 index 000000000000..7771d2803f62 --- /dev/null +++ b/mm/swap.c | |||
| @@ -0,0 +1,485 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/swap.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | ||
| 5 | */ | ||
| 6 | |||
| 7 | /* | ||
| 8 | * This file contains the default values for the operation of the | ||
| 9 | * Linux VM subsystem. Fine-tuning documentation can be found in | ||
| 10 | * Documentation/sysctl/vm.txt. | ||
| 11 | * Started 18.12.91 | ||
| 12 | * Swap aging added 23.2.95, Stephen Tweedie. | ||
| 13 | * Buffermem limits added 12.3.98, Rik van Riel. | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <linux/mm.h> | ||
| 17 | #include <linux/sched.h> | ||
| 18 | #include <linux/kernel_stat.h> | ||
| 19 | #include <linux/swap.h> | ||
| 20 | #include <linux/mman.h> | ||
| 21 | #include <linux/pagemap.h> | ||
| 22 | #include <linux/pagevec.h> | ||
| 23 | #include <linux/init.h> | ||
| 24 | #include <linux/module.h> | ||
| 25 | #include <linux/mm_inline.h> | ||
| 26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ | ||
| 27 | #include <linux/module.h> | ||
| 28 | #include <linux/percpu_counter.h> | ||
| 29 | #include <linux/percpu.h> | ||
| 30 | #include <linux/cpu.h> | ||
| 31 | #include <linux/notifier.h> | ||
| 32 | #include <linux/init.h> | ||
| 33 | |||
| 34 | /* How many pages do we try to swap or page in/out together? */ | ||
| 35 | int page_cluster; /* swap_setup() sets this to 2 below 16MB of memory, else 3 */ | ||
| 36 | |||
| 37 | #ifdef CONFIG_HUGETLB_PAGE | ||
| 38 | |||
| 39 | void put_page(struct page *page) | ||
| 40 | { | ||
| 41 | if (unlikely(PageCompound(page))) { | ||
| 42 | page = (struct page *)page->private; | ||
| 43 | if (put_page_testzero(page)) { | ||
| 44 | void (*dtor)(struct page *page); | ||
| 45 | |||
| 46 | dtor = (void (*)(struct page *))page[1].mapping; | ||
| 47 | (*dtor)(page); | ||
| 48 | } | ||
| 49 | return; | ||
| 50 | } | ||
| 51 | if (!PageReserved(page) && put_page_testzero(page)) | ||
| 52 | __page_cache_release(page); | ||
| 53 | } | ||
| 54 | EXPORT_SYMBOL(put_page); | ||
| 55 | #endif | ||
| 56 | |||
| 57 | /* | ||
| 58 | * Writeback is about to end against a page which has been marked for immediate | ||
| 59 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | ||
| 60 | * inactive list. The page still has PageWriteback set, which will pin it. | ||
| 61 | * | ||
| 62 | * We don't expect many pages to come through here, so don't bother batching | ||
| 63 | * things up. | ||
| 64 | * | ||
| 65 | * To avoid placing the page at the tail of the LRU while PG_writeback is still | ||
| 66 | * set, this function will clear PG_writeback before performing the page | ||
| 67 | * motion. Do that inside the lru lock because once PG_writeback is cleared | ||
| 68 | * we may not touch the page. | ||
| 69 | * | ||
| 70 | * Returns zero if it cleared PG_writeback. | ||
| 71 | */ | ||
| 72 | int rotate_reclaimable_page(struct page *page) | ||
| 73 | { | ||
| 74 | struct zone *zone; | ||
| 75 | unsigned long flags; | ||
| 76 | |||
| 77 | if (PageLocked(page)) | ||
| 78 | return 1; | ||
| 79 | if (PageDirty(page)) | ||
| 80 | return 1; | ||
| 81 | if (PageActive(page)) | ||
| 82 | return 1; | ||
| 83 | if (!PageLRU(page)) | ||
| 84 | return 1; | ||
| 85 | |||
| 86 | zone = page_zone(page); | ||
| 87 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
| 88 | if (PageLRU(page) && !PageActive(page)) { | ||
| 89 | list_del(&page->lru); | ||
| 90 | list_add_tail(&page->lru, &zone->inactive_list); | ||
| 91 | inc_page_state(pgrotated); | ||
| 92 | } | ||
| 93 | if (!test_clear_page_writeback(page)) | ||
| 94 | BUG(); | ||
| 95 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
| 96 | return 0; | ||
| 97 | } | ||
| 98 | |||
| 99 | /* | ||
| 100 | * FIXME: speed this up? | ||
| 101 | */ | ||
| 102 | void fastcall activate_page(struct page *page) | ||
| 103 | { | ||
| 104 | struct zone *zone = page_zone(page); | ||
| 105 | |||
| 106 | spin_lock_irq(&zone->lru_lock); | ||
| 107 | if (PageLRU(page) && !PageActive(page)) { | ||
| 108 | del_page_from_inactive_list(zone, page); | ||
| 109 | SetPageActive(page); | ||
| 110 | add_page_to_active_list(zone, page); | ||
| 111 | inc_page_state(pgactivate); | ||
| 112 | } | ||
| 113 | spin_unlock_irq(&zone->lru_lock); | ||
| 114 | } | ||
| 115 | |||
| 116 | /* | ||
| 117 | * Mark a page as having seen activity. | ||
| 118 | * | ||
| 119 | * inactive,unreferenced -> inactive,referenced | ||
| 120 | * inactive,referenced -> active,unreferenced | ||
| 121 | * active,unreferenced -> active,referenced | ||
| 122 | */ | ||
| 123 | void fastcall mark_page_accessed(struct page *page) | ||
| 124 | { | ||
| 125 | if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { | ||
| 126 | activate_page(page); | ||
| 127 | ClearPageReferenced(page); | ||
| 128 | } else if (!PageReferenced(page)) { | ||
| 129 | SetPageReferenced(page); | ||
| 130 | } | ||
| 131 | } | ||
| 132 | |||
| 133 | EXPORT_SYMBOL(mark_page_accessed); | ||
| 134 | |||
| 135 | /** | ||
| 136 | * lru_cache_add: add a page to the page lists | ||
| 137 | * @page: the page to add | ||
| 138 | */ | ||
| 139 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; | ||
| 140 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; | ||
| 141 | |||
| 142 | void fastcall lru_cache_add(struct page *page) | ||
| 143 | { | ||
| 144 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); | ||
| 145 | |||
| 146 | page_cache_get(page); | ||
| 147 | if (!pagevec_add(pvec, page)) | ||
| 148 | __pagevec_lru_add(pvec); | ||
| 149 | put_cpu_var(lru_add_pvecs); | ||
| 150 | } | ||
| 151 | |||
| 152 | void fastcall lru_cache_add_active(struct page *page) | ||
| 153 | { | ||
| 154 | struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); | ||
| 155 | |||
| 156 | page_cache_get(page); | ||
| 157 | if (!pagevec_add(pvec, page)) | ||
| 158 | __pagevec_lru_add_active(pvec); | ||
| 159 | put_cpu_var(lru_add_active_pvecs); | ||
| 160 | } | ||
| 161 | |||
| 162 | void lru_add_drain(void) | ||
| 163 | { | ||
| 164 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); | ||
| 165 | |||
| 166 | if (pagevec_count(pvec)) | ||
| 167 | __pagevec_lru_add(pvec); | ||
| 168 | pvec = &__get_cpu_var(lru_add_active_pvecs); | ||
| 169 | if (pagevec_count(pvec)) | ||
| 170 | __pagevec_lru_add_active(pvec); | ||
| 171 | put_cpu_var(lru_add_pvecs); | ||
| 172 | } | ||
| 173 | |||
| 174 | /* | ||
| 175 | * This path almost never happens for VM activity - pages are normally | ||
| 176 | * freed via pagevecs. But it gets used by networking. | ||
| 177 | */ | ||
| 178 | void fastcall __page_cache_release(struct page *page) | ||
| 179 | { | ||
| 180 | unsigned long flags; | ||
| 181 | struct zone *zone = page_zone(page); | ||
| 182 | |||
| 183 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
| 184 | if (TestClearPageLRU(page)) | ||
| 185 | del_page_from_lru(zone, page); | ||
| 186 | if (page_count(page) != 0) | ||
| 187 | page = NULL; | ||
| 188 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
| 189 | if (page) | ||
| 190 | free_hot_page(page); | ||
| 191 | } | ||
| 192 | |||
| 193 | EXPORT_SYMBOL(__page_cache_release); | ||
| 194 | |||
| 195 | /* | ||
| 196 | * Batched page_cache_release(). Decrement the reference count on all the | ||
| 197 | * passed pages. If it fell to zero then remove the page from the LRU and | ||
| 198 | * free it. | ||
| 199 | * | ||
| 200 | * Avoid taking zone->lru_lock if possible, but if it is taken, retain it | ||
| 201 | * for the remainder of the operation. | ||
| 202 | * | ||
| 203 | * The locking in this function is against shrink_cache(): we recheck the | ||
| 204 | * page count inside the lock to see whether shrink_cache grabbed the page | ||
| 205 | * via the LRU. If it did, give up: shrink_cache will free it. | ||
| 206 | */ | ||
| 207 | void release_pages(struct page **pages, int nr, int cold) | ||
| 208 | { | ||
| 209 | int i; | ||
| 210 | struct pagevec pages_to_free; | ||
| 211 | struct zone *zone = NULL; | ||
| 212 | |||
| 213 | pagevec_init(&pages_to_free, cold); | ||
| 214 | for (i = 0; i < nr; i++) { | ||
| 215 | struct page *page = pages[i]; | ||
| 216 | struct zone *pagezone; | ||
| 217 | |||
| 218 | if (PageReserved(page) || !put_page_testzero(page)) | ||
| 219 | continue; | ||
| 220 | |||
| 221 | pagezone = page_zone(page); | ||
| 222 | if (pagezone != zone) { | ||
| 223 | if (zone) | ||
| 224 | spin_unlock_irq(&zone->lru_lock); | ||
| 225 | zone = pagezone; | ||
| 226 | spin_lock_irq(&zone->lru_lock); | ||
| 227 | } | ||
| 228 | if (TestClearPageLRU(page)) | ||
| 229 | del_page_from_lru(zone, page); | ||
| 230 | if (page_count(page) == 0) { | ||
| 231 | if (!pagevec_add(&pages_to_free, page)) { | ||
| 232 | spin_unlock_irq(&zone->lru_lock); | ||
| 233 | __pagevec_free(&pages_to_free); | ||
| 234 | pagevec_reinit(&pages_to_free); | ||
| 235 | zone = NULL; /* No lock is held */ | ||
| 236 | } | ||
| 237 | } | ||
| 238 | } | ||
| 239 | if (zone) | ||
| 240 | spin_unlock_irq(&zone->lru_lock); | ||
| 241 | |||
| 242 | pagevec_free(&pages_to_free); | ||
| 243 | } | ||
| 244 | |||
| 245 | /* | ||
| 246 | * The pages which we're about to release may be in the deferred lru-addition | ||
| 247 | * queues. That would prevent them from really being freed right now. That's | ||
| 248 | * OK from a correctness point of view but is inefficient - those pages may be | ||
| 249 | * cache-warm and we want to give them back to the page allocator ASAP. | ||
| 250 | * | ||
| 251 | * So __pagevec_release() will drain those queues here. __pagevec_lru_add() | ||
| 252 | * and __pagevec_lru_add_active() call release_pages() directly to avoid | ||
| 253 | * mutual recursion. | ||
| 254 | */ | ||
| 255 | void __pagevec_release(struct pagevec *pvec) | ||
| 256 | { | ||
| 257 | lru_add_drain(); | ||
| 258 | release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); | ||
| 259 | pagevec_reinit(pvec); | ||
| 260 | } | ||
| 261 | |||
| 262 | /* | ||
| 263 | * pagevec_release() for pages which are known to not be on the LRU | ||
| 264 | * | ||
| 265 | * This function reinitialises the caller's pagevec. | ||
| 266 | */ | ||
| 267 | void __pagevec_release_nonlru(struct pagevec *pvec) | ||
| 268 | { | ||
| 269 | int i; | ||
| 270 | struct pagevec pages_to_free; | ||
| 271 | |||
| 272 | pagevec_init(&pages_to_free, pvec->cold); | ||
| 273 | pages_to_free.cold = pvec->cold; | ||
| 274 | for (i = 0; i < pagevec_count(pvec); i++) { | ||
| 275 | struct page *page = pvec->pages[i]; | ||
| 276 | |||
| 277 | BUG_ON(PageLRU(page)); | ||
| 278 | if (put_page_testzero(page)) | ||
| 279 | pagevec_add(&pages_to_free, page); | ||
| 280 | } | ||
| 281 | pagevec_free(&pages_to_free); | ||
| 282 | pagevec_reinit(pvec); | ||
| 283 | } | ||
| 284 | |||
| 285 | /* | ||
| 286 | * Add the passed pages to the LRU, then drop the caller's refcount | ||
| 287 | * on them. Reinitialises the caller's pagevec. | ||
| 288 | */ | ||
| 289 | void __pagevec_lru_add(struct pagevec *pvec) | ||
| 290 | { | ||
| 291 | int i; | ||
| 292 | struct zone *zone = NULL; | ||
| 293 | |||
| 294 | for (i = 0; i < pagevec_count(pvec); i++) { | ||
| 295 | struct page *page = pvec->pages[i]; | ||
| 296 | struct zone *pagezone = page_zone(page); | ||
| 297 | |||
| 298 | if (pagezone != zone) { | ||
| 299 | if (zone) | ||
| 300 | spin_unlock_irq(&zone->lru_lock); | ||
| 301 | zone = pagezone; | ||
| 302 | spin_lock_irq(&zone->lru_lock); | ||
| 303 | } | ||
| 304 | if (TestSetPageLRU(page)) | ||
| 305 | BUG(); | ||
| 306 | add_page_to_inactive_list(zone, page); | ||
| 307 | } | ||
| 308 | if (zone) | ||
| 309 | spin_unlock_irq(&zone->lru_lock); | ||
| 310 | release_pages(pvec->pages, pvec->nr, pvec->cold); | ||
| 311 | pagevec_reinit(pvec); | ||
| 312 | } | ||
| 313 | |||
| 314 | EXPORT_SYMBOL(__pagevec_lru_add); | ||
| 315 | |||
| 316 | void __pagevec_lru_add_active(struct pagevec *pvec) | ||
| 317 | { | ||
| 318 | int i; | ||
| 319 | struct zone *zone = NULL; | ||
| 320 | |||
| 321 | for (i = 0; i < pagevec_count(pvec); i++) { | ||
| 322 | struct page *page = pvec->pages[i]; | ||
| 323 | struct zone *pagezone = page_zone(page); | ||
| 324 | |||
| 325 | if (pagezone != zone) { | ||
| 326 | if (zone) | ||
| 327 | spin_unlock_irq(&zone->lru_lock); | ||
| 328 | zone = pagezone; | ||
| 329 | spin_lock_irq(&zone->lru_lock); | ||
| 330 | } | ||
| 331 | if (TestSetPageLRU(page)) | ||
| 332 | BUG(); | ||
| 333 | if (TestSetPageActive(page)) | ||
| 334 | BUG(); | ||
| 335 | add_page_to_active_list(zone, page); | ||
| 336 | } | ||
| 337 | if (zone) | ||
| 338 | spin_unlock_irq(&zone->lru_lock); | ||
| 339 | release_pages(pvec->pages, pvec->nr, pvec->cold); | ||
| 340 | pagevec_reinit(pvec); | ||
| 341 | } | ||
| 342 | |||
| 343 | /* | ||
| 344 | * Try to drop buffers from the pages in a pagevec | ||
| 345 | */ | ||
| 346 | void pagevec_strip(struct pagevec *pvec) | ||
| 347 | { | ||
| 348 | int i; | ||
| 349 | |||
| 350 | for (i = 0; i < pagevec_count(pvec); i++) { | ||
| 351 | struct page *page = pvec->pages[i]; | ||
| 352 | |||
| 353 | if (PagePrivate(page) && !TestSetPageLocked(page)) { | ||
| 354 | try_to_release_page(page, 0); | ||
| 355 | unlock_page(page); | ||
| 356 | } | ||
| 357 | } | ||
| 358 | } | ||
| 359 | |||
| 360 | /** | ||
| 361 | * pagevec_lookup - gang pagecache lookup | ||
| 362 | * @pvec: Where the resulting pages are placed | ||
| 363 | * @mapping: The address_space to search | ||
| 364 | * @start: The starting page index | ||
| 365 | * @nr_pages: The maximum number of pages | ||
| 366 | * | ||
| 367 | * pagevec_lookup() will search for and return a group of up to @nr_pages pages | ||
| 368 | * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a | ||
| 369 | * reference against the pages in @pvec. | ||
| 370 | * | ||
| 371 | * The search returns a group of mapping-contiguous pages with ascending | ||
| 372 | * indexes. There may be holes in the indices due to not-present pages. | ||
| 373 | * | ||
| 374 | * pagevec_lookup() returns the number of pages which were found. | ||
| 375 | */ | ||
| 376 | unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, | ||
| 377 | pgoff_t start, unsigned nr_pages) | ||
| 378 | { | ||
| 379 | pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); | ||
| 380 | return pagevec_count(pvec); | ||
| 381 | } | ||
| 382 | |||
| 383 | unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, | ||
| 384 | pgoff_t *index, int tag, unsigned nr_pages) | ||
| 385 | { | ||
| 386 | pvec->nr = find_get_pages_tag(mapping, index, tag, | ||
| 387 | nr_pages, pvec->pages); | ||
| 388 | return pagevec_count(pvec); | ||
| 389 | } | ||
| 390 | |||
| 391 | |||
| 392 | #ifdef CONFIG_SMP | ||
| 393 | /* | ||
| 394 | * We tolerate a little inaccuracy to avoid ping-ponging the counter between | ||
| 395 | * CPUs | ||
| 396 | */ | ||
| 397 | #define ACCT_THRESHOLD max(16, NR_CPUS * 2) /* vm_acct_memory() flushes once the local delta exceeds +/- this */ | ||
| 398 | |||
| 399 | static DEFINE_PER_CPU(long, committed_space) = 0; /* per-CPU delta not yet added to vm_committed_space */ | ||
| 400 | |||
| 401 | void vm_acct_memory(long pages) | ||
| 402 | { | ||
| 403 | long *local; | ||
| 404 | |||
| 405 | preempt_disable(); | ||
| 406 | local = &__get_cpu_var(committed_space); | ||
| 407 | *local += pages; | ||
| 408 | if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) { | ||
| 409 | atomic_add(*local, &vm_committed_space); | ||
| 410 | *local = 0; | ||
| 411 | } | ||
| 412 | preempt_enable(); | ||
| 413 | } | ||
| 414 | EXPORT_SYMBOL(vm_acct_memory); | ||
| 415 | |||
| 416 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 417 | static void lru_drain_cache(unsigned int cpu) | ||
| 418 | { | ||
| 419 | struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); | ||
| 420 | |||
| 421 | /* CPU is dead, so no locking needed. */ | ||
| 422 | if (pagevec_count(pvec)) | ||
| 423 | __pagevec_lru_add(pvec); | ||
| 424 | pvec = &per_cpu(lru_add_active_pvecs, cpu); | ||
| 425 | if (pagevec_count(pvec)) | ||
| 426 | __pagevec_lru_add_active(pvec); | ||
| 427 | } | ||
| 428 | |||
| 429 | /* Drop the CPU's cached committed space back into the central pool. */ | ||
| 430 | static int cpu_swap_callback(struct notifier_block *nfb, | ||
| 431 | unsigned long action, | ||
| 432 | void *hcpu) | ||
| 433 | { | ||
| 434 | long *committed; | ||
| 435 | |||
| 436 | committed = &per_cpu(committed_space, (long)hcpu); | ||
| 437 | if (action == CPU_DEAD) { | ||
| 438 | atomic_add(*committed, &vm_committed_space); | ||
| 439 | *committed = 0; | ||
| 440 | lru_drain_cache((long)hcpu); | ||
| 441 | } | ||
| 442 | return NOTIFY_OK; | ||
| 443 | } | ||
| 444 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 445 | #endif /* CONFIG_SMP */ | ||
| 446 | |||
| 447 | #ifdef CONFIG_SMP | ||
| 448 | void percpu_counter_mod(struct percpu_counter *fbc, long amount) | ||
| 449 | { | ||
| 450 | long count; | ||
| 451 | long *pcount; | ||
| 452 | int cpu = get_cpu(); | ||
| 453 | |||
| 454 | pcount = per_cpu_ptr(fbc->counters, cpu); | ||
| 455 | count = *pcount + amount; | ||
| 456 | if (count >= FBC_BATCH || count <= -FBC_BATCH) { | ||
| 457 | spin_lock(&fbc->lock); | ||
| 458 | fbc->count += count; | ||
| 459 | spin_unlock(&fbc->lock); | ||
| 460 | count = 0; | ||
| 461 | } | ||
| 462 | *pcount = count; | ||
| 463 | put_cpu(); | ||
| 464 | } | ||
| 465 | EXPORT_SYMBOL(percpu_counter_mod); | ||
| 466 | #endif | ||
| 467 | |||
| 468 | /* | ||
| 469 | * Perform any setup for the swap system | ||
| 470 | */ | ||
| 471 | void __init swap_setup(void) | ||
| 472 | { | ||
| 473 | unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); | ||
| 474 | |||
| 475 | /* Use a smaller cluster for small-memory machines */ | ||
| 476 | if (megs < 16) | ||
| 477 | page_cluster = 2; | ||
| 478 | else | ||
| 479 | page_cluster = 3; | ||
| 480 | /* | ||
| 481 | * Right now other parts of the system means that we | ||
| 482 | * _really_ don't want to cluster much more | ||
| 483 | */ | ||
| 484 | hotcpu_notifier(cpu_swap_callback, 0); | ||
| 485 | } | ||
diff --git a/mm/swap_state.c b/mm/swap_state.c new file mode 100644 index 000000000000..a063a902ed03 --- /dev/null +++ b/mm/swap_state.c | |||
| @@ -0,0 +1,382 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/swap_state.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | ||
| 5 | * Swap reorganised 29.12.95, Stephen Tweedie | ||
| 6 | * | ||
| 7 | * Rewritten to use page cache, (C) 1998 Stephen Tweedie | ||
| 8 | */ | ||
| 9 | #include <linux/module.h> | ||
| 10 | #include <linux/mm.h> | ||
| 11 | #include <linux/kernel_stat.h> | ||
| 12 | #include <linux/swap.h> | ||
| 13 | #include <linux/init.h> | ||
| 14 | #include <linux/pagemap.h> | ||
| 15 | #include <linux/buffer_head.h> | ||
| 16 | #include <linux/backing-dev.h> | ||
| 17 | |||
| 18 | #include <asm/pgtable.h> | ||
| 19 | |||
| 20 | /* | ||
| 21 | * swapper_space is a fiction, retained to simplify the path through | ||
| 22 | * vmscan's shrink_list, to make sync_page look nicer, and to allow | ||
| 23 | * future use of radix_tree tags in the swap cache. | ||
| 24 | */ | ||
| 25 | static struct address_space_operations swap_aops = { | ||
| 26 | .writepage = swap_writepage, | ||
| 27 | .sync_page = block_sync_page, | ||
| 28 | .set_page_dirty = __set_page_dirty_nobuffers, | ||
| 29 | }; | ||
| 30 | |||
| 31 | static struct backing_dev_info swap_backing_dev_info = { | ||
| 32 | .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, | ||
| 33 | .unplug_io_fn = swap_unplug_io_fn, | ||
| 34 | }; | ||
| 35 | |||
| 36 | struct address_space swapper_space = { | ||
| 37 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), /* keyed by swp_entry_t.val in __add_to_swap_cache() */ | ||
| 38 | .tree_lock = RW_LOCK_UNLOCKED, /* taken with write_lock_irq() around insertion */ | ||
| 39 | .a_ops = &swap_aops, | ||
| 40 | .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), | ||
| 41 | .backing_dev_info = &swap_backing_dev_info, | ||
| 42 | }; | ||
| 43 | EXPORT_SYMBOL(swapper_space); | ||
| 44 | |||
| 45 | #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) /* plain, non-atomic increment of one counter */ | ||
| 46 | |||
| 47 | static struct { /* counters printed by show_swap_cache_info() */ | ||
| 48 | unsigned long add_total; | ||
| 49 | unsigned long del_total; | ||
| 50 | unsigned long find_success; | ||
| 51 | unsigned long find_total; | ||
| 52 | unsigned long noent_race; /* bumped when swap_duplicate() fails in add_to_swap_cache() */ | ||
| 53 | unsigned long exist_race; | ||
| 54 | } swap_cache_info; | ||
| 55 | |||
| 56 | void show_swap_cache_info(void) | ||
| 57 | { | ||
| 58 | printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n", | ||
| 59 | swap_cache_info.add_total, swap_cache_info.del_total, | ||
| 60 | swap_cache_info.find_success, swap_cache_info.find_total, | ||
| 61 | swap_cache_info.noent_race, swap_cache_info.exist_race); | ||
| 62 | printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); | ||
| 63 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); | ||
| 64 | } | ||
| 65 | |||
| 66 | /* | ||
| 67 | * __add_to_swap_cache resembles add_to_page_cache on swapper_space, | ||
| 68 | * but sets SwapCache flag and private instead of mapping and index. | ||
| 69 | */ | ||
| 70 | static int __add_to_swap_cache(struct page *page, | ||
| 71 | swp_entry_t entry, int gfp_mask) | ||
| 72 | { | ||
| 73 | int error; | ||
| 74 | |||
| 75 | BUG_ON(PageSwapCache(page)); | ||
| 76 | BUG_ON(PagePrivate(page)); | ||
| 77 | error = radix_tree_preload(gfp_mask); | ||
| 78 | if (!error) { | ||
| 79 | write_lock_irq(&swapper_space.tree_lock); | ||
| 80 | error = radix_tree_insert(&swapper_space.page_tree, | ||
| 81 | entry.val, page); | ||
| 82 | if (!error) { | ||
| 83 | page_cache_get(page); | ||
| 84 | SetPageLocked(page); | ||
| 85 | SetPageSwapCache(page); | ||
| 86 | page->private = entry.val; | ||
| 87 | total_swapcache_pages++; | ||
| 88 | pagecache_acct(1); | ||
| 89 | } | ||
| 90 | write_unlock_irq(&swapper_space.tree_lock); | ||
| 91 | radix_tree_preload_end(); | ||
| 92 | } | ||
| 93 | return error; | ||
| 94 | } | ||
| 95 | |||
| 96 | static int add_to_swap_cache(struct page *page, swp_entry_t entry) | ||
| 97 | { | ||
| 98 | int error; | ||
| 99 | |||
| 100 | if (!swap_duplicate(entry)) { | ||
| 101 | INC_CACHE_INFO(noent_race); | ||
| 102 | return -ENOENT; | ||
| 103 | } | ||
| 104 | error = __add_to_swap_cache(page, entry, GFP_KERNEL); | ||
| 105 | /* | ||
| 106 | * Anon pages are already on the LRU, we don't run lru_cache_add here. | ||
| 107 | */ | ||
| 108 | if (error) { | ||
| 109 | swap_free(entry); | ||
| 110 | if (error == -EEXIST) | ||
| 111 | INC_CACHE_INFO(exist_race); | ||
| 112 | return error; | ||
| 113 | } | ||
| 114 | INC_CACHE_INFO(add_total); | ||
| 115 | return 0; | ||
| 116 | } | ||
| 117 | |||
| 118 | /* | ||
| 119 | * This must be called only on pages that have | ||
| 120 | * been verified to be in the swap cache. | ||
| 121 | */ | ||
| 122 | void __delete_from_swap_cache(struct page *page) | ||
| 123 | { | ||
| 124 | BUG_ON(!PageLocked(page)); | ||
| 125 | BUG_ON(!PageSwapCache(page)); | ||
| 126 | BUG_ON(PageWriteback(page)); | ||
| 127 | |||
| 128 | radix_tree_delete(&swapper_space.page_tree, page->private); | ||
| 129 | page->private = 0; | ||
| 130 | ClearPageSwapCache(page); | ||
| 131 | total_swapcache_pages--; | ||
| 132 | pagecache_acct(-1); | ||
| 133 | INC_CACHE_INFO(del_total); | ||
| 134 | } | ||
| 135 | |||
| 136 | /** | ||
| 137 | * add_to_swap - allocate swap space for a page | ||
| 138 | * @page: page we want to move to swap | ||
| 139 | * | ||
| 140 | * Allocate swap space for the page and add the page to the | ||
| 141 | * swap cache. Caller needs to hold the page lock. | ||
| 142 | */ | ||
| 143 | int add_to_swap(struct page * page) | ||
| 144 | { | ||
| 145 | swp_entry_t entry; | ||
| 146 | int pf_flags; | ||
| 147 | int err; | ||
| 148 | |||
| 149 | if (!PageLocked(page)) | ||
| 150 | BUG(); | ||
| 151 | |||
| 152 | for (;;) { | ||
| 153 | entry = get_swap_page(); | ||
| 154 | if (!entry.val) | ||
| 155 | return 0; | ||
| 156 | |||
| 157 | /* Radix-tree node allocations are performing | ||
| 158 | * GFP_ATOMIC allocations under PF_MEMALLOC. | ||
| 159 | * They can completely exhaust the page allocator. | ||
| 160 | * | ||
| 161 | * So PF_MEMALLOC is dropped here. This causes the slab | ||
| 162 | * allocations to fail earlier, so radix-tree nodes will | ||
| 163 | * then be allocated from the mempool reserves. | ||
| 164 | * | ||
| 165 | * We're still using __GFP_HIGH for radix-tree node | ||
| 166 | * allocations, so some of the emergency pools are available, | ||
| 167 | * just not all of them. | ||
| 168 | */ | ||
| 169 | |||
| 170 | pf_flags = current->flags; | ||
| 171 | current->flags &= ~PF_MEMALLOC; | ||
| 172 | |||
| 173 | /* | ||
| 174 | * Add it to the swap cache and mark it dirty | ||
| 175 | */ | ||
| 176 | err = __add_to_swap_cache(page, entry, GFP_ATOMIC|__GFP_NOWARN); | ||
| 177 | |||
| 178 | if (pf_flags & PF_MEMALLOC) | ||
| 179 | current->flags |= PF_MEMALLOC; | ||
| 180 | |||
| 181 | switch (err) { | ||
| 182 | case 0: /* Success */ | ||
| 183 | SetPageUptodate(page); | ||
| 184 | SetPageDirty(page); | ||
| 185 | INC_CACHE_INFO(add_total); | ||
| 186 | return 1; | ||
| 187 | case -EEXIST: | ||
| 188 | /* Raced with "speculative" read_swap_cache_async */ | ||
| 189 | INC_CACHE_INFO(exist_race); | ||
| 190 | swap_free(entry); | ||
| 191 | continue; | ||
| 192 | default: | ||
| 193 | /* -ENOMEM radix-tree allocation failure */ | ||
| 194 | swap_free(entry); | ||
| 195 | return 0; | ||
| 196 | } | ||
| 197 | } | ||
| 198 | } | ||
| 199 | |||
| 200 | /* | ||
| 201 | * This must be called only on pages that have | ||
| 202 | * been verified to be in the swap cache and locked. | ||
| 203 | * It will never put the page into the free list, | ||
| 204 | * the caller has a reference on the page. | ||
| 205 | */ | ||
| 206 | void delete_from_swap_cache(struct page *page) | ||
| 207 | { | ||
| 208 | swp_entry_t entry; | ||
| 209 | |||
| 210 | BUG_ON(!PageSwapCache(page)); | ||
| 211 | BUG_ON(!PageLocked(page)); | ||
| 212 | BUG_ON(PageWriteback(page)); | ||
| 213 | BUG_ON(PagePrivate(page)); | ||
| 214 | |||
| 215 | entry.val = page->private; | ||
| 216 | |||
| 217 | write_lock_irq(&swapper_space.tree_lock); | ||
| 218 | __delete_from_swap_cache(page); | ||
| 219 | write_unlock_irq(&swapper_space.tree_lock); | ||
| 220 | |||
| 221 | swap_free(entry); | ||
| 222 | page_cache_release(page); | ||
| 223 | } | ||
| 224 | |||
| 225 | /* | ||
| 226 | * Strange swizzling function only for use by shmem_writepage | ||
| 227 | */ | ||
| 228 | int move_to_swap_cache(struct page *page, swp_entry_t entry) | ||
| 229 | { | ||
| 230 | int err = __add_to_swap_cache(page, entry, GFP_ATOMIC); | ||
| 231 | if (!err) { | ||
| 232 | remove_from_page_cache(page); | ||
| 233 | page_cache_release(page); /* pagecache ref */ | ||
| 234 | if (!swap_duplicate(entry)) | ||
| 235 | BUG(); | ||
| 236 | SetPageDirty(page); | ||
| 237 | INC_CACHE_INFO(add_total); | ||
| 238 | } else if (err == -EEXIST) | ||
| 239 | INC_CACHE_INFO(exist_race); | ||
| 240 | return err; | ||
| 241 | } | ||
| 242 | |||
| 243 | /* | ||
| 244 | * Strange swizzling function for shmem_getpage (and shmem_unuse) | ||
| 245 | */ | ||
| 246 | int move_from_swap_cache(struct page *page, unsigned long index, | ||
| 247 | struct address_space *mapping) | ||
| 248 | { | ||
| 249 | int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC); | ||
| 250 | if (!err) { | ||
| 251 | delete_from_swap_cache(page); | ||
| 252 | /* shift page from clean_pages to dirty_pages list */ | ||
| 253 | ClearPageDirty(page); | ||
| 254 | set_page_dirty(page); | ||
| 255 | } | ||
| 256 | return err; | ||
| 257 | } | ||
| 258 | |||
/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It is OK to test PageSwapCache without the page lock here, because
 * remove_exclusive_swap_page() tests it again with the lock held.
 * The page lock is only try-locked; if that fails nothing is done.
 */
static inline void free_swap_cache(struct page *page)
{
	if (!PageSwapCache(page))
		return;
	if (TestSetPageLocked(page))
		return;		/* somebody else holds the page lock */

	remove_exclusive_swap_page(page);
	unlock_page(page);
}
| 274 | |||
/*
 * Release one reference on the page, first attempting to drop its swap
 * cache entry if this is the last user of the page.  We cannot do a
 * lock_page here, as we are holding the page_table_lock spinlock;
 * free_swap_cache() only try-locks.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	page_cache_release(page);
}
| 285 | |||
/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 * The array is processed in batches of at most 16 pages.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	int chunk = 16;
	int batch;

	lru_add_drain();
	for (; nr; nr -= batch, pages += batch) {
		int idx;

		batch = nr < chunk ? nr : chunk;
		for (idx = 0; idx < batch; idx++)
			free_swap_cache(pages[idx]);
		release_pages(pages, batch, 0);
	}
}
| 307 | |||
| 308 | /* | ||
| 309 | * Lookup a swap entry in the swap cache. A found page will be returned | ||
| 310 | * unlocked and with its refcount incremented - we rely on the kernel | ||
| 311 | * lock getting page table operations atomic even if we drop the page | ||
| 312 | * lock before returning. | ||
| 313 | */ | ||
| 314 | struct page * lookup_swap_cache(swp_entry_t entry) | ||
| 315 | { | ||
| 316 | struct page *page; | ||
| 317 | |||
| 318 | page = find_get_page(&swapper_space, entry.val); | ||
| 319 | |||
| 320 | if (page) | ||
| 321 | INC_CACHE_INFO(find_success); | ||
| 322 | |||
| 323 | INC_CACHE_INFO(find_total); | ||
| 324 | return page; | ||
| 325 | } | ||
| 326 | |||
| 327 | /* | ||
| 328 | * Locate a page of swap in physical memory, reserving swap cache space | ||
| 329 | * and reading the disk if it is not already cached. | ||
| 330 | * A failure return means that either the page allocation failed or that | ||
| 331 | * the swap entry is no longer in use. | ||
| 332 | */ | ||
| 333 | struct page *read_swap_cache_async(swp_entry_t entry, | ||
| 334 | struct vm_area_struct *vma, unsigned long addr) | ||
| 335 | { | ||
| 336 | struct page *found_page, *new_page = NULL; | ||
| 337 | int err; | ||
| 338 | |||
| 339 | do { | ||
| 340 | /* | ||
| 341 | * First check the swap cache. Since this is normally | ||
| 342 | * called after lookup_swap_cache() failed, re-calling | ||
| 343 | * that would confuse statistics. | ||
| 344 | */ | ||
| 345 | found_page = find_get_page(&swapper_space, entry.val); | ||
| 346 | if (found_page) | ||
| 347 | break; | ||
| 348 | |||
| 349 | /* | ||
| 350 | * Get a new page to read into from swap. | ||
| 351 | */ | ||
| 352 | if (!new_page) { | ||
| 353 | new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr); | ||
| 354 | if (!new_page) | ||
| 355 | break; /* Out of memory */ | ||
| 356 | } | ||
| 357 | |||
| 358 | /* | ||
| 359 | * Associate the page with swap entry in the swap cache. | ||
| 360 | * May fail (-ENOENT) if swap entry has been freed since | ||
| 361 | * our caller observed it. May fail (-EEXIST) if there | ||
| 362 | * is already a page associated with this entry in the | ||
| 363 | * swap cache: added by a racing read_swap_cache_async, | ||
| 364 | * or by try_to_swap_out (or shmem_writepage) re-using | ||
| 365 | * the just freed swap entry for an existing page. | ||
| 366 | * May fail (-ENOMEM) if radix-tree node allocation failed. | ||
| 367 | */ | ||
| 368 | err = add_to_swap_cache(new_page, entry); | ||
| 369 | if (!err) { | ||
| 370 | /* | ||
| 371 | * Initiate read into locked page and return. | ||
| 372 | */ | ||
| 373 | lru_cache_add_active(new_page); | ||
| 374 | swap_readpage(NULL, new_page); | ||
| 375 | return new_page; | ||
| 376 | } | ||
| 377 | } while (err != -ENOENT && err != -ENOMEM); | ||
| 378 | |||
| 379 | if (new_page) | ||
| 380 | page_cache_release(new_page); | ||
| 381 | return found_page; | ||
| 382 | } | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c new file mode 100644 index 000000000000..a60e0075d55b --- /dev/null +++ b/mm/swapfile.c | |||
| @@ -0,0 +1,1672 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/swapfile.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | ||
| 5 | * Swap reorganised 29.12.95, Stephen Tweedie | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <linux/config.h> | ||
| 9 | #include <linux/mm.h> | ||
| 10 | #include <linux/hugetlb.h> | ||
| 11 | #include <linux/mman.h> | ||
| 12 | #include <linux/slab.h> | ||
| 13 | #include <linux/kernel_stat.h> | ||
| 14 | #include <linux/swap.h> | ||
| 15 | #include <linux/vmalloc.h> | ||
| 16 | #include <linux/pagemap.h> | ||
| 17 | #include <linux/namei.h> | ||
| 18 | #include <linux/shm.h> | ||
| 19 | #include <linux/blkdev.h> | ||
| 20 | #include <linux/writeback.h> | ||
| 21 | #include <linux/proc_fs.h> | ||
| 22 | #include <linux/seq_file.h> | ||
| 23 | #include <linux/init.h> | ||
| 24 | #include <linux/module.h> | ||
| 25 | #include <linux/rmap.h> | ||
| 26 | #include <linux/security.h> | ||
| 27 | #include <linux/backing-dev.h> | ||
| 28 | #include <linux/syscalls.h> | ||
| 29 | |||
| 30 | #include <asm/pgtable.h> | ||
| 31 | #include <asm/tlbflush.h> | ||
| 32 | #include <linux/swapops.h> | ||
| 33 | |||
| 34 | DEFINE_SPINLOCK(swaplock); | ||
| 35 | unsigned int nr_swapfiles; | ||
| 36 | long total_swap_pages; | ||
| 37 | static int swap_overflow; | ||
| 38 | |||
| 39 | EXPORT_SYMBOL(total_swap_pages); | ||
| 40 | |||
| 41 | static const char Bad_file[] = "Bad swap file entry "; | ||
| 42 | static const char Unused_file[] = "Unused swap file entry "; | ||
| 43 | static const char Bad_offset[] = "Bad swap offset entry "; | ||
| 44 | static const char Unused_offset[] = "Unused swap offset entry "; | ||
| 45 | |||
| 46 | struct swap_list_t swap_list = {-1, -1}; | ||
| 47 | |||
| 48 | struct swap_info_struct swap_info[MAX_SWAPFILES]; | ||
| 49 | |||
| 50 | static DECLARE_MUTEX(swapon_sem); | ||
| 51 | |||
| 52 | /* | ||
| 53 | * We need this because the bdev->unplug_fn can sleep and we cannot | ||
| 54 | * hold swap_list_lock while calling the unplug_fn. And swap_list_lock | ||
| 55 | * cannot be turned into a semaphore. | ||
| 56 | */ | ||
| 57 | static DECLARE_RWSEM(swap_unplug_sem); | ||
| 58 | |||
| 59 | #define SWAPFILE_CLUSTER 256 | ||
| 60 | |||
| 61 | void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | ||
| 62 | { | ||
| 63 | swp_entry_t entry; | ||
| 64 | |||
| 65 | down_read(&swap_unplug_sem); | ||
| 66 | entry.val = page->private; | ||
| 67 | if (PageSwapCache(page)) { | ||
| 68 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; | ||
| 69 | struct backing_dev_info *bdi; | ||
| 70 | |||
| 71 | /* | ||
| 72 | * If the page is removed from swapcache from under us (with a | ||
| 73 | * racy try_to_unuse/swapoff) we need an additional reference | ||
| 74 | * count to avoid reading garbage from page->private above. If | ||
| 75 | * the WARN_ON triggers during a swapoff it maybe the race | ||
| 76 | * condition and it's harmless. However if it triggers without | ||
| 77 | * swapoff it signals a problem. | ||
| 78 | */ | ||
| 79 | WARN_ON(page_count(page) <= 1); | ||
| 80 | |||
| 81 | bdi = bdev->bd_inode->i_mapping->backing_dev_info; | ||
| 82 | bdi->unplug_io_fn(bdi, page); | ||
| 83 | } | ||
| 84 | up_read(&swap_unplug_sem); | ||
| 85 | } | ||
| 86 | |||
| 87 | static inline int scan_swap_map(struct swap_info_struct *si) | ||
| 88 | { | ||
| 89 | unsigned long offset; | ||
| 90 | /* | ||
| 91 | * We try to cluster swap pages by allocating them | ||
| 92 | * sequentially in swap. Once we've allocated | ||
| 93 | * SWAPFILE_CLUSTER pages this way, however, we resort to | ||
| 94 | * first-free allocation, starting a new cluster. This | ||
| 95 | * prevents us from scattering swap pages all over the entire | ||
| 96 | * swap partition, so that we reduce overall disk seek times | ||
| 97 | * between swap pages. -- sct */ | ||
| 98 | if (si->cluster_nr) { | ||
| 99 | while (si->cluster_next <= si->highest_bit) { | ||
| 100 | offset = si->cluster_next++; | ||
| 101 | if (si->swap_map[offset]) | ||
| 102 | continue; | ||
| 103 | si->cluster_nr--; | ||
| 104 | goto got_page; | ||
| 105 | } | ||
| 106 | } | ||
| 107 | si->cluster_nr = SWAPFILE_CLUSTER; | ||
| 108 | |||
| 109 | /* try to find an empty (even not aligned) cluster. */ | ||
| 110 | offset = si->lowest_bit; | ||
| 111 | check_next_cluster: | ||
| 112 | if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) | ||
| 113 | { | ||
| 114 | unsigned long nr; | ||
| 115 | for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) | ||
| 116 | if (si->swap_map[nr]) | ||
| 117 | { | ||
| 118 | offset = nr+1; | ||
| 119 | goto check_next_cluster; | ||
| 120 | } | ||
| 121 | /* We found a completly empty cluster, so start | ||
| 122 | * using it. | ||
| 123 | */ | ||
| 124 | goto got_page; | ||
| 125 | } | ||
| 126 | /* No luck, so now go finegrined as usual. -Andrea */ | ||
| 127 | for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { | ||
| 128 | if (si->swap_map[offset]) | ||
| 129 | continue; | ||
| 130 | si->lowest_bit = offset+1; | ||
| 131 | got_page: | ||
| 132 | if (offset == si->lowest_bit) | ||
| 133 | si->lowest_bit++; | ||
| 134 | if (offset == si->highest_bit) | ||
| 135 | si->highest_bit--; | ||
| 136 | if (si->lowest_bit > si->highest_bit) { | ||
| 137 | si->lowest_bit = si->max; | ||
| 138 | si->highest_bit = 0; | ||
| 139 | } | ||
| 140 | si->swap_map[offset] = 1; | ||
| 141 | si->inuse_pages++; | ||
| 142 | nr_swap_pages--; | ||
| 143 | si->cluster_next = offset+1; | ||
| 144 | return offset; | ||
| 145 | } | ||
| 146 | si->lowest_bit = si->max; | ||
| 147 | si->highest_bit = 0; | ||
| 148 | return 0; | ||
| 149 | } | ||
| 150 | |||
| 151 | swp_entry_t get_swap_page(void) | ||
| 152 | { | ||
| 153 | struct swap_info_struct * p; | ||
| 154 | unsigned long offset; | ||
| 155 | swp_entry_t entry; | ||
| 156 | int type, wrapped = 0; | ||
| 157 | |||
| 158 | entry.val = 0; /* Out of memory */ | ||
| 159 | swap_list_lock(); | ||
| 160 | type = swap_list.next; | ||
| 161 | if (type < 0) | ||
| 162 | goto out; | ||
| 163 | if (nr_swap_pages <= 0) | ||
| 164 | goto out; | ||
| 165 | |||
| 166 | while (1) { | ||
| 167 | p = &swap_info[type]; | ||
| 168 | if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { | ||
| 169 | swap_device_lock(p); | ||
| 170 | offset = scan_swap_map(p); | ||
| 171 | swap_device_unlock(p); | ||
| 172 | if (offset) { | ||
| 173 | entry = swp_entry(type,offset); | ||
| 174 | type = swap_info[type].next; | ||
| 175 | if (type < 0 || | ||
| 176 | p->prio != swap_info[type].prio) { | ||
| 177 | swap_list.next = swap_list.head; | ||
| 178 | } else { | ||
| 179 | swap_list.next = type; | ||
| 180 | } | ||
| 181 | goto out; | ||
| 182 | } | ||
| 183 | } | ||
| 184 | type = p->next; | ||
| 185 | if (!wrapped) { | ||
| 186 | if (type < 0 || p->prio != swap_info[type].prio) { | ||
| 187 | type = swap_list.head; | ||
| 188 | wrapped = 1; | ||
| 189 | } | ||
| 190 | } else | ||
| 191 | if (type < 0) | ||
| 192 | goto out; /* out of swap space */ | ||
| 193 | } | ||
| 194 | out: | ||
| 195 | swap_list_unlock(); | ||
| 196 | return entry; | ||
| 197 | } | ||
| 198 | |||
| 199 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) | ||
| 200 | { | ||
| 201 | struct swap_info_struct * p; | ||
| 202 | unsigned long offset, type; | ||
| 203 | |||
| 204 | if (!entry.val) | ||
| 205 | goto out; | ||
| 206 | type = swp_type(entry); | ||
| 207 | if (type >= nr_swapfiles) | ||
| 208 | goto bad_nofile; | ||
| 209 | p = & swap_info[type]; | ||
| 210 | if (!(p->flags & SWP_USED)) | ||
| 211 | goto bad_device; | ||
| 212 | offset = swp_offset(entry); | ||
| 213 | if (offset >= p->max) | ||
| 214 | goto bad_offset; | ||
| 215 | if (!p->swap_map[offset]) | ||
| 216 | goto bad_free; | ||
| 217 | swap_list_lock(); | ||
| 218 | if (p->prio > swap_info[swap_list.next].prio) | ||
| 219 | swap_list.next = type; | ||
| 220 | swap_device_lock(p); | ||
| 221 | return p; | ||
| 222 | |||
| 223 | bad_free: | ||
| 224 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); | ||
| 225 | goto out; | ||
| 226 | bad_offset: | ||
| 227 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); | ||
| 228 | goto out; | ||
| 229 | bad_device: | ||
| 230 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); | ||
| 231 | goto out; | ||
| 232 | bad_nofile: | ||
| 233 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); | ||
| 234 | out: | ||
| 235 | return NULL; | ||
| 236 | } | ||
| 237 | |||
/*
 * Counterpart of swap_info_get(): drop the device lock, then the swap
 * list lock (the reverse of the order in which they were taken).
 */
static void swap_info_put(struct swap_info_struct * p)
{
	swap_device_unlock(p);
	swap_list_unlock();
}
| 243 | |||
| 244 | static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) | ||
| 245 | { | ||
| 246 | int count = p->swap_map[offset]; | ||
| 247 | |||
| 248 | if (count < SWAP_MAP_MAX) { | ||
| 249 | count--; | ||
| 250 | p->swap_map[offset] = count; | ||
| 251 | if (!count) { | ||
| 252 | if (offset < p->lowest_bit) | ||
| 253 | p->lowest_bit = offset; | ||
| 254 | if (offset > p->highest_bit) | ||
| 255 | p->highest_bit = offset; | ||
| 256 | nr_swap_pages++; | ||
| 257 | p->inuse_pages--; | ||
| 258 | } | ||
| 259 | } | ||
| 260 | return count; | ||
| 261 | } | ||
| 262 | |||
| 263 | /* | ||
| 264 | * Caller has made sure that the swapdevice corresponding to entry | ||
| 265 | * is still around or has not been recycled. | ||
| 266 | */ | ||
| 267 | void swap_free(swp_entry_t entry) | ||
| 268 | { | ||
| 269 | struct swap_info_struct * p; | ||
| 270 | |||
| 271 | p = swap_info_get(entry); | ||
| 272 | if (p) { | ||
| 273 | swap_entry_free(p, swp_offset(entry)); | ||
| 274 | swap_info_put(p); | ||
| 275 | } | ||
| 276 | } | ||
| 277 | |||
| 278 | /* | ||
| 279 | * Check if we're the only user of a swap page, | ||
| 280 | * when the page is locked. | ||
| 281 | */ | ||
| 282 | static int exclusive_swap_page(struct page *page) | ||
| 283 | { | ||
| 284 | int retval = 0; | ||
| 285 | struct swap_info_struct * p; | ||
| 286 | swp_entry_t entry; | ||
| 287 | |||
| 288 | entry.val = page->private; | ||
| 289 | p = swap_info_get(entry); | ||
| 290 | if (p) { | ||
| 291 | /* Is the only swap cache user the cache itself? */ | ||
| 292 | if (p->swap_map[swp_offset(entry)] == 1) { | ||
| 293 | /* Recheck the page count with the swapcache lock held.. */ | ||
| 294 | write_lock_irq(&swapper_space.tree_lock); | ||
| 295 | if (page_count(page) == 2) | ||
| 296 | retval = 1; | ||
| 297 | write_unlock_irq(&swapper_space.tree_lock); | ||
| 298 | } | ||
| 299 | swap_info_put(p); | ||
| 300 | } | ||
| 301 | return retval; | ||
| 302 | } | ||
| 303 | |||
/*
 * We can use this swap cache entry directly
 * if there are no other references to it.
 *
 * Here "exclusive_swap_page()" does the real
 * work, but we opportunistically check whether
 * we need to get all the locks first..
 *
 * The page must be locked.  Returns 1 if the page may be reused, else 0.
 */
int can_share_swap_page(struct page *page)
{
	int count;

	BUG_ON(!PageLocked(page));

	count = page_count(page);

	/* A single reference: reusable unless the page is reserved. */
	if (count == 1)
		return !PageReserved(page);

	/* A third reference is only tolerated when PagePrivate is set. */
	if (count == 3 && !PagePrivate(page))
		return 0;
	if (count != 2 && count != 3)
		return 0;

	if (!PageSwapCache(page))
		return 0;
	return exclusive_swap_page(page);
}
| 335 | |||
| 336 | /* | ||
| 337 | * Work out if there are any other processes sharing this | ||
| 338 | * swap cache page. Free it if you can. Return success. | ||
| 339 | */ | ||
| 340 | int remove_exclusive_swap_page(struct page *page) | ||
| 341 | { | ||
| 342 | int retval; | ||
| 343 | struct swap_info_struct * p; | ||
| 344 | swp_entry_t entry; | ||
| 345 | |||
| 346 | BUG_ON(PagePrivate(page)); | ||
| 347 | BUG_ON(!PageLocked(page)); | ||
| 348 | |||
| 349 | if (!PageSwapCache(page)) | ||
| 350 | return 0; | ||
| 351 | if (PageWriteback(page)) | ||
| 352 | return 0; | ||
| 353 | if (page_count(page) != 2) /* 2: us + cache */ | ||
| 354 | return 0; | ||
| 355 | |||
| 356 | entry.val = page->private; | ||
| 357 | p = swap_info_get(entry); | ||
| 358 | if (!p) | ||
| 359 | return 0; | ||
| 360 | |||
| 361 | /* Is the only swap cache user the cache itself? */ | ||
| 362 | retval = 0; | ||
| 363 | if (p->swap_map[swp_offset(entry)] == 1) { | ||
| 364 | /* Recheck the page count with the swapcache lock held.. */ | ||
| 365 | write_lock_irq(&swapper_space.tree_lock); | ||
| 366 | if ((page_count(page) == 2) && !PageWriteback(page)) { | ||
| 367 | __delete_from_swap_cache(page); | ||
| 368 | SetPageDirty(page); | ||
| 369 | retval = 1; | ||
| 370 | } | ||
| 371 | write_unlock_irq(&swapper_space.tree_lock); | ||
| 372 | } | ||
| 373 | swap_info_put(p); | ||
| 374 | |||
| 375 | if (retval) { | ||
| 376 | swap_free(entry); | ||
| 377 | page_cache_release(page); | ||
| 378 | } | ||
| 379 | |||
| 380 | return retval; | ||
| 381 | } | ||
| 382 | |||
| 383 | /* | ||
| 384 | * Free the swap entry like above, but also try to | ||
| 385 | * free the page cache entry if it is the last user. | ||
| 386 | */ | ||
| 387 | void free_swap_and_cache(swp_entry_t entry) | ||
| 388 | { | ||
| 389 | struct swap_info_struct * p; | ||
| 390 | struct page *page = NULL; | ||
| 391 | |||
| 392 | p = swap_info_get(entry); | ||
| 393 | if (p) { | ||
| 394 | if (swap_entry_free(p, swp_offset(entry)) == 1) | ||
| 395 | page = find_trylock_page(&swapper_space, entry.val); | ||
| 396 | swap_info_put(p); | ||
| 397 | } | ||
| 398 | if (page) { | ||
| 399 | int one_user; | ||
| 400 | |||
| 401 | BUG_ON(PagePrivate(page)); | ||
| 402 | page_cache_get(page); | ||
| 403 | one_user = (page_count(page) == 2); | ||
| 404 | /* Only cache user (+us), or swap space full? Free it! */ | ||
| 405 | if (!PageWriteback(page) && (one_user || vm_swap_full())) { | ||
| 406 | delete_from_swap_cache(page); | ||
| 407 | SetPageDirty(page); | ||
| 408 | } | ||
| 409 | unlock_page(page); | ||
| 410 | page_cache_release(page); | ||
| 411 | } | ||
| 412 | } | ||
| 413 | |||
| 414 | /* | ||
| 415 | * Always set the resulting pte to be nowrite (the same as COW pages | ||
| 416 | * after one process has exited). We don't know just how many PTEs will | ||
| 417 | * share this swap entry, so be cautious and let do_wp_page work out | ||
| 418 | * what to do if a write is requested later. | ||
| 419 | * | ||
| 420 | * vma->vm_mm->page_table_lock is held. | ||
| 421 | */ | ||
| 422 | static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, | ||
| 423 | unsigned long addr, swp_entry_t entry, struct page *page) | ||
| 424 | { | ||
| 425 | inc_mm_counter(vma->vm_mm, rss); | ||
| 426 | get_page(page); | ||
| 427 | set_pte_at(vma->vm_mm, addr, pte, | ||
| 428 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | ||
| 429 | page_add_anon_rmap(page, vma, addr); | ||
| 430 | swap_free(entry); | ||
| 431 | /* | ||
| 432 | * Move the page to the active list so it is not | ||
| 433 | * immediately swapped out again after swapon. | ||
| 434 | */ | ||
| 435 | activate_page(page); | ||
| 436 | } | ||
| 437 | |||
| 438 | static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | ||
| 439 | unsigned long addr, unsigned long end, | ||
| 440 | swp_entry_t entry, struct page *page) | ||
| 441 | { | ||
| 442 | pte_t *pte; | ||
| 443 | pte_t swp_pte = swp_entry_to_pte(entry); | ||
| 444 | |||
| 445 | pte = pte_offset_map(pmd, addr); | ||
| 446 | do { | ||
| 447 | /* | ||
| 448 | * swapoff spends a _lot_ of time in this loop! | ||
| 449 | * Test inline before going to call unuse_pte. | ||
| 450 | */ | ||
| 451 | if (unlikely(pte_same(*pte, swp_pte))) { | ||
| 452 | unuse_pte(vma, pte, addr, entry, page); | ||
| 453 | pte_unmap(pte); | ||
| 454 | return 1; | ||
| 455 | } | ||
| 456 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
| 457 | pte_unmap(pte - 1); | ||
| 458 | return 0; | ||
| 459 | } | ||
| 460 | |||
| 461 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | ||
| 462 | unsigned long addr, unsigned long end, | ||
| 463 | swp_entry_t entry, struct page *page) | ||
| 464 | { | ||
| 465 | pmd_t *pmd; | ||
| 466 | unsigned long next; | ||
| 467 | |||
| 468 | pmd = pmd_offset(pud, addr); | ||
| 469 | do { | ||
| 470 | next = pmd_addr_end(addr, end); | ||
| 471 | if (pmd_none_or_clear_bad(pmd)) | ||
| 472 | continue; | ||
| 473 | if (unuse_pte_range(vma, pmd, addr, next, entry, page)) | ||
| 474 | return 1; | ||
| 475 | } while (pmd++, addr = next, addr != end); | ||
| 476 | return 0; | ||
| 477 | } | ||
| 478 | |||
| 479 | static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | ||
| 480 | unsigned long addr, unsigned long end, | ||
| 481 | swp_entry_t entry, struct page *page) | ||
| 482 | { | ||
| 483 | pud_t *pud; | ||
| 484 | unsigned long next; | ||
| 485 | |||
| 486 | pud = pud_offset(pgd, addr); | ||
| 487 | do { | ||
| 488 | next = pud_addr_end(addr, end); | ||
| 489 | if (pud_none_or_clear_bad(pud)) | ||
| 490 | continue; | ||
| 491 | if (unuse_pmd_range(vma, pud, addr, next, entry, page)) | ||
| 492 | return 1; | ||
| 493 | } while (pud++, addr = next, addr != end); | ||
| 494 | return 0; | ||
| 495 | } | ||
| 496 | |||
| 497 | static int unuse_vma(struct vm_area_struct *vma, | ||
| 498 | swp_entry_t entry, struct page *page) | ||
| 499 | { | ||
| 500 | pgd_t *pgd; | ||
| 501 | unsigned long addr, end, next; | ||
| 502 | |||
| 503 | if (page->mapping) { | ||
| 504 | addr = page_address_in_vma(page, vma); | ||
| 505 | if (addr == -EFAULT) | ||
| 506 | return 0; | ||
| 507 | else | ||
| 508 | end = addr + PAGE_SIZE; | ||
| 509 | } else { | ||
| 510 | addr = vma->vm_start; | ||
| 511 | end = vma->vm_end; | ||
| 512 | } | ||
| 513 | |||
| 514 | pgd = pgd_offset(vma->vm_mm, addr); | ||
| 515 | do { | ||
| 516 | next = pgd_addr_end(addr, end); | ||
| 517 | if (pgd_none_or_clear_bad(pgd)) | ||
| 518 | continue; | ||
| 519 | if (unuse_pud_range(vma, pgd, addr, next, entry, page)) | ||
| 520 | return 1; | ||
| 521 | } while (pgd++, addr = next, addr != end); | ||
| 522 | return 0; | ||
| 523 | } | ||
| 524 | |||
| 525 | static int unuse_mm(struct mm_struct *mm, | ||
| 526 | swp_entry_t entry, struct page *page) | ||
| 527 | { | ||
| 528 | struct vm_area_struct *vma; | ||
| 529 | |||
| 530 | if (!down_read_trylock(&mm->mmap_sem)) { | ||
| 531 | /* | ||
| 532 | * Our reference to the page stops try_to_unmap_one from | ||
| 533 | * unmapping its ptes, so swapoff can make progress. | ||
| 534 | */ | ||
| 535 | unlock_page(page); | ||
| 536 | down_read(&mm->mmap_sem); | ||
| 537 | lock_page(page); | ||
| 538 | } | ||
| 539 | spin_lock(&mm->page_table_lock); | ||
| 540 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
| 541 | if (vma->anon_vma && unuse_vma(vma, entry, page)) | ||
| 542 | break; | ||
| 543 | } | ||
| 544 | spin_unlock(&mm->page_table_lock); | ||
| 545 | up_read(&mm->mmap_sem); | ||
| 546 | /* | ||
| 547 | * Currently unuse_mm cannot fail, but leave error handling | ||
| 548 | * at call sites for now, since we change it from time to time. | ||
| 549 | */ | ||
| 550 | return 0; | ||
| 551 | } | ||
| 552 | |||
| 553 | /* | ||
| 554 | * Scan swap_map from current position to next entry still in use. | ||
| 555 | * Recycle to start on reaching the end, returning 0 when empty. | ||
| 556 | */ | ||
| 557 | static int find_next_to_unuse(struct swap_info_struct *si, int prev) | ||
| 558 | { | ||
| 559 | int max = si->max; | ||
| 560 | int i = prev; | ||
| 561 | int count; | ||
| 562 | |||
| 563 | /* | ||
| 564 | * No need for swap_device_lock(si) here: we're just looking | ||
| 565 | * for whether an entry is in use, not modifying it; false | ||
| 566 | * hits are okay, and sys_swapoff() has already prevented new | ||
| 567 | * allocations from this area (while holding swap_list_lock()). | ||
| 568 | */ | ||
| 569 | for (;;) { | ||
| 570 | if (++i >= max) { | ||
| 571 | if (!prev) { | ||
| 572 | i = 0; | ||
| 573 | break; | ||
| 574 | } | ||
| 575 | /* | ||
| 576 | * No entries in use at top of swap_map, | ||
| 577 | * loop back to start and recheck there. | ||
| 578 | */ | ||
| 579 | max = prev + 1; | ||
| 580 | prev = 0; | ||
| 581 | i = 1; | ||
| 582 | } | ||
| 583 | count = si->swap_map[i]; | ||
| 584 | if (count && count != SWAP_MAP_BAD) | ||
| 585 | break; | ||
| 586 | } | ||
| 587 | return i; | ||
| 588 | } | ||
| 589 | |||
| 590 | /* | ||
| 591 | * We completely avoid races by reading each swap page in advance, | ||
| 592 | * and then search for the process using it. All the necessary | ||
| 593 | * page table adjustments can then be made atomically. | ||
| 594 | */ | ||
| 595 | static int try_to_unuse(unsigned int type) | ||
| 596 | { | ||
| 597 | struct swap_info_struct * si = &swap_info[type]; | ||
| 598 | struct mm_struct *start_mm; | ||
| 599 | unsigned short *swap_map; | ||
| 600 | unsigned short swcount; | ||
| 601 | struct page *page; | ||
| 602 | swp_entry_t entry; | ||
| 603 | int i = 0; | ||
| 604 | int retval = 0; | ||
| 605 | int reset_overflow = 0; | ||
| 606 | int shmem; | ||
| 607 | |||
| 608 | /* | ||
| 609 | * When searching mms for an entry, a good strategy is to | ||
| 610 | * start at the first mm we freed the previous entry from | ||
| 611 | * (though actually we don't notice whether we or coincidence | ||
| 612 | * freed the entry). Initialize this start_mm with a hold. | ||
| 613 | * | ||
| 614 | * A simpler strategy would be to start at the last mm we | ||
| 615 | * freed the previous entry from; but that would take less | ||
| 616 | * advantage of mmlist ordering, which clusters forked mms | ||
| 617 | * together, child after parent. If we race with dup_mmap(), we | ||
| 618 | * prefer to resolve parent before child, lest we miss entries | ||
| 619 | * duplicated after we scanned child: using last mm would invert | ||
| 620 | * that. Though it's only a serious concern when an overflowed | ||
| 621 | * swap count is reset from SWAP_MAP_MAX, preventing a rescan. | ||
| 622 | */ | ||
| 623 | start_mm = &init_mm; | ||
| 624 | atomic_inc(&init_mm.mm_users); | ||
| 625 | |||
| 626 | /* | ||
| 627 | * Keep on scanning until all entries have gone. Usually, | ||
| 628 | * one pass through swap_map is enough, but not necessarily: | ||
| 629 | * there are races when an instance of an entry might be missed. | ||
| 630 | */ | ||
| 631 | while ((i = find_next_to_unuse(si, i)) != 0) { | ||
| 632 | if (signal_pending(current)) { | ||
| 633 | retval = -EINTR; | ||
| 634 | break; | ||
| 635 | } | ||
| 636 | |||
| 637 | /* | ||
| 638 | * Get a page for the entry, using the existing swap | ||
| 639 | * cache page if there is one. Otherwise, get a clean | ||
| 640 | * page and read the swap into it. | ||
| 641 | */ | ||
| 642 | swap_map = &si->swap_map[i]; | ||
| 643 | entry = swp_entry(type, i); | ||
| 644 | page = read_swap_cache_async(entry, NULL, 0); | ||
| 645 | if (!page) { | ||
| 646 | /* | ||
| 647 | * Either swap_duplicate() failed because entry | ||
| 648 | * has been freed independently, and will not be | ||
| 649 | * reused since sys_swapoff() already disabled | ||
| 650 | * allocation from here, or alloc_page() failed. | ||
| 651 | */ | ||
| 652 | if (!*swap_map) | ||
| 653 | continue; | ||
| 654 | retval = -ENOMEM; | ||
| 655 | break; | ||
| 656 | } | ||
| 657 | |||
| 658 | /* | ||
| 659 | * Don't hold on to start_mm if it looks like exiting. | ||
| 660 | */ | ||
| 661 | if (atomic_read(&start_mm->mm_users) == 1) { | ||
| 662 | mmput(start_mm); | ||
| 663 | start_mm = &init_mm; | ||
| 664 | atomic_inc(&init_mm.mm_users); | ||
| 665 | } | ||
| 666 | |||
| 667 | /* | ||
| 668 | * Wait for and lock page. When do_swap_page races with | ||
| 669 | * try_to_unuse, do_swap_page can handle the fault much | ||
| 670 | * faster than try_to_unuse can locate the entry. This | ||
| 671 | * apparently redundant "wait_on_page_locked" lets try_to_unuse | ||
| 672 | * defer to do_swap_page in such a case - in some tests, | ||
| 673 | * do_swap_page and try_to_unuse repeatedly compete. | ||
| 674 | */ | ||
| 675 | wait_on_page_locked(page); | ||
| 676 | wait_on_page_writeback(page); | ||
| 677 | lock_page(page); | ||
| 678 | wait_on_page_writeback(page); | ||
| 679 | |||
| 680 | /* | ||
| 681 | * Remove all references to entry. | ||
| 682 | * Whenever we reach init_mm, there's no address space | ||
| 683 | * to search, but use it as a reminder to search shmem. | ||
| 684 | */ | ||
| 685 | shmem = 0; | ||
| 686 | swcount = *swap_map; | ||
| 687 | if (swcount > 1) { | ||
| 688 | if (start_mm == &init_mm) | ||
| 689 | shmem = shmem_unuse(entry, page); | ||
| 690 | else | ||
| 691 | retval = unuse_mm(start_mm, entry, page); | ||
| 692 | } | ||
| 693 | if (*swap_map > 1) { | ||
| 694 | int set_start_mm = (*swap_map >= swcount); | ||
| 695 | struct list_head *p = &start_mm->mmlist; | ||
| 696 | struct mm_struct *new_start_mm = start_mm; | ||
| 697 | struct mm_struct *prev_mm = start_mm; | ||
| 698 | struct mm_struct *mm; | ||
| 699 | |||
| 700 | atomic_inc(&new_start_mm->mm_users); | ||
| 701 | atomic_inc(&prev_mm->mm_users); | ||
| 702 | spin_lock(&mmlist_lock); | ||
| 703 | while (*swap_map > 1 && !retval && | ||
| 704 | (p = p->next) != &start_mm->mmlist) { | ||
| 705 | mm = list_entry(p, struct mm_struct, mmlist); | ||
| 706 | if (atomic_inc_return(&mm->mm_users) == 1) { | ||
| 707 | atomic_dec(&mm->mm_users); | ||
| 708 | continue; | ||
| 709 | } | ||
| 710 | spin_unlock(&mmlist_lock); | ||
| 711 | mmput(prev_mm); | ||
| 712 | prev_mm = mm; | ||
| 713 | |||
| 714 | cond_resched(); | ||
| 715 | |||
| 716 | swcount = *swap_map; | ||
| 717 | if (swcount <= 1) | ||
| 718 | ; | ||
| 719 | else if (mm == &init_mm) { | ||
| 720 | set_start_mm = 1; | ||
| 721 | shmem = shmem_unuse(entry, page); | ||
| 722 | } else | ||
| 723 | retval = unuse_mm(mm, entry, page); | ||
| 724 | if (set_start_mm && *swap_map < swcount) { | ||
| 725 | mmput(new_start_mm); | ||
| 726 | atomic_inc(&mm->mm_users); | ||
| 727 | new_start_mm = mm; | ||
| 728 | set_start_mm = 0; | ||
| 729 | } | ||
| 730 | spin_lock(&mmlist_lock); | ||
| 731 | } | ||
| 732 | spin_unlock(&mmlist_lock); | ||
| 733 | mmput(prev_mm); | ||
| 734 | mmput(start_mm); | ||
| 735 | start_mm = new_start_mm; | ||
| 736 | } | ||
| 737 | if (retval) { | ||
| 738 | unlock_page(page); | ||
| 739 | page_cache_release(page); | ||
| 740 | break; | ||
| 741 | } | ||
| 742 | |||
| 743 | /* | ||
| 744 | * How could swap count reach 0x7fff when the maximum | ||
| 745 | * pid is 0x7fff, and there's no way to repeat a swap | ||
| 746 | * page within an mm (except in shmem, where it's the | ||
| 747 | * shared object which takes the reference count)? | ||
| 748 | * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. | ||
| 749 | * | ||
| 750 | * If that's wrong, then we should worry more about | ||
| 751 | * exit_mmap() and do_munmap() cases described above: | ||
| 752 | * we might be resetting SWAP_MAP_MAX too early here. | ||
| 753 | * We know "Undead"s can happen, they're okay, so don't | ||
| 754 | * report them; but do report if we reset SWAP_MAP_MAX. | ||
| 755 | */ | ||
| 756 | if (*swap_map == SWAP_MAP_MAX) { | ||
| 757 | swap_device_lock(si); | ||
| 758 | *swap_map = 1; | ||
| 759 | swap_device_unlock(si); | ||
| 760 | reset_overflow = 1; | ||
| 761 | } | ||
| 762 | |||
| 763 | /* | ||
| 764 | * If a reference remains (rare), we would like to leave | ||
| 765 | * the page in the swap cache; but try_to_unmap could | ||
| 766 | * then re-duplicate the entry once we drop page lock, | ||
| 767 | * so we might loop indefinitely; also, that page could | ||
| 768 | * not be swapped out to other storage meanwhile. So: | ||
| 769 | * delete from cache even if there's another reference, | ||
| 770 | * after ensuring that the data has been saved to disk - | ||
| 771 | * since if the reference remains (rarer), it will be | ||
| 772 | * read from disk into another page. Splitting into two | ||
| 773 | * pages would be incorrect if swap supported "shared | ||
| 774 | * private" pages, but they are handled by tmpfs files. | ||
| 775 | * | ||
| 776 | * Note shmem_unuse already deleted a swappage from | ||
| 777 | * the swap cache, unless the move to filepage failed: | ||
| 778 | * in which case it left swappage in cache, lowered its | ||
| 779 | * swap count to pass quickly through the loops above, | ||
| 780 | * and now we must reincrement count to try again later. | ||
| 781 | */ | ||
| 782 | if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { | ||
| 783 | struct writeback_control wbc = { | ||
| 784 | .sync_mode = WB_SYNC_NONE, | ||
| 785 | }; | ||
| 786 | |||
| 787 | swap_writepage(page, &wbc); | ||
| 788 | lock_page(page); | ||
| 789 | wait_on_page_writeback(page); | ||
| 790 | } | ||
| 791 | if (PageSwapCache(page)) { | ||
| 792 | if (shmem) | ||
| 793 | swap_duplicate(entry); | ||
| 794 | else | ||
| 795 | delete_from_swap_cache(page); | ||
| 796 | } | ||
| 797 | |||
| 798 | /* | ||
| 799 | * So we could skip searching mms once swap count went | ||
| 800 | * to 1, we did not mark any present ptes as dirty: must | ||
| 801 | * mark page dirty so shrink_list will preserve it. | ||
| 802 | */ | ||
| 803 | SetPageDirty(page); | ||
| 804 | unlock_page(page); | ||
| 805 | page_cache_release(page); | ||
| 806 | |||
| 807 | /* | ||
| 808 | * Make sure that we aren't completely killing | ||
| 809 | * interactive performance. | ||
| 810 | */ | ||
| 811 | cond_resched(); | ||
| 812 | } | ||
| 813 | |||
| 814 | mmput(start_mm); | ||
| 815 | if (reset_overflow) { | ||
| 816 | printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); | ||
| 817 | swap_overflow = 0; | ||
| 818 | } | ||
| 819 | return retval; | ||
| 820 | } | ||
| 821 | |||
| 822 | /* | ||
| 823 | * After a successful try_to_unuse, if no swap is now in use, we know we | ||
| 824 | * can empty the mmlist. swap_list_lock must be held on entry and exit. | ||
| 825 | * Note that mmlist_lock nests inside swap_list_lock, and an mm must be | ||
| 826 | * added to the mmlist just after page_duplicate - before would be racy. | ||
| 827 | */ | ||
| 828 | static void drain_mmlist(void) | ||
| 829 | { | ||
| 830 | struct list_head *p, *next; | ||
| 831 | unsigned int i; | ||
| 832 | |||
| 833 | for (i = 0; i < nr_swapfiles; i++) | ||
| 834 | if (swap_info[i].inuse_pages) | ||
| 835 | return; | ||
| 836 | spin_lock(&mmlist_lock); | ||
| 837 | list_for_each_safe(p, next, &init_mm.mmlist) | ||
| 838 | list_del_init(p); | ||
| 839 | spin_unlock(&mmlist_lock); | ||
| 840 | } | ||
| 841 | |||
| 842 | /* | ||
| 843 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which | ||
| 844 | * corresponds to page offset `offset'. | ||
| 845 | */ | ||
| 846 | sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | ||
| 847 | { | ||
| 848 | struct swap_extent *se = sis->curr_swap_extent; | ||
| 849 | struct swap_extent *start_se = se; | ||
| 850 | |||
| 851 | for ( ; ; ) { | ||
| 852 | struct list_head *lh; | ||
| 853 | |||
| 854 | if (se->start_page <= offset && | ||
| 855 | offset < (se->start_page + se->nr_pages)) { | ||
| 856 | return se->start_block + (offset - se->start_page); | ||
| 857 | } | ||
| 858 | lh = se->list.prev; | ||
| 859 | if (lh == &sis->extent_list) | ||
| 860 | lh = lh->prev; | ||
| 861 | se = list_entry(lh, struct swap_extent, list); | ||
| 862 | sis->curr_swap_extent = se; | ||
| 863 | BUG_ON(se == start_se); /* It *must* be present */ | ||
| 864 | } | ||
| 865 | } | ||
| 866 | |||
| 867 | /* | ||
| 868 | * Free all of a swapdev's extent information | ||
| 869 | */ | ||
| 870 | static void destroy_swap_extents(struct swap_info_struct *sis) | ||
| 871 | { | ||
| 872 | while (!list_empty(&sis->extent_list)) { | ||
| 873 | struct swap_extent *se; | ||
| 874 | |||
| 875 | se = list_entry(sis->extent_list.next, | ||
| 876 | struct swap_extent, list); | ||
| 877 | list_del(&se->list); | ||
| 878 | kfree(se); | ||
| 879 | } | ||
| 880 | sis->nr_extents = 0; | ||
| 881 | } | ||
| 882 | |||
| 883 | /* | ||
| 884 | * Add a block range (and the corresponding page range) into this swapdev's | ||
| 885 | * extent list. The extent list is kept sorted in block order. | ||
| 886 | * | ||
| 887 | * This function rather assumes that it is called in ascending sector_t order. | ||
| 888 | * It doesn't look for extent coalescing opportunities. | ||
| 889 | */ | ||
| 890 | static int | ||
| 891 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | ||
| 892 | unsigned long nr_pages, sector_t start_block) | ||
| 893 | { | ||
| 894 | struct swap_extent *se; | ||
| 895 | struct swap_extent *new_se; | ||
| 896 | struct list_head *lh; | ||
| 897 | |||
| 898 | lh = sis->extent_list.next; /* The highest-addressed block */ | ||
| 899 | while (lh != &sis->extent_list) { | ||
| 900 | se = list_entry(lh, struct swap_extent, list); | ||
| 901 | if (se->start_block + se->nr_pages == start_block && | ||
| 902 | se->start_page + se->nr_pages == start_page) { | ||
| 903 | /* Merge it */ | ||
| 904 | se->nr_pages += nr_pages; | ||
| 905 | return 0; | ||
| 906 | } | ||
| 907 | lh = lh->next; | ||
| 908 | } | ||
| 909 | |||
| 910 | /* | ||
| 911 | * No merge. Insert a new extent, preserving ordering. | ||
| 912 | */ | ||
| 913 | new_se = kmalloc(sizeof(*se), GFP_KERNEL); | ||
| 914 | if (new_se == NULL) | ||
| 915 | return -ENOMEM; | ||
| 916 | new_se->start_page = start_page; | ||
| 917 | new_se->nr_pages = nr_pages; | ||
| 918 | new_se->start_block = start_block; | ||
| 919 | |||
| 920 | lh = sis->extent_list.prev; /* The lowest block */ | ||
| 921 | while (lh != &sis->extent_list) { | ||
| 922 | se = list_entry(lh, struct swap_extent, list); | ||
| 923 | if (se->start_block > start_block) | ||
| 924 | break; | ||
| 925 | lh = lh->prev; | ||
| 926 | } | ||
| 927 | list_add_tail(&new_se->list, lh); | ||
| 928 | sis->nr_extents++; | ||
| 929 | return 0; | ||
| 930 | } | ||
| 931 | |||
| 932 | /* | ||
| 933 | * A `swap extent' is a simple thing which maps a contiguous range of pages | ||
| 934 | * onto a contiguous range of disk blocks. An ordered list of swap extents | ||
| 935 | * is built at swapon time and is then used at swap_writepage/swap_readpage | ||
| 936 | * time for locating where on disk a page belongs. | ||
| 937 | * | ||
| 938 | * If the swapfile is an S_ISBLK block device, a single extent is installed. | ||
| 939 | * This is done so that the main operating code can treat S_ISBLK and S_ISREG | ||
| 940 | * swap files identically. | ||
| 941 | * | ||
| 942 | * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap | ||
| 943 | * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK | ||
| 944 | * swapfiles are handled *identically* after swapon time. | ||
| 945 | * | ||
| 946 | * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks | ||
| 947 | * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If | ||
| 948 | * some stray blocks are found which do not fall within the PAGE_SIZE alignment | ||
| 949 | * requirements, they are simply tossed out - we will never use those blocks | ||
| 950 | * for swapping. | ||
| 951 | * | ||
| 952 | * For S_ISREG swapfiles we hold i_sem across the life of the swapon. This | ||
| 953 | * prevents root from shooting her foot off by ftruncating an in-use swapfile, | ||
| 954 | * which will scribble on the fs. | ||
| 955 | * | ||
| 956 | * The amount of disk space which a single swap extent represents varies. | ||
| 957 | * Typically it is in the 1-4 megabyte range. So we can have hundreds of | ||
| 958 | * extents in the list. To avoid much list walking, we cache the previous | ||
| 959 | * search location in `curr_swap_extent', and start new searches from there. | ||
| 960 | * This is extremely effective. The average number of iterations in | ||
| 961 | * map_swap_page() has been measured at about 0.3 per page. - akpm. | ||
| 962 | */ | ||
| 963 | static int setup_swap_extents(struct swap_info_struct *sis) | ||
| 964 | { | ||
| 965 | struct inode *inode; | ||
| 966 | unsigned blocks_per_page; | ||
| 967 | unsigned long page_no; | ||
| 968 | unsigned blkbits; | ||
| 969 | sector_t probe_block; | ||
| 970 | sector_t last_block; | ||
| 971 | int ret; | ||
| 972 | |||
| 973 | inode = sis->swap_file->f_mapping->host; | ||
| 974 | if (S_ISBLK(inode->i_mode)) { | ||
| 975 | ret = add_swap_extent(sis, 0, sis->max, 0); | ||
| 976 | goto done; | ||
| 977 | } | ||
| 978 | |||
| 979 | blkbits = inode->i_blkbits; | ||
| 980 | blocks_per_page = PAGE_SIZE >> blkbits; | ||
| 981 | |||
| 982 | /* | ||
| 983 | * Map all the blocks into the extent list. This code doesn't try | ||
| 984 | * to be very smart. | ||
| 985 | */ | ||
| 986 | probe_block = 0; | ||
| 987 | page_no = 0; | ||
| 988 | last_block = i_size_read(inode) >> blkbits; | ||
| 989 | while ((probe_block + blocks_per_page) <= last_block && | ||
| 990 | page_no < sis->max) { | ||
| 991 | unsigned block_in_page; | ||
| 992 | sector_t first_block; | ||
| 993 | |||
| 994 | first_block = bmap(inode, probe_block); | ||
| 995 | if (first_block == 0) | ||
| 996 | goto bad_bmap; | ||
| 997 | |||
| 998 | /* | ||
| 999 | * It must be PAGE_SIZE aligned on-disk | ||
| 1000 | */ | ||
| 1001 | if (first_block & (blocks_per_page - 1)) { | ||
| 1002 | probe_block++; | ||
| 1003 | goto reprobe; | ||
| 1004 | } | ||
| 1005 | |||
| 1006 | for (block_in_page = 1; block_in_page < blocks_per_page; | ||
| 1007 | block_in_page++) { | ||
| 1008 | sector_t block; | ||
| 1009 | |||
| 1010 | block = bmap(inode, probe_block + block_in_page); | ||
| 1011 | if (block == 0) | ||
| 1012 | goto bad_bmap; | ||
| 1013 | if (block != first_block + block_in_page) { | ||
| 1014 | /* Discontiguity */ | ||
| 1015 | probe_block++; | ||
| 1016 | goto reprobe; | ||
| 1017 | } | ||
| 1018 | } | ||
| 1019 | |||
| 1020 | /* | ||
| 1021 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks | ||
| 1022 | */ | ||
| 1023 | ret = add_swap_extent(sis, page_no, 1, | ||
| 1024 | first_block >> (PAGE_SHIFT - blkbits)); | ||
| 1025 | if (ret) | ||
| 1026 | goto out; | ||
| 1027 | page_no++; | ||
| 1028 | probe_block += blocks_per_page; | ||
| 1029 | reprobe: | ||
| 1030 | continue; | ||
| 1031 | } | ||
| 1032 | ret = 0; | ||
| 1033 | if (page_no == 0) | ||
| 1034 | ret = -EINVAL; | ||
| 1035 | sis->max = page_no; | ||
| 1036 | sis->highest_bit = page_no - 1; | ||
| 1037 | done: | ||
| 1038 | sis->curr_swap_extent = list_entry(sis->extent_list.prev, | ||
| 1039 | struct swap_extent, list); | ||
| 1040 | goto out; | ||
| 1041 | bad_bmap: | ||
| 1042 | printk(KERN_ERR "swapon: swapfile has holes\n"); | ||
| 1043 | ret = -EINVAL; | ||
| 1044 | out: | ||
| 1045 | return ret; | ||
| 1046 | } | ||
| 1047 | |||
#if 0	/* We don't need this yet */
#include <linux/backing-dev.h>
/*
 * Report whether the backing device that `page' would be written to is
 * write-congested.  A swap cache page is resolved through its swap
 * area's block device; any other page through its own mapping.
 * The page must be locked.
 */
int page_queue_congested(struct page *page)
{
	struct backing_dev_info *bdi;

	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */

	if (!PageSwapCache(page)) {
		bdi = page->mapping->backing_dev_info;
	} else {
		swp_entry_t entry = { .val = page->private };
		struct swap_info_struct *sis;

		sis = get_swap_info_struct(swp_type(entry));
		bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
	}
	return bdi_write_congested(bdi);
}
#endif
| 1067 | |||
| 1068 | asmlinkage long sys_swapoff(const char __user * specialfile) | ||
| 1069 | { | ||
| 1070 | struct swap_info_struct * p = NULL; | ||
| 1071 | unsigned short *swap_map; | ||
| 1072 | struct file *swap_file, *victim; | ||
| 1073 | struct address_space *mapping; | ||
| 1074 | struct inode *inode; | ||
| 1075 | char * pathname; | ||
| 1076 | int i, type, prev; | ||
| 1077 | int err; | ||
| 1078 | |||
| 1079 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1080 | return -EPERM; | ||
| 1081 | |||
| 1082 | pathname = getname(specialfile); | ||
| 1083 | err = PTR_ERR(pathname); | ||
| 1084 | if (IS_ERR(pathname)) | ||
| 1085 | goto out; | ||
| 1086 | |||
| 1087 | victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); | ||
| 1088 | putname(pathname); | ||
| 1089 | err = PTR_ERR(victim); | ||
| 1090 | if (IS_ERR(victim)) | ||
| 1091 | goto out; | ||
| 1092 | |||
| 1093 | mapping = victim->f_mapping; | ||
| 1094 | prev = -1; | ||
| 1095 | swap_list_lock(); | ||
| 1096 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { | ||
| 1097 | p = swap_info + type; | ||
| 1098 | if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { | ||
| 1099 | if (p->swap_file->f_mapping == mapping) | ||
| 1100 | break; | ||
| 1101 | } | ||
| 1102 | prev = type; | ||
| 1103 | } | ||
| 1104 | if (type < 0) { | ||
| 1105 | err = -EINVAL; | ||
| 1106 | swap_list_unlock(); | ||
| 1107 | goto out_dput; | ||
| 1108 | } | ||
| 1109 | if (!security_vm_enough_memory(p->pages)) | ||
| 1110 | vm_unacct_memory(p->pages); | ||
| 1111 | else { | ||
| 1112 | err = -ENOMEM; | ||
| 1113 | swap_list_unlock(); | ||
| 1114 | goto out_dput; | ||
| 1115 | } | ||
| 1116 | if (prev < 0) { | ||
| 1117 | swap_list.head = p->next; | ||
| 1118 | } else { | ||
| 1119 | swap_info[prev].next = p->next; | ||
| 1120 | } | ||
| 1121 | if (type == swap_list.next) { | ||
| 1122 | /* just pick something that's safe... */ | ||
| 1123 | swap_list.next = swap_list.head; | ||
| 1124 | } | ||
| 1125 | nr_swap_pages -= p->pages; | ||
| 1126 | total_swap_pages -= p->pages; | ||
| 1127 | p->flags &= ~SWP_WRITEOK; | ||
| 1128 | swap_list_unlock(); | ||
| 1129 | current->flags |= PF_SWAPOFF; | ||
| 1130 | err = try_to_unuse(type); | ||
| 1131 | current->flags &= ~PF_SWAPOFF; | ||
| 1132 | |||
| 1133 | /* wait for any unplug function to finish */ | ||
| 1134 | down_write(&swap_unplug_sem); | ||
| 1135 | up_write(&swap_unplug_sem); | ||
| 1136 | |||
| 1137 | if (err) { | ||
| 1138 | /* re-insert swap space back into swap_list */ | ||
| 1139 | swap_list_lock(); | ||
| 1140 | for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) | ||
| 1141 | if (p->prio >= swap_info[i].prio) | ||
| 1142 | break; | ||
| 1143 | p->next = i; | ||
| 1144 | if (prev < 0) | ||
| 1145 | swap_list.head = swap_list.next = p - swap_info; | ||
| 1146 | else | ||
| 1147 | swap_info[prev].next = p - swap_info; | ||
| 1148 | nr_swap_pages += p->pages; | ||
| 1149 | total_swap_pages += p->pages; | ||
| 1150 | p->flags |= SWP_WRITEOK; | ||
| 1151 | swap_list_unlock(); | ||
| 1152 | goto out_dput; | ||
| 1153 | } | ||
| 1154 | down(&swapon_sem); | ||
| 1155 | swap_list_lock(); | ||
| 1156 | drain_mmlist(); | ||
| 1157 | swap_device_lock(p); | ||
| 1158 | swap_file = p->swap_file; | ||
| 1159 | p->swap_file = NULL; | ||
| 1160 | p->max = 0; | ||
| 1161 | swap_map = p->swap_map; | ||
| 1162 | p->swap_map = NULL; | ||
| 1163 | p->flags = 0; | ||
| 1164 | destroy_swap_extents(p); | ||
| 1165 | swap_device_unlock(p); | ||
| 1166 | swap_list_unlock(); | ||
| 1167 | up(&swapon_sem); | ||
| 1168 | vfree(swap_map); | ||
| 1169 | inode = mapping->host; | ||
| 1170 | if (S_ISBLK(inode->i_mode)) { | ||
| 1171 | struct block_device *bdev = I_BDEV(inode); | ||
| 1172 | set_blocksize(bdev, p->old_block_size); | ||
| 1173 | bd_release(bdev); | ||
| 1174 | } else { | ||
| 1175 | down(&inode->i_sem); | ||
| 1176 | inode->i_flags &= ~S_SWAPFILE; | ||
| 1177 | up(&inode->i_sem); | ||
| 1178 | } | ||
| 1179 | filp_close(swap_file, NULL); | ||
| 1180 | err = 0; | ||
| 1181 | |||
| 1182 | out_dput: | ||
| 1183 | filp_close(victim, NULL); | ||
| 1184 | out: | ||
| 1185 | return err; | ||
| 1186 | } | ||
| 1187 | |||
| 1188 | #ifdef CONFIG_PROC_FS | ||
| 1189 | /* iterator */ | ||
| 1190 | static void *swap_start(struct seq_file *swap, loff_t *pos) | ||
| 1191 | { | ||
| 1192 | struct swap_info_struct *ptr = swap_info; | ||
| 1193 | int i; | ||
| 1194 | loff_t l = *pos; | ||
| 1195 | |||
| 1196 | down(&swapon_sem); | ||
| 1197 | |||
| 1198 | for (i = 0; i < nr_swapfiles; i++, ptr++) { | ||
| 1199 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | ||
| 1200 | continue; | ||
| 1201 | if (!l--) | ||
| 1202 | return ptr; | ||
| 1203 | } | ||
| 1204 | |||
| 1205 | return NULL; | ||
| 1206 | } | ||
| 1207 | |||
| 1208 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) | ||
| 1209 | { | ||
| 1210 | struct swap_info_struct *ptr = v; | ||
| 1211 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; | ||
| 1212 | |||
| 1213 | for (++ptr; ptr < endptr; ptr++) { | ||
| 1214 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | ||
| 1215 | continue; | ||
| 1216 | ++*pos; | ||
| 1217 | return ptr; | ||
| 1218 | } | ||
| 1219 | |||
| 1220 | return NULL; | ||
| 1221 | } | ||
| 1222 | |||
| 1223 | static void swap_stop(struct seq_file *swap, void *v) | ||
| 1224 | { | ||
| 1225 | up(&swapon_sem); | ||
| 1226 | } | ||
| 1227 | |||
| 1228 | static int swap_show(struct seq_file *swap, void *v) | ||
| 1229 | { | ||
| 1230 | struct swap_info_struct *ptr = v; | ||
| 1231 | struct file *file; | ||
| 1232 | int len; | ||
| 1233 | |||
| 1234 | if (v == swap_info) | ||
| 1235 | seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); | ||
| 1236 | |||
| 1237 | file = ptr->swap_file; | ||
| 1238 | len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); | ||
| 1239 | seq_printf(swap, "%*s%s\t%d\t%ld\t%d\n", | ||
| 1240 | len < 40 ? 40 - len : 1, " ", | ||
| 1241 | S_ISBLK(file->f_dentry->d_inode->i_mode) ? | ||
| 1242 | "partition" : "file\t", | ||
| 1243 | ptr->pages << (PAGE_SHIFT - 10), | ||
| 1244 | ptr->inuse_pages << (PAGE_SHIFT - 10), | ||
| 1245 | ptr->prio); | ||
| 1246 | return 0; | ||
| 1247 | } | ||
| 1248 | |||
| 1249 | static struct seq_operations swaps_op = { | ||
| 1250 | .start = swap_start, | ||
| 1251 | .next = swap_next, | ||
| 1252 | .stop = swap_stop, | ||
| 1253 | .show = swap_show | ||
| 1254 | }; | ||
| 1255 | |||
| 1256 | static int swaps_open(struct inode *inode, struct file *file) | ||
| 1257 | { | ||
| 1258 | return seq_open(file, &swaps_op); | ||
| 1259 | } | ||
| 1260 | |||
| 1261 | static struct file_operations proc_swaps_operations = { | ||
| 1262 | .open = swaps_open, | ||
| 1263 | .read = seq_read, | ||
| 1264 | .llseek = seq_lseek, | ||
| 1265 | .release = seq_release, | ||
| 1266 | }; | ||
| 1267 | |||
| 1268 | static int __init procswaps_init(void) | ||
| 1269 | { | ||
| 1270 | struct proc_dir_entry *entry; | ||
| 1271 | |||
| 1272 | entry = create_proc_entry("swaps", 0, NULL); | ||
| 1273 | if (entry) | ||
| 1274 | entry->proc_fops = &proc_swaps_operations; | ||
| 1275 | return 0; | ||
| 1276 | } | ||
| 1277 | __initcall(procswaps_init); | ||
| 1278 | #endif /* CONFIG_PROC_FS */ | ||
| 1279 | |||
| 1280 | /* | ||
| 1281 | * Written 01/25/92 by Simmule Turner, heavily changed by Linus. | ||
| 1282 | * | ||
| 1283 | * The swapon system call | ||
| 1284 | */ | ||
| 1285 | asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | ||
| 1286 | { | ||
| 1287 | struct swap_info_struct * p; | ||
| 1288 | char *name = NULL; | ||
| 1289 | struct block_device *bdev = NULL; | ||
| 1290 | struct file *swap_file = NULL; | ||
| 1291 | struct address_space *mapping; | ||
| 1292 | unsigned int type; | ||
| 1293 | int i, prev; | ||
| 1294 | int error; | ||
| 1295 | static int least_priority; | ||
| 1296 | union swap_header *swap_header = NULL; | ||
| 1297 | int swap_header_version; | ||
| 1298 | int nr_good_pages = 0; | ||
| 1299 | unsigned long maxpages = 1; | ||
| 1300 | int swapfilesize; | ||
| 1301 | unsigned short *swap_map; | ||
| 1302 | struct page *page = NULL; | ||
| 1303 | struct inode *inode = NULL; | ||
| 1304 | int did_down = 0; | ||
| 1305 | |||
| 1306 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1307 | return -EPERM; | ||
| 1308 | swap_list_lock(); | ||
| 1309 | p = swap_info; | ||
| 1310 | for (type = 0 ; type < nr_swapfiles ; type++,p++) | ||
| 1311 | if (!(p->flags & SWP_USED)) | ||
| 1312 | break; | ||
| 1313 | error = -EPERM; | ||
| 1314 | /* | ||
| 1315 | * Test if adding another swap device is possible. There are | ||
| 1316 | * two limiting factors: 1) the number of bits for the swap | ||
| 1317 | * type swp_entry_t definition and 2) the number of bits for | ||
| 1318 | * the swap type in the swap ptes as defined by the different | ||
| 1319 | * architectures. To honor both limitations a swap entry | ||
| 1320 | * with swap offset 0 and swap type ~0UL is created, encoded | ||
| 1321 | * to a swap pte, decoded to a swp_entry_t again and finally | ||
| 1322 | * the swap type part is extracted. This will mask all bits | ||
| 1323 | * from the initial ~0UL that can't be encoded in either the | ||
| 1324 | * swp_entry_t or the architecture definition of a swap pte. | ||
| 1325 | */ | ||
| 1326 | if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { | ||
| 1327 | swap_list_unlock(); | ||
| 1328 | goto out; | ||
| 1329 | } | ||
| 1330 | if (type >= nr_swapfiles) | ||
| 1331 | nr_swapfiles = type+1; | ||
| 1332 | INIT_LIST_HEAD(&p->extent_list); | ||
| 1333 | p->flags = SWP_USED; | ||
| 1334 | p->nr_extents = 0; | ||
| 1335 | p->swap_file = NULL; | ||
| 1336 | p->old_block_size = 0; | ||
| 1337 | p->swap_map = NULL; | ||
| 1338 | p->lowest_bit = 0; | ||
| 1339 | p->highest_bit = 0; | ||
| 1340 | p->cluster_nr = 0; | ||
| 1341 | p->inuse_pages = 0; | ||
| 1342 | spin_lock_init(&p->sdev_lock); | ||
| 1343 | p->next = -1; | ||
| 1344 | if (swap_flags & SWAP_FLAG_PREFER) { | ||
| 1345 | p->prio = | ||
| 1346 | (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; | ||
| 1347 | } else { | ||
| 1348 | p->prio = --least_priority; | ||
| 1349 | } | ||
| 1350 | swap_list_unlock(); | ||
| 1351 | name = getname(specialfile); | ||
| 1352 | error = PTR_ERR(name); | ||
| 1353 | if (IS_ERR(name)) { | ||
| 1354 | name = NULL; | ||
| 1355 | goto bad_swap_2; | ||
| 1356 | } | ||
| 1357 | swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); | ||
| 1358 | error = PTR_ERR(swap_file); | ||
| 1359 | if (IS_ERR(swap_file)) { | ||
| 1360 | swap_file = NULL; | ||
| 1361 | goto bad_swap_2; | ||
| 1362 | } | ||
| 1363 | |||
| 1364 | p->swap_file = swap_file; | ||
| 1365 | mapping = swap_file->f_mapping; | ||
| 1366 | inode = mapping->host; | ||
| 1367 | |||
| 1368 | error = -EBUSY; | ||
| 1369 | for (i = 0; i < nr_swapfiles; i++) { | ||
| 1370 | struct swap_info_struct *q = &swap_info[i]; | ||
| 1371 | |||
| 1372 | if (i == type || !q->swap_file) | ||
| 1373 | continue; | ||
| 1374 | if (mapping == q->swap_file->f_mapping) | ||
| 1375 | goto bad_swap; | ||
| 1376 | } | ||
| 1377 | |||
| 1378 | error = -EINVAL; | ||
| 1379 | if (S_ISBLK(inode->i_mode)) { | ||
| 1380 | bdev = I_BDEV(inode); | ||
| 1381 | error = bd_claim(bdev, sys_swapon); | ||
| 1382 | if (error < 0) { | ||
| 1383 | bdev = NULL; | ||
| 1384 | goto bad_swap; | ||
| 1385 | } | ||
| 1386 | p->old_block_size = block_size(bdev); | ||
| 1387 | error = set_blocksize(bdev, PAGE_SIZE); | ||
| 1388 | if (error < 0) | ||
| 1389 | goto bad_swap; | ||
| 1390 | p->bdev = bdev; | ||
| 1391 | } else if (S_ISREG(inode->i_mode)) { | ||
| 1392 | p->bdev = inode->i_sb->s_bdev; | ||
| 1393 | down(&inode->i_sem); | ||
| 1394 | did_down = 1; | ||
| 1395 | if (IS_SWAPFILE(inode)) { | ||
| 1396 | error = -EBUSY; | ||
| 1397 | goto bad_swap; | ||
| 1398 | } | ||
| 1399 | } else { | ||
| 1400 | goto bad_swap; | ||
| 1401 | } | ||
| 1402 | |||
| 1403 | swapfilesize = i_size_read(inode) >> PAGE_SHIFT; | ||
| 1404 | |||
| 1405 | /* | ||
| 1406 | * Read the swap header. | ||
| 1407 | */ | ||
| 1408 | if (!mapping->a_ops->readpage) { | ||
| 1409 | error = -EINVAL; | ||
| 1410 | goto bad_swap; | ||
| 1411 | } | ||
| 1412 | page = read_cache_page(mapping, 0, | ||
| 1413 | (filler_t *)mapping->a_ops->readpage, swap_file); | ||
| 1414 | if (IS_ERR(page)) { | ||
| 1415 | error = PTR_ERR(page); | ||
| 1416 | goto bad_swap; | ||
| 1417 | } | ||
| 1418 | wait_on_page_locked(page); | ||
| 1419 | if (!PageUptodate(page)) | ||
| 1420 | goto bad_swap; | ||
| 1421 | kmap(page); | ||
| 1422 | swap_header = page_address(page); | ||
| 1423 | |||
| 1424 | if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) | ||
| 1425 | swap_header_version = 1; | ||
| 1426 | else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) | ||
| 1427 | swap_header_version = 2; | ||
| 1428 | else { | ||
| 1429 | printk("Unable to find swap-space signature\n"); | ||
| 1430 | error = -EINVAL; | ||
| 1431 | goto bad_swap; | ||
| 1432 | } | ||
| 1433 | |||
| 1434 | switch (swap_header_version) { | ||
| 1435 | case 1: | ||
| 1436 | printk(KERN_ERR "version 0 swap is no longer supported. " | ||
| 1437 | "Use mkswap -v1 %s\n", name); | ||
| 1438 | error = -EINVAL; | ||
| 1439 | goto bad_swap; | ||
| 1440 | case 2: | ||
| 1441 | /* Check the swap header's sub-version and the size of | ||
| 1442 | the swap file and bad block lists */ | ||
| 1443 | if (swap_header->info.version != 1) { | ||
| 1444 | printk(KERN_WARNING | ||
| 1445 | "Unable to handle swap header version %d\n", | ||
| 1446 | swap_header->info.version); | ||
| 1447 | error = -EINVAL; | ||
| 1448 | goto bad_swap; | ||
| 1449 | } | ||
| 1450 | |||
| 1451 | p->lowest_bit = 1; | ||
| 1452 | /* | ||
| 1453 | * Find out how many pages are allowed for a single swap | ||
| 1454 | * device. There are two limiting factors: 1) the number of | ||
| 1455 | * bits for the swap offset in the swp_entry_t type and | ||
| 1456 | * 2) the number of bits in the a swap pte as defined by | ||
| 1457 | * the different architectures. In order to find the | ||
| 1458 | * largest possible bit mask a swap entry with swap type 0 | ||
| 1459 | * and swap offset ~0UL is created, encoded to a swap pte, | ||
| 1460 | * decoded to a swp_entry_t again and finally the swap | ||
| 1461 | * offset is extracted. This will mask all the bits from | ||
| 1462 | * the initial ~0UL mask that can't be encoded in either | ||
| 1463 | * the swp_entry_t or the architecture definition of a | ||
| 1464 | * swap pte. | ||
| 1465 | */ | ||
| 1466 | maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; | ||
| 1467 | if (maxpages > swap_header->info.last_page) | ||
| 1468 | maxpages = swap_header->info.last_page; | ||
| 1469 | p->highest_bit = maxpages - 1; | ||
| 1470 | |||
| 1471 | error = -EINVAL; | ||
| 1472 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) | ||
| 1473 | goto bad_swap; | ||
| 1474 | |||
| 1475 | /* OK, set up the swap map and apply the bad block list */ | ||
| 1476 | if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { | ||
| 1477 | error = -ENOMEM; | ||
| 1478 | goto bad_swap; | ||
| 1479 | } | ||
| 1480 | |||
| 1481 | error = 0; | ||
| 1482 | memset(p->swap_map, 0, maxpages * sizeof(short)); | ||
| 1483 | for (i=0; i<swap_header->info.nr_badpages; i++) { | ||
| 1484 | int page = swap_header->info.badpages[i]; | ||
| 1485 | if (page <= 0 || page >= swap_header->info.last_page) | ||
| 1486 | error = -EINVAL; | ||
| 1487 | else | ||
| 1488 | p->swap_map[page] = SWAP_MAP_BAD; | ||
| 1489 | } | ||
| 1490 | nr_good_pages = swap_header->info.last_page - | ||
| 1491 | swap_header->info.nr_badpages - | ||
| 1492 | 1 /* header page */; | ||
| 1493 | if (error) | ||
| 1494 | goto bad_swap; | ||
| 1495 | } | ||
| 1496 | |||
| 1497 | if (swapfilesize && maxpages > swapfilesize) { | ||
| 1498 | printk(KERN_WARNING | ||
| 1499 | "Swap area shorter than signature indicates\n"); | ||
| 1500 | error = -EINVAL; | ||
| 1501 | goto bad_swap; | ||
| 1502 | } | ||
| 1503 | if (!nr_good_pages) { | ||
| 1504 | printk(KERN_WARNING "Empty swap-file\n"); | ||
| 1505 | error = -EINVAL; | ||
| 1506 | goto bad_swap; | ||
| 1507 | } | ||
| 1508 | p->swap_map[0] = SWAP_MAP_BAD; | ||
| 1509 | p->max = maxpages; | ||
| 1510 | p->pages = nr_good_pages; | ||
| 1511 | |||
| 1512 | error = setup_swap_extents(p); | ||
| 1513 | if (error) | ||
| 1514 | goto bad_swap; | ||
| 1515 | |||
| 1516 | down(&swapon_sem); | ||
| 1517 | swap_list_lock(); | ||
| 1518 | swap_device_lock(p); | ||
| 1519 | p->flags = SWP_ACTIVE; | ||
| 1520 | nr_swap_pages += nr_good_pages; | ||
| 1521 | total_swap_pages += nr_good_pages; | ||
| 1522 | printk(KERN_INFO "Adding %dk swap on %s. Priority:%d extents:%d\n", | ||
| 1523 | nr_good_pages<<(PAGE_SHIFT-10), name, | ||
| 1524 | p->prio, p->nr_extents); | ||
| 1525 | |||
| 1526 | /* insert swap space into swap_list: */ | ||
| 1527 | prev = -1; | ||
| 1528 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | ||
| 1529 | if (p->prio >= swap_info[i].prio) { | ||
| 1530 | break; | ||
| 1531 | } | ||
| 1532 | prev = i; | ||
| 1533 | } | ||
| 1534 | p->next = i; | ||
| 1535 | if (prev < 0) { | ||
| 1536 | swap_list.head = swap_list.next = p - swap_info; | ||
| 1537 | } else { | ||
| 1538 | swap_info[prev].next = p - swap_info; | ||
| 1539 | } | ||
| 1540 | swap_device_unlock(p); | ||
| 1541 | swap_list_unlock(); | ||
| 1542 | up(&swapon_sem); | ||
| 1543 | error = 0; | ||
| 1544 | goto out; | ||
| 1545 | bad_swap: | ||
| 1546 | if (bdev) { | ||
| 1547 | set_blocksize(bdev, p->old_block_size); | ||
| 1548 | bd_release(bdev); | ||
| 1549 | } | ||
| 1550 | bad_swap_2: | ||
| 1551 | swap_list_lock(); | ||
| 1552 | swap_map = p->swap_map; | ||
| 1553 | p->swap_file = NULL; | ||
| 1554 | p->swap_map = NULL; | ||
| 1555 | p->flags = 0; | ||
| 1556 | if (!(swap_flags & SWAP_FLAG_PREFER)) | ||
| 1557 | ++least_priority; | ||
| 1558 | swap_list_unlock(); | ||
| 1559 | destroy_swap_extents(p); | ||
| 1560 | vfree(swap_map); | ||
| 1561 | if (swap_file) | ||
| 1562 | filp_close(swap_file, NULL); | ||
| 1563 | out: | ||
| 1564 | if (page && !IS_ERR(page)) { | ||
| 1565 | kunmap(page); | ||
| 1566 | page_cache_release(page); | ||
| 1567 | } | ||
| 1568 | if (name) | ||
| 1569 | putname(name); | ||
| 1570 | if (did_down) { | ||
| 1571 | if (!error) | ||
| 1572 | inode->i_flags |= S_SWAPFILE; | ||
| 1573 | up(&inode->i_sem); | ||
| 1574 | } | ||
| 1575 | return error; | ||
| 1576 | } | ||
| 1577 | |||
| 1578 | void si_swapinfo(struct sysinfo *val) | ||
| 1579 | { | ||
| 1580 | unsigned int i; | ||
| 1581 | unsigned long nr_to_be_unused = 0; | ||
| 1582 | |||
| 1583 | swap_list_lock(); | ||
| 1584 | for (i = 0; i < nr_swapfiles; i++) { | ||
| 1585 | if (!(swap_info[i].flags & SWP_USED) || | ||
| 1586 | (swap_info[i].flags & SWP_WRITEOK)) | ||
| 1587 | continue; | ||
| 1588 | nr_to_be_unused += swap_info[i].inuse_pages; | ||
| 1589 | } | ||
| 1590 | val->freeswap = nr_swap_pages + nr_to_be_unused; | ||
| 1591 | val->totalswap = total_swap_pages + nr_to_be_unused; | ||
| 1592 | swap_list_unlock(); | ||
| 1593 | } | ||
| 1594 | |||
| 1595 | /* | ||
| 1596 | * Verify that a swap entry is valid and increment its swap map count. | ||
| 1597 | * | ||
| 1598 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as | ||
| 1599 | * "permanent", but will be reclaimed by the next swapoff. | ||
| 1600 | */ | ||
| 1601 | int swap_duplicate(swp_entry_t entry) | ||
| 1602 | { | ||
| 1603 | struct swap_info_struct * p; | ||
| 1604 | unsigned long offset, type; | ||
| 1605 | int result = 0; | ||
| 1606 | |||
| 1607 | type = swp_type(entry); | ||
| 1608 | if (type >= nr_swapfiles) | ||
| 1609 | goto bad_file; | ||
| 1610 | p = type + swap_info; | ||
| 1611 | offset = swp_offset(entry); | ||
| 1612 | |||
| 1613 | swap_device_lock(p); | ||
| 1614 | if (offset < p->max && p->swap_map[offset]) { | ||
| 1615 | if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { | ||
| 1616 | p->swap_map[offset]++; | ||
| 1617 | result = 1; | ||
| 1618 | } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { | ||
| 1619 | if (swap_overflow++ < 5) | ||
| 1620 | printk(KERN_WARNING "swap_dup: swap entry overflow\n"); | ||
| 1621 | p->swap_map[offset] = SWAP_MAP_MAX; | ||
| 1622 | result = 1; | ||
| 1623 | } | ||
| 1624 | } | ||
| 1625 | swap_device_unlock(p); | ||
| 1626 | out: | ||
| 1627 | return result; | ||
| 1628 | |||
| 1629 | bad_file: | ||
| 1630 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | ||
| 1631 | goto out; | ||
| 1632 | } | ||
| 1633 | |||
| 1634 | struct swap_info_struct * | ||
| 1635 | get_swap_info_struct(unsigned type) | ||
| 1636 | { | ||
| 1637 | return &swap_info[type]; | ||
| 1638 | } | ||
| 1639 | |||
| 1640 | /* | ||
| 1641 | * swap_device_lock prevents swap_map being freed. Don't grab an extra | ||
| 1642 | * reference on the swaphandle, it doesn't matter if it becomes unused. | ||
| 1643 | */ | ||
| 1644 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | ||
| 1645 | { | ||
| 1646 | int ret = 0, i = 1 << page_cluster; | ||
| 1647 | unsigned long toff; | ||
| 1648 | struct swap_info_struct *swapdev = swp_type(entry) + swap_info; | ||
| 1649 | |||
| 1650 | if (!page_cluster) /* no readahead */ | ||
| 1651 | return 0; | ||
| 1652 | toff = (swp_offset(entry) >> page_cluster) << page_cluster; | ||
| 1653 | if (!toff) /* first page is swap header */ | ||
| 1654 | toff++, i--; | ||
| 1655 | *offset = toff; | ||
| 1656 | |||
| 1657 | swap_device_lock(swapdev); | ||
| 1658 | do { | ||
| 1659 | /* Don't read-ahead past the end of the swap area */ | ||
| 1660 | if (toff >= swapdev->max) | ||
| 1661 | break; | ||
| 1662 | /* Don't read in free or bad pages */ | ||
| 1663 | if (!swapdev->swap_map[toff]) | ||
| 1664 | break; | ||
| 1665 | if (swapdev->swap_map[toff] == SWAP_MAP_BAD) | ||
| 1666 | break; | ||
| 1667 | toff++; | ||
| 1668 | ret++; | ||
| 1669 | } while (--i); | ||
| 1670 | swap_device_unlock(swapdev); | ||
| 1671 | return ret; | ||
| 1672 | } | ||
diff --git a/mm/thrash.c b/mm/thrash.c new file mode 100644 index 000000000000..11461f7ad830 --- /dev/null +++ b/mm/thrash.c | |||
| @@ -0,0 +1,102 @@ | |||
| 1 | /* | ||
| 2 | * mm/thrash.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 2004, Red Hat, Inc. | ||
| 5 | * Copyright (C) 2004, Rik van Riel <riel@redhat.com> | ||
| 6 | * Released under the GPL, see the file COPYING for details. | ||
| 7 | * | ||
| 8 | * Simple token based thrashing protection, using the algorithm | ||
| 9 | * described in: http://www.cs.wm.edu/~sjiang/token.pdf | ||
| 10 | */ | ||
| 11 | #include <linux/jiffies.h> | ||
| 12 | #include <linux/mm.h> | ||
| 13 | #include <linux/sched.h> | ||
| 14 | #include <linux/swap.h> | ||
| 15 | |||
| 16 | static DEFINE_SPINLOCK(swap_token_lock); | ||
| 17 | static unsigned long swap_token_timeout; | ||
| 18 | static unsigned long swap_token_check; | ||
| 19 | struct mm_struct * swap_token_mm = &init_mm; | ||
| 20 | |||
| 21 | #define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) | ||
| 22 | #define SWAP_TOKEN_TIMEOUT 0 | ||
| 23 | /* | ||
| 24 | * Currently disabled; Needs further code to work at HZ * 300. | ||
| 25 | */ | ||
| 26 | unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT; | ||
| 27 | |||
| 28 | /* | ||
| 29 | * Take the token away if the process had no page faults | ||
| 30 | * in the last interval, or if it has held the token for | ||
| 31 | * too long. | ||
| 32 | */ | ||
| 33 | #define SWAP_TOKEN_ENOUGH_RSS 1 | ||
| 34 | #define SWAP_TOKEN_TIMED_OUT 2 | ||
| 35 | static int should_release_swap_token(struct mm_struct *mm) | ||
| 36 | { | ||
| 37 | int ret = 0; | ||
| 38 | if (!mm->recent_pagein) | ||
| 39 | ret = SWAP_TOKEN_ENOUGH_RSS; | ||
| 40 | else if (time_after(jiffies, swap_token_timeout)) | ||
| 41 | ret = SWAP_TOKEN_TIMED_OUT; | ||
| 42 | mm->recent_pagein = 0; | ||
| 43 | return ret; | ||
| 44 | } | ||
| 45 | |||
| 46 | /* | ||
| 47 | * Try to grab the swapout protection token. We only try to | ||
| 48 | * grab it once every TOKEN_CHECK_INTERVAL, both to prevent | ||
| 49 | * SMP lock contention and to check that the process that held | ||
| 50 | * the token before is no longer thrashing. | ||
| 51 | */ | ||
| 52 | void grab_swap_token(void) | ||
| 53 | { | ||
| 54 | struct mm_struct *mm; | ||
| 55 | int reason; | ||
| 56 | |||
| 57 | /* We have the token. Let others know we still need it. */ | ||
| 58 | if (has_swap_token(current->mm)) { | ||
| 59 | current->mm->recent_pagein = 1; | ||
| 60 | return; | ||
| 61 | } | ||
| 62 | |||
| 63 | if (time_after(jiffies, swap_token_check)) { | ||
| 64 | |||
| 65 | /* Can't get swapout protection if we exceed our RSS limit. */ | ||
| 66 | // if (current->mm->rss > current->mm->rlimit_rss) | ||
| 67 | // return; | ||
| 68 | |||
| 69 | /* ... or if we recently held the token. */ | ||
| 70 | if (time_before(jiffies, current->mm->swap_token_time)) | ||
| 71 | return; | ||
| 72 | |||
| 73 | if (!spin_trylock(&swap_token_lock)) | ||
| 74 | return; | ||
| 75 | |||
| 76 | swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; | ||
| 77 | |||
| 78 | mm = swap_token_mm; | ||
| 79 | if ((reason = should_release_swap_token(mm))) { | ||
| 80 | unsigned long eligible = jiffies; | ||
| 81 | if (reason == SWAP_TOKEN_TIMED_OUT) { | ||
| 82 | eligible += swap_token_default_timeout; | ||
| 83 | } | ||
| 84 | mm->swap_token_time = eligible; | ||
| 85 | swap_token_timeout = jiffies + swap_token_default_timeout; | ||
| 86 | swap_token_mm = current->mm; | ||
| 87 | } | ||
| 88 | spin_unlock(&swap_token_lock); | ||
| 89 | } | ||
| 90 | return; | ||
| 91 | } | ||
| 92 | |||
| 93 | /* Called on process exit. */ | ||
| 94 | void __put_swap_token(struct mm_struct *mm) | ||
| 95 | { | ||
| 96 | spin_lock(&swap_token_lock); | ||
| 97 | if (likely(mm == swap_token_mm)) { | ||
| 98 | swap_token_mm = &init_mm; | ||
| 99 | swap_token_check = jiffies; | ||
| 100 | } | ||
| 101 | spin_unlock(&swap_token_lock); | ||
| 102 | } | ||
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c new file mode 100644 index 000000000000..c13a2161bca2 --- /dev/null +++ b/mm/tiny-shmem.c | |||
| @@ -0,0 +1,122 @@ | |||
| 1 | /* | ||
| 2 | * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code | ||
| 3 | * | ||
| 4 | * Matt Mackall <mpm@selenic.com> January, 2004 | ||
| 5 | * derived from mm/shmem.c and fs/ramfs/inode.c | ||
| 6 | * | ||
| 7 | * This is intended for small system where the benefits of the full | ||
| 8 | * shmem code (swap-backed and resource-limited) are outweighed by | ||
| 9 | * their complexity. On systems without swap this code should be | ||
| 10 | * effectively equivalent, but much lighter weight. | ||
| 11 | */ | ||
| 12 | |||
| 13 | #include <linux/fs.h> | ||
| 14 | #include <linux/init.h> | ||
| 15 | #include <linux/devfs_fs_kernel.h> | ||
| 16 | #include <linux/vfs.h> | ||
| 17 | #include <linux/mount.h> | ||
| 18 | #include <linux/file.h> | ||
| 19 | #include <linux/mm.h> | ||
| 20 | #include <linux/module.h> | ||
| 21 | #include <linux/swap.h> | ||
| 22 | #include <linux/ramfs.h> | ||
| 23 | |||
| 24 | static struct file_system_type tmpfs_fs_type = { | ||
| 25 | .name = "tmpfs", | ||
| 26 | .get_sb = ramfs_get_sb, | ||
| 27 | .kill_sb = kill_litter_super, | ||
| 28 | }; | ||
| 29 | |||
| 30 | static struct vfsmount *shm_mnt; | ||
| 31 | |||
| 32 | static int __init init_tmpfs(void) | ||
| 33 | { | ||
| 34 | register_filesystem(&tmpfs_fs_type); | ||
| 35 | #ifdef CONFIG_TMPFS | ||
| 36 | devfs_mk_dir("shm"); | ||
| 37 | #endif | ||
| 38 | shm_mnt = kern_mount(&tmpfs_fs_type); | ||
| 39 | return 0; | ||
| 40 | } | ||
| 41 | module_init(init_tmpfs) | ||
| 42 | |||
| 43 | /* | ||
| 44 | * shmem_file_setup - get an unlinked file living in tmpfs | ||
| 45 | * | ||
| 46 | * @name: name for dentry (to be seen in /proc/<pid>/maps | ||
| 47 | * @size: size to be set for the file | ||
| 48 | * | ||
| 49 | */ | ||
| 50 | struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | ||
| 51 | { | ||
| 52 | int error; | ||
| 53 | struct file *file; | ||
| 54 | struct inode *inode; | ||
| 55 | struct dentry *dentry, *root; | ||
| 56 | struct qstr this; | ||
| 57 | |||
| 58 | if (IS_ERR(shm_mnt)) | ||
| 59 | return (void *)shm_mnt; | ||
| 60 | |||
| 61 | error = -ENOMEM; | ||
| 62 | this.name = name; | ||
| 63 | this.len = strlen(name); | ||
| 64 | this.hash = 0; /* will go */ | ||
| 65 | root = shm_mnt->mnt_root; | ||
| 66 | dentry = d_alloc(root, &this); | ||
| 67 | if (!dentry) | ||
| 68 | goto put_memory; | ||
| 69 | |||
| 70 | error = -ENFILE; | ||
| 71 | file = get_empty_filp(); | ||
| 72 | if (!file) | ||
| 73 | goto put_dentry; | ||
| 74 | |||
| 75 | error = -ENOSPC; | ||
| 76 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); | ||
| 77 | if (!inode) | ||
| 78 | goto close_file; | ||
| 79 | |||
| 80 | d_instantiate(dentry, inode); | ||
| 81 | inode->i_size = size; | ||
| 82 | inode->i_nlink = 0; /* It is unlinked */ | ||
| 83 | file->f_vfsmnt = mntget(shm_mnt); | ||
| 84 | file->f_dentry = dentry; | ||
| 85 | file->f_mapping = inode->i_mapping; | ||
| 86 | file->f_op = &ramfs_file_operations; | ||
| 87 | file->f_mode = FMODE_WRITE | FMODE_READ; | ||
| 88 | return file; | ||
| 89 | |||
| 90 | close_file: | ||
| 91 | put_filp(file); | ||
| 92 | put_dentry: | ||
| 93 | dput(dentry); | ||
| 94 | put_memory: | ||
| 95 | return ERR_PTR(error); | ||
| 96 | } | ||
| 97 | |||
| 98 | /* | ||
| 99 | * shmem_zero_setup - setup a shared anonymous mapping | ||
| 100 | * | ||
| 101 | * @vma: the vma to be mmapped is prepared by do_mmap_pgoff | ||
| 102 | */ | ||
| 103 | int shmem_zero_setup(struct vm_area_struct *vma) | ||
| 104 | { | ||
| 105 | struct file *file; | ||
| 106 | loff_t size = vma->vm_end - vma->vm_start; | ||
| 107 | |||
| 108 | file = shmem_file_setup("dev/zero", size, vma->vm_flags); | ||
| 109 | if (IS_ERR(file)) | ||
| 110 | return PTR_ERR(file); | ||
| 111 | |||
| 112 | if (vma->vm_file) | ||
| 113 | fput(vma->vm_file); | ||
| 114 | vma->vm_file = file; | ||
| 115 | vma->vm_ops = &generic_file_vm_ops; | ||
| 116 | return 0; | ||
| 117 | } | ||
| 118 | |||
| 119 | int shmem_unuse(swp_entry_t entry, struct page *page) | ||
| 120 | { | ||
| 121 | return 0; | ||
| 122 | } | ||
diff --git a/mm/truncate.c b/mm/truncate.c new file mode 100644 index 000000000000..c9a63f0b69a2 --- /dev/null +++ b/mm/truncate.c | |||
| @@ -0,0 +1,336 @@ | |||
| 1 | /* | ||
| 2 | * mm/truncate.c - code for taking down pages from address_spaces | ||
| 3 | * | ||
| 4 | * Copyright (C) 2002, Linus Torvalds | ||
| 5 | * | ||
| 6 | * 10Sep2002 akpm@zip.com.au | ||
| 7 | * Initial version. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/kernel.h> | ||
| 11 | #include <linux/mm.h> | ||
| 12 | #include <linux/module.h> | ||
| 13 | #include <linux/pagemap.h> | ||
| 14 | #include <linux/pagevec.h> | ||
| 15 | #include <linux/buffer_head.h> /* grr. try_to_release_page, | ||
| 16 | block_invalidatepage */ | ||
| 17 | |||
| 18 | |||
| 19 | static int do_invalidatepage(struct page *page, unsigned long offset) | ||
| 20 | { | ||
| 21 | int (*invalidatepage)(struct page *, unsigned long); | ||
| 22 | invalidatepage = page->mapping->a_ops->invalidatepage; | ||
| 23 | if (invalidatepage == NULL) | ||
| 24 | invalidatepage = block_invalidatepage; | ||
| 25 | return (*invalidatepage)(page, offset); | ||
| 26 | } | ||
| 27 | |||
| 28 | static inline void truncate_partial_page(struct page *page, unsigned partial) | ||
| 29 | { | ||
| 30 | memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); | ||
| 31 | if (PagePrivate(page)) | ||
| 32 | do_invalidatepage(page, partial); | ||
| 33 | } | ||
| 34 | |||
| 35 | /* | ||
| 36 | * If truncate cannot remove the fs-private metadata from the page, the page | ||
| 37 | * becomes anonymous. It will be left on the LRU and may even be mapped into | ||
| 38 | * user pagetables if we're racing with filemap_nopage(). | ||
| 39 | * | ||
| 40 | * We need to bale out if page->mapping is no longer equal to the original | ||
| 41 | * mapping. This happens a) when the VM reclaimed the page while we waited on | ||
| 42 | * its lock, b) when a concurrent invalidate_inode_pages got there first and | ||
| 43 | * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. | ||
| 44 | */ | ||
| 45 | static void | ||
| 46 | truncate_complete_page(struct address_space *mapping, struct page *page) | ||
| 47 | { | ||
| 48 | if (page->mapping != mapping) | ||
| 49 | return; | ||
| 50 | |||
| 51 | if (PagePrivate(page)) | ||
| 52 | do_invalidatepage(page, 0); | ||
| 53 | |||
| 54 | clear_page_dirty(page); | ||
| 55 | ClearPageUptodate(page); | ||
| 56 | ClearPageMappedToDisk(page); | ||
| 57 | remove_from_page_cache(page); | ||
| 58 | page_cache_release(page); /* pagecache ref */ | ||
| 59 | } | ||
| 60 | |||
| 61 | /* | ||
| 62 | * This is for invalidate_inode_pages(). That function can be called at | ||
| 63 | * any time, and is not supposed to throw away dirty pages. But pages can | ||
| 64 | * be marked dirty at any time too. So we re-check the dirtiness inside | ||
| 65 | * ->tree_lock. That provides exclusion against the __set_page_dirty | ||
| 66 | * functions. | ||
| 67 | * | ||
| 68 | * Returns non-zero if the page was successfully invalidated. | ||
| 69 | */ | ||
| 70 | static int | ||
| 71 | invalidate_complete_page(struct address_space *mapping, struct page *page) | ||
| 72 | { | ||
| 73 | if (page->mapping != mapping) | ||
| 74 | return 0; | ||
| 75 | |||
| 76 | if (PagePrivate(page) && !try_to_release_page(page, 0)) | ||
| 77 | return 0; | ||
| 78 | |||
| 79 | write_lock_irq(&mapping->tree_lock); | ||
| 80 | if (PageDirty(page)) { | ||
| 81 | write_unlock_irq(&mapping->tree_lock); | ||
| 82 | return 0; | ||
| 83 | } | ||
| 84 | |||
| 85 | BUG_ON(PagePrivate(page)); | ||
| 86 | __remove_from_page_cache(page); | ||
| 87 | write_unlock_irq(&mapping->tree_lock); | ||
| 88 | ClearPageUptodate(page); | ||
| 89 | page_cache_release(page); /* pagecache ref */ | ||
| 90 | return 1; | ||
| 91 | } | ||
| 92 | |||
| 93 | /** | ||
| 94 | * truncate_inode_pages - truncate *all* the pages from an offset | ||
| 95 | * @mapping: mapping to truncate | ||
| 96 | * @lstart: offset from which to truncate | ||
| 97 | * | ||
| 98 | * Truncate the page cache at a set offset, removing the pages that are beyond | ||
| 99 | * that offset (and zeroing out partial pages). | ||
| 100 | * | ||
| 101 | * Truncate takes two passes - the first pass is nonblocking. It will not | ||
| 102 | * block on page locks and it will not block on writeback. The second pass | ||
| 103 | * will wait. This is to prevent as much IO as possible in the affected region. | ||
| 104 | * The first pass will remove most pages, so the search cost of the second pass | ||
| 105 | * is low. | ||
| 106 | * | ||
| 107 | * When looking at page->index outside the page lock we need to be careful to | ||
| 108 | * copy it into a local to avoid races (it could change at any time). | ||
| 109 | * | ||
| 110 | * We pass down the cache-hot hint to the page freeing code. Even if the | ||
| 111 | * mapping is large, it is probably the case that the final pages are the most | ||
| 112 | * recently touched, and freeing happens in ascending file offset order. | ||
| 113 | * | ||
| 114 | * Called under (and serialised by) inode->i_sem. | ||
| 115 | */ | ||
| 116 | void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | ||
| 117 | { | ||
| 118 | const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; | ||
| 119 | const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); | ||
| 120 | struct pagevec pvec; | ||
| 121 | pgoff_t next; | ||
| 122 | int i; | ||
| 123 | |||
| 124 | if (mapping->nrpages == 0) | ||
| 125 | return; | ||
| 126 | |||
| 127 | pagevec_init(&pvec, 0); | ||
| 128 | next = start; | ||
| 129 | while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | ||
| 130 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
| 131 | struct page *page = pvec.pages[i]; | ||
| 132 | pgoff_t page_index = page->index; | ||
| 133 | |||
| 134 | if (page_index > next) | ||
| 135 | next = page_index; | ||
| 136 | next++; | ||
| 137 | if (TestSetPageLocked(page)) | ||
| 138 | continue; | ||
| 139 | if (PageWriteback(page)) { | ||
| 140 | unlock_page(page); | ||
| 141 | continue; | ||
| 142 | } | ||
| 143 | truncate_complete_page(mapping, page); | ||
| 144 | unlock_page(page); | ||
| 145 | } | ||
| 146 | pagevec_release(&pvec); | ||
| 147 | cond_resched(); | ||
| 148 | } | ||
| 149 | |||
| 150 | if (partial) { | ||
| 151 | struct page *page = find_lock_page(mapping, start - 1); | ||
| 152 | if (page) { | ||
| 153 | wait_on_page_writeback(page); | ||
| 154 | truncate_partial_page(page, partial); | ||
| 155 | unlock_page(page); | ||
| 156 | page_cache_release(page); | ||
| 157 | } | ||
| 158 | } | ||
| 159 | |||
| 160 | next = start; | ||
| 161 | for ( ; ; ) { | ||
| 162 | cond_resched(); | ||
| 163 | if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | ||
| 164 | if (next == start) | ||
| 165 | break; | ||
| 166 | next = start; | ||
| 167 | continue; | ||
| 168 | } | ||
| 169 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
| 170 | struct page *page = pvec.pages[i]; | ||
| 171 | |||
| 172 | lock_page(page); | ||
| 173 | wait_on_page_writeback(page); | ||
| 174 | if (page->index > next) | ||
| 175 | next = page->index; | ||
| 176 | next++; | ||
| 177 | truncate_complete_page(mapping, page); | ||
| 178 | unlock_page(page); | ||
| 179 | } | ||
| 180 | pagevec_release(&pvec); | ||
| 181 | } | ||
| 182 | } | ||
| 183 | |||
| 184 | EXPORT_SYMBOL(truncate_inode_pages); | ||
| 185 | |||
| 186 | /** | ||
| 187 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode | ||
| 188 | * @mapping: the address_space which holds the pages to invalidate | ||
| 189 | * @start: the offset 'from' which to invalidate | ||
| 190 | * @end: the offset 'to' which to invalidate (inclusive) | ||
| 191 | * | ||
| 192 | * This function only removes the unlocked pages, if you want to | ||
| 193 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
| 194 | * | ||
| 195 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
| 196 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
| 197 | * pagetables. | ||
| 198 | */ | ||
| 199 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
| 200 | pgoff_t start, pgoff_t end) | ||
| 201 | { | ||
| 202 | struct pagevec pvec; | ||
| 203 | pgoff_t next = start; | ||
| 204 | unsigned long ret = 0; | ||
| 205 | int i; | ||
| 206 | |||
| 207 | pagevec_init(&pvec, 0); | ||
| 208 | while (next <= end && | ||
| 209 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | ||
| 210 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
| 211 | struct page *page = pvec.pages[i]; | ||
| 212 | |||
| 213 | if (TestSetPageLocked(page)) { | ||
| 214 | next++; | ||
| 215 | continue; | ||
| 216 | } | ||
| 217 | if (page->index > next) | ||
| 218 | next = page->index; | ||
| 219 | next++; | ||
| 220 | if (PageDirty(page) || PageWriteback(page)) | ||
| 221 | goto unlock; | ||
| 222 | if (page_mapped(page)) | ||
| 223 | goto unlock; | ||
| 224 | ret += invalidate_complete_page(mapping, page); | ||
| 225 | unlock: | ||
| 226 | unlock_page(page); | ||
| 227 | if (next > end) | ||
| 228 | break; | ||
| 229 | } | ||
| 230 | pagevec_release(&pvec); | ||
| 231 | cond_resched(); | ||
| 232 | } | ||
| 233 | return ret; | ||
| 234 | } | ||
| 235 | |||
| 236 | unsigned long invalidate_inode_pages(struct address_space *mapping) | ||
| 237 | { | ||
| 238 | return invalidate_mapping_pages(mapping, 0, ~0UL); | ||
| 239 | } | ||
| 240 | |||
| 241 | EXPORT_SYMBOL(invalidate_inode_pages); | ||
| 242 | |||
| 243 | /** | ||
| 244 | * invalidate_inode_pages2_range - remove range of pages from an address_space | ||
| 245 | * @mapping - the address_space | ||
| 246 | * @start: the page offset 'from' which to invalidate | ||
| 247 | * @end: the page offset 'to' which to invalidate (inclusive) | ||
| 248 | * | ||
| 249 | * Any pages which are found to be mapped into pagetables are unmapped prior to | ||
| 250 | * invalidation. | ||
| 251 | * | ||
| 252 | * Returns -EIO if any pages could not be invalidated. | ||
| 253 | */ | ||
| 254 | int invalidate_inode_pages2_range(struct address_space *mapping, | ||
| 255 | pgoff_t start, pgoff_t end) | ||
| 256 | { | ||
| 257 | struct pagevec pvec; | ||
| 258 | pgoff_t next; | ||
| 259 | int i; | ||
| 260 | int ret = 0; | ||
| 261 | int did_range_unmap = 0; | ||
| 262 | int wrapped = 0; | ||
| 263 | |||
| 264 | pagevec_init(&pvec, 0); | ||
| 265 | next = start; | ||
| 266 | while (next <= end && !ret && !wrapped && | ||
| 267 | pagevec_lookup(&pvec, mapping, next, | ||
| 268 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | ||
| 269 | for (i = 0; !ret && i < pagevec_count(&pvec); i++) { | ||
| 270 | struct page *page = pvec.pages[i]; | ||
| 271 | pgoff_t page_index; | ||
| 272 | int was_dirty; | ||
| 273 | |||
| 274 | lock_page(page); | ||
| 275 | if (page->mapping != mapping) { | ||
| 276 | unlock_page(page); | ||
| 277 | continue; | ||
| 278 | } | ||
| 279 | page_index = page->index; | ||
| 280 | next = page_index + 1; | ||
| 281 | if (next == 0) | ||
| 282 | wrapped = 1; | ||
| 283 | if (page_index > end) { | ||
| 284 | unlock_page(page); | ||
| 285 | break; | ||
| 286 | } | ||
| 287 | wait_on_page_writeback(page); | ||
| 288 | while (page_mapped(page)) { | ||
| 289 | if (!did_range_unmap) { | ||
| 290 | /* | ||
| 291 | * Zap the rest of the file in one hit. | ||
| 292 | */ | ||
| 293 | unmap_mapping_range(mapping, | ||
| 294 | page_index << PAGE_CACHE_SHIFT, | ||
| 295 | (end - page_index + 1) | ||
| 296 | << PAGE_CACHE_SHIFT, | ||
| 297 | 0); | ||
| 298 | did_range_unmap = 1; | ||
| 299 | } else { | ||
| 300 | /* | ||
| 301 | * Just zap this page | ||
| 302 | */ | ||
| 303 | unmap_mapping_range(mapping, | ||
| 304 | page_index << PAGE_CACHE_SHIFT, | ||
| 305 | PAGE_CACHE_SIZE, 0); | ||
| 306 | } | ||
| 307 | } | ||
| 308 | was_dirty = test_clear_page_dirty(page); | ||
| 309 | if (!invalidate_complete_page(mapping, page)) { | ||
| 310 | if (was_dirty) | ||
| 311 | set_page_dirty(page); | ||
| 312 | ret = -EIO; | ||
| 313 | } | ||
| 314 | unlock_page(page); | ||
| 315 | } | ||
| 316 | pagevec_release(&pvec); | ||
| 317 | cond_resched(); | ||
| 318 | } | ||
| 319 | return ret; | ||
| 320 | } | ||
| 321 | EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); | ||
| 322 | |||
| 323 | /** | ||
| 324 | * invalidate_inode_pages2 - remove all pages from an address_space | ||
| 325 | * @mapping - the address_space | ||
| 326 | * | ||
| 327 | * Any pages which are found to be mapped into pagetables are unmapped prior to | ||
| 328 | * invalidation. | ||
| 329 | * | ||
| 330 | * Returns -EIO if any pages could not be invalidated. | ||
| 331 | */ | ||
| 332 | int invalidate_inode_pages2(struct address_space *mapping) | ||
| 333 | { | ||
| 334 | return invalidate_inode_pages2_range(mapping, 0, -1); | ||
| 335 | } | ||
| 336 | EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | ||
diff --git a/mm/vmalloc.c b/mm/vmalloc.c new file mode 100644 index 000000000000..c6182f6f1305 --- /dev/null +++ b/mm/vmalloc.c | |||
| @@ -0,0 +1,588 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/vmalloc.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1993 Linus Torvalds | ||
| 5 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | ||
| 6 | * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 | ||
| 7 | * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/mm.h> | ||
| 11 | #include <linux/module.h> | ||
| 12 | #include <linux/highmem.h> | ||
| 13 | #include <linux/slab.h> | ||
| 14 | #include <linux/spinlock.h> | ||
| 15 | #include <linux/interrupt.h> | ||
| 16 | |||
| 17 | #include <linux/vmalloc.h> | ||
| 18 | |||
| 19 | #include <asm/uaccess.h> | ||
| 20 | #include <asm/tlbflush.h> | ||
| 21 | |||
| 22 | |||
/*
 * NOTE(review): vmlist is presumably the head of the list of vm_struct
 * areas, with vmlist_lock guarding it - confirm against the users of both.
 */
DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;
| 25 | |||
| 26 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | ||
| 27 | { | ||
| 28 | pte_t *pte; | ||
| 29 | |||
| 30 | pte = pte_offset_kernel(pmd, addr); | ||
| 31 | do { | ||
| 32 | pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); | ||
| 33 | WARN_ON(!pte_none(ptent) && !pte_present(ptent)); | ||
| 34 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
| 35 | } | ||
| 36 | |||
| 37 | static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, | ||
| 38 | unsigned long end) | ||
| 39 | { | ||
| 40 | pmd_t *pmd; | ||
| 41 | unsigned long next; | ||
| 42 | |||
| 43 | pmd = pmd_offset(pud, addr); | ||
| 44 | do { | ||
| 45 | next = pmd_addr_end(addr, end); | ||
| 46 | if (pmd_none_or_clear_bad(pmd)) | ||
| 47 | continue; | ||
| 48 | vunmap_pte_range(pmd, addr, next); | ||
| 49 | } while (pmd++, addr = next, addr != end); | ||
| 50 | } | ||
| 51 | |||
| 52 | static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, | ||
| 53 | unsigned long end) | ||
| 54 | { | ||
| 55 | pud_t *pud; | ||
| 56 | unsigned long next; | ||
| 57 | |||
| 58 | pud = pud_offset(pgd, addr); | ||
| 59 | do { | ||
| 60 | next = pud_addr_end(addr, end); | ||
| 61 | if (pud_none_or_clear_bad(pud)) | ||
| 62 | continue; | ||
| 63 | vunmap_pmd_range(pud, addr, next); | ||
| 64 | } while (pud++, addr = next, addr != end); | ||
| 65 | } | ||
| 66 | |||
| 67 | void unmap_vm_area(struct vm_struct *area) | ||
| 68 | { | ||
| 69 | pgd_t *pgd; | ||
| 70 | unsigned long next; | ||
| 71 | unsigned long addr = (unsigned long) area->addr; | ||
| 72 | unsigned long end = addr + area->size; | ||
| 73 | |||
| 74 | BUG_ON(addr >= end); | ||
| 75 | pgd = pgd_offset_k(addr); | ||
| 76 | flush_cache_vunmap(addr, end); | ||
| 77 | do { | ||
| 78 | next = pgd_addr_end(addr, end); | ||
| 79 | if (pgd_none_or_clear_bad(pgd)) | ||
| 80 | continue; | ||
| 81 | vunmap_pud_range(pgd, addr, next); | ||
| 82 | } while (pgd++, addr = next, addr != end); | ||
| 83 | flush_tlb_kernel_range((unsigned long) area->addr, end); | ||
| 84 | } | ||
| 85 | |||
| 86 | static int vmap_pte_range(pmd_t *pmd, unsigned long addr, | ||
| 87 | unsigned long end, pgprot_t prot, struct page ***pages) | ||
| 88 | { | ||
| 89 | pte_t *pte; | ||
| 90 | |||
| 91 | pte = pte_alloc_kernel(&init_mm, pmd, addr); | ||
| 92 | if (!pte) | ||
| 93 | return -ENOMEM; | ||
| 94 | do { | ||
| 95 | struct page *page = **pages; | ||
| 96 | WARN_ON(!pte_none(*pte)); | ||
| 97 | if (!page) | ||
| 98 | return -ENOMEM; | ||
| 99 | set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); | ||
| 100 | (*pages)++; | ||
| 101 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
| 102 | return 0; | ||
| 103 | } | ||
| 104 | |||
| 105 | static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, | ||
| 106 | unsigned long end, pgprot_t prot, struct page ***pages) | ||
| 107 | { | ||
| 108 | pmd_t *pmd; | ||
| 109 | unsigned long next; | ||
| 110 | |||
| 111 | pmd = pmd_alloc(&init_mm, pud, addr); | ||
| 112 | if (!pmd) | ||
| 113 | return -ENOMEM; | ||
| 114 | do { | ||
| 115 | next = pmd_addr_end(addr, end); | ||
| 116 | if (vmap_pte_range(pmd, addr, next, prot, pages)) | ||
| 117 | return -ENOMEM; | ||
| 118 | } while (pmd++, addr = next, addr != end); | ||
| 119 | return 0; | ||
| 120 | } | ||
| 121 | |||
| 122 | static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, | ||
| 123 | unsigned long end, pgprot_t prot, struct page ***pages) | ||
| 124 | { | ||
| 125 | pud_t *pud; | ||
| 126 | unsigned long next; | ||
| 127 | |||
| 128 | pud = pud_alloc(&init_mm, pgd, addr); | ||
| 129 | if (!pud) | ||
| 130 | return -ENOMEM; | ||
| 131 | do { | ||
| 132 | next = pud_addr_end(addr, end); | ||
| 133 | if (vmap_pmd_range(pud, addr, next, prot, pages)) | ||
| 134 | return -ENOMEM; | ||
| 135 | } while (pud++, addr = next, addr != end); | ||
| 136 | return 0; | ||
| 137 | } | ||
| 138 | |||
| 139 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | ||
| 140 | { | ||
| 141 | pgd_t *pgd; | ||
| 142 | unsigned long next; | ||
| 143 | unsigned long addr = (unsigned long) area->addr; | ||
| 144 | unsigned long end = addr + area->size - PAGE_SIZE; | ||
| 145 | int err; | ||
| 146 | |||
| 147 | BUG_ON(addr >= end); | ||
| 148 | pgd = pgd_offset_k(addr); | ||
| 149 | spin_lock(&init_mm.page_table_lock); | ||
| 150 | do { | ||
| 151 | next = pgd_addr_end(addr, end); | ||
| 152 | err = vmap_pud_range(pgd, addr, next, prot, pages); | ||
| 153 | if (err) | ||
| 154 | break; | ||
| 155 | } while (pgd++, addr = next, addr != end); | ||
| 156 | spin_unlock(&init_mm.page_table_lock); | ||
| 157 | flush_cache_vmap((unsigned long) area->addr, end); | ||
| 158 | return err; | ||
| 159 | } | ||
| 160 | |||
| 161 | #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ | ||
| 162 | |||
| 163 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | ||
| 164 | unsigned long start, unsigned long end) | ||
| 165 | { | ||
| 166 | struct vm_struct **p, *tmp, *area; | ||
| 167 | unsigned long align = 1; | ||
| 168 | unsigned long addr; | ||
| 169 | |||
| 170 | if (flags & VM_IOREMAP) { | ||
| 171 | int bit = fls(size); | ||
| 172 | |||
| 173 | if (bit > IOREMAP_MAX_ORDER) | ||
| 174 | bit = IOREMAP_MAX_ORDER; | ||
| 175 | else if (bit < PAGE_SHIFT) | ||
| 176 | bit = PAGE_SHIFT; | ||
| 177 | |||
| 178 | align = 1ul << bit; | ||
| 179 | } | ||
| 180 | addr = ALIGN(start, align); | ||
| 181 | size = PAGE_ALIGN(size); | ||
| 182 | |||
| 183 | area = kmalloc(sizeof(*area), GFP_KERNEL); | ||
| 184 | if (unlikely(!area)) | ||
| 185 | return NULL; | ||
| 186 | |||
| 187 | if (unlikely(!size)) { | ||
| 188 | kfree (area); | ||
| 189 | return NULL; | ||
| 190 | } | ||
| 191 | |||
| 192 | /* | ||
| 193 | * We always allocate a guard page. | ||
| 194 | */ | ||
| 195 | size += PAGE_SIZE; | ||
| 196 | |||
| 197 | write_lock(&vmlist_lock); | ||
| 198 | for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { | ||
| 199 | if ((unsigned long)tmp->addr < addr) { | ||
| 200 | if((unsigned long)tmp->addr + tmp->size >= addr) | ||
| 201 | addr = ALIGN(tmp->size + | ||
| 202 | (unsigned long)tmp->addr, align); | ||
| 203 | continue; | ||
| 204 | } | ||
| 205 | if ((size + addr) < addr) | ||
| 206 | goto out; | ||
| 207 | if (size + addr <= (unsigned long)tmp->addr) | ||
| 208 | goto found; | ||
| 209 | addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align); | ||
| 210 | if (addr > end - size) | ||
| 211 | goto out; | ||
| 212 | } | ||
| 213 | |||
| 214 | found: | ||
| 215 | area->next = *p; | ||
| 216 | *p = area; | ||
| 217 | |||
| 218 | area->flags = flags; | ||
| 219 | area->addr = (void *)addr; | ||
| 220 | area->size = size; | ||
| 221 | area->pages = NULL; | ||
| 222 | area->nr_pages = 0; | ||
| 223 | area->phys_addr = 0; | ||
| 224 | write_unlock(&vmlist_lock); | ||
| 225 | |||
| 226 | return area; | ||
| 227 | |||
| 228 | out: | ||
| 229 | write_unlock(&vmlist_lock); | ||
| 230 | kfree(area); | ||
| 231 | if (printk_ratelimit()) | ||
| 232 | printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n"); | ||
| 233 | return NULL; | ||
| 234 | } | ||
| 235 | |||
/**
 *	get_vm_area  -  reserve a contiguous kernel virtual area
 *
 *	@size:		size of the area
 *	@flags:		%VM_IOREMAP for I/O mappings or VM_ALLOC
 *
 *	Search for an area of @size in the kernel virtual mapping area
 *	(between VMALLOC_START and VMALLOC_END) and reserve it for our
 *	purposes.  Returns the area descriptor on success or %NULL on
 *	failure.
 */
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
	return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
}
| 250 | |||
| 251 | /** | ||
| 252 | * remove_vm_area - find and remove a contingous kernel virtual area | ||
| 253 | * | ||
| 254 | * @addr: base address | ||
| 255 | * | ||
| 256 | * Search for the kernel VM area starting at @addr, and remove it. | ||
| 257 | * This function returns the found VM area, but using it is NOT safe | ||
| 258 | * on SMP machines. | ||
| 259 | */ | ||
| 260 | struct vm_struct *remove_vm_area(void *addr) | ||
| 261 | { | ||
| 262 | struct vm_struct **p, *tmp; | ||
| 263 | |||
| 264 | write_lock(&vmlist_lock); | ||
| 265 | for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) { | ||
| 266 | if (tmp->addr == addr) | ||
| 267 | goto found; | ||
| 268 | } | ||
| 269 | write_unlock(&vmlist_lock); | ||
| 270 | return NULL; | ||
| 271 | |||
| 272 | found: | ||
| 273 | unmap_vm_area(tmp); | ||
| 274 | *p = tmp->next; | ||
| 275 | write_unlock(&vmlist_lock); | ||
| 276 | |||
| 277 | /* | ||
| 278 | * Remove the guard page. | ||
| 279 | */ | ||
| 280 | tmp->size -= PAGE_SIZE; | ||
| 281 | return tmp; | ||
| 282 | } | ||
| 283 | |||
| 284 | void __vunmap(void *addr, int deallocate_pages) | ||
| 285 | { | ||
| 286 | struct vm_struct *area; | ||
| 287 | |||
| 288 | if (!addr) | ||
| 289 | return; | ||
| 290 | |||
| 291 | if ((PAGE_SIZE-1) & (unsigned long)addr) { | ||
| 292 | printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); | ||
| 293 | WARN_ON(1); | ||
| 294 | return; | ||
| 295 | } | ||
| 296 | |||
| 297 | area = remove_vm_area(addr); | ||
| 298 | if (unlikely(!area)) { | ||
| 299 | printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", | ||
| 300 | addr); | ||
| 301 | WARN_ON(1); | ||
| 302 | return; | ||
| 303 | } | ||
| 304 | |||
| 305 | if (deallocate_pages) { | ||
| 306 | int i; | ||
| 307 | |||
| 308 | for (i = 0; i < area->nr_pages; i++) { | ||
| 309 | if (unlikely(!area->pages[i])) | ||
| 310 | BUG(); | ||
| 311 | __free_page(area->pages[i]); | ||
| 312 | } | ||
| 313 | |||
| 314 | if (area->nr_pages > PAGE_SIZE/sizeof(struct page *)) | ||
| 315 | vfree(area->pages); | ||
| 316 | else | ||
| 317 | kfree(area->pages); | ||
| 318 | } | ||
| 319 | |||
| 320 | kfree(area); | ||
| 321 | return; | ||
| 322 | } | ||
| 323 | |||
/**
 *	vfree  -  release memory allocated by vmalloc()
 *
 *	@addr:		memory base address
 *
 *	Free the virtually contiguous memory area starting at @addr, as
 *	obtained from vmalloc(), vmalloc_32() or __vmalloc().
 *
 *	May not be called in interrupt context; doing so is a BUG.
 */
void vfree(void *addr)
{
	BUG_ON(in_interrupt());
	/* Second argument 1: also free the pages backing the area. */
	__vunmap(addr, 1);
}

EXPORT_SYMBOL(vfree);
| 341 | |||
/**
 *	vunmap  -  release virtual mapping obtained by vmap()
 *
 *	@addr:		memory base address
 *
 *	Free the virtually contiguous memory area starting at @addr,
 *	which was created from the page array passed to vmap().
 *
 *	May not be called in interrupt context; doing so is a BUG.
 */
void vunmap(void *addr)
{
	BUG_ON(in_interrupt());
	/* Second argument 0: leave the pages themselves alone. */
	__vunmap(addr, 0);
}

EXPORT_SYMBOL(vunmap);
| 359 | |||
| 360 | /** | ||
| 361 | * vmap - map an array of pages into virtually contiguous space | ||
| 362 | * | ||
| 363 | * @pages: array of page pointers | ||
| 364 | * @count: number of pages to map | ||
| 365 | * @flags: vm_area->flags | ||
| 366 | * @prot: page protection for the mapping | ||
| 367 | * | ||
| 368 | * Maps @count pages from @pages into contiguous kernel virtual | ||
| 369 | * space. | ||
| 370 | */ | ||
| 371 | void *vmap(struct page **pages, unsigned int count, | ||
| 372 | unsigned long flags, pgprot_t prot) | ||
| 373 | { | ||
| 374 | struct vm_struct *area; | ||
| 375 | |||
| 376 | if (count > num_physpages) | ||
| 377 | return NULL; | ||
| 378 | |||
| 379 | area = get_vm_area((count << PAGE_SHIFT), flags); | ||
| 380 | if (!area) | ||
| 381 | return NULL; | ||
| 382 | if (map_vm_area(area, prot, &pages)) { | ||
| 383 | vunmap(area->addr); | ||
| 384 | return NULL; | ||
| 385 | } | ||
| 386 | |||
| 387 | return area->addr; | ||
| 388 | } | ||
| 389 | |||
| 390 | EXPORT_SYMBOL(vmap); | ||
| 391 | |||
| 392 | void *__vmalloc_area(struct vm_struct *area, unsigned int __nocast gfp_mask, pgprot_t prot) | ||
| 393 | { | ||
| 394 | struct page **pages; | ||
| 395 | unsigned int nr_pages, array_size, i; | ||
| 396 | |||
| 397 | nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; | ||
| 398 | array_size = (nr_pages * sizeof(struct page *)); | ||
| 399 | |||
| 400 | area->nr_pages = nr_pages; | ||
| 401 | /* Please note that the recursion is strictly bounded. */ | ||
| 402 | if (array_size > PAGE_SIZE) | ||
| 403 | pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL); | ||
| 404 | else | ||
| 405 | pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM)); | ||
| 406 | area->pages = pages; | ||
| 407 | if (!area->pages) { | ||
| 408 | remove_vm_area(area->addr); | ||
| 409 | kfree(area); | ||
| 410 | return NULL; | ||
| 411 | } | ||
| 412 | memset(area->pages, 0, array_size); | ||
| 413 | |||
| 414 | for (i = 0; i < area->nr_pages; i++) { | ||
| 415 | area->pages[i] = alloc_page(gfp_mask); | ||
| 416 | if (unlikely(!area->pages[i])) { | ||
| 417 | /* Successfully allocated i pages, free them in __vunmap() */ | ||
| 418 | area->nr_pages = i; | ||
| 419 | goto fail; | ||
| 420 | } | ||
| 421 | } | ||
| 422 | |||
| 423 | if (map_vm_area(area, prot, &pages)) | ||
| 424 | goto fail; | ||
| 425 | return area->addr; | ||
| 426 | |||
| 427 | fail: | ||
| 428 | vfree(area->addr); | ||
| 429 | return NULL; | ||
| 430 | } | ||
| 431 | |||
| 432 | /** | ||
| 433 | * __vmalloc - allocate virtually contiguous memory | ||
| 434 | * | ||
| 435 | * @size: allocation size | ||
| 436 | * @gfp_mask: flags for the page level allocator | ||
| 437 | * @prot: protection mask for the allocated pages | ||
| 438 | * | ||
| 439 | * Allocate enough pages to cover @size from the page level | ||
| 440 | * allocator with @gfp_mask flags. Map them into contiguous | ||
| 441 | * kernel virtual space, using a pagetable protection of @prot. | ||
| 442 | */ | ||
| 443 | void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask, pgprot_t prot) | ||
| 444 | { | ||
| 445 | struct vm_struct *area; | ||
| 446 | |||
| 447 | size = PAGE_ALIGN(size); | ||
| 448 | if (!size || (size >> PAGE_SHIFT) > num_physpages) | ||
| 449 | return NULL; | ||
| 450 | |||
| 451 | area = get_vm_area(size, VM_ALLOC); | ||
| 452 | if (!area) | ||
| 453 | return NULL; | ||
| 454 | |||
| 455 | return __vmalloc_area(area, gfp_mask, prot); | ||
| 456 | } | ||
| 457 | |||
| 458 | EXPORT_SYMBOL(__vmalloc); | ||
| 459 | |||
/**
 *	vmalloc  -  allocate virtually contiguous memory
 *
 *	@size:		allocation size
 *
 *	Allocate enough pages to cover @size from the page level
 *	allocator and map them into contiguous kernel virtual space.
 *
 *	For tight control over page level allocator and protection flags
 *	use __vmalloc() instead.
 */
void *vmalloc(unsigned long size)
{
	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
}

EXPORT_SYMBOL(vmalloc);
| 477 | |||
/**
 *	vmalloc_exec  -  allocate virtually contiguous, executable memory
 *
 *	@size:		allocation size
 *
 *	Kernel-internal function to allocate enough pages to cover @size
 *	from the page level allocator and map them into contiguous and
 *	executable kernel virtual space.
 *
 *	For tight control over page level allocator and protection flags
 *	use __vmalloc() instead.
 */

/* Fall back to PAGE_KERNEL when no separate executable protection is defined. */
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif

void *vmalloc_exec(unsigned long size)
{
	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
}
| 499 | |||
/**
 *	vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
 *
 *	@size:		allocation size
 *
 *	Allocate enough 32bit PA addressable pages to cover @size from the
 *	page level allocator and map them into contiguous kernel virtual space.
 *
 *	Unlike vmalloc(), the allocation is made with plain GFP_KERNEL, i.e.
 *	without __GFP_HIGHMEM.
 */
void *vmalloc_32(unsigned long size)
{
	return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
}

EXPORT_SYMBOL(vmalloc_32);
| 514 | |||
| 515 | long vread(char *buf, char *addr, unsigned long count) | ||
| 516 | { | ||
| 517 | struct vm_struct *tmp; | ||
| 518 | char *vaddr, *buf_start = buf; | ||
| 519 | unsigned long n; | ||
| 520 | |||
| 521 | /* Don't allow overflow */ | ||
| 522 | if ((unsigned long) addr + count < count) | ||
| 523 | count = -(unsigned long) addr; | ||
| 524 | |||
| 525 | read_lock(&vmlist_lock); | ||
| 526 | for (tmp = vmlist; tmp; tmp = tmp->next) { | ||
| 527 | vaddr = (char *) tmp->addr; | ||
| 528 | if (addr >= vaddr + tmp->size - PAGE_SIZE) | ||
| 529 | continue; | ||
| 530 | while (addr < vaddr) { | ||
| 531 | if (count == 0) | ||
| 532 | goto finished; | ||
| 533 | *buf = '\0'; | ||
| 534 | buf++; | ||
| 535 | addr++; | ||
| 536 | count--; | ||
| 537 | } | ||
| 538 | n = vaddr + tmp->size - PAGE_SIZE - addr; | ||
| 539 | do { | ||
| 540 | if (count == 0) | ||
| 541 | goto finished; | ||
| 542 | *buf = *addr; | ||
| 543 | buf++; | ||
| 544 | addr++; | ||
| 545 | count--; | ||
| 546 | } while (--n > 0); | ||
| 547 | } | ||
| 548 | finished: | ||
| 549 | read_unlock(&vmlist_lock); | ||
| 550 | return buf - buf_start; | ||
| 551 | } | ||
| 552 | |||
| 553 | long vwrite(char *buf, char *addr, unsigned long count) | ||
| 554 | { | ||
| 555 | struct vm_struct *tmp; | ||
| 556 | char *vaddr, *buf_start = buf; | ||
| 557 | unsigned long n; | ||
| 558 | |||
| 559 | /* Don't allow overflow */ | ||
| 560 | if ((unsigned long) addr + count < count) | ||
| 561 | count = -(unsigned long) addr; | ||
| 562 | |||
| 563 | read_lock(&vmlist_lock); | ||
| 564 | for (tmp = vmlist; tmp; tmp = tmp->next) { | ||
| 565 | vaddr = (char *) tmp->addr; | ||
| 566 | if (addr >= vaddr + tmp->size - PAGE_SIZE) | ||
| 567 | continue; | ||
| 568 | while (addr < vaddr) { | ||
| 569 | if (count == 0) | ||
| 570 | goto finished; | ||
| 571 | buf++; | ||
| 572 | addr++; | ||
| 573 | count--; | ||
| 574 | } | ||
| 575 | n = vaddr + tmp->size - PAGE_SIZE - addr; | ||
| 576 | do { | ||
| 577 | if (count == 0) | ||
| 578 | goto finished; | ||
| 579 | *addr = *buf; | ||
| 580 | buf++; | ||
| 581 | addr++; | ||
| 582 | count--; | ||
| 583 | } while (--n > 0); | ||
| 584 | } | ||
| 585 | finished: | ||
| 586 | read_unlock(&vmlist_lock); | ||
| 587 | return buf - buf_start; | ||
| 588 | } | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c new file mode 100644 index 000000000000..4003c0518d28 --- /dev/null +++ b/mm/vmscan.c | |||
| @@ -0,0 +1,1311 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/vmscan.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | ||
| 5 | * | ||
| 6 | * Swap reorganised 29.12.95, Stephen Tweedie. | ||
| 7 | * kswapd added: 7.1.96 sct | ||
| 8 | * Removed kswapd_ctl limits, and swap out as many pages as needed | ||
| 9 | * to bring the system back to freepages.high: 2.4.97, Rik van Riel. | ||
| 10 | * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). | ||
| 11 | * Multiqueue VM started 5.8.00, Rik van Riel. | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/mm.h> | ||
| 15 | #include <linux/module.h> | ||
| 16 | #include <linux/slab.h> | ||
| 17 | #include <linux/kernel_stat.h> | ||
| 18 | #include <linux/swap.h> | ||
| 19 | #include <linux/pagemap.h> | ||
| 20 | #include <linux/init.h> | ||
| 21 | #include <linux/highmem.h> | ||
| 22 | #include <linux/file.h> | ||
| 23 | #include <linux/writeback.h> | ||
| 24 | #include <linux/blkdev.h> | ||
| 25 | #include <linux/buffer_head.h> /* for try_to_release_page(), | ||
| 26 | buffer_heads_over_limit */ | ||
| 27 | #include <linux/mm_inline.h> | ||
| 28 | #include <linux/pagevec.h> | ||
| 29 | #include <linux/backing-dev.h> | ||
| 30 | #include <linux/rmap.h> | ||
| 31 | #include <linux/topology.h> | ||
| 32 | #include <linux/cpu.h> | ||
| 33 | #include <linux/cpuset.h> | ||
| 34 | #include <linux/notifier.h> | ||
| 35 | #include <linux/rwsem.h> | ||
| 36 | |||
| 37 | #include <asm/tlbflush.h> | ||
| 38 | #include <asm/div64.h> | ||
| 39 | |||
| 40 | #include <linux/swapops.h> | ||
| 41 | |||
/*
 * Possible outcomes of pageout().  Each value also tells the caller
 * whether the page is still locked when pageout() returns.
 */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;
| 53 | |||
/*
 * Parameters and running counters handed down through the page reclaim
 * functions (shrink_list(), among others, takes a pointer to one).
 */
struct scan_control {
	/* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
	unsigned long nr_to_scan;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Incremented by the number of pages reclaimed */
	unsigned long nr_reclaimed;

	unsigned long nr_mapped;	/* From page_state */

	/* How many pages shrink_cache() should reclaim */
	int nr_to_reclaim;

	/* Ask shrink_caches, or shrink_zone to scan at this priority */
	unsigned int priority;

	/* This context's GFP mask */
	unsigned int gfp_mask;

	/* NOTE(review): presumably gates ->writepage during reclaim — confirm */
	int may_writepage;

	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
	 * In this context, it doesn't matter that we scan the
	 * whole list at once. */
	int swap_cluster_max;
};
| 83 | |||
/*
 * One entry on the list of shrinker callbacks that are used to apply
 * pressure to ageable caches.
 */
struct shrinker {
	shrinker_t		shrinker;	/* the callback itself */
	struct list_head	list;		/* link in shrinker_list */
	int			seeks;	/* seeks to recreate an obj */
	long			nr;	/* objs pending delete */
};
| 94 | |||
/* The page whose ->lru member is the last entry (->prev) of list _head. */
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

/*
 * prefetch_prev_lru_page - prefetch member _field of the page that precedes
 * _page on its lru list, unless that predecessor is the list head _base.
 * Expands to nothing when the architecture does not define
 * ARCH_HAS_PREFETCH.
 */
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * prefetchw_prev_lru_page - same as prefetch_prev_lru_page() but issues
 * prefetchw() instead of prefetch().  Expands to nothing when the
 * architecture does not define ARCH_HAS_PREFETCHW.
 */
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
| 124 | |||
/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
/* NOTE(review): not set or read in this part of the file — confirm meaning */
static long total_memory;

/* Registered shrinkers; shrinker_rwsem serialises access to the list. */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
| 133 | |||
| 134 | /* | ||
| 135 | * Add a shrinker callback to be called from the vm | ||
| 136 | */ | ||
| 137 | struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) | ||
| 138 | { | ||
| 139 | struct shrinker *shrinker; | ||
| 140 | |||
| 141 | shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); | ||
| 142 | if (shrinker) { | ||
| 143 | shrinker->shrinker = theshrinker; | ||
| 144 | shrinker->seeks = seeks; | ||
| 145 | shrinker->nr = 0; | ||
| 146 | down_write(&shrinker_rwsem); | ||
| 147 | list_add_tail(&shrinker->list, &shrinker_list); | ||
| 148 | up_write(&shrinker_rwsem); | ||
| 149 | } | ||
| 150 | return shrinker; | ||
| 151 | } | ||
| 152 | EXPORT_SYMBOL(set_shrinker); | ||
| 153 | |||
/*
 * Unregister a shrinker: unlink it from shrinker_list under the write side
 * of shrinker_rwsem, then free it.  @shrinker must not be used afterwards.
 */
void remove_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
	kfree(shrinker);
}
EXPORT_SYMBOL(remove_shrinker);
| 165 | |||
| 166 | #define SHRINK_BATCH 128 | ||
| 167 | /* | ||
| 168 | * Call the shrink functions to age shrinkable caches | ||
| 169 | * | ||
| 170 | * Here we assume it costs one seek to replace a lru page and that it also | ||
| 171 | * takes a seek to recreate a cache object. With this in mind we age equal | ||
| 172 | * percentages of the lru and ageable caches. This should balance the seeks | ||
| 173 | * generated by these structures. | ||
| 174 | * | ||
| 175 | * If the vm encounted mapped pages on the LRU it increase the pressure on | ||
| 176 | * slab to avoid swapping. | ||
| 177 | * | ||
| 178 | * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. | ||
| 179 | * | ||
| 180 | * `lru_pages' represents the number of on-LRU pages in all the zones which | ||
| 181 | * are eligible for the caller's allocation attempt. It is used for balancing | ||
| 182 | * slab reclaim versus page reclaim. | ||
| 183 | */ | ||
| 184 | static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, | ||
| 185 | unsigned long lru_pages) | ||
| 186 | { | ||
| 187 | struct shrinker *shrinker; | ||
| 188 | |||
| 189 | if (scanned == 0) | ||
| 190 | scanned = SWAP_CLUSTER_MAX; | ||
| 191 | |||
| 192 | if (!down_read_trylock(&shrinker_rwsem)) | ||
| 193 | return 0; | ||
| 194 | |||
| 195 | list_for_each_entry(shrinker, &shrinker_list, list) { | ||
| 196 | unsigned long long delta; | ||
| 197 | unsigned long total_scan; | ||
| 198 | |||
| 199 | delta = (4 * scanned) / shrinker->seeks; | ||
| 200 | delta *= (*shrinker->shrinker)(0, gfp_mask); | ||
| 201 | do_div(delta, lru_pages + 1); | ||
| 202 | shrinker->nr += delta; | ||
| 203 | if (shrinker->nr < 0) | ||
| 204 | shrinker->nr = LONG_MAX; /* It wrapped! */ | ||
| 205 | |||
| 206 | total_scan = shrinker->nr; | ||
| 207 | shrinker->nr = 0; | ||
| 208 | |||
| 209 | while (total_scan >= SHRINK_BATCH) { | ||
| 210 | long this_scan = SHRINK_BATCH; | ||
| 211 | int shrink_ret; | ||
| 212 | |||
| 213 | shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); | ||
| 214 | if (shrink_ret == -1) | ||
| 215 | break; | ||
| 216 | mod_page_state(slabs_scanned, this_scan); | ||
| 217 | total_scan -= this_scan; | ||
| 218 | |||
| 219 | cond_resched(); | ||
| 220 | } | ||
| 221 | |||
| 222 | shrinker->nr += total_scan; | ||
| 223 | } | ||
| 224 | up_read(&shrinker_rwsem); | ||
| 225 | return 0; | ||
| 226 | } | ||
| 227 | |||
/*
 * Report whether @page, or the file it belongs to, is mapped somewhere.
 * Called without lock on whether page is mapped, so answer is unstable.
 *
 * Returns non-zero when the page is in somebody's page tables, when it is
 * swapcache (we are more reluctant to reclaim swapcache than pagecache),
 * or when its mapping is mmap'd by somebody; 0 when it has no mapping.
 */
static inline int page_mapping_inuse(struct page *page)
{
	struct address_space *as;

	if (page_mapped(page) || PageSwapCache(page))
		return 1;

	as = page_mapping(page);
	return as ? mapping_mapped(as) : 0;
}
| 248 | |||
/*
 * True when exactly two references to @page remain once the one accounted
 * to PagePrivate, if that flag is set, has been discounted.
 */
static inline int is_page_cache_freeable(struct page *page)
{
	int refs = page_count(page);

	if (PagePrivate(page))
		refs--;
	return refs == 2;
}
| 253 | |||
| 254 | static int may_write_to_queue(struct backing_dev_info *bdi) | ||
| 255 | { | ||
| 256 | if (current_is_kswapd()) | ||
| 257 | return 1; | ||
| 258 | if (current_is_pdflush()) /* This is unlikely, but why not... */ | ||
| 259 | return 1; | ||
| 260 | if (!bdi_write_congested(bdi)) | ||
| 261 | return 1; | ||
| 262 | if (bdi == current->backing_dev_info) | ||
| 263 | return 1; | ||
| 264 | return 0; | ||
| 265 | } | ||
| 266 | |||
| 267 | /* | ||
| 268 | * We detected a synchronous write error writing a page out. Probably | ||
| 269 | * -ENOSPC. We need to propagate that into the address_space for a subsequent | ||
| 270 | * fsync(), msync() or close(). | ||
| 271 | * | ||
| 272 | * The tricky part is that after writepage we cannot touch the mapping: nothing | ||
| 273 | * prevents it from being freed up. But we have a ref on the page and once | ||
| 274 | * that page is locked, the mapping is pinned. | ||
| 275 | * | ||
| 276 | * We're allowed to run sleeping lock_page() here because we know the caller has | ||
| 277 | * __GFP_FS. | ||
| 278 | */ | ||
| 279 | static void handle_write_error(struct address_space *mapping, | ||
| 280 | struct page *page, int error) | ||
| 281 | { | ||
| 282 | lock_page(page); | ||
| 283 | if (page_mapping(page) == mapping) { | ||
| 284 | if (error == -ENOSPC) | ||
| 285 | set_bit(AS_ENOSPC, &mapping->flags); | ||
| 286 | else | ||
| 287 | set_bit(AS_EIO, &mapping->flags); | ||
| 288 | } | ||
| 289 | unlock_page(page); | ||
| 290 | } | ||
| 291 | |||
| 292 | /* | ||
| 293 | * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). | ||
| 294 | */ | ||
| 295 | static pageout_t pageout(struct page *page, struct address_space *mapping) | ||
| 296 | { | ||
| 297 | /* | ||
| 298 | * If the page is dirty, only perform writeback if that write | ||
| 299 | * will be non-blocking. To prevent this allocation from being | ||
| 300 | * stalled by pagecache activity. But note that there may be | ||
| 301 | * stalls if we need to run get_block(). We could test | ||
| 302 | * PagePrivate for that. | ||
| 303 | * | ||
| 304 | * If this process is currently in generic_file_write() against | ||
| 305 | * this page's queue, we can perform writeback even if that | ||
| 306 | * will block. | ||
| 307 | * | ||
| 308 | * If the page is swapcache, write it back even if that would | ||
| 309 | * block, for some throttling. This happens by accident, because | ||
| 310 | * swap_backing_dev_info is bust: it doesn't reflect the | ||
| 311 | * congestion state of the swapdevs. Easy to fix, if needed. | ||
| 312 | * See swapfile.c:page_queue_congested(). | ||
| 313 | */ | ||
| 314 | if (!is_page_cache_freeable(page)) | ||
| 315 | return PAGE_KEEP; | ||
| 316 | if (!mapping) { | ||
| 317 | /* | ||
| 318 | * Some data journaling orphaned pages can have | ||
| 319 | * page->mapping == NULL while being dirty with clean buffers. | ||
| 320 | */ | ||
| 321 | if (PageDirty(page) && PagePrivate(page)) { | ||
| 322 | if (try_to_free_buffers(page)) { | ||
| 323 | ClearPageDirty(page); | ||
| 324 | printk("%s: orphaned page\n", __FUNCTION__); | ||
| 325 | return PAGE_CLEAN; | ||
| 326 | } | ||
| 327 | } | ||
| 328 | return PAGE_KEEP; | ||
| 329 | } | ||
| 330 | if (mapping->a_ops->writepage == NULL) | ||
| 331 | return PAGE_ACTIVATE; | ||
| 332 | if (!may_write_to_queue(mapping->backing_dev_info)) | ||
| 333 | return PAGE_KEEP; | ||
| 334 | |||
| 335 | if (clear_page_dirty_for_io(page)) { | ||
| 336 | int res; | ||
| 337 | struct writeback_control wbc = { | ||
| 338 | .sync_mode = WB_SYNC_NONE, | ||
| 339 | .nr_to_write = SWAP_CLUSTER_MAX, | ||
| 340 | .nonblocking = 1, | ||
| 341 | .for_reclaim = 1, | ||
| 342 | }; | ||
| 343 | |||
| 344 | SetPageReclaim(page); | ||
| 345 | res = mapping->a_ops->writepage(page, &wbc); | ||
| 346 | if (res < 0) | ||
| 347 | handle_write_error(mapping, page, res); | ||
| 348 | if (res == WRITEPAGE_ACTIVATE) { | ||
| 349 | ClearPageReclaim(page); | ||
| 350 | return PAGE_ACTIVATE; | ||
| 351 | } | ||
| 352 | if (!PageWriteback(page)) { | ||
| 353 | /* synchronous write or broken a_ops? */ | ||
| 354 | ClearPageReclaim(page); | ||
| 355 | } | ||
| 356 | |||
| 357 | return PAGE_SUCCESS; | ||
| 358 | } | ||
| 359 | |||
| 360 | return PAGE_CLEAN; | ||
| 361 | } | ||
| 362 | |||
| 363 | /* | ||
| 364 | * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed | ||
| 365 | */ | ||
| 366 | static int shrink_list(struct list_head *page_list, struct scan_control *sc) | ||
| 367 | { | ||
| 368 | LIST_HEAD(ret_pages); | ||
| 369 | struct pagevec freed_pvec; | ||
| 370 | int pgactivate = 0; | ||
| 371 | int reclaimed = 0; | ||
| 372 | |||
| 373 | cond_resched(); | ||
| 374 | |||
| 375 | pagevec_init(&freed_pvec, 1); | ||
| 376 | while (!list_empty(page_list)) { | ||
| 377 | struct address_space *mapping; | ||
| 378 | struct page *page; | ||
| 379 | int may_enter_fs; | ||
| 380 | int referenced; | ||
| 381 | |||
| 382 | cond_resched(); | ||
| 383 | |||
| 384 | page = lru_to_page(page_list); | ||
| 385 | list_del(&page->lru); | ||
| 386 | |||
| 387 | if (TestSetPageLocked(page)) | ||
| 388 | goto keep; | ||
| 389 | |||
| 390 | BUG_ON(PageActive(page)); | ||
| 391 | |||
| 392 | sc->nr_scanned++; | ||
| 393 | /* Double the slab pressure for mapped and swapcache pages */ | ||
| 394 | if (page_mapped(page) || PageSwapCache(page)) | ||
| 395 | sc->nr_scanned++; | ||
| 396 | |||
| 397 | if (PageWriteback(page)) | ||
| 398 | goto keep_locked; | ||
| 399 | |||
| 400 | referenced = page_referenced(page, 1, sc->priority <= 0); | ||
| 401 | /* In active use or really unfreeable? Activate it. */ | ||
| 402 | if (referenced && page_mapping_inuse(page)) | ||
| 403 | goto activate_locked; | ||
| 404 | |||
| 405 | #ifdef CONFIG_SWAP | ||
| 406 | /* | ||
| 407 | * Anonymous process memory has backing store? | ||
| 408 | * Try to allocate it some swap space here. | ||
| 409 | */ | ||
| 410 | if (PageAnon(page) && !PageSwapCache(page)) { | ||
| 411 | if (!add_to_swap(page)) | ||
| 412 | goto activate_locked; | ||
| 413 | } | ||
| 414 | #endif /* CONFIG_SWAP */ | ||
| 415 | |||
| 416 | mapping = page_mapping(page); | ||
| 417 | may_enter_fs = (sc->gfp_mask & __GFP_FS) || | ||
| 418 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); | ||
| 419 | |||
| 420 | /* | ||
| 421 | * The page is mapped into the page tables of one or more | ||
| 422 | * processes. Try to unmap it here. | ||
| 423 | */ | ||
| 424 | if (page_mapped(page) && mapping) { | ||
| 425 | switch (try_to_unmap(page)) { | ||
| 426 | case SWAP_FAIL: | ||
| 427 | goto activate_locked; | ||
| 428 | case SWAP_AGAIN: | ||
| 429 | goto keep_locked; | ||
| 430 | case SWAP_SUCCESS: | ||
| 431 | ; /* try to free the page below */ | ||
| 432 | } | ||
| 433 | } | ||
| 434 | |||
| 435 | if (PageDirty(page)) { | ||
| 436 | if (referenced) | ||
| 437 | goto keep_locked; | ||
| 438 | if (!may_enter_fs) | ||
| 439 | goto keep_locked; | ||
| 440 | if (laptop_mode && !sc->may_writepage) | ||
| 441 | goto keep_locked; | ||
| 442 | |||
| 443 | /* Page is dirty, try to write it out here */ | ||
| 444 | switch(pageout(page, mapping)) { | ||
| 445 | case PAGE_KEEP: | ||
| 446 | goto keep_locked; | ||
| 447 | case PAGE_ACTIVATE: | ||
| 448 | goto activate_locked; | ||
| 449 | case PAGE_SUCCESS: | ||
| 450 | if (PageWriteback(page) || PageDirty(page)) | ||
| 451 | goto keep; | ||
| 452 | /* | ||
| 453 | * A synchronous write - probably a ramdisk. Go | ||
| 454 | * ahead and try to reclaim the page. | ||
| 455 | */ | ||
| 456 | if (TestSetPageLocked(page)) | ||
| 457 | goto keep; | ||
| 458 | if (PageDirty(page) || PageWriteback(page)) | ||
| 459 | goto keep_locked; | ||
| 460 | mapping = page_mapping(page); | ||
| 461 | case PAGE_CLEAN: | ||
| 462 | ; /* try to free the page below */ | ||
| 463 | } | ||
| 464 | } | ||
| 465 | |||
| 466 | /* | ||
| 467 | * If the page has buffers, try to free the buffer mappings | ||
| 468 | * associated with this page. If we succeed we try to free | ||
| 469 | * the page as well. | ||
| 470 | * | ||
| 471 | * We do this even if the page is PageDirty(). | ||
| 472 | * try_to_release_page() does not perform I/O, but it is | ||
| 473 | * possible for a page to have PageDirty set, but it is actually | ||
| 474 | * clean (all its buffers are clean). This happens if the | ||
| 475 | * buffers were written out directly, with submit_bh(). ext3 | ||
| 476 | * will do this, as well as the blockdev mapping. | ||
| 477 | * try_to_release_page() will discover that cleanness and will | ||
| 478 | * drop the buffers and mark the page clean - it can be freed. | ||
| 479 | * | ||
| 480 | * Rarely, pages can have buffers and no ->mapping. These are | ||
| 481 | * the pages which were not successfully invalidated in | ||
| 482 | * truncate_complete_page(). We try to drop those buffers here | ||
| 483 | * and if that worked, and the page is no longer mapped into | ||
| 484 | * process address space (page_count == 1) it can be freed. | ||
| 485 | * Otherwise, leave the page on the LRU so it is swappable. | ||
| 486 | */ | ||
| 487 | if (PagePrivate(page)) { | ||
| 488 | if (!try_to_release_page(page, sc->gfp_mask)) | ||
| 489 | goto activate_locked; | ||
| 490 | if (!mapping && page_count(page) == 1) | ||
| 491 | goto free_it; | ||
| 492 | } | ||
| 493 | |||
| 494 | if (!mapping) | ||
| 495 | goto keep_locked; /* truncate got there first */ | ||
| 496 | |||
| 497 | write_lock_irq(&mapping->tree_lock); | ||
| 498 | |||
| 499 | /* | ||
| 500 | * The non-racy check for busy page. It is critical to check | ||
| 501 | * PageDirty _after_ making sure that the page is freeable and | ||
| 502 | * not in use by anybody. (pagecache + us == 2) | ||
| 503 | */ | ||
| 504 | if (page_count(page) != 2 || PageDirty(page)) { | ||
| 505 | write_unlock_irq(&mapping->tree_lock); | ||
| 506 | goto keep_locked; | ||
| 507 | } | ||
| 508 | |||
| 509 | #ifdef CONFIG_SWAP | ||
| 510 | if (PageSwapCache(page)) { | ||
| 511 | swp_entry_t swap = { .val = page->private }; | ||
| 512 | __delete_from_swap_cache(page); | ||
| 513 | write_unlock_irq(&mapping->tree_lock); | ||
| 514 | swap_free(swap); | ||
| 515 | __put_page(page); /* The pagecache ref */ | ||
| 516 | goto free_it; | ||
| 517 | } | ||
| 518 | #endif /* CONFIG_SWAP */ | ||
| 519 | |||
| 520 | __remove_from_page_cache(page); | ||
| 521 | write_unlock_irq(&mapping->tree_lock); | ||
| 522 | __put_page(page); | ||
| 523 | |||
| 524 | free_it: | ||
| 525 | unlock_page(page); | ||
| 526 | reclaimed++; | ||
| 527 | if (!pagevec_add(&freed_pvec, page)) | ||
| 528 | __pagevec_release_nonlru(&freed_pvec); | ||
| 529 | continue; | ||
| 530 | |||
| 531 | activate_locked: | ||
| 532 | SetPageActive(page); | ||
| 533 | pgactivate++; | ||
| 534 | keep_locked: | ||
| 535 | unlock_page(page); | ||
| 536 | keep: | ||
| 537 | list_add(&page->lru, &ret_pages); | ||
| 538 | BUG_ON(PageLRU(page)); | ||
| 539 | } | ||
| 540 | list_splice(&ret_pages, page_list); | ||
| 541 | if (pagevec_count(&freed_pvec)) | ||
| 542 | __pagevec_release_nonlru(&freed_pvec); | ||
| 543 | mod_page_state(pgactivate, pgactivate); | ||
| 544 | sc->nr_reclaimed += reclaimed; | ||
| 545 | return reclaimed; | ||
| 546 | } | ||
| 547 | |||
| 548 | /* | ||
| 549 | * zone->lru_lock is heavily contended. Some of the functions that | ||
| 550 | * shrink the lists perform better by taking out a batch of pages | ||
| 551 | * and working on them outside the LRU lock. | ||
| 552 | * | ||
| 553 | * For pagecache intensive workloads, this function is the hottest | ||
| 554 | * spot in the kernel (apart from copy_*_user functions). | ||
| 555 | * | ||
| 556 | * Appropriate locks must be held before calling this function. | ||
| 557 | * | ||
| 558 | * @nr_to_scan: The number of pages to look through on the list. | ||
| 559 | * @src: The LRU list to pull pages off. | ||
| 560 | * @dst: The temp list to put pages on to. | ||
| 561 | * @scanned: The number of pages that were scanned. | ||
| 562 | * | ||
| 563 | * returns how many pages were moved onto *@dst. | ||
| 564 | */ | ||
| 565 | static int isolate_lru_pages(int nr_to_scan, struct list_head *src, | ||
| 566 | struct list_head *dst, int *scanned) | ||
| 567 | { | ||
| 568 | int nr_taken = 0; | ||
| 569 | struct page *page; | ||
| 570 | int scan = 0; | ||
| 571 | |||
| 572 | while (scan++ < nr_to_scan && !list_empty(src)) { | ||
| 573 | page = lru_to_page(src); | ||
| 574 | prefetchw_prev_lru_page(page, src, flags); | ||
| 575 | |||
| 576 | if (!TestClearPageLRU(page)) | ||
| 577 | BUG(); | ||
| 578 | list_del(&page->lru); | ||
| 579 | if (get_page_testone(page)) { | ||
| 580 | /* | ||
| 581 | * It is being freed elsewhere | ||
| 582 | */ | ||
| 583 | __put_page(page); | ||
| 584 | SetPageLRU(page); | ||
| 585 | list_add(&page->lru, src); | ||
| 586 | continue; | ||
| 587 | } else { | ||
| 588 | list_add(&page->lru, dst); | ||
| 589 | nr_taken++; | ||
| 590 | } | ||
| 591 | } | ||
| 592 | |||
| 593 | *scanned = scan; | ||
| 594 | return nr_taken; | ||
| 595 | } | ||
| 596 | |||
| 597 | /* | ||
| 598 | * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed | ||
| 599 | */ | ||
| 600 | static void shrink_cache(struct zone *zone, struct scan_control *sc) | ||
| 601 | { | ||
| 602 | LIST_HEAD(page_list); | ||
| 603 | struct pagevec pvec; | ||
| 604 | int max_scan = sc->nr_to_scan; | ||
| 605 | |||
| 606 | pagevec_init(&pvec, 1); | ||
| 607 | |||
| 608 | lru_add_drain(); | ||
| 609 | spin_lock_irq(&zone->lru_lock); | ||
| 610 | while (max_scan > 0) { | ||
| 611 | struct page *page; | ||
| 612 | int nr_taken; | ||
| 613 | int nr_scan; | ||
| 614 | int nr_freed; | ||
| 615 | |||
| 616 | nr_taken = isolate_lru_pages(sc->swap_cluster_max, | ||
| 617 | &zone->inactive_list, | ||
| 618 | &page_list, &nr_scan); | ||
| 619 | zone->nr_inactive -= nr_taken; | ||
| 620 | zone->pages_scanned += nr_scan; | ||
| 621 | spin_unlock_irq(&zone->lru_lock); | ||
| 622 | |||
| 623 | if (nr_taken == 0) | ||
| 624 | goto done; | ||
| 625 | |||
| 626 | max_scan -= nr_scan; | ||
| 627 | if (current_is_kswapd()) | ||
| 628 | mod_page_state_zone(zone, pgscan_kswapd, nr_scan); | ||
| 629 | else | ||
| 630 | mod_page_state_zone(zone, pgscan_direct, nr_scan); | ||
| 631 | nr_freed = shrink_list(&page_list, sc); | ||
| 632 | if (current_is_kswapd()) | ||
| 633 | mod_page_state(kswapd_steal, nr_freed); | ||
| 634 | mod_page_state_zone(zone, pgsteal, nr_freed); | ||
| 635 | sc->nr_to_reclaim -= nr_freed; | ||
| 636 | |||
| 637 | spin_lock_irq(&zone->lru_lock); | ||
| 638 | /* | ||
| 639 | * Put back any unfreeable pages. | ||
| 640 | */ | ||
| 641 | while (!list_empty(&page_list)) { | ||
| 642 | page = lru_to_page(&page_list); | ||
| 643 | if (TestSetPageLRU(page)) | ||
| 644 | BUG(); | ||
| 645 | list_del(&page->lru); | ||
| 646 | if (PageActive(page)) | ||
| 647 | add_page_to_active_list(zone, page); | ||
| 648 | else | ||
| 649 | add_page_to_inactive_list(zone, page); | ||
| 650 | if (!pagevec_add(&pvec, page)) { | ||
| 651 | spin_unlock_irq(&zone->lru_lock); | ||
| 652 | __pagevec_release(&pvec); | ||
| 653 | spin_lock_irq(&zone->lru_lock); | ||
| 654 | } | ||
| 655 | } | ||
| 656 | } | ||
| 657 | spin_unlock_irq(&zone->lru_lock); | ||
| 658 | done: | ||
| 659 | pagevec_release(&pvec); | ||
| 660 | } | ||
| 661 | |||
| 662 | /* | ||
| 663 | * This moves pages from the active list to the inactive list. | ||
| 664 | * | ||
| 665 | * We move them the other way if the page is referenced by one or more | ||
| 666 | * processes, from rmap. | ||
| 667 | * | ||
| 668 | * If the pages are mostly unmapped, the processing is fast and it is | ||
| 669 | * appropriate to hold zone->lru_lock across the whole operation. But if | ||
| 670 | * the pages are mapped, the processing is slow (page_referenced()) so we | ||
| 671 | * should drop zone->lru_lock around each page. It's impossible to balance | ||
| 672 | * this, so instead we remove the pages from the LRU while processing them. | ||
| 673 | * It is safe to rely on PG_active against the non-LRU pages in here because | ||
| 674 | * nobody will play with that bit on a non-LRU page. | ||
| 675 | * | ||
| 676 | * The downside is that we have to touch page->_count against each page. | ||
| 677 | * But we had to alter page->flags anyway. | ||
| 678 | */ | ||
| 679 | static void | ||
| 680 | refill_inactive_zone(struct zone *zone, struct scan_control *sc) | ||
| 681 | { | ||
| 682 | int pgmoved; | ||
| 683 | int pgdeactivate = 0; | ||
| 684 | int pgscanned; | ||
| 685 | int nr_pages = sc->nr_to_scan; | ||
| 686 | LIST_HEAD(l_hold); /* The pages which were snipped off */ | ||
| 687 | LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ | ||
| 688 | LIST_HEAD(l_active); /* Pages to go onto the active_list */ | ||
| 689 | struct page *page; | ||
| 690 | struct pagevec pvec; | ||
| 691 | int reclaim_mapped = 0; | ||
| 692 | long mapped_ratio; | ||
| 693 | long distress; | ||
| 694 | long swap_tendency; | ||
| 695 | |||
| 696 | lru_add_drain(); | ||
| 697 | spin_lock_irq(&zone->lru_lock); | ||
| 698 | pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, | ||
| 699 | &l_hold, &pgscanned); | ||
| 700 | zone->pages_scanned += pgscanned; | ||
| 701 | zone->nr_active -= pgmoved; | ||
| 702 | spin_unlock_irq(&zone->lru_lock); | ||
| 703 | |||
| 704 | /* | ||
| 705 | * `distress' is a measure of how much trouble we're having reclaiming | ||
| 706 | * pages. 0 -> no problems. 100 -> great trouble. | ||
| 707 | */ | ||
| 708 | distress = 100 >> zone->prev_priority; | ||
| 709 | |||
| 710 | /* | ||
| 711 | * The point of this algorithm is to decide when to start reclaiming | ||
| 712 | * mapped memory instead of just pagecache. Work out how much memory | ||
| 713 | * is mapped. | ||
| 714 | */ | ||
| 715 | mapped_ratio = (sc->nr_mapped * 100) / total_memory; | ||
| 716 | |||
| 717 | /* | ||
| 718 | * Now decide how much we really want to unmap some pages. The mapped | ||
| 719 | * ratio is downgraded - just because there's a lot of mapped memory | ||
| 720 | * doesn't necessarily mean that page reclaim isn't succeeding. | ||
| 721 | * | ||
| 722 | * The distress ratio is important - we don't want to start going oom. | ||
| 723 | * | ||
| 724 | * A 100% value of vm_swappiness overrides this algorithm altogether. | ||
| 725 | */ | ||
| 726 | swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; | ||
| 727 | |||
| 728 | /* | ||
| 729 | * Now use this metric to decide whether to start moving mapped memory | ||
| 730 | * onto the inactive list. | ||
| 731 | */ | ||
| 732 | if (swap_tendency >= 100) | ||
| 733 | reclaim_mapped = 1; | ||
| 734 | |||
| 735 | while (!list_empty(&l_hold)) { | ||
| 736 | cond_resched(); | ||
| 737 | page = lru_to_page(&l_hold); | ||
| 738 | list_del(&page->lru); | ||
| 739 | if (page_mapped(page)) { | ||
| 740 | if (!reclaim_mapped || | ||
| 741 | (total_swap_pages == 0 && PageAnon(page)) || | ||
| 742 | page_referenced(page, 0, sc->priority <= 0)) { | ||
| 743 | list_add(&page->lru, &l_active); | ||
| 744 | continue; | ||
| 745 | } | ||
| 746 | } | ||
| 747 | list_add(&page->lru, &l_inactive); | ||
| 748 | } | ||
| 749 | |||
| 750 | pagevec_init(&pvec, 1); | ||
| 751 | pgmoved = 0; | ||
| 752 | spin_lock_irq(&zone->lru_lock); | ||
| 753 | while (!list_empty(&l_inactive)) { | ||
| 754 | page = lru_to_page(&l_inactive); | ||
| 755 | prefetchw_prev_lru_page(page, &l_inactive, flags); | ||
| 756 | if (TestSetPageLRU(page)) | ||
| 757 | BUG(); | ||
| 758 | if (!TestClearPageActive(page)) | ||
| 759 | BUG(); | ||
| 760 | list_move(&page->lru, &zone->inactive_list); | ||
| 761 | pgmoved++; | ||
| 762 | if (!pagevec_add(&pvec, page)) { | ||
| 763 | zone->nr_inactive += pgmoved; | ||
| 764 | spin_unlock_irq(&zone->lru_lock); | ||
| 765 | pgdeactivate += pgmoved; | ||
| 766 | pgmoved = 0; | ||
| 767 | if (buffer_heads_over_limit) | ||
| 768 | pagevec_strip(&pvec); | ||
| 769 | __pagevec_release(&pvec); | ||
| 770 | spin_lock_irq(&zone->lru_lock); | ||
| 771 | } | ||
| 772 | } | ||
| 773 | zone->nr_inactive += pgmoved; | ||
| 774 | pgdeactivate += pgmoved; | ||
| 775 | if (buffer_heads_over_limit) { | ||
| 776 | spin_unlock_irq(&zone->lru_lock); | ||
| 777 | pagevec_strip(&pvec); | ||
| 778 | spin_lock_irq(&zone->lru_lock); | ||
| 779 | } | ||
| 780 | |||
| 781 | pgmoved = 0; | ||
| 782 | while (!list_empty(&l_active)) { | ||
| 783 | page = lru_to_page(&l_active); | ||
| 784 | prefetchw_prev_lru_page(page, &l_active, flags); | ||
| 785 | if (TestSetPageLRU(page)) | ||
| 786 | BUG(); | ||
| 787 | BUG_ON(!PageActive(page)); | ||
| 788 | list_move(&page->lru, &zone->active_list); | ||
| 789 | pgmoved++; | ||
| 790 | if (!pagevec_add(&pvec, page)) { | ||
| 791 | zone->nr_active += pgmoved; | ||
| 792 | pgmoved = 0; | ||
| 793 | spin_unlock_irq(&zone->lru_lock); | ||
| 794 | __pagevec_release(&pvec); | ||
| 795 | spin_lock_irq(&zone->lru_lock); | ||
| 796 | } | ||
| 797 | } | ||
| 798 | zone->nr_active += pgmoved; | ||
| 799 | spin_unlock_irq(&zone->lru_lock); | ||
| 800 | pagevec_release(&pvec); | ||
| 801 | |||
| 802 | mod_page_state_zone(zone, pgrefill, pgscanned); | ||
| 803 | mod_page_state(pgdeactivate, pgdeactivate); | ||
| 804 | } | ||
| 805 | |||
| 806 | /* | ||
| 807 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | ||
| 808 | */ | ||
| 809 | static void | ||
| 810 | shrink_zone(struct zone *zone, struct scan_control *sc) | ||
| 811 | { | ||
| 812 | unsigned long nr_active; | ||
| 813 | unsigned long nr_inactive; | ||
| 814 | |||
| 815 | /* | ||
| 816 | * Add one to `nr_to_scan' just to make sure that the kernel will | ||
| 817 | * slowly sift through the active list. | ||
| 818 | */ | ||
| 819 | zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; | ||
| 820 | nr_active = zone->nr_scan_active; | ||
| 821 | if (nr_active >= sc->swap_cluster_max) | ||
| 822 | zone->nr_scan_active = 0; | ||
| 823 | else | ||
| 824 | nr_active = 0; | ||
| 825 | |||
| 826 | zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; | ||
| 827 | nr_inactive = zone->nr_scan_inactive; | ||
| 828 | if (nr_inactive >= sc->swap_cluster_max) | ||
| 829 | zone->nr_scan_inactive = 0; | ||
| 830 | else | ||
| 831 | nr_inactive = 0; | ||
| 832 | |||
| 833 | sc->nr_to_reclaim = sc->swap_cluster_max; | ||
| 834 | |||
| 835 | while (nr_active || nr_inactive) { | ||
| 836 | if (nr_active) { | ||
| 837 | sc->nr_to_scan = min(nr_active, | ||
| 838 | (unsigned long)sc->swap_cluster_max); | ||
| 839 | nr_active -= sc->nr_to_scan; | ||
| 840 | refill_inactive_zone(zone, sc); | ||
| 841 | } | ||
| 842 | |||
| 843 | if (nr_inactive) { | ||
| 844 | sc->nr_to_scan = min(nr_inactive, | ||
| 845 | (unsigned long)sc->swap_cluster_max); | ||
| 846 | nr_inactive -= sc->nr_to_scan; | ||
| 847 | shrink_cache(zone, sc); | ||
| 848 | if (sc->nr_to_reclaim <= 0) | ||
| 849 | break; | ||
| 850 | } | ||
| 851 | } | ||
| 852 | |||
| 853 | throttle_vm_writeout(); | ||
| 854 | } | ||
| 855 | |||
| 856 | /* | ||
| 857 | * This is the direct reclaim path, for page-allocating processes. We only | ||
| 858 | * try to reclaim pages from zones which will satisfy the caller's allocation | ||
| 859 | * request. | ||
| 860 | * | ||
| 861 | * We reclaim from a zone even if that zone is over pages_high. Because: | ||
| 862 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order | ||
| 863 | * allocation or | ||
| 864 | * b) The zones may be over pages_high but they must go *over* pages_high to | ||
| 865 | * satisfy the `incremental min' zone defense algorithm. | ||
| 866 | * | ||
| 867 | * Returns the number of reclaimed pages. | ||
| 868 | * | ||
| 869 | * If a zone is deemed to be full of pinned pages then just give it a light | ||
| 870 | * scan then give up on it. | ||
| 871 | */ | ||
| 872 | static void | ||
| 873 | shrink_caches(struct zone **zones, struct scan_control *sc) | ||
| 874 | { | ||
| 875 | int i; | ||
| 876 | |||
| 877 | for (i = 0; zones[i] != NULL; i++) { | ||
| 878 | struct zone *zone = zones[i]; | ||
| 879 | |||
| 880 | if (zone->present_pages == 0) | ||
| 881 | continue; | ||
| 882 | |||
| 883 | if (!cpuset_zone_allowed(zone)) | ||
| 884 | continue; | ||
| 885 | |||
| 886 | zone->temp_priority = sc->priority; | ||
| 887 | if (zone->prev_priority > sc->priority) | ||
| 888 | zone->prev_priority = sc->priority; | ||
| 889 | |||
| 890 | if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) | ||
| 891 | continue; /* Let kswapd poll it */ | ||
| 892 | |||
| 893 | shrink_zone(zone, sc); | ||
| 894 | } | ||
| 895 | } | ||
| 896 | |||
| 897 | /* | ||
| 898 | * This is the main entry point to direct page reclaim. | ||
| 899 | * | ||
| 900 | * If a full scan of the inactive list fails to free enough memory then we | ||
| 901 | * are "out of memory" and something needs to be killed. | ||
| 902 | * | ||
| 903 | * If the caller is !__GFP_FS then the probability of a failure is reasonably | ||
| 904 | * high - the zone may be full of dirty or under-writeback pages, which this | ||
| 905 | * caller can't do much about. We kick pdflush and take explicit naps in the | ||
| 906 | * hope that some of these pages can be written. But if the allocating task | ||
| 907 | * holds filesystem locks which prevent writeout this might not work, and the | ||
| 908 | * allocation attempt will fail. | ||
| 909 | */ | ||
| 910 | int try_to_free_pages(struct zone **zones, | ||
| 911 | unsigned int gfp_mask, unsigned int order) | ||
| 912 | { | ||
| 913 | int priority; | ||
| 914 | int ret = 0; | ||
| 915 | int total_scanned = 0, total_reclaimed = 0; | ||
| 916 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
| 917 | struct scan_control sc; | ||
| 918 | unsigned long lru_pages = 0; | ||
| 919 | int i; | ||
| 920 | |||
| 921 | sc.gfp_mask = gfp_mask; | ||
| 922 | sc.may_writepage = 0; | ||
| 923 | |||
| 924 | inc_page_state(allocstall); | ||
| 925 | |||
| 926 | for (i = 0; zones[i] != NULL; i++) { | ||
| 927 | struct zone *zone = zones[i]; | ||
| 928 | |||
| 929 | if (!cpuset_zone_allowed(zone)) | ||
| 930 | continue; | ||
| 931 | |||
| 932 | zone->temp_priority = DEF_PRIORITY; | ||
| 933 | lru_pages += zone->nr_active + zone->nr_inactive; | ||
| 934 | } | ||
| 935 | |||
| 936 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | ||
| 937 | sc.nr_mapped = read_page_state(nr_mapped); | ||
| 938 | sc.nr_scanned = 0; | ||
| 939 | sc.nr_reclaimed = 0; | ||
| 940 | sc.priority = priority; | ||
| 941 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; | ||
| 942 | shrink_caches(zones, &sc); | ||
| 943 | shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); | ||
| 944 | if (reclaim_state) { | ||
| 945 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | ||
| 946 | reclaim_state->reclaimed_slab = 0; | ||
| 947 | } | ||
| 948 | total_scanned += sc.nr_scanned; | ||
| 949 | total_reclaimed += sc.nr_reclaimed; | ||
| 950 | if (total_reclaimed >= sc.swap_cluster_max) { | ||
| 951 | ret = 1; | ||
| 952 | goto out; | ||
| 953 | } | ||
| 954 | |||
| 955 | /* | ||
| 956 | * Try to write back as many pages as we just scanned. This | ||
| 957 | * tends to cause slow streaming writers to write data to the | ||
| 958 | * disk smoothly, at the dirtying rate, which is nice. But | ||
| 959 | * that's undesirable in laptop mode, where we *want* lumpy | ||
| 960 | * writeout. So in laptop mode, write out the whole world. | ||
| 961 | */ | ||
| 962 | if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { | ||
| 963 | wakeup_bdflush(laptop_mode ? 0 : total_scanned); | ||
| 964 | sc.may_writepage = 1; | ||
| 965 | } | ||
| 966 | |||
| 967 | /* Take a nap, wait for some writeback to complete */ | ||
| 968 | if (sc.nr_scanned && priority < DEF_PRIORITY - 2) | ||
| 969 | blk_congestion_wait(WRITE, HZ/10); | ||
| 970 | } | ||
| 971 | out: | ||
| 972 | for (i = 0; zones[i] != 0; i++) { | ||
| 973 | struct zone *zone = zones[i]; | ||
| 974 | |||
| 975 | if (!cpuset_zone_allowed(zone)) | ||
| 976 | continue; | ||
| 977 | |||
| 978 | zone->prev_priority = zone->temp_priority; | ||
| 979 | } | ||
| 980 | return ret; | ||
| 981 | } | ||
| 982 | |||
| 983 | /* | ||
| 984 | * For kswapd, balance_pgdat() will work across all this node's zones until | ||
| 985 | * they are all at pages_high. | ||
| 986 | * | ||
| 987 | * If `nr_pages' is non-zero then it is the number of pages which are to be | ||
| 988 | * reclaimed, regardless of the zone occupancies. This is a software suspend | ||
| 989 | * special. | ||
| 990 | * | ||
| 991 | * Returns the number of pages which were actually freed. | ||
| 992 | * | ||
| 993 | * There is special handling here for zones which are full of pinned pages. | ||
| 994 | * This can happen if the pages are all mlocked, or if they are all used by | ||
| 995 | * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. | ||
| 996 | * What we do is to detect the case where all pages in the zone have been | ||
| 997 | * scanned twice and there has been zero successful reclaim. Mark the zone as | ||
| 998 | * dead and from now on, only perform a short scan. Basically we're polling | ||
| 999 | * the zone for when the problem goes away. | ||
| 1000 | * | ||
| 1001 | * kswapd scans the zones in the highmem->normal->dma direction. It skips | ||
| 1002 | * zones which have free_pages > pages_high, but once a zone is found to have | ||
| 1003 | * free_pages <= pages_high, we scan that zone and the lower zones regardless | ||
| 1004 | * of the number of free pages in the lower zones. This interoperates with | ||
| 1005 | * the page allocator fallback scheme to ensure that aging of pages is balanced | ||
| 1006 | * across the zones. | ||
| 1007 | */ | ||
| 1008 | static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) | ||
| 1009 | { | ||
| 1010 | int to_free = nr_pages; | ||
| 1011 | int all_zones_ok; | ||
| 1012 | int priority; | ||
| 1013 | int i; | ||
| 1014 | int total_scanned, total_reclaimed; | ||
| 1015 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
| 1016 | struct scan_control sc; | ||
| 1017 | |||
| 1018 | loop_again: | ||
| 1019 | total_scanned = 0; | ||
| 1020 | total_reclaimed = 0; | ||
| 1021 | sc.gfp_mask = GFP_KERNEL; | ||
| 1022 | sc.may_writepage = 0; | ||
| 1023 | sc.nr_mapped = read_page_state(nr_mapped); | ||
| 1024 | |||
| 1025 | inc_page_state(pageoutrun); | ||
| 1026 | |||
| 1027 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
| 1028 | struct zone *zone = pgdat->node_zones + i; | ||
| 1029 | |||
| 1030 | zone->temp_priority = DEF_PRIORITY; | ||
| 1031 | } | ||
| 1032 | |||
| 1033 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | ||
| 1034 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
| 1035 | unsigned long lru_pages = 0; | ||
| 1036 | |||
| 1037 | all_zones_ok = 1; | ||
| 1038 | |||
| 1039 | if (nr_pages == 0) { | ||
| 1040 | /* | ||
| 1041 | * Scan in the highmem->dma direction for the highest | ||
| 1042 | * zone which needs scanning | ||
| 1043 | */ | ||
| 1044 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { | ||
| 1045 | struct zone *zone = pgdat->node_zones + i; | ||
| 1046 | |||
| 1047 | if (zone->present_pages == 0) | ||
| 1048 | continue; | ||
| 1049 | |||
| 1050 | if (zone->all_unreclaimable && | ||
| 1051 | priority != DEF_PRIORITY) | ||
| 1052 | continue; | ||
| 1053 | |||
| 1054 | if (!zone_watermark_ok(zone, order, | ||
| 1055 | zone->pages_high, 0, 0, 0)) { | ||
| 1056 | end_zone = i; | ||
| 1057 | goto scan; | ||
| 1058 | } | ||
| 1059 | } | ||
| 1060 | goto out; | ||
| 1061 | } else { | ||
| 1062 | end_zone = pgdat->nr_zones - 1; | ||
| 1063 | } | ||
| 1064 | scan: | ||
| 1065 | for (i = 0; i <= end_zone; i++) { | ||
| 1066 | struct zone *zone = pgdat->node_zones + i; | ||
| 1067 | |||
| 1068 | lru_pages += zone->nr_active + zone->nr_inactive; | ||
| 1069 | } | ||
| 1070 | |||
| 1071 | /* | ||
| 1072 | * Now scan the zone in the dma->highmem direction, stopping | ||
| 1073 | * at the last zone which needs scanning. | ||
| 1074 | * | ||
| 1075 | * We do this because the page allocator works in the opposite | ||
| 1076 | * direction. This prevents the page allocator from allocating | ||
| 1077 | * pages behind kswapd's direction of progress, which would | ||
| 1078 | * cause too much scanning of the lower zones. | ||
| 1079 | */ | ||
| 1080 | for (i = 0; i <= end_zone; i++) { | ||
| 1081 | struct zone *zone = pgdat->node_zones + i; | ||
| 1082 | |||
| 1083 | if (zone->present_pages == 0) | ||
| 1084 | continue; | ||
| 1085 | |||
| 1086 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | ||
| 1087 | continue; | ||
| 1088 | |||
| 1089 | if (nr_pages == 0) { /* Not software suspend */ | ||
| 1090 | if (!zone_watermark_ok(zone, order, | ||
| 1091 | zone->pages_high, end_zone, 0, 0)) | ||
| 1092 | all_zones_ok = 0; | ||
| 1093 | } | ||
| 1094 | zone->temp_priority = priority; | ||
| 1095 | if (zone->prev_priority > priority) | ||
| 1096 | zone->prev_priority = priority; | ||
| 1097 | sc.nr_scanned = 0; | ||
| 1098 | sc.nr_reclaimed = 0; | ||
| 1099 | sc.priority = priority; | ||
| 1100 | sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; | ||
| 1101 | shrink_zone(zone, &sc); | ||
| 1102 | reclaim_state->reclaimed_slab = 0; | ||
| 1103 | shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); | ||
| 1104 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | ||
| 1105 | total_reclaimed += sc.nr_reclaimed; | ||
| 1106 | total_scanned += sc.nr_scanned; | ||
| 1107 | if (zone->all_unreclaimable) | ||
| 1108 | continue; | ||
| 1109 | if (zone->pages_scanned >= (zone->nr_active + | ||
| 1110 | zone->nr_inactive) * 4) | ||
| 1111 | zone->all_unreclaimable = 1; | ||
| 1112 | /* | ||
| 1113 | * If we've done a decent amount of scanning and | ||
| 1114 | * the reclaim ratio is low, start doing writepage | ||
| 1115 | * even in laptop mode | ||
| 1116 | */ | ||
| 1117 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | ||
| 1118 | total_scanned > total_reclaimed+total_reclaimed/2) | ||
| 1119 | sc.may_writepage = 1; | ||
| 1120 | } | ||
| 1121 | if (nr_pages && to_free > total_reclaimed) | ||
| 1122 | continue; /* swsusp: need to do more work */ | ||
| 1123 | if (all_zones_ok) | ||
| 1124 | break; /* kswapd: all done */ | ||
| 1125 | /* | ||
| 1126 | * OK, kswapd is getting into trouble. Take a nap, then take | ||
| 1127 | * another pass across the zones. | ||
| 1128 | */ | ||
| 1129 | if (total_scanned && priority < DEF_PRIORITY - 2) | ||
| 1130 | blk_congestion_wait(WRITE, HZ/10); | ||
| 1131 | |||
| 1132 | /* | ||
| 1133 | * We do this so kswapd doesn't build up large priorities for | ||
| 1134 | * example when it is freeing in parallel with allocators. It | ||
| 1135 | * matches the direct reclaim path behaviour in terms of impact | ||
| 1136 | * on zone->*_priority. | ||
| 1137 | */ | ||
| 1138 | if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) | ||
| 1139 | break; | ||
| 1140 | } | ||
| 1141 | out: | ||
| 1142 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
| 1143 | struct zone *zone = pgdat->node_zones + i; | ||
| 1144 | |||
| 1145 | zone->prev_priority = zone->temp_priority; | ||
| 1146 | } | ||
| 1147 | if (!all_zones_ok) { | ||
| 1148 | cond_resched(); | ||
| 1149 | goto loop_again; | ||
| 1150 | } | ||
| 1151 | |||
| 1152 | return total_reclaimed; | ||
| 1153 | } | ||
| 1154 | |||
| 1155 | /* | ||
| 1156 | * The background pageout daemon, started as a kernel thread | ||
| 1157 | * from the init process. | ||
| 1158 | * | ||
| 1159 | * This basically trickles out pages so that we have _some_ | ||
| 1160 | * free memory available even if there is no other activity | ||
| 1161 | * that frees anything up. This is needed for things like routing | ||
| 1162 | * etc, where we otherwise might have all activity going on in | ||
| 1163 | * asynchronous contexts that cannot page things out. | ||
| 1164 | * | ||
| 1165 | * If there are applications that are active memory-allocators | ||
| 1166 | * (most normal use), this basically shouldn't matter. | ||
| 1167 | */ | ||
| 1168 | static int kswapd(void *p) | ||
| 1169 | { | ||
| 1170 | unsigned long order; | ||
| 1171 | pg_data_t *pgdat = (pg_data_t*)p; | ||
| 1172 | struct task_struct *tsk = current; | ||
| 1173 | DEFINE_WAIT(wait); | ||
| 1174 | struct reclaim_state reclaim_state = { | ||
| 1175 | .reclaimed_slab = 0, | ||
| 1176 | }; | ||
| 1177 | cpumask_t cpumask; | ||
| 1178 | |||
| 1179 | daemonize("kswapd%d", pgdat->node_id); | ||
| 1180 | cpumask = node_to_cpumask(pgdat->node_id); | ||
| 1181 | if (!cpus_empty(cpumask)) | ||
| 1182 | set_cpus_allowed(tsk, cpumask); | ||
| 1183 | current->reclaim_state = &reclaim_state; | ||
| 1184 | |||
| 1185 | /* | ||
| 1186 | * Tell the memory management that we're a "memory allocator", | ||
| 1187 | * and that if we need more memory we should get access to it | ||
| 1188 | * regardless (see "__alloc_pages()"). "kswapd" should | ||
| 1189 | * never get caught in the normal page freeing logic. | ||
| 1190 | * | ||
| 1191 | * (Kswapd normally doesn't need memory anyway, but sometimes | ||
| 1192 | * you need a small amount of memory in order to be able to | ||
| 1193 | * page out something else, and this flag essentially protects | ||
| 1194 | * us from recursively trying to free more memory as we're | ||
| 1195 | * trying to free the first piece of memory in the first place). | ||
| 1196 | */ | ||
| 1197 | tsk->flags |= PF_MEMALLOC|PF_KSWAPD; | ||
| 1198 | |||
| 1199 | order = 0; | ||
| 1200 | for ( ; ; ) { | ||
| 1201 | unsigned long new_order; | ||
| 1202 | if (current->flags & PF_FREEZE) | ||
| 1203 | refrigerator(PF_FREEZE); | ||
| 1204 | |||
| 1205 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
| 1206 | new_order = pgdat->kswapd_max_order; | ||
| 1207 | pgdat->kswapd_max_order = 0; | ||
| 1208 | if (order < new_order) { | ||
| 1209 | /* | ||
| 1210 | * Don't sleep if someone wants a larger 'order' | ||
| 1211 | * allocation | ||
| 1212 | */ | ||
| 1213 | order = new_order; | ||
| 1214 | } else { | ||
| 1215 | schedule(); | ||
| 1216 | order = pgdat->kswapd_max_order; | ||
| 1217 | } | ||
| 1218 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
| 1219 | |||
| 1220 | balance_pgdat(pgdat, 0, order); | ||
| 1221 | } | ||
| 1222 | return 0; | ||
| 1223 | } | ||
| 1224 | |||
| 1225 | /* | ||
| 1226 | * A zone is low on free memory, so wake its kswapd task to service it. | ||
| 1227 | */ | ||
| 1228 | void wakeup_kswapd(struct zone *zone, int order) | ||
| 1229 | { | ||
| 1230 | pg_data_t *pgdat; | ||
| 1231 | |||
| 1232 | if (zone->present_pages == 0) | ||
| 1233 | return; | ||
| 1234 | |||
| 1235 | pgdat = zone->zone_pgdat; | ||
| 1236 | if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0)) | ||
| 1237 | return; | ||
| 1238 | if (pgdat->kswapd_max_order < order) | ||
| 1239 | pgdat->kswapd_max_order = order; | ||
| 1240 | if (!cpuset_zone_allowed(zone)) | ||
| 1241 | return; | ||
| 1242 | if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) | ||
| 1243 | return; | ||
| 1244 | wake_up_interruptible(&zone->zone_pgdat->kswapd_wait); | ||
| 1245 | } | ||
| 1246 | |||
| 1247 | #ifdef CONFIG_PM | ||
| 1248 | /* | ||
| 1249 | * Try to free `nr_pages' of memory, system-wide. Returns the number of freed | ||
| 1250 | * pages. | ||
| 1251 | */ | ||
| 1252 | int shrink_all_memory(int nr_pages) | ||
| 1253 | { | ||
| 1254 | pg_data_t *pgdat; | ||
| 1255 | int nr_to_free = nr_pages; | ||
| 1256 | int ret = 0; | ||
| 1257 | struct reclaim_state reclaim_state = { | ||
| 1258 | .reclaimed_slab = 0, | ||
| 1259 | }; | ||
| 1260 | |||
| 1261 | current->reclaim_state = &reclaim_state; | ||
| 1262 | for_each_pgdat(pgdat) { | ||
| 1263 | int freed; | ||
| 1264 | freed = balance_pgdat(pgdat, nr_to_free, 0); | ||
| 1265 | ret += freed; | ||
| 1266 | nr_to_free -= freed; | ||
| 1267 | if (nr_to_free <= 0) | ||
| 1268 | break; | ||
| 1269 | } | ||
| 1270 | current->reclaim_state = NULL; | ||
| 1271 | return ret; | ||
| 1272 | } | ||
| 1273 | #endif | ||
| 1274 | |||
| 1275 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1276 | /* It's optimal to keep kswapds on the same CPUs as their memory, but | ||
| 1277 | not required for correctness. So if the last cpu in a node goes | ||
| 1278 | away, we get changed to run anywhere: as the first one comes back, | ||
| 1279 | restore their cpu bindings. */ | ||
| 1280 | static int __devinit cpu_callback(struct notifier_block *nfb, | ||
| 1281 | unsigned long action, | ||
| 1282 | void *hcpu) | ||
| 1283 | { | ||
| 1284 | pg_data_t *pgdat; | ||
| 1285 | cpumask_t mask; | ||
| 1286 | |||
| 1287 | if (action == CPU_ONLINE) { | ||
| 1288 | for_each_pgdat(pgdat) { | ||
| 1289 | mask = node_to_cpumask(pgdat->node_id); | ||
| 1290 | if (any_online_cpu(mask) != NR_CPUS) | ||
| 1291 | /* One of our CPUs online: restore mask */ | ||
| 1292 | set_cpus_allowed(pgdat->kswapd, mask); | ||
| 1293 | } | ||
| 1294 | } | ||
| 1295 | return NOTIFY_OK; | ||
| 1296 | } | ||
| 1297 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 1298 | |||
| 1299 | static int __init kswapd_init(void) | ||
| 1300 | { | ||
| 1301 | pg_data_t *pgdat; | ||
| 1302 | swap_setup(); | ||
| 1303 | for_each_pgdat(pgdat) | ||
| 1304 | pgdat->kswapd | ||
| 1305 | = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); | ||
| 1306 | total_memory = nr_free_pagecache_pages(); | ||
| 1307 | hotcpu_notifier(cpu_callback, 0); | ||
| 1308 | return 0; | ||
| 1309 | } | ||
| 1310 | |||
| 1311 | module_init(kswapd_init) | ||
