Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |   2
-rw-r--r--  mm/bootmem.c        |  58
-rw-r--r--  mm/filemap.c        |  78
-rw-r--r--  mm/hugetlb.c        | 192
-rw-r--r--  mm/internal.h       |  21
-rw-r--r--  mm/madvise.c        |  35
-rw-r--r--  mm/memory.c         |  32
-rw-r--r--  mm/memory_hotplug.c |   1
-rw-r--r--  mm/mempolicy.c      | 102
-rw-r--r--  mm/nommu.c          |   7
-rw-r--r--  mm/page_alloc.c     | 343
-rw-r--r--  mm/readahead.c      |  15
-rw-r--r--  mm/rmap.c           |  57
-rw-r--r--  mm/shmem.c          |  36
-rw-r--r--  mm/swap.c           |  27
-rw-r--r--  mm/swap_state.c     |   4
-rw-r--r--  mm/swapfile.c       |  20
-rw-r--r--  mm/tiny-shmem.c     |  29
-rw-r--r--  mm/truncate.c       |  44
-rw-r--r--  mm/vmscan.c         | 125
20 files changed, 732 insertions, 496 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 21eb51d4da8f..b3db11f137e0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,7 +11,7 @@ choice
11 | 11 | ||
12 | config FLATMEM_MANUAL | 12 | config FLATMEM_MANUAL |
13 | bool "Flat Memory" | 13 | bool "Flat Memory" |
14 | depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE | 14 | depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE |
15 | help | 15 | help |
16 | This option allows you to change some of the ways that | 16 | This option allows you to change some of the ways that |
17 | Linux manages its memory internally. Most users will | 17 | Linux manages its memory internally. Most users will |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 16b9465eb4eb..35c32290f717 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -296,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
296 | unsigned long v = ~map[i / BITS_PER_LONG]; | 296 | unsigned long v = ~map[i / BITS_PER_LONG]; |
297 | 297 | ||
298 | if (gofast && v == ~0UL) { | 298 | if (gofast && v == ~0UL) { |
299 | int j, order; | 299 | int order; |
300 | 300 | ||
301 | page = pfn_to_page(pfn); | 301 | page = pfn_to_page(pfn); |
302 | count += BITS_PER_LONG; | 302 | count += BITS_PER_LONG; |
303 | __ClearPageReserved(page); | ||
304 | order = ffs(BITS_PER_LONG) - 1; | 303 | order = ffs(BITS_PER_LONG) - 1; |
305 | set_page_refs(page, order); | 304 | __free_pages_bootmem(page, order); |
306 | for (j = 1; j < BITS_PER_LONG; j++) { | ||
307 | if (j + 16 < BITS_PER_LONG) | ||
308 | prefetchw(page + j + 16); | ||
309 | __ClearPageReserved(page + j); | ||
310 | set_page_count(page + j, 0); | ||
311 | } | ||
312 | __free_pages(page, order); | ||
313 | i += BITS_PER_LONG; | 305 | i += BITS_PER_LONG; |
314 | page += BITS_PER_LONG; | 306 | page += BITS_PER_LONG; |
315 | } else if (v) { | 307 | } else if (v) { |
@@ -319,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
319 | for (m = 1; m && i < idx; m<<=1, page++, i++) { | 311 | for (m = 1; m && i < idx; m<<=1, page++, i++) { |
320 | if (v & m) { | 312 | if (v & m) { |
321 | count++; | 313 | count++; |
322 | __ClearPageReserved(page); | 314 | __free_pages_bootmem(page, 0); |
323 | set_page_refs(page, 0); | ||
324 | __free_page(page); | ||
325 | } | 315 | } |
326 | } | 316 | } |
327 | } else { | 317 | } else { |
@@ -339,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
339 | count = 0; | 329 | count = 0; |
340 | for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { | 330 | for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { |
341 | count++; | 331 | count++; |
342 | __ClearPageReserved(page); | 332 | __free_pages_bootmem(page, 0); |
343 | set_page_count(page, 1); | ||
344 | __free_page(page); | ||
345 | } | 333 | } |
346 | total += count; | 334 | total += count; |
347 | bdata->node_bootmem_map = NULL; | 335 | bdata->node_bootmem_map = NULL; |
@@ -393,15 +381,14 @@ unsigned long __init free_all_bootmem (void)
393 | return(free_all_bootmem_core(NODE_DATA(0))); | 381 | return(free_all_bootmem_core(NODE_DATA(0))); |
394 | } | 382 | } |
395 | 383 | ||
396 | void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, | 384 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) |
397 | unsigned long limit) | ||
398 | { | 385 | { |
399 | pg_data_t *pgdat = pgdat_list; | 386 | pg_data_t *pgdat = pgdat_list; |
400 | void *ptr; | 387 | void *ptr; |
401 | 388 | ||
402 | for_each_pgdat(pgdat) | 389 | for_each_pgdat(pgdat) |
403 | if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, | 390 | if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, |
404 | align, goal, limit))) | 391 | align, goal, 0))) |
405 | return(ptr); | 392 | return(ptr); |
406 | 393 | ||
407 | /* | 394 | /* |
@@ -413,15 +400,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un
413 | } | 400 | } |
414 | 401 | ||
415 | 402 | ||
416 | void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, | 403 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, |
417 | unsigned long goal, unsigned long limit) | 404 | unsigned long goal) |
418 | { | 405 | { |
419 | void *ptr; | 406 | void *ptr; |
420 | 407 | ||
421 | ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit); | 408 | ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); |
422 | if (ptr) | 409 | if (ptr) |
423 | return (ptr); | 410 | return (ptr); |
424 | 411 | ||
425 | return __alloc_bootmem_limit(size, align, goal, limit); | 412 | return __alloc_bootmem(size, align, goal); |
413 | } | ||
414 | |||
415 | #define LOW32LIMIT 0xffffffff | ||
416 | |||
417 | void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) | ||
418 | { | ||
419 | pg_data_t *pgdat = pgdat_list; | ||
420 | void *ptr; | ||
421 | |||
422 | for_each_pgdat(pgdat) | ||
423 | if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, | ||
424 | align, goal, LOW32LIMIT))) | ||
425 | return(ptr); | ||
426 | |||
427 | /* | ||
428 | * Whoops, we cannot satisfy the allocation request. | ||
429 | */ | ||
430 | printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size); | ||
431 | panic("Out of low memory"); | ||
432 | return NULL; | ||
426 | } | 433 | } |
427 | 434 | ||
435 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | ||
436 | unsigned long align, unsigned long goal) | ||
437 | { | ||
438 | return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT); | ||
439 | } | ||
diff --git a/mm/filemap.c b/mm/filemap.c
index 33a28bfde158..4ef24a397684 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -555,11 +555,12 @@ repeat:
555 | page_cache_get(page); | 555 | page_cache_get(page); |
556 | if (TestSetPageLocked(page)) { | 556 | if (TestSetPageLocked(page)) { |
557 | read_unlock_irq(&mapping->tree_lock); | 557 | read_unlock_irq(&mapping->tree_lock); |
558 | lock_page(page); | 558 | __lock_page(page); |
559 | read_lock_irq(&mapping->tree_lock); | 559 | read_lock_irq(&mapping->tree_lock); |
560 | 560 | ||
561 | /* Has the page been truncated while we slept? */ | 561 | /* Has the page been truncated while we slept? */ |
562 | if (page->mapping != mapping || page->index != offset) { | 562 | if (unlikely(page->mapping != mapping || |
563 | page->index != offset)) { | ||
563 | unlock_page(page); | 564 | unlock_page(page); |
564 | page_cache_release(page); | 565 | page_cache_release(page); |
565 | goto repeat; | 566 | goto repeat; |
@@ -831,8 +832,13 @@ readpage:
831 | /* Start the actual read. The read will unlock the page. */ | 832 | /* Start the actual read. The read will unlock the page. */ |
832 | error = mapping->a_ops->readpage(filp, page); | 833 | error = mapping->a_ops->readpage(filp, page); |
833 | 834 | ||
834 | if (unlikely(error)) | 835 | if (unlikely(error)) { |
836 | if (error == AOP_TRUNCATED_PAGE) { | ||
837 | page_cache_release(page); | ||
838 | goto find_page; | ||
839 | } | ||
835 | goto readpage_error; | 840 | goto readpage_error; |
841 | } | ||
836 | 842 | ||
837 | if (!PageUptodate(page)) { | 843 | if (!PageUptodate(page)) { |
838 | lock_page(page); | 844 | lock_page(page); |
@@ -1152,26 +1158,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
1152 | { | 1158 | { |
1153 | struct address_space *mapping = file->f_mapping; | 1159 | struct address_space *mapping = file->f_mapping; |
1154 | struct page *page; | 1160 | struct page *page; |
1155 | int error; | 1161 | int ret; |
1156 | 1162 | ||
1157 | page = page_cache_alloc_cold(mapping); | 1163 | do { |
1158 | if (!page) | 1164 | page = page_cache_alloc_cold(mapping); |
1159 | return -ENOMEM; | 1165 | if (!page) |
1166 | return -ENOMEM; | ||
1167 | |||
1168 | ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); | ||
1169 | if (ret == 0) | ||
1170 | ret = mapping->a_ops->readpage(file, page); | ||
1171 | else if (ret == -EEXIST) | ||
1172 | ret = 0; /* losing race to add is OK */ | ||
1160 | 1173 | ||
1161 | error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); | ||
1162 | if (!error) { | ||
1163 | error = mapping->a_ops->readpage(file, page); | ||
1164 | page_cache_release(page); | 1174 | page_cache_release(page); |
1165 | return error; | ||
1166 | } | ||
1167 | 1175 | ||
1168 | /* | 1176 | } while (ret == AOP_TRUNCATED_PAGE); |
1169 | * We arrive here in the unlikely event that someone | 1177 | |
1170 | * raced with us and added our page to the cache first | 1178 | return ret; |
1171 | * or we are out of memory for radix-tree nodes. | ||
1172 | */ | ||
1173 | page_cache_release(page); | ||
1174 | return error == -EEXIST ? 0 : error; | ||
1175 | } | 1179 | } |
1176 | 1180 | ||
1177 | #define MMAP_LOTSAMISS (100) | 1181 | #define MMAP_LOTSAMISS (100) |
@@ -1331,10 +1335,14 @@ page_not_uptodate:
1331 | goto success; | 1335 | goto success; |
1332 | } | 1336 | } |
1333 | 1337 | ||
1334 | if (!mapping->a_ops->readpage(file, page)) { | 1338 | error = mapping->a_ops->readpage(file, page); |
1339 | if (!error) { | ||
1335 | wait_on_page_locked(page); | 1340 | wait_on_page_locked(page); |
1336 | if (PageUptodate(page)) | 1341 | if (PageUptodate(page)) |
1337 | goto success; | 1342 | goto success; |
1343 | } else if (error == AOP_TRUNCATED_PAGE) { | ||
1344 | page_cache_release(page); | ||
1345 | goto retry_find; | ||
1338 | } | 1346 | } |
1339 | 1347 | ||
1340 | /* | 1348 | /* |
@@ -1358,10 +1366,14 @@ page_not_uptodate:
1358 | goto success; | 1366 | goto success; |
1359 | } | 1367 | } |
1360 | ClearPageError(page); | 1368 | ClearPageError(page); |
1361 | if (!mapping->a_ops->readpage(file, page)) { | 1369 | error = mapping->a_ops->readpage(file, page); |
1370 | if (!error) { | ||
1362 | wait_on_page_locked(page); | 1371 | wait_on_page_locked(page); |
1363 | if (PageUptodate(page)) | 1372 | if (PageUptodate(page)) |
1364 | goto success; | 1373 | goto success; |
1374 | } else if (error == AOP_TRUNCATED_PAGE) { | ||
1375 | page_cache_release(page); | ||
1376 | goto retry_find; | ||
1365 | } | 1377 | } |
1366 | 1378 | ||
1367 | /* | 1379 | /* |
@@ -1444,10 +1456,14 @@ page_not_uptodate:
1444 | goto success; | 1456 | goto success; |
1445 | } | 1457 | } |
1446 | 1458 | ||
1447 | if (!mapping->a_ops->readpage(file, page)) { | 1459 | error = mapping->a_ops->readpage(file, page); |
1460 | if (!error) { | ||
1448 | wait_on_page_locked(page); | 1461 | wait_on_page_locked(page); |
1449 | if (PageUptodate(page)) | 1462 | if (PageUptodate(page)) |
1450 | goto success; | 1463 | goto success; |
1464 | } else if (error == AOP_TRUNCATED_PAGE) { | ||
1465 | page_cache_release(page); | ||
1466 | goto retry_find; | ||
1451 | } | 1467 | } |
1452 | 1468 | ||
1453 | /* | 1469 | /* |
@@ -1470,10 +1486,14 @@ page_not_uptodate:
1470 | } | 1486 | } |
1471 | 1487 | ||
1472 | ClearPageError(page); | 1488 | ClearPageError(page); |
1473 | if (!mapping->a_ops->readpage(file, page)) { | 1489 | error = mapping->a_ops->readpage(file, page); |
1490 | if (!error) { | ||
1474 | wait_on_page_locked(page); | 1491 | wait_on_page_locked(page); |
1475 | if (PageUptodate(page)) | 1492 | if (PageUptodate(page)) |
1476 | goto success; | 1493 | goto success; |
1494 | } else if (error == AOP_TRUNCATED_PAGE) { | ||
1495 | page_cache_release(page); | ||
1496 | goto retry_find; | ||
1477 | } | 1497 | } |
1478 | 1498 | ||
1479 | /* | 1499 | /* |
@@ -1934,12 +1954,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1934 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | 1954 | status = a_ops->prepare_write(file, page, offset, offset+bytes); |
1935 | if (unlikely(status)) { | 1955 | if (unlikely(status)) { |
1936 | loff_t isize = i_size_read(inode); | 1956 | loff_t isize = i_size_read(inode); |
1957 | |||
1958 | if (status != AOP_TRUNCATED_PAGE) | ||
1959 | unlock_page(page); | ||
1960 | page_cache_release(page); | ||
1961 | if (status == AOP_TRUNCATED_PAGE) | ||
1962 | continue; | ||
1937 | /* | 1963 | /* |
1938 | * prepare_write() may have instantiated a few blocks | 1964 | * prepare_write() may have instantiated a few blocks |
1939 | * outside i_size. Trim these off again. | 1965 | * outside i_size. Trim these off again. |
1940 | */ | 1966 | */ |
1941 | unlock_page(page); | ||
1942 | page_cache_release(page); | ||
1943 | if (pos + bytes > isize) | 1967 | if (pos + bytes > isize) |
1944 | vmtruncate(inode, isize); | 1968 | vmtruncate(inode, isize); |
1945 | break; | 1969 | break; |
@@ -1952,6 +1976,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1952 | cur_iov, iov_base, bytes); | 1976 | cur_iov, iov_base, bytes); |
1953 | flush_dcache_page(page); | 1977 | flush_dcache_page(page); |
1954 | status = a_ops->commit_write(file, page, offset, offset+bytes); | 1978 | status = a_ops->commit_write(file, page, offset, offset+bytes); |
1979 | if (status == AOP_TRUNCATED_PAGE) { | ||
1980 | page_cache_release(page); | ||
1981 | continue; | ||
1982 | } | ||
1955 | if (likely(copied > 0)) { | 1983 | if (likely(copied > 0)) { |
1956 | if (!status) | 1984 | if (!status) |
1957 | status = copied; | 1985 | status = copied; |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3e52df7c471b..f4c43d7980ba 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,8 @@
11 | #include <linux/highmem.h> | 11 | #include <linux/highmem.h> |
12 | #include <linux/nodemask.h> | 12 | #include <linux/nodemask.h> |
13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
14 | #include <linux/mempolicy.h> | ||
15 | |||
14 | #include <asm/page.h> | 16 | #include <asm/page.h> |
15 | #include <asm/pgtable.h> | 17 | #include <asm/pgtable.h> |
16 | 18 | ||
@@ -36,18 +38,21 @@ static void enqueue_huge_page(struct page *page)
36 | free_huge_pages_node[nid]++; | 38 | free_huge_pages_node[nid]++; |
37 | } | 39 | } |
38 | 40 | ||
39 | static struct page *dequeue_huge_page(void) | 41 | static struct page *dequeue_huge_page(struct vm_area_struct *vma, |
42 | unsigned long address) | ||
40 | { | 43 | { |
41 | int nid = numa_node_id(); | 44 | int nid = numa_node_id(); |
42 | struct page *page = NULL; | 45 | struct page *page = NULL; |
46 | struct zonelist *zonelist = huge_zonelist(vma, address); | ||
47 | struct zone **z; | ||
43 | 48 | ||
44 | if (list_empty(&hugepage_freelists[nid])) { | 49 | for (z = zonelist->zones; *z; z++) { |
45 | for (nid = 0; nid < MAX_NUMNODES; ++nid) | 50 | nid = (*z)->zone_pgdat->node_id; |
46 | if (!list_empty(&hugepage_freelists[nid])) | 51 | if (!list_empty(&hugepage_freelists[nid])) |
47 | break; | 52 | break; |
48 | } | 53 | } |
49 | if (nid >= 0 && nid < MAX_NUMNODES && | 54 | |
50 | !list_empty(&hugepage_freelists[nid])) { | 55 | if (*z) { |
51 | page = list_entry(hugepage_freelists[nid].next, | 56 | page = list_entry(hugepage_freelists[nid].next, |
52 | struct page, lru); | 57 | struct page, lru); |
53 | list_del(&page->lru); | 58 | list_del(&page->lru); |
@@ -85,13 +90,13 @@ void free_huge_page(struct page *page)
85 | spin_unlock(&hugetlb_lock); | 90 | spin_unlock(&hugetlb_lock); |
86 | } | 91 | } |
87 | 92 | ||
88 | struct page *alloc_huge_page(void) | 93 | struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) |
89 | { | 94 | { |
90 | struct page *page; | 95 | struct page *page; |
91 | int i; | 96 | int i; |
92 | 97 | ||
93 | spin_lock(&hugetlb_lock); | 98 | spin_lock(&hugetlb_lock); |
94 | page = dequeue_huge_page(); | 99 | page = dequeue_huge_page(vma, addr); |
95 | if (!page) { | 100 | if (!page) { |
96 | spin_unlock(&hugetlb_lock); | 101 | spin_unlock(&hugetlb_lock); |
97 | return NULL; | 102 | return NULL; |
@@ -194,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
194 | spin_lock(&hugetlb_lock); | 199 | spin_lock(&hugetlb_lock); |
195 | try_to_free_low(count); | 200 | try_to_free_low(count); |
196 | while (count < nr_huge_pages) { | 201 | while (count < nr_huge_pages) { |
197 | struct page *page = dequeue_huge_page(); | 202 | struct page *page = dequeue_huge_page(NULL, 0); |
198 | if (!page) | 203 | if (!page) |
199 | break; | 204 | break; |
200 | update_and_free_page(page); | 205 | update_and_free_page(page); |
@@ -261,11 +266,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
261 | .nopage = hugetlb_nopage, | 266 | .nopage = hugetlb_nopage, |
262 | }; | 267 | }; |
263 | 268 | ||
264 | static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) | 269 | static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, |
270 | int writable) | ||
265 | { | 271 | { |
266 | pte_t entry; | 272 | pte_t entry; |
267 | 273 | ||
268 | if (vma->vm_flags & VM_WRITE) { | 274 | if (writable) { |
269 | entry = | 275 | entry = |
270 | pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); | 276 | pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); |
271 | } else { | 277 | } else { |
@@ -277,12 +283,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
277 | return entry; | 283 | return entry; |
278 | } | 284 | } |
279 | 285 | ||
286 | static void set_huge_ptep_writable(struct vm_area_struct *vma, | ||
287 | unsigned long address, pte_t *ptep) | ||
288 | { | ||
289 | pte_t entry; | ||
290 | |||
291 | entry = pte_mkwrite(pte_mkdirty(*ptep)); | ||
292 | ptep_set_access_flags(vma, address, ptep, entry, 1); | ||
293 | update_mmu_cache(vma, address, entry); | ||
294 | lazy_mmu_prot_update(entry); | ||
295 | } | ||
296 | |||
297 | |||
280 | int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | 298 | int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, |
281 | struct vm_area_struct *vma) | 299 | struct vm_area_struct *vma) |
282 | { | 300 | { |
283 | pte_t *src_pte, *dst_pte, entry; | 301 | pte_t *src_pte, *dst_pte, entry; |
284 | struct page *ptepage; | 302 | struct page *ptepage; |
285 | unsigned long addr; | 303 | unsigned long addr; |
304 | int cow; | ||
305 | |||
306 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | ||
286 | 307 | ||
287 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | 308 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { |
288 | src_pte = huge_pte_offset(src, addr); | 309 | src_pte = huge_pte_offset(src, addr); |
@@ -294,6 +315,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
294 | spin_lock(&dst->page_table_lock); | 315 | spin_lock(&dst->page_table_lock); |
295 | spin_lock(&src->page_table_lock); | 316 | spin_lock(&src->page_table_lock); |
296 | if (!pte_none(*src_pte)) { | 317 | if (!pte_none(*src_pte)) { |
318 | if (cow) | ||
319 | ptep_set_wrprotect(src, addr, src_pte); | ||
297 | entry = *src_pte; | 320 | entry = *src_pte; |
298 | ptepage = pte_page(entry); | 321 | ptepage = pte_page(entry); |
299 | get_page(ptepage); | 322 | get_page(ptepage); |
@@ -345,57 +368,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
345 | flush_tlb_range(vma, start, end); | 368 | flush_tlb_range(vma, start, end); |
346 | } | 369 | } |
347 | 370 | ||
348 | static struct page *find_lock_huge_page(struct address_space *mapping, | 371 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, |
349 | unsigned long idx) | 372 | unsigned long address, pte_t *ptep, pte_t pte) |
350 | { | 373 | { |
351 | struct page *page; | 374 | struct page *old_page, *new_page; |
352 | int err; | 375 | int i, avoidcopy; |
353 | struct inode *inode = mapping->host; | ||
354 | unsigned long size; | ||
355 | 376 | ||
356 | retry: | 377 | old_page = pte_page(pte); |
357 | page = find_lock_page(mapping, idx); | ||
358 | if (page) | ||
359 | goto out; | ||
360 | 378 | ||
361 | /* Check to make sure the mapping hasn't been truncated */ | 379 | /* If no-one else is actually using this page, avoid the copy |
362 | size = i_size_read(inode) >> HPAGE_SHIFT; | 380 | * and just make the page writable */ |
363 | if (idx >= size) | 381 | avoidcopy = (page_count(old_page) == 1); |
364 | goto out; | 382 | if (avoidcopy) { |
383 | set_huge_ptep_writable(vma, address, ptep); | ||
384 | return VM_FAULT_MINOR; | ||
385 | } | ||
365 | 386 | ||
366 | if (hugetlb_get_quota(mapping)) | 387 | page_cache_get(old_page); |
367 | goto out; | 388 | new_page = alloc_huge_page(vma, address); |
368 | page = alloc_huge_page(); | 389 | |
369 | if (!page) { | 390 | if (!new_page) { |
370 | hugetlb_put_quota(mapping); | 391 | page_cache_release(old_page); |
371 | goto out; | 392 | |
393 | /* Logically this is OOM, not a SIGBUS, but an OOM | ||
394 | * could cause the kernel to go killing other | ||
395 | * processes which won't help the hugepage situation | ||
396 | * at all (?) */ | ||
397 | return VM_FAULT_SIGBUS; | ||
372 | } | 398 | } |
373 | 399 | ||
374 | err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); | 400 | spin_unlock(&mm->page_table_lock); |
375 | if (err) { | 401 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) |
376 | put_page(page); | 402 | copy_user_highpage(new_page + i, old_page + i, |
377 | hugetlb_put_quota(mapping); | 403 | address + i*PAGE_SIZE); |
378 | if (err == -EEXIST) | 404 | spin_lock(&mm->page_table_lock); |
379 | goto retry; | 405 | |
380 | page = NULL; | 406 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); |
407 | if (likely(pte_same(*ptep, pte))) { | ||
408 | /* Break COW */ | ||
409 | set_huge_pte_at(mm, address, ptep, | ||
410 | make_huge_pte(vma, new_page, 1)); | ||
411 | /* Make the old page be freed below */ | ||
412 | new_page = old_page; | ||
381 | } | 413 | } |
382 | out: | 414 | page_cache_release(new_page); |
383 | return page; | 415 | page_cache_release(old_page); |
416 | return VM_FAULT_MINOR; | ||
384 | } | 417 | } |
385 | 418 | ||
386 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 419 | int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, |
387 | unsigned long address, int write_access) | 420 | unsigned long address, pte_t *ptep, int write_access) |
388 | { | 421 | { |
389 | int ret = VM_FAULT_SIGBUS; | 422 | int ret = VM_FAULT_SIGBUS; |
390 | unsigned long idx; | 423 | unsigned long idx; |
391 | unsigned long size; | 424 | unsigned long size; |
392 | pte_t *pte; | ||
393 | struct page *page; | 425 | struct page *page; |
394 | struct address_space *mapping; | 426 | struct address_space *mapping; |
395 | 427 | pte_t new_pte; | |
396 | pte = huge_pte_alloc(mm, address); | ||
397 | if (!pte) | ||
398 | goto out; | ||
399 | 428 | ||
400 | mapping = vma->vm_file->f_mapping; | 429 | mapping = vma->vm_file->f_mapping; |
401 | idx = ((address - vma->vm_start) >> HPAGE_SHIFT) | 430 | idx = ((address - vma->vm_start) >> HPAGE_SHIFT) |
@@ -405,9 +434,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
405 | * Use page lock to guard against racing truncation | 434 | * Use page lock to guard against racing truncation |
406 | * before we get page_table_lock. | 435 | * before we get page_table_lock. |
407 | */ | 436 | */ |
408 | page = find_lock_huge_page(mapping, idx); | 437 | retry: |
409 | if (!page) | 438 | page = find_lock_page(mapping, idx); |
410 | goto out; | 439 | if (!page) { |
440 | if (hugetlb_get_quota(mapping)) | ||
441 | goto out; | ||
442 | page = alloc_huge_page(vma, address); | ||
443 | if (!page) { | ||
444 | hugetlb_put_quota(mapping); | ||
445 | goto out; | ||
446 | } | ||
447 | |||
448 | if (vma->vm_flags & VM_SHARED) { | ||
449 | int err; | ||
450 | |||
451 | err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); | ||
452 | if (err) { | ||
453 | put_page(page); | ||
454 | hugetlb_put_quota(mapping); | ||
455 | if (err == -EEXIST) | ||
456 | goto retry; | ||
457 | goto out; | ||
458 | } | ||
459 | } else | ||
460 | lock_page(page); | ||
461 | } | ||
411 | 462 | ||
412 | spin_lock(&mm->page_table_lock); | 463 | spin_lock(&mm->page_table_lock); |
413 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; | 464 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; |
@@ -415,11 +466,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
415 | goto backout; | 466 | goto backout; |
416 | 467 | ||
417 | ret = VM_FAULT_MINOR; | 468 | ret = VM_FAULT_MINOR; |
418 | if (!pte_none(*pte)) | 469 | if (!pte_none(*ptep)) |
419 | goto backout; | 470 | goto backout; |
420 | 471 | ||
421 | add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); | 472 | add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); |
422 | set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); | 473 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) |
474 | && (vma->vm_flags & VM_SHARED))); | ||
475 | set_huge_pte_at(mm, address, ptep, new_pte); | ||
476 | |||
477 | if (write_access && !(vma->vm_flags & VM_SHARED)) { | ||
478 | /* Optimization, do the COW without a second fault */ | ||
479 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte); | ||
480 | } | ||
481 | |||
423 | spin_unlock(&mm->page_table_lock); | 482 | spin_unlock(&mm->page_table_lock); |
424 | unlock_page(page); | 483 | unlock_page(page); |
425 | out: | 484 | out: |
@@ -433,6 +492,33 @@ backout:
433 | goto out; | 492 | goto out; |
434 | } | 493 | } |
435 | 494 | ||
495 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | ||
496 | unsigned long address, int write_access) | ||
497 | { | ||
498 | pte_t *ptep; | ||
499 | pte_t entry; | ||
500 | int ret; | ||
501 | |||
502 | ptep = huge_pte_alloc(mm, address); | ||
503 | if (!ptep) | ||
504 | return VM_FAULT_OOM; | ||
505 | |||
506 | entry = *ptep; | ||
507 | if (pte_none(entry)) | ||
508 | return hugetlb_no_page(mm, vma, address, ptep, write_access); | ||
509 | |||
510 | ret = VM_FAULT_MINOR; | ||
511 | |||
512 | spin_lock(&mm->page_table_lock); | ||
513 | /* Check for a racing update before calling hugetlb_cow */ | ||
514 | if (likely(pte_same(entry, *ptep))) | ||
515 | if (write_access && !pte_write(entry)) | ||
516 | ret = hugetlb_cow(mm, vma, address, ptep, entry); | ||
517 | spin_unlock(&mm->page_table_lock); | ||
518 | |||
519 | return ret; | ||
520 | } | ||
521 | |||
436 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | 522 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, |
437 | struct page **pages, struct vm_area_struct **vmas, | 523 | struct page **pages, struct vm_area_struct **vmas, |
438 | unsigned long *position, int *length, int i) | 524 | unsigned long *position, int *length, int i) |
diff --git a/mm/internal.h b/mm/internal.h
index 6bf134e8fb3d..17256bb2f4ef 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -9,5 +9,22 @@
9 | * 2 of the License, or (at your option) any later version. | 9 | * 2 of the License, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | /* page_alloc.c */ | 12 | static inline void set_page_refs(struct page *page, int order) |
13 | extern void set_page_refs(struct page *page, int order); | 13 | { |
14 | #ifdef CONFIG_MMU | ||
15 | set_page_count(page, 1); | ||
16 | #else | ||
17 | int i; | ||
18 | |||
19 | /* | ||
20 | * We need to reference all the pages for this order, otherwise if | ||
21 | * anyone accesses one of the pages with (get/put) it will be freed. | ||
22 | * - eg: access_process_vm() | ||
23 | */ | ||
24 | for (i = 0; i < (1 << order); i++) | ||
25 | set_page_count(page + i, 1); | ||
26 | #endif /* CONFIG_MMU */ | ||
27 | } | ||
28 | |||
29 | extern void fastcall __init __free_pages_bootmem(struct page *page, | ||
30 | unsigned int order); | ||
diff --git a/mm/madvise.c b/mm/madvise.c
index 2b7cf0400a21..ae0ae3ea299a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma,
140 | return 0; | 140 | return 0; |
141 | } | 141 | } |
142 | 142 | ||
143 | /* | ||
144 | * Application wants to free up the pages and associated backing store. | ||
145 | * This is effectively punching a hole into the middle of a file. | ||
146 | * | ||
147 | * NOTE: Currently, only shmfs/tmpfs is supported for this operation. | ||
148 | * Other filesystems return -ENOSYS. | ||
149 | */ | ||
150 | static long madvise_remove(struct vm_area_struct *vma, | ||
151 | unsigned long start, unsigned long end) | ||
152 | { | ||
153 | struct address_space *mapping; | ||
154 | loff_t offset, endoff; | ||
155 | |||
156 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) | ||
157 | return -EINVAL; | ||
158 | |||
159 | if (!vma->vm_file || !vma->vm_file->f_mapping | ||
160 | || !vma->vm_file->f_mapping->host) { | ||
161 | return -EINVAL; | ||
162 | } | ||
163 | |||
164 | mapping = vma->vm_file->f_mapping; | ||
165 | |||
166 | offset = (loff_t)(start - vma->vm_start) | ||
167 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | ||
168 | endoff = (loff_t)(end - vma->vm_start - 1) | ||
169 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | ||
170 | return vmtruncate_range(mapping->host, offset, endoff); | ||
171 | } | ||
172 | |||
143 | static long | 173 | static long |
144 | madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | 174 | madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, |
145 | unsigned long start, unsigned long end, int behavior) | 175 | unsigned long start, unsigned long end, int behavior) |
@@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
152 | case MADV_RANDOM: | 182 | case MADV_RANDOM: |
153 | error = madvise_behavior(vma, prev, start, end, behavior); | 183 | error = madvise_behavior(vma, prev, start, end, behavior); |
154 | break; | 184 | break; |
185 | case MADV_REMOVE: | ||
186 | error = madvise_remove(vma, start, end); | ||
187 | break; | ||
155 | 188 | ||
156 | case MADV_WILLNEED: | 189 | case MADV_WILLNEED: |
157 | error = madvise_willneed(vma, prev, start, end); | 190 | error = madvise_willneed(vma, prev, start, end); |
@@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
190 | * some pages ahead. | 223 | * some pages ahead. |
191 | * MADV_DONTNEED - the application is finished with the given range, | 224 | * MADV_DONTNEED - the application is finished with the given range, |
192 | * so the kernel can free resources associated with it. | 225 | * so the kernel can free resources associated with it. |
226 | * MADV_REMOVE - the application wants to free up the given range of | ||
227 | * pages and associated backing store. | ||
193 | * | 228 | * |
194 | * return values: | 229 | * return values: |
195 | * zero - success | 230 | * zero - success |
diff --git a/mm/memory.c b/mm/memory.c
index d8dde07a3656..7197f9bcd384 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1498,7 +1498,7 @@ gotten:
1498 | update_mmu_cache(vma, address, entry); | 1498 | update_mmu_cache(vma, address, entry); |
1499 | lazy_mmu_prot_update(entry); | 1499 | lazy_mmu_prot_update(entry); |
1500 | lru_cache_add_active(new_page); | 1500 | lru_cache_add_active(new_page); |
1501 | page_add_anon_rmap(new_page, vma, address); | 1501 | page_add_new_anon_rmap(new_page, vma, address); |
1502 | 1502 | ||
1503 | /* Free the old page.. */ | 1503 | /* Free the old page.. */ |
1504 | new_page = old_page; | 1504 | new_page = old_page; |
@@ -1770,9 +1770,32 @@ out_big:
1770 | out_busy: | 1770 | out_busy: |
1771 | return -ETXTBSY; | 1771 | return -ETXTBSY; |
1772 | } | 1772 | } |
1773 | |||
1774 | EXPORT_SYMBOL(vmtruncate); | 1773 | EXPORT_SYMBOL(vmtruncate); |
1775 | 1774 | ||
1775 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | ||
1776 | { | ||
1777 | struct address_space *mapping = inode->i_mapping; | ||
1778 | |||
1779 | /* | ||
1780 | * If the underlying filesystem is not going to provide | ||
1781 | * a way to truncate a range of blocks (punch a hole) - | ||
1782 | * we should return failure right now. | ||
1783 | */ | ||
1784 | if (!inode->i_op || !inode->i_op->truncate_range) | ||
1785 | return -ENOSYS; | ||
1786 | |||
1787 | down(&inode->i_sem); | ||
1788 | down_write(&inode->i_alloc_sem); | ||
1789 | unmap_mapping_range(mapping, offset, (end - offset), 1); | ||
1790 | truncate_inode_pages_range(mapping, offset, end); | ||
1791 | inode->i_op->truncate_range(inode, offset, end); | ||
1792 | up_write(&inode->i_alloc_sem); | ||
1793 | up(&inode->i_sem); | ||
1794 | |||
1795 | return 0; | ||
1796 | } | ||
1797 | EXPORT_SYMBOL(vmtruncate_range); | ||
1798 | |||
1776 | /* | 1799 | /* |
1777 | * Primitive swap readahead code. We simply read an aligned block of | 1800 | * Primitive swap readahead code. We simply read an aligned block of |
1778 | * (1 << page_cluster) entries in the swap area. This method is chosen | 1801 | * (1 << page_cluster) entries in the swap area. This method is chosen |
@@ -1954,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1954 | goto release; | 1977 | goto release; |
1955 | inc_mm_counter(mm, anon_rss); | 1978 | inc_mm_counter(mm, anon_rss); |
1956 | lru_cache_add_active(page); | 1979 | lru_cache_add_active(page); |
1957 | SetPageReferenced(page); | 1980 | page_add_new_anon_rmap(page, vma, address); |
1958 | page_add_anon_rmap(page, vma, address); | ||
1959 | } else { | 1981 | } else { |
1960 | /* Map the ZERO_PAGE - vm_page_prot is readonly */ | 1982 | /* Map the ZERO_PAGE - vm_page_prot is readonly */ |
1961 | page = ZERO_PAGE(address); | 1983 | page = ZERO_PAGE(address); |
@@ -2086,7 +2108,7 @@ retry:
2086 | if (anon) { | 2108 | if (anon) { |
2087 | inc_mm_counter(mm, anon_rss); | 2109 | inc_mm_counter(mm, anon_rss); |
2088 | lru_cache_add_active(new_page); | 2110 | lru_cache_add_active(new_page); |
2089 | page_add_anon_rmap(new_page, vma, address); | 2111 | page_add_new_anon_rmap(new_page, vma, address); |
2090 | } else { | 2112 | } else { |
2091 | inc_mm_counter(mm, file_rss); | 2113 | inc_mm_counter(mm, file_rss); |
2092 | page_add_file_rmap(new_page); | 2114 | page_add_file_rmap(new_page); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f6d4af8af8a8..a918f77f02f3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
42 | int nr_pages); | 42 | int nr_pages); |
43 | static int __add_section(struct zone *zone, unsigned long phys_start_pfn) | 43 | static int __add_section(struct zone *zone, unsigned long phys_start_pfn) |
44 | { | 44 | { |
45 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
46 | int nr_pages = PAGES_PER_SECTION; | 45 | int nr_pages = PAGES_PER_SECTION; |
47 | int ret; | 46 | int ret; |
48 | 47 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72f402cc9c9a..0f1d2b8a952b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,7 +93,7 @@ static kmem_cache_t *sn_cache;
93 | 93 | ||
94 | /* Highest zone. An specific allocation for a zone below that is not | 94 | /* Highest zone. An specific allocation for a zone below that is not |
95 | policied. */ | 95 | policied. */ |
96 | static int policy_zone; | 96 | int policy_zone = ZONE_DMA; |
97 | 97 | ||
98 | struct mempolicy default_policy = { | 98 | struct mempolicy default_policy = { |
99 | .refcnt = ATOMIC_INIT(1), /* never free it */ | 99 | .refcnt = ATOMIC_INIT(1), /* never free it */ |
@@ -131,17 +131,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
131 | if (!zl) | 131 | if (!zl) |
132 | return NULL; | 132 | return NULL; |
133 | num = 0; | 133 | num = 0; |
134 | for_each_node_mask(nd, *nodes) { | 134 | for_each_node_mask(nd, *nodes) |
135 | int k; | 135 | zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone]; |
136 | for (k = MAX_NR_ZONES-1; k >= 0; k--) { | ||
137 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; | ||
138 | if (!z->present_pages) | ||
139 | continue; | ||
140 | zl->zones[num++] = z; | ||
141 | if (k > policy_zone) | ||
142 | policy_zone = k; | ||
143 | } | ||
144 | } | ||
145 | zl->zones[num] = NULL; | 136 | zl->zones[num] = NULL; |
146 | return zl; | 137 | return zl; |
147 | } | 138 | } |
@@ -785,6 +776,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
785 | return nid; | 776 | return nid; |
786 | } | 777 | } |
787 | 778 | ||
779 | /* Determine a node number for interleave */ | ||
780 | static inline unsigned interleave_nid(struct mempolicy *pol, | ||
781 | struct vm_area_struct *vma, unsigned long addr, int shift) | ||
782 | { | ||
783 | if (vma) { | ||
784 | unsigned long off; | ||
785 | |||
786 | off = vma->vm_pgoff; | ||
787 | off += (addr - vma->vm_start) >> shift; | ||
788 | return offset_il_node(pol, vma, off); | ||
789 | } else | ||
790 | return interleave_nodes(pol); | ||
791 | } | ||
792 | |||
793 | /* Return a zonelist suitable for a huge page allocation. */ | ||
794 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) | ||
795 | { | ||
796 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | ||
797 | |||
798 | if (pol->policy == MPOL_INTERLEAVE) { | ||
799 | unsigned nid; | ||
800 | |||
801 | nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); | ||
802 | return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER); | ||
803 | } | ||
804 | return zonelist_policy(GFP_HIGHUSER, pol); | ||
805 | } | ||
806 | |||
788 | /* Allocate a page in interleaved policy. | 807 | /* Allocate a page in interleaved policy. |
789 | Own path because it needs to do special accounting. */ | 808 | Own path because it needs to do special accounting. */ |
790 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | 809 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, |
@@ -833,15 +852,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
833 | 852 | ||
834 | if (unlikely(pol->policy == MPOL_INTERLEAVE)) { | 853 | if (unlikely(pol->policy == MPOL_INTERLEAVE)) { |
835 | unsigned nid; | 854 | unsigned nid; |
836 | if (vma) { | 855 | |
837 | unsigned long off; | 856 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); |
838 | off = vma->vm_pgoff; | ||
839 | off += (addr - vma->vm_start) >> PAGE_SHIFT; | ||
840 | nid = offset_il_node(pol, vma, off); | ||
841 | } else { | ||
842 | /* fall back to process interleaving */ | ||
843 | nid = interleave_nodes(pol); | ||
844 | } | ||
845 | return alloc_page_interleave(gfp, 0, nid); | 857 | return alloc_page_interleave(gfp, 0, nid); |
846 | } | 858 | } |
847 | return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); | 859 | return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); |
@@ -940,54 +952,6 @@ void __mpol_free(struct mempolicy *p)
940 | } | 952 | } |
941 | 953 | ||
942 | /* | 954 | /* |
943 | * Hugetlb policy. Same as above, just works with node numbers instead of | ||
944 | * zonelists. | ||
945 | */ | ||
946 | |||
947 | /* Find first node suitable for an allocation */ | ||
948 | int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) | ||
949 | { | ||
950 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | ||
951 | |||
952 | switch (pol->policy) { | ||
953 | case MPOL_DEFAULT: | ||
954 | return numa_node_id(); | ||
955 | case MPOL_BIND: | ||
956 | return pol->v.zonelist->zones[0]->zone_pgdat->node_id; | ||
957 | case MPOL_INTERLEAVE: | ||
958 | return interleave_nodes(pol); | ||
959 | case MPOL_PREFERRED: | ||
960 | return pol->v.preferred_node >= 0 ? | ||
961 | pol->v.preferred_node : numa_node_id(); | ||
962 | } | ||
963 | BUG(); | ||
964 | return 0; | ||
965 | } | ||
966 | |||
967 | /* Find secondary valid nodes for an allocation */ | ||
968 | int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) | ||
969 | { | ||
970 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | ||
971 | |||
972 | switch (pol->policy) { | ||
973 | case MPOL_PREFERRED: | ||
974 | case MPOL_DEFAULT: | ||
975 | case MPOL_INTERLEAVE: | ||
976 | return 1; | ||
977 | case MPOL_BIND: { | ||
978 | struct zone **z; | ||
979 | for (z = pol->v.zonelist->zones; *z; z++) | ||
980 | if ((*z)->zone_pgdat->node_id == nid) | ||
981 | return 1; | ||
982 | return 0; | ||
983 | } | ||
984 | default: | ||
985 | BUG(); | ||
986 | return 0; | ||
987 | } | ||
988 | } | ||
989 | |||
990 | /* | ||
991 | * Shared memory backing store policy support. | 955 | * Shared memory backing store policy support. |
992 | * | 956 | * |
993 | * Remember policies even when nobody has shared memory mapped. | 957 | * Remember policies even when nobody has shared memory mapped. |
diff --git a/mm/nommu.c b/mm/nommu.c
index c1196812876b..c10262d68232 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr)
1177 | { | 1177 | { |
1178 | return 0; | 1178 | return 0; |
1179 | } | 1179 | } |
1180 | |||
1181 | struct page *filemap_nopage(struct vm_area_struct *area, | ||
1182 | unsigned long address, int *type) | ||
1183 | { | ||
1184 | BUG(); | ||
1185 | return NULL; | ||
1186 | } | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe14a8c87fc2..fd47494cb989 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
36 | #include <linux/memory_hotplug.h> | 36 | #include <linux/memory_hotplug.h> |
37 | #include <linux/nodemask.h> | 37 | #include <linux/nodemask.h> |
38 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
39 | #include <linux/mempolicy.h> | ||
39 | 40 | ||
40 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
41 | #include "internal.h" | 42 | #include "internal.h" |
@@ -53,6 +54,8 @@ unsigned long totalram_pages __read_mostly;
53 | unsigned long totalhigh_pages __read_mostly; | 54 | unsigned long totalhigh_pages __read_mostly; |
54 | long nr_swap_pages; | 55 | long nr_swap_pages; |
55 | 56 | ||
57 | static void fastcall free_hot_cold_page(struct page *page, int cold); | ||
58 | |||
56 | /* | 59 | /* |
57 | * results with 256, 32 in the lowmem_reserve sysctl: | 60 | * results with 256, 32 in the lowmem_reserve sysctl: |
58 | * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) | 61 | * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) |
@@ -81,6 +84,7 @@ int min_free_kbytes = 1024;
81 | unsigned long __initdata nr_kernel_pages; | 84 | unsigned long __initdata nr_kernel_pages; |
82 | unsigned long __initdata nr_all_pages; | 85 | unsigned long __initdata nr_all_pages; |
83 | 86 | ||
87 | #ifdef CONFIG_DEBUG_VM | ||
84 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 88 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
85 | { | 89 | { |
86 | int ret = 0; | 90 | int ret = 0; |
@@ -122,16 +126,23 @@ static int bad_range(struct zone *zone, struct page *page)
122 | return 0; | 126 | return 0; |
123 | } | 127 | } |
124 | 128 | ||
125 | static void bad_page(const char *function, struct page *page) | 129 | #else |
130 | static inline int bad_range(struct zone *zone, struct page *page) | ||
131 | { | ||
132 | return 0; | ||
133 | } | ||
134 | #endif | ||
135 | |||
136 | static void bad_page(struct page *page) | ||
126 | { | 137 | { |
127 | printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", | 138 | printk(KERN_EMERG "Bad page state in process '%s'\n" |
128 | function, current->comm, page); | 139 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" |
129 | printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | 140 | "Trying to fix it up, but a reboot is needed\n" |
130 | (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, | 141 | "Backtrace:\n", |
131 | page->mapping, page_mapcount(page), page_count(page)); | 142 | current->comm, page, (int)(2*sizeof(unsigned long)), |
132 | printk(KERN_EMERG "Backtrace:\n"); | 143 | (unsigned long)page->flags, page->mapping, |
144 | page_mapcount(page), page_count(page)); | ||
133 | dump_stack(); | 145 | dump_stack(); |
134 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); | ||
135 | page->flags &= ~(1 << PG_lru | | 146 | page->flags &= ~(1 << PG_lru | |
136 | 1 << PG_private | | 147 | 1 << PG_private | |
137 | 1 << PG_locked | | 148 | 1 << PG_locked | |
@@ -184,19 +195,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
184 | int i; | 195 | int i; |
185 | int nr_pages = 1 << order; | 196 | int nr_pages = 1 << order; |
186 | 197 | ||
187 | if (!PageCompound(page)) | 198 | if (unlikely(page[1].index != order)) |
188 | return; | 199 | bad_page(page); |
189 | |||
190 | if (page[1].index != order) | ||
191 | bad_page(__FUNCTION__, page); | ||
192 | 200 | ||
193 | for (i = 0; i < nr_pages; i++) { | 201 | for (i = 0; i < nr_pages; i++) { |
194 | struct page *p = page + i; | 202 | struct page *p = page + i; |
195 | 203 | ||
196 | if (!PageCompound(p)) | 204 | if (unlikely(!PageCompound(p) | |
197 | bad_page(__FUNCTION__, page); | 205 | (page_private(p) != (unsigned long)page))) |
198 | if (page_private(p) != (unsigned long)page) | 206 | bad_page(page); |
199 | bad_page(__FUNCTION__, page); | ||
200 | ClearPageCompound(p); | 207 | ClearPageCompound(p); |
201 | } | 208 | } |
202 | } | 209 | } |
@@ -255,14 +262,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
255 | /* | 262 | /* |
256 | * This function checks whether a page is free && is the buddy | 263 | * This function checks whether a page is free && is the buddy |
257 | * we can do coalesce a page and its buddy if | 264 | * we can do coalesce a page and its buddy if |
258 | * (a) the buddy is free && | 265 | * (a) the buddy is not in a hole && |
259 | * (b) the buddy is on the buddy system && | 266 | * (b) the buddy is free && |
260 | * (c) a page and its buddy have the same order. | 267 | * (c) the buddy is on the buddy system && |
268 | * (d) a page and its buddy have the same order. | ||
261 | * for recording page's order, we use page_private(page) and PG_private. | 269 | * for recording page's order, we use page_private(page) and PG_private. |
262 | * | 270 | * |
263 | */ | 271 | */ |
264 | static inline int page_is_buddy(struct page *page, int order) | 272 | static inline int page_is_buddy(struct page *page, int order) |
265 | { | 273 | { |
274 | #ifdef CONFIG_HOLES_IN_ZONE | ||
275 | if (!pfn_valid(page_to_pfn(page))) | ||
276 | return 0; | ||
277 | #endif | ||
278 | |||
266 | if (PagePrivate(page) && | 279 | if (PagePrivate(page) && |
267 | (page_order(page) == order) && | 280 | (page_order(page) == order) && |
268 | page_count(page) == 0) | 281 | page_count(page) == 0) |
@@ -300,7 +313,7 @@ static inline void __free_pages_bulk (struct page *page,
300 | unsigned long page_idx; | 313 | unsigned long page_idx; |
301 | int order_size = 1 << order; | 314 | int order_size = 1 << order; |
302 | 315 | ||
303 | if (unlikely(order)) | 316 | if (unlikely(PageCompound(page))) |
304 | destroy_compound_page(page, order); | 317 | destroy_compound_page(page, order); |
305 | 318 | ||
306 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 319 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
@@ -314,17 +327,15 @@ static inline void __free_pages_bulk (struct page *page,
314 | struct free_area *area; | 327 | struct free_area *area; |
315 | struct page *buddy; | 328 | struct page *buddy; |
316 | 329 | ||
317 | combined_idx = __find_combined_index(page_idx, order); | ||
318 | buddy = __page_find_buddy(page, page_idx, order); | 330 | buddy = __page_find_buddy(page, page_idx, order); |
319 | |||
320 | if (bad_range(zone, buddy)) | ||
321 | break; | ||
322 | if (!page_is_buddy(buddy, order)) | 331 | if (!page_is_buddy(buddy, order)) |
323 | break; /* Move the buddy up one level. */ | 332 | break; /* Move the buddy up one level. */ |
333 | |||
324 | list_del(&buddy->lru); | 334 | list_del(&buddy->lru); |
325 | area = zone->free_area + order; | 335 | area = zone->free_area + order; |
326 | area->nr_free--; | 336 | area->nr_free--; |
327 | rmv_page_order(buddy); | 337 | rmv_page_order(buddy); |
338 | combined_idx = __find_combined_index(page_idx, order); | ||
328 | page = page + (combined_idx - page_idx); | 339 | page = page + (combined_idx - page_idx); |
329 | page_idx = combined_idx; | 340 | page_idx = combined_idx; |
330 | order++; | 341 | order++; |
@@ -334,11 +345,11 @@ static inline void __free_pages_bulk (struct page *page,
334 | zone->free_area[order].nr_free++; | 345 | zone->free_area[order].nr_free++; |
335 | } | 346 | } |
336 | 347 | ||
337 | static inline int free_pages_check(const char *function, struct page *page) | 348 | static inline int free_pages_check(struct page *page) |
338 | { | 349 | { |
339 | if ( page_mapcount(page) || | 350 | if (unlikely(page_mapcount(page) | |
340 | page->mapping != NULL || | 351 | (page->mapping != NULL) | |
341 | page_count(page) != 0 || | 352 | (page_count(page) != 0) | |
342 | (page->flags & ( | 353 | (page->flags & ( |
343 | 1 << PG_lru | | 354 | 1 << PG_lru | |
344 | 1 << PG_private | | 355 | 1 << PG_private | |
@@ -348,8 +359,8 @@ static inline int free_pages_check(const char *function, struct page *page)
348 | 1 << PG_slab | | 359 | 1 << PG_slab | |
349 | 1 << PG_swapcache | | 360 | 1 << PG_swapcache | |
350 | 1 << PG_writeback | | 361 | 1 << PG_writeback | |
351 | 1 << PG_reserved ))) | 362 | 1 << PG_reserved )))) |
352 | bad_page(function, page); | 363 | bad_page(page); |
353 | if (PageDirty(page)) | 364 | if (PageDirty(page)) |
354 | __ClearPageDirty(page); | 365 | __ClearPageDirty(page); |
355 | /* | 366 | /* |
@@ -375,11 +386,10 @@ static int
375 | free_pages_bulk(struct zone *zone, int count, | 386 | free_pages_bulk(struct zone *zone, int count, |
376 | struct list_head *list, unsigned int order) | 387 | struct list_head *list, unsigned int order) |
377 | { | 388 | { |
378 | unsigned long flags; | ||
379 | struct page *page = NULL; | 389 | struct page *page = NULL; |
380 | int ret = 0; | 390 | int ret = 0; |
381 | 391 | ||
382 | spin_lock_irqsave(&zone->lock, flags); | 392 | spin_lock(&zone->lock); |
383 | zone->all_unreclaimable = 0; | 393 | zone->all_unreclaimable = 0; |
384 | zone->pages_scanned = 0; | 394 | zone->pages_scanned = 0; |
385 | while (!list_empty(list) && count--) { | 395 | while (!list_empty(list) && count--) { |
@@ -389,12 +399,13 @@ free_pages_bulk(struct zone *zone, int count,
389 | __free_pages_bulk(page, zone, order); | 399 | __free_pages_bulk(page, zone, order); |
390 | ret++; | 400 | ret++; |
391 | } | 401 | } |
392 | spin_unlock_irqrestore(&zone->lock, flags); | 402 | spin_unlock(&zone->lock); |
393 | return ret; | 403 | return ret; |
394 | } | 404 | } |
395 | 405 | ||
396 | void __free_pages_ok(struct page *page, unsigned int order) | 406 | void __free_pages_ok(struct page *page, unsigned int order) |
397 | { | 407 | { |
408 | unsigned long flags; | ||
398 | LIST_HEAD(list); | 409 | LIST_HEAD(list); |
399 | int i; | 410 | int i; |
400 | int reserved = 0; | 411 | int reserved = 0; |
@@ -408,14 +419,49 @@ void __free_pages_ok(struct page *page, unsigned int order)
408 | #endif | 419 | #endif |
409 | 420 | ||
410 | for (i = 0 ; i < (1 << order) ; ++i) | 421 | for (i = 0 ; i < (1 << order) ; ++i) |
411 | reserved += free_pages_check(__FUNCTION__, page + i); | 422 | reserved += free_pages_check(page + i); |
412 | if (reserved) | 423 | if (reserved) |
413 | return; | 424 | return; |
414 | 425 | ||
415 | list_add(&page->lru, &list); | 426 | list_add(&page->lru, &list); |
416 | mod_page_state(pgfree, 1 << order); | ||
417 | kernel_map_pages(page, 1<<order, 0); | 427 | kernel_map_pages(page, 1<<order, 0); |
428 | local_irq_save(flags); | ||
429 | __mod_page_state(pgfree, 1 << order); | ||
418 | free_pages_bulk(page_zone(page), 1, &list, order); | 430 | free_pages_bulk(page_zone(page), 1, &list, order); |
431 | local_irq_restore(flags); | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * permit the bootmem allocator to evade page validation on high-order frees | ||
436 | */ | ||
437 | void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | ||
438 | { | ||
439 | if (order == 0) { | ||
440 | __ClearPageReserved(page); | ||
441 | set_page_count(page, 0); | ||
442 | |||
443 | free_hot_cold_page(page, 0); | ||
444 | } else { | ||
445 | LIST_HEAD(list); | ||
446 | int loop; | ||
447 | |||
448 | for (loop = 0; loop < BITS_PER_LONG; loop++) { | ||
449 | struct page *p = &page[loop]; | ||
450 | |||
451 | if (loop + 16 < BITS_PER_LONG) | ||
452 | prefetchw(p + 16); | ||
453 | __ClearPageReserved(p); | ||
454 | set_page_count(p, 0); | ||
455 | } | ||
456 | |||
457 | arch_free_page(page, order); | ||
458 | |||
459 | mod_page_state(pgfree, 1 << order); | ||
460 | |||
461 | list_add(&page->lru, &list); | ||
462 | kernel_map_pages(page, 1 << order, 0); | ||
463 | free_pages_bulk(page_zone(page), 1, &list, order); | ||
464 | } | ||
419 | } | 465 | } |
420 | 466 | ||
421 | 467 | ||
@@ -433,8 +479,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
433 | * | 479 | * |
434 | * -- wli | 480 | * -- wli |
435 | */ | 481 | */ |
436 | static inline struct page * | 482 | static inline void expand(struct zone *zone, struct page *page, |
437 | expand(struct zone *zone, struct page *page, | ||
438 | int low, int high, struct free_area *area) | 483 | int low, int high, struct free_area *area) |
439 | { | 484 | { |
440 | unsigned long size = 1 << high; | 485 | unsigned long size = 1 << high; |
@@ -448,24 +493,6 @@ expand(struct zone *zone, struct page *page,
448 | area->nr_free++; | 493 | area->nr_free++; |
449 | set_page_order(&page[size], high); | 494 | set_page_order(&page[size], high); |
450 | } | 495 | } |
451 | return page; | ||
452 | } | ||
453 | |||
454 | void set_page_refs(struct page *page, int order) | ||
455 | { | ||
456 | #ifdef CONFIG_MMU | ||
457 | set_page_count(page, 1); | ||
458 | #else | ||
459 | int i; | ||
460 | |||
461 | /* | ||
462 | * We need to reference all the pages for this order, otherwise if | ||
463 | * anyone accesses one of the pages with (get/put) it will be freed. | ||
464 | * - eg: access_process_vm() | ||
465 | */ | ||
466 | for (i = 0; i < (1 << order); i++) | ||
467 | set_page_count(page + i, 1); | ||
468 | #endif /* CONFIG_MMU */ | ||
469 | } | 496 | } |
470 | 497 | ||
471 | /* | 498 | /* |
@@ -473,9 +500,9 @@ void set_page_refs(struct page *page, int order)
473 | */ | 500 | */ |
474 | static int prep_new_page(struct page *page, int order) | 501 | static int prep_new_page(struct page *page, int order) |
475 | { | 502 | { |
476 | if ( page_mapcount(page) || | 503 | if (unlikely(page_mapcount(page) | |
477 | page->mapping != NULL || | 504 | (page->mapping != NULL) | |
478 | page_count(page) != 0 || | 505 | (page_count(page) != 0) | |
479 | (page->flags & ( | 506 | (page->flags & ( |
480 | 1 << PG_lru | | 507 | 1 << PG_lru | |
481 | 1 << PG_private | | 508 | 1 << PG_private | |
@@ -486,8 +513,8 @@ static int prep_new_page(struct page *page, int order)
486 | 1 << PG_slab | | 513 | 1 << PG_slab | |
487 | 1 << PG_swapcache | | 514 | 1 << PG_swapcache | |
488 | 1 << PG_writeback | | 515 | 1 << PG_writeback | |
489 | 1 << PG_reserved ))) | 516 | 1 << PG_reserved )))) |
490 | bad_page(__FUNCTION__, page); | 517 | bad_page(page); |
491 | 518 | ||
492 | /* | 519 | /* |
493 | * For now, we report if PG_reserved was found set, but do not | 520 | * For now, we report if PG_reserved was found set, but do not |
@@ -525,7 +552,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
525 | rmv_page_order(page); | 552 | rmv_page_order(page); |
526 | area->nr_free--; | 553 | area->nr_free--; |
527 | zone->free_pages -= 1UL << order; | 554 | zone->free_pages -= 1UL << order; |
528 | return expand(zone, page, order, current_order, area); | 555 | expand(zone, page, order, current_order, area); |
556 | return page; | ||
529 | } | 557 | } |
530 | 558 | ||
531 | return NULL; | 559 | return NULL; |
@@ -539,21 +567,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) | |||
539 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 567 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
540 | unsigned long count, struct list_head *list) | 568 | unsigned long count, struct list_head *list) |
541 | { | 569 | { |
542 | unsigned long flags; | ||
543 | int i; | 570 | int i; |
544 | int allocated = 0; | ||
545 | struct page *page; | ||
546 | 571 | ||
547 | spin_lock_irqsave(&zone->lock, flags); | 572 | spin_lock(&zone->lock); |
548 | for (i = 0; i < count; ++i) { | 573 | for (i = 0; i < count; ++i) { |
549 | page = __rmqueue(zone, order); | 574 | struct page *page = __rmqueue(zone, order); |
550 | if (page == NULL) | 575 | if (unlikely(page == NULL)) |
551 | break; | 576 | break; |
552 | allocated++; | ||
553 | list_add_tail(&page->lru, list); | 577 | list_add_tail(&page->lru, list); |
554 | } | 578 | } |
555 | spin_unlock_irqrestore(&zone->lock, flags); | 579 | spin_unlock(&zone->lock); |
556 | return allocated; | 580 | return i; |
557 | } | 581 | } |
558 | 582 | ||
559 | #ifdef CONFIG_NUMA | 583 | #ifdef CONFIG_NUMA |
@@ -589,6 +613,7 @@ void drain_remote_pages(void) | |||
589 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) | 613 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) |
590 | static void __drain_pages(unsigned int cpu) | 614 | static void __drain_pages(unsigned int cpu) |
591 | { | 615 | { |
616 | unsigned long flags; | ||
592 | struct zone *zone; | 617 | struct zone *zone; |
593 | int i; | 618 | int i; |
594 | 619 | ||
@@ -600,8 +625,10 @@ static void __drain_pages(unsigned int cpu) | |||
600 | struct per_cpu_pages *pcp; | 625 | struct per_cpu_pages *pcp; |
601 | 626 | ||
602 | pcp = &pset->pcp[i]; | 627 | pcp = &pset->pcp[i]; |
628 | local_irq_save(flags); | ||
603 | pcp->count -= free_pages_bulk(zone, pcp->count, | 629 | pcp->count -= free_pages_bulk(zone, pcp->count, |
604 | &pcp->list, 0); | 630 | &pcp->list, 0); |
631 | local_irq_restore(flags); | ||
605 | } | 632 | } |
606 | } | 633 | } |
607 | } | 634 | } |
@@ -647,18 +674,14 @@ void drain_local_pages(void) | |||
647 | } | 674 | } |
648 | #endif /* CONFIG_PM */ | 675 | #endif /* CONFIG_PM */ |
649 | 676 | ||
650 | static void zone_statistics(struct zonelist *zonelist, struct zone *z) | 677 | static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) |
651 | { | 678 | { |
652 | #ifdef CONFIG_NUMA | 679 | #ifdef CONFIG_NUMA |
653 | unsigned long flags; | ||
654 | int cpu; | ||
655 | pg_data_t *pg = z->zone_pgdat; | 680 | pg_data_t *pg = z->zone_pgdat; |
656 | pg_data_t *orig = zonelist->zones[0]->zone_pgdat; | 681 | pg_data_t *orig = zonelist->zones[0]->zone_pgdat; |
657 | struct per_cpu_pageset *p; | 682 | struct per_cpu_pageset *p; |
658 | 683 | ||
659 | local_irq_save(flags); | 684 | p = zone_pcp(z, cpu); |
660 | cpu = smp_processor_id(); | ||
661 | p = zone_pcp(z,cpu); | ||
662 | if (pg == orig) { | 685 | if (pg == orig) { |
663 | p->numa_hit++; | 686 | p->numa_hit++; |
664 | } else { | 687 | } else { |
@@ -669,14 +692,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) | |||
669 | p->local_node++; | 692 | p->local_node++; |
670 | else | 693 | else |
671 | p->other_node++; | 694 | p->other_node++; |
672 | local_irq_restore(flags); | ||
673 | #endif | 695 | #endif |
674 | } | 696 | } |
675 | 697 | ||
676 | /* | 698 | /* |
677 | * Free a 0-order page | 699 | * Free a 0-order page |
678 | */ | 700 | */ |
679 | static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); | ||
680 | static void fastcall free_hot_cold_page(struct page *page, int cold) | 701 | static void fastcall free_hot_cold_page(struct page *page, int cold) |
681 | { | 702 | { |
682 | struct zone *zone = page_zone(page); | 703 | struct zone *zone = page_zone(page); |
@@ -687,14 +708,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
687 | 708 | ||
688 | if (PageAnon(page)) | 709 | if (PageAnon(page)) |
689 | page->mapping = NULL; | 710 | page->mapping = NULL; |
690 | if (free_pages_check(__FUNCTION__, page)) | 711 | if (free_pages_check(page)) |
691 | return; | 712 | return; |
692 | 713 | ||
693 | inc_page_state(pgfree); | ||
694 | kernel_map_pages(page, 1, 0); | 714 | kernel_map_pages(page, 1, 0); |
695 | 715 | ||
696 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 716 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; |
697 | local_irq_save(flags); | 717 | local_irq_save(flags); |
718 | __inc_page_state(pgfree); | ||
698 | list_add(&page->lru, &pcp->list); | 719 | list_add(&page->lru, &pcp->list); |
699 | pcp->count++; | 720 | pcp->count++; |
700 | if (pcp->count >= pcp->high) | 721 | if (pcp->count >= pcp->high) |
@@ -727,49 +748,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | |||
727 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 748 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
728 | * or two. | 749 | * or two. |
729 | */ | 750 | */ |
730 | static struct page * | 751 | static struct page *buffered_rmqueue(struct zonelist *zonelist, |
731 | buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) | 752 | struct zone *zone, int order, gfp_t gfp_flags) |
732 | { | 753 | { |
733 | unsigned long flags; | 754 | unsigned long flags; |
734 | struct page *page; | 755 | struct page *page; |
735 | int cold = !!(gfp_flags & __GFP_COLD); | 756 | int cold = !!(gfp_flags & __GFP_COLD); |
757 | int cpu; | ||
736 | 758 | ||
737 | again: | 759 | again: |
760 | cpu = get_cpu(); | ||
738 | if (order == 0) { | 761 | if (order == 0) { |
739 | struct per_cpu_pages *pcp; | 762 | struct per_cpu_pages *pcp; |
740 | 763 | ||
741 | page = NULL; | 764 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; |
742 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | ||
743 | local_irq_save(flags); | 765 | local_irq_save(flags); |
744 | if (pcp->count <= pcp->low) | 766 | if (!pcp->count) { |
745 | pcp->count += rmqueue_bulk(zone, 0, | 767 | pcp->count += rmqueue_bulk(zone, 0, |
746 | pcp->batch, &pcp->list); | 768 | pcp->batch, &pcp->list); |
747 | if (pcp->count) { | 769 | if (unlikely(!pcp->count)) |
748 | page = list_entry(pcp->list.next, struct page, lru); | 770 | goto failed; |
749 | list_del(&page->lru); | ||
750 | pcp->count--; | ||
751 | } | 771 | } |
752 | local_irq_restore(flags); | 772 | page = list_entry(pcp->list.next, struct page, lru); |
753 | put_cpu(); | 773 | list_del(&page->lru); |
774 | pcp->count--; | ||
754 | } else { | 775 | } else { |
755 | spin_lock_irqsave(&zone->lock, flags); | 776 | spin_lock_irqsave(&zone->lock, flags); |
756 | page = __rmqueue(zone, order); | 777 | page = __rmqueue(zone, order); |
757 | spin_unlock_irqrestore(&zone->lock, flags); | 778 | spin_unlock(&zone->lock); |
779 | if (!page) | ||
780 | goto failed; | ||
758 | } | 781 | } |
759 | 782 | ||
760 | if (page != NULL) { | 783 | __mod_page_state_zone(zone, pgalloc, 1 << order); |
761 | BUG_ON(bad_range(zone, page)); | 784 | zone_statistics(zonelist, zone, cpu); |
762 | mod_page_state_zone(zone, pgalloc, 1 << order); | 785 | local_irq_restore(flags); |
763 | if (prep_new_page(page, order)) | 786 | put_cpu(); |
764 | goto again; | 787 | |
788 | BUG_ON(bad_range(zone, page)); | ||
789 | if (prep_new_page(page, order)) | ||
790 | goto again; | ||
765 | 791 | ||
766 | if (gfp_flags & __GFP_ZERO) | 792 | if (gfp_flags & __GFP_ZERO) |
767 | prep_zero_page(page, order, gfp_flags); | 793 | prep_zero_page(page, order, gfp_flags); |
768 | 794 | ||
769 | if (order && (gfp_flags & __GFP_COMP)) | 795 | if (order && (gfp_flags & __GFP_COMP)) |
770 | prep_compound_page(page, order); | 796 | prep_compound_page(page, order); |
771 | } | ||
772 | return page; | 797 | return page; |
798 | |||
799 | failed: | ||
800 | local_irq_restore(flags); | ||
801 | put_cpu(); | ||
802 | return NULL; | ||
773 | } | 803 | } |
774 | 804 | ||
775 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ | 805 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ |
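In the order-0 path above, the per-cpu list is now refilled only when it is completely empty, pulling pcp->batch pages out of the buddy lists in one locked pass (rmqueue_bulk can take a plain spin_lock because buffered_rmqueue has already disabled interrupts). A minimal user-space model of that batching, with invented names and the zone lock reduced to a comment:

#include <stdio.h>

#define BATCH 16                        /* stands in for pcp->batch */

static int buddy_pool = 1000;           /* shared pool behind the "zone lock" */

struct pcp_cache {
        int count;                      /* like pcp->count */
};

/* one locked round trip refills up to BATCH items, like rmqueue_bulk() */
static int refill(struct pcp_cache *pcp)
{
        int got = 0;

        /* zone->lock would be taken here, with interrupts already off */
        while (got < BATCH && buddy_pool > 0) {
                buddy_pool--;
                got++;
        }
        /* zone->lock released here */

        pcp->count += got;
        return got;
}

/* order-0 allocation: only an empty cache falls back to the shared pool */
static int alloc_one(struct pcp_cache *pcp)
{
        if (!pcp->count && !refill(pcp))
                return -1;              /* nothing left anywhere */
        pcp->count--;
        return 0;
}

int main(void)
{
        struct pcp_cache pcp = { 0 };
        int n = 0;

        while (alloc_one(&pcp) == 0)
                n++;
        printf("allocated %d pages in %d refills\n", n, n / BATCH + !!(n % BATCH));
        return 0;
}

With the refill happening only on an empty list, the separate pcp->low watermark that the rest of this patch deletes has nothing left to do.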
@@ -845,9 +875,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | |||
845 | continue; | 875 | continue; |
846 | } | 876 | } |
847 | 877 | ||
848 | page = buffered_rmqueue(*z, order, gfp_mask); | 878 | page = buffered_rmqueue(zonelist, *z, order, gfp_mask); |
849 | if (page) { | 879 | if (page) { |
850 | zone_statistics(zonelist, *z); | ||
851 | break; | 880 | break; |
852 | } | 881 | } |
853 | } while (*(++z) != NULL); | 882 | } while (*(++z) != NULL); |
@@ -903,8 +932,7 @@ restart: | |||
903 | alloc_flags |= ALLOC_HARDER; | 932 | alloc_flags |= ALLOC_HARDER; |
904 | if (gfp_mask & __GFP_HIGH) | 933 | if (gfp_mask & __GFP_HIGH) |
905 | alloc_flags |= ALLOC_HIGH; | 934 | alloc_flags |= ALLOC_HIGH; |
906 | if (wait) | 935 | alloc_flags |= ALLOC_CPUSET; |
907 | alloc_flags |= ALLOC_CPUSET; | ||
908 | 936 | ||
909 | /* | 937 | /* |
910 | * Go through the zonelist again. Let __GFP_HIGH and allocations | 938 | * Go through the zonelist again. Let __GFP_HIGH and allocations |
@@ -926,7 +954,7 @@ restart: | |||
926 | nofail_alloc: | 954 | nofail_alloc: |
927 | /* go through the zonelist yet again, ignoring mins */ | 955 | /* go through the zonelist yet again, ignoring mins */ |
928 | page = get_page_from_freelist(gfp_mask, order, | 956 | page = get_page_from_freelist(gfp_mask, order, |
929 | zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); | 957 | zonelist, ALLOC_NO_WATERMARKS); |
930 | if (page) | 958 | if (page) |
931 | goto got_pg; | 959 | goto got_pg; |
932 | if (gfp_mask & __GFP_NOFAIL) { | 960 | if (gfp_mask & __GFP_NOFAIL) { |
@@ -1171,12 +1199,11 @@ EXPORT_SYMBOL(nr_pagecache); | |||
1171 | DEFINE_PER_CPU(long, nr_pagecache_local) = 0; | 1199 | DEFINE_PER_CPU(long, nr_pagecache_local) = 0; |
1172 | #endif | 1200 | #endif |
1173 | 1201 | ||
1174 | void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) | 1202 | static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) |
1175 | { | 1203 | { |
1176 | int cpu = 0; | 1204 | int cpu = 0; |
1177 | 1205 | ||
1178 | memset(ret, 0, sizeof(*ret)); | 1206 | memset(ret, 0, sizeof(*ret)); |
1179 | cpus_and(*cpumask, *cpumask, cpu_online_map); | ||
1180 | 1207 | ||
1181 | cpu = first_cpu(*cpumask); | 1208 | cpu = first_cpu(*cpumask); |
1182 | while (cpu < NR_CPUS) { | 1209 | while (cpu < NR_CPUS) { |
@@ -1224,12 +1251,12 @@ void get_full_page_state(struct page_state *ret) | |||
1224 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); | 1251 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); |
1225 | } | 1252 | } |
1226 | 1253 | ||
1227 | unsigned long __read_page_state(unsigned long offset) | 1254 | unsigned long read_page_state_offset(unsigned long offset) |
1228 | { | 1255 | { |
1229 | unsigned long ret = 0; | 1256 | unsigned long ret = 0; |
1230 | int cpu; | 1257 | int cpu; |
1231 | 1258 | ||
1232 | for_each_online_cpu(cpu) { | 1259 | for_each_cpu(cpu) { |
1233 | unsigned long in; | 1260 | unsigned long in; |
1234 | 1261 | ||
1235 | in = (unsigned long)&per_cpu(page_states, cpu) + offset; | 1262 | in = (unsigned long)&per_cpu(page_states, cpu) + offset; |
@@ -1238,18 +1265,26 @@ unsigned long __read_page_state(unsigned long offset) | |||
1238 | return ret; | 1265 | return ret; |
1239 | } | 1266 | } |
1240 | 1267 | ||
1241 | void __mod_page_state(unsigned long offset, unsigned long delta) | 1268 | void __mod_page_state_offset(unsigned long offset, unsigned long delta) |
1269 | { | ||
1270 | void *ptr; | ||
1271 | |||
1272 | ptr = &__get_cpu_var(page_states); | ||
1273 | *(unsigned long *)(ptr + offset) += delta; | ||
1274 | } | ||
1275 | EXPORT_SYMBOL(__mod_page_state_offset); | ||
1276 | |||
1277 | void mod_page_state_offset(unsigned long offset, unsigned long delta) | ||
1242 | { | 1278 | { |
1243 | unsigned long flags; | 1279 | unsigned long flags; |
1244 | void* ptr; | 1280 | void *ptr; |
1245 | 1281 | ||
1246 | local_irq_save(flags); | 1282 | local_irq_save(flags); |
1247 | ptr = &__get_cpu_var(page_states); | 1283 | ptr = &__get_cpu_var(page_states); |
1248 | *(unsigned long*)(ptr + offset) += delta; | 1284 | *(unsigned long *)(ptr + offset) += delta; |
1249 | local_irq_restore(flags); | 1285 | local_irq_restore(flags); |
1250 | } | 1286 | } |
1251 | 1287 | EXPORT_SYMBOL(mod_page_state_offset); | |
1252 | EXPORT_SYMBOL(__mod_page_state); | ||
1253 | 1288 | ||
1254 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | 1289 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, |
1255 | unsigned long *free, struct pglist_data *pgdat) | 1290 | unsigned long *free, struct pglist_data *pgdat) |
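The page-state helpers now come in two flavours: mod_page_state_offset() disables interrupts around the per-cpu update itself, while __mod_page_state_offset() and the __inc_page_state()/__mod_page_state_zone() callers elsewhere in this patch assume the caller already runs with interrupts off (or that the counter is never modified from interrupt context, as the nr_mapped comment further down notes). That lets a hot path batch several updates under a single irq-off section. A sketch of the two calling conventions, assuming kernel context; zone and nr_freed stand for whatever the caller is accounting:

        unsigned long flags;

        /* one-off update: the irq handling is internal */
        inc_page_state(allocstall);

        /* batched updates: the caller owns the irq-off section */
        local_irq_save(flags);
        __inc_page_state(pgfree);
        __mod_page_state_zone(zone, pgsteal, nr_freed);
        local_irq_restore(flags);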
@@ -1335,7 +1370,7 @@ void show_free_areas(void) | |||
1335 | show_node(zone); | 1370 | show_node(zone); |
1336 | printk("%s per-cpu:", zone->name); | 1371 | printk("%s per-cpu:", zone->name); |
1337 | 1372 | ||
1338 | if (!zone->present_pages) { | 1373 | if (!populated_zone(zone)) { |
1339 | printk(" empty\n"); | 1374 | printk(" empty\n"); |
1340 | continue; | 1375 | continue; |
1341 | } else | 1376 | } else |
@@ -1347,10 +1382,9 @@ void show_free_areas(void) | |||
1347 | pageset = zone_pcp(zone, cpu); | 1382 | pageset = zone_pcp(zone, cpu); |
1348 | 1383 | ||
1349 | for (temperature = 0; temperature < 2; temperature++) | 1384 | for (temperature = 0; temperature < 2; temperature++) |
1350 | printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", | 1385 | printk("cpu %d %s: high %d, batch %d used:%d\n", |
1351 | cpu, | 1386 | cpu, |
1352 | temperature ? "cold" : "hot", | 1387 | temperature ? "cold" : "hot", |
1353 | pageset->pcp[temperature].low, | ||
1354 | pageset->pcp[temperature].high, | 1388 | pageset->pcp[temperature].high, |
1355 | pageset->pcp[temperature].batch, | 1389 | pageset->pcp[temperature].batch, |
1356 | pageset->pcp[temperature].count); | 1390 | pageset->pcp[temperature].count); |
@@ -1413,7 +1447,7 @@ void show_free_areas(void) | |||
1413 | 1447 | ||
1414 | show_node(zone); | 1448 | show_node(zone); |
1415 | printk("%s: ", zone->name); | 1449 | printk("%s: ", zone->name); |
1416 | if (!zone->present_pages) { | 1450 | if (!populated_zone(zone)) { |
1417 | printk("empty\n"); | 1451 | printk("empty\n"); |
1418 | continue; | 1452 | continue; |
1419 | } | 1453 | } |
@@ -1433,36 +1467,29 @@ void show_free_areas(void) | |||
1433 | 1467 | ||
1434 | /* | 1468 | /* |
1435 | * Builds allocation fallback zone lists. | 1469 | * Builds allocation fallback zone lists. |
1470 | * | ||
1471 | * Add all populated zones of a node to the zonelist. | ||
1436 | */ | 1472 | */ |
1437 | static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) | 1473 | static int __init build_zonelists_node(pg_data_t *pgdat, |
1438 | { | 1474 | struct zonelist *zonelist, int nr_zones, int zone_type) |
1439 | switch (k) { | 1475 | { |
1440 | struct zone *zone; | 1476 | struct zone *zone; |
1441 | default: | 1477 | |
1442 | BUG(); | 1478 | BUG_ON(zone_type > ZONE_HIGHMEM); |
1443 | case ZONE_HIGHMEM: | 1479 | |
1444 | zone = pgdat->node_zones + ZONE_HIGHMEM; | 1480 | do { |
1445 | if (zone->present_pages) { | 1481 | zone = pgdat->node_zones + zone_type; |
1482 | if (populated_zone(zone)) { | ||
1446 | #ifndef CONFIG_HIGHMEM | 1483 | #ifndef CONFIG_HIGHMEM |
1447 | BUG(); | 1484 | BUG_ON(zone_type > ZONE_NORMAL); |
1448 | #endif | 1485 | #endif |
1449 | zonelist->zones[j++] = zone; | 1486 | zonelist->zones[nr_zones++] = zone; |
1487 | check_highest_zone(zone_type); | ||
1450 | } | 1488 | } |
1451 | case ZONE_NORMAL: | 1489 | zone_type--; |
1452 | zone = pgdat->node_zones + ZONE_NORMAL; | ||
1453 | if (zone->present_pages) | ||
1454 | zonelist->zones[j++] = zone; | ||
1455 | case ZONE_DMA32: | ||
1456 | zone = pgdat->node_zones + ZONE_DMA32; | ||
1457 | if (zone->present_pages) | ||
1458 | zonelist->zones[j++] = zone; | ||
1459 | case ZONE_DMA: | ||
1460 | zone = pgdat->node_zones + ZONE_DMA; | ||
1461 | if (zone->present_pages) | ||
1462 | zonelist->zones[j++] = zone; | ||
1463 | } | ||
1464 | 1490 | ||
1465 | return j; | 1491 | } while (zone_type >= 0); |
1492 | return nr_zones; | ||
1466 | } | 1493 | } |
1467 | 1494 | ||
1468 | static inline int highest_zone(int zone_bits) | 1495 | static inline int highest_zone(int zone_bits) |
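build_zonelists_node() above replaces the switch with deliberate fall-through by a countdown over zone indices, appending every populated zone from the requested type down to ZONE_DMA and recording the highest zone seen via check_highest_zone(). The fallback ordering on its own, as a self-contained model (the zone names and the populated[] flags are invented for the example):

#include <stdio.h>

enum { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

static const char *zone_name[MAX_NR_ZONES] = {
        "DMA", "DMA32", "Normal", "HighMem"
};

/* append each populated zone from zone_type down to 0, like build_zonelists_node() */
static int build_node_zonelist(const int *populated, int zone_type,
                               const char **list, int nr)
{
        do {
                if (populated[zone_type])
                        list[nr++] = zone_name[zone_type];
                zone_type--;
        } while (zone_type >= 0);
        return nr;
}

int main(void)
{
        /* this node has no DMA32 and no HighMem memory */
        int populated[MAX_NR_ZONES] = { 1, 0, 1, 0 };
        const char *list[MAX_NR_ZONES];
        int i, n;

        n = build_node_zonelist(populated, ZONE_HIGHMEM, list, 0);
        for (i = 0; i < n; i++)
                printf("fallback %d: %s\n", i, list[i]);
        return 0;
}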
@@ -1709,8 +1736,6 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
1709 | for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { | 1736 | for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { |
1710 | if (!early_pfn_valid(pfn)) | 1737 | if (!early_pfn_valid(pfn)) |
1711 | continue; | 1738 | continue; |
1712 | if (!early_pfn_in_nid(pfn, nid)) | ||
1713 | continue; | ||
1714 | page = pfn_to_page(pfn); | 1739 | page = pfn_to_page(pfn); |
1715 | set_page_links(page, zone, nid, pfn); | 1740 | set_page_links(page, zone, nid, pfn); |
1716 | set_page_count(page, 1); | 1741 | set_page_count(page, 1); |
@@ -1794,14 +1819,12 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
1794 | 1819 | ||
1795 | pcp = &p->pcp[0]; /* hot */ | 1820 | pcp = &p->pcp[0]; /* hot */ |
1796 | pcp->count = 0; | 1821 | pcp->count = 0; |
1797 | pcp->low = 0; | ||
1798 | pcp->high = 6 * batch; | 1822 | pcp->high = 6 * batch; |
1799 | pcp->batch = max(1UL, 1 * batch); | 1823 | pcp->batch = max(1UL, 1 * batch); |
1800 | INIT_LIST_HEAD(&pcp->list); | 1824 | INIT_LIST_HEAD(&pcp->list); |
1801 | 1825 | ||
1802 | pcp = &p->pcp[1]; /* cold*/ | 1826 | pcp = &p->pcp[1]; /* cold*/ |
1803 | pcp->count = 0; | 1827 | pcp->count = 0; |
1804 | pcp->low = 0; | ||
1805 | pcp->high = 2 * batch; | 1828 | pcp->high = 2 * batch; |
1806 | pcp->batch = max(1UL, batch/2); | 1829 | pcp->batch = max(1UL, batch/2); |
1807 | INIT_LIST_HEAD(&pcp->list); | 1830 | INIT_LIST_HEAD(&pcp->list); |
@@ -2116,7 +2139,7 @@ static int frag_show(struct seq_file *m, void *arg) | |||
2116 | int order; | 2139 | int order; |
2117 | 2140 | ||
2118 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 2141 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { |
2119 | if (!zone->present_pages) | 2142 | if (!populated_zone(zone)) |
2120 | continue; | 2143 | continue; |
2121 | 2144 | ||
2122 | spin_lock_irqsave(&zone->lock, flags); | 2145 | spin_lock_irqsave(&zone->lock, flags); |
@@ -2149,7 +2172,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
2149 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | 2172 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { |
2150 | int i; | 2173 | int i; |
2151 | 2174 | ||
2152 | if (!zone->present_pages) | 2175 | if (!populated_zone(zone)) |
2153 | continue; | 2176 | continue; |
2154 | 2177 | ||
2155 | spin_lock_irqsave(&zone->lock, flags); | 2178 | spin_lock_irqsave(&zone->lock, flags); |
@@ -2197,12 +2220,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
2197 | seq_printf(m, | 2220 | seq_printf(m, |
2198 | "\n cpu: %i pcp: %i" | 2221 | "\n cpu: %i pcp: %i" |
2199 | "\n count: %i" | 2222 | "\n count: %i" |
2200 | "\n low: %i" | ||
2201 | "\n high: %i" | 2223 | "\n high: %i" |
2202 | "\n batch: %i", | 2224 | "\n batch: %i", |
2203 | i, j, | 2225 | i, j, |
2204 | pageset->pcp[j].count, | 2226 | pageset->pcp[j].count, |
2205 | pageset->pcp[j].low, | ||
2206 | pageset->pcp[j].high, | 2227 | pageset->pcp[j].high, |
2207 | pageset->pcp[j].batch); | 2228 | pageset->pcp[j].batch); |
2208 | } | 2229 | } |
@@ -2257,32 +2278,40 @@ static char *vmstat_text[] = { | |||
2257 | "pgpgout", | 2278 | "pgpgout", |
2258 | "pswpin", | 2279 | "pswpin", |
2259 | "pswpout", | 2280 | "pswpout", |
2260 | "pgalloc_high", | ||
2261 | 2281 | ||
2282 | "pgalloc_high", | ||
2262 | "pgalloc_normal", | 2283 | "pgalloc_normal", |
2284 | "pgalloc_dma32", | ||
2263 | "pgalloc_dma", | 2285 | "pgalloc_dma", |
2286 | |||
2264 | "pgfree", | 2287 | "pgfree", |
2265 | "pgactivate", | 2288 | "pgactivate", |
2266 | "pgdeactivate", | 2289 | "pgdeactivate", |
2267 | 2290 | ||
2268 | "pgfault", | 2291 | "pgfault", |
2269 | "pgmajfault", | 2292 | "pgmajfault", |
2293 | |||
2270 | "pgrefill_high", | 2294 | "pgrefill_high", |
2271 | "pgrefill_normal", | 2295 | "pgrefill_normal", |
2296 | "pgrefill_dma32", | ||
2272 | "pgrefill_dma", | 2297 | "pgrefill_dma", |
2273 | 2298 | ||
2274 | "pgsteal_high", | 2299 | "pgsteal_high", |
2275 | "pgsteal_normal", | 2300 | "pgsteal_normal", |
2301 | "pgsteal_dma32", | ||
2276 | "pgsteal_dma", | 2302 | "pgsteal_dma", |
2303 | |||
2277 | "pgscan_kswapd_high", | 2304 | "pgscan_kswapd_high", |
2278 | "pgscan_kswapd_normal", | 2305 | "pgscan_kswapd_normal", |
2279 | 2306 | "pgscan_kswapd_dma32", | |
2280 | "pgscan_kswapd_dma", | 2307 | "pgscan_kswapd_dma", |
2308 | |||
2281 | "pgscan_direct_high", | 2309 | "pgscan_direct_high", |
2282 | "pgscan_direct_normal", | 2310 | "pgscan_direct_normal", |
2311 | "pgscan_direct_dma32", | ||
2283 | "pgscan_direct_dma", | 2312 | "pgscan_direct_dma", |
2284 | "pginodesteal", | ||
2285 | 2313 | ||
2314 | "pginodesteal", | ||
2286 | "slabs_scanned", | 2315 | "slabs_scanned", |
2287 | "kswapd_steal", | 2316 | "kswapd_steal", |
2288 | "kswapd_inodesteal", | 2317 | "kswapd_inodesteal", |
diff --git a/mm/readahead.c b/mm/readahead.c index 72e7adbb87c7..8d6eeaaa6296 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
158 | { | 158 | { |
159 | unsigned page_idx; | 159 | unsigned page_idx; |
160 | struct pagevec lru_pvec; | 160 | struct pagevec lru_pvec; |
161 | int ret = 0; | 161 | int ret; |
162 | 162 | ||
163 | if (mapping->a_ops->readpages) { | 163 | if (mapping->a_ops->readpages) { |
164 | ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); | 164 | ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); |
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
171 | list_del(&page->lru); | 171 | list_del(&page->lru); |
172 | if (!add_to_page_cache(page, mapping, | 172 | if (!add_to_page_cache(page, mapping, |
173 | page->index, GFP_KERNEL)) { | 173 | page->index, GFP_KERNEL)) { |
174 | mapping->a_ops->readpage(filp, page); | 174 | ret = mapping->a_ops->readpage(filp, page); |
175 | if (!pagevec_add(&lru_pvec, page)) | 175 | if (ret != AOP_TRUNCATED_PAGE) { |
176 | __pagevec_lru_add(&lru_pvec); | 176 | if (!pagevec_add(&lru_pvec, page)) |
177 | } else { | 177 | __pagevec_lru_add(&lru_pvec); |
178 | page_cache_release(page); | 178 | continue; |
179 | } /* else fall through to release */ | ||
179 | } | 180 | } |
181 | page_cache_release(page); | ||
180 | } | 182 | } |
181 | pagevec_lru_add(&lru_pvec); | 183 | pagevec_lru_add(&lru_pvec); |
184 | ret = 0; | ||
182 | out: | 185 | out: |
183 | return ret; | 186 | return ret; |
184 | } | 187 | } |
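read_pages() above now has to cope with ->readpage() returning AOP_TRUNCATED_PAGE, which signals that the aop unlocked the page and it may have been truncated in the meantime; the caller's only job is to drop its reference instead of putting the page on the LRU. The fall-through in the loop does exactly that; written as an early-out inside the same loop it would look roughly like this (a sketch of the caller side, kernel context assumed):

                ret = mapping->a_ops->readpage(filp, page);
                if (ret == AOP_TRUNCATED_PAGE) {
                        /* page unlocked, possibly truncated: drop our ref, try the next one */
                        page_cache_release(page);
                        continue;
                }
                if (!pagevec_add(&lru_pvec, page))
                        __pagevec_lru_add(&lru_pvec);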
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -435,6 +435,30 @@ int page_referenced(struct page *page, int is_locked) | |||
435 | } | 435 | } |
436 | 436 | ||
437 | /** | 437 | /** |
438 | * page_set_anon_rmap - setup new anonymous rmap | ||
439 | * @page: the page to add the mapping to | ||
440 | * @vma: the vm area in which the mapping is added | ||
441 | * @address: the user virtual address mapped | ||
442 | */ | ||
443 | static void __page_set_anon_rmap(struct page *page, | ||
444 | struct vm_area_struct *vma, unsigned long address) | ||
445 | { | ||
446 | struct anon_vma *anon_vma = vma->anon_vma; | ||
447 | |||
448 | BUG_ON(!anon_vma); | ||
449 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
450 | page->mapping = (struct address_space *) anon_vma; | ||
451 | |||
452 | page->index = linear_page_index(vma, address); | ||
453 | |||
454 | /* | ||
455 | * nr_mapped state can be updated without turning off | ||
456 | * interrupts because it is not modified via interrupt. | ||
457 | */ | ||
458 | __inc_page_state(nr_mapped); | ||
459 | } | ||
460 | |||
461 | /** | ||
438 | * page_add_anon_rmap - add pte mapping to an anonymous page | 462 | * page_add_anon_rmap - add pte mapping to an anonymous page |
439 | * @page: the page to add the mapping to | 463 | * @page: the page to add the mapping to |
440 | * @vma: the vm area in which the mapping is added | 464 | * @vma: the vm area in which the mapping is added |
@@ -445,20 +469,27 @@ int page_referenced(struct page *page, int is_locked) | |||
445 | void page_add_anon_rmap(struct page *page, | 469 | void page_add_anon_rmap(struct page *page, |
446 | struct vm_area_struct *vma, unsigned long address) | 470 | struct vm_area_struct *vma, unsigned long address) |
447 | { | 471 | { |
448 | if (atomic_inc_and_test(&page->_mapcount)) { | 472 | if (atomic_inc_and_test(&page->_mapcount)) |
449 | struct anon_vma *anon_vma = vma->anon_vma; | 473 | __page_set_anon_rmap(page, vma, address); |
450 | |||
451 | BUG_ON(!anon_vma); | ||
452 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
453 | page->mapping = (struct address_space *) anon_vma; | ||
454 | |||
455 | page->index = linear_page_index(vma, address); | ||
456 | |||
457 | inc_page_state(nr_mapped); | ||
458 | } | ||
459 | /* else checking page index and mapping is racy */ | 474 | /* else checking page index and mapping is racy */ |
460 | } | 475 | } |
461 | 476 | ||
477 | /* | ||
478 | * page_add_new_anon_rmap - add pte mapping to a new anonymous page | ||
479 | * @page: the page to add the mapping to | ||
480 | * @vma: the vm area in which the mapping is added | ||
481 | * @address: the user virtual address mapped | ||
482 | * | ||
483 | * Same as page_add_anon_rmap but must only be called on *new* pages. | ||
484 | * This means the inc-and-test can be bypassed. | ||
485 | */ | ||
486 | void page_add_new_anon_rmap(struct page *page, | ||
487 | struct vm_area_struct *vma, unsigned long address) | ||
488 | { | ||
489 | atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ | ||
490 | __page_set_anon_rmap(page, vma, address); | ||
491 | } | ||
492 | |||
462 | /** | 493 | /** |
463 | * page_add_file_rmap - add pte mapping to a file page | 494 | * page_add_file_rmap - add pte mapping to a file page |
464 | * @page: the page to add the mapping to | 495 | * @page: the page to add the mapping to |
@@ -471,7 +502,7 @@ void page_add_file_rmap(struct page *page) | |||
471 | BUG_ON(!pfn_valid(page_to_pfn(page))); | 502 | BUG_ON(!pfn_valid(page_to_pfn(page))); |
472 | 503 | ||
473 | if (atomic_inc_and_test(&page->_mapcount)) | 504 | if (atomic_inc_and_test(&page->_mapcount)) |
474 | inc_page_state(nr_mapped); | 505 | __inc_page_state(nr_mapped); |
475 | } | 506 | } |
476 | 507 | ||
477 | /** | 508 | /** |
@@ -495,7 +526,7 @@ void page_remove_rmap(struct page *page) | |||
495 | */ | 526 | */ |
496 | if (page_test_and_clear_dirty(page)) | 527 | if (page_test_and_clear_dirty(page)) |
497 | set_page_dirty(page); | 528 | set_page_dirty(page); |
498 | dec_page_state(nr_mapped); | 529 | __dec_page_state(nr_mapped); |
499 | } | 530 | } |
500 | } | 531 | } |
501 | 532 | ||
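page_add_new_anon_rmap() above sets _mapcount directly because a page that has just been allocated for a fault cannot be mapped anywhere else yet, so the atomic inc-and-test of page_add_anon_rmap() would be wasted work. Roughly how a fault path would use it; the surrounding allocation and pte setup are illustrative, not lifted from this patch:

        struct page *page = alloc_zeroed_user_highpage(vma, address);

        if (!page)
                return VM_FAULT_OOM;
        /* ...charge the mm, build and install the pte, add the page to the LRU... */
        page_add_new_anon_rmap(page, vma, address);     /* no concurrent mappers possible */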
diff --git a/mm/shmem.c b/mm/shmem.c index dc25565a61e9..a1f2f02af724 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next) | |||
457 | } while (next); | 457 | } while (next); |
458 | } | 458 | } |
459 | 459 | ||
460 | static void shmem_truncate(struct inode *inode) | 460 | static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) |
461 | { | 461 | { |
462 | struct shmem_inode_info *info = SHMEM_I(inode); | 462 | struct shmem_inode_info *info = SHMEM_I(inode); |
463 | unsigned long idx; | 463 | unsigned long idx; |
@@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode) | |||
475 | long nr_swaps_freed = 0; | 475 | long nr_swaps_freed = 0; |
476 | int offset; | 476 | int offset; |
477 | int freed; | 477 | int freed; |
478 | int punch_hole = 0; | ||
478 | 479 | ||
479 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 480 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
480 | idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 481 | idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
481 | if (idx >= info->next_index) | 482 | if (idx >= info->next_index) |
482 | return; | 483 | return; |
483 | 484 | ||
484 | spin_lock(&info->lock); | 485 | spin_lock(&info->lock); |
485 | info->flags |= SHMEM_TRUNCATE; | 486 | info->flags |= SHMEM_TRUNCATE; |
486 | limit = info->next_index; | 487 | if (likely(end == (loff_t) -1)) { |
487 | info->next_index = idx; | 488 | limit = info->next_index; |
489 | info->next_index = idx; | ||
490 | } else { | ||
491 | limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
492 | if (limit > info->next_index) | ||
493 | limit = info->next_index; | ||
494 | punch_hole = 1; | ||
495 | } | ||
496 | |||
488 | topdir = info->i_indirect; | 497 | topdir = info->i_indirect; |
489 | if (topdir && idx <= SHMEM_NR_DIRECT) { | 498 | if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { |
490 | info->i_indirect = NULL; | 499 | info->i_indirect = NULL; |
491 | nr_pages_to_free++; | 500 | nr_pages_to_free++; |
492 | list_add(&topdir->lru, &pages_to_free); | 501 | list_add(&topdir->lru, &pages_to_free); |
@@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode) | |||
573 | set_page_private(subdir, page_private(subdir) - freed); | 582 | set_page_private(subdir, page_private(subdir) - freed); |
574 | if (offset) | 583 | if (offset) |
575 | spin_unlock(&info->lock); | 584 | spin_unlock(&info->lock); |
576 | BUG_ON(page_private(subdir) > offset); | 585 | if (!punch_hole) |
586 | BUG_ON(page_private(subdir) > offset); | ||
577 | } | 587 | } |
578 | if (offset) | 588 | if (offset) |
579 | offset = 0; | 589 | offset = 0; |
580 | else if (subdir) { | 590 | else if (subdir && !page_private(subdir)) { |
581 | dir[diroff] = NULL; | 591 | dir[diroff] = NULL; |
582 | nr_pages_to_free++; | 592 | nr_pages_to_free++; |
583 | list_add(&subdir->lru, &pages_to_free); | 593 | list_add(&subdir->lru, &pages_to_free); |
@@ -594,7 +604,7 @@ done2: | |||
594 | * Also, though shmem_getpage checks i_size before adding to | 604 | * Also, though shmem_getpage checks i_size before adding to |
595 | * cache, no recheck after: so fix the narrow window there too. | 605 | * cache, no recheck after: so fix the narrow window there too. |
596 | */ | 606 | */ |
597 | truncate_inode_pages(inode->i_mapping, inode->i_size); | 607 | truncate_inode_pages_range(inode->i_mapping, start, end); |
598 | } | 608 | } |
599 | 609 | ||
600 | spin_lock(&info->lock); | 610 | spin_lock(&info->lock); |
@@ -614,6 +624,11 @@ done2: | |||
614 | } | 624 | } |
615 | } | 625 | } |
616 | 626 | ||
627 | static void shmem_truncate(struct inode *inode) | ||
628 | { | ||
629 | shmem_truncate_range(inode, inode->i_size, (loff_t)-1); | ||
630 | } | ||
631 | |||
617 | static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | 632 | static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) |
618 | { | 633 | { |
619 | struct inode *inode = dentry->d_inode; | 634 | struct inode *inode = dentry->d_inode; |
@@ -855,7 +870,7 @@ unlock: | |||
855 | swap_free(swap); | 870 | swap_free(swap); |
856 | redirty: | 871 | redirty: |
857 | set_page_dirty(page); | 872 | set_page_dirty(page); |
858 | return WRITEPAGE_ACTIVATE; /* Return with the page locked */ | 873 | return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */ |
859 | } | 874 | } |
860 | 875 | ||
861 | #ifdef CONFIG_NUMA | 876 | #ifdef CONFIG_NUMA |
@@ -1255,7 +1270,7 @@ out_nomem: | |||
1255 | return retval; | 1270 | return retval; |
1256 | } | 1271 | } |
1257 | 1272 | ||
1258 | static int shmem_mmap(struct file *file, struct vm_area_struct *vma) | 1273 | int shmem_mmap(struct file *file, struct vm_area_struct *vma) |
1259 | { | 1274 | { |
1260 | file_accessed(file); | 1275 | file_accessed(file); |
1261 | vma->vm_ops = &shmem_vm_ops; | 1276 | vma->vm_ops = &shmem_vm_ops; |
@@ -2083,6 +2098,7 @@ static struct file_operations shmem_file_operations = { | |||
2083 | static struct inode_operations shmem_inode_operations = { | 2098 | static struct inode_operations shmem_inode_operations = { |
2084 | .truncate = shmem_truncate, | 2099 | .truncate = shmem_truncate, |
2085 | .setattr = shmem_notify_change, | 2100 | .setattr = shmem_notify_change, |
2101 | .truncate_range = shmem_truncate_range, | ||
2086 | }; | 2102 | }; |
2087 | 2103 | ||
2088 | static struct inode_operations shmem_dir_inode_operations = { | 2104 | static struct inode_operations shmem_dir_inode_operations = { |
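Whole-file truncation is now just the range call with end == (loff_t)-1, and the same range entry point is exported to the rest of the kernel through the new .truncate_range inode operation. A caller punching out a page-aligned range would reach it roughly like this (start and end are the caller's byte offsets; locking such as i_sem is not shown):

        /* end must address the last byte of the final page in the range */
        if (inode->i_op && inode->i_op->truncate_range)
                inode->i_op->truncate_range(inode, start, end);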
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -156,16 +156,22 @@ void fastcall lru_cache_add_active(struct page *page) | |||
156 | put_cpu_var(lru_add_active_pvecs); | 156 | put_cpu_var(lru_add_active_pvecs); |
157 | } | 157 | } |
158 | 158 | ||
159 | void lru_add_drain(void) | 159 | static void __lru_add_drain(int cpu) |
160 | { | 160 | { |
161 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); | 161 | struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); |
162 | 162 | ||
163 | /* CPU is dead, so no locking needed. */ | ||
163 | if (pagevec_count(pvec)) | 164 | if (pagevec_count(pvec)) |
164 | __pagevec_lru_add(pvec); | 165 | __pagevec_lru_add(pvec); |
165 | pvec = &__get_cpu_var(lru_add_active_pvecs); | 166 | pvec = &per_cpu(lru_add_active_pvecs, cpu); |
166 | if (pagevec_count(pvec)) | 167 | if (pagevec_count(pvec)) |
167 | __pagevec_lru_add_active(pvec); | 168 | __pagevec_lru_add_active(pvec); |
168 | put_cpu_var(lru_add_pvecs); | 169 | } |
170 | |||
171 | void lru_add_drain(void) | ||
172 | { | ||
173 | __lru_add_drain(get_cpu()); | ||
174 | put_cpu(); | ||
169 | } | 175 | } |
170 | 176 | ||
171 | /* | 177 | /* |
@@ -412,17 +418,6 @@ void vm_acct_memory(long pages) | |||
412 | } | 418 | } |
413 | 419 | ||
414 | #ifdef CONFIG_HOTPLUG_CPU | 420 | #ifdef CONFIG_HOTPLUG_CPU |
415 | static void lru_drain_cache(unsigned int cpu) | ||
416 | { | ||
417 | struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); | ||
418 | |||
419 | /* CPU is dead, so no locking needed. */ | ||
420 | if (pagevec_count(pvec)) | ||
421 | __pagevec_lru_add(pvec); | ||
422 | pvec = &per_cpu(lru_add_active_pvecs, cpu); | ||
423 | if (pagevec_count(pvec)) | ||
424 | __pagevec_lru_add_active(pvec); | ||
425 | } | ||
426 | 421 | ||
427 | /* Drop the CPU's cached committed space back into the central pool. */ | 422 | /* Drop the CPU's cached committed space back into the central pool. */ |
428 | static int cpu_swap_callback(struct notifier_block *nfb, | 423 | static int cpu_swap_callback(struct notifier_block *nfb, |
@@ -435,7 +430,7 @@ static int cpu_swap_callback(struct notifier_block *nfb, | |||
435 | if (action == CPU_DEAD) { | 430 | if (action == CPU_DEAD) { |
436 | atomic_add(*committed, &vm_committed_space); | 431 | atomic_add(*committed, &vm_committed_space); |
437 | *committed = 0; | 432 | *committed = 0; |
438 | lru_drain_cache((long)hcpu); | 433 | __lru_add_drain((long)hcpu); |
439 | } | 434 | } |
440 | return NOTIFY_OK; | 435 | return NOTIFY_OK; |
441 | } | 436 | } |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 0df9a57b1de8..fc2aecb70a95 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
15 | #include <linux/buffer_head.h> | 15 | #include <linux/buffer_head.h> |
16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
17 | #include <linux/pagevec.h> | ||
17 | 18 | ||
18 | #include <asm/pgtable.h> | 19 | #include <asm/pgtable.h> |
19 | 20 | ||
@@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page) | |||
272 | */ | 273 | */ |
273 | void free_pages_and_swap_cache(struct page **pages, int nr) | 274 | void free_pages_and_swap_cache(struct page **pages, int nr) |
274 | { | 275 | { |
275 | int chunk = 16; | ||
276 | struct page **pagep = pages; | 276 | struct page **pagep = pages; |
277 | 277 | ||
278 | lru_add_drain(); | 278 | lru_add_drain(); |
279 | while (nr) { | 279 | while (nr) { |
280 | int todo = min(chunk, nr); | 280 | int todo = min(nr, PAGEVEC_SIZE); |
281 | int i; | 281 | int i; |
282 | 282 | ||
283 | for (i = 0; i < todo; i++) | 283 | for (i = 0; i < todo; i++) |
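free_pages_and_swap_cache() above now walks the array in PAGEVEC_SIZE chunks rather than a hard-coded 16, keeping the batch in step with the pagevec that does the actual freeing. The chunking pattern on its own, as a self-contained model (the PAGEVEC_SIZE value and the per-item work are placeholders):

#include <stdio.h>

#define PAGEVEC_SIZE 14                 /* assumption: illustrative value only */

static void process(int *items, int nr)
{
        while (nr) {
                int todo = nr < PAGEVEC_SIZE ? nr : PAGEVEC_SIZE;
                int i;

                for (i = 0; i < todo; i++)
                        ;               /* e.g. drop the swap cache reference */
                items += todo;
                nr -= todo;
                printf("handled a chunk of %d\n", todo);
        }
}

int main(void)
{
        int dummy[40] = { 0 };

        process(dummy, 40);
        return 0;
}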
diff --git a/mm/swapfile.c b/mm/swapfile.c index edafeace301f..6da4b28b896b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -211,6 +211,26 @@ noswap: | |||
211 | return (swp_entry_t) {0}; | 211 | return (swp_entry_t) {0}; |
212 | } | 212 | } |
213 | 213 | ||
214 | swp_entry_t get_swap_page_of_type(int type) | ||
215 | { | ||
216 | struct swap_info_struct *si; | ||
217 | pgoff_t offset; | ||
218 | |||
219 | spin_lock(&swap_lock); | ||
220 | si = swap_info + type; | ||
221 | if (si->flags & SWP_WRITEOK) { | ||
222 | nr_swap_pages--; | ||
223 | offset = scan_swap_map(si); | ||
224 | if (offset) { | ||
225 | spin_unlock(&swap_lock); | ||
226 | return swp_entry(type, offset); | ||
227 | } | ||
228 | nr_swap_pages++; | ||
229 | } | ||
230 | spin_unlock(&swap_lock); | ||
231 | return (swp_entry_t) {0}; | ||
232 | } | ||
233 | |||
214 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) | 234 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) |
215 | { | 235 | { |
216 | struct swap_info_struct * p; | 236 | struct swap_info_struct * p; |
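get_swap_page_of_type() differs from get_swap_page() in that it allocates from exactly one swap area, which is what a caller that must place data on a specific device wants (the suspend-to-disk image writer is the likely user, though that is an assumption here, not something shown in this hunk). The result follows the usual convention of a zero swp_entry_t meaning failure; a kernel-context fragment:

        swp_entry_t entry = get_swap_page_of_type(type);

        if (!entry.val)
                return -ENOSPC;         /* that swap area is full or not writable */
        /* use swp_offset(entry) on device 'type'; release the slot with swap_free(entry) */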
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index b58abcf44ed6..cdc6d431972b 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
@@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
81 | goto close_file; | 81 | goto close_file; |
82 | 82 | ||
83 | d_instantiate(dentry, inode); | 83 | d_instantiate(dentry, inode); |
84 | inode->i_size = size; | ||
85 | inode->i_nlink = 0; /* It is unlinked */ | 84 | inode->i_nlink = 0; /* It is unlinked */ |
85 | |||
86 | file->f_vfsmnt = mntget(shm_mnt); | 86 | file->f_vfsmnt = mntget(shm_mnt); |
87 | file->f_dentry = dentry; | 87 | file->f_dentry = dentry; |
88 | file->f_mapping = inode->i_mapping; | 88 | file->f_mapping = inode->i_mapping; |
89 | file->f_op = &ramfs_file_operations; | 89 | file->f_op = &ramfs_file_operations; |
90 | file->f_mode = FMODE_WRITE | FMODE_READ; | 90 | file->f_mode = FMODE_WRITE | FMODE_READ; |
91 | |||
92 | /* notify everyone as to the change of file size */ | ||
93 | error = do_truncate(dentry, size, file); | ||
94 | if (error < 0) | ||
95 | goto close_file; | ||
96 | |||
91 | return file; | 97 | return file; |
92 | 98 | ||
93 | close_file: | 99 | close_file: |
@@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
123 | { | 129 | { |
124 | return 0; | 130 | return 0; |
125 | } | 131 | } |
132 | |||
133 | int shmem_mmap(struct file *file, struct vm_area_struct *vma) | ||
134 | { | ||
135 | file_accessed(file); | ||
136 | #ifndef CONFIG_MMU | ||
137 | return ramfs_nommu_mmap(file, vma); | ||
138 | #else | ||
139 | return 0; | ||
140 | #endif | ||
141 | } | ||
142 | |||
143 | #ifndef CONFIG_MMU | ||
144 | unsigned long shmem_get_unmapped_area(struct file *file, | ||
145 | unsigned long addr, | ||
146 | unsigned long len, | ||
147 | unsigned long pgoff, | ||
148 | unsigned long flags) | ||
149 | { | ||
150 | return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags); | ||
151 | } | ||
152 | #endif | ||
diff --git a/mm/truncate.c b/mm/truncate.c index 9173ab500604..7dee32745901 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
82 | } | 82 | } |
83 | 83 | ||
84 | /** | 84 | /** |
85 | * truncate_inode_pages - truncate *all* the pages from an offset | 85 | * truncate_inode_pages - truncate range of pages specified by start and |
86 | * end byte offsets | ||
86 | * @mapping: mapping to truncate | 87 | * @mapping: mapping to truncate |
87 | * @lstart: offset from which to truncate | 88 | * @lstart: offset from which to truncate |
89 | * @lend: offset to which to truncate | ||
88 | * | 90 | * |
89 | * Truncate the page cache at a set offset, removing the pages that are beyond | 91 | * Truncate the page cache, removing the pages that are between |
90 | * that offset (and zeroing out partial pages). | 92 | * specified offsets (and zeroing out partial page |
93 | * (if lstart is not page aligned)). | ||
91 | * | 94 | * |
92 | * Truncate takes two passes - the first pass is nonblocking. It will not | 95 | * Truncate takes two passes - the first pass is nonblocking. It will not |
93 | * block on page locks and it will not block on writeback. The second pass | 96 | * block on page locks and it will not block on writeback. The second pass |
@@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
101 | * We pass down the cache-hot hint to the page freeing code. Even if the | 104 | * We pass down the cache-hot hint to the page freeing code. Even if the |
102 | * mapping is large, it is probably the case that the final pages are the most | 105 | * mapping is large, it is probably the case that the final pages are the most |
103 | * recently touched, and freeing happens in ascending file offset order. | 106 | * recently touched, and freeing happens in ascending file offset order. |
104 | * | ||
105 | * Called under (and serialised by) inode->i_sem. | ||
106 | */ | 107 | */ |
107 | void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | 108 | void truncate_inode_pages_range(struct address_space *mapping, |
109 | loff_t lstart, loff_t lend) | ||
108 | { | 110 | { |
109 | const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; | 111 | const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; |
112 | pgoff_t end; | ||
110 | const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); | 113 | const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); |
111 | struct pagevec pvec; | 114 | struct pagevec pvec; |
112 | pgoff_t next; | 115 | pgoff_t next; |
@@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | |||
115 | if (mapping->nrpages == 0) | 118 | if (mapping->nrpages == 0) |
116 | return; | 119 | return; |
117 | 120 | ||
121 | BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); | ||
122 | end = (lend >> PAGE_CACHE_SHIFT); | ||
123 | |||
118 | pagevec_init(&pvec, 0); | 124 | pagevec_init(&pvec, 0); |
119 | next = start; | 125 | next = start; |
120 | while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 126 | while (next <= end && |
127 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | ||
121 | for (i = 0; i < pagevec_count(&pvec); i++) { | 128 | for (i = 0; i < pagevec_count(&pvec); i++) { |
122 | struct page *page = pvec.pages[i]; | 129 | struct page *page = pvec.pages[i]; |
123 | pgoff_t page_index = page->index; | 130 | pgoff_t page_index = page->index; |
124 | 131 | ||
132 | if (page_index > end) { | ||
133 | next = page_index; | ||
134 | break; | ||
135 | } | ||
136 | |||
125 | if (page_index > next) | 137 | if (page_index > next) |
126 | next = page_index; | 138 | next = page_index; |
127 | next++; | 139 | next++; |
@@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | |||
157 | next = start; | 169 | next = start; |
158 | continue; | 170 | continue; |
159 | } | 171 | } |
172 | if (pvec.pages[0]->index > end) { | ||
173 | pagevec_release(&pvec); | ||
174 | break; | ||
175 | } | ||
160 | for (i = 0; i < pagevec_count(&pvec); i++) { | 176 | for (i = 0; i < pagevec_count(&pvec); i++) { |
161 | struct page *page = pvec.pages[i]; | 177 | struct page *page = pvec.pages[i]; |
162 | 178 | ||
179 | if (page->index > end) | ||
180 | break; | ||
163 | lock_page(page); | 181 | lock_page(page); |
164 | wait_on_page_writeback(page); | 182 | wait_on_page_writeback(page); |
165 | if (page->index > next) | 183 | if (page->index > next) |
@@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | |||
171 | pagevec_release(&pvec); | 189 | pagevec_release(&pvec); |
172 | } | 190 | } |
173 | } | 191 | } |
192 | EXPORT_SYMBOL(truncate_inode_pages_range); | ||
174 | 193 | ||
194 | /** | ||
195 | * truncate_inode_pages - truncate *all* the pages from an offset | ||
196 | * @mapping: mapping to truncate | ||
197 | * @lstart: offset from which to truncate | ||
198 | * | ||
199 | * Called under (and serialised by) inode->i_sem. | ||
200 | */ | ||
201 | void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | ||
202 | { | ||
203 | truncate_inode_pages_range(mapping, lstart, (loff_t)-1); | ||
204 | } | ||
175 | EXPORT_SYMBOL(truncate_inode_pages); | 205 | EXPORT_SYMBOL(truncate_inode_pages); |
176 | 206 | ||
177 | /** | 207 | /** |
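truncate_inode_pages_range() translates the byte range into page indices exactly as the old open-ended code did: start is rounded up from lstart, the partial head page is handled separately, and the BUG_ON insists that lend names the last byte of a page so that end is simply lend >> PAGE_CACHE_SHIFT. The index arithmetic in isolation (self-contained; the 4K page size and the sample offsets are assumptions):

#include <assert.h>
#include <stdio.h>

#define PAGE_CACHE_SHIFT 12                             /* assumed 4K pages */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
        unsigned long long lstart = 6000;               /* arbitrary example offsets */
        unsigned long long lend = 5 * PAGE_CACHE_SIZE - 1;

        unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        unsigned long partial = lstart & (PAGE_CACHE_SIZE - 1);
        unsigned long end;

        /* lend must point at the last byte of a page, as the BUG_ON insists */
        assert((lend & (PAGE_CACHE_SIZE - 1)) == (PAGE_CACHE_SIZE - 1));
        end = lend >> PAGE_CACHE_SHIFT;

        printf("truncate whole pages %lu..%lu; page %llu keeps its first %lu bytes\n",
               start, end, lstart >> PAGE_CACHE_SHIFT, partial);
        return 0;
}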
diff --git a/mm/vmscan.c b/mm/vmscan.c index b0cd81c32de6..be8235fb1939 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -63,9 +63,6 @@ struct scan_control { | |||
63 | 63 | ||
64 | unsigned long nr_mapped; /* From page_state */ | 64 | unsigned long nr_mapped; /* From page_state */ |
65 | 65 | ||
66 | /* How many pages shrink_cache() should reclaim */ | ||
67 | int nr_to_reclaim; | ||
68 | |||
69 | /* Ask shrink_caches, or shrink_zone to scan at this priority */ | 66 | /* Ask shrink_caches, or shrink_zone to scan at this priority */ |
70 | unsigned int priority; | 67 | unsigned int priority; |
71 | 68 | ||
@@ -74,9 +71,6 @@ struct scan_control { | |||
74 | 71 | ||
75 | int may_writepage; | 72 | int may_writepage; |
76 | 73 | ||
77 | /* Can pages be swapped as part of reclaim? */ | ||
78 | int may_swap; | ||
79 | |||
80 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for | 74 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for |
81 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. | 75 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. |
82 | * In this context, it doesn't matter that we scan the | 76 | * In this context, it doesn't matter that we scan the |
@@ -367,7 +361,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
367 | res = mapping->a_ops->writepage(page, &wbc); | 361 | res = mapping->a_ops->writepage(page, &wbc); |
368 | if (res < 0) | 362 | if (res < 0) |
369 | handle_write_error(mapping, page, res); | 363 | handle_write_error(mapping, page, res); |
370 | if (res == WRITEPAGE_ACTIVATE) { | 364 | if (res == AOP_WRITEPAGE_ACTIVATE) { |
371 | ClearPageReclaim(page); | 365 | ClearPageReclaim(page); |
372 | return PAGE_ACTIVATE; | 366 | return PAGE_ACTIVATE; |
373 | } | 367 | } |
@@ -430,8 +424,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
430 | * Try to allocate it some swap space here. | 424 | * Try to allocate it some swap space here. |
431 | */ | 425 | */ |
432 | if (PageAnon(page) && !PageSwapCache(page)) { | 426 | if (PageAnon(page) && !PageSwapCache(page)) { |
433 | if (!sc->may_swap) | ||
434 | goto keep_locked; | ||
435 | if (!add_to_swap(page)) | 427 | if (!add_to_swap(page)) |
436 | goto activate_locked; | 428 | goto activate_locked; |
437 | } | 429 | } |
@@ -653,17 +645,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) | |||
653 | goto done; | 645 | goto done; |
654 | 646 | ||
655 | max_scan -= nr_scan; | 647 | max_scan -= nr_scan; |
656 | if (current_is_kswapd()) | ||
657 | mod_page_state_zone(zone, pgscan_kswapd, nr_scan); | ||
658 | else | ||
659 | mod_page_state_zone(zone, pgscan_direct, nr_scan); | ||
660 | nr_freed = shrink_list(&page_list, sc); | 648 | nr_freed = shrink_list(&page_list, sc); |
661 | if (current_is_kswapd()) | ||
662 | mod_page_state(kswapd_steal, nr_freed); | ||
663 | mod_page_state_zone(zone, pgsteal, nr_freed); | ||
664 | sc->nr_to_reclaim -= nr_freed; | ||
665 | 649 | ||
666 | spin_lock_irq(&zone->lru_lock); | 650 | local_irq_disable(); |
651 | if (current_is_kswapd()) { | ||
652 | __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); | ||
653 | __mod_page_state(kswapd_steal, nr_freed); | ||
654 | } else | ||
655 | __mod_page_state_zone(zone, pgscan_direct, nr_scan); | ||
656 | __mod_page_state_zone(zone, pgsteal, nr_freed); | ||
657 | |||
658 | spin_lock(&zone->lru_lock); | ||
667 | /* | 659 | /* |
668 | * Put back any unfreeable pages. | 660 | * Put back any unfreeable pages. |
669 | */ | 661 | */ |
@@ -825,11 +817,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
825 | } | 817 | } |
826 | } | 818 | } |
827 | zone->nr_active += pgmoved; | 819 | zone->nr_active += pgmoved; |
828 | spin_unlock_irq(&zone->lru_lock); | 820 | spin_unlock(&zone->lru_lock); |
829 | pagevec_release(&pvec); | 821 | |
822 | __mod_page_state_zone(zone, pgrefill, pgscanned); | ||
823 | __mod_page_state(pgdeactivate, pgdeactivate); | ||
824 | local_irq_enable(); | ||
830 | 825 | ||
831 | mod_page_state_zone(zone, pgrefill, pgscanned); | 826 | pagevec_release(&pvec); |
832 | mod_page_state(pgdeactivate, pgdeactivate); | ||
833 | } | 827 | } |
834 | 828 | ||
835 | /* | 829 | /* |
@@ -861,8 +855,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) | |||
861 | else | 855 | else |
862 | nr_inactive = 0; | 856 | nr_inactive = 0; |
863 | 857 | ||
864 | sc->nr_to_reclaim = sc->swap_cluster_max; | ||
865 | |||
866 | while (nr_active || nr_inactive) { | 858 | while (nr_active || nr_inactive) { |
867 | if (nr_active) { | 859 | if (nr_active) { |
868 | sc->nr_to_scan = min(nr_active, | 860 | sc->nr_to_scan = min(nr_active, |
@@ -876,8 +868,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) | |||
876 | (unsigned long)sc->swap_cluster_max); | 868 | (unsigned long)sc->swap_cluster_max); |
877 | nr_inactive -= sc->nr_to_scan; | 869 | nr_inactive -= sc->nr_to_scan; |
878 | shrink_cache(zone, sc); | 870 | shrink_cache(zone, sc); |
879 | if (sc->nr_to_reclaim <= 0) | ||
880 | break; | ||
881 | } | 871 | } |
882 | } | 872 | } |
883 | 873 | ||
@@ -910,7 +900,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc) | |||
910 | for (i = 0; zones[i] != NULL; i++) { | 900 | for (i = 0; zones[i] != NULL; i++) { |
911 | struct zone *zone = zones[i]; | 901 | struct zone *zone = zones[i]; |
912 | 902 | ||
913 | if (zone->present_pages == 0) | 903 | if (!populated_zone(zone)) |
914 | continue; | 904 | continue; |
915 | 905 | ||
916 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 906 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) |
@@ -952,7 +942,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
952 | 942 | ||
953 | sc.gfp_mask = gfp_mask; | 943 | sc.gfp_mask = gfp_mask; |
954 | sc.may_writepage = 0; | 944 | sc.may_writepage = 0; |
955 | sc.may_swap = 1; | ||
956 | 945 | ||
957 | inc_page_state(allocstall); | 946 | inc_page_state(allocstall); |
958 | 947 | ||
@@ -1055,7 +1044,6 @@ loop_again: | |||
1055 | total_reclaimed = 0; | 1044 | total_reclaimed = 0; |
1056 | sc.gfp_mask = GFP_KERNEL; | 1045 | sc.gfp_mask = GFP_KERNEL; |
1057 | sc.may_writepage = 0; | 1046 | sc.may_writepage = 0; |
1058 | sc.may_swap = 1; | ||
1059 | sc.nr_mapped = read_page_state(nr_mapped); | 1047 | sc.nr_mapped = read_page_state(nr_mapped); |
1060 | 1048 | ||
1061 | inc_page_state(pageoutrun); | 1049 | inc_page_state(pageoutrun); |
@@ -1084,7 +1072,7 @@ loop_again: | |||
1084 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { | 1072 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { |
1085 | struct zone *zone = pgdat->node_zones + i; | 1073 | struct zone *zone = pgdat->node_zones + i; |
1086 | 1074 | ||
1087 | if (zone->present_pages == 0) | 1075 | if (!populated_zone(zone)) |
1088 | continue; | 1076 | continue; |
1089 | 1077 | ||
1090 | if (zone->all_unreclaimable && | 1078 | if (zone->all_unreclaimable && |
@@ -1121,7 +1109,7 @@ scan: | |||
1121 | struct zone *zone = pgdat->node_zones + i; | 1109 | struct zone *zone = pgdat->node_zones + i; |
1122 | int nr_slab; | 1110 | int nr_slab; |
1123 | 1111 | ||
1124 | if (zone->present_pages == 0) | 1112 | if (!populated_zone(zone)) |
1125 | continue; | 1113 | continue; |
1126 | 1114 | ||
1127 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1115 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
@@ -1273,7 +1261,7 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
1273 | { | 1261 | { |
1274 | pg_data_t *pgdat; | 1262 | pg_data_t *pgdat; |
1275 | 1263 | ||
1276 | if (zone->present_pages == 0) | 1264 | if (!populated_zone(zone)) |
1277 | return; | 1265 | return; |
1278 | 1266 | ||
1279 | pgdat = zone->zone_pgdat; | 1267 | pgdat = zone->zone_pgdat; |
@@ -1353,76 +1341,3 @@ static int __init kswapd_init(void) | |||
1353 | } | 1341 | } |
1354 | 1342 | ||
1355 | module_init(kswapd_init) | 1343 | module_init(kswapd_init) |
1356 | |||
1357 | |||
1358 | /* | ||
1359 | * Try to free up some pages from this zone through reclaim. | ||
1360 | */ | ||
1361 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | ||
1362 | { | ||
1363 | struct scan_control sc; | ||
1364 | int nr_pages = 1 << order; | ||
1365 | int total_reclaimed = 0; | ||
1366 | |||
1367 | /* The reclaim may sleep, so don't do it if sleep isn't allowed */ | ||
1368 | if (!(gfp_mask & __GFP_WAIT)) | ||
1369 | return 0; | ||
1370 | if (zone->all_unreclaimable) | ||
1371 | return 0; | ||
1372 | |||
1373 | sc.gfp_mask = gfp_mask; | ||
1374 | sc.may_writepage = 0; | ||
1375 | sc.may_swap = 0; | ||
1376 | sc.nr_mapped = read_page_state(nr_mapped); | ||
1377 | sc.nr_scanned = 0; | ||
1378 | sc.nr_reclaimed = 0; | ||
1379 | /* scan at the highest priority */ | ||
1380 | sc.priority = 0; | ||
1381 | disable_swap_token(); | ||
1382 | |||
1383 | if (nr_pages > SWAP_CLUSTER_MAX) | ||
1384 | sc.swap_cluster_max = nr_pages; | ||
1385 | else | ||
1386 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; | ||
1387 | |||
1388 | /* Don't reclaim the zone if there are other reclaimers active */ | ||
1389 | if (atomic_read(&zone->reclaim_in_progress) > 0) | ||
1390 | goto out; | ||
1391 | |||
1392 | shrink_zone(zone, &sc); | ||
1393 | total_reclaimed = sc.nr_reclaimed; | ||
1394 | |||
1395 | out: | ||
1396 | return total_reclaimed; | ||
1397 | } | ||
1398 | |||
1399 | asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone, | ||
1400 | unsigned int state) | ||
1401 | { | ||
1402 | struct zone *z; | ||
1403 | int i; | ||
1404 | |||
1405 | if (!capable(CAP_SYS_ADMIN)) | ||
1406 | return -EACCES; | ||
1407 | |||
1408 | if (node >= MAX_NUMNODES || !node_online(node)) | ||
1409 | return -EINVAL; | ||
1410 | |||
1411 | /* This will break if we ever add more zones */ | ||
1412 | if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM))) | ||
1413 | return -EINVAL; | ||
1414 | |||
1415 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1416 | if (!(zone & 1<<i)) | ||
1417 | continue; | ||
1418 | |||
1419 | z = &NODE_DATA(node)->node_zones[i]; | ||
1420 | |||
1421 | if (state) | ||
1422 | z->reclaim_pages = 1; | ||
1423 | else | ||
1424 | z->reclaim_pages = 0; | ||
1425 | } | ||
1426 | |||
1427 | return 0; | ||
1428 | } | ||