Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |   2
-rw-r--r--  mm/bootmem.c        |  58
-rw-r--r--  mm/filemap.c        |  78
-rw-r--r--  mm/hugetlb.c        | 192
-rw-r--r--  mm/internal.h       |  21
-rw-r--r--  mm/madvise.c        |  35
-rw-r--r--  mm/memory.c         |  32
-rw-r--r--  mm/memory_hotplug.c |   1
-rw-r--r--  mm/mempolicy.c      | 102
-rw-r--r--  mm/nommu.c          |   7
-rw-r--r--  mm/page_alloc.c     | 343
-rw-r--r--  mm/readahead.c      |  15
-rw-r--r--  mm/rmap.c           |  57
-rw-r--r--  mm/shmem.c          |  36
-rw-r--r--  mm/swap.c           |  27
-rw-r--r--  mm/swap_state.c     |   4
-rw-r--r--  mm/swapfile.c       |  20
-rw-r--r--  mm/tiny-shmem.c     |  29
-rw-r--r--  mm/truncate.c       |  44
-rw-r--r--  mm/vmscan.c         | 125
20 files changed, 732 insertions(+), 496 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 21eb51d4da8f..b3db11f137e0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,7 +11,7 @@ choice
11 11
12config FLATMEM_MANUAL 12config FLATMEM_MANUAL
13 bool "Flat Memory" 13 bool "Flat Memory"
14 depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE 14 depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
15 help 15 help
16 This option allows you to change some of the ways that 16 This option allows you to change some of the ways that
17 Linux manages its memory internally. Most users will 17 Linux manages its memory internally. Most users will
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 16b9465eb4eb..35c32290f717 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -296,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
296 unsigned long v = ~map[i / BITS_PER_LONG]; 296 unsigned long v = ~map[i / BITS_PER_LONG];
297 297
298 if (gofast && v == ~0UL) { 298 if (gofast && v == ~0UL) {
299 int j, order; 299 int order;
300 300
301 page = pfn_to_page(pfn); 301 page = pfn_to_page(pfn);
302 count += BITS_PER_LONG; 302 count += BITS_PER_LONG;
303 __ClearPageReserved(page);
304 order = ffs(BITS_PER_LONG) - 1; 303 order = ffs(BITS_PER_LONG) - 1;
305 set_page_refs(page, order); 304 __free_pages_bootmem(page, order);
306 for (j = 1; j < BITS_PER_LONG; j++) {
307 if (j + 16 < BITS_PER_LONG)
308 prefetchw(page + j + 16);
309 __ClearPageReserved(page + j);
310 set_page_count(page + j, 0);
311 }
312 __free_pages(page, order);
313 i += BITS_PER_LONG; 305 i += BITS_PER_LONG;
314 page += BITS_PER_LONG; 306 page += BITS_PER_LONG;
315 } else if (v) { 307 } else if (v) {
@@ -319,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
319 for (m = 1; m && i < idx; m<<=1, page++, i++) { 311 for (m = 1; m && i < idx; m<<=1, page++, i++) {
320 if (v & m) { 312 if (v & m) {
321 count++; 313 count++;
322 __ClearPageReserved(page); 314 __free_pages_bootmem(page, 0);
323 set_page_refs(page, 0);
324 __free_page(page);
325 } 315 }
326 } 316 }
327 } else { 317 } else {
@@ -339,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
339 count = 0; 329 count = 0;
340 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { 330 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
341 count++; 331 count++;
342 __ClearPageReserved(page); 332 __free_pages_bootmem(page, 0);
343 set_page_count(page, 1);
344 __free_page(page);
345 } 333 }
346 total += count; 334 total += count;
347 bdata->node_bootmem_map = NULL; 335 bdata->node_bootmem_map = NULL;
@@ -393,15 +381,14 @@ unsigned long __init free_all_bootmem (void)
393 return(free_all_bootmem_core(NODE_DATA(0))); 381 return(free_all_bootmem_core(NODE_DATA(0)));
394} 382}
395 383
396void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, 384void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
397 unsigned long limit)
398{ 385{
399 pg_data_t *pgdat = pgdat_list; 386 pg_data_t *pgdat = pgdat_list;
400 void *ptr; 387 void *ptr;
401 388
402 for_each_pgdat(pgdat) 389 for_each_pgdat(pgdat)
403 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, 390 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
404 align, goal, limit))) 391 align, goal, 0)))
405 return(ptr); 392 return(ptr);
406 393
407 /* 394 /*
@@ -413,15 +400,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un
413} 400}
414 401
415 402
416void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, 403void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
417 unsigned long goal, unsigned long limit) 404 unsigned long goal)
418{ 405{
419 void *ptr; 406 void *ptr;
420 407
421 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit); 408 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
422 if (ptr) 409 if (ptr)
423 return (ptr); 410 return (ptr);
424 411
425 return __alloc_bootmem_limit(size, align, goal, limit); 412 return __alloc_bootmem(size, align, goal);
413}
414
415#define LOW32LIMIT 0xffffffff
416
417void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
418{
419 pg_data_t *pgdat = pgdat_list;
420 void *ptr;
421
422 for_each_pgdat(pgdat)
423 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
424 align, goal, LOW32LIMIT)))
425 return(ptr);
426
427 /*
428 * Whoops, we cannot satisfy the allocation request.
429 */
430 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
431 panic("Out of low memory");
432 return NULL;
426} 433}
427 434
435void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
436 unsigned long align, unsigned long goal)
437{
438 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
439}
diff --git a/mm/filemap.c b/mm/filemap.c
index 33a28bfde158..4ef24a397684 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -555,11 +555,12 @@ repeat:
555 page_cache_get(page); 555 page_cache_get(page);
556 if (TestSetPageLocked(page)) { 556 if (TestSetPageLocked(page)) {
557 read_unlock_irq(&mapping->tree_lock); 557 read_unlock_irq(&mapping->tree_lock);
558 lock_page(page); 558 __lock_page(page);
559 read_lock_irq(&mapping->tree_lock); 559 read_lock_irq(&mapping->tree_lock);
560 560
561 /* Has the page been truncated while we slept? */ 561 /* Has the page been truncated while we slept? */
562 if (page->mapping != mapping || page->index != offset) { 562 if (unlikely(page->mapping != mapping ||
563 page->index != offset)) {
563 unlock_page(page); 564 unlock_page(page);
564 page_cache_release(page); 565 page_cache_release(page);
565 goto repeat; 566 goto repeat;
@@ -831,8 +832,13 @@ readpage:
831 /* Start the actual read. The read will unlock the page. */ 832 /* Start the actual read. The read will unlock the page. */
832 error = mapping->a_ops->readpage(filp, page); 833 error = mapping->a_ops->readpage(filp, page);
833 834
834 if (unlikely(error)) 835 if (unlikely(error)) {
836 if (error == AOP_TRUNCATED_PAGE) {
837 page_cache_release(page);
838 goto find_page;
839 }
835 goto readpage_error; 840 goto readpage_error;
841 }
836 842
837 if (!PageUptodate(page)) { 843 if (!PageUptodate(page)) {
838 lock_page(page); 844 lock_page(page);
@@ -1152,26 +1158,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
1152{ 1158{
1153 struct address_space *mapping = file->f_mapping; 1159 struct address_space *mapping = file->f_mapping;
1154 struct page *page; 1160 struct page *page;
1155 int error; 1161 int ret;
1156 1162
1157 page = page_cache_alloc_cold(mapping); 1163 do {
1158 if (!page) 1164 page = page_cache_alloc_cold(mapping);
1159 return -ENOMEM; 1165 if (!page)
1166 return -ENOMEM;
1167
1168 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1169 if (ret == 0)
1170 ret = mapping->a_ops->readpage(file, page);
1171 else if (ret == -EEXIST)
1172 ret = 0; /* losing race to add is OK */
1160 1173
1161 error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1162 if (!error) {
1163 error = mapping->a_ops->readpage(file, page);
1164 page_cache_release(page); 1174 page_cache_release(page);
1165 return error;
1166 }
1167 1175
1168 /* 1176 } while (ret == AOP_TRUNCATED_PAGE);
1169 * We arrive here in the unlikely event that someone 1177
1170 * raced with us and added our page to the cache first 1178 return ret;
1171 * or we are out of memory for radix-tree nodes.
1172 */
1173 page_cache_release(page);
1174 return error == -EEXIST ? 0 : error;
1175} 1179}
1176 1180
1177#define MMAP_LOTSAMISS (100) 1181#define MMAP_LOTSAMISS (100)
@@ -1331,10 +1335,14 @@ page_not_uptodate:
1331 goto success; 1335 goto success;
1332 } 1336 }
1333 1337
1334 if (!mapping->a_ops->readpage(file, page)) { 1338 error = mapping->a_ops->readpage(file, page);
1339 if (!error) {
1335 wait_on_page_locked(page); 1340 wait_on_page_locked(page);
1336 if (PageUptodate(page)) 1341 if (PageUptodate(page))
1337 goto success; 1342 goto success;
1343 } else if (error == AOP_TRUNCATED_PAGE) {
1344 page_cache_release(page);
1345 goto retry_find;
1338 } 1346 }
1339 1347
1340 /* 1348 /*
@@ -1358,10 +1366,14 @@ page_not_uptodate:
1358 goto success; 1366 goto success;
1359 } 1367 }
1360 ClearPageError(page); 1368 ClearPageError(page);
1361 if (!mapping->a_ops->readpage(file, page)) { 1369 error = mapping->a_ops->readpage(file, page);
1370 if (!error) {
1362 wait_on_page_locked(page); 1371 wait_on_page_locked(page);
1363 if (PageUptodate(page)) 1372 if (PageUptodate(page))
1364 goto success; 1373 goto success;
1374 } else if (error == AOP_TRUNCATED_PAGE) {
1375 page_cache_release(page);
1376 goto retry_find;
1365 } 1377 }
1366 1378
1367 /* 1379 /*
@@ -1444,10 +1456,14 @@ page_not_uptodate:
1444 goto success; 1456 goto success;
1445 } 1457 }
1446 1458
1447 if (!mapping->a_ops->readpage(file, page)) { 1459 error = mapping->a_ops->readpage(file, page);
1460 if (!error) {
1448 wait_on_page_locked(page); 1461 wait_on_page_locked(page);
1449 if (PageUptodate(page)) 1462 if (PageUptodate(page))
1450 goto success; 1463 goto success;
1464 } else if (error == AOP_TRUNCATED_PAGE) {
1465 page_cache_release(page);
1466 goto retry_find;
1451 } 1467 }
1452 1468
1453 /* 1469 /*
@@ -1470,10 +1486,14 @@ page_not_uptodate:
1470 } 1486 }
1471 1487
1472 ClearPageError(page); 1488 ClearPageError(page);
1473 if (!mapping->a_ops->readpage(file, page)) { 1489 error = mapping->a_ops->readpage(file, page);
1490 if (!error) {
1474 wait_on_page_locked(page); 1491 wait_on_page_locked(page);
1475 if (PageUptodate(page)) 1492 if (PageUptodate(page))
1476 goto success; 1493 goto success;
1494 } else if (error == AOP_TRUNCATED_PAGE) {
1495 page_cache_release(page);
1496 goto retry_find;
1477 } 1497 }
1478 1498
1479 /* 1499 /*
@@ -1934,12 +1954,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1934 status = a_ops->prepare_write(file, page, offset, offset+bytes); 1954 status = a_ops->prepare_write(file, page, offset, offset+bytes);
1935 if (unlikely(status)) { 1955 if (unlikely(status)) {
1936 loff_t isize = i_size_read(inode); 1956 loff_t isize = i_size_read(inode);
1957
1958 if (status != AOP_TRUNCATED_PAGE)
1959 unlock_page(page);
1960 page_cache_release(page);
1961 if (status == AOP_TRUNCATED_PAGE)
1962 continue;
1937 /* 1963 /*
1938 * prepare_write() may have instantiated a few blocks 1964 * prepare_write() may have instantiated a few blocks
1939 * outside i_size. Trim these off again. 1965 * outside i_size. Trim these off again.
1940 */ 1966 */
1941 unlock_page(page);
1942 page_cache_release(page);
1943 if (pos + bytes > isize) 1967 if (pos + bytes > isize)
1944 vmtruncate(inode, isize); 1968 vmtruncate(inode, isize);
1945 break; 1969 break;
@@ -1952,6 +1976,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1952 cur_iov, iov_base, bytes); 1976 cur_iov, iov_base, bytes);
1953 flush_dcache_page(page); 1977 flush_dcache_page(page);
1954 status = a_ops->commit_write(file, page, offset, offset+bytes); 1978 status = a_ops->commit_write(file, page, offset, offset+bytes);
1979 if (status == AOP_TRUNCATED_PAGE) {
1980 page_cache_release(page);
1981 continue;
1982 }
1955 if (likely(copied > 0)) { 1983 if (likely(copied > 0)) {
1956 if (!status) 1984 if (!status)
1957 status = copied; 1985 status = copied;
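
The filemap.c hunks above teach the readpage/prepare_write/commit_write callers to handle the new AOP_TRUNCATED_PAGE return code by releasing the page and retrying with a fresh lookup. Below is a minimal standalone sketch of that retry contract; every name in it is a stand-in used only for illustration, not kernel API.

/*
 * Standalone sketch (not kernel code) of the retry contract introduced
 * above: when an address_space operation reports AOP_TRUNCATED_PAGE the
 * caller drops its page and retries.  All names below are stand-ins.
 */
#include <stdio.h>

#define AOP_TRUNCATED_PAGE 1	/* stand-in value for the sketch */

struct fake_page {
	int truncated;		/* pretend the page got truncated once */
};

/* Stub "readpage": fails once with AOP_TRUNCATED_PAGE, then succeeds. */
static int stub_readpage(struct fake_page *page)
{
	if (page->truncated) {
		page->truncated = 0;
		return AOP_TRUNCATED_PAGE;
	}
	return 0;
}

static int read_with_retry(struct fake_page *page)
{
	int ret;

	do {
		/*
		 * In the kernel this is where the caller would release its
		 * page reference and look the page up again before the next
		 * attempt (compare page_cache_read() above).
		 */
		ret = stub_readpage(page);
	} while (ret == AOP_TRUNCATED_PAGE);

	return ret;
}

int main(void)
{
	struct fake_page page = { 1 };

	printf("readpage result after retry: %d\n", read_with_retry(&page));
	return 0;
}
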
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3e52df7c471b..f4c43d7980ba 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,8 @@
11#include <linux/highmem.h> 11#include <linux/highmem.h>
12#include <linux/nodemask.h> 12#include <linux/nodemask.h>
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/mempolicy.h>
15
14#include <asm/page.h> 16#include <asm/page.h>
15#include <asm/pgtable.h> 17#include <asm/pgtable.h>
16 18
@@ -36,18 +38,21 @@ static void enqueue_huge_page(struct page *page)
36 free_huge_pages_node[nid]++; 38 free_huge_pages_node[nid]++;
37} 39}
38 40
39static struct page *dequeue_huge_page(void) 41static struct page *dequeue_huge_page(struct vm_area_struct *vma,
42 unsigned long address)
40{ 43{
41 int nid = numa_node_id(); 44 int nid = numa_node_id();
42 struct page *page = NULL; 45 struct page *page = NULL;
46 struct zonelist *zonelist = huge_zonelist(vma, address);
47 struct zone **z;
43 48
44 if (list_empty(&hugepage_freelists[nid])) { 49 for (z = zonelist->zones; *z; z++) {
45 for (nid = 0; nid < MAX_NUMNODES; ++nid) 50 nid = (*z)->zone_pgdat->node_id;
46 if (!list_empty(&hugepage_freelists[nid])) 51 if (!list_empty(&hugepage_freelists[nid]))
47 break; 52 break;
48 } 53 }
49 if (nid >= 0 && nid < MAX_NUMNODES && 54
50 !list_empty(&hugepage_freelists[nid])) { 55 if (*z) {
51 page = list_entry(hugepage_freelists[nid].next, 56 page = list_entry(hugepage_freelists[nid].next,
52 struct page, lru); 57 struct page, lru);
53 list_del(&page->lru); 58 list_del(&page->lru);
@@ -85,13 +90,13 @@ void free_huge_page(struct page *page)
85 spin_unlock(&hugetlb_lock); 90 spin_unlock(&hugetlb_lock);
86} 91}
87 92
88struct page *alloc_huge_page(void) 93struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
89{ 94{
90 struct page *page; 95 struct page *page;
91 int i; 96 int i;
92 97
93 spin_lock(&hugetlb_lock); 98 spin_lock(&hugetlb_lock);
94 page = dequeue_huge_page(); 99 page = dequeue_huge_page(vma, addr);
95 if (!page) { 100 if (!page) {
96 spin_unlock(&hugetlb_lock); 101 spin_unlock(&hugetlb_lock);
97 return NULL; 102 return NULL;
@@ -194,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
194 spin_lock(&hugetlb_lock); 199 spin_lock(&hugetlb_lock);
195 try_to_free_low(count); 200 try_to_free_low(count);
196 while (count < nr_huge_pages) { 201 while (count < nr_huge_pages) {
197 struct page *page = dequeue_huge_page(); 202 struct page *page = dequeue_huge_page(NULL, 0);
198 if (!page) 203 if (!page)
199 break; 204 break;
200 update_and_free_page(page); 205 update_and_free_page(page);
@@ -261,11 +266,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
261 .nopage = hugetlb_nopage, 266 .nopage = hugetlb_nopage,
262}; 267};
263 268
264static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) 269static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
270 int writable)
265{ 271{
266 pte_t entry; 272 pte_t entry;
267 273
268 if (vma->vm_flags & VM_WRITE) { 274 if (writable) {
269 entry = 275 entry =
270 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 276 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
271 } else { 277 } else {
@@ -277,12 +283,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
277 return entry; 283 return entry;
278} 284}
279 285
286static void set_huge_ptep_writable(struct vm_area_struct *vma,
287 unsigned long address, pte_t *ptep)
288{
289 pte_t entry;
290
291 entry = pte_mkwrite(pte_mkdirty(*ptep));
292 ptep_set_access_flags(vma, address, ptep, entry, 1);
293 update_mmu_cache(vma, address, entry);
294 lazy_mmu_prot_update(entry);
295}
296
297
280int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 298int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
281 struct vm_area_struct *vma) 299 struct vm_area_struct *vma)
282{ 300{
283 pte_t *src_pte, *dst_pte, entry; 301 pte_t *src_pte, *dst_pte, entry;
284 struct page *ptepage; 302 struct page *ptepage;
285 unsigned long addr; 303 unsigned long addr;
304 int cow;
305
306 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
286 307
287 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 308 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
288 src_pte = huge_pte_offset(src, addr); 309 src_pte = huge_pte_offset(src, addr);
@@ -294,6 +315,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
294 spin_lock(&dst->page_table_lock); 315 spin_lock(&dst->page_table_lock);
295 spin_lock(&src->page_table_lock); 316 spin_lock(&src->page_table_lock);
296 if (!pte_none(*src_pte)) { 317 if (!pte_none(*src_pte)) {
318 if (cow)
319 ptep_set_wrprotect(src, addr, src_pte);
297 entry = *src_pte; 320 entry = *src_pte;
298 ptepage = pte_page(entry); 321 ptepage = pte_page(entry);
299 get_page(ptepage); 322 get_page(ptepage);
@@ -345,57 +368,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
345 flush_tlb_range(vma, start, end); 368 flush_tlb_range(vma, start, end);
346} 369}
347 370
348static struct page *find_lock_huge_page(struct address_space *mapping, 371static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
349 unsigned long idx) 372 unsigned long address, pte_t *ptep, pte_t pte)
350{ 373{
351 struct page *page; 374 struct page *old_page, *new_page;
352 int err; 375 int i, avoidcopy;
353 struct inode *inode = mapping->host;
354 unsigned long size;
355 376
356retry: 377 old_page = pte_page(pte);
357 page = find_lock_page(mapping, idx);
358 if (page)
359 goto out;
360 378
361 /* Check to make sure the mapping hasn't been truncated */ 379 /* If no-one else is actually using this page, avoid the copy
362 size = i_size_read(inode) >> HPAGE_SHIFT; 380 * and just make the page writable */
363 if (idx >= size) 381 avoidcopy = (page_count(old_page) == 1);
364 goto out; 382 if (avoidcopy) {
383 set_huge_ptep_writable(vma, address, ptep);
384 return VM_FAULT_MINOR;
385 }
365 386
366 if (hugetlb_get_quota(mapping)) 387 page_cache_get(old_page);
367 goto out; 388 new_page = alloc_huge_page(vma, address);
368 page = alloc_huge_page(); 389
369 if (!page) { 390 if (!new_page) {
370 hugetlb_put_quota(mapping); 391 page_cache_release(old_page);
371 goto out; 392
393 /* Logically this is OOM, not a SIGBUS, but an OOM
394 * could cause the kernel to go killing other
395 * processes which won't help the hugepage situation
396 * at all (?) */
397 return VM_FAULT_SIGBUS;
372 } 398 }
373 399
374 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 400 spin_unlock(&mm->page_table_lock);
375 if (err) { 401 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
376 put_page(page); 402 copy_user_highpage(new_page + i, old_page + i,
377 hugetlb_put_quota(mapping); 403 address + i*PAGE_SIZE);
378 if (err == -EEXIST) 404 spin_lock(&mm->page_table_lock);
379 goto retry; 405
380 page = NULL; 406 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
407 if (likely(pte_same(*ptep, pte))) {
408 /* Break COW */
409 set_huge_pte_at(mm, address, ptep,
410 make_huge_pte(vma, new_page, 1));
411 /* Make the old page be freed below */
412 new_page = old_page;
381 } 413 }
382out: 414 page_cache_release(new_page);
383 return page; 415 page_cache_release(old_page);
416 return VM_FAULT_MINOR;
384} 417}
385 418
386int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 419int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
387 unsigned long address, int write_access) 420 unsigned long address, pte_t *ptep, int write_access)
388{ 421{
389 int ret = VM_FAULT_SIGBUS; 422 int ret = VM_FAULT_SIGBUS;
390 unsigned long idx; 423 unsigned long idx;
391 unsigned long size; 424 unsigned long size;
392 pte_t *pte;
393 struct page *page; 425 struct page *page;
394 struct address_space *mapping; 426 struct address_space *mapping;
395 427 pte_t new_pte;
396 pte = huge_pte_alloc(mm, address);
397 if (!pte)
398 goto out;
399 428
400 mapping = vma->vm_file->f_mapping; 429 mapping = vma->vm_file->f_mapping;
401 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 430 idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -405,9 +434,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
405 * Use page lock to guard against racing truncation 434 * Use page lock to guard against racing truncation
406 * before we get page_table_lock. 435 * before we get page_table_lock.
407 */ 436 */
408 page = find_lock_huge_page(mapping, idx); 437retry:
409 if (!page) 438 page = find_lock_page(mapping, idx);
410 goto out; 439 if (!page) {
440 if (hugetlb_get_quota(mapping))
441 goto out;
442 page = alloc_huge_page(vma, address);
443 if (!page) {
444 hugetlb_put_quota(mapping);
445 goto out;
446 }
447
448 if (vma->vm_flags & VM_SHARED) {
449 int err;
450
451 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
452 if (err) {
453 put_page(page);
454 hugetlb_put_quota(mapping);
455 if (err == -EEXIST)
456 goto retry;
457 goto out;
458 }
459 } else
460 lock_page(page);
461 }
411 462
412 spin_lock(&mm->page_table_lock); 463 spin_lock(&mm->page_table_lock);
413 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 464 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
@@ -415,11 +466,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
415 goto backout; 466 goto backout;
416 467
417 ret = VM_FAULT_MINOR; 468 ret = VM_FAULT_MINOR;
418 if (!pte_none(*pte)) 469 if (!pte_none(*ptep))
419 goto backout; 470 goto backout;
420 471
421 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); 472 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
422 set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); 473 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
474 && (vma->vm_flags & VM_SHARED)));
475 set_huge_pte_at(mm, address, ptep, new_pte);
476
477 if (write_access && !(vma->vm_flags & VM_SHARED)) {
478 /* Optimization, do the COW without a second fault */
479 ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
480 }
481
423 spin_unlock(&mm->page_table_lock); 482 spin_unlock(&mm->page_table_lock);
424 unlock_page(page); 483 unlock_page(page);
425out: 484out:
@@ -433,6 +492,33 @@ backout:
433 goto out; 492 goto out;
434} 493}
435 494
495int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
496 unsigned long address, int write_access)
497{
498 pte_t *ptep;
499 pte_t entry;
500 int ret;
501
502 ptep = huge_pte_alloc(mm, address);
503 if (!ptep)
504 return VM_FAULT_OOM;
505
506 entry = *ptep;
507 if (pte_none(entry))
508 return hugetlb_no_page(mm, vma, address, ptep, write_access);
509
510 ret = VM_FAULT_MINOR;
511
512 spin_lock(&mm->page_table_lock);
513 /* Check for a racing update before calling hugetlb_cow */
514 if (likely(pte_same(entry, *ptep)))
515 if (write_access && !pte_write(entry))
516 ret = hugetlb_cow(mm, vma, address, ptep, entry);
517 spin_unlock(&mm->page_table_lock);
518
519 return ret;
520}
521
436int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 522int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
437 struct page **pages, struct vm_area_struct **vmas, 523 struct page **pages, struct vm_area_struct **vmas,
438 unsigned long *position, int *length, int i) 524 unsigned long *position, int *length, int i)
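
The hugetlb.c hunks above add copy-on-write handling for private huge page mappings (hugetlb_cow) and policy-aware huge page allocation. The userspace sketch below shows how the private COW path would be exercised on a kernel carrying this patch; the /mnt/huge mount point, the 2MB huge page size, and the use of ftruncate() to size the file are assumptions for illustration, not requirements stated by the patch.

/*
 * Userspace sketch (not from the patch): a MAP_PRIVATE mapping of a
 * hugetlbfs file plus a write fault is what reaches hugetlb_cow().
 * The mount point, page size and ftruncate() call are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_LEN (2UL * 1024 * 1024)	/* assumed huge page size */

int main(void)
{
	int fd = open("/mnt/huge/cow-test", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0) {
		perror("open hugetlbfs file");
		return 1;
	}
	/* Size the file to one huge page (assumed to be supported). */
	if (ftruncate(fd, HPAGE_LEN) < 0)
		perror("ftruncate");

	/* Private mapping: writes should land in a per-process copy. */
	p = mmap(NULL, HPAGE_LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(p, 0xaa, HPAGE_LEN);	/* write fault -> hugetlb_cow() path */

	munmap(p, HPAGE_LEN);
	close(fd);
	unlink("/mnt/huge/cow-test");
	return 0;
}
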
diff --git a/mm/internal.h b/mm/internal.h
index 6bf134e8fb3d..17256bb2f4ef 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -9,5 +9,22 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12/* page_alloc.c */ 12static inline void set_page_refs(struct page *page, int order)
13extern void set_page_refs(struct page *page, int order); 13{
14#ifdef CONFIG_MMU
15 set_page_count(page, 1);
16#else
17 int i;
18
19 /*
20 * We need to reference all the pages for this order, otherwise if
21 * anyone accesses one of the pages with (get/put) it will be freed.
22 * - eg: access_process_vm()
23 */
24 for (i = 0; i < (1 << order); i++)
25 set_page_count(page + i, 1);
26#endif /* CONFIG_MMU */
27}
28
29extern void fastcall __init __free_pages_bootmem(struct page *page,
30 unsigned int order);
diff --git a/mm/madvise.c b/mm/madvise.c
index 2b7cf0400a21..ae0ae3ea299a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma,
140 return 0; 140 return 0;
141} 141}
142 142
143/*
144 * Application wants to free up the pages and associated backing store.
145 * This is effectively punching a hole into the middle of a file.
146 *
147 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
148 * Other filesystems return -ENOSYS.
149 */
150static long madvise_remove(struct vm_area_struct *vma,
151 unsigned long start, unsigned long end)
152{
153 struct address_space *mapping;
154 loff_t offset, endoff;
155
156 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
157 return -EINVAL;
158
159 if (!vma->vm_file || !vma->vm_file->f_mapping
160 || !vma->vm_file->f_mapping->host) {
161 return -EINVAL;
162 }
163
164 mapping = vma->vm_file->f_mapping;
165
166 offset = (loff_t)(start - vma->vm_start)
167 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
168 endoff = (loff_t)(end - vma->vm_start - 1)
169 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
170 return vmtruncate_range(mapping->host, offset, endoff);
171}
172
143static long 173static long
144madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 174madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
145 unsigned long start, unsigned long end, int behavior) 175 unsigned long start, unsigned long end, int behavior)
@@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
152 case MADV_RANDOM: 182 case MADV_RANDOM:
153 error = madvise_behavior(vma, prev, start, end, behavior); 183 error = madvise_behavior(vma, prev, start, end, behavior);
154 break; 184 break;
185 case MADV_REMOVE:
186 error = madvise_remove(vma, start, end);
187 break;
155 188
156 case MADV_WILLNEED: 189 case MADV_WILLNEED:
157 error = madvise_willneed(vma, prev, start, end); 190 error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
190 * some pages ahead. 223 * some pages ahead.
191 * MADV_DONTNEED - the application is finished with the given range, 224 * MADV_DONTNEED - the application is finished with the given range,
192 * so the kernel can free resources associated with it. 225 * so the kernel can free resources associated with it.
226 * MADV_REMOVE - the application wants to free up the given range of
227 * pages and associated backing store.
193 * 228 *
194 * return values: 229 * return values:
195 * zero - success 230 * zero - success
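
The madvise.c hunks above introduce MADV_REMOVE, which punches a hole in a tmpfs/shmfs-backed mapping via vmtruncate_range(), freeing both the pages and their backing store. A small userspace sketch of how the new flag would be used follows; the tmpfs path and the fallback value used for the MADV_REMOVE constant are assumptions for illustration.

/*
 * Userspace sketch (not from the patch) of the new MADV_REMOVE: punch a
 * hole in a tmpfs-backed mapping.  On a kernel with this patch the
 * constant comes from the mman headers; the fallback value and the
 * /dev/shm path are assumptions for this illustration.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_REMOVE
#define MADV_REMOVE 9	/* assumed fallback value for the sketch */
#endif

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	size_t len = 16 * pagesz;
	int fd = open("/dev/shm/madv-remove-test", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, len) < 0) {
		perror("setup");
		return 1;
	}

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Dirty the middle of the file, then punch it out again. */
	p[8 * pagesz] = 1;
	if (madvise(p + 4 * pagesz, 8 * pagesz, MADV_REMOVE) != 0)
		perror("madvise(MADV_REMOVE)");	/* non-tmpfs files get -ENOSYS */

	munmap(p, len);
	close(fd);
	unlink("/dev/shm/madv-remove-test");
	return 0;
}
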
diff --git a/mm/memory.c b/mm/memory.c
index d8dde07a3656..7197f9bcd384 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1498,7 +1498,7 @@ gotten:
1498 update_mmu_cache(vma, address, entry); 1498 update_mmu_cache(vma, address, entry);
1499 lazy_mmu_prot_update(entry); 1499 lazy_mmu_prot_update(entry);
1500 lru_cache_add_active(new_page); 1500 lru_cache_add_active(new_page);
1501 page_add_anon_rmap(new_page, vma, address); 1501 page_add_new_anon_rmap(new_page, vma, address);
1502 1502
1503 /* Free the old page.. */ 1503 /* Free the old page.. */
1504 new_page = old_page; 1504 new_page = old_page;
@@ -1770,9 +1770,32 @@ out_big:
1770out_busy: 1770out_busy:
1771 return -ETXTBSY; 1771 return -ETXTBSY;
1772} 1772}
1773
1774EXPORT_SYMBOL(vmtruncate); 1773EXPORT_SYMBOL(vmtruncate);
1775 1774
1775int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1776{
1777 struct address_space *mapping = inode->i_mapping;
1778
1779 /*
1780 * If the underlying filesystem is not going to provide
1781 * a way to truncate a range of blocks (punch a hole) -
1782 * we should return failure right now.
1783 */
1784 if (!inode->i_op || !inode->i_op->truncate_range)
1785 return -ENOSYS;
1786
1787 down(&inode->i_sem);
1788 down_write(&inode->i_alloc_sem);
1789 unmap_mapping_range(mapping, offset, (end - offset), 1);
1790 truncate_inode_pages_range(mapping, offset, end);
1791 inode->i_op->truncate_range(inode, offset, end);
1792 up_write(&inode->i_alloc_sem);
1793 up(&inode->i_sem);
1794
1795 return 0;
1796}
1797EXPORT_SYMBOL(vmtruncate_range);
1798
1776/* 1799/*
1777 * Primitive swap readahead code. We simply read an aligned block of 1800 * Primitive swap readahead code. We simply read an aligned block of
1778 * (1 << page_cluster) entries in the swap area. This method is chosen 1801 * (1 << page_cluster) entries in the swap area. This method is chosen
@@ -1954,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1954 goto release; 1977 goto release;
1955 inc_mm_counter(mm, anon_rss); 1978 inc_mm_counter(mm, anon_rss);
1956 lru_cache_add_active(page); 1979 lru_cache_add_active(page);
1957 SetPageReferenced(page); 1980 page_add_new_anon_rmap(page, vma, address);
1958 page_add_anon_rmap(page, vma, address);
1959 } else { 1981 } else {
1960 /* Map the ZERO_PAGE - vm_page_prot is readonly */ 1982 /* Map the ZERO_PAGE - vm_page_prot is readonly */
1961 page = ZERO_PAGE(address); 1983 page = ZERO_PAGE(address);
@@ -2086,7 +2108,7 @@ retry:
2086 if (anon) { 2108 if (anon) {
2087 inc_mm_counter(mm, anon_rss); 2109 inc_mm_counter(mm, anon_rss);
2088 lru_cache_add_active(new_page); 2110 lru_cache_add_active(new_page);
2089 page_add_anon_rmap(new_page, vma, address); 2111 page_add_new_anon_rmap(new_page, vma, address);
2090 } else { 2112 } else {
2091 inc_mm_counter(mm, file_rss); 2113 inc_mm_counter(mm, file_rss);
2092 page_add_file_rmap(new_page); 2114 page_add_file_rmap(new_page);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f6d4af8af8a8..a918f77f02f3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
42 int nr_pages); 42 int nr_pages);
43static int __add_section(struct zone *zone, unsigned long phys_start_pfn) 43static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
44{ 44{
45 struct pglist_data *pgdat = zone->zone_pgdat;
46 int nr_pages = PAGES_PER_SECTION; 45 int nr_pages = PAGES_PER_SECTION;
47 int ret; 46 int ret;
48 47
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72f402cc9c9a..0f1d2b8a952b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,7 +93,7 @@ static kmem_cache_t *sn_cache;
93 93
94/* Highest zone. An specific allocation for a zone below that is not 94/* Highest zone. An specific allocation for a zone below that is not
95 policied. */ 95 policied. */
96static int policy_zone; 96int policy_zone = ZONE_DMA;
97 97
98struct mempolicy default_policy = { 98struct mempolicy default_policy = {
99 .refcnt = ATOMIC_INIT(1), /* never free it */ 99 .refcnt = ATOMIC_INIT(1), /* never free it */
@@ -131,17 +131,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
131 if (!zl) 131 if (!zl)
132 return NULL; 132 return NULL;
133 num = 0; 133 num = 0;
134 for_each_node_mask(nd, *nodes) { 134 for_each_node_mask(nd, *nodes)
135 int k; 135 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
136 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
137 struct zone *z = &NODE_DATA(nd)->node_zones[k];
138 if (!z->present_pages)
139 continue;
140 zl->zones[num++] = z;
141 if (k > policy_zone)
142 policy_zone = k;
143 }
144 }
145 zl->zones[num] = NULL; 136 zl->zones[num] = NULL;
146 return zl; 137 return zl;
147} 138}
@@ -785,6 +776,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
785 return nid; 776 return nid;
786} 777}
787 778
779/* Determine a node number for interleave */
780static inline unsigned interleave_nid(struct mempolicy *pol,
781 struct vm_area_struct *vma, unsigned long addr, int shift)
782{
783 if (vma) {
784 unsigned long off;
785
786 off = vma->vm_pgoff;
787 off += (addr - vma->vm_start) >> shift;
788 return offset_il_node(pol, vma, off);
789 } else
790 return interleave_nodes(pol);
791}
792
793/* Return a zonelist suitable for a huge page allocation. */
794struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
795{
796 struct mempolicy *pol = get_vma_policy(current, vma, addr);
797
798 if (pol->policy == MPOL_INTERLEAVE) {
799 unsigned nid;
800
801 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
802 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
803 }
804 return zonelist_policy(GFP_HIGHUSER, pol);
805}
806
788/* Allocate a page in interleaved policy. 807/* Allocate a page in interleaved policy.
789 Own path because it needs to do special accounting. */ 808 Own path because it needs to do special accounting. */
790static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 809static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -833,15 +852,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
833 852
834 if (unlikely(pol->policy == MPOL_INTERLEAVE)) { 853 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
835 unsigned nid; 854 unsigned nid;
836 if (vma) { 855
837 unsigned long off; 856 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
838 off = vma->vm_pgoff;
839 off += (addr - vma->vm_start) >> PAGE_SHIFT;
840 nid = offset_il_node(pol, vma, off);
841 } else {
842 /* fall back to process interleaving */
843 nid = interleave_nodes(pol);
844 }
845 return alloc_page_interleave(gfp, 0, nid); 857 return alloc_page_interleave(gfp, 0, nid);
846 } 858 }
847 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); 859 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
@@ -940,54 +952,6 @@ void __mpol_free(struct mempolicy *p)
940} 952}
941 953
942/* 954/*
943 * Hugetlb policy. Same as above, just works with node numbers instead of
944 * zonelists.
945 */
946
947/* Find first node suitable for an allocation */
948int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
949{
950 struct mempolicy *pol = get_vma_policy(current, vma, addr);
951
952 switch (pol->policy) {
953 case MPOL_DEFAULT:
954 return numa_node_id();
955 case MPOL_BIND:
956 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
957 case MPOL_INTERLEAVE:
958 return interleave_nodes(pol);
959 case MPOL_PREFERRED:
960 return pol->v.preferred_node >= 0 ?
961 pol->v.preferred_node : numa_node_id();
962 }
963 BUG();
964 return 0;
965}
966
967/* Find secondary valid nodes for an allocation */
968int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
969{
970 struct mempolicy *pol = get_vma_policy(current, vma, addr);
971
972 switch (pol->policy) {
973 case MPOL_PREFERRED:
974 case MPOL_DEFAULT:
975 case MPOL_INTERLEAVE:
976 return 1;
977 case MPOL_BIND: {
978 struct zone **z;
979 for (z = pol->v.zonelist->zones; *z; z++)
980 if ((*z)->zone_pgdat->node_id == nid)
981 return 1;
982 return 0;
983 }
984 default:
985 BUG();
986 return 0;
987 }
988}
989
990/*
991 * Shared memory backing store policy support. 955 * Shared memory backing store policy support.
992 * 956 *
993 * Remember policies even when nobody has shared memory mapped. 957 * Remember policies even when nobody has shared memory mapped.
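
The mempolicy.c hunks above factor the interleave computation into interleave_nid() and reuse it for huge pages through huge_zonelist(). Below is a standalone sketch of that arithmetic: the offset is the VMA file offset plus the address offset shifted by the mapping's page shift. The final modulo step mirrors what offset_il_node() does over the allowed node set; the node list, the 2MB shift and the addresses are made-up values for illustration.

/*
 * Standalone sketch (not kernel code) of the interleave_nid() arithmetic
 * added above.  The modulo over the allowed node set mirrors
 * offset_il_node(); all concrete values here are illustrative.
 */
#include <stdio.h>

static unsigned interleave_node(unsigned long vm_pgoff, unsigned long vm_start,
				unsigned long addr, int shift,
				const int *nodes, int nnodes)
{
	unsigned long off = vm_pgoff + ((addr - vm_start) >> shift);

	return nodes[off % nnodes];
}

int main(void)
{
	const int allowed[] = { 0, 2, 3 };	/* example interleave node set */
	unsigned long vm_start = 0x60000000000UL;
	unsigned long addr = vm_start + 5UL * (1UL << 21);	/* 6th 2MB huge page */

	/* shift = HPAGE_SHIFT (21 for 2MB huge pages -- an assumption) */
	printf("huge page at %#lx interleaves onto node %d\n",
	       addr, interleave_node(0, vm_start, addr, 21, allowed, 3));
	return 0;
}
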
diff --git a/mm/nommu.c b/mm/nommu.c
index c1196812876b..c10262d68232 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr)
1177{ 1177{
1178 return 0; 1178 return 0;
1179} 1179}
1180
1181struct page *filemap_nopage(struct vm_area_struct *area,
1182 unsigned long address, int *type)
1183{
1184 BUG();
1185 return NULL;
1186}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe14a8c87fc2..fd47494cb989 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
36#include <linux/memory_hotplug.h> 36#include <linux/memory_hotplug.h>
37#include <linux/nodemask.h> 37#include <linux/nodemask.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/mempolicy.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
@@ -53,6 +54,8 @@ unsigned long totalram_pages __read_mostly;
53unsigned long totalhigh_pages __read_mostly; 54unsigned long totalhigh_pages __read_mostly;
54long nr_swap_pages; 55long nr_swap_pages;
55 56
57static void fastcall free_hot_cold_page(struct page *page, int cold);
58
56/* 59/*
57 * results with 256, 32 in the lowmem_reserve sysctl: 60 * results with 256, 32 in the lowmem_reserve sysctl:
58 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 61 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
@@ -81,6 +84,7 @@ int min_free_kbytes = 1024;
81unsigned long __initdata nr_kernel_pages; 84unsigned long __initdata nr_kernel_pages;
82unsigned long __initdata nr_all_pages; 85unsigned long __initdata nr_all_pages;
83 86
87#ifdef CONFIG_DEBUG_VM
84static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 88static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
85{ 89{
86 int ret = 0; 90 int ret = 0;
@@ -122,16 +126,23 @@ static int bad_range(struct zone *zone, struct page *page)
122 return 0; 126 return 0;
123} 127}
124 128
125static void bad_page(const char *function, struct page *page) 129#else
130static inline int bad_range(struct zone *zone, struct page *page)
131{
132 return 0;
133}
134#endif
135
136static void bad_page(struct page *page)
126{ 137{
127 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 138 printk(KERN_EMERG "Bad page state in process '%s'\n"
128 function, current->comm, page); 139 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
129 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 140 "Trying to fix it up, but a reboot is needed\n"
130 (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, 141 "Backtrace:\n",
131 page->mapping, page_mapcount(page), page_count(page)); 142 current->comm, page, (int)(2*sizeof(unsigned long)),
132 printk(KERN_EMERG "Backtrace:\n"); 143 (unsigned long)page->flags, page->mapping,
144 page_mapcount(page), page_count(page));
133 dump_stack(); 145 dump_stack();
134 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
135 page->flags &= ~(1 << PG_lru | 146 page->flags &= ~(1 << PG_lru |
136 1 << PG_private | 147 1 << PG_private |
137 1 << PG_locked | 148 1 << PG_locked |
@@ -184,19 +195,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
184 int i; 195 int i;
185 int nr_pages = 1 << order; 196 int nr_pages = 1 << order;
186 197
187 if (!PageCompound(page)) 198 if (unlikely(page[1].index != order))
188 return; 199 bad_page(page);
189
190 if (page[1].index != order)
191 bad_page(__FUNCTION__, page);
192 200
193 for (i = 0; i < nr_pages; i++) { 201 for (i = 0; i < nr_pages; i++) {
194 struct page *p = page + i; 202 struct page *p = page + i;
195 203
196 if (!PageCompound(p)) 204 if (unlikely(!PageCompound(p) |
197 bad_page(__FUNCTION__, page); 205 (page_private(p) != (unsigned long)page)))
198 if (page_private(p) != (unsigned long)page) 206 bad_page(page);
199 bad_page(__FUNCTION__, page);
200 ClearPageCompound(p); 207 ClearPageCompound(p);
201 } 208 }
202} 209}
@@ -255,14 +262,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
255/* 262/*
256 * This function checks whether a page is free && is the buddy 263 * This function checks whether a page is free && is the buddy
257 * we can do coalesce a page and its buddy if 264 * we can do coalesce a page and its buddy if
258 * (a) the buddy is free && 265 * (a) the buddy is not in a hole &&
259 * (b) the buddy is on the buddy system && 266 * (b) the buddy is free &&
260 * (c) a page and its buddy have the same order. 267 * (c) the buddy is on the buddy system &&
268 * (d) a page and its buddy have the same order.
261 * for recording page's order, we use page_private(page) and PG_private. 269 * for recording page's order, we use page_private(page) and PG_private.
262 * 270 *
263 */ 271 */
264static inline int page_is_buddy(struct page *page, int order) 272static inline int page_is_buddy(struct page *page, int order)
265{ 273{
274#ifdef CONFIG_HOLES_IN_ZONE
275 if (!pfn_valid(page_to_pfn(page)))
276 return 0;
277#endif
278
266 if (PagePrivate(page) && 279 if (PagePrivate(page) &&
267 (page_order(page) == order) && 280 (page_order(page) == order) &&
268 page_count(page) == 0) 281 page_count(page) == 0)
@@ -300,7 +313,7 @@ static inline void __free_pages_bulk (struct page *page,
300 unsigned long page_idx; 313 unsigned long page_idx;
301 int order_size = 1 << order; 314 int order_size = 1 << order;
302 315
303 if (unlikely(order)) 316 if (unlikely(PageCompound(page)))
304 destroy_compound_page(page, order); 317 destroy_compound_page(page, order);
305 318
306 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 319 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -314,17 +327,15 @@ static inline void __free_pages_bulk (struct page *page,
314 struct free_area *area; 327 struct free_area *area;
315 struct page *buddy; 328 struct page *buddy;
316 329
317 combined_idx = __find_combined_index(page_idx, order);
318 buddy = __page_find_buddy(page, page_idx, order); 330 buddy = __page_find_buddy(page, page_idx, order);
319
320 if (bad_range(zone, buddy))
321 break;
322 if (!page_is_buddy(buddy, order)) 331 if (!page_is_buddy(buddy, order))
323 break; /* Move the buddy up one level. */ 332 break; /* Move the buddy up one level. */
333
324 list_del(&buddy->lru); 334 list_del(&buddy->lru);
325 area = zone->free_area + order; 335 area = zone->free_area + order;
326 area->nr_free--; 336 area->nr_free--;
327 rmv_page_order(buddy); 337 rmv_page_order(buddy);
338 combined_idx = __find_combined_index(page_idx, order);
328 page = page + (combined_idx - page_idx); 339 page = page + (combined_idx - page_idx);
329 page_idx = combined_idx; 340 page_idx = combined_idx;
330 order++; 341 order++;
@@ -334,11 +345,11 @@ static inline void __free_pages_bulk (struct page *page,
334 zone->free_area[order].nr_free++; 345 zone->free_area[order].nr_free++;
335} 346}
336 347
337static inline int free_pages_check(const char *function, struct page *page) 348static inline int free_pages_check(struct page *page)
338{ 349{
339 if ( page_mapcount(page) || 350 if (unlikely(page_mapcount(page) |
340 page->mapping != NULL || 351 (page->mapping != NULL) |
341 page_count(page) != 0 || 352 (page_count(page) != 0) |
342 (page->flags & ( 353 (page->flags & (
343 1 << PG_lru | 354 1 << PG_lru |
344 1 << PG_private | 355 1 << PG_private |
@@ -348,8 +359,8 @@ static inline int free_pages_check(const char *function, struct page *page)
348 1 << PG_slab | 359 1 << PG_slab |
349 1 << PG_swapcache | 360 1 << PG_swapcache |
350 1 << PG_writeback | 361 1 << PG_writeback |
351 1 << PG_reserved ))) 362 1 << PG_reserved ))))
352 bad_page(function, page); 363 bad_page(page);
353 if (PageDirty(page)) 364 if (PageDirty(page))
354 __ClearPageDirty(page); 365 __ClearPageDirty(page);
355 /* 366 /*
@@ -375,11 +386,10 @@ static int
375free_pages_bulk(struct zone *zone, int count, 386free_pages_bulk(struct zone *zone, int count,
376 struct list_head *list, unsigned int order) 387 struct list_head *list, unsigned int order)
377{ 388{
378 unsigned long flags;
379 struct page *page = NULL; 389 struct page *page = NULL;
380 int ret = 0; 390 int ret = 0;
381 391
382 spin_lock_irqsave(&zone->lock, flags); 392 spin_lock(&zone->lock);
383 zone->all_unreclaimable = 0; 393 zone->all_unreclaimable = 0;
384 zone->pages_scanned = 0; 394 zone->pages_scanned = 0;
385 while (!list_empty(list) && count--) { 395 while (!list_empty(list) && count--) {
@@ -389,12 +399,13 @@ free_pages_bulk(struct zone *zone, int count,
389 __free_pages_bulk(page, zone, order); 399 __free_pages_bulk(page, zone, order);
390 ret++; 400 ret++;
391 } 401 }
392 spin_unlock_irqrestore(&zone->lock, flags); 402 spin_unlock(&zone->lock);
393 return ret; 403 return ret;
394} 404}
395 405
396void __free_pages_ok(struct page *page, unsigned int order) 406void __free_pages_ok(struct page *page, unsigned int order)
397{ 407{
408 unsigned long flags;
398 LIST_HEAD(list); 409 LIST_HEAD(list);
399 int i; 410 int i;
400 int reserved = 0; 411 int reserved = 0;
@@ -408,14 +419,49 @@ void __free_pages_ok(struct page *page, unsigned int order)
408#endif 419#endif
409 420
410 for (i = 0 ; i < (1 << order) ; ++i) 421 for (i = 0 ; i < (1 << order) ; ++i)
411 reserved += free_pages_check(__FUNCTION__, page + i); 422 reserved += free_pages_check(page + i);
412 if (reserved) 423 if (reserved)
413 return; 424 return;
414 425
415 list_add(&page->lru, &list); 426 list_add(&page->lru, &list);
416 mod_page_state(pgfree, 1 << order);
417 kernel_map_pages(page, 1<<order, 0); 427 kernel_map_pages(page, 1<<order, 0);
428 local_irq_save(flags);
429 __mod_page_state(pgfree, 1 << order);
418 free_pages_bulk(page_zone(page), 1, &list, order); 430 free_pages_bulk(page_zone(page), 1, &list, order);
431 local_irq_restore(flags);
432}
433
434/*
435 * permit the bootmem allocator to evade page validation on high-order frees
436 */
437void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
438{
439 if (order == 0) {
440 __ClearPageReserved(page);
441 set_page_count(page, 0);
442
443 free_hot_cold_page(page, 0);
444 } else {
445 LIST_HEAD(list);
446 int loop;
447
448 for (loop = 0; loop < BITS_PER_LONG; loop++) {
449 struct page *p = &page[loop];
450
451 if (loop + 16 < BITS_PER_LONG)
452 prefetchw(p + 16);
453 __ClearPageReserved(p);
454 set_page_count(p, 0);
455 }
456
457 arch_free_page(page, order);
458
459 mod_page_state(pgfree, 1 << order);
460
461 list_add(&page->lru, &list);
462 kernel_map_pages(page, 1 << order, 0);
463 free_pages_bulk(page_zone(page), 1, &list, order);
464 }
419} 465}
420 466
421 467
@@ -433,8 +479,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
433 * 479 *
434 * -- wli 480 * -- wli
435 */ 481 */
436static inline struct page * 482static inline void expand(struct zone *zone, struct page *page,
437expand(struct zone *zone, struct page *page,
438 int low, int high, struct free_area *area) 483 int low, int high, struct free_area *area)
439{ 484{
440 unsigned long size = 1 << high; 485 unsigned long size = 1 << high;
@@ -448,24 +493,6 @@ expand(struct zone *zone, struct page *page,
448 area->nr_free++; 493 area->nr_free++;
449 set_page_order(&page[size], high); 494 set_page_order(&page[size], high);
450 } 495 }
451 return page;
452}
453
454void set_page_refs(struct page *page, int order)
455{
456#ifdef CONFIG_MMU
457 set_page_count(page, 1);
458#else
459 int i;
460
461 /*
462 * We need to reference all the pages for this order, otherwise if
463 * anyone accesses one of the pages with (get/put) it will be freed.
464 * - eg: access_process_vm()
465 */
466 for (i = 0; i < (1 << order); i++)
467 set_page_count(page + i, 1);
468#endif /* CONFIG_MMU */
469} 496}
470 497
471/* 498/*
@@ -473,9 +500,9 @@ void set_page_refs(struct page *page, int order)
473 */ 500 */
474static int prep_new_page(struct page *page, int order) 501static int prep_new_page(struct page *page, int order)
475{ 502{
476 if ( page_mapcount(page) || 503 if (unlikely(page_mapcount(page) |
477 page->mapping != NULL || 504 (page->mapping != NULL) |
478 page_count(page) != 0 || 505 (page_count(page) != 0) |
479 (page->flags & ( 506 (page->flags & (
480 1 << PG_lru | 507 1 << PG_lru |
481 1 << PG_private | 508 1 << PG_private |
@@ -486,8 +513,8 @@ static int prep_new_page(struct page *page, int order)
486 1 << PG_slab | 513 1 << PG_slab |
487 1 << PG_swapcache | 514 1 << PG_swapcache |
488 1 << PG_writeback | 515 1 << PG_writeback |
489 1 << PG_reserved ))) 516 1 << PG_reserved ))))
490 bad_page(__FUNCTION__, page); 517 bad_page(page);
491 518
492 /* 519 /*
493 * For now, we report if PG_reserved was found set, but do not 520 * For now, we report if PG_reserved was found set, but do not
@@ -525,7 +552,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
525 rmv_page_order(page); 552 rmv_page_order(page);
526 area->nr_free--; 553 area->nr_free--;
527 zone->free_pages -= 1UL << order; 554 zone->free_pages -= 1UL << order;
528 return expand(zone, page, order, current_order, area); 555 expand(zone, page, order, current_order, area);
556 return page;
529 } 557 }
530 558
531 return NULL; 559 return NULL;
@@ -539,21 +567,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
539static int rmqueue_bulk(struct zone *zone, unsigned int order, 567static int rmqueue_bulk(struct zone *zone, unsigned int order,
540 unsigned long count, struct list_head *list) 568 unsigned long count, struct list_head *list)
541{ 569{
542 unsigned long flags;
543 int i; 570 int i;
544 int allocated = 0;
545 struct page *page;
546 571
547 spin_lock_irqsave(&zone->lock, flags); 572 spin_lock(&zone->lock);
548 for (i = 0; i < count; ++i) { 573 for (i = 0; i < count; ++i) {
549 page = __rmqueue(zone, order); 574 struct page *page = __rmqueue(zone, order);
550 if (page == NULL) 575 if (unlikely(page == NULL))
551 break; 576 break;
552 allocated++;
553 list_add_tail(&page->lru, list); 577 list_add_tail(&page->lru, list);
554 } 578 }
555 spin_unlock_irqrestore(&zone->lock, flags); 579 spin_unlock(&zone->lock);
556 return allocated; 580 return i;
557} 581}
558 582
559#ifdef CONFIG_NUMA 583#ifdef CONFIG_NUMA
@@ -589,6 +613,7 @@ void drain_remote_pages(void)
589#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 613#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
590static void __drain_pages(unsigned int cpu) 614static void __drain_pages(unsigned int cpu)
591{ 615{
616 unsigned long flags;
592 struct zone *zone; 617 struct zone *zone;
593 int i; 618 int i;
594 619
@@ -600,8 +625,10 @@ static void __drain_pages(unsigned int cpu)
600 struct per_cpu_pages *pcp; 625 struct per_cpu_pages *pcp;
601 626
602 pcp = &pset->pcp[i]; 627 pcp = &pset->pcp[i];
628 local_irq_save(flags);
603 pcp->count -= free_pages_bulk(zone, pcp->count, 629 pcp->count -= free_pages_bulk(zone, pcp->count,
604 &pcp->list, 0); 630 &pcp->list, 0);
631 local_irq_restore(flags);
605 } 632 }
606 } 633 }
607} 634}
@@ -647,18 +674,14 @@ void drain_local_pages(void)
647} 674}
648#endif /* CONFIG_PM */ 675#endif /* CONFIG_PM */
649 676
650static void zone_statistics(struct zonelist *zonelist, struct zone *z) 677static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
651{ 678{
652#ifdef CONFIG_NUMA 679#ifdef CONFIG_NUMA
653 unsigned long flags;
654 int cpu;
655 pg_data_t *pg = z->zone_pgdat; 680 pg_data_t *pg = z->zone_pgdat;
656 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 681 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
657 struct per_cpu_pageset *p; 682 struct per_cpu_pageset *p;
658 683
659 local_irq_save(flags); 684 p = zone_pcp(z, cpu);
660 cpu = smp_processor_id();
661 p = zone_pcp(z,cpu);
662 if (pg == orig) { 685 if (pg == orig) {
663 p->numa_hit++; 686 p->numa_hit++;
664 } else { 687 } else {
@@ -669,14 +692,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
669 p->local_node++; 692 p->local_node++;
670 else 693 else
671 p->other_node++; 694 p->other_node++;
672 local_irq_restore(flags);
673#endif 695#endif
674} 696}
675 697
676/* 698/*
677 * Free a 0-order page 699 * Free a 0-order page
678 */ 700 */
679static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
680static void fastcall free_hot_cold_page(struct page *page, int cold) 701static void fastcall free_hot_cold_page(struct page *page, int cold)
681{ 702{
682 struct zone *zone = page_zone(page); 703 struct zone *zone = page_zone(page);
@@ -687,14 +708,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
687 708
688 if (PageAnon(page)) 709 if (PageAnon(page))
689 page->mapping = NULL; 710 page->mapping = NULL;
690 if (free_pages_check(__FUNCTION__, page)) 711 if (free_pages_check(page))
691 return; 712 return;
692 713
693 inc_page_state(pgfree);
694 kernel_map_pages(page, 1, 0); 714 kernel_map_pages(page, 1, 0);
695 715
696 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 716 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
697 local_irq_save(flags); 717 local_irq_save(flags);
718 __inc_page_state(pgfree);
698 list_add(&page->lru, &pcp->list); 719 list_add(&page->lru, &pcp->list);
699 pcp->count++; 720 pcp->count++;
700 if (pcp->count >= pcp->high) 721 if (pcp->count >= pcp->high)
@@ -727,49 +748,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
727 * we cheat by calling it from here, in the order > 0 path. Saves a branch 748 * we cheat by calling it from here, in the order > 0 path. Saves a branch
728 * or two. 749 * or two.
729 */ 750 */
730static struct page * 751static struct page *buffered_rmqueue(struct zonelist *zonelist,
731buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) 752 struct zone *zone, int order, gfp_t gfp_flags)
732{ 753{
733 unsigned long flags; 754 unsigned long flags;
734 struct page *page; 755 struct page *page;
735 int cold = !!(gfp_flags & __GFP_COLD); 756 int cold = !!(gfp_flags & __GFP_COLD);
757 int cpu;
736 758
737again: 759again:
760 cpu = get_cpu();
738 if (order == 0) { 761 if (order == 0) {
739 struct per_cpu_pages *pcp; 762 struct per_cpu_pages *pcp;
740 763
741 page = NULL; 764 pcp = &zone_pcp(zone, cpu)->pcp[cold];
742 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
743 local_irq_save(flags); 765 local_irq_save(flags);
744 if (pcp->count <= pcp->low) 766 if (!pcp->count) {
745 pcp->count += rmqueue_bulk(zone, 0, 767 pcp->count += rmqueue_bulk(zone, 0,
746 pcp->batch, &pcp->list); 768 pcp->batch, &pcp->list);
747 if (pcp->count) { 769 if (unlikely(!pcp->count))
748 page = list_entry(pcp->list.next, struct page, lru); 770 goto failed;
749 list_del(&page->lru);
750 pcp->count--;
751 } 771 }
752 local_irq_restore(flags); 772 page = list_entry(pcp->list.next, struct page, lru);
753 put_cpu(); 773 list_del(&page->lru);
774 pcp->count--;
754 } else { 775 } else {
755 spin_lock_irqsave(&zone->lock, flags); 776 spin_lock_irqsave(&zone->lock, flags);
756 page = __rmqueue(zone, order); 777 page = __rmqueue(zone, order);
757 spin_unlock_irqrestore(&zone->lock, flags); 778 spin_unlock(&zone->lock);
779 if (!page)
780 goto failed;
758 } 781 }
759 782
760 if (page != NULL) { 783 __mod_page_state_zone(zone, pgalloc, 1 << order);
761 BUG_ON(bad_range(zone, page)); 784 zone_statistics(zonelist, zone, cpu);
762 mod_page_state_zone(zone, pgalloc, 1 << order); 785 local_irq_restore(flags);
763 if (prep_new_page(page, order)) 786 put_cpu();
764 goto again; 787
788 BUG_ON(bad_range(zone, page));
789 if (prep_new_page(page, order))
790 goto again;
765 791
766 if (gfp_flags & __GFP_ZERO) 792 if (gfp_flags & __GFP_ZERO)
767 prep_zero_page(page, order, gfp_flags); 793 prep_zero_page(page, order, gfp_flags);
768 794
769 if (order && (gfp_flags & __GFP_COMP)) 795 if (order && (gfp_flags & __GFP_COMP))
770 prep_compound_page(page, order); 796 prep_compound_page(page, order);
771 }
772 return page; 797 return page;
798
799failed:
800 local_irq_restore(flags);
801 put_cpu();
802 return NULL;
773} 803}
774 804
775#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 805#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
@@ -845,9 +875,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
845 continue; 875 continue;
846 } 876 }
847 877
848 page = buffered_rmqueue(*z, order, gfp_mask); 878 page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
849 if (page) { 879 if (page) {
850 zone_statistics(zonelist, *z);
851 break; 880 break;
852 } 881 }
853 } while (*(++z) != NULL); 882 } while (*(++z) != NULL);
@@ -903,8 +932,7 @@ restart:
903 alloc_flags |= ALLOC_HARDER; 932 alloc_flags |= ALLOC_HARDER;
904 if (gfp_mask & __GFP_HIGH) 933 if (gfp_mask & __GFP_HIGH)
905 alloc_flags |= ALLOC_HIGH; 934 alloc_flags |= ALLOC_HIGH;
906 if (wait) 935 alloc_flags |= ALLOC_CPUSET;
907 alloc_flags |= ALLOC_CPUSET;
908 936
909 /* 937 /*
910 * Go through the zonelist again. Let __GFP_HIGH and allocations 938 * Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -926,7 +954,7 @@ restart:
926nofail_alloc: 954nofail_alloc:
927 /* go through the zonelist yet again, ignoring mins */ 955 /* go through the zonelist yet again, ignoring mins */
928 page = get_page_from_freelist(gfp_mask, order, 956 page = get_page_from_freelist(gfp_mask, order,
929 zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); 957 zonelist, ALLOC_NO_WATERMARKS);
930 if (page) 958 if (page)
931 goto got_pg; 959 goto got_pg;
932 if (gfp_mask & __GFP_NOFAIL) { 960 if (gfp_mask & __GFP_NOFAIL) {
@@ -1171,12 +1199,11 @@ EXPORT_SYMBOL(nr_pagecache);
1171DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1199DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1172#endif 1200#endif
1173 1201
1174void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1202static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1175{ 1203{
1176 int cpu = 0; 1204 int cpu = 0;
1177 1205
1178 memset(ret, 0, sizeof(*ret)); 1206 memset(ret, 0, sizeof(*ret));
1179 cpus_and(*cpumask, *cpumask, cpu_online_map);
1180 1207
1181 cpu = first_cpu(*cpumask); 1208 cpu = first_cpu(*cpumask);
1182 while (cpu < NR_CPUS) { 1209 while (cpu < NR_CPUS) {
@@ -1224,12 +1251,12 @@ void get_full_page_state(struct page_state *ret)
1224 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); 1251 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1225} 1252}
1226 1253
1227unsigned long __read_page_state(unsigned long offset) 1254unsigned long read_page_state_offset(unsigned long offset)
1228{ 1255{
1229 unsigned long ret = 0; 1256 unsigned long ret = 0;
1230 int cpu; 1257 int cpu;
1231 1258
1232 for_each_online_cpu(cpu) { 1259 for_each_cpu(cpu) {
1233 unsigned long in; 1260 unsigned long in;
1234 1261
1235 in = (unsigned long)&per_cpu(page_states, cpu) + offset; 1262 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
@@ -1238,18 +1265,26 @@ unsigned long __read_page_state(unsigned long offset)
1238 return ret; 1265 return ret;
1239} 1266}
1240 1267
1241void __mod_page_state(unsigned long offset, unsigned long delta) 1268void __mod_page_state_offset(unsigned long offset, unsigned long delta)
1269{
1270 void *ptr;
1271
1272 ptr = &__get_cpu_var(page_states);
1273 *(unsigned long *)(ptr + offset) += delta;
1274}
1275EXPORT_SYMBOL(__mod_page_state_offset);
1276
1277void mod_page_state_offset(unsigned long offset, unsigned long delta)
1242{ 1278{
1243 unsigned long flags; 1279 unsigned long flags;
1244 void* ptr; 1280 void *ptr;
1245 1281
1246 local_irq_save(flags); 1282 local_irq_save(flags);
1247 ptr = &__get_cpu_var(page_states); 1283 ptr = &__get_cpu_var(page_states);
1248 *(unsigned long*)(ptr + offset) += delta; 1284 *(unsigned long *)(ptr + offset) += delta;
1249 local_irq_restore(flags); 1285 local_irq_restore(flags);
1250} 1286}
1251 1287EXPORT_SYMBOL(mod_page_state_offset);
1252EXPORT_SYMBOL(__mod_page_state);
1253 1288
1254void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1289void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1255 unsigned long *free, struct pglist_data *pgdat) 1290 unsigned long *free, struct pglist_data *pgdat)
@@ -1335,7 +1370,7 @@ void show_free_areas(void)
1335 show_node(zone); 1370 show_node(zone);
1336 printk("%s per-cpu:", zone->name); 1371 printk("%s per-cpu:", zone->name);
1337 1372
1338 if (!zone->present_pages) { 1373 if (!populated_zone(zone)) {
1339 printk(" empty\n"); 1374 printk(" empty\n");
1340 continue; 1375 continue;
1341 } else 1376 } else
@@ -1347,10 +1382,9 @@ void show_free_areas(void)
1347 pageset = zone_pcp(zone, cpu); 1382 pageset = zone_pcp(zone, cpu);
1348 1383
1349 for (temperature = 0; temperature < 2; temperature++) 1384 for (temperature = 0; temperature < 2; temperature++)
1350 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", 1385 printk("cpu %d %s: high %d, batch %d used:%d\n",
1351 cpu, 1386 cpu,
1352 temperature ? "cold" : "hot", 1387 temperature ? "cold" : "hot",
1353 pageset->pcp[temperature].low,
1354 pageset->pcp[temperature].high, 1388 pageset->pcp[temperature].high,
1355 pageset->pcp[temperature].batch, 1389 pageset->pcp[temperature].batch,
1356 pageset->pcp[temperature].count); 1390 pageset->pcp[temperature].count);
@@ -1413,7 +1447,7 @@ void show_free_areas(void)
1413 1447
1414 show_node(zone); 1448 show_node(zone);
1415 printk("%s: ", zone->name); 1449 printk("%s: ", zone->name);
1416 if (!zone->present_pages) { 1450 if (!populated_zone(zone)) {
1417 printk("empty\n"); 1451 printk("empty\n");
1418 continue; 1452 continue;
1419 } 1453 }
@@ -1433,36 +1467,29 @@ void show_free_areas(void)
1433 1467
1434/* 1468/*
1435 * Builds allocation fallback zone lists. 1469 * Builds allocation fallback zone lists.
1470 *
1471 * Add all populated zones of a node to the zonelist.
1436 */ 1472 */
1437static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1473static int __init build_zonelists_node(pg_data_t *pgdat,
1438{ 1474 struct zonelist *zonelist, int nr_zones, int zone_type)
1439 switch (k) { 1475{
1440 struct zone *zone; 1476 struct zone *zone;
1441 default: 1477
1442 BUG(); 1478 BUG_ON(zone_type > ZONE_HIGHMEM);
1443 case ZONE_HIGHMEM: 1479
1444 zone = pgdat->node_zones + ZONE_HIGHMEM; 1480 do {
1445 if (zone->present_pages) { 1481 zone = pgdat->node_zones + zone_type;
1482 if (populated_zone(zone)) {
1446#ifndef CONFIG_HIGHMEM 1483#ifndef CONFIG_HIGHMEM
1447 BUG(); 1484 BUG_ON(zone_type > ZONE_NORMAL);
1448#endif 1485#endif
1449 zonelist->zones[j++] = zone; 1486 zonelist->zones[nr_zones++] = zone;
1487 check_highest_zone(zone_type);
1450 } 1488 }
1451 case ZONE_NORMAL: 1489 zone_type--;
1452 zone = pgdat->node_zones + ZONE_NORMAL;
1453 if (zone->present_pages)
1454 zonelist->zones[j++] = zone;
1455 case ZONE_DMA32:
1456 zone = pgdat->node_zones + ZONE_DMA32;
1457 if (zone->present_pages)
1458 zonelist->zones[j++] = zone;
1459 case ZONE_DMA:
1460 zone = pgdat->node_zones + ZONE_DMA;
1461 if (zone->present_pages)
1462 zonelist->zones[j++] = zone;
1463 }
1464 1490
1465 return j; 1491 } while (zone_type >= 0);
1492 return nr_zones;
1466} 1493}
1467 1494
1468static inline int highest_zone(int zone_bits) 1495static inline int highest_zone(int zone_bits)
@@ -1709,8 +1736,6 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1709 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { 1736 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
1710 if (!early_pfn_valid(pfn)) 1737 if (!early_pfn_valid(pfn))
1711 continue; 1738 continue;
1712 if (!early_pfn_in_nid(pfn, nid))
1713 continue;
1714 page = pfn_to_page(pfn); 1739 page = pfn_to_page(pfn);
1715 set_page_links(page, zone, nid, pfn); 1740 set_page_links(page, zone, nid, pfn);
1716 set_page_count(page, 1); 1741 set_page_count(page, 1);
@@ -1794,14 +1819,12 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1794 1819
1795 pcp = &p->pcp[0]; /* hot */ 1820 pcp = &p->pcp[0]; /* hot */
1796 pcp->count = 0; 1821 pcp->count = 0;
1797 pcp->low = 0;
1798 pcp->high = 6 * batch; 1822 pcp->high = 6 * batch;
1799 pcp->batch = max(1UL, 1 * batch); 1823 pcp->batch = max(1UL, 1 * batch);
1800 INIT_LIST_HEAD(&pcp->list); 1824 INIT_LIST_HEAD(&pcp->list);
1801 1825
1802 pcp = &p->pcp[1]; /* cold*/ 1826 pcp = &p->pcp[1]; /* cold*/
1803 pcp->count = 0; 1827 pcp->count = 0;
1804 pcp->low = 0;
1805 pcp->high = 2 * batch; 1828 pcp->high = 2 * batch;
1806 pcp->batch = max(1UL, batch/2); 1829 pcp->batch = max(1UL, batch/2);
1807 INIT_LIST_HEAD(&pcp->list); 1830 INIT_LIST_HEAD(&pcp->list);
@@ -2116,7 +2139,7 @@ static int frag_show(struct seq_file *m, void *arg)
2116 int order; 2139 int order;
2117 2140
2118 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2141 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2119 if (!zone->present_pages) 2142 if (!populated_zone(zone))
2120 continue; 2143 continue;
2121 2144
2122 spin_lock_irqsave(&zone->lock, flags); 2145 spin_lock_irqsave(&zone->lock, flags);
@@ -2149,7 +2172,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2149 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2172 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2150 int i; 2173 int i;
2151 2174
2152 if (!zone->present_pages) 2175 if (!populated_zone(zone))
2153 continue; 2176 continue;
2154 2177
2155 spin_lock_irqsave(&zone->lock, flags); 2178 spin_lock_irqsave(&zone->lock, flags);
@@ -2197,12 +2220,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2197 seq_printf(m, 2220 seq_printf(m,
2198 "\n cpu: %i pcp: %i" 2221 "\n cpu: %i pcp: %i"
2199 "\n count: %i" 2222 "\n count: %i"
2200 "\n low: %i"
2201 "\n high: %i" 2223 "\n high: %i"
2202 "\n batch: %i", 2224 "\n batch: %i",
2203 i, j, 2225 i, j,
2204 pageset->pcp[j].count, 2226 pageset->pcp[j].count,
2205 pageset->pcp[j].low,
2206 pageset->pcp[j].high, 2227 pageset->pcp[j].high,
2207 pageset->pcp[j].batch); 2228 pageset->pcp[j].batch);
2208 } 2229 }
@@ -2257,32 +2278,40 @@ static char *vmstat_text[] = {
2257 "pgpgout", 2278 "pgpgout",
2258 "pswpin", 2279 "pswpin",
2259 "pswpout", 2280 "pswpout",
2260 "pgalloc_high",
2261 2281
2282 "pgalloc_high",
2262 "pgalloc_normal", 2283 "pgalloc_normal",
2284 "pgalloc_dma32",
2263 "pgalloc_dma", 2285 "pgalloc_dma",
2286
2264 "pgfree", 2287 "pgfree",
2265 "pgactivate", 2288 "pgactivate",
2266 "pgdeactivate", 2289 "pgdeactivate",
2267 2290
2268 "pgfault", 2291 "pgfault",
2269 "pgmajfault", 2292 "pgmajfault",
2293
2270 "pgrefill_high", 2294 "pgrefill_high",
2271 "pgrefill_normal", 2295 "pgrefill_normal",
2296 "pgrefill_dma32",
2272 "pgrefill_dma", 2297 "pgrefill_dma",
2273 2298
2274 "pgsteal_high", 2299 "pgsteal_high",
2275 "pgsteal_normal", 2300 "pgsteal_normal",
2301 "pgsteal_dma32",
2276 "pgsteal_dma", 2302 "pgsteal_dma",
2303
2277 "pgscan_kswapd_high", 2304 "pgscan_kswapd_high",
2278 "pgscan_kswapd_normal", 2305 "pgscan_kswapd_normal",
2279 2306 "pgscan_kswapd_dma32",
2280 "pgscan_kswapd_dma", 2307 "pgscan_kswapd_dma",
2308
2281 "pgscan_direct_high", 2309 "pgscan_direct_high",
2282 "pgscan_direct_normal", 2310 "pgscan_direct_normal",
2311 "pgscan_direct_dma32",
2283 "pgscan_direct_dma", 2312 "pgscan_direct_dma",
2284 "pginodesteal",
2285 2313
2314 "pginodesteal",
2286 "slabs_scanned", 2315 "slabs_scanned",
2287 "kswapd_steal", 2316 "kswapd_steal",
2288 "kswapd_inodesteal", 2317 "kswapd_inodesteal",
diff --git a/mm/readahead.c b/mm/readahead.c
index 72e7adbb87c7..8d6eeaaa6296 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
158{ 158{
159 unsigned page_idx; 159 unsigned page_idx;
160 struct pagevec lru_pvec; 160 struct pagevec lru_pvec;
161 int ret = 0; 161 int ret;
162 162
163 if (mapping->a_ops->readpages) { 163 if (mapping->a_ops->readpages) {
164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp,
171 list_del(&page->lru); 171 list_del(&page->lru);
172 if (!add_to_page_cache(page, mapping, 172 if (!add_to_page_cache(page, mapping,
173 page->index, GFP_KERNEL)) { 173 page->index, GFP_KERNEL)) {
174 mapping->a_ops->readpage(filp, page); 174 ret = mapping->a_ops->readpage(filp, page);
175 if (!pagevec_add(&lru_pvec, page)) 175 if (ret != AOP_TRUNCATED_PAGE) {
176 __pagevec_lru_add(&lru_pvec); 176 if (!pagevec_add(&lru_pvec, page))
177 } else { 177 __pagevec_lru_add(&lru_pvec);
178 page_cache_release(page); 178 continue;
179 } /* else fall through to release */
179 } 180 }
181 page_cache_release(page);
180 } 182 }
181 pagevec_lru_add(&lru_pvec); 183 pagevec_lru_add(&lru_pvec);
184 ret = 0;
182out: 185out:
183 return ret; 186 return ret;
184} 187}
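
The read_pages() change makes the loop look at the ->readpage() return value: a result of AOP_TRUNCATED_PAGE ends with the page released, while the normal case queues it on the LRU pagevec and continues. A rough userspace sketch of that control-flow shape, with a hypothetical MODEL_TRUNCATED_PAGE standing in for the real AOP code:

/* Sketch of the reworked read_pages() loop: a page is queued for the LRU only
 * when ->readpage() did not report a truncated page; otherwise control falls
 * through to the release path. Purely illustrative; names are hypothetical. */
#include <stdio.h>

#define MODEL_TRUNCATED_PAGE  0x8001   /* stand-in for AOP_TRUNCATED_PAGE */

static int model_readpage(int index)
{
	/* Pretend every third page raced with truncation. */
	return (index % 3 == 2) ? MODEL_TRUNCATED_PAGE : 0;
}

int main(void)
{
	int queued = 0, released = 0;

	for (int index = 0; index < 10; index++) {
		int ret = model_readpage(index);

		if (ret != MODEL_TRUNCATED_PAGE) {
			queued++;          /* would go into the lru pagevec */
			continue;
		}
		released++;                /* would be page_cache_release()d */
	}
	printf("queued %d, released %d\n", queued, released);
	return 0;
}
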
diff --git a/mm/rmap.c b/mm/rmap.c
index f853c6def159..6f3f7db27128 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -435,6 +435,30 @@ int page_referenced(struct page *page, int is_locked)
435} 435}
436 436
437/** 437/**
438 * __page_set_anon_rmap - set up new anonymous rmap
439 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added
441 * @address: the user virtual address mapped
442 */
443static void __page_set_anon_rmap(struct page *page,
444 struct vm_area_struct *vma, unsigned long address)
445{
446 struct anon_vma *anon_vma = vma->anon_vma;
447
448 BUG_ON(!anon_vma);
449 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
450 page->mapping = (struct address_space *) anon_vma;
451
452 page->index = linear_page_index(vma, address);
453
454 /*
455 * nr_mapped state can be updated without turning off
456 * interrupts because it is not modified via interrupt.
457 */
458 __inc_page_state(nr_mapped);
459}
460
461/**
438 * page_add_anon_rmap - add pte mapping to an anonymous page 462 * page_add_anon_rmap - add pte mapping to an anonymous page
439 * @page: the page to add the mapping to 463 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added 464 * @vma: the vm area in which the mapping is added
@@ -445,20 +469,27 @@ int page_referenced(struct page *page, int is_locked)
445void page_add_anon_rmap(struct page *page, 469void page_add_anon_rmap(struct page *page,
446 struct vm_area_struct *vma, unsigned long address) 470 struct vm_area_struct *vma, unsigned long address)
447{ 471{
448 if (atomic_inc_and_test(&page->_mapcount)) { 472 if (atomic_inc_and_test(&page->_mapcount))
449 struct anon_vma *anon_vma = vma->anon_vma; 473 __page_set_anon_rmap(page, vma, address);
450
451 BUG_ON(!anon_vma);
452 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
453 page->mapping = (struct address_space *) anon_vma;
454
455 page->index = linear_page_index(vma, address);
456
457 inc_page_state(nr_mapped);
458 }
459 /* else checking page index and mapping is racy */ 474 /* else checking page index and mapping is racy */
460} 475}
461 476
477/*
478 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
479 * @page: the page to add the mapping to
480 * @vma: the vm area in which the mapping is added
481 * @address: the user virtual address mapped
482 *
483 * Same as page_add_anon_rmap but must only be called on *new* pages.
484 * This means the inc-and-test can be bypassed.
485 */
486void page_add_new_anon_rmap(struct page *page,
487 struct vm_area_struct *vma, unsigned long address)
488{
489 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
490 __page_set_anon_rmap(page, vma, address);
491}
492
462/** 493/**
463 * page_add_file_rmap - add pte mapping to a file page 494 * page_add_file_rmap - add pte mapping to a file page
464 * @page: the page to add the mapping to 495 * @page: the page to add the mapping to
@@ -471,7 +502,7 @@ void page_add_file_rmap(struct page *page)
471 BUG_ON(!pfn_valid(page_to_pfn(page))); 502 BUG_ON(!pfn_valid(page_to_pfn(page)));
472 503
473 if (atomic_inc_and_test(&page->_mapcount)) 504 if (atomic_inc_and_test(&page->_mapcount))
474 inc_page_state(nr_mapped); 505 __inc_page_state(nr_mapped);
475} 506}
476 507
477/** 508/**
@@ -495,7 +526,7 @@ void page_remove_rmap(struct page *page)
495 */ 526 */
496 if (page_test_and_clear_dirty(page)) 527 if (page_test_and_clear_dirty(page))
497 set_page_dirty(page); 528 set_page_dirty(page);
498 dec_page_state(nr_mapped); 529 __dec_page_state(nr_mapped);
499 } 530 }
500} 531}
501 532
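
Both rmap helpers rely on _mapcount starting at -1, so atomic_inc_and_test() fires exactly once, on the first mapping. The new page_add_new_anon_rmap() can skip the atomic read-modify-write entirely and store 0, because a freshly allocated page cannot be mapped by anyone else yet. A small sketch of that counting convention, using plain ints in place of atomic_t:

/* Model of the _mapcount convention used by rmap: the counter starts at -1,
 * so the first mapper is the one that brings it to 0. The "new page" path can
 * simply store 0 because nobody else can see a brand-new page yet. */
#include <assert.h>
#include <stdio.h>

struct model_page {
	int mapcount;   /* stand-in for atomic_t _mapcount, starts at -1 */
};

/* inc-and-test path used for pages that may already be mapped elsewhere */
static int add_anon_rmap(struct model_page *page)
{
	return ++page->mapcount == 0;   /* true only for the first mapping */
}

/* fast path for freshly allocated pages: no concurrent mappers possible */
static void add_new_anon_rmap(struct model_page *page)
{
	page->mapcount = 0;             /* "elevate count by 1 (starts at -1)" */
}

int main(void)
{
	struct model_page shared = { .mapcount = -1 };
	struct model_page fresh  = { .mapcount = -1 };

	assert(add_anon_rmap(&shared) == 1);  /* first mapping: account nr_mapped */
	assert(add_anon_rmap(&shared) == 0);  /* later mappings: nothing to do */

	add_new_anon_rmap(&fresh);            /* same end state, no atomic RMW */
	assert(fresh.mapcount == 0);

	printf("shared mapped %d times, fresh mapped %d time\n",
	       shared.mapcount + 1, fresh.mapcount + 1);
	return 0;
}
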
diff --git a/mm/shmem.c b/mm/shmem.c
index dc25565a61e9..a1f2f02af724 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next)
457 } while (next); 457 } while (next);
458} 458}
459 459
460static void shmem_truncate(struct inode *inode) 460static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
461{ 461{
462 struct shmem_inode_info *info = SHMEM_I(inode); 462 struct shmem_inode_info *info = SHMEM_I(inode);
463 unsigned long idx; 463 unsigned long idx;
@@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode)
475 long nr_swaps_freed = 0; 475 long nr_swaps_freed = 0;
476 int offset; 476 int offset;
477 int freed; 477 int freed;
478 int punch_hole = 0;
478 479
479 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 480 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
480 idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 481 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
481 if (idx >= info->next_index) 482 if (idx >= info->next_index)
482 return; 483 return;
483 484
484 spin_lock(&info->lock); 485 spin_lock(&info->lock);
485 info->flags |= SHMEM_TRUNCATE; 486 info->flags |= SHMEM_TRUNCATE;
486 limit = info->next_index; 487 if (likely(end == (loff_t) -1)) {
487 info->next_index = idx; 488 limit = info->next_index;
489 info->next_index = idx;
490 } else {
491 limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
492 if (limit > info->next_index)
493 limit = info->next_index;
494 punch_hole = 1;
495 }
496
488 topdir = info->i_indirect; 497 topdir = info->i_indirect;
489 if (topdir && idx <= SHMEM_NR_DIRECT) { 498 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
490 info->i_indirect = NULL; 499 info->i_indirect = NULL;
491 nr_pages_to_free++; 500 nr_pages_to_free++;
492 list_add(&topdir->lru, &pages_to_free); 501 list_add(&topdir->lru, &pages_to_free);
@@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode)
573 set_page_private(subdir, page_private(subdir) - freed); 582 set_page_private(subdir, page_private(subdir) - freed);
574 if (offset) 583 if (offset)
575 spin_unlock(&info->lock); 584 spin_unlock(&info->lock);
576 BUG_ON(page_private(subdir) > offset); 585 if (!punch_hole)
586 BUG_ON(page_private(subdir) > offset);
577 } 587 }
578 if (offset) 588 if (offset)
579 offset = 0; 589 offset = 0;
580 else if (subdir) { 590 else if (subdir && !page_private(subdir)) {
581 dir[diroff] = NULL; 591 dir[diroff] = NULL;
582 nr_pages_to_free++; 592 nr_pages_to_free++;
583 list_add(&subdir->lru, &pages_to_free); 593 list_add(&subdir->lru, &pages_to_free);
@@ -594,7 +604,7 @@ done2:
594 * Also, though shmem_getpage checks i_size before adding to 604 * Also, though shmem_getpage checks i_size before adding to
595 * cache, no recheck after: so fix the narrow window there too. 605 * cache, no recheck after: so fix the narrow window there too.
596 */ 606 */
597 truncate_inode_pages(inode->i_mapping, inode->i_size); 607 truncate_inode_pages_range(inode->i_mapping, start, end);
598 } 608 }
599 609
600 spin_lock(&info->lock); 610 spin_lock(&info->lock);
@@ -614,6 +624,11 @@ done2:
614 } 624 }
615} 625}
616 626
627static void shmem_truncate(struct inode *inode)
628{
629 shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
630}
631
617static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) 632static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
618{ 633{
619 struct inode *inode = dentry->d_inode; 634 struct inode *inode = dentry->d_inode;
@@ -855,7 +870,7 @@ unlock:
855 swap_free(swap); 870 swap_free(swap);
856redirty: 871redirty:
857 set_page_dirty(page); 872 set_page_dirty(page);
858 return WRITEPAGE_ACTIVATE; /* Return with the page locked */ 873 return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */
859} 874}
860 875
861#ifdef CONFIG_NUMA 876#ifdef CONFIG_NUMA
@@ -1255,7 +1270,7 @@ out_nomem:
1255 return retval; 1270 return retval;
1256} 1271}
1257 1272
1258static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 1273int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1259{ 1274{
1260 file_accessed(file); 1275 file_accessed(file);
1261 vma->vm_ops = &shmem_vm_ops; 1276 vma->vm_ops = &shmem_vm_ops;
@@ -2083,6 +2098,7 @@ static struct file_operations shmem_file_operations = {
2083static struct inode_operations shmem_inode_operations = { 2098static struct inode_operations shmem_inode_operations = {
2084 .truncate = shmem_truncate, 2099 .truncate = shmem_truncate,
2085 .setattr = shmem_notify_change, 2100 .setattr = shmem_notify_change,
2101 .truncate_range = shmem_truncate_range,
2086}; 2102};
2087 2103
2088static struct inode_operations shmem_dir_inode_operations = { 2104static struct inode_operations shmem_dir_inode_operations = {
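
shmem_truncate_range() translates the byte range into page indexes before walking the metadata: idx is start rounded up to a page boundary, and limit is either info->next_index (the classic whole-file truncate, end == -1) or the rounded-up end clamped to next_index, with punch_hole set so partially covered metadata pages are not freed too eagerly. A sketch of just that index arithmetic, assuming a 4096-byte PAGE_CACHE_SIZE; the struct and function names are invented for the example:

/* Index arithmetic modelled on shmem_truncate_range(): translate a byte range
 * into [idx, limit) page indexes and decide whether this is a hole punch. */
#include <stdio.h>

#define PAGE_CACHE_SHIFT  12
#define PAGE_CACHE_SIZE   (1UL << PAGE_CACHE_SHIFT)

struct range_plan {
	unsigned long idx;     /* first index to truncate */
	unsigned long limit;   /* one past the last index to touch */
	int punch_hole;        /* 1 if this is a partial-range punch */
};

static struct range_plan plan_truncate(long long start, long long end,
				       unsigned long next_index)
{
	struct range_plan p;

	p.idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (end == -1LL) {                       /* classic truncate-to-size */
		p.limit = next_index;
		p.punch_hole = 0;
	} else {                                 /* madvise-style hole punch */
		p.limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
		if (p.limit > next_index)
			p.limit = next_index;
		p.punch_hole = 1;
	}
	return p;
}

int main(void)
{
	/* File currently has 100 pages allocated (next_index == 100). */
	struct range_plan whole = plan_truncate(40960, -1, 100);
	struct range_plan hole  = plan_truncate(8192, 20479, 100);

	printf("truncate to 40960: idx=%lu limit=%lu punch=%d\n",
	       whole.idx, whole.limit, whole.punch_hole);
	printf("punch 8192..20479: idx=%lu limit=%lu punch=%d\n",
	       hole.idx, hole.limit, hole.punch_hole);
	return 0;
}
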
diff --git a/mm/swap.c b/mm/swap.c
index 73d351439ef6..ee6d71ccfa56 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -156,16 +156,22 @@ void fastcall lru_cache_add_active(struct page *page)
156 put_cpu_var(lru_add_active_pvecs); 156 put_cpu_var(lru_add_active_pvecs);
157} 157}
158 158
159void lru_add_drain(void) 159static void __lru_add_drain(int cpu)
160{ 160{
161 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 161 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
162 162
163 /* CPU is dead, so no locking needed. */
163 if (pagevec_count(pvec)) 164 if (pagevec_count(pvec))
164 __pagevec_lru_add(pvec); 165 __pagevec_lru_add(pvec);
165 pvec = &__get_cpu_var(lru_add_active_pvecs); 166 pvec = &per_cpu(lru_add_active_pvecs, cpu);
166 if (pagevec_count(pvec)) 167 if (pagevec_count(pvec))
167 __pagevec_lru_add_active(pvec); 168 __pagevec_lru_add_active(pvec);
168 put_cpu_var(lru_add_pvecs); 169}
170
171void lru_add_drain(void)
172{
173 __lru_add_drain(get_cpu());
174 put_cpu();
169} 175}
170 176
171/* 177/*
@@ -412,17 +418,6 @@ void vm_acct_memory(long pages)
412} 418}
413 419
414#ifdef CONFIG_HOTPLUG_CPU 420#ifdef CONFIG_HOTPLUG_CPU
415static void lru_drain_cache(unsigned int cpu)
416{
417 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
418
419 /* CPU is dead, so no locking needed. */
420 if (pagevec_count(pvec))
421 __pagevec_lru_add(pvec);
422 pvec = &per_cpu(lru_add_active_pvecs, cpu);
423 if (pagevec_count(pvec))
424 __pagevec_lru_add_active(pvec);
425}
426 421
427/* Drop the CPU's cached committed space back into the central pool. */ 422/* Drop the CPU's cached committed space back into the central pool. */
428static int cpu_swap_callback(struct notifier_block *nfb, 423static int cpu_swap_callback(struct notifier_block *nfb,
@@ -435,7 +430,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
435 if (action == CPU_DEAD) { 430 if (action == CPU_DEAD) {
436 atomic_add(*committed, &vm_committed_space); 431 atomic_add(*committed, &vm_committed_space);
437 *committed = 0; 432 *committed = 0;
438 lru_drain_cache((long)hcpu); 433 __lru_add_drain((long)hcpu);
439 } 434 }
440 return NOTIFY_OK; 435 return NOTIFY_OK;
441} 436}
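
In swap.c the old lru_drain_cache() disappears because lru_add_drain() and the CPU-hotplug callback now share one helper, __lru_add_drain(cpu), which flushes both of that CPU's pending pagevecs. A toy model of the factoring, with per-index arrays standing in for per-cpu data and invented sizes:

/* Model of factoring __lru_add_drain(cpu): one helper drains a given CPU's
 * pending vectors, used both by the local fast path and by the hotplug
 * callback for a dead CPU. Names and sizes are illustrative only. */
#include <stdio.h>

#define NCPUS        4
#define PVEC_SIZE   14

struct pagevec { int nr; int pages[PVEC_SIZE]; };

static struct pagevec lru_add_pvecs[NCPUS];
static struct pagevec lru_add_active_pvecs[NCPUS];
static int lru_pages;            /* stand-in for the global LRU lists */

static void pagevec_flush(struct pagevec *pvec)
{
	lru_pages += pvec->nr;
	pvec->nr = 0;
}

static void __lru_add_drain(int cpu)
{
	if (lru_add_pvecs[cpu].nr)
		pagevec_flush(&lru_add_pvecs[cpu]);
	if (lru_add_active_pvecs[cpu].nr)
		pagevec_flush(&lru_add_active_pvecs[cpu]);
}

int main(void)
{
	lru_add_pvecs[1].nr = 3;
	lru_add_active_pvecs[1].nr = 2;

	__lru_add_drain(1);          /* what lru_add_drain() does for get_cpu() */
	__lru_add_drain(3);          /* what the CPU_DEAD callback does */

	printf("%d pages moved to the LRU\n", lru_pages);
	return 0;
}
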
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0df9a57b1de8..fc2aecb70a95 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/pagevec.h>
17 18
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19 20
@@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page)
272 */ 273 */
273void free_pages_and_swap_cache(struct page **pages, int nr) 274void free_pages_and_swap_cache(struct page **pages, int nr)
274{ 275{
275 int chunk = 16;
276 struct page **pagep = pages; 276 struct page **pagep = pages;
277 277
278 lru_add_drain(); 278 lru_add_drain();
279 while (nr) { 279 while (nr) {
280 int todo = min(chunk, nr); 280 int todo = min(nr, PAGEVEC_SIZE);
281 int i; 281 int i;
282 282
283 for (i = 0; i < todo; i++) 283 for (i = 0; i < todo; i++)
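
free_pages_and_swap_cache() now batches by PAGEVEC_SIZE rather than a hard-coded 16. The chunking loop itself is simple; a standalone sketch, with release_batch() as a hypothetical stand-in for the per-page release work:

/* Chunking pattern from free_pages_and_swap_cache(): walk an array in
 * PAGEVEC_SIZE-sized batches instead of a magic "16". Illustrative only. */
#include <stdio.h>

#define PAGEVEC_SIZE 14

static void release_batch(const int *pages, int n)
{
	(void)pages;   /* a real implementation would release each page here */
	printf("releasing a batch of %d pages\n", n);
}

int main(void)
{
	int pages[40];
	int nr = 40;
	const int *pagep = pages;

	while (nr) {
		int todo = nr < PAGEVEC_SIZE ? nr : PAGEVEC_SIZE;

		release_batch(pagep, todo);
		pagep += todo;
		nr -= todo;
	}
	return 0;
}
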
diff --git a/mm/swapfile.c b/mm/swapfile.c
index edafeace301f..6da4b28b896b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -211,6 +211,26 @@ noswap:
211 return (swp_entry_t) {0}; 211 return (swp_entry_t) {0};
212} 212}
213 213
214swp_entry_t get_swap_page_of_type(int type)
215{
216 struct swap_info_struct *si;
217 pgoff_t offset;
218
219 spin_lock(&swap_lock);
220 si = swap_info + type;
221 if (si->flags & SWP_WRITEOK) {
222 nr_swap_pages--;
223 offset = scan_swap_map(si);
224 if (offset) {
225 spin_unlock(&swap_lock);
226 return swp_entry(type, offset);
227 }
228 nr_swap_pages++;
229 }
230 spin_unlock(&swap_lock);
231 return (swp_entry_t) {0};
232}
233
214static struct swap_info_struct * swap_info_get(swp_entry_t entry) 234static struct swap_info_struct * swap_info_get(swp_entry_t entry)
215{ 235{
216 struct swap_info_struct * p; 236 struct swap_info_struct * p;
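
get_swap_page_of_type() allocates a slot from one specific swap area, charging nr_swap_pages up front and undoing the charge if the map scan finds nothing. A simplified single-threaded model of that logic; the array sizes and the scan_swap_map() stand-in are invented for the example:

/* Sketch of the get_swap_page_of_type() idea: allocate a swap slot from one
 * specific swap area only, instead of the usual rotation over all areas. */
#include <stdio.h>

#define NR_AREAS   2
#define AREA_SLOTS 8

struct swap_area {
	int writeok;                 /* stand-in for SWP_WRITEOK */
	char map[AREA_SLOTS];        /* 0 = free, 1 = in use; slot 0 reserved */
};

static struct swap_area areas[NR_AREAS] = {
	{ .writeok = 1, .map = {1, 1, 1, 0, 0, 0, 0, 0} },
	{ .writeok = 0 },
};
static int nr_swap_pages = 5;

struct swp_entry { int type; int offset; };   /* {0,0} means "no slot" */

/* Stand-in for scan_swap_map(): find and claim a free offset, or return 0. */
static int scan_swap_map(struct swap_area *si)
{
	for (int off = 1; off < AREA_SLOTS; off++) {
		if (!si->map[off]) {
			si->map[off] = 1;
			return off;          /* offset 0 is never handed out */
		}
	}
	return 0;
}

static struct swp_entry get_swap_page_of_type(int type)
{
	struct swap_area *si = &areas[type];

	if (si->writeok) {
		nr_swap_pages--;
		int offset = scan_swap_map(si);
		if (offset)
			return (struct swp_entry){ type, offset };
		nr_swap_pages++;             /* scan failed: undo the charge */
	}
	return (struct swp_entry){ 0, 0 };
}

int main(void)
{
	struct swp_entry e = get_swap_page_of_type(0);

	printf("got type=%d offset=%d, %d pages left\n",
	       e.type, e.offset, nr_swap_pages);
	return 0;
}
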
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index b58abcf44ed6..cdc6d431972b 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
81 goto close_file; 81 goto close_file;
82 82
83 d_instantiate(dentry, inode); 83 d_instantiate(dentry, inode);
84 inode->i_size = size;
85 inode->i_nlink = 0; /* It is unlinked */ 84 inode->i_nlink = 0; /* It is unlinked */
85
86 file->f_vfsmnt = mntget(shm_mnt); 86 file->f_vfsmnt = mntget(shm_mnt);
87 file->f_dentry = dentry; 87 file->f_dentry = dentry;
88 file->f_mapping = inode->i_mapping; 88 file->f_mapping = inode->i_mapping;
89 file->f_op = &ramfs_file_operations; 89 file->f_op = &ramfs_file_operations;
90 file->f_mode = FMODE_WRITE | FMODE_READ; 90 file->f_mode = FMODE_WRITE | FMODE_READ;
91
92 /* notify everyone as to the change of file size */
93 error = do_truncate(dentry, size, file);
94 if (error < 0)
95 goto close_file;
96
91 return file; 97 return file;
92 98
93close_file: 99close_file:
@@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
123{ 129{
124 return 0; 130 return 0;
125} 131}
132
133int shmem_mmap(struct file *file, struct vm_area_struct *vma)
134{
135 file_accessed(file);
136#ifndef CONFIG_MMU
137 return ramfs_nommu_mmap(file, vma);
138#else
139 return 0;
140#endif
141}
142
143#ifndef CONFIG_MMU
144unsigned long shmem_get_unmapped_area(struct file *file,
145 unsigned long addr,
146 unsigned long len,
147 unsigned long pgoff,
148 unsigned long flags)
149{
150 return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
151}
152#endif
diff --git a/mm/truncate.c b/mm/truncate.c
index 9173ab500604..7dee32745901 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
82} 82}
83 83
84/** 84/**
85 * truncate_inode_pages - truncate *all* the pages from an offset 85 * truncate_inode_pages - truncate range of pages specified by start and
86 * end byte offsets
86 * @mapping: mapping to truncate 87 * @mapping: mapping to truncate
87 * @lstart: offset from which to truncate 88 * @lstart: offset from which to truncate
89 * @lend: offset to which to truncate
88 * 90 *
89 * Truncate the page cache at a set offset, removing the pages that are beyond 91 * Truncate the page cache, removing the pages that are between
90 * that offset (and zeroing out partial pages). 92 * specified offsets (and zeroing out partial page
93 * (if lstart is not page aligned)).
91 * 94 *
92 * Truncate takes two passes - the first pass is nonblocking. It will not 95 * Truncate takes two passes - the first pass is nonblocking. It will not
93 * block on page locks and it will not block on writeback. The second pass 96 * block on page locks and it will not block on writeback. The second pass
@@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
101 * We pass down the cache-hot hint to the page freeing code. Even if the 104 * We pass down the cache-hot hint to the page freeing code. Even if the
102 * mapping is large, it is probably the case that the final pages are the most 105 * mapping is large, it is probably the case that the final pages are the most
103 * recently touched, and freeing happens in ascending file offset order. 106 * recently touched, and freeing happens in ascending file offset order.
104 *
105 * Called under (and serialised by) inode->i_sem.
106 */ 107 */
107void truncate_inode_pages(struct address_space *mapping, loff_t lstart) 108void truncate_inode_pages_range(struct address_space *mapping,
109 loff_t lstart, loff_t lend)
108{ 110{
109 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 111 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
112 pgoff_t end;
110 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 113 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
111 struct pagevec pvec; 114 struct pagevec pvec;
112 pgoff_t next; 115 pgoff_t next;
@@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
115 if (mapping->nrpages == 0) 118 if (mapping->nrpages == 0)
116 return; 119 return;
117 120
121 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
122 end = (lend >> PAGE_CACHE_SHIFT);
123
118 pagevec_init(&pvec, 0); 124 pagevec_init(&pvec, 0);
119 next = start; 125 next = start;
120 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 126 while (next <= end &&
127 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
121 for (i = 0; i < pagevec_count(&pvec); i++) { 128 for (i = 0; i < pagevec_count(&pvec); i++) {
122 struct page *page = pvec.pages[i]; 129 struct page *page = pvec.pages[i];
123 pgoff_t page_index = page->index; 130 pgoff_t page_index = page->index;
124 131
132 if (page_index > end) {
133 next = page_index;
134 break;
135 }
136
125 if (page_index > next) 137 if (page_index > next)
126 next = page_index; 138 next = page_index;
127 next++; 139 next++;
@@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
157 next = start; 169 next = start;
158 continue; 170 continue;
159 } 171 }
172 if (pvec.pages[0]->index > end) {
173 pagevec_release(&pvec);
174 break;
175 }
160 for (i = 0; i < pagevec_count(&pvec); i++) { 176 for (i = 0; i < pagevec_count(&pvec); i++) {
161 struct page *page = pvec.pages[i]; 177 struct page *page = pvec.pages[i];
162 178
179 if (page->index > end)
180 break;
163 lock_page(page); 181 lock_page(page);
164 wait_on_page_writeback(page); 182 wait_on_page_writeback(page);
165 if (page->index > next) 183 if (page->index > next)
@@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
171 pagevec_release(&pvec); 189 pagevec_release(&pvec);
172 } 190 }
173} 191}
192EXPORT_SYMBOL(truncate_inode_pages_range);
174 193
194/**
195 * truncate_inode_pages - truncate *all* the pages from an offset
196 * @mapping: mapping to truncate
197 * @lstart: offset from which to truncate
198 *
199 * Called under (and serialised by) inode->i_sem.
200 */
201void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
202{
203 truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
204}
175EXPORT_SYMBOL(truncate_inode_pages); 205EXPORT_SYMBOL(truncate_inode_pages);
176 206
177/** 207/**
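
truncate_inode_pages_range() insists that lend name the last byte of a page (hence the BUG_ON on alignment), rounds lstart up to get the first whole page index, uses lend >> PAGE_CACHE_SHIFT as the last index, and keeps the in-page offset "partial" for zeroing the first page. A small worked example of that arithmetic, assuming 4096-byte pages:

/* Index arithmetic modelled on truncate_inode_pages_range(): lend must cover
 * whole pages (its low bits are all ones), start rounds lstart up, and
 * 'partial' is the byte offset inside the first page that stays intact. */
#include <assert.h>
#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
	long long lstart = 6000;                     /* truncate from byte 6000 */
	long long lend   = 4 * PAGE_CACHE_SIZE - 1;  /* ...through page 3 */

	/* Same sanity check as the kernel: lend must end on a page boundary. */
	assert((lend & (PAGE_CACHE_SIZE - 1)) == (PAGE_CACHE_SIZE - 1));

	unsigned long start   = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	unsigned long end     = lend >> PAGE_CACHE_SHIFT;
	unsigned long partial = lstart & (PAGE_CACHE_SIZE - 1);

	printf("whole pages [%lu..%lu] dropped, first %lu bytes of page %lld kept\n",
	       start, end, partial, lstart >> PAGE_CACHE_SHIFT);
	return 0;
}

The wrapper truncate_inode_pages() keeps its old behaviour by passing (loff_t)-1 as lend, which satisfies the alignment check and makes end cover the whole file.
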
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0cd81c32de6..be8235fb1939 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,9 +63,6 @@ struct scan_control {
63 63
64 unsigned long nr_mapped; /* From page_state */ 64 unsigned long nr_mapped; /* From page_state */
65 65
66 /* How many pages shrink_cache() should reclaim */
67 int nr_to_reclaim;
68
69 /* Ask shrink_caches, or shrink_zone to scan at this priority */ 66 /* Ask shrink_caches, or shrink_zone to scan at this priority */
70 unsigned int priority; 67 unsigned int priority;
71 68
@@ -74,9 +71,6 @@ struct scan_control {
74 71
75 int may_writepage; 72 int may_writepage;
76 73
77 /* Can pages be swapped as part of reclaim? */
78 int may_swap;
79
80 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 74 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
81 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 75 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
82 * In this context, it doesn't matter that we scan the 76 * In this context, it doesn't matter that we scan the
@@ -367,7 +361,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
367 res = mapping->a_ops->writepage(page, &wbc); 361 res = mapping->a_ops->writepage(page, &wbc);
368 if (res < 0) 362 if (res < 0)
369 handle_write_error(mapping, page, res); 363 handle_write_error(mapping, page, res);
370 if (res == WRITEPAGE_ACTIVATE) { 364 if (res == AOP_WRITEPAGE_ACTIVATE) {
371 ClearPageReclaim(page); 365 ClearPageReclaim(page);
372 return PAGE_ACTIVATE; 366 return PAGE_ACTIVATE;
373 } 367 }
@@ -430,8 +424,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
430 * Try to allocate it some swap space here. 424 * Try to allocate it some swap space here.
431 */ 425 */
432 if (PageAnon(page) && !PageSwapCache(page)) { 426 if (PageAnon(page) && !PageSwapCache(page)) {
433 if (!sc->may_swap)
434 goto keep_locked;
435 if (!add_to_swap(page)) 427 if (!add_to_swap(page))
436 goto activate_locked; 428 goto activate_locked;
437 } 429 }
@@ -653,17 +645,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
653 goto done; 645 goto done;
654 646
655 max_scan -= nr_scan; 647 max_scan -= nr_scan;
656 if (current_is_kswapd())
657 mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
658 else
659 mod_page_state_zone(zone, pgscan_direct, nr_scan);
660 nr_freed = shrink_list(&page_list, sc); 648 nr_freed = shrink_list(&page_list, sc);
661 if (current_is_kswapd())
662 mod_page_state(kswapd_steal, nr_freed);
663 mod_page_state_zone(zone, pgsteal, nr_freed);
664 sc->nr_to_reclaim -= nr_freed;
665 649
666 spin_lock_irq(&zone->lru_lock); 650 local_irq_disable();
651 if (current_is_kswapd()) {
652 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
653 __mod_page_state(kswapd_steal, nr_freed);
654 } else
655 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
656 __mod_page_state_zone(zone, pgsteal, nr_freed);
657
658 spin_lock(&zone->lru_lock);
667 /* 659 /*
668 * Put back any unfreeable pages. 660 * Put back any unfreeable pages.
669 */ 661 */
@@ -825,11 +817,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
825 } 817 }
826 } 818 }
827 zone->nr_active += pgmoved; 819 zone->nr_active += pgmoved;
828 spin_unlock_irq(&zone->lru_lock); 820 spin_unlock(&zone->lru_lock);
829 pagevec_release(&pvec); 821
822 __mod_page_state_zone(zone, pgrefill, pgscanned);
823 __mod_page_state(pgdeactivate, pgdeactivate);
824 local_irq_enable();
830 825
831 mod_page_state_zone(zone, pgrefill, pgscanned); 826 pagevec_release(&pvec);
832 mod_page_state(pgdeactivate, pgdeactivate);
833} 827}
834 828
835/* 829/*
@@ -861,8 +855,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
861 else 855 else
862 nr_inactive = 0; 856 nr_inactive = 0;
863 857
864 sc->nr_to_reclaim = sc->swap_cluster_max;
865
866 while (nr_active || nr_inactive) { 858 while (nr_active || nr_inactive) {
867 if (nr_active) { 859 if (nr_active) {
868 sc->nr_to_scan = min(nr_active, 860 sc->nr_to_scan = min(nr_active,
@@ -876,8 +868,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
876 (unsigned long)sc->swap_cluster_max); 868 (unsigned long)sc->swap_cluster_max);
877 nr_inactive -= sc->nr_to_scan; 869 nr_inactive -= sc->nr_to_scan;
878 shrink_cache(zone, sc); 870 shrink_cache(zone, sc);
879 if (sc->nr_to_reclaim <= 0)
880 break;
881 } 871 }
882 } 872 }
883 873
@@ -910,7 +900,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
910 for (i = 0; zones[i] != NULL; i++) { 900 for (i = 0; zones[i] != NULL; i++) {
911 struct zone *zone = zones[i]; 901 struct zone *zone = zones[i];
912 902
913 if (zone->present_pages == 0) 903 if (!populated_zone(zone))
914 continue; 904 continue;
915 905
916 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 906 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
@@ -952,7 +942,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
952 942
953 sc.gfp_mask = gfp_mask; 943 sc.gfp_mask = gfp_mask;
954 sc.may_writepage = 0; 944 sc.may_writepage = 0;
955 sc.may_swap = 1;
956 945
957 inc_page_state(allocstall); 946 inc_page_state(allocstall);
958 947
@@ -1055,7 +1044,6 @@ loop_again:
1055 total_reclaimed = 0; 1044 total_reclaimed = 0;
1056 sc.gfp_mask = GFP_KERNEL; 1045 sc.gfp_mask = GFP_KERNEL;
1057 sc.may_writepage = 0; 1046 sc.may_writepage = 0;
1058 sc.may_swap = 1;
1059 sc.nr_mapped = read_page_state(nr_mapped); 1047 sc.nr_mapped = read_page_state(nr_mapped);
1060 1048
1061 inc_page_state(pageoutrun); 1049 inc_page_state(pageoutrun);
@@ -1084,7 +1072,7 @@ loop_again:
1084 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1072 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1085 struct zone *zone = pgdat->node_zones + i; 1073 struct zone *zone = pgdat->node_zones + i;
1086 1074
1087 if (zone->present_pages == 0) 1075 if (!populated_zone(zone))
1088 continue; 1076 continue;
1089 1077
1090 if (zone->all_unreclaimable && 1078 if (zone->all_unreclaimable &&
@@ -1121,7 +1109,7 @@ scan:
1121 struct zone *zone = pgdat->node_zones + i; 1109 struct zone *zone = pgdat->node_zones + i;
1122 int nr_slab; 1110 int nr_slab;
1123 1111
1124 if (zone->present_pages == 0) 1112 if (!populated_zone(zone))
1125 continue; 1113 continue;
1126 1114
1127 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1115 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -1273,7 +1261,7 @@ void wakeup_kswapd(struct zone *zone, int order)
1273{ 1261{
1274 pg_data_t *pgdat; 1262 pg_data_t *pgdat;
1275 1263
1276 if (zone->present_pages == 0) 1264 if (!populated_zone(zone))
1277 return; 1265 return;
1278 1266
1279 pgdat = zone->zone_pgdat; 1267 pgdat = zone->zone_pgdat;
@@ -1353,76 +1341,3 @@ static int __init kswapd_init(void)
1353} 1341}
1354 1342
1355module_init(kswapd_init) 1343module_init(kswapd_init)
1356
1357
1358/*
1359 * Try to free up some pages from this zone through reclaim.
1360 */
1361int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1362{
1363 struct scan_control sc;
1364 int nr_pages = 1 << order;
1365 int total_reclaimed = 0;
1366
1367 /* The reclaim may sleep, so don't do it if sleep isn't allowed */
1368 if (!(gfp_mask & __GFP_WAIT))
1369 return 0;
1370 if (zone->all_unreclaimable)
1371 return 0;
1372
1373 sc.gfp_mask = gfp_mask;
1374 sc.may_writepage = 0;
1375 sc.may_swap = 0;
1376 sc.nr_mapped = read_page_state(nr_mapped);
1377 sc.nr_scanned = 0;
1378 sc.nr_reclaimed = 0;
1379 /* scan at the highest priority */
1380 sc.priority = 0;
1381 disable_swap_token();
1382
1383 if (nr_pages > SWAP_CLUSTER_MAX)
1384 sc.swap_cluster_max = nr_pages;
1385 else
1386 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1387
1388 /* Don't reclaim the zone if there are other reclaimers active */
1389 if (atomic_read(&zone->reclaim_in_progress) > 0)
1390 goto out;
1391
1392 shrink_zone(zone, &sc);
1393 total_reclaimed = sc.nr_reclaimed;
1394
1395 out:
1396 return total_reclaimed;
1397}
1398
1399asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
1400 unsigned int state)
1401{
1402 struct zone *z;
1403 int i;
1404
1405 if (!capable(CAP_SYS_ADMIN))
1406 return -EACCES;
1407
1408 if (node >= MAX_NUMNODES || !node_online(node))
1409 return -EINVAL;
1410
1411 /* This will break if we ever add more zones */
1412 if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
1413 return -EINVAL;
1414
1415 for (i = 0; i < MAX_NR_ZONES; i++) {
1416 if (!(zone & 1<<i))
1417 continue;
1418
1419 z = &NODE_DATA(node)->node_zones[i];
1420
1421 if (state)
1422 z->reclaim_pages = 1;
1423 else
1424 z->reclaim_pages = 0;
1425 }
1426
1427 return 0;
1428}
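
The vmscan changes batch the statistics work: interrupts are disabled once, the raw __mod_page_state*() updates are done back to back, and zone->lru_lock is then taken with plain spin_lock() because interrupts are already off. A sketch of that pattern, with a flag standing in for the real interrupt state and hypothetical counter names:

/* Pattern used by the reworked shrink_cache()/refill_inactive_zone(): enter
 * the irq-off section once, do several raw counter updates, then take the LRU
 * lock without re-disabling interrupts. Modeled with a flag, not real IRQs. */
#include <assert.h>
#include <stdio.h>

static int irqs_disabled;
static unsigned long pgscan_kswapd, pgscan_direct, pgsteal, kswapd_steal;

static void local_irq_disable_model(void) { irqs_disabled = 1; }
static void local_irq_enable_model(void)  { irqs_disabled = 0; }

/* The "__" variants are only safe with interrupts off; assert that here. */
static void __mod_counter(unsigned long *counter, unsigned long delta)
{
	assert(irqs_disabled);
	*counter += delta;
}

static void account_scan(unsigned long nr_scan, unsigned long nr_freed,
			 int is_kswapd)
{
	local_irq_disable_model();            /* one irq-off section... */
	if (is_kswapd) {
		__mod_counter(&pgscan_kswapd, nr_scan);
		__mod_counter(&kswapd_steal, nr_freed);
	} else
		__mod_counter(&pgscan_direct, nr_scan);
	__mod_counter(&pgsteal, nr_freed);    /* ...covering all the updates */
	/* the real code takes zone->lru_lock here with plain spin_lock() */
	local_irq_enable_model();
}

int main(void)
{
	account_scan(32, 20, 1);
	printf("pgscan_kswapd=%lu pgsteal=%lu kswapd_steal=%lu\n",
	       pgscan_kswapd, pgsteal, kswapd_steal);
	return 0;
}

The same batching motivates the other direction of the patch: the nr_to_reclaim and may_swap fields of scan_control go away, so shrink_zone() simply scans its full quota instead of bailing out early.
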