Diffstat (limited to 'mm')
-rw-r--r--  mm/allocpercpu.c       9
-rw-r--r--  mm/bootmem.c           6
-rw-r--r--  mm/filemap.c           1
-rw-r--r--  mm/fremap.c            2
-rw-r--r--  mm/hugetlb.c          22
-rw-r--r--  mm/memory.c            3
-rw-r--r--  mm/memory_hotplug.c    1
-rw-r--r--  mm/mempolicy.c        10
-rw-r--r--  mm/migrate.c          19
-rw-r--r--  mm/mlock.c             2
-rw-r--r--  mm/mmap.c              4
-rw-r--r--  mm/mmzone.c            5
-rw-r--r--  mm/nommu.c             6
-rw-r--r--  mm/oom_kill.c         41
-rw-r--r--  mm/page_alloc.c      281
-rw-r--r--  mm/page_io.c          45
-rw-r--r--  mm/pdflush.c           1
-rw-r--r--  mm/readahead.c         8
-rw-r--r--  mm/shmem.c             8
-rw-r--r--  mm/slab.c            286
-rw-r--r--  mm/sparse.c           23
-rw-r--r--  mm/swap.c              6
-rw-r--r--  mm/swapfile.c         92
-rw-r--r--  mm/thrash.c          116
-rw-r--r--  mm/vmscan.c           13
-rw-r--r--  mm/vmstat.c           22
26 files changed, 654 insertions, 378 deletions
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index eaa9abeea536..b2486cf887a0 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -17,10 +17,9 @@
 void percpu_depopulate(void *__pdata, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
-	if (pdata->ptrs[cpu]) {
-		kfree(pdata->ptrs[cpu]);
-		pdata->ptrs[cpu] = NULL;
-	}
+
+	kfree(pdata->ptrs[cpu]);
+	pdata->ptrs[cpu] = NULL;
 }
 EXPORT_SYMBOL_GPL(percpu_depopulate);
 
@@ -123,6 +122,8 @@ EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
  */
 void percpu_free(void *__pdata)
 {
+	if (unlikely(!__pdata))
+		return;
 	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
 	kfree(__percpu_disguise(__pdata));
 }
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d53112fcb404..00a96970b237 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -27,8 +27,6 @@ unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-EXPORT_UNUSED_SYMBOL(max_pfn);  /* June 2006 */
-
 static LIST_HEAD(bdata_list);
 #ifdef CONFIG_CRASH_DUMP
 /*
@@ -196,6 +194,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	if (limit && bdata->node_boot_start >= limit)
 		return NULL;
 
+	/* on nodes without memory - bootmem_map is NULL */
+	if (!bdata->node_bootmem_map)
+		return NULL;
+
 	end_pfn = bdata->node_low_pfn;
 	limit = PFN_DOWN(limit);
 	if (limit && end_pfn > limit)
diff --git a/mm/filemap.c b/mm/filemap.c
index 13df01c50479..af7e2f5caea9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1445,7 +1445,6 @@ no_cached_page:
 	 * effect.
 	 */
 	error = page_cache_read(file, pgoff);
-	grab_swap_token();
 
 	/*
 	 * The page we want has now been added to the page cache.
diff --git a/mm/fremap.c b/mm/fremap.c
index 7a9d0f5d246d..b77a002c3352 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -101,7 +101,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	int err = -ENOMEM;
 	pte_t *pte;
-	pte_t pte_val;
 	spinlock_t *ptl;
 
 	pte = get_locked_pte(mm, addr, &ptl);
@@ -114,7 +113,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
-	pte_val = *pte;
 	/*
 	 * We don't need to run update_mmu_cache() here because the "file pte"
 	 * being installed by install_file_pte() is not a real pte - it's a
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a088f593a807..0ccc7f230252 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -109,7 +109,7 @@ static int alloc_fresh_huge_page(void)
 	if (nid == MAX_NUMNODES)
 		nid = first_node(node_online_map);
 	if (page) {
-		page[1].lru.next = (void *)free_huge_page;	/* dtor */
+		set_compound_page_dtor(page, free_huge_page);
 		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
@@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		entry = *src_pte;
 		ptepage = pte_page(entry);
 		get_page(ptepage);
-		add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
 		set_huge_pte_at(dst, addr, dst_pte, entry);
 	}
 	spin_unlock(&src->page_table_lock);
@@ -365,6 +364,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	pte_t pte;
 	struct page *page;
 	struct page *tmp;
+	/*
+	 * A page gathering list, protected by per file i_mmap_lock. The
+	 * lock is used to avoid list corruption from multiple unmapping
+	 * of the same page since we are using page->lru.
+	 */
 	LIST_HEAD(page_list);
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
@@ -372,24 +376,21 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(end & ~HPAGE_MASK);
 
 	spin_lock(&mm->page_table_lock);
-
-	/* Update high watermark before we lower rss */
-	update_hiwater_rss(mm);
-
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
 
+		if (huge_pmd_unshare(mm, &address, ptep))
+			continue;
+
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		if (pte_none(pte))
 			continue;
 
 		page = pte_page(pte);
 		list_add(&page->lru, &page_list);
-		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
 	}
-
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
@@ -515,7 +516,6 @@ retry:
 	if (!pte_none(*ptep))
 		goto backout;
 
-	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
 				&& (vma->vm_flags & VM_SHARED)));
 	set_huge_pte_at(mm, address, ptep, new_pte);
@@ -653,11 +653,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 	BUG_ON(address >= end);
 	flush_cache_range(vma, address, end);
 
+	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
 	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
+		if (huge_pmd_unshare(mm, &address, ptep))
+			continue;
 		if (!pte_none(*ptep)) {
 			pte = huge_ptep_get_and_clear(mm, address, ptep);
 			pte = pte_mkhuge(pte_modify(pte, newprot));
@@ -666,6 +669,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
+	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
 
 	flush_tlb_range(vma, start, end);
 }
diff --git a/mm/memory.c b/mm/memory.c
index 156861fcac43..4198df0dff1c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1902,7 +1902,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 
 	return 0;
 }
-EXPORT_UNUSED_SYMBOL(vmtruncate_range);  /* June 2006 */
 
 /**
  * swapin_readahead - swap in pages in hope we need them soon
@@ -1991,6 +1990,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry);
 	if (!page) {
+		grab_swap_token(); /* Contend for token _before_ read-in */
 		swapin_readahead(entry, address, vma);
 		page = read_swap_cache_async(entry, vma, address);
 		if (!page) {
@@ -2008,7 +2008,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Had to read the page from swap area: Major fault */
 		ret = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
-		grab_swap_token();
 	}
 
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fd678a662eae..0c055a090f4d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -72,7 +72,6 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 		return ret;
 	}
 	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
-	zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
 	return 0;
 }
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 617fb31086ee..b917d6fdc1bb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	enum zone_type k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+	max++;			/* space for zlcache_ptr (see mmzone.h) */
 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
+	zl->zlcache_ptr = NULL;
 	num = 0;
 	/* First put in the highest zones from all nodes, then all the next
 	   lower zones etc. Avoid empty zones because the memory allocator
@@ -219,7 +221,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		struct page *page;
-		unsigned int nid;
+		int nid;
 
 		if (!pte_present(*pte))
 			continue;
@@ -1324,7 +1326,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 	atomic_set(&new->refcnt, 1);
 	if (new->policy == MPOL_BIND) {
 		int sz = ksize(old->v.zonelist);
-		new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL);
+		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
 		if (!new->v.zonelist) {
 			kmem_cache_free(policy_cache, new);
 			return ERR_PTR(-ENOMEM);
@@ -1705,8 +1707,8 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
  * Display pages allocated per node and memory policy via /proc.
  */
 
-static const char *policy_types[] = { "default", "prefer", "bind",
-				      "interleave" };
+static const char * const policy_types[] =
+	{ "default", "prefer", "bind", "interleave" };
 
 /*
  * Convert a mempolicy into a string.
diff --git a/mm/migrate.c b/mm/migrate.c
index b4979d423d2b..e9b161bde95b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -294,7 +294,7 @@ out:
 static int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page)
 {
-	struct page **radix_pointer;
+	void **pslot;
 
 	if (!mapping) {
 		/* Anonymous page */
@@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 
 	write_lock_irq(&mapping->tree_lock);
 
-	radix_pointer = (struct page **)radix_tree_lookup_slot(
-						&mapping->page_tree,
-						page_index(page));
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+					page_index(page));
 
 	if (page_count(page) != 2 + !!PagePrivate(page) ||
-			*radix_pointer != page) {
+		(struct page *)radix_tree_deref_slot(pslot) != page) {
 		write_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
@@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	/*
 	 * Now we know that no one else is looking at the page.
 	 */
-	get_page(newpage);
+	get_page(newpage);	/* add cache reference */
 #ifdef CONFIG_SWAP
 	if (PageSwapCache(page)) {
 		SetPageSwapCache(newpage);
@@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	}
 #endif
 
-	*radix_pointer = newpage;
+	radix_tree_replace_slot(pslot, newpage);
+
+	/*
+	 * Drop cache reference from old page.
+	 * We know this isn't the last reference.
+	 */
 	__put_page(page);
+
 	write_unlock_irq(&mapping->tree_lock);
 
 	return 0;
diff --git a/mm/mlock.c b/mm/mlock.c
index b90c59573abf..3446b7ef731e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -65,7 +65,7 @@ success:
 		ret = make_pages_present(start, end);
 	}
 
-	vma->vm_mm->locked_vm -= pages;
+	mm->locked_vm -= pages;
 out:
 	if (ret == -ENOMEM)
 		ret = -EAGAIN;
diff --git a/mm/mmap.c b/mm/mmap.c
index 7b40abd7cba2..7be110e98d4c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1736,7 +1736,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	if (mm->map_count >= sysctl_max_map_count)
 		return -ENOMEM;
 
-	new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
 
@@ -2057,7 +2057,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			vma_start < new_vma->vm_end)
 		*vmap = new_vma;
 	} else {
-		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
 			*new_vma = *vma;
 			pol = mpol_copy(vma_policy(vma));
diff --git a/mm/mmzone.c b/mm/mmzone.c
index febea1c98168..eb5838634f18 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -14,8 +14,6 @@ struct pglist_data *first_online_pgdat(void)
 	return NODE_DATA(first_online_node);
 }
 
-EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */
-
 struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 {
 	int nid = next_online_node(pgdat->node_id);
@@ -24,8 +22,6 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 		return NULL;
 	return NODE_DATA(nid);
 }
-EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */
-
 
 /*
  * next_zone - helper magic for for_each_zone()
@@ -45,5 +41,4 @@ struct zone *next_zone(struct zone *zone)
 	}
 	return zone;
 }
-EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 6a2a8aada401..af874569d0f1 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -808,10 +808,9 @@ unsigned long do_mmap_pgoff(struct file *file,
 	vm_flags = determine_vm_flags(file, prot, flags, capabilities);
 
 	/* we're going to need to record the mapping if it works */
-	vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
+	vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
 	if (!vml)
 		goto error_getting_vml;
-	memset(vml, 0, sizeof(*vml));
 
 	down_write(&nommu_vma_sem);
 
@@ -887,11 +886,10 @@ unsigned long do_mmap_pgoff(struct file *file,
 	}
 
 	/* we're going to need a VMA struct as well */
-	vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+	vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
 	if (!vma)
 		goto error_getting_vma;
 
-	memset(vma, 0, sizeof(*vma));
 	INIT_LIST_HEAD(&vma->anon_vma_node);
 	atomic_set(&vma->vm_usage, 1);
 	if (file)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2e3ce3a928b9..223d9ccb7d64 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -264,7 +264,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
  * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
  * set.
  */
-static void __oom_kill_task(struct task_struct *p, const char *message)
+static void __oom_kill_task(struct task_struct *p, int verbose)
 {
 	if (is_init(p)) {
 		WARN_ON(1);
@@ -278,10 +278,8 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
 		return;
 	}
 
-	if (message) {
-		printk(KERN_ERR "%s: Killed process %d (%s).\n",
-				message, p->pid, p->comm);
-	}
+	if (verbose)
+		printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm);
 
 	/*
 	 * We give our sacrificial lamb high priority and access to
@@ -294,7 +292,7 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
 	force_sig(SIGKILL, p);
 }
 
-static int oom_kill_task(struct task_struct *p, const char *message)
+static int oom_kill_task(struct task_struct *p)
 {
 	struct mm_struct *mm;
 	struct task_struct *g, *q;
@@ -313,15 +311,25 @@ static int oom_kill_task(struct task_struct *p, const char *message)
 	if (mm == NULL)
 		return 1;
 
-	__oom_kill_task(p, message);
+	/*
+	 * Don't kill the process if any threads are set to OOM_DISABLE
+	 */
+	do_each_thread(g, q) {
+		if (q->mm == mm && p->oomkilladj == OOM_DISABLE)
+			return 1;
+	} while_each_thread(g, q);
+
+	__oom_kill_task(p, 1);
+
 	/*
 	 * kill all processes that share the ->mm (i.e. all threads),
-	 * but are in a different thread group
+	 * but are in a different thread group. Don't let them have access
+	 * to memory reserves though, otherwise we might deplete all memory.
 	 */
-	do_each_thread(g, q)
+	do_each_thread(g, q) {
 		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q, message);
-	while_each_thread(g, q);
+			force_sig(SIGKILL, p);
+	} while_each_thread(g, q);
 
 	return 0;
 }
@@ -337,21 +345,22 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
 	 * its children or threads, just set TIF_MEMDIE so it can die quickly
 	 */
 	if (p->flags & PF_EXITING) {
-		__oom_kill_task(p, NULL);
+		__oom_kill_task(p, 0);
 		return 0;
 	}
 
-	printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li"
-			" and children.\n", p->pid, p->comm, points);
+	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
+					message, p->pid, p->comm, points);
+
 	/* Try to kill a child first */
 	list_for_each(tsk, &p->children) {
 		c = list_entry(tsk, struct task_struct, sibling);
 		if (c->mm == p->mm)
 			continue;
-		if (!oom_kill_task(c, message))
+		if (!oom_kill_task(c))
 			return 0;
 	}
-	return oom_kill_task(p, message);
+	return oom_kill_task(p);
 }
 
 static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index aa6fcc7ca66f..cace22b3ac25 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -83,14 +83,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
 
 EXPORT_SYMBOL(totalram_pages);
 
-/*
- * Used by page_zone() to look up the address of the struct zone whose
- * id is encoded in the upper bits of page->flags
- */
-struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
-EXPORT_SYMBOL(zone_table);
-
-static char *zone_names[MAX_NR_ZONES] = {
+static char * const zone_names[MAX_NR_ZONES] = {
 	"DMA",
 #ifdef CONFIG_ZONE_DMA32
 	"DMA32",
@@ -237,7 +230,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	page[1].lru.next = (void *)free_compound_page;	/* set dtor */
+	set_compound_page_dtor(page, free_compound_page);
 	page[1].lru.prev = (void *)order;
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
@@ -486,7 +479,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order)
 	spin_lock(&zone->lock);
 	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
-	__free_one_page(page, zone ,order);
+	__free_one_page(page, zone, order);
 	spin_unlock(&zone->lock);
 }
 
@@ -605,6 +598,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 			1 << PG_checked | 1 << PG_mappedtodisk);
 	set_page_private(page, 0);
 	set_page_refcounted(page);
+
+	arch_alloc_page(page, order);
 	kernel_map_pages(page, 1 << order, 1);
 
 	if (gfp_flags & __GFP_ZERO)
@@ -690,9 +685,15 @@ void drain_node_pages(int nodeid)
 
 		pcp = &pset->pcp[i];
 		if (pcp->count) {
+			int to_drain;
+
 			local_irq_save(flags);
-			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-			pcp->count = 0;
+			if (pcp->count >= pcp->batch)
+				to_drain = pcp->batch;
+			else
+				to_drain = pcp->count;
+			free_pages_bulk(zone, to_drain, &pcp->list, 0);
+			pcp->count -= to_drain;
 			local_irq_restore(flags);
 		}
 	}
@@ -700,7 +701,6 @@ void drain_node_pages(int nodeid)
 }
 #endif
 
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
 {
 	unsigned long flags;
@@ -722,7 +722,6 @@ static void __drain_pages(unsigned int cpu)
 		}
 	}
 }
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
 
 #ifdef CONFIG_PM
 
@@ -925,31 +924,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in last second) found to be nearly full.  See further
+ * comments in mmzone.h.  Reduces cache footprint of zonelist scans
+ * that have to skip over alot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * tasks mems_allowed, or node_online_map.)
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zap'd) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup, until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	nodemask_t *allowednodes;	/* zonelist_cache approximation */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return NULL;
+
+	if (jiffies - zlc->last_full_zap > 1 * HZ) {
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		zlc->last_full_zap = jiffies;
+	}
+
+	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+					&cpuset_current_mems_allowed :
+					&node_online_map;
+	return allowednodes;
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ *  1) Check that the zone isn't thought to be full (doesn't have its
+ *     bit set in the zonelist_cache fullzones BITMAP).
+ *  2) Check that the zones node (obtained from the zonelist_cache
+ *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
+ * Return true (non-zero) if zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+						nodemask_t *allowednodes)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+	int n;				/* node that zone *z is on */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return 1;
+
+	i = z - zonelist->zones;
+	n = zlc->z_to_n[i];
+
+	/* This zone is worth trying if it is allowed but not full */
+	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
+
 /*
- * get_page_from_freeliest goes through the zonelist trying to allocate
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return;
+
+	i = z - zonelist->zones;
+
+	set_bit(i, zlc->fullzones);
+}
+
+#else	/* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	return NULL;
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+				nodemask_t *allowednodes)
+{
+	return 1;
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+}
+#endif	/* CONFIG_NUMA */
+
+/*
+ * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
  */
 static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist, int alloc_flags)
 {
-	struct zone **z = zonelist->zones;
+	struct zone **z;
 	struct page *page = NULL;
-	int classzone_idx = zone_idx(*z);
+	int classzone_idx = zone_idx(zonelist->zones[0]);
 	struct zone *zone;
+	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+	int zlc_active = 0;		/* set if using zonelist_cache */
+	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 
+zonelist_scan:
 	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
+	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
+	z = zonelist->zones;
+
 	do {
+		if (NUMA_BUILD && zlc_active &&
+			!zlc_zone_worth_trying(zonelist, z, allowednodes))
+				continue;
 		zone = *z;
 		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
 			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed(zone, gfp_mask))
-				continue;
+				goto try_next_zone;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
 			unsigned long mark;
@@ -959,18 +1087,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				mark = zone->pages_low;
 			else
 				mark = zone->pages_high;
-			if (!zone_watermark_ok(zone , order, mark,
-				       classzone_idx, alloc_flags))
+			if (!zone_watermark_ok(zone, order, mark,
+				       classzone_idx, alloc_flags)) {
 				if (!zone_reclaim_mode ||
 				    !zone_reclaim(zone, gfp_mask, order))
-					continue;
+					goto this_zone_full;
+			}
 		}
 
 		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
-		if (page) {
+		if (page)
 			break;
+this_zone_full:
+		if (NUMA_BUILD)
+			zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+		if (NUMA_BUILD && !did_zlc_setup) {
+			/* we do zlc_setup after the first zone is tried */
+			allowednodes = zlc_setup(zonelist, alloc_flags);
+			zlc_active = 1;
+			did_zlc_setup = 1;
 		}
 	} while (*(++z) != NULL);
+
+	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		goto zonelist_scan;
+	}
 	return page;
 }
 
@@ -1005,9 +1149,19 @@ restart:
 	if (page)
 		goto got_pg;
 
-	do {
+	/*
+	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
+	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
+	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
+	 * using a larger set of nodes after it has established that the
+	 * allowed per node queues are empty and that nodes are
+	 * over allocated.
+	 */
+	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+		goto nopage;
+
+	for (z = zonelist->zones; *z; z++)
 		wakeup_kswapd(*z, order);
-	} while (*(++z));
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -1041,6 +1195,7 @@ restart:
 
 	/* This allocation should allow future memory freeing. */
 
+rebalance:
 	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
 			&& !in_interrupt()) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
@@ -1062,7 +1217,6 @@ nofail_alloc:
 	if (!wait)
 		goto nopage;
 
-rebalance:
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
@@ -1262,7 +1416,7 @@ unsigned int nr_free_pagecache_pages(void)
 static inline void show_node(struct zone *zone)
 {
 	if (NUMA_BUILD)
-		printk("Node %ld ", zone_to_nid(zone));
+		printk("Node %d ", zone_to_nid(zone));
 }
 
 void si_meminfo(struct sysinfo *val)
@@ -1542,6 +1696,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zonelist *zonelist;
+		struct zonelist_cache *zlc;
+		struct zone **z;
+
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		for (z = zonelist->zones; *z; z++)
+			zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
+	}
+}
+
 #else	/* CONFIG_NUMA */
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1579,14 +1751,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		pgdat->node_zonelists[i].zlcache_ptr = NULL;
+}
+
 #endif	/* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
 static int __meminit __build_all_zonelists(void *dummy)
 {
 	int nid;
-	for_each_online_node(nid)
+
+	for_each_online_node(nid) {
 		build_zonelists(NODE_DATA(nid));
+		build_zonelist_cache(NODE_DATA(nid));
+	}
 	return 0;
 }
 
@@ -1715,20 +1899,6 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	}
 }
 
-#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
-		   unsigned long pfn, unsigned long size)
-{
-	unsigned long snum = pfn_to_section_nr(pfn);
-	unsigned long end = pfn_to_section_nr(pfn + size);
-
-	if (FLAGS_HAS_NODE)
-		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
-	else
-		for (; snum <= end; snum++)
-			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -1881,16 +2051,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
 	int ret = NOTIFY_OK;
 
 	switch (action) {
-		case CPU_UP_PREPARE:
-			if (process_zones(cpu))
-				ret = NOTIFY_BAD;
-			break;
-		case CPU_UP_CANCELED:
-		case CPU_DEAD:
-			free_zone_pagesets(cpu);
-			break;
-		default:
-			break;
+	case CPU_UP_PREPARE:
+		if (process_zones(cpu))
+			ret = NOTIFY_BAD;
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		free_zone_pagesets(cpu);
+		break;
+	default:
+		break;
 	}
 	return ret;
 }
@@ -2421,7 +2591,6 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		if (!size)
 			continue;
 
-		zonetable_add(zone, nid, j, zone_start_pfn, size);
 		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
 		BUG_ON(ret);
 		zone_start_pfn += size;
@@ -2736,7 +2905,6 @@ void __init free_area_init(unsigned long *zones_size)
 			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int page_alloc_cpu_notify(struct notifier_block *self,
 				 unsigned long action, void *hcpu)
 {
@@ -2751,7 +2919,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
 	}
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 void __init page_alloc_init(void)
 {
@@ -3055,7 +3222,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	/* allow the kernel cmdline to have a say */
 	if (!numentries) {
 		/* round applicable memory size up to nearest megabyte */
-		numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
+		numentries = nr_kernel_pages;
 		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
 		numentries >>= 20 - PAGE_SHIFT;
 		numentries <<= 20 - PAGE_SHIFT;
diff --git a/mm/page_io.c b/mm/page_io.c
index d4840ecbf8f9..dbffec0d78c9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -147,48 +147,3 @@ int swap_readpage(struct file *file, struct page *page)
 out:
 	return ret;
 }
-
-#ifdef CONFIG_SOFTWARE_SUSPEND
-/*
- * A scruffy utility function to read or write an arbitrary swap page
- * and wait on the I/O.  The caller must have a ref on the page.
- *
- * We use end_swap_bio_read() even for writes, because it happens to do what
- * we want.
- */
-int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
-			struct bio **bio_chain)
-{
-	struct bio *bio;
-	int ret = 0;
-	int bio_rw;
-
-	lock_page(page);
-
-	bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read);
-	if (bio == NULL) {
-		unlock_page(page);
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	bio_rw = rw;
-	if (!bio_chain)
-		bio_rw |= (1 << BIO_RW_SYNC);
-	if (bio_chain)
-		bio_get(bio);
-	submit_bio(bio_rw, bio);
-	if (bio_chain == NULL) {
-		wait_on_page_locked(page);
-
-		if (!PageUptodate(page) || PageError(page))
-			ret = -EIO;
-	}
-	if (bio_chain) {
-		bio->bi_private = *bio_chain;
-		*bio_chain = bio;
-	}
-out:
-	return ret;
-}
-#endif
diff --git a/mm/pdflush.c b/mm/pdflush.c
index b02102feeb4b..8ce0900dc95c 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -21,6 +21,7 @@
 #include <linux/writeback.h>	// Prototypes pdflush_operation()
 #include <linux/kthread.h>
 #include <linux/cpuset.h>
+#include <linux/freezer.h>
 
 
 /*
diff --git a/mm/readahead.c b/mm/readahead.c
index 23cb61a01c6e..a386f2b6b335 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -148,13 +148,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
 		if (!pagevec_add(&lru_pvec, page))
 			__pagevec_lru_add(&lru_pvec);
 		if (ret) {
-			while (!list_empty(pages)) {
-				struct page *victim;
-
-				victim = list_to_page(pages);
-				list_del(&victim->lru);
-				page_cache_release(victim);
-			}
+			put_pages_list(pages);
 			break;
 		}
 	}
diff --git a/mm/shmem.c b/mm/shmem.c
index 4959535fc14c..c820b4f77b8d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -177,7 +177,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 
 static struct super_operations shmem_ops;
 static const struct address_space_operations shmem_aops;
-static struct file_operations shmem_file_operations;
+static const struct file_operations shmem_file_operations;
 static struct inode_operations shmem_inode_operations;
 static struct inode_operations shmem_dir_inode_operations;
 static struct inode_operations shmem_special_inode_operations;
@@ -1943,7 +1943,7 @@ static int shmem_xattr_security_set(struct inode *inode, const char *name,
 	return security_inode_setsecurity(inode, name, value, size, flags);
 }
 
-struct xattr_handler shmem_xattr_security_handler = {
+static struct xattr_handler shmem_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.list   = shmem_xattr_security_list,
 	.get    = shmem_xattr_security_get,
@@ -2263,7 +2263,7 @@ static struct kmem_cache *shmem_inode_cachep;
 static struct inode *shmem_alloc_inode(struct super_block *sb)
 {
 	struct shmem_inode_info *p;
-	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL);
+	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
 	if (!p)
 		return NULL;
 	return &p->vfs_inode;
@@ -2319,7 +2319,7 @@ static const struct address_space_operations shmem_aops = {
 	.migratepage	= migrate_page,
 };
 
-static struct file_operations shmem_file_operations = {
+static const struct file_operations shmem_file_operations = {
 	.mmap		= shmem_mmap,
 #ifdef CONFIG_TMPFS
 	.llseek		= generic_file_llseek,
diff --git a/mm/slab.c b/mm/slab.c
index 5de81473df34..068cb4503c15 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -103,12 +103,12 @@
103#include <linux/module.h> 103#include <linux/module.h>
104#include <linux/rcupdate.h> 104#include <linux/rcupdate.h>
105#include <linux/string.h> 105#include <linux/string.h>
106#include <linux/uaccess.h>
106#include <linux/nodemask.h> 107#include <linux/nodemask.h>
107#include <linux/mempolicy.h> 108#include <linux/mempolicy.h>
108#include <linux/mutex.h> 109#include <linux/mutex.h>
109#include <linux/rtmutex.h> 110#include <linux/rtmutex.h>
110 111
111#include <asm/uaccess.h>
112#include <asm/cacheflush.h> 112#include <asm/cacheflush.h>
113#include <asm/tlbflush.h> 113#include <asm/tlbflush.h>
114#include <asm/page.h> 114#include <asm/page.h>
@@ -730,7 +730,10 @@ static inline void init_lock_keys(void)
730} 730}
731#endif 731#endif
732 732
733/* Guard access to the cache-chain. */ 733/*
734 * 1. Guard access to the cache-chain.
735 * 2. Protect sanity of cpu_online_map against cpu hotplug events
736 */
734static DEFINE_MUTEX(cache_chain_mutex); 737static DEFINE_MUTEX(cache_chain_mutex);
735static struct list_head cache_chain; 738static struct list_head cache_chain;
736 739
@@ -866,6 +869,22 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
866 dump_stack(); 869 dump_stack();
867} 870}
868 871
872/*
873 * By default on NUMA we use alien caches to stage the freeing of
874 * objects allocated from other nodes. This causes massive memory
875 * inefficiencies when using fake NUMA setup to split memory into a
876 * large number of small nodes, so it can be disabled on the command
877 * line
878 */
879
880static int use_alien_caches __read_mostly = 1;
881static int __init noaliencache_setup(char *s)
882{
883 use_alien_caches = 0;
884 return 1;
885}
886__setup("noaliencache", noaliencache_setup);
887
869#ifdef CONFIG_NUMA 888#ifdef CONFIG_NUMA
870/* 889/*
871 * Special reaping functions for NUMA systems called from cache_reap(). 890 * Special reaping functions for NUMA systems called from cache_reap().
@@ -996,7 +1015,7 @@ static inline void *alternate_node_alloc(struct kmem_cache *cachep,
996 return NULL; 1015 return NULL;
997} 1016}
998 1017
999static inline void *__cache_alloc_node(struct kmem_cache *cachep, 1018static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1000 gfp_t flags, int nodeid) 1019 gfp_t flags, int nodeid)
1001{ 1020{
1002 return NULL; 1021 return NULL;
@@ -1004,7 +1023,7 @@ static inline void *__cache_alloc_node(struct kmem_cache *cachep,
1004 1023
1005#else /* CONFIG_NUMA */ 1024#else /* CONFIG_NUMA */
1006 1025
1007static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 1026static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1008static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1027static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1009 1028
1010static struct array_cache **alloc_alien_cache(int node, int limit) 1029static struct array_cache **alloc_alien_cache(int node, int limit)
@@ -1114,7 +1133,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1114 * Make sure we are not freeing a object from another node to the array 1133 * Make sure we are not freeing a object from another node to the array
1115 * cache on this cpu. 1134 * cache on this cpu.
1116 */ 1135 */
1117 if (likely(slabp->nodeid == node)) 1136 if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
1118 return 0; 1137 return 0;
1119 1138
1120 l3 = cachep->nodelists[node]; 1139 l3 = cachep->nodelists[node];
@@ -1192,7 +1211,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1192 list_for_each_entry(cachep, &cache_chain, next) { 1211 list_for_each_entry(cachep, &cache_chain, next) {
1193 struct array_cache *nc; 1212 struct array_cache *nc;
1194 struct array_cache *shared; 1213 struct array_cache *shared;
1195 struct array_cache **alien; 1214 struct array_cache **alien = NULL;
1196 1215
1197 nc = alloc_arraycache(node, cachep->limit, 1216 nc = alloc_arraycache(node, cachep->limit,
1198 cachep->batchcount); 1217 cachep->batchcount);
@@ -1204,9 +1223,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1204 if (!shared) 1223 if (!shared)
1205 goto bad; 1224 goto bad;
1206 1225
1207 alien = alloc_alien_cache(node, cachep->limit); 1226 if (use_alien_caches) {
1208 if (!alien) 1227 alien = alloc_alien_cache(node, cachep->limit);
1209 goto bad; 1228 if (!alien)
1229 goto bad;
1230 }
1210 cachep->array[cpu] = nc; 1231 cachep->array[cpu] = nc;
1211 l3 = cachep->nodelists[node]; 1232 l3 = cachep->nodelists[node];
1212 BUG_ON(!l3); 1233 BUG_ON(!l3);
@@ -1230,12 +1251,18 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1230 kfree(shared); 1251 kfree(shared);
1231 free_alien_cache(alien); 1252 free_alien_cache(alien);
1232 } 1253 }
1233 mutex_unlock(&cache_chain_mutex);
1234 break; 1254 break;
1235 case CPU_ONLINE: 1255 case CPU_ONLINE:
1256 mutex_unlock(&cache_chain_mutex);
1236 start_cpu_timer(cpu); 1257 start_cpu_timer(cpu);
1237 break; 1258 break;
1238#ifdef CONFIG_HOTPLUG_CPU 1259#ifdef CONFIG_HOTPLUG_CPU
1260 case CPU_DOWN_PREPARE:
1261 mutex_lock(&cache_chain_mutex);
1262 break;
1263 case CPU_DOWN_FAILED:
1264 mutex_unlock(&cache_chain_mutex);
1265 break;
1239 case CPU_DEAD: 1266 case CPU_DEAD:
1240 /* 1267 /*
1241 * Even if all the cpus of a node are down, we don't free the 1268 * Even if all the cpus of a node are down, we don't free the
@@ -1246,8 +1273,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1246 * gets destroyed at kmem_cache_destroy(). 1273 * gets destroyed at kmem_cache_destroy().
1247 */ 1274 */
1248 /* fall thru */ 1275 /* fall thru */
1276#endif
1249 case CPU_UP_CANCELED: 1277 case CPU_UP_CANCELED:
1250 mutex_lock(&cache_chain_mutex);
1251 list_for_each_entry(cachep, &cache_chain, next) { 1278 list_for_each_entry(cachep, &cache_chain, next) {
1252 struct array_cache *nc; 1279 struct array_cache *nc;
1253 struct array_cache *shared; 1280 struct array_cache *shared;
@@ -1308,11 +1335,9 @@ free_array_cache:
1308 } 1335 }
1309 mutex_unlock(&cache_chain_mutex); 1336 mutex_unlock(&cache_chain_mutex);
1310 break; 1337 break;
1311#endif
1312 } 1338 }
1313 return NOTIFY_OK; 1339 return NOTIFY_OK;
1314bad: 1340bad:
1315 mutex_unlock(&cache_chain_mutex);
1316 return NOTIFY_BAD; 1341 return NOTIFY_BAD;
1317} 1342}
1318 1343
@@ -1580,12 +1605,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1580 flags |= __GFP_COMP; 1605 flags |= __GFP_COMP;
1581#endif 1606#endif
1582 1607
1583 /* 1608 flags |= cachep->gfpflags;
1584 * Under NUMA we want memory on the indicated node. We will handle
1585 * the needed fallback ourselves since we want to serve from our
1586 * per node object lists first for other nodes.
1587 */
1588 flags |= cachep->gfpflags | GFP_THISNODE;
1589 1609
1590 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1610 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1591 if (!page) 1611 if (!page)
@@ -2098,15 +2118,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2098 } 2118 }
2099 2119
2100 /* 2120 /*
2101 * Prevent CPUs from coming and going. 2121 * We use cache_chain_mutex to ensure a consistent view of
2102 * lock_cpu_hotplug() nests outside cache_chain_mutex 2122 * cpu_online_map as well. Please see cpuup_callback
2103 */ 2123 */
2104 lock_cpu_hotplug();
2105
2106 mutex_lock(&cache_chain_mutex); 2124 mutex_lock(&cache_chain_mutex);
2107 2125
2108 list_for_each_entry(pc, &cache_chain, next) { 2126 list_for_each_entry(pc, &cache_chain, next) {
2109 mm_segment_t old_fs = get_fs();
2110 char tmp; 2127 char tmp;
2111 int res; 2128 int res;
2112 2129
@@ -2115,9 +2132,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2115 * destroy its slab cache and no-one else reuses the vmalloc 2132 * destroy its slab cache and no-one else reuses the vmalloc
2116 * area of the module. Print a warning. 2133 * area of the module. Print a warning.
2117 */ 2134 */
2118 set_fs(KERNEL_DS); 2135 res = probe_kernel_address(pc->name, tmp);
2119 res = __get_user(tmp, pc->name);
2120 set_fs(old_fs);
2121 if (res) { 2136 if (res) {
2122 printk("SLAB: cache with size %d has lost its name\n", 2137 printk("SLAB: cache with size %d has lost its name\n",
2123 pc->buffer_size); 2138 pc->buffer_size);
@@ -2197,25 +2212,24 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
 		ralign = BYTES_PER_WORD;
 
-	/* 2) arch mandated alignment: disables debug if necessary */
+	/* 2) arch mandated alignment */
 	if (ralign < ARCH_SLAB_MINALIGN) {
 		ralign = ARCH_SLAB_MINALIGN;
-		if (ralign > BYTES_PER_WORD)
-			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	}
-	/* 3) caller mandated alignment: disables debug if necessary */
+	/* 3) caller mandated alignment */
 	if (ralign < align) {
 		ralign = align;
-		if (ralign > BYTES_PER_WORD)
-			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	}
+	/* disable debug if necessary */
+	if (ralign > BYTES_PER_WORD)
+		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	/*
 	 * 4) Store it.
 	 */
 	align = ralign;
 
 	/* Get cache's description obj. */
-	cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
+	cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
 	if (!cachep)
 		goto oops;
 
@@ -2326,7 +2340,6 @@ oops:
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
 	mutex_unlock(&cache_chain_mutex);
-	unlock_cpu_hotplug();
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -2444,6 +2457,7 @@ out:
 	return nr_freed;
 }
 
+/* Called with cache_chain_mutex held to protect against cpu hotplug */
 static int __cache_shrink(struct kmem_cache *cachep)
 {
 	int ret = 0, i = 0;
@@ -2474,9 +2488,13 @@ static int __cache_shrink(struct kmem_cache *cachep)
  */
 int kmem_cache_shrink(struct kmem_cache *cachep)
 {
+	int ret;
 	BUG_ON(!cachep || in_interrupt());
 
-	return __cache_shrink(cachep);
+	mutex_lock(&cache_chain_mutex);
+	ret = __cache_shrink(cachep);
+	mutex_unlock(&cache_chain_mutex);
+	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
 
@@ -2500,23 +2518,16 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
 {
 	BUG_ON(!cachep || in_interrupt());
 
-	/* Don't let CPUs to come and go */
-	lock_cpu_hotplug();
-
 	/* Find the cache in the chain of caches. */
 	mutex_lock(&cache_chain_mutex);
 	/*
 	 * the chain is never empty, cache_cache is never destroyed
 	 */
 	list_del(&cachep->next);
-	mutex_unlock(&cache_chain_mutex);
-
 	if (__cache_shrink(cachep)) {
 		slab_error(cachep, "Can't free all objects");
-		mutex_lock(&cache_chain_mutex);
 		list_add(&cachep->next, &cache_chain);
 		mutex_unlock(&cache_chain_mutex);
-		unlock_cpu_hotplug();
 		return;
 	}
 
@@ -2524,7 +2535,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
 	synchronize_rcu();
 
 	__kmem_cache_destroy(cachep);
-	unlock_cpu_hotplug();
+	mutex_unlock(&cache_chain_mutex);
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
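With the hunks above, kmem_cache_shrink() and kmem_cache_destroy() take cache_chain_mutex themselves instead of bracketing the operation with lock_cpu_hotplug(). A minimal, hypothetical user of that lifecycle; the cache name, object type and init/exit function names are invented, while the six-argument kmem_cache_create() signature matches this kernel generation:

	#include <linux/module.h>
	#include <linux/slab.h>

	struct my_obj {				/* made-up object type */
		int id;
		void *payload;
	};

	static struct kmem_cache *my_cachep;

	static int __init my_cache_init(void)
	{
		my_cachep = kmem_cache_create("my_obj_cache", sizeof(struct my_obj),
					      0, 0, NULL, NULL);
		return my_cachep ? 0 : -ENOMEM;
	}

	static void __exit my_cache_exit(void)
	{
		/* Serialized against cpu hotplug via cache_chain_mutex internally. */
		kmem_cache_destroy(my_cachep);
	}

	module_init(my_cache_init);
	module_exit(my_cache_exit);
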
@@ -2548,7 +2559,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
 	if (OFF_SLAB(cachep)) {
 		/* Slab management obj is off-slab. */
 		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
-					      local_flags, nodeid);
+					      local_flags & ~GFP_THISNODE, nodeid);
 		if (!slabp)
 			return NULL;
 	} else {
@@ -2618,7 +2629,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 
 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
 {
-	if (flags & SLAB_DMA)
+	if (flags & GFP_DMA)
 		BUG_ON(!(cachep->gfpflags & GFP_DMA));
 	else
 		BUG_ON(cachep->gfpflags & GFP_DMA);
@@ -2689,10 +2700,10 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
  * Grow (by 1) the number of slabs within a cache. This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static int cache_grow(struct kmem_cache *cachep,
+		gfp_t flags, int nodeid, void *objp)
 {
 	struct slab *slabp;
-	void *objp;
 	size_t offset;
 	gfp_t local_flags;
 	unsigned long ctor_flags;
@@ -2702,12 +2713,12 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	 * Be lazy and only check for valid flags here, keeping it out of the
 	 * critical path in kmem_cache_alloc().
 	 */
-	BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW));
-	if (flags & SLAB_NO_GROW)
+	BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
+	if (flags & __GFP_NO_GROW)
 		return 0;
 
 	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
-	local_flags = (flags & SLAB_LEVEL_MASK);
+	local_flags = (flags & GFP_LEVEL_MASK);
 	if (!(local_flags & __GFP_WAIT))
 		/*
 		 * Not allowed to sleep. Need to tell a constructor about
@@ -2744,12 +2755,14 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	 * Get mem for the objs. Attempt to allocate a physical page from
 	 * 'nodeid'.
 	 */
-	objp = kmem_getpages(cachep, flags, nodeid);
+	if (!objp)
+		objp = kmem_getpages(cachep, flags, nodeid);
 	if (!objp)
 		goto failed;
 
 	/* Get slab management. */
-	slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid);
+	slabp = alloc_slabmgmt(cachep, objp, offset,
+			local_flags & ~GFP_THISNODE, nodeid);
 	if (!slabp)
 		goto opps1;
 
@@ -2987,7 +3000,7 @@ alloc_done:
 
 	if (unlikely(!ac->avail)) {
 		int x;
-		x = cache_grow(cachep, flags, node);
+		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
 
 		/* cache_grow can reenable interrupts, then ac could change. */
 		ac = cpu_cache_get(cachep);
@@ -3063,6 +3076,12 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 
 		cachep->ctor(objp, cachep, ctor_flags);
 	}
+#if ARCH_SLAB_MINALIGN
+	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
+		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+		       objp, ARCH_SLAB_MINALIGN);
+	}
+#endif
 	return objp;
 }
 #else
@@ -3105,10 +3124,10 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
 	objp = ____cache_alloc(cachep, flags);
 	/*
 	 * We may just have run out of memory on the local node.
-	 * __cache_alloc_node() knows how to locate memory on other nodes
+	 * ____cache_alloc_node() knows how to locate memory on other nodes
 	 */
 	if (NUMA_BUILD && !objp)
-		objp = __cache_alloc_node(cachep, flags, numa_node_id());
+		objp = ____cache_alloc_node(cachep, flags, numa_node_id());
 	local_irq_restore(save_flags);
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
 					    caller);
@@ -3135,15 +3154,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
 	else if (current->mempolicy)
 		nid_alloc = slab_node(current->mempolicy);
 	if (nid_alloc != nid_here)
-		return __cache_alloc_node(cachep, flags, nid_alloc);
+		return ____cache_alloc_node(cachep, flags, nid_alloc);
 	return NULL;
 }
 
 /*
  * Fallback function if there was no memory available and no objects on a
- * certain node and we are allowed to fall back. We mimick the behavior of
- * the page allocator. We fall back according to a zonelist determined by
- * the policy layer while obeying cpuset constraints.
+ * certain node and fall back is permitted. First we scan all the
+ * available nodelists for available objects. If that fails then we
+ * perform an allocation without specifying a node. This allows the page
+ * allocator to do its reclaim / fallback magic. We then insert the
+ * slab into the proper nodelist and then allocate from it.
  */
 void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 {
@@ -3151,15 +3172,51 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 		->node_zonelists[gfp_zone(flags)];
 	struct zone **z;
 	void *obj = NULL;
+	int nid;
 
+retry:
+	/*
+	 * Look through allowed nodes for objects available
+	 * from existing per node queues.
+	 */
 	for (z = zonelist->zones; *z && !obj; z++) {
-		int nid = zone_to_nid(*z);
+		nid = zone_to_nid(*z);
+
+		if (cpuset_zone_allowed(*z, flags) &&
+			cache->nodelists[nid] &&
+			cache->nodelists[nid]->free_objects)
+				obj = ____cache_alloc_node(cache,
+					flags | GFP_THISNODE, nid);
+	}
 
-		if (zone_idx(*z) <= ZONE_NORMAL &&
-				cpuset_zone_allowed(*z, flags) &&
-				cache->nodelists[nid])
-			obj = __cache_alloc_node(cache,
-					flags | __GFP_THISNODE, nid);
+	if (!obj) {
+		/*
+		 * This allocation will be performed within the constraints
+		 * of the current cpuset / memory policy requirements.
+		 * We may trigger various forms of reclaim on the allowed
+		 * set and go into memory reserves if necessary.
+		 */
+		obj = kmem_getpages(cache, flags, -1);
+		if (obj) {
+			/*
+			 * Insert into the appropriate per node queues
+			 */
+			nid = page_to_nid(virt_to_page(obj));
+			if (cache_grow(cache, flags, nid, obj)) {
+				obj = ____cache_alloc_node(cache,
+					flags | GFP_THISNODE, nid);
+				if (!obj)
+					/*
+					 * Another processor may allocate the
+					 * objects in the slab since we are
+					 * not holding any locks.
+					 */
+					goto retry;
+			} else {
+				kmem_freepages(cache, obj);
+				obj = NULL;
+			}
+		}
 	}
 	return obj;
 }
@@ -3167,7 +3224,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 /*
  * A interface to enable slab creation on nodeid
  */
-static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
 				int nodeid)
 {
 	struct list_head *entry;
@@ -3216,7 +3273,7 @@ retry:
 
 must_grow:
 	spin_unlock(&l3->list_lock);
-	x = cache_grow(cachep, flags, nodeid);
+	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
 	if (x)
 		goto retry;
 
@@ -3434,35 +3491,59 @@ out:
  * @flags: See kmalloc().
  * @nodeid: node number of the target node.
  *
- * Identical to kmem_cache_alloc, except that this function is slow
- * and can sleep. And it will allocate memory on the given node, which
- * can improve the performance for cpu bound structures.
- * New and improved: it will now make sure that the object gets
- * put on the correct node list so that there is no false sharing.
+ * Identical to kmem_cache_alloc but it will allocate memory on the given
+ * node, which can improve the performance for cpu bound structures.
+ *
+ * Fallback to other node is possible if __GFP_THISNODE is not set.
  */
-void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static __always_inline void *
+__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+		int nodeid, void *caller)
 {
 	unsigned long save_flags;
-	void *ptr;
+	void *ptr = NULL;
 
 	cache_alloc_debugcheck_before(cachep, flags);
 	local_irq_save(save_flags);
 
-	if (nodeid == -1 || nodeid == numa_node_id() ||
-			!cachep->nodelists[nodeid])
-		ptr = ____cache_alloc(cachep, flags);
-	else
-		ptr = __cache_alloc_node(cachep, flags, nodeid);
-	local_irq_restore(save_flags);
+	if (unlikely(nodeid == -1))
+		nodeid = numa_node_id();
+
+	if (likely(cachep->nodelists[nodeid])) {
+		if (nodeid == numa_node_id()) {
+			/*
+			 * Use the locally cached objects if possible.
+			 * However ____cache_alloc does not allow fallback
+			 * to other nodes. It may fail while we still have
+			 * objects on other nodes available.
+			 */
+			ptr = ____cache_alloc(cachep, flags);
+		}
+		if (!ptr) {
+			/* ___cache_alloc_node can fall back to other nodes */
+			ptr = ____cache_alloc_node(cachep, flags, nodeid);
+		}
+	} else {
+		/* Node not bootstrapped yet */
+		if (!(flags & __GFP_THISNODE))
+			ptr = fallback_alloc(cachep, flags);
+	}
 
-	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
-					__builtin_return_address(0));
+	local_irq_restore(save_flags);
+	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
 
 	return ptr;
 }
+
+void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+{
+	return __cache_alloc_node(cachep, flags, nodeid,
+			__builtin_return_address(0));
+}
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
 {
 	struct kmem_cache *cachep;
 
@@ -3471,8 +3552,29 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 		return NULL;
 	return kmem_cache_alloc_node(cachep, flags, node);
 }
+
+#ifdef CONFIG_DEBUG_SLAB
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __do_kmalloc_node(size, flags, node,
+			__builtin_return_address(0));
+}
 EXPORT_SYMBOL(__kmalloc_node);
-#endif
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+		int node, void *caller)
+{
+	return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+#else
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __do_kmalloc_node(size, flags, node, NULL);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+#endif /* CONFIG_DEBUG_SLAB */
+#endif /* CONFIG_NUMA */
 
 /**
  * __do_kmalloc - allocate memory
@@ -3583,13 +3685,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 	int node;
 	struct kmem_list3 *l3;
 	struct array_cache *new_shared;
-	struct array_cache **new_alien;
+	struct array_cache **new_alien = NULL;
 
 	for_each_online_node(node) {
 
-		new_alien = alloc_alien_cache(node, cachep->limit);
-		if (!new_alien)
-			goto fail;
+		if (use_alien_caches) {
+			new_alien = alloc_alien_cache(node, cachep->limit);
+			if (!new_alien)
+				goto fail;
+		}
 
 		new_shared = alloc_arraycache(node,
 				cachep->shared*cachep->batchcount,
@@ -4038,7 +4142,7 @@ static int s_show(struct seq_file *m, void *p)
  * + further values on SMP and with statistics enabled
  */
 
-struct seq_operations slabinfo_op = {
+const struct seq_operations slabinfo_op = {
 	.start = s_start,
 	.next = s_next,
 	.stop = s_stop,
@@ -4236,7 +4340,7 @@ static int leaks_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-struct seq_operations slabstats_op = {
+const struct seq_operations slabstats_op = {
 	.start = leaks_start,
 	.next = s_next,
 	.stop = s_stop,
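Taken together, the slab changes above route node-specific requests through GFP_THISNODE internally and fall back via fallback_alloc() only when the caller allows it. A hedged caller-side sketch of the resulting semantics, using only the exported kmem_cache_alloc_node() interface; the helper name and the two-step retry are illustrative, not part of the patch:

	#include <linux/slab.h>
	#include <linux/gfp.h>

	/* Hypothetical helper: prefer node 'nid', tolerate any node as a last resort. */
	static void *alloc_prefer_node(struct kmem_cache *cachep, int nid)
	{
		void *obj;

		/* Hard constraint: no fallback, may return NULL if 'nid' is exhausted. */
		obj = kmem_cache_alloc_node(cachep, GFP_KERNEL | __GFP_THISNODE, nid);
		if (obj)
			return obj;

		/* Without __GFP_THISNODE, fallback_alloc() may serve the request
		 * from another node's lists or grow a new slab wherever the page
		 * allocator finds memory. */
		return kmem_cache_alloc_node(cachep, GFP_KERNEL, nid);
	}
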
diff --git a/mm/sparse.c b/mm/sparse.c
index b3c82ba30012..ac26eb0d73cd 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -24,6 +24,25 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
 #endif
 EXPORT_SYMBOL(mem_section);
 
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+/*
+ * If we did not store the node number in the page then we have to
+ * do a lookup in the section_to_node_table in order to find which
+ * node the page belongs to.
+ */
+#if MAX_NUMNODES <= 256
+static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#else
+static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#endif
+
+int page_to_nid(struct page *page)
+{
+	return section_to_node_table[page_to_section(page)];
+}
+EXPORT_SYMBOL(page_to_nid);
+#endif
+
 #ifdef CONFIG_SPARSEMEM_EXTREME
 static struct mem_section *sparse_index_alloc(int nid)
 {
@@ -49,6 +68,10 @@ static int sparse_index_init(unsigned long section_nr, int nid)
 	struct mem_section *section;
 	int ret = 0;
 
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+	section_to_node_table[section_nr] = nid;
+#endif
+
 	if (mem_section[root])
 		return -EEXIST;
 
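With NODE_NOT_IN_PAGE_FLAGS, page_to_nid() above becomes a per-section table lookup instead of a field extracted from page->flags. A small hypothetical caller, just to show that users are unaffected by which variant is compiled in (the per-node counter array is invented):

	#include <linux/mm.h>

	/* Hypothetical per-node accounting keyed by the page's home node. */
	static void account_page_node(struct page *page, atomic_long_t *node_count)
	{
		int nid = page_to_nid(page);	/* table lookup or page->flags, as configured */

		atomic_long_inc(&node_count[nid]);
	}
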
diff --git a/mm/swap.c b/mm/swap.c
index d9a3770d8f3c..2ed7be39795e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -57,9 +57,9 @@ static void put_compound_page(struct page *page)
 {
 	page = (struct page *)page_private(page);
 	if (put_page_testzero(page)) {
-		void (*dtor)(struct page *page);
+		compound_page_dtor *dtor;
 
-		dtor = (void (*)(struct page *))page[1].lru.next;
+		dtor = get_compound_page_dtor(page);
 		(*dtor)(page);
 	}
 }
@@ -514,5 +514,7 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+#ifdef CONFIG_HOTPLUG_CPU
 	hotcpu_notifier(cpu_swap_callback, 0);
+#endif
 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a15def63f28f..c5431072f422 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -427,34 +427,48 @@ void free_swap_and_cache(swp_entry_t entry)
 
 #ifdef CONFIG_SOFTWARE_SUSPEND
 /*
- * Find the swap type that corresponds to given device (if any)
+ * Find the swap type that corresponds to given device (if any).
  *
- * This is needed for software suspend and is done in such a way that inode
- * aliasing is allowed.
+ * @offset - number of the PAGE_SIZE-sized block of the device, starting
+ * from 0, in which the swap header is expected to be located.
+ *
+ * This is needed for the suspend to disk (aka swsusp).
 */
-int swap_type_of(dev_t device)
+int swap_type_of(dev_t device, sector_t offset)
 {
+	struct block_device *bdev = NULL;
 	int i;
 
+	if (device)
+		bdev = bdget(device);
+
 	spin_lock(&swap_lock);
 	for (i = 0; i < nr_swapfiles; i++) {
-		struct inode *inode;
+		struct swap_info_struct *sis = swap_info + i;
 
-		if (!(swap_info[i].flags & SWP_WRITEOK))
+		if (!(sis->flags & SWP_WRITEOK))
 			continue;
 
-		if (!device) {
+		if (!bdev) {
 			spin_unlock(&swap_lock);
 			return i;
 		}
-		inode = swap_info[i].swap_file->f_dentry->d_inode;
-		if (S_ISBLK(inode->i_mode) &&
-		    device == MKDEV(imajor(inode), iminor(inode))) {
-			spin_unlock(&swap_lock);
-			return i;
+		if (bdev == sis->bdev) {
+			struct swap_extent *se;
+
+			se = list_entry(sis->extent_list.next,
+					struct swap_extent, list);
+			if (se->start_block == offset) {
+				spin_unlock(&swap_lock);
+				bdput(bdev);
+				return i;
+			}
 		}
 	}
 	spin_unlock(&swap_lock);
+	if (bdev)
+		bdput(bdev);
+
 	return -ENODEV;
 }
 
@@ -931,6 +945,23 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
 	}
 }
 
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ */
+sector_t swapdev_block(int swap_type, pgoff_t offset)
+{
+	struct swap_info_struct *sis;
+
+	if (swap_type >= nr_swapfiles)
+		return 0;
+
+	sis = swap_info + swap_type;
+	return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
+}
+#endif /* CONFIG_SOFTWARE_SUSPEND */
+
 /*
  * Free all of a swapdev's extent information
  */
@@ -1274,10 +1305,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
 
 	mutex_lock(&swapon_mutex);
 
+	if (!l)
+		return SEQ_START_TOKEN;
+
 	for (i = 0; i < nr_swapfiles; i++, ptr++) {
 		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
 			continue;
-		if (!l--)
+		if (!--l)
 			return ptr;
 	}
 
@@ -1286,10 +1320,17 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
 
 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
 {
-	struct swap_info_struct *ptr = v;
+	struct swap_info_struct *ptr;
 	struct swap_info_struct *endptr = swap_info + nr_swapfiles;
 
-	for (++ptr; ptr < endptr; ptr++) {
+	if (v == SEQ_START_TOKEN)
+		ptr = swap_info;
+	else {
+		ptr = v;
+		ptr++;
+	}
+
+	for (; ptr < endptr; ptr++) {
 		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
 			continue;
 		++*pos;
@@ -1310,8 +1351,10 @@ static int swap_show(struct seq_file *swap, void *v)
 	struct file *file;
 	int len;
 
-	if (v == swap_info)
-		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+	if (ptr == SEQ_START_TOKEN) {
+		seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+		return 0;
+	}
 
 	file = ptr->swap_file;
 	len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
@@ -1325,7 +1368,7 @@ static int swap_show(struct seq_file *swap, void *v)
 	return 0;
 }
 
-static struct seq_operations swaps_op = {
+static const struct seq_operations swaps_op = {
 	.start = swap_start,
 	.next = swap_next,
 	.stop = swap_stop,
@@ -1337,7 +1380,7 @@ static int swaps_open(struct inode *inode, struct file *file)
 	return seq_open(file, &swaps_op);
 }
 
-static struct file_operations proc_swaps_operations = {
+static const struct file_operations proc_swaps_operations = {
 	.open = swaps_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
@@ -1540,6 +1583,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		error = -EINVAL;
 		if (!maxpages)
 			goto bad_swap;
+		if (swapfilesize && maxpages > swapfilesize) {
+			printk(KERN_WARNING
+			       "Swap area shorter than signature indicates\n");
+			goto bad_swap;
+		}
 		if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
 			goto bad_swap;
 		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
@@ -1567,12 +1615,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		goto bad_swap;
 	}
 
-	if (swapfilesize && maxpages > swapfilesize) {
-		printk(KERN_WARNING
-		       "Swap area shorter than signature indicates\n");
-		error = -EINVAL;
-		goto bad_swap;
-	}
 	if (nr_good_pages) {
 		p->swap_map[0] = SWAP_MAP_BAD;
 		p->max = maxpages;
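The /proc/swaps iterator above adopts the usual SEQ_START_TOKEN convention: ->start() returns the sentinel at position 0 so that ->show() can emit the header exactly once before the real records. A generic sketch of that pattern; the my_* names are placeholders, not code from this patch:

	#include <linux/seq_file.h>

	static void *my_start(struct seq_file *m, loff_t *pos)
	{
		return *pos == 0 ? SEQ_START_TOKEN : NULL;	/* header-only example */
	}

	static void *my_next(struct seq_file *m, void *v, loff_t *pos)
	{
		(*pos)++;
		return NULL;					/* nothing after the header */
	}

	static void my_stop(struct seq_file *m, void *v)
	{
	}

	static int my_show(struct seq_file *m, void *v)
	{
		if (v == SEQ_START_TOKEN)
			seq_puts(m, "Header line\n");
		return 0;
	}

	static const struct seq_operations my_seq_ops = {
		.start	= my_start,
		.next	= my_next,
		.stop	= my_stop,
		.show	= my_show,
	};
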
diff --git a/mm/thrash.c b/mm/thrash.c
index f4c560b4a2b7..9ef9071f99bc 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -7,100 +7,74 @@
  *
  * Simple token based thrashing protection, using the algorithm
  * described in: http://www.cs.wm.edu/~sjiang/token.pdf
+ *
+ * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
+ * Improved algorithm to pass token:
+ * Each task has a priority which is incremented if it contended
+ * for the token in an interval less than its previous attempt.
+ * If the token is acquired, that task's priority is boosted to prevent
+ * the token from bouncing around too often and to let the task make
+ * some progress in its execution.
  */
+
 #include <linux/jiffies.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/swap.h>
 
 static DEFINE_SPINLOCK(swap_token_lock);
-static unsigned long swap_token_timeout;
-static unsigned long swap_token_check;
-struct mm_struct * swap_token_mm = &init_mm;
-
-#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
-#define SWAP_TOKEN_TIMEOUT (300 * HZ)
-/*
- * Currently disabled; Needs further code to work at HZ * 300.
- */
-unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT;
-
-/*
- * Take the token away if the process had no page faults
- * in the last interval, or if it has held the token for
- * too long.
- */
-#define SWAP_TOKEN_ENOUGH_RSS 1
-#define SWAP_TOKEN_TIMED_OUT 2
-static int should_release_swap_token(struct mm_struct *mm)
-{
-	int ret = 0;
-	if (!mm->recent_pagein)
-		ret = SWAP_TOKEN_ENOUGH_RSS;
-	else if (time_after(jiffies, swap_token_timeout))
-		ret = SWAP_TOKEN_TIMED_OUT;
-	mm->recent_pagein = 0;
-	return ret;
-}
+struct mm_struct *swap_token_mm;
+static unsigned int global_faults;
 
-/*
- * Try to grab the swapout protection token. We only try to
- * grab it once every TOKEN_CHECK_INTERVAL, both to prevent
- * SMP lock contention and to check that the process that held
- * the token before is no longer thrashing.
- */
 void grab_swap_token(void)
 {
-	struct mm_struct *mm;
-	int reason;
+	int current_interval;
 
-	/* We have the token. Let others know we still need it. */
-	if (has_swap_token(current->mm)) {
-		current->mm->recent_pagein = 1;
-		if (unlikely(!swap_token_default_timeout))
-			disable_swap_token();
-		return;
-	}
-
-	if (time_after(jiffies, swap_token_check)) {
+	global_faults++;
 
-		if (!swap_token_default_timeout) {
-			swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
-			return;
-		}
-
-		/* ... or if we recently held the token. */
-		if (time_before(jiffies, current->mm->swap_token_time))
-			return;
+	current_interval = global_faults - current->mm->faultstamp;
 
 	if (!spin_trylock(&swap_token_lock))
 		return;
 
-		swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
+	/* First come first served */
+	if (swap_token_mm == NULL) {
+		current->mm->token_priority = current->mm->token_priority + 2;
+		swap_token_mm = current->mm;
+		goto out;
+	}
 
-		mm = swap_token_mm;
-		if ((reason = should_release_swap_token(mm))) {
-			unsigned long eligible = jiffies;
-			if (reason == SWAP_TOKEN_TIMED_OUT) {
-				eligible += swap_token_default_timeout;
-			}
-			mm->swap_token_time = eligible;
-			swap_token_timeout = jiffies + swap_token_default_timeout;
+	if (current->mm != swap_token_mm) {
+		if (current_interval < current->mm->last_interval)
+			current->mm->token_priority++;
+		else {
+			current->mm->token_priority--;
+			if (unlikely(current->mm->token_priority < 0))
+				current->mm->token_priority = 0;
+		}
+		/* Check if we deserve the token */
+		if (current->mm->token_priority >
+				swap_token_mm->token_priority) {
+			current->mm->token_priority += 2;
 			swap_token_mm = current->mm;
 		}
-		spin_unlock(&swap_token_lock);
+	} else {
+		/* Token holder came in again! */
+		current->mm->token_priority += 2;
 	}
-	return;
+
+out:
+	current->mm->faultstamp = global_faults;
+	current->mm->last_interval = current_interval;
+	spin_unlock(&swap_token_lock);
+return;
 }
 
 /* Called on process exit. */
 void __put_swap_token(struct mm_struct *mm)
{
 	spin_lock(&swap_token_lock);
-	if (likely(mm == swap_token_mm)) {
-		mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
-		swap_token_mm = &init_mm;
-		swap_token_check = jiffies;
-	}
+	if (likely(mm == swap_token_mm))
+		swap_token_mm = NULL;
 	spin_unlock(&swap_token_lock);
 }
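The rewritten grab_swap_token() replaces the timeout scheme with the priority rule described in the new header comment. A userspace toy model of that rule, with invented structures and a two-task main() purely to show how the token migrates toward the task that keeps faulting:

	/* Toy model of the priority rule above; everything here is invented
	 * for illustration and compiles as a standalone C program. */
	#include <stdio.h>

	struct task_model {
		int token_priority;
		int last_interval;
		int faultstamp;
	};

	static struct task_model *token_holder;
	static int global_faults;

	static void fault(struct task_model *t)
	{
		int interval = ++global_faults - t->faultstamp;

		if (!token_holder) {
			t->token_priority += 2;		/* first come, first served */
			token_holder = t;
		} else if (t != token_holder) {
			if (interval < t->last_interval)
				t->token_priority++;	/* faulting more often than before */
			else if (--t->token_priority < 0)
				t->token_priority = 0;
			if (t->token_priority > token_holder->token_priority) {
				t->token_priority += 2;	/* take the token over */
				token_holder = t;
			}
		} else {
			t->token_priority += 2;		/* holder keeps re-faulting */
		}
		t->faultstamp = global_faults;
		t->last_interval = interval;
	}

	int main(void)
	{
		struct task_model a = {0}, b = {0};
		int i;

		for (i = 0; i < 5; i++) {
			fault(&a);
			fault(&b);
		}
		printf("holder is %s\n", token_holder == &a ? "a" : "b");
		return 0;
	}
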
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 518540a4a2a6..093f5fe6dd77 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -36,6 +36,7 @@
 #include <linux/rwsem.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
+#include <linux/freezer.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1172,11 +1173,12 @@ loop_again:
 			if (!zone_watermark_ok(zone, order, zone->pages_high,
 					       0, 0)) {
 				end_zone = i;
-				goto scan;
+				break;
 			}
 		}
-		goto out;
-scan:
+		if (i < 0)
+			goto out;
+
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 
@@ -1259,6 +1261,9 @@ out:
 	}
 	if (!all_zones_ok) {
 		cond_resched();
+
+		try_to_freeze();
+
 		goto loop_again;
 	}
 
@@ -1508,7 +1513,6 @@ out:
 }
 #endif
 
-#ifdef CONFIG_HOTPLUG_CPU
 /* It's optimal to keep kswapds on the same CPUs as their memory, but
    not required for correctness. So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
@@ -1529,7 +1533,6 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 	}
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 /*
  * This kswapd start function will be called by init and node-hot-add.
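The new linux/freezer.h include and the try_to_freeze() call put kswapd's main loop into the standard shape for freezable kernel threads. A generic sketch of that shape, with the thread function and its work step invented for illustration:

	#include <linux/kthread.h>
	#include <linux/freezer.h>
	#include <linux/sched.h>

	/* Hypothetical freezable worker thread. */
	static int my_thread_fn(void *data)
	{
		while (!kthread_should_stop()) {
			try_to_freeze();		/* park here during suspend */

			/* ... do one unit of work, then sleep ... */
			schedule_timeout_interruptible(HZ);
		}
		return 0;
	}
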
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8614e8f6743b..dc005a0c96ae 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -430,7 +430,7 @@ static int frag_show(struct seq_file *m, void *arg)
 	return 0;
 }
 
-struct seq_operations fragmentation_op = {
+const struct seq_operations fragmentation_op = {
 	.start = frag_start,
 	.next = frag_next,
 	.stop = frag_stop,
@@ -452,7 +452,7 @@ struct seq_operations fragmentation_op = {
 #define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
 					TEXT_FOR_HIGHMEM(xx)
 
-static char *vmstat_text[] = {
+static const char * const vmstat_text[] = {
 	/* Zoned VM counters */
 	"nr_anon_pages",
 	"nr_mapped",
@@ -597,7 +597,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 	return 0;
 }
 
-struct seq_operations zoneinfo_op = {
+const struct seq_operations zoneinfo_op = {
 	.start = frag_start, /* iterate over all zones. The same as in
 			      * fragmentation. */
 	.next = frag_next,
@@ -660,7 +660,7 @@ static void vmstat_stop(struct seq_file *m, void *arg)
 	m->private = NULL;
 }
 
-struct seq_operations vmstat_op = {
+const struct seq_operations vmstat_op = {
 	.start = vmstat_start,
 	.next = vmstat_next,
 	.stop = vmstat_stop,
@@ -679,13 +679,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
 		void *hcpu)
 {
 	switch (action) {
-		case CPU_UP_PREPARE:
-		case CPU_UP_CANCELED:
-		case CPU_DEAD:
-			refresh_zone_stat_thresholds();
-			break;
-		default:
-			break;
+	case CPU_UP_PREPARE:
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		refresh_zone_stat_thresholds();
+		break;
+	default:
+		break;
 	}
 	return NOTIFY_OK;
 }
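vmstat_cpuup_callback() above is a plain CPU hotplug notifier that refreshes the per-zone stat thresholds whenever a CPU comes or goes. A hedged sketch of how such a callback is typically wired up with the notifier API of this era; the my_cpu_callback/my_cpu_notifier names are illustrative, not taken from the patch:

	#include <linux/notifier.h>
	#include <linux/cpu.h>
	#include <linux/init.h>

	static int my_cpu_callback(struct notifier_block *nfb,
				   unsigned long action, void *hcpu)
	{
		switch (action) {
		case CPU_UP_PREPARE:
		case CPU_UP_CANCELED:
		case CPU_DEAD:
			/* recompute per-cpu derived state here */
			break;
		default:
			break;
		}
		return NOTIFY_OK;
	}

	static struct notifier_block my_cpu_notifier = {
		.notifier_call = my_cpu_callback,
	};

	static int __init my_notifier_init(void)
	{
		register_cpu_notifier(&my_cpu_notifier);
		return 0;
	}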