Diffstat (limited to 'mm')
-rw-r--r--  mm/allocpercpu.c    |   9
-rw-r--r--  mm/bootmem.c        |   6
-rw-r--r--  mm/filemap.c        |   1
-rw-r--r--  mm/fremap.c         |   2
-rw-r--r--  mm/hugetlb.c        |  22
-rw-r--r--  mm/memory.c         |   3
-rw-r--r--  mm/memory_hotplug.c |   1
-rw-r--r--  mm/mempolicy.c      |  10
-rw-r--r--  mm/migrate.c        |  19
-rw-r--r--  mm/mlock.c          |   2
-rw-r--r--  mm/mmap.c           |   4
-rw-r--r--  mm/mmzone.c         |   5
-rw-r--r--  mm/nommu.c          |   6
-rw-r--r--  mm/oom_kill.c       |  41
-rw-r--r--  mm/page_alloc.c     | 281
-rw-r--r--  mm/page_io.c        |  45
-rw-r--r--  mm/pdflush.c        |   1
-rw-r--r--  mm/readahead.c      |   8
-rw-r--r--  mm/shmem.c          |   8
-rw-r--r--  mm/slab.c           | 286
-rw-r--r--  mm/sparse.c         |  23
-rw-r--r--  mm/swap.c           |   6
-rw-r--r--  mm/swapfile.c       |  92
-rw-r--r--  mm/thrash.c         | 116
-rw-r--r--  mm/vmscan.c         |  13
-rw-r--r--  mm/vmstat.c         |  22
26 files changed, 654 insertions, 378 deletions
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index eaa9abeea536..b2486cf887a0 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -17,10 +17,9 @@ | |||
17 | void percpu_depopulate(void *__pdata, int cpu) | 17 | void percpu_depopulate(void *__pdata, int cpu) |
18 | { | 18 | { |
19 | struct percpu_data *pdata = __percpu_disguise(__pdata); | 19 | struct percpu_data *pdata = __percpu_disguise(__pdata); |
20 | if (pdata->ptrs[cpu]) { | 20 | |
21 | kfree(pdata->ptrs[cpu]); | 21 | kfree(pdata->ptrs[cpu]); |
22 | pdata->ptrs[cpu] = NULL; | 22 | pdata->ptrs[cpu] = NULL; |
23 | } | ||
24 | } | 23 | } |
25 | EXPORT_SYMBOL_GPL(percpu_depopulate); | 24 | EXPORT_SYMBOL_GPL(percpu_depopulate); |
26 | 25 | ||
@@ -123,6 +122,8 @@ EXPORT_SYMBOL_GPL(__percpu_alloc_mask); | |||
123 | */ | 122 | */ |
124 | void percpu_free(void *__pdata) | 123 | void percpu_free(void *__pdata) |
125 | { | 124 | { |
125 | if (unlikely(!__pdata)) | ||
126 | return; | ||
126 | __percpu_depopulate_mask(__pdata, &cpu_possible_map); | 127 | __percpu_depopulate_mask(__pdata, &cpu_possible_map); |
127 | kfree(__percpu_disguise(__pdata)); | 128 | kfree(__percpu_disguise(__pdata)); |
128 | } | 129 | } |
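
The percpu_free() hunk above adds an early return so that freeing a NULL pointer becomes a no-op, matching the long-standing kfree(NULL) convention. A minimal userspace C sketch of that convention (the struct and function names here are illustrative, not kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    struct buffer {
        void *data;
        size_t len;
    };

    /* Accepting NULL here keeps every caller free of "if (buf)" checks. */
    static void buffer_free(struct buffer *buf)
    {
        if (!buf)               /* mirrors the unlikely(!__pdata) early return */
            return;
        free(buf->data);        /* free(NULL) is itself defined as a no-op */
        free(buf);
    }

    int main(void)
    {
        struct buffer *buf = calloc(1, sizeof(*buf));

        buffer_free(buf);       /* normal case */
        buffer_free(NULL);      /* also safe: silently does nothing */
        printf("done\n");
        return 0;
    }
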
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d53112fcb404..00a96970b237 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -27,8 +27,6 @@ unsigned long max_low_pfn; | |||
27 | unsigned long min_low_pfn; | 27 | unsigned long min_low_pfn; |
28 | unsigned long max_pfn; | 28 | unsigned long max_pfn; |
29 | 29 | ||
30 | EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */ | ||
31 | |||
32 | static LIST_HEAD(bdata_list); | 30 | static LIST_HEAD(bdata_list); |
33 | #ifdef CONFIG_CRASH_DUMP | 31 | #ifdef CONFIG_CRASH_DUMP |
34 | /* | 32 | /* |
@@ -196,6 +194,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | |||
196 | if (limit && bdata->node_boot_start >= limit) | 194 | if (limit && bdata->node_boot_start >= limit) |
197 | return NULL; | 195 | return NULL; |
198 | 196 | ||
197 | /* on nodes without memory - bootmem_map is NULL */ | ||
198 | if (!bdata->node_bootmem_map) | ||
199 | return NULL; | ||
200 | |||
199 | end_pfn = bdata->node_low_pfn; | 201 | end_pfn = bdata->node_low_pfn; |
200 | limit = PFN_DOWN(limit); | 202 | limit = PFN_DOWN(limit); |
201 | if (limit && end_pfn > limit) | 203 | if (limit && end_pfn > limit) |
diff --git a/mm/filemap.c b/mm/filemap.c
index 13df01c50479..af7e2f5caea9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1445,7 +1445,6 @@ no_cached_page: | |||
1445 | * effect. | 1445 | * effect. |
1446 | */ | 1446 | */ |
1447 | error = page_cache_read(file, pgoff); | 1447 | error = page_cache_read(file, pgoff); |
1448 | grab_swap_token(); | ||
1449 | 1448 | ||
1450 | /* | 1449 | /* |
1451 | * The page we want has now been added to the page cache. | 1450 | * The page we want has now been added to the page cache. |
diff --git a/mm/fremap.c b/mm/fremap.c
index 7a9d0f5d246d..b77a002c3352 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -101,7 +101,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
101 | { | 101 | { |
102 | int err = -ENOMEM; | 102 | int err = -ENOMEM; |
103 | pte_t *pte; | 103 | pte_t *pte; |
104 | pte_t pte_val; | ||
105 | spinlock_t *ptl; | 104 | spinlock_t *ptl; |
106 | 105 | ||
107 | pte = get_locked_pte(mm, addr, &ptl); | 106 | pte = get_locked_pte(mm, addr, &ptl); |
@@ -114,7 +113,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
114 | } | 113 | } |
115 | 114 | ||
116 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); | 115 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); |
117 | pte_val = *pte; | ||
118 | /* | 116 | /* |
119 | * We don't need to run update_mmu_cache() here because the "file pte" | 117 | * We don't need to run update_mmu_cache() here because the "file pte" |
120 | * being installed by install_file_pte() is not a real pte - it's a | 118 | * being installed by install_file_pte() is not a real pte - it's a |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a088f593a807..0ccc7f230252 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -109,7 +109,7 @@ static int alloc_fresh_huge_page(void) | |||
109 | if (nid == MAX_NUMNODES) | 109 | if (nid == MAX_NUMNODES) |
110 | nid = first_node(node_online_map); | 110 | nid = first_node(node_online_map); |
111 | if (page) { | 111 | if (page) { |
112 | page[1].lru.next = (void *)free_huge_page; /* dtor */ | 112 | set_compound_page_dtor(page, free_huge_page); |
113 | spin_lock(&hugetlb_lock); | 113 | spin_lock(&hugetlb_lock); |
114 | nr_huge_pages++; | 114 | nr_huge_pages++; |
115 | nr_huge_pages_node[page_to_nid(page)]++; | 115 | nr_huge_pages_node[page_to_nid(page)]++; |
@@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
344 | entry = *src_pte; | 344 | entry = *src_pte; |
345 | ptepage = pte_page(entry); | 345 | ptepage = pte_page(entry); |
346 | get_page(ptepage); | 346 | get_page(ptepage); |
347 | add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE); | ||
348 | set_huge_pte_at(dst, addr, dst_pte, entry); | 347 | set_huge_pte_at(dst, addr, dst_pte, entry); |
349 | } | 348 | } |
350 | spin_unlock(&src->page_table_lock); | 349 | spin_unlock(&src->page_table_lock); |
@@ -365,6 +364,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
365 | pte_t pte; | 364 | pte_t pte; |
366 | struct page *page; | 365 | struct page *page; |
367 | struct page *tmp; | 366 | struct page *tmp; |
367 | /* | ||
368 | * A page gathering list, protected by per file i_mmap_lock. The | ||
369 | * lock is used to avoid list corruption from multiple unmapping | ||
370 | * of the same page since we are using page->lru. | ||
371 | */ | ||
368 | LIST_HEAD(page_list); | 372 | LIST_HEAD(page_list); |
369 | 373 | ||
370 | WARN_ON(!is_vm_hugetlb_page(vma)); | 374 | WARN_ON(!is_vm_hugetlb_page(vma)); |
@@ -372,24 +376,21 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
372 | BUG_ON(end & ~HPAGE_MASK); | 376 | BUG_ON(end & ~HPAGE_MASK); |
373 | 377 | ||
374 | spin_lock(&mm->page_table_lock); | 378 | spin_lock(&mm->page_table_lock); |
375 | |||
376 | /* Update high watermark before we lower rss */ | ||
377 | update_hiwater_rss(mm); | ||
378 | |||
379 | for (address = start; address < end; address += HPAGE_SIZE) { | 379 | for (address = start; address < end; address += HPAGE_SIZE) { |
380 | ptep = huge_pte_offset(mm, address); | 380 | ptep = huge_pte_offset(mm, address); |
381 | if (!ptep) | 381 | if (!ptep) |
382 | continue; | 382 | continue; |
383 | 383 | ||
384 | if (huge_pmd_unshare(mm, &address, ptep)) | ||
385 | continue; | ||
386 | |||
384 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 387 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
385 | if (pte_none(pte)) | 388 | if (pte_none(pte)) |
386 | continue; | 389 | continue; |
387 | 390 | ||
388 | page = pte_page(pte); | 391 | page = pte_page(pte); |
389 | list_add(&page->lru, &page_list); | 392 | list_add(&page->lru, &page_list); |
390 | add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE)); | ||
391 | } | 393 | } |
392 | |||
393 | spin_unlock(&mm->page_table_lock); | 394 | spin_unlock(&mm->page_table_lock); |
394 | flush_tlb_range(vma, start, end); | 395 | flush_tlb_range(vma, start, end); |
395 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 396 | list_for_each_entry_safe(page, tmp, &page_list, lru) { |
@@ -515,7 +516,6 @@ retry: | |||
515 | if (!pte_none(*ptep)) | 516 | if (!pte_none(*ptep)) |
516 | goto backout; | 517 | goto backout; |
517 | 518 | ||
518 | add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); | ||
519 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) | 519 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) |
520 | && (vma->vm_flags & VM_SHARED))); | 520 | && (vma->vm_flags & VM_SHARED))); |
521 | set_huge_pte_at(mm, address, ptep, new_pte); | 521 | set_huge_pte_at(mm, address, ptep, new_pte); |
@@ -653,11 +653,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
653 | BUG_ON(address >= end); | 653 | BUG_ON(address >= end); |
654 | flush_cache_range(vma, address, end); | 654 | flush_cache_range(vma, address, end); |
655 | 655 | ||
656 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); | ||
656 | spin_lock(&mm->page_table_lock); | 657 | spin_lock(&mm->page_table_lock); |
657 | for (; address < end; address += HPAGE_SIZE) { | 658 | for (; address < end; address += HPAGE_SIZE) { |
658 | ptep = huge_pte_offset(mm, address); | 659 | ptep = huge_pte_offset(mm, address); |
659 | if (!ptep) | 660 | if (!ptep) |
660 | continue; | 661 | continue; |
662 | if (huge_pmd_unshare(mm, &address, ptep)) | ||
663 | continue; | ||
661 | if (!pte_none(*ptep)) { | 664 | if (!pte_none(*ptep)) { |
662 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 665 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
663 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 666 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
@@ -666,6 +669,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
666 | } | 669 | } |
667 | } | 670 | } |
668 | spin_unlock(&mm->page_table_lock); | 671 | spin_unlock(&mm->page_table_lock); |
672 | spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); | ||
669 | 673 | ||
670 | flush_tlb_range(vma, start, end); | 674 | flush_tlb_range(vma, start, end); |
671 | } | 675 | } |
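
The hugetlb.c hunk above (and the matching prep_compound_page() hunk later in page_alloc.c) replaces the open-coded "page[1].lru.next = (void *)dtor" assignment with set_compound_page_dtor(). The underlying trick is stashing a destructor function pointer in an otherwise unused field of the tail page and hiding the cast behind a named helper. A hedged userspace sketch of that pattern; fake_page and the helper names are invented for illustration:

    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for struct page: one spare pointer-sized field. */
    struct fake_page {
        void *spare;            /* plays the role of page[1].lru.next */
    };

    typedef void (*dtor_fn)(struct fake_page *);

    /* Named helpers hide the cast, as set_compound_page_dtor() does above.
     * Storing a function pointer in a void * is a common extension, and is
     * exactly what the kernel code relies on here. */
    static void set_dtor(struct fake_page *head, dtor_fn dtor)
    {
        head[1].spare = (void *)dtor;
    }

    static dtor_fn get_dtor(struct fake_page *head)
    {
        return (dtor_fn)head[1].spare;
    }

    static void free_special(struct fake_page *head)
    {
        printf("special destructor ran for %p\n", (void *)head);
        free(head);
    }

    int main(void)
    {
        struct fake_page *head = calloc(2, sizeof(*head)); /* head + one tail */

        if (!head)
            return 1;
        set_dtor(head, free_special);
        get_dtor(head)(head);   /* dispatch through the stored destructor */
        return 0;
    }
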
diff --git a/mm/memory.c b/mm/memory.c
index 156861fcac43..4198df0dff1c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1902,7 +1902,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | |||
1902 | 1902 | ||
1903 | return 0; | 1903 | return 0; |
1904 | } | 1904 | } |
1905 | EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ | ||
1906 | 1905 | ||
1907 | /** | 1906 | /** |
1908 | * swapin_readahead - swap in pages in hope we need them soon | 1907 | * swapin_readahead - swap in pages in hope we need them soon |
@@ -1991,6 +1990,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1991 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 1990 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
1992 | page = lookup_swap_cache(entry); | 1991 | page = lookup_swap_cache(entry); |
1993 | if (!page) { | 1992 | if (!page) { |
1993 | grab_swap_token(); /* Contend for token _before_ read-in */ | ||
1994 | swapin_readahead(entry, address, vma); | 1994 | swapin_readahead(entry, address, vma); |
1995 | page = read_swap_cache_async(entry, vma, address); | 1995 | page = read_swap_cache_async(entry, vma, address); |
1996 | if (!page) { | 1996 | if (!page) { |
@@ -2008,7 +2008,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2008 | /* Had to read the page from swap area: Major fault */ | 2008 | /* Had to read the page from swap area: Major fault */ |
2009 | ret = VM_FAULT_MAJOR; | 2009 | ret = VM_FAULT_MAJOR; |
2010 | count_vm_event(PGMAJFAULT); | 2010 | count_vm_event(PGMAJFAULT); |
2011 | grab_swap_token(); | ||
2012 | } | 2011 | } |
2013 | 2012 | ||
2014 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2013 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fd678a662eae..0c055a090f4d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -72,7 +72,6 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
72 | return ret; | 72 | return ret; |
73 | } | 73 | } |
74 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); | 74 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); |
75 | zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); | ||
76 | return 0; | 75 | return 0; |
77 | } | 76 | } |
78 | 77 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 617fb31086ee..b917d6fdc1bb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) | |||
141 | enum zone_type k; | 141 | enum zone_type k; |
142 | 142 | ||
143 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); | 143 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); |
144 | max++; /* space for zlcache_ptr (see mmzone.h) */ | ||
144 | zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); | 145 | zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); |
145 | if (!zl) | 146 | if (!zl) |
146 | return NULL; | 147 | return NULL; |
148 | zl->zlcache_ptr = NULL; | ||
147 | num = 0; | 149 | num = 0; |
148 | /* First put in the highest zones from all nodes, then all the next | 150 | /* First put in the highest zones from all nodes, then all the next |
149 | lower zones etc. Avoid empty zones because the memory allocator | 151 | lower zones etc. Avoid empty zones because the memory allocator |
@@ -219,7 +221,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
219 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 221 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
220 | do { | 222 | do { |
221 | struct page *page; | 223 | struct page *page; |
222 | unsigned int nid; | 224 | int nid; |
223 | 225 | ||
224 | if (!pte_present(*pte)) | 226 | if (!pte_present(*pte)) |
225 | continue; | 227 | continue; |
@@ -1324,7 +1326,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) | |||
1324 | atomic_set(&new->refcnt, 1); | 1326 | atomic_set(&new->refcnt, 1); |
1325 | if (new->policy == MPOL_BIND) { | 1327 | if (new->policy == MPOL_BIND) { |
1326 | int sz = ksize(old->v.zonelist); | 1328 | int sz = ksize(old->v.zonelist); |
1327 | new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL); | 1329 | new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL); |
1328 | if (!new->v.zonelist) { | 1330 | if (!new->v.zonelist) { |
1329 | kmem_cache_free(policy_cache, new); | 1331 | kmem_cache_free(policy_cache, new); |
1330 | return ERR_PTR(-ENOMEM); | 1332 | return ERR_PTR(-ENOMEM); |
@@ -1705,8 +1707,8 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) | |||
1705 | * Display pages allocated per node and memory policy via /proc. | 1707 | * Display pages allocated per node and memory policy via /proc. |
1706 | */ | 1708 | */ |
1707 | 1709 | ||
1708 | static const char *policy_types[] = { "default", "prefer", "bind", | 1710 | static const char * const policy_types[] = |
1709 | "interleave" }; | 1711 | { "default", "prefer", "bind", "interleave" }; |
1710 | 1712 | ||
1711 | /* | 1713 | /* |
1712 | * Convert a mempolicy into a string. | 1714 | * Convert a mempolicy into a string. |
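
The last mempolicy.c hunk above tightens the policy_types table from "const char *[]" to "const char * const []", so the array of pointers itself is read-only and can live in rodata. A self-contained example of the difference:

    #include <stddef.h>
    #include <stdio.h>

    /* Both the strings and the table of pointers are immutable. */
    static const char * const policy_types[] = {
        "default", "prefer", "bind", "interleave"
    };

    int main(void)
    {
        size_t i;

        for (i = 0; i < sizeof(policy_types) / sizeof(policy_types[0]); i++)
            printf("%zu: %s\n", i, policy_types[i]);

        /* policy_types[0] = "other"; would now fail to compile: the table
         * of pointers, not just the strings, is read-only. */
        return 0;
    }
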
diff --git a/mm/migrate.c b/mm/migrate.c
index b4979d423d2b..e9b161bde95b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -294,7 +294,7 @@ out: | |||
294 | static int migrate_page_move_mapping(struct address_space *mapping, | 294 | static int migrate_page_move_mapping(struct address_space *mapping, |
295 | struct page *newpage, struct page *page) | 295 | struct page *newpage, struct page *page) |
296 | { | 296 | { |
297 | struct page **radix_pointer; | 297 | void **pslot; |
298 | 298 | ||
299 | if (!mapping) { | 299 | if (!mapping) { |
300 | /* Anonymous page */ | 300 | /* Anonymous page */ |
@@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
305 | 305 | ||
306 | write_lock_irq(&mapping->tree_lock); | 306 | write_lock_irq(&mapping->tree_lock); |
307 | 307 | ||
308 | radix_pointer = (struct page **)radix_tree_lookup_slot( | 308 | pslot = radix_tree_lookup_slot(&mapping->page_tree, |
309 | &mapping->page_tree, | 309 | page_index(page)); |
310 | page_index(page)); | ||
311 | 310 | ||
312 | if (page_count(page) != 2 + !!PagePrivate(page) || | 311 | if (page_count(page) != 2 + !!PagePrivate(page) || |
313 | *radix_pointer != page) { | 312 | (struct page *)radix_tree_deref_slot(pslot) != page) { |
314 | write_unlock_irq(&mapping->tree_lock); | 313 | write_unlock_irq(&mapping->tree_lock); |
315 | return -EAGAIN; | 314 | return -EAGAIN; |
316 | } | 315 | } |
@@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
318 | /* | 317 | /* |
319 | * Now we know that no one else is looking at the page. | 318 | * Now we know that no one else is looking at the page. |
320 | */ | 319 | */ |
321 | get_page(newpage); | 320 | get_page(newpage); /* add cache reference */ |
322 | #ifdef CONFIG_SWAP | 321 | #ifdef CONFIG_SWAP |
323 | if (PageSwapCache(page)) { | 322 | if (PageSwapCache(page)) { |
324 | SetPageSwapCache(newpage); | 323 | SetPageSwapCache(newpage); |
@@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
326 | } | 325 | } |
327 | #endif | 326 | #endif |
328 | 327 | ||
329 | *radix_pointer = newpage; | 328 | radix_tree_replace_slot(pslot, newpage); |
329 | |||
330 | /* | ||
331 | * Drop cache reference from old page. | ||
332 | * We know this isn't the last reference. | ||
333 | */ | ||
330 | __put_page(page); | 334 | __put_page(page); |
335 | |||
331 | write_unlock_irq(&mapping->tree_lock); | 336 | write_unlock_irq(&mapping->tree_lock); |
332 | 337 | ||
333 | return 0; | 338 | return 0; |
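
The migrate_page_move_mapping() hunks above stop casting the radix-tree slot to struct page ** and instead use radix_tree_lookup_slot(), radix_tree_deref_slot() and radix_tree_replace_slot(): look the slot up once under the tree lock, verify it still holds the expected page, then swap the new page in place through the same slot. A userspace sketch of that lookup-verify-replace shape over a plain slot array (no locking, all names invented):

    #include <stdio.h>

    #define NSLOTS 8

    static void *slots[NSLOTS];     /* stands in for the mapping's radix tree */

    static void **lookup_slot(unsigned long index)
    {
        return index < NSLOTS ? &slots[index] : NULL;
    }

    /* Replace old with new at index, but only if the slot still holds old. */
    static int replace_if_unchanged(unsigned long index, void *old, void *new)
    {
        void **pslot = lookup_slot(index);

        if (!pslot || *pslot != old)    /* somebody else got here first */
            return -1;                  /* caller retries, like -EAGAIN above */
        *pslot = new;                   /* in-place replacement via the slot */
        return 0;
    }

    int main(void)
    {
        char a[] = "old page", b[] = "new page";

        slots[3] = a;
        if (replace_if_unchanged(3, a, b) == 0)
            printf("slot 3 now holds: %s\n", (char *)slots[3]);
        if (replace_if_unchanged(3, a, b) != 0)
            printf("second attempt refused: slot no longer holds the old page\n");
        return 0;
    }
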
diff --git a/mm/mlock.c b/mm/mlock.c
index b90c59573abf..3446b7ef731e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -65,7 +65,7 @@ success: | |||
65 | ret = make_pages_present(start, end); | 65 | ret = make_pages_present(start, end); |
66 | } | 66 | } |
67 | 67 | ||
68 | vma->vm_mm->locked_vm -= pages; | 68 | mm->locked_vm -= pages; |
69 | out: | 69 | out: |
70 | if (ret == -ENOMEM) | 70 | if (ret == -ENOMEM) |
71 | ret = -EAGAIN; | 71 | ret = -EAGAIN; |
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1736,7 +1736,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1736 | if (mm->map_count >= sysctl_max_map_count) | 1736 | if (mm->map_count >= sysctl_max_map_count) |
1737 | return -ENOMEM; | 1737 | return -ENOMEM; |
1738 | 1738 | ||
1739 | new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | 1739 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
1740 | if (!new) | 1740 | if (!new) |
1741 | return -ENOMEM; | 1741 | return -ENOMEM; |
1742 | 1742 | ||
@@ -2057,7 +2057,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2057 | vma_start < new_vma->vm_end) | 2057 | vma_start < new_vma->vm_end) |
2058 | *vmap = new_vma; | 2058 | *vmap = new_vma; |
2059 | } else { | 2059 | } else { |
2060 | new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | 2060 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2061 | if (new_vma) { | 2061 | if (new_vma) { |
2062 | *new_vma = *vma; | 2062 | *new_vma = *vma; |
2063 | pol = mpol_copy(vma_policy(vma)); | 2063 | pol = mpol_copy(vma_policy(vma)); |
diff --git a/mm/mmzone.c b/mm/mmzone.c
index febea1c98168..eb5838634f18 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -14,8 +14,6 @@ struct pglist_data *first_online_pgdat(void) | |||
14 | return NODE_DATA(first_online_node); | 14 | return NODE_DATA(first_online_node); |
15 | } | 15 | } |
16 | 16 | ||
17 | EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */ | ||
18 | |||
19 | struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) | 17 | struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) |
20 | { | 18 | { |
21 | int nid = next_online_node(pgdat->node_id); | 19 | int nid = next_online_node(pgdat->node_id); |
@@ -24,8 +22,6 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) | |||
24 | return NULL; | 22 | return NULL; |
25 | return NODE_DATA(nid); | 23 | return NODE_DATA(nid); |
26 | } | 24 | } |
27 | EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */ | ||
28 | |||
29 | 25 | ||
30 | /* | 26 | /* |
31 | * next_zone - helper magic for for_each_zone() | 27 | * next_zone - helper magic for for_each_zone() |
@@ -45,5 +41,4 @@ struct zone *next_zone(struct zone *zone) | |||
45 | } | 41 | } |
46 | return zone; | 42 | return zone; |
47 | } | 43 | } |
48 | EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */ | ||
49 | 44 | ||
diff --git a/mm/nommu.c b/mm/nommu.c
index 6a2a8aada401..af874569d0f1 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -808,10 +808,9 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
808 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); | 808 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); |
809 | 809 | ||
810 | /* we're going to need to record the mapping if it works */ | 810 | /* we're going to need to record the mapping if it works */ |
811 | vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL); | 811 | vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); |
812 | if (!vml) | 812 | if (!vml) |
813 | goto error_getting_vml; | 813 | goto error_getting_vml; |
814 | memset(vml, 0, sizeof(*vml)); | ||
815 | 814 | ||
816 | down_write(&nommu_vma_sem); | 815 | down_write(&nommu_vma_sem); |
817 | 816 | ||
@@ -887,11 +886,10 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
887 | } | 886 | } |
888 | 887 | ||
889 | /* we're going to need a VMA struct as well */ | 888 | /* we're going to need a VMA struct as well */ |
890 | vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); | 889 | vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); |
891 | if (!vma) | 890 | if (!vma) |
892 | goto error_getting_vma; | 891 | goto error_getting_vma; |
893 | 892 | ||
894 | memset(vma, 0, sizeof(*vma)); | ||
895 | INIT_LIST_HEAD(&vma->anon_vma_node); | 893 | INIT_LIST_HEAD(&vma->anon_vma_node); |
896 | atomic_set(&vma->vm_usage, 1); | 894 | atomic_set(&vma->vm_usage, 1); |
897 | if (file) | 895 | if (file) |
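
The two nommu.c hunks fold a kmalloc() followed by memset(..., 0, ...) into a single kzalloc() call, which returns already-zeroed memory. The userspace analogue is calloc(); a small before/after sketch with made-up struct and function names:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct vm_record {
        unsigned long start;
        unsigned long end;
        int flags;
    };

    /* Before: allocate, then clear every field by hand. */
    static struct vm_record *alloc_record_old(void)
    {
        struct vm_record *r = malloc(sizeof(*r));

        if (r)
            memset(r, 0, sizeof(*r));
        return r;
    }

    /* After: one call that hands back zeroed memory, as kzalloc() does. */
    static struct vm_record *alloc_record_new(void)
    {
        return calloc(1, sizeof(struct vm_record));
    }

    int main(void)
    {
        struct vm_record *a = alloc_record_old();
        struct vm_record *b = alloc_record_new();

        if (a && b)
            printf("old flags=%d, new flags=%d\n", a->flags, b->flags);
        free(a);
        free(b);
        return 0;
    }
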
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2e3ce3a928b9..223d9ccb7d64 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -264,7 +264,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) | |||
264 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO | 264 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO |
265 | * set. | 265 | * set. |
266 | */ | 266 | */ |
267 | static void __oom_kill_task(struct task_struct *p, const char *message) | 267 | static void __oom_kill_task(struct task_struct *p, int verbose) |
268 | { | 268 | { |
269 | if (is_init(p)) { | 269 | if (is_init(p)) { |
270 | WARN_ON(1); | 270 | WARN_ON(1); |
@@ -278,10 +278,8 @@ static void __oom_kill_task(struct task_struct *p, const char *message) | |||
278 | return; | 278 | return; |
279 | } | 279 | } |
280 | 280 | ||
281 | if (message) { | 281 | if (verbose) |
282 | printk(KERN_ERR "%s: Killed process %d (%s).\n", | 282 | printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm); |
283 | message, p->pid, p->comm); | ||
284 | } | ||
285 | 283 | ||
286 | /* | 284 | /* |
287 | * We give our sacrificial lamb high priority and access to | 285 | * We give our sacrificial lamb high priority and access to |
@@ -294,7 +292,7 @@ static void __oom_kill_task(struct task_struct *p, const char *message) | |||
294 | force_sig(SIGKILL, p); | 292 | force_sig(SIGKILL, p); |
295 | } | 293 | } |
296 | 294 | ||
297 | static int oom_kill_task(struct task_struct *p, const char *message) | 295 | static int oom_kill_task(struct task_struct *p) |
298 | { | 296 | { |
299 | struct mm_struct *mm; | 297 | struct mm_struct *mm; |
300 | struct task_struct *g, *q; | 298 | struct task_struct *g, *q; |
@@ -313,15 +311,25 @@ static int oom_kill_task(struct task_struct *p, const char *message) | |||
313 | if (mm == NULL) | 311 | if (mm == NULL) |
314 | return 1; | 312 | return 1; |
315 | 313 | ||
316 | __oom_kill_task(p, message); | 314 | /* |
315 | * Don't kill the process if any threads are set to OOM_DISABLE | ||
316 | */ | ||
317 | do_each_thread(g, q) { | ||
318 | if (q->mm == mm && p->oomkilladj == OOM_DISABLE) | ||
319 | return 1; | ||
320 | } while_each_thread(g, q); | ||
321 | |||
322 | __oom_kill_task(p, 1); | ||
323 | |||
317 | /* | 324 | /* |
318 | * kill all processes that share the ->mm (i.e. all threads), | 325 | * kill all processes that share the ->mm (i.e. all threads), |
319 | * but are in a different thread group | 326 | * but are in a different thread group. Don't let them have access |
327 | * to memory reserves though, otherwise we might deplete all memory. | ||
320 | */ | 328 | */ |
321 | do_each_thread(g, q) | 329 | do_each_thread(g, q) { |
322 | if (q->mm == mm && q->tgid != p->tgid) | 330 | if (q->mm == mm && q->tgid != p->tgid) |
323 | __oom_kill_task(q, message); | 331 | force_sig(SIGKILL, p); |
324 | while_each_thread(g, q); | 332 | } while_each_thread(g, q); |
325 | 333 | ||
326 | return 0; | 334 | return 0; |
327 | } | 335 | } |
@@ -337,21 +345,22 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, | |||
337 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 345 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
338 | */ | 346 | */ |
339 | if (p->flags & PF_EXITING) { | 347 | if (p->flags & PF_EXITING) { |
340 | __oom_kill_task(p, NULL); | 348 | __oom_kill_task(p, 0); |
341 | return 0; | 349 | return 0; |
342 | } | 350 | } |
343 | 351 | ||
344 | printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li" | 352 | printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", |
345 | " and children.\n", p->pid, p->comm, points); | 353 | message, p->pid, p->comm, points); |
354 | |||
346 | /* Try to kill a child first */ | 355 | /* Try to kill a child first */ |
347 | list_for_each(tsk, &p->children) { | 356 | list_for_each(tsk, &p->children) { |
348 | c = list_entry(tsk, struct task_struct, sibling); | 357 | c = list_entry(tsk, struct task_struct, sibling); |
349 | if (c->mm == p->mm) | 358 | if (c->mm == p->mm) |
350 | continue; | 359 | continue; |
351 | if (!oom_kill_task(c, message)) | 360 | if (!oom_kill_task(c)) |
352 | return 0; | 361 | return 0; |
353 | } | 362 | } |
354 | return oom_kill_task(p, message); | 363 | return oom_kill_task(p); |
355 | } | 364 | } |
356 | 365 | ||
357 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); | 366 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); |
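
The oom_kill_task() hunk above scans every thread sharing the victim's mm and declines to kill if any of them has opted out via OOM_DISABLE. A userspace sketch of that "one protected thread protects the whole address space" rule; fake_task and the -17 constant are stand-ins here, not the kernel's definitions:

    #include <stdio.h>

    #define FAKE_OOM_DISABLE (-17)  /* stand-in for the kernel's OOM_DISABLE */

    struct fake_task {
        int mm_id;                  /* which address space the thread shares */
        int oomkilladj;
    };

    /* Kill only if no thread sharing the victim's address space opted out. */
    static int ok_to_kill(const struct fake_task *tasks, int ntasks, int mm_id)
    {
        int i;

        for (i = 0; i < ntasks; i++)
            if (tasks[i].mm_id == mm_id &&
                tasks[i].oomkilladj == FAKE_OOM_DISABLE)
                return 0;           /* one protected thread protects them all */
        return 1;
    }

    int main(void)
    {
        const struct fake_task tasks[] = {
            { .mm_id = 1, .oomkilladj = 0 },
            { .mm_id = 1, .oomkilladj = FAKE_OOM_DISABLE },
            { .mm_id = 2, .oomkilladj = 0 },
        };

        printf("mm 1: %s\n", ok_to_kill(tasks, 3, 1) ? "killable" : "protected");
        printf("mm 2: %s\n", ok_to_kill(tasks, 3, 2) ? "killable" : "protected");
        return 0;
    }
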
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index aa6fcc7ca66f..cace22b3ac25 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -83,14 +83,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { | |||
83 | 83 | ||
84 | EXPORT_SYMBOL(totalram_pages); | 84 | EXPORT_SYMBOL(totalram_pages); |
85 | 85 | ||
86 | /* | 86 | static char * const zone_names[MAX_NR_ZONES] = { |
87 | * Used by page_zone() to look up the address of the struct zone whose | ||
88 | * id is encoded in the upper bits of page->flags | ||
89 | */ | ||
90 | struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; | ||
91 | EXPORT_SYMBOL(zone_table); | ||
92 | |||
93 | static char *zone_names[MAX_NR_ZONES] = { | ||
94 | "DMA", | 87 | "DMA", |
95 | #ifdef CONFIG_ZONE_DMA32 | 88 | #ifdef CONFIG_ZONE_DMA32 |
96 | "DMA32", | 89 | "DMA32", |
@@ -237,7 +230,7 @@ static void prep_compound_page(struct page *page, unsigned long order) | |||
237 | int i; | 230 | int i; |
238 | int nr_pages = 1 << order; | 231 | int nr_pages = 1 << order; |
239 | 232 | ||
240 | page[1].lru.next = (void *)free_compound_page; /* set dtor */ | 233 | set_compound_page_dtor(page, free_compound_page); |
241 | page[1].lru.prev = (void *)order; | 234 | page[1].lru.prev = (void *)order; |
242 | for (i = 0; i < nr_pages; i++) { | 235 | for (i = 0; i < nr_pages; i++) { |
243 | struct page *p = page + i; | 236 | struct page *p = page + i; |
@@ -486,7 +479,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order) | |||
486 | spin_lock(&zone->lock); | 479 | spin_lock(&zone->lock); |
487 | zone->all_unreclaimable = 0; | 480 | zone->all_unreclaimable = 0; |
488 | zone->pages_scanned = 0; | 481 | zone->pages_scanned = 0; |
489 | __free_one_page(page, zone ,order); | 482 | __free_one_page(page, zone, order); |
490 | spin_unlock(&zone->lock); | 483 | spin_unlock(&zone->lock); |
491 | } | 484 | } |
492 | 485 | ||
@@ -605,6 +598,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
605 | 1 << PG_checked | 1 << PG_mappedtodisk); | 598 | 1 << PG_checked | 1 << PG_mappedtodisk); |
606 | set_page_private(page, 0); | 599 | set_page_private(page, 0); |
607 | set_page_refcounted(page); | 600 | set_page_refcounted(page); |
601 | |||
602 | arch_alloc_page(page, order); | ||
608 | kernel_map_pages(page, 1 << order, 1); | 603 | kernel_map_pages(page, 1 << order, 1); |
609 | 604 | ||
610 | if (gfp_flags & __GFP_ZERO) | 605 | if (gfp_flags & __GFP_ZERO) |
@@ -690,9 +685,15 @@ void drain_node_pages(int nodeid) | |||
690 | 685 | ||
691 | pcp = &pset->pcp[i]; | 686 | pcp = &pset->pcp[i]; |
692 | if (pcp->count) { | 687 | if (pcp->count) { |
688 | int to_drain; | ||
689 | |||
693 | local_irq_save(flags); | 690 | local_irq_save(flags); |
694 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 691 | if (pcp->count >= pcp->batch) |
695 | pcp->count = 0; | 692 | to_drain = pcp->batch; |
693 | else | ||
694 | to_drain = pcp->count; | ||
695 | free_pages_bulk(zone, to_drain, &pcp->list, 0); | ||
696 | pcp->count -= to_drain; | ||
696 | local_irq_restore(flags); | 697 | local_irq_restore(flags); |
697 | } | 698 | } |
698 | } | 699 | } |
@@ -700,7 +701,6 @@ void drain_node_pages(int nodeid) | |||
700 | } | 701 | } |
701 | #endif | 702 | #endif |
702 | 703 | ||
703 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) | ||
704 | static void __drain_pages(unsigned int cpu) | 704 | static void __drain_pages(unsigned int cpu) |
705 | { | 705 | { |
706 | unsigned long flags; | 706 | unsigned long flags; |
@@ -722,7 +722,6 @@ static void __drain_pages(unsigned int cpu) | |||
722 | } | 722 | } |
723 | } | 723 | } |
724 | } | 724 | } |
725 | #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ | ||
726 | 725 | ||
727 | #ifdef CONFIG_PM | 726 | #ifdef CONFIG_PM |
728 | 727 | ||
@@ -925,31 +924,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
925 | return 1; | 924 | return 1; |
926 | } | 925 | } |
927 | 926 | ||
927 | #ifdef CONFIG_NUMA | ||
928 | /* | ||
929 | * zlc_setup - Setup for "zonelist cache". Uses cached zone data to | ||
930 | * skip over zones that are not allowed by the cpuset, or that have | ||
931 | * been recently (in last second) found to be nearly full. See further | ||
932 | * comments in mmzone.h. Reduces cache footprint of zonelist scans | ||
933 | * that have to skip over alot of full or unallowed zones. | ||
934 | * | ||
935 | * If the zonelist cache is present in the passed in zonelist, then | ||
936 | * returns a pointer to the allowed node mask (either the current | ||
937 | * tasks mems_allowed, or node_online_map.) | ||
938 | * | ||
939 | * If the zonelist cache is not available for this zonelist, does | ||
940 | * nothing and returns NULL. | ||
941 | * | ||
942 | * If the fullzones BITMAP in the zonelist cache is stale (more than | ||
943 | * a second since last zap'd) then we zap it out (clear its bits.) | ||
944 | * | ||
945 | * We hold off even calling zlc_setup, until after we've checked the | ||
946 | * first zone in the zonelist, on the theory that most allocations will | ||
947 | * be satisfied from that first zone, so best to examine that zone as | ||
948 | * quickly as we can. | ||
949 | */ | ||
950 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | ||
951 | { | ||
952 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
953 | nodemask_t *allowednodes; /* zonelist_cache approximation */ | ||
954 | |||
955 | zlc = zonelist->zlcache_ptr; | ||
956 | if (!zlc) | ||
957 | return NULL; | ||
958 | |||
959 | if (jiffies - zlc->last_full_zap > 1 * HZ) { | ||
960 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
961 | zlc->last_full_zap = jiffies; | ||
962 | } | ||
963 | |||
964 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | ||
965 | &cpuset_current_mems_allowed : | ||
966 | &node_online_map; | ||
967 | return allowednodes; | ||
968 | } | ||
969 | |||
970 | /* | ||
971 | * Given 'z' scanning a zonelist, run a couple of quick checks to see | ||
972 | * if it is worth looking at further for free memory: | ||
973 | * 1) Check that the zone isn't thought to be full (doesn't have its | ||
974 | * bit set in the zonelist_cache fullzones BITMAP). | ||
975 | * 2) Check that the zones node (obtained from the zonelist_cache | ||
976 | * z_to_n[] mapping) is allowed in the passed in allowednodes mask. | ||
977 | * Return true (non-zero) if zone is worth looking at further, or | ||
978 | * else return false (zero) if it is not. | ||
979 | * | ||
980 | * This check -ignores- the distinction between various watermarks, | ||
981 | * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is | ||
982 | * found to be full for any variation of these watermarks, it will | ||
983 | * be considered full for up to one second by all requests, unless | ||
984 | * we are so low on memory on all allowed nodes that we are forced | ||
985 | * into the second scan of the zonelist. | ||
986 | * | ||
987 | * In the second scan we ignore this zonelist cache and exactly | ||
988 | * apply the watermarks to all zones, even it is slower to do so. | ||
989 | * We are low on memory in the second scan, and should leave no stone | ||
990 | * unturned looking for a free page. | ||
991 | */ | ||
992 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | ||
993 | nodemask_t *allowednodes) | ||
994 | { | ||
995 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
996 | int i; /* index of *z in zonelist zones */ | ||
997 | int n; /* node that zone *z is on */ | ||
998 | |||
999 | zlc = zonelist->zlcache_ptr; | ||
1000 | if (!zlc) | ||
1001 | return 1; | ||
1002 | |||
1003 | i = z - zonelist->zones; | ||
1004 | n = zlc->z_to_n[i]; | ||
1005 | |||
1006 | /* This zone is worth trying if it is allowed but not full */ | ||
1007 | return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); | ||
1008 | } | ||
1009 | |||
928 | /* | 1010 | /* |
929 | * get_page_from_freeliest goes through the zonelist trying to allocate | 1011 | * Given 'z' scanning a zonelist, set the corresponding bit in |
1012 | * zlc->fullzones, so that subsequent attempts to allocate a page | ||
1013 | * from that zone don't waste time re-examining it. | ||
1014 | */ | ||
1015 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | ||
1016 | { | ||
1017 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1018 | int i; /* index of *z in zonelist zones */ | ||
1019 | |||
1020 | zlc = zonelist->zlcache_ptr; | ||
1021 | if (!zlc) | ||
1022 | return; | ||
1023 | |||
1024 | i = z - zonelist->zones; | ||
1025 | |||
1026 | set_bit(i, zlc->fullzones); | ||
1027 | } | ||
1028 | |||
1029 | #else /* CONFIG_NUMA */ | ||
1030 | |||
1031 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | ||
1032 | { | ||
1033 | return NULL; | ||
1034 | } | ||
1035 | |||
1036 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | ||
1037 | nodemask_t *allowednodes) | ||
1038 | { | ||
1039 | return 1; | ||
1040 | } | ||
1041 | |||
1042 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | ||
1043 | { | ||
1044 | } | ||
1045 | #endif /* CONFIG_NUMA */ | ||
1046 | |||
1047 | /* | ||
1048 | * get_page_from_freelist goes through the zonelist trying to allocate | ||
930 | * a page. | 1049 | * a page. |
931 | */ | 1050 | */ |
932 | static struct page * | 1051 | static struct page * |
933 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | 1052 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, |
934 | struct zonelist *zonelist, int alloc_flags) | 1053 | struct zonelist *zonelist, int alloc_flags) |
935 | { | 1054 | { |
936 | struct zone **z = zonelist->zones; | 1055 | struct zone **z; |
937 | struct page *page = NULL; | 1056 | struct page *page = NULL; |
938 | int classzone_idx = zone_idx(*z); | 1057 | int classzone_idx = zone_idx(zonelist->zones[0]); |
939 | struct zone *zone; | 1058 | struct zone *zone; |
1059 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | ||
1060 | int zlc_active = 0; /* set if using zonelist_cache */ | ||
1061 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | ||
940 | 1062 | ||
1063 | zonelist_scan: | ||
941 | /* | 1064 | /* |
942 | * Go through the zonelist once, looking for a zone with enough free. | 1065 | * Scan zonelist, looking for a zone with enough free. |
943 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1066 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
944 | */ | 1067 | */ |
1068 | z = zonelist->zones; | ||
1069 | |||
945 | do { | 1070 | do { |
1071 | if (NUMA_BUILD && zlc_active && | ||
1072 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | ||
1073 | continue; | ||
946 | zone = *z; | 1074 | zone = *z; |
947 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && | 1075 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && |
948 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | 1076 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) |
949 | break; | 1077 | break; |
950 | if ((alloc_flags & ALLOC_CPUSET) && | 1078 | if ((alloc_flags & ALLOC_CPUSET) && |
951 | !cpuset_zone_allowed(zone, gfp_mask)) | 1079 | !cpuset_zone_allowed(zone, gfp_mask)) |
952 | continue; | 1080 | goto try_next_zone; |
953 | 1081 | ||
954 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1082 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
955 | unsigned long mark; | 1083 | unsigned long mark; |
@@ -959,18 +1087,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | |||
959 | mark = zone->pages_low; | 1087 | mark = zone->pages_low; |
960 | else | 1088 | else |
961 | mark = zone->pages_high; | 1089 | mark = zone->pages_high; |
962 | if (!zone_watermark_ok(zone , order, mark, | 1090 | if (!zone_watermark_ok(zone, order, mark, |
963 | classzone_idx, alloc_flags)) | 1091 | classzone_idx, alloc_flags)) { |
964 | if (!zone_reclaim_mode || | 1092 | if (!zone_reclaim_mode || |
965 | !zone_reclaim(zone, gfp_mask, order)) | 1093 | !zone_reclaim(zone, gfp_mask, order)) |
966 | continue; | 1094 | goto this_zone_full; |
1095 | } | ||
967 | } | 1096 | } |
968 | 1097 | ||
969 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); | 1098 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); |
970 | if (page) { | 1099 | if (page) |
971 | break; | 1100 | break; |
1101 | this_zone_full: | ||
1102 | if (NUMA_BUILD) | ||
1103 | zlc_mark_zone_full(zonelist, z); | ||
1104 | try_next_zone: | ||
1105 | if (NUMA_BUILD && !did_zlc_setup) { | ||
1106 | /* we do zlc_setup after the first zone is tried */ | ||
1107 | allowednodes = zlc_setup(zonelist, alloc_flags); | ||
1108 | zlc_active = 1; | ||
1109 | did_zlc_setup = 1; | ||
972 | } | 1110 | } |
973 | } while (*(++z) != NULL); | 1111 | } while (*(++z) != NULL); |
1112 | |||
1113 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | ||
1114 | /* Disable zlc cache for second zonelist scan */ | ||
1115 | zlc_active = 0; | ||
1116 | goto zonelist_scan; | ||
1117 | } | ||
974 | return page; | 1118 | return page; |
975 | } | 1119 | } |
976 | 1120 | ||
@@ -1005,9 +1149,19 @@ restart: | |||
1005 | if (page) | 1149 | if (page) |
1006 | goto got_pg; | 1150 | goto got_pg; |
1007 | 1151 | ||
1008 | do { | 1152 | /* |
1153 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | ||
1154 | * __GFP_NOWARN set) should not cause reclaim since the subsystem | ||
1155 | * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim | ||
1156 | * using a larger set of nodes after it has established that the | ||
1157 | * allowed per node queues are empty and that nodes are | ||
1158 | * over allocated. | ||
1159 | */ | ||
1160 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | ||
1161 | goto nopage; | ||
1162 | |||
1163 | for (z = zonelist->zones; *z; z++) | ||
1009 | wakeup_kswapd(*z, order); | 1164 | wakeup_kswapd(*z, order); |
1010 | } while (*(++z)); | ||
1011 | 1165 | ||
1012 | /* | 1166 | /* |
1013 | * OK, we're below the kswapd watermark and have kicked background | 1167 | * OK, we're below the kswapd watermark and have kicked background |
@@ -1041,6 +1195,7 @@ restart: | |||
1041 | 1195 | ||
1042 | /* This allocation should allow future memory freeing. */ | 1196 | /* This allocation should allow future memory freeing. */ |
1043 | 1197 | ||
1198 | rebalance: | ||
1044 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 1199 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) |
1045 | && !in_interrupt()) { | 1200 | && !in_interrupt()) { |
1046 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1201 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
@@ -1062,7 +1217,6 @@ nofail_alloc: | |||
1062 | if (!wait) | 1217 | if (!wait) |
1063 | goto nopage; | 1218 | goto nopage; |
1064 | 1219 | ||
1065 | rebalance: | ||
1066 | cond_resched(); | 1220 | cond_resched(); |
1067 | 1221 | ||
1068 | /* We now go into synchronous reclaim */ | 1222 | /* We now go into synchronous reclaim */ |
@@ -1262,7 +1416,7 @@ unsigned int nr_free_pagecache_pages(void) | |||
1262 | static inline void show_node(struct zone *zone) | 1416 | static inline void show_node(struct zone *zone) |
1263 | { | 1417 | { |
1264 | if (NUMA_BUILD) | 1418 | if (NUMA_BUILD) |
1265 | printk("Node %ld ", zone_to_nid(zone)); | 1419 | printk("Node %d ", zone_to_nid(zone)); |
1266 | } | 1420 | } |
1267 | 1421 | ||
1268 | void si_meminfo(struct sysinfo *val) | 1422 | void si_meminfo(struct sysinfo *val) |
@@ -1542,6 +1696,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1542 | } | 1696 | } |
1543 | } | 1697 | } |
1544 | 1698 | ||
1699 | /* Construct the zonelist performance cache - see further mmzone.h */ | ||
1700 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | ||
1701 | { | ||
1702 | int i; | ||
1703 | |||
1704 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1705 | struct zonelist *zonelist; | ||
1706 | struct zonelist_cache *zlc; | ||
1707 | struct zone **z; | ||
1708 | |||
1709 | zonelist = pgdat->node_zonelists + i; | ||
1710 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; | ||
1711 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
1712 | for (z = zonelist->zones; *z; z++) | ||
1713 | zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); | ||
1714 | } | ||
1715 | } | ||
1716 | |||
1545 | #else /* CONFIG_NUMA */ | 1717 | #else /* CONFIG_NUMA */ |
1546 | 1718 | ||
1547 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1719 | static void __meminit build_zonelists(pg_data_t *pgdat) |
@@ -1579,14 +1751,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1579 | } | 1751 | } |
1580 | } | 1752 | } |
1581 | 1753 | ||
1754 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ | ||
1755 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | ||
1756 | { | ||
1757 | int i; | ||
1758 | |||
1759 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
1760 | pgdat->node_zonelists[i].zlcache_ptr = NULL; | ||
1761 | } | ||
1762 | |||
1582 | #endif /* CONFIG_NUMA */ | 1763 | #endif /* CONFIG_NUMA */ |
1583 | 1764 | ||
1584 | /* return values int ....just for stop_machine_run() */ | 1765 | /* return values int ....just for stop_machine_run() */ |
1585 | static int __meminit __build_all_zonelists(void *dummy) | 1766 | static int __meminit __build_all_zonelists(void *dummy) |
1586 | { | 1767 | { |
1587 | int nid; | 1768 | int nid; |
1588 | for_each_online_node(nid) | 1769 | |
1770 | for_each_online_node(nid) { | ||
1589 | build_zonelists(NODE_DATA(nid)); | 1771 | build_zonelists(NODE_DATA(nid)); |
1772 | build_zonelist_cache(NODE_DATA(nid)); | ||
1773 | } | ||
1590 | return 0; | 1774 | return 0; |
1591 | } | 1775 | } |
1592 | 1776 | ||
@@ -1715,20 +1899,6 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | |||
1715 | } | 1899 | } |
1716 | } | 1900 | } |
1717 | 1901 | ||
1718 | #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) | ||
1719 | void zonetable_add(struct zone *zone, int nid, enum zone_type zid, | ||
1720 | unsigned long pfn, unsigned long size) | ||
1721 | { | ||
1722 | unsigned long snum = pfn_to_section_nr(pfn); | ||
1723 | unsigned long end = pfn_to_section_nr(pfn + size); | ||
1724 | |||
1725 | if (FLAGS_HAS_NODE) | ||
1726 | zone_table[ZONETABLE_INDEX(nid, zid)] = zone; | ||
1727 | else | ||
1728 | for (; snum <= end; snum++) | ||
1729 | zone_table[ZONETABLE_INDEX(snum, zid)] = zone; | ||
1730 | } | ||
1731 | |||
1732 | #ifndef __HAVE_ARCH_MEMMAP_INIT | 1902 | #ifndef __HAVE_ARCH_MEMMAP_INIT |
1733 | #define memmap_init(size, nid, zone, start_pfn) \ | 1903 | #define memmap_init(size, nid, zone, start_pfn) \ |
1734 | memmap_init_zone((size), (nid), (zone), (start_pfn)) | 1904 | memmap_init_zone((size), (nid), (zone), (start_pfn)) |
@@ -1881,16 +2051,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | |||
1881 | int ret = NOTIFY_OK; | 2051 | int ret = NOTIFY_OK; |
1882 | 2052 | ||
1883 | switch (action) { | 2053 | switch (action) { |
1884 | case CPU_UP_PREPARE: | 2054 | case CPU_UP_PREPARE: |
1885 | if (process_zones(cpu)) | 2055 | if (process_zones(cpu)) |
1886 | ret = NOTIFY_BAD; | 2056 | ret = NOTIFY_BAD; |
1887 | break; | 2057 | break; |
1888 | case CPU_UP_CANCELED: | 2058 | case CPU_UP_CANCELED: |
1889 | case CPU_DEAD: | 2059 | case CPU_DEAD: |
1890 | free_zone_pagesets(cpu); | 2060 | free_zone_pagesets(cpu); |
1891 | break; | 2061 | break; |
1892 | default: | 2062 | default: |
1893 | break; | 2063 | break; |
1894 | } | 2064 | } |
1895 | return ret; | 2065 | return ret; |
1896 | } | 2066 | } |
@@ -2421,7 +2591,6 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2421 | if (!size) | 2591 | if (!size) |
2422 | continue; | 2592 | continue; |
2423 | 2593 | ||
2424 | zonetable_add(zone, nid, j, zone_start_pfn, size); | ||
2425 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); | 2594 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); |
2426 | BUG_ON(ret); | 2595 | BUG_ON(ret); |
2427 | zone_start_pfn += size; | 2596 | zone_start_pfn += size; |
@@ -2736,7 +2905,6 @@ void __init free_area_init(unsigned long *zones_size) | |||
2736 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 2905 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
2737 | } | 2906 | } |
2738 | 2907 | ||
2739 | #ifdef CONFIG_HOTPLUG_CPU | ||
2740 | static int page_alloc_cpu_notify(struct notifier_block *self, | 2908 | static int page_alloc_cpu_notify(struct notifier_block *self, |
2741 | unsigned long action, void *hcpu) | 2909 | unsigned long action, void *hcpu) |
2742 | { | 2910 | { |
@@ -2751,7 +2919,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self, | |||
2751 | } | 2919 | } |
2752 | return NOTIFY_OK; | 2920 | return NOTIFY_OK; |
2753 | } | 2921 | } |
2754 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
2755 | 2922 | ||
2756 | void __init page_alloc_init(void) | 2923 | void __init page_alloc_init(void) |
2757 | { | 2924 | { |
@@ -3055,7 +3222,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
3055 | /* allow the kernel cmdline to have a say */ | 3222 | /* allow the kernel cmdline to have a say */ |
3056 | if (!numentries) { | 3223 | if (!numentries) { |
3057 | /* round applicable memory size up to nearest megabyte */ | 3224 | /* round applicable memory size up to nearest megabyte */ |
3058 | numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; | 3225 | numentries = nr_kernel_pages; |
3059 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; | 3226 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; |
3060 | numentries >>= 20 - PAGE_SHIFT; | 3227 | numentries >>= 20 - PAGE_SHIFT; |
3061 | numentries <<= 20 - PAGE_SHIFT; | 3228 | numentries <<= 20 - PAGE_SHIFT; |
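
The bulk of the page_alloc.c changes add the zonelist cache: a per-zonelist bitmap of zones recently found full (zapped roughly once a second in zlc_setup()) plus a zone-to-node table checked against the allowed-node mask, so get_page_from_freelist() can skip hopeless zones cheaply and only falls back to a full second scan when the cached scan finds nothing. A rough userspace sketch of the "remember full zones for about a second" idea; the data structures here are invented for illustration and are not the kernel's:

    #include <stdio.h>
    #include <string.h>
    #include <time.h>

    #define NZONES 4

    struct zone_cache {
        unsigned char full[NZONES];     /* stands in for the zlc fullzones bitmap */
        time_t last_zap;
    };

    static struct zone_cache cache;

    /* Forget stale "full" verdicts after about a second, like zlc_setup(). */
    static void cache_maybe_zap(void)
    {
        time_t now = time(NULL);

        if (now - cache.last_zap >= 1) {
            memset(cache.full, 0, sizeof(cache.full));
            cache.last_zap = now;
        }
    }

    static int zone_worth_trying(int zone)
    {
        return !cache.full[zone];       /* zlc_zone_worth_trying() analogue */
    }

    static void mark_zone_full(int zone)
    {
        cache.full[zone] = 1;           /* zlc_mark_zone_full() analogue */
    }

    /* Pretend only zone 3 has free memory. */
    static int try_alloc_from(int zone)
    {
        return zone == 3;
    }

    int main(void)
    {
        int zone, pass;

        cache.last_zap = time(NULL);
        for (pass = 0; pass < 2; pass++) {
            cache_maybe_zap();
            for (zone = 0; zone < NZONES; zone++) {
                if (!zone_worth_trying(zone)) {
                    printf("pass %d: skip cached-full zone %d\n", pass, zone);
                    continue;
                }
                if (try_alloc_from(zone)) {
                    printf("pass %d: allocated from zone %d\n", pass, zone);
                    break;
                }
                mark_zone_full(zone);
            }
        }
        return 0;
    }

The real code also consults the allowed-node mask from zlc_setup() and retries once with the cache disabled, details this sketch leaves out.
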
diff --git a/mm/page_io.c b/mm/page_io.c
index d4840ecbf8f9..dbffec0d78c9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -147,48 +147,3 @@ int swap_readpage(struct file *file, struct page *page) | |||
147 | out: | 147 | out: |
148 | return ret; | 148 | return ret; |
149 | } | 149 | } |
150 | |||
151 | #ifdef CONFIG_SOFTWARE_SUSPEND | ||
152 | /* | ||
153 | * A scruffy utility function to read or write an arbitrary swap page | ||
154 | * and wait on the I/O. The caller must have a ref on the page. | ||
155 | * | ||
156 | * We use end_swap_bio_read() even for writes, because it happens to do what | ||
157 | * we want. | ||
158 | */ | ||
159 | int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page, | ||
160 | struct bio **bio_chain) | ||
161 | { | ||
162 | struct bio *bio; | ||
163 | int ret = 0; | ||
164 | int bio_rw; | ||
165 | |||
166 | lock_page(page); | ||
167 | |||
168 | bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read); | ||
169 | if (bio == NULL) { | ||
170 | unlock_page(page); | ||
171 | ret = -ENOMEM; | ||
172 | goto out; | ||
173 | } | ||
174 | |||
175 | bio_rw = rw; | ||
176 | if (!bio_chain) | ||
177 | bio_rw |= (1 << BIO_RW_SYNC); | ||
178 | if (bio_chain) | ||
179 | bio_get(bio); | ||
180 | submit_bio(bio_rw, bio); | ||
181 | if (bio_chain == NULL) { | ||
182 | wait_on_page_locked(page); | ||
183 | |||
184 | if (!PageUptodate(page) || PageError(page)) | ||
185 | ret = -EIO; | ||
186 | } | ||
187 | if (bio_chain) { | ||
188 | bio->bi_private = *bio_chain; | ||
189 | *bio_chain = bio; | ||
190 | } | ||
191 | out: | ||
192 | return ret; | ||
193 | } | ||
194 | #endif | ||
diff --git a/mm/pdflush.c b/mm/pdflush.c
index b02102feeb4b..8ce0900dc95c 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/writeback.h> // Prototypes pdflush_operation() | 21 | #include <linux/writeback.h> // Prototypes pdflush_operation() |
22 | #include <linux/kthread.h> | 22 | #include <linux/kthread.h> |
23 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
24 | #include <linux/freezer.h> | ||
24 | 25 | ||
25 | 26 | ||
26 | /* | 27 | /* |
diff --git a/mm/readahead.c b/mm/readahead.c
index 23cb61a01c6e..a386f2b6b335 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -148,13 +148,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, | |||
148 | if (!pagevec_add(&lru_pvec, page)) | 148 | if (!pagevec_add(&lru_pvec, page)) |
149 | __pagevec_lru_add(&lru_pvec); | 149 | __pagevec_lru_add(&lru_pvec); |
150 | if (ret) { | 150 | if (ret) { |
151 | while (!list_empty(pages)) { | 151 | put_pages_list(pages); |
152 | struct page *victim; | ||
153 | |||
154 | victim = list_to_page(pages); | ||
155 | list_del(&victim->lru); | ||
156 | page_cache_release(victim); | ||
157 | } | ||
158 | break; | 152 | break; |
159 | } | 153 | } |
160 | } | 154 | } |
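
The read_cache_pages() hunk replaces an open-coded "pop each remaining page and release it" loop with a single put_pages_list() call. A userspace sketch of the same drain-the-leftovers helper over a singly linked list; the node type and names are illustrative:

    #include <stdio.h>
    #include <stdlib.h>

    struct node {
        struct node *next;
        int id;
    };

    /* Release every entry left on the list, emptying it - the analogue of
     * replacing the open-coded loop above with put_pages_list(). */
    static void put_node_list(struct node **head)
    {
        while (*head) {
            struct node *victim = *head;

            *head = victim->next;
            printf("releasing node %d\n", victim->id);
            free(victim);
        }
    }

    int main(void)
    {
        struct node *head = NULL;
        int i;

        for (i = 0; i < 3; i++) {
            struct node *n = malloc(sizeof(*n));

            if (!n)
                break;
            n->id = i;
            n->next = head;
            head = n;
        }
        put_node_list(&head);   /* error path: drop whatever was not consumed */
        return 0;
    }
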
diff --git a/mm/shmem.c b/mm/shmem.c
index 4959535fc14c..c820b4f77b8d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -177,7 +177,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) | |||
177 | 177 | ||
178 | static struct super_operations shmem_ops; | 178 | static struct super_operations shmem_ops; |
179 | static const struct address_space_operations shmem_aops; | 179 | static const struct address_space_operations shmem_aops; |
180 | static struct file_operations shmem_file_operations; | 180 | static const struct file_operations shmem_file_operations; |
181 | static struct inode_operations shmem_inode_operations; | 181 | static struct inode_operations shmem_inode_operations; |
182 | static struct inode_operations shmem_dir_inode_operations; | 182 | static struct inode_operations shmem_dir_inode_operations; |
183 | static struct inode_operations shmem_special_inode_operations; | 183 | static struct inode_operations shmem_special_inode_operations; |
@@ -1943,7 +1943,7 @@ static int shmem_xattr_security_set(struct inode *inode, const char *name, | |||
1943 | return security_inode_setsecurity(inode, name, value, size, flags); | 1943 | return security_inode_setsecurity(inode, name, value, size, flags); |
1944 | } | 1944 | } |
1945 | 1945 | ||
1946 | struct xattr_handler shmem_xattr_security_handler = { | 1946 | static struct xattr_handler shmem_xattr_security_handler = { |
1947 | .prefix = XATTR_SECURITY_PREFIX, | 1947 | .prefix = XATTR_SECURITY_PREFIX, |
1948 | .list = shmem_xattr_security_list, | 1948 | .list = shmem_xattr_security_list, |
1949 | .get = shmem_xattr_security_get, | 1949 | .get = shmem_xattr_security_get, |
@@ -2263,7 +2263,7 @@ static struct kmem_cache *shmem_inode_cachep; | |||
2263 | static struct inode *shmem_alloc_inode(struct super_block *sb) | 2263 | static struct inode *shmem_alloc_inode(struct super_block *sb) |
2264 | { | 2264 | { |
2265 | struct shmem_inode_info *p; | 2265 | struct shmem_inode_info *p; |
2266 | p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL); | 2266 | p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); |
2267 | if (!p) | 2267 | if (!p) |
2268 | return NULL; | 2268 | return NULL; |
2269 | return &p->vfs_inode; | 2269 | return &p->vfs_inode; |
@@ -2319,7 +2319,7 @@ static const struct address_space_operations shmem_aops = { | |||
2319 | .migratepage = migrate_page, | 2319 | .migratepage = migrate_page, |
2320 | }; | 2320 | }; |
2321 | 2321 | ||
2322 | static struct file_operations shmem_file_operations = { | 2322 | static const struct file_operations shmem_file_operations = { |
2323 | .mmap = shmem_mmap, | 2323 | .mmap = shmem_mmap, |
2324 | #ifdef CONFIG_TMPFS | 2324 | #ifdef CONFIG_TMPFS |
2325 | .llseek = generic_file_llseek, | 2325 | .llseek = generic_file_llseek, |
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -103,12 +103,12 @@
103 | #include <linux/module.h> | 103 | #include <linux/module.h> |
104 | #include <linux/rcupdate.h> | 104 | #include <linux/rcupdate.h> |
105 | #include <linux/string.h> | 105 | #include <linux/string.h> |
106 | #include <linux/uaccess.h> | ||
106 | #include <linux/nodemask.h> | 107 | #include <linux/nodemask.h> |
107 | #include <linux/mempolicy.h> | 108 | #include <linux/mempolicy.h> |
108 | #include <linux/mutex.h> | 109 | #include <linux/mutex.h> |
109 | #include <linux/rtmutex.h> | 110 | #include <linux/rtmutex.h> |
110 | 111 | ||
111 | #include <asm/uaccess.h> | ||
112 | #include <asm/cacheflush.h> | 112 | #include <asm/cacheflush.h> |
113 | #include <asm/tlbflush.h> | 113 | #include <asm/tlbflush.h> |
114 | #include <asm/page.h> | 114 | #include <asm/page.h> |
@@ -730,7 +730,10 @@ static inline void init_lock_keys(void) | |||
730 | } | 730 | } |
731 | #endif | 731 | #endif |
732 | 732 | ||
733 | /* Guard access to the cache-chain. */ | 733 | /* |
734 | * 1. Guard access to the cache-chain. | ||
735 | * 2. Protect sanity of cpu_online_map against cpu hotplug events | ||
736 | */ | ||
734 | static DEFINE_MUTEX(cache_chain_mutex); | 737 | static DEFINE_MUTEX(cache_chain_mutex); |
735 | static struct list_head cache_chain; | 738 | static struct list_head cache_chain; |
736 | 739 | ||
@@ -866,6 +869,22 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, | |||
866 | dump_stack(); | 869 | dump_stack(); |
867 | } | 870 | } |
868 | 871 | ||
872 | /* | ||
873 | * By default on NUMA we use alien caches to stage the freeing of | ||
874 | * objects allocated from other nodes. This causes massive memory | ||
875 | * inefficiencies when using fake NUMA setup to split memory into a | ||
876 | * large number of small nodes, so it can be disabled on the command | ||
877 | * line | ||
878 | */ | ||
879 | |||
880 | static int use_alien_caches __read_mostly = 1; | ||
881 | static int __init noaliencache_setup(char *s) | ||
882 | { | ||
883 | use_alien_caches = 0; | ||
884 | return 1; | ||
885 | } | ||
886 | __setup("noaliencache", noaliencache_setup); | ||
887 | |||
869 | #ifdef CONFIG_NUMA | 888 | #ifdef CONFIG_NUMA |
870 | /* | 889 | /* |
871 | * Special reaping functions for NUMA systems called from cache_reap(). | 890 | * Special reaping functions for NUMA systems called from cache_reap(). |
@@ -996,7 +1015,7 @@ static inline void *alternate_node_alloc(struct kmem_cache *cachep, | |||
996 | return NULL; | 1015 | return NULL; |
997 | } | 1016 | } |
998 | 1017 | ||
999 | static inline void *__cache_alloc_node(struct kmem_cache *cachep, | 1018 | static inline void *____cache_alloc_node(struct kmem_cache *cachep, |
1000 | gfp_t flags, int nodeid) | 1019 | gfp_t flags, int nodeid) |
1001 | { | 1020 | { |
1002 | return NULL; | 1021 | return NULL; |
@@ -1004,7 +1023,7 @@ static inline void *__cache_alloc_node(struct kmem_cache *cachep, | |||
1004 | 1023 | ||
1005 | #else /* CONFIG_NUMA */ | 1024 | #else /* CONFIG_NUMA */ |
1006 | 1025 | ||
1007 | static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); | 1026 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); |
1008 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | 1027 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); |
1009 | 1028 | ||
1010 | static struct array_cache **alloc_alien_cache(int node, int limit) | 1029 | static struct array_cache **alloc_alien_cache(int node, int limit) |
@@ -1114,7 +1133,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1114 | * Make sure we are not freeing a object from another node to the array | 1133 | * Make sure we are not freeing a object from another node to the array |
1115 | * cache on this cpu. | 1134 | * cache on this cpu. |
1116 | */ | 1135 | */ |
1117 | if (likely(slabp->nodeid == node)) | 1136 | if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches)) |
1118 | return 0; | 1137 | return 0; |
1119 | 1138 | ||
1120 | l3 = cachep->nodelists[node]; | 1139 | l3 = cachep->nodelists[node]; |
@@ -1192,7 +1211,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1192 | list_for_each_entry(cachep, &cache_chain, next) { | 1211 | list_for_each_entry(cachep, &cache_chain, next) { |
1193 | struct array_cache *nc; | 1212 | struct array_cache *nc; |
1194 | struct array_cache *shared; | 1213 | struct array_cache *shared; |
1195 | struct array_cache **alien; | 1214 | struct array_cache **alien = NULL; |
1196 | 1215 | ||
1197 | nc = alloc_arraycache(node, cachep->limit, | 1216 | nc = alloc_arraycache(node, cachep->limit, |
1198 | cachep->batchcount); | 1217 | cachep->batchcount); |
@@ -1204,9 +1223,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1204 | if (!shared) | 1223 | if (!shared) |
1205 | goto bad; | 1224 | goto bad; |
1206 | 1225 | ||
1207 | alien = alloc_alien_cache(node, cachep->limit); | 1226 | if (use_alien_caches) { |
1208 | if (!alien) | 1227 | alien = alloc_alien_cache(node, cachep->limit); |
1209 | goto bad; | 1228 | if (!alien) |
1229 | goto bad; | ||
1230 | } | ||
1210 | cachep->array[cpu] = nc; | 1231 | cachep->array[cpu] = nc; |
1211 | l3 = cachep->nodelists[node]; | 1232 | l3 = cachep->nodelists[node]; |
1212 | BUG_ON(!l3); | 1233 | BUG_ON(!l3); |
@@ -1230,12 +1251,18 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1230 | kfree(shared); | 1251 | kfree(shared); |
1231 | free_alien_cache(alien); | 1252 | free_alien_cache(alien); |
1232 | } | 1253 | } |
1233 | mutex_unlock(&cache_chain_mutex); | ||
1234 | break; | 1254 | break; |
1235 | case CPU_ONLINE: | 1255 | case CPU_ONLINE: |
1256 | mutex_unlock(&cache_chain_mutex); | ||
1236 | start_cpu_timer(cpu); | 1257 | start_cpu_timer(cpu); |
1237 | break; | 1258 | break; |
1238 | #ifdef CONFIG_HOTPLUG_CPU | 1259 | #ifdef CONFIG_HOTPLUG_CPU |
1260 | case CPU_DOWN_PREPARE: | ||
1261 | mutex_lock(&cache_chain_mutex); | ||
1262 | break; | ||
1263 | case CPU_DOWN_FAILED: | ||
1264 | mutex_unlock(&cache_chain_mutex); | ||
1265 | break; | ||
1239 | case CPU_DEAD: | 1266 | case CPU_DEAD: |
1240 | /* | 1267 | /* |
1241 | * Even if all the cpus of a node are down, we don't free the | 1268 | * Even if all the cpus of a node are down, we don't free the |
@@ -1246,8 +1273,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1246 | * gets destroyed at kmem_cache_destroy(). | 1273 | * gets destroyed at kmem_cache_destroy(). |
1247 | */ | 1274 | */ |
1248 | /* fall thru */ | 1275 | /* fall thru */ |
1276 | #endif | ||
1249 | case CPU_UP_CANCELED: | 1277 | case CPU_UP_CANCELED: |
1250 | mutex_lock(&cache_chain_mutex); | ||
1251 | list_for_each_entry(cachep, &cache_chain, next) { | 1278 | list_for_each_entry(cachep, &cache_chain, next) { |
1252 | struct array_cache *nc; | 1279 | struct array_cache *nc; |
1253 | struct array_cache *shared; | 1280 | struct array_cache *shared; |
@@ -1308,11 +1335,9 @@ free_array_cache: | |||
1308 | } | 1335 | } |
1309 | mutex_unlock(&cache_chain_mutex); | 1336 | mutex_unlock(&cache_chain_mutex); |
1310 | break; | 1337 | break; |
1311 | #endif | ||
1312 | } | 1338 | } |
1313 | return NOTIFY_OK; | 1339 | return NOTIFY_OK; |
1314 | bad: | 1340 | bad: |
1315 | mutex_unlock(&cache_chain_mutex); | ||
1316 | return NOTIFY_BAD; | 1341 | return NOTIFY_BAD; |
1317 | } | 1342 | } |
1318 | 1343 | ||
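Taken together, the cpuup_callback() changes make cache_chain_mutex bracket each CPU transition instead of being taken and dropped inside individual cases, which is what later allows kmem_cache_create() and kmem_cache_destroy() to rely on it instead of lock_cpu_hotplug(). A condensed sketch of the resulting pairing, not the literal kernel code:

/* How the notifier now pairs lock and unlock across hotplug events. */
switch (action) {
case CPU_UP_PREPARE:            /* taken before the CPU arrives...   */
case CPU_DOWN_PREPARE:          /* ...or before it starts to go away */
        mutex_lock(&cache_chain_mutex);
        break;
case CPU_ONLINE:                /* transition completed              */
case CPU_DOWN_FAILED:           /* transition aborted                */
        mutex_unlock(&cache_chain_mutex);
        break;
case CPU_UP_CANCELED:
case CPU_DEAD:
        /* tear down the per-cpu caches, then drop the mutex */
        mutex_unlock(&cache_chain_mutex);
        break;
}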
@@ -1580,12 +1605,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1580 | flags |= __GFP_COMP; | 1605 | flags |= __GFP_COMP; |
1581 | #endif | 1606 | #endif |
1582 | 1607 | ||
1583 | /* | 1608 | flags |= cachep->gfpflags; |
1584 | * Under NUMA we want memory on the indicated node. We will handle | ||
1585 | * the needed fallback ourselves since we want to serve from our | ||
1586 | * per node object lists first for other nodes. | ||
1587 | */ | ||
1588 | flags |= cachep->gfpflags | GFP_THISNODE; | ||
1589 | 1609 | ||
1590 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1610 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); |
1591 | if (!page) | 1611 | if (!page) |
@@ -2098,15 +2118,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2098 | } | 2118 | } |
2099 | 2119 | ||
2100 | /* | 2120 | /* |
2101 | * Prevent CPUs from coming and going. | 2121 | * We use cache_chain_mutex to ensure a consistent view of |
2102 | * lock_cpu_hotplug() nests outside cache_chain_mutex | 2122 | * cpu_online_map as well. Please see cpuup_callback |
2103 | */ | 2123 | */ |
2104 | lock_cpu_hotplug(); | ||
2105 | |||
2106 | mutex_lock(&cache_chain_mutex); | 2124 | mutex_lock(&cache_chain_mutex); |
2107 | 2125 | ||
2108 | list_for_each_entry(pc, &cache_chain, next) { | 2126 | list_for_each_entry(pc, &cache_chain, next) { |
2109 | mm_segment_t old_fs = get_fs(); | ||
2110 | char tmp; | 2127 | char tmp; |
2111 | int res; | 2128 | int res; |
2112 | 2129 | ||
@@ -2115,9 +2132,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2115 | * destroy its slab cache and no-one else reuses the vmalloc | 2132 | * destroy its slab cache and no-one else reuses the vmalloc |
2116 | * area of the module. Print a warning. | 2133 | * area of the module. Print a warning. |
2117 | */ | 2134 | */ |
2118 | set_fs(KERNEL_DS); | 2135 | res = probe_kernel_address(pc->name, tmp); |
2119 | res = __get_user(tmp, pc->name); | ||
2120 | set_fs(old_fs); | ||
2121 | if (res) { | 2136 | if (res) { |
2122 | printk("SLAB: cache with size %d has lost its name\n", | 2137 | printk("SLAB: cache with size %d has lost its name\n", |
2123 | pc->buffer_size); | 2138 | pc->buffer_size); |
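probe_kernel_address() (hence the <linux/uaccess.h> include added above) does a pagefault-safe read of a kernel address and returns non-zero on a fault, so the explicit address-limit juggling can go. A before/after sketch of the equivalent code:

/* Before: widen the address limit so __get_user() may touch a kernel
 * pointer whose backing module may already have been unloaded.
 */
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
res = __get_user(tmp, pc->name);
set_fs(old_fs);

/* After: one helper with the same 0-on-success, -EFAULT-on-fault contract. */
res = probe_kernel_address(pc->name, tmp);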
@@ -2197,25 +2212,24 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2197 | if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) | 2212 | if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) |
2198 | ralign = BYTES_PER_WORD; | 2213 | ralign = BYTES_PER_WORD; |
2199 | 2214 | ||
2200 | /* 2) arch mandated alignment: disables debug if necessary */ | 2215 | /* 2) arch mandated alignment */ |
2201 | if (ralign < ARCH_SLAB_MINALIGN) { | 2216 | if (ralign < ARCH_SLAB_MINALIGN) { |
2202 | ralign = ARCH_SLAB_MINALIGN; | 2217 | ralign = ARCH_SLAB_MINALIGN; |
2203 | if (ralign > BYTES_PER_WORD) | ||
2204 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | ||
2205 | } | 2218 | } |
2206 | /* 3) caller mandated alignment: disables debug if necessary */ | 2219 | /* 3) caller mandated alignment */ |
2207 | if (ralign < align) { | 2220 | if (ralign < align) { |
2208 | ralign = align; | 2221 | ralign = align; |
2209 | if (ralign > BYTES_PER_WORD) | ||
2210 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | ||
2211 | } | 2222 | } |
2223 | /* disable debug if necessary */ | ||
2224 | if (ralign > BYTES_PER_WORD) | ||
2225 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | ||
2212 | /* | 2226 | /* |
2213 | * 4) Store it. | 2227 | * 4) Store it. |
2214 | */ | 2228 | */ |
2215 | align = ralign; | 2229 | align = ralign; |
2216 | 2230 | ||
2217 | /* Get cache's description obj. */ | 2231 | /* Get cache's description obj. */ |
2218 | cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); | 2232 | cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); |
2219 | if (!cachep) | 2233 | if (!cachep) |
2220 | goto oops; | 2234 | goto oops; |
2221 | 2235 | ||
@@ -2326,7 +2340,6 @@ oops: | |||
2326 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 2340 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
2327 | name); | 2341 | name); |
2328 | mutex_unlock(&cache_chain_mutex); | 2342 | mutex_unlock(&cache_chain_mutex); |
2329 | unlock_cpu_hotplug(); | ||
2330 | return cachep; | 2343 | return cachep; |
2331 | } | 2344 | } |
2332 | EXPORT_SYMBOL(kmem_cache_create); | 2345 | EXPORT_SYMBOL(kmem_cache_create); |
@@ -2444,6 +2457,7 @@ out: | |||
2444 | return nr_freed; | 2457 | return nr_freed; |
2445 | } | 2458 | } |
2446 | 2459 | ||
2460 | /* Called with cache_chain_mutex held to protect against cpu hotplug */ | ||
2447 | static int __cache_shrink(struct kmem_cache *cachep) | 2461 | static int __cache_shrink(struct kmem_cache *cachep) |
2448 | { | 2462 | { |
2449 | int ret = 0, i = 0; | 2463 | int ret = 0, i = 0; |
@@ -2474,9 +2488,13 @@ static int __cache_shrink(struct kmem_cache *cachep) | |||
2474 | */ | 2488 | */ |
2475 | int kmem_cache_shrink(struct kmem_cache *cachep) | 2489 | int kmem_cache_shrink(struct kmem_cache *cachep) |
2476 | { | 2490 | { |
2491 | int ret; | ||
2477 | BUG_ON(!cachep || in_interrupt()); | 2492 | BUG_ON(!cachep || in_interrupt()); |
2478 | 2493 | ||
2479 | return __cache_shrink(cachep); | 2494 | mutex_lock(&cache_chain_mutex); |
2495 | ret = __cache_shrink(cachep); | ||
2496 | mutex_unlock(&cache_chain_mutex); | ||
2497 | return ret; | ||
2480 | } | 2498 | } |
2481 | EXPORT_SYMBOL(kmem_cache_shrink); | 2499 | EXPORT_SYMBOL(kmem_cache_shrink); |
2482 | 2500 | ||
@@ -2500,23 +2518,16 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
2500 | { | 2518 | { |
2501 | BUG_ON(!cachep || in_interrupt()); | 2519 | BUG_ON(!cachep || in_interrupt()); |
2502 | 2520 | ||
2503 | /* Don't let CPUs to come and go */ | ||
2504 | lock_cpu_hotplug(); | ||
2505 | |||
2506 | /* Find the cache in the chain of caches. */ | 2521 | /* Find the cache in the chain of caches. */ |
2507 | mutex_lock(&cache_chain_mutex); | 2522 | mutex_lock(&cache_chain_mutex); |
2508 | /* | 2523 | /* |
2509 | * the chain is never empty, cache_cache is never destroyed | 2524 | * the chain is never empty, cache_cache is never destroyed |
2510 | */ | 2525 | */ |
2511 | list_del(&cachep->next); | 2526 | list_del(&cachep->next); |
2512 | mutex_unlock(&cache_chain_mutex); | ||
2513 | |||
2514 | if (__cache_shrink(cachep)) { | 2527 | if (__cache_shrink(cachep)) { |
2515 | slab_error(cachep, "Can't free all objects"); | 2528 | slab_error(cachep, "Can't free all objects"); |
2516 | mutex_lock(&cache_chain_mutex); | ||
2517 | list_add(&cachep->next, &cache_chain); | 2529 | list_add(&cachep->next, &cache_chain); |
2518 | mutex_unlock(&cache_chain_mutex); | 2530 | mutex_unlock(&cache_chain_mutex); |
2519 | unlock_cpu_hotplug(); | ||
2520 | return; | 2531 | return; |
2521 | } | 2532 | } |
2522 | 2533 | ||
@@ -2524,7 +2535,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
2524 | synchronize_rcu(); | 2535 | synchronize_rcu(); |
2525 | 2536 | ||
2526 | __kmem_cache_destroy(cachep); | 2537 | __kmem_cache_destroy(cachep); |
2527 | unlock_cpu_hotplug(); | 2538 | mutex_unlock(&cache_chain_mutex); |
2528 | } | 2539 | } |
2529 | EXPORT_SYMBOL(kmem_cache_destroy); | 2540 | EXPORT_SYMBOL(kmem_cache_destroy); |
2530 | 2541 | ||
@@ -2548,7 +2559,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
2548 | if (OFF_SLAB(cachep)) { | 2559 | if (OFF_SLAB(cachep)) { |
2549 | /* Slab management obj is off-slab. */ | 2560 | /* Slab management obj is off-slab. */ |
2550 | slabp = kmem_cache_alloc_node(cachep->slabp_cache, | 2561 | slabp = kmem_cache_alloc_node(cachep->slabp_cache, |
2551 | local_flags, nodeid); | 2562 | local_flags & ~GFP_THISNODE, nodeid); |
2552 | if (!slabp) | 2563 | if (!slabp) |
2553 | return NULL; | 2564 | return NULL; |
2554 | } else { | 2565 | } else { |
@@ -2618,7 +2629,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2618 | 2629 | ||
2619 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) | 2630 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) |
2620 | { | 2631 | { |
2621 | if (flags & SLAB_DMA) | 2632 | if (flags & GFP_DMA) |
2622 | BUG_ON(!(cachep->gfpflags & GFP_DMA)); | 2633 | BUG_ON(!(cachep->gfpflags & GFP_DMA)); |
2623 | else | 2634 | else |
2624 | BUG_ON(cachep->gfpflags & GFP_DMA); | 2635 | BUG_ON(cachep->gfpflags & GFP_DMA); |
@@ -2689,10 +2700,10 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, | |||
2689 | * Grow (by 1) the number of slabs within a cache. This is called by | 2700 | * Grow (by 1) the number of slabs within a cache. This is called by |
2690 | * kmem_cache_alloc() when there are no active objs left in a cache. | 2701 | * kmem_cache_alloc() when there are no active objs left in a cache. |
2691 | */ | 2702 | */ |
2692 | static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 2703 | static int cache_grow(struct kmem_cache *cachep, |
2704 | gfp_t flags, int nodeid, void *objp) | ||
2693 | { | 2705 | { |
2694 | struct slab *slabp; | 2706 | struct slab *slabp; |
2695 | void *objp; | ||
2696 | size_t offset; | 2707 | size_t offset; |
2697 | gfp_t local_flags; | 2708 | gfp_t local_flags; |
2698 | unsigned long ctor_flags; | 2709 | unsigned long ctor_flags; |
@@ -2702,12 +2713,12 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2702 | * Be lazy and only check for valid flags here, keeping it out of the | 2713 | * Be lazy and only check for valid flags here, keeping it out of the |
2703 | * critical path in kmem_cache_alloc(). | 2714 | * critical path in kmem_cache_alloc(). |
2704 | */ | 2715 | */ |
2705 | BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)); | 2716 | BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW)); |
2706 | if (flags & SLAB_NO_GROW) | 2717 | if (flags & __GFP_NO_GROW) |
2707 | return 0; | 2718 | return 0; |
2708 | 2719 | ||
2709 | ctor_flags = SLAB_CTOR_CONSTRUCTOR; | 2720 | ctor_flags = SLAB_CTOR_CONSTRUCTOR; |
2710 | local_flags = (flags & SLAB_LEVEL_MASK); | 2721 | local_flags = (flags & GFP_LEVEL_MASK); |
2711 | if (!(local_flags & __GFP_WAIT)) | 2722 | if (!(local_flags & __GFP_WAIT)) |
2712 | /* | 2723 | /* |
2713 | * Not allowed to sleep. Need to tell a constructor about | 2724 | * Not allowed to sleep. Need to tell a constructor about |
@@ -2744,12 +2755,14 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2744 | * Get mem for the objs. Attempt to allocate a physical page from | 2755 | * Get mem for the objs. Attempt to allocate a physical page from |
2745 | * 'nodeid'. | 2756 | * 'nodeid'. |
2746 | */ | 2757 | */ |
2747 | objp = kmem_getpages(cachep, flags, nodeid); | 2758 | if (!objp) |
2759 | objp = kmem_getpages(cachep, flags, nodeid); | ||
2748 | if (!objp) | 2760 | if (!objp) |
2749 | goto failed; | 2761 | goto failed; |
2750 | 2762 | ||
2751 | /* Get slab management. */ | 2763 | /* Get slab management. */ |
2752 | slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid); | 2764 | slabp = alloc_slabmgmt(cachep, objp, offset, |
2765 | local_flags & ~GFP_THISNODE, nodeid); | ||
2753 | if (!slabp) | 2766 | if (!slabp) |
2754 | goto opps1; | 2767 | goto opps1; |
2755 | 2768 | ||
@@ -2987,7 +3000,7 @@ alloc_done: | |||
2987 | 3000 | ||
2988 | if (unlikely(!ac->avail)) { | 3001 | if (unlikely(!ac->avail)) { |
2989 | int x; | 3002 | int x; |
2990 | x = cache_grow(cachep, flags, node); | 3003 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); |
2991 | 3004 | ||
2992 | /* cache_grow can reenable interrupts, then ac could change. */ | 3005 | /* cache_grow can reenable interrupts, then ac could change. */ |
2993 | ac = cpu_cache_get(cachep); | 3006 | ac = cpu_cache_get(cachep); |
@@ -3063,6 +3076,12 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3063 | 3076 | ||
3064 | cachep->ctor(objp, cachep, ctor_flags); | 3077 | cachep->ctor(objp, cachep, ctor_flags); |
3065 | } | 3078 | } |
3079 | #if ARCH_SLAB_MINALIGN | ||
3080 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | ||
3081 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | ||
3082 | objp, ARCH_SLAB_MINALIGN); | ||
3083 | } | ||
3084 | #endif | ||
3066 | return objp; | 3085 | return objp; |
3067 | } | 3086 | } |
3068 | #else | 3087 | #else |
@@ -3105,10 +3124,10 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, | |||
3105 | objp = ____cache_alloc(cachep, flags); | 3124 | objp = ____cache_alloc(cachep, flags); |
3106 | /* | 3125 | /* |
3107 | * We may just have run out of memory on the local node. | 3126 | * We may just have run out of memory on the local node. |
3108 | * __cache_alloc_node() knows how to locate memory on other nodes | 3127 | * ____cache_alloc_node() knows how to locate memory on other nodes |
3109 | */ | 3128 | */ |
3110 | if (NUMA_BUILD && !objp) | 3129 | if (NUMA_BUILD && !objp) |
3111 | objp = __cache_alloc_node(cachep, flags, numa_node_id()); | 3130 | objp = ____cache_alloc_node(cachep, flags, numa_node_id()); |
3112 | local_irq_restore(save_flags); | 3131 | local_irq_restore(save_flags); |
3113 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, | 3132 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, |
3114 | caller); | 3133 | caller); |
@@ -3135,15 +3154,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3135 | else if (current->mempolicy) | 3154 | else if (current->mempolicy) |
3136 | nid_alloc = slab_node(current->mempolicy); | 3155 | nid_alloc = slab_node(current->mempolicy); |
3137 | if (nid_alloc != nid_here) | 3156 | if (nid_alloc != nid_here) |
3138 | return __cache_alloc_node(cachep, flags, nid_alloc); | 3157 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3139 | return NULL; | 3158 | return NULL; |
3140 | } | 3159 | } |
3141 | 3160 | ||
3142 | /* | 3161 | /* |
3143 | * Fallback function if there was no memory available and no objects on a | 3162 | * Fallback function if there was no memory available and no objects on a |
3144 | * certain node and we are allowed to fall back. We mimick the behavior of | 3163 | * certain node and fall back is permitted. First we scan all the |
3145 | * the page allocator. We fall back according to a zonelist determined by | 3164 | * available nodelists for available objects. If that fails then we |
3146 | * the policy layer while obeying cpuset constraints. | 3165 | * perform an allocation without specifying a node. This allows the page |
3166 | * allocator to do its reclaim / fallback magic. We then insert the | ||
3167 | * slab into the proper nodelist and then allocate from it. | ||
3147 | */ | 3168 | */ |
3148 | void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | 3169 | void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) |
3149 | { | 3170 | { |
@@ -3151,15 +3172,51 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3151 | ->node_zonelists[gfp_zone(flags)]; | 3172 | ->node_zonelists[gfp_zone(flags)]; |
3152 | struct zone **z; | 3173 | struct zone **z; |
3153 | void *obj = NULL; | 3174 | void *obj = NULL; |
3175 | int nid; | ||
3154 | 3176 | ||
3177 | retry: | ||
3178 | /* | ||
3179 | * Look through allowed nodes for objects available | ||
3180 | * from existing per node queues. | ||
3181 | */ | ||
3155 | for (z = zonelist->zones; *z && !obj; z++) { | 3182 | for (z = zonelist->zones; *z && !obj; z++) { |
3156 | int nid = zone_to_nid(*z); | 3183 | nid = zone_to_nid(*z); |
3184 | |||
3185 | if (cpuset_zone_allowed(*z, flags) && | ||
3186 | cache->nodelists[nid] && | ||
3187 | cache->nodelists[nid]->free_objects) | ||
3188 | obj = ____cache_alloc_node(cache, | ||
3189 | flags | GFP_THISNODE, nid); | ||
3190 | } | ||
3157 | 3191 | ||
3158 | if (zone_idx(*z) <= ZONE_NORMAL && | 3192 | if (!obj) { |
3159 | cpuset_zone_allowed(*z, flags) && | 3193 | /* |
3160 | cache->nodelists[nid]) | 3194 | * This allocation will be performed within the constraints |
3161 | obj = __cache_alloc_node(cache, | 3195 | * of the current cpuset / memory policy requirements. |
3162 | flags | __GFP_THISNODE, nid); | 3196 | * We may trigger various forms of reclaim on the allowed |
3197 | * set and go into memory reserves if necessary. | ||
3198 | */ | ||
3199 | obj = kmem_getpages(cache, flags, -1); | ||
3200 | if (obj) { | ||
3201 | /* | ||
3202 | * Insert into the appropriate per node queues | ||
3203 | */ | ||
3204 | nid = page_to_nid(virt_to_page(obj)); | ||
3205 | if (cache_grow(cache, flags, nid, obj)) { | ||
3206 | obj = ____cache_alloc_node(cache, | ||
3207 | flags | GFP_THISNODE, nid); | ||
3208 | if (!obj) | ||
3209 | /* | ||
3210 | * Another processor may allocate the | ||
3211 | * objects in the slab since we are | ||
3212 | * not holding any locks. | ||
3213 | */ | ||
3214 | goto retry; | ||
3215 | } else { | ||
3216 | kmem_freepages(cache, obj); | ||
3217 | obj = NULL; | ||
3218 | } | ||
3219 | } | ||
3163 | } | 3220 | } |
3164 | return obj; | 3221 | return obj; |
3165 | } | 3222 | } |
@@ -3167,7 +3224,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3167 | /* | 3224 | /* |
3168 | * A interface to enable slab creation on nodeid | 3225 | * A interface to enable slab creation on nodeid |
3169 | */ | 3226 | */ |
3170 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | 3227 | static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, |
3171 | int nodeid) | 3228 | int nodeid) |
3172 | { | 3229 | { |
3173 | struct list_head *entry; | 3230 | struct list_head *entry; |
@@ -3216,7 +3273,7 @@ retry: | |||
3216 | 3273 | ||
3217 | must_grow: | 3274 | must_grow: |
3218 | spin_unlock(&l3->list_lock); | 3275 | spin_unlock(&l3->list_lock); |
3219 | x = cache_grow(cachep, flags, nodeid); | 3276 | x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); |
3220 | if (x) | 3277 | if (x) |
3221 | goto retry; | 3278 | goto retry; |
3222 | 3279 | ||
@@ -3434,35 +3491,59 @@ out: | |||
3434 | * @flags: See kmalloc(). | 3491 | * @flags: See kmalloc(). |
3435 | * @nodeid: node number of the target node. | 3492 | * @nodeid: node number of the target node. |
3436 | * | 3493 | * |
3437 | * Identical to kmem_cache_alloc, except that this function is slow | 3494 | * Identical to kmem_cache_alloc but it will allocate memory on the given |
3438 | * and can sleep. And it will allocate memory on the given node, which | 3495 | * node, which can improve the performance for cpu bound structures. |
3439 | * can improve the performance for cpu bound structures. | 3496 | * |
3440 | * New and improved: it will now make sure that the object gets | 3497 | * Fallback to other node is possible if __GFP_THISNODE is not set. |
3441 | * put on the correct node list so that there is no false sharing. | ||
3442 | */ | 3498 | */ |
3443 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 3499 | static __always_inline void * |
3500 | __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | ||
3501 | int nodeid, void *caller) | ||
3444 | { | 3502 | { |
3445 | unsigned long save_flags; | 3503 | unsigned long save_flags; |
3446 | void *ptr; | 3504 | void *ptr = NULL; |
3447 | 3505 | ||
3448 | cache_alloc_debugcheck_before(cachep, flags); | 3506 | cache_alloc_debugcheck_before(cachep, flags); |
3449 | local_irq_save(save_flags); | 3507 | local_irq_save(save_flags); |
3450 | 3508 | ||
3451 | if (nodeid == -1 || nodeid == numa_node_id() || | 3509 | if (unlikely(nodeid == -1)) |
3452 | !cachep->nodelists[nodeid]) | 3510 | nodeid = numa_node_id(); |
3453 | ptr = ____cache_alloc(cachep, flags); | 3511 | |
3454 | else | 3512 | if (likely(cachep->nodelists[nodeid])) { |
3455 | ptr = __cache_alloc_node(cachep, flags, nodeid); | 3513 | if (nodeid == numa_node_id()) { |
3456 | local_irq_restore(save_flags); | 3514 | /* |
3515 | * Use the locally cached objects if possible. | ||
3516 | * However ____cache_alloc does not allow fallback | ||
3517 | * to other nodes. It may fail while we still have | ||
3518 | * objects on other nodes available. | ||
3519 | */ | ||
3520 | ptr = ____cache_alloc(cachep, flags); | ||
3521 | } | ||
3522 | if (!ptr) { | ||
3523 | /* ___cache_alloc_node can fall back to other nodes */ | ||
3524 | ptr = ____cache_alloc_node(cachep, flags, nodeid); | ||
3525 | } | ||
3526 | } else { | ||
3527 | /* Node not bootstrapped yet */ | ||
3528 | if (!(flags & __GFP_THISNODE)) | ||
3529 | ptr = fallback_alloc(cachep, flags); | ||
3530 | } | ||
3457 | 3531 | ||
3458 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, | 3532 | local_irq_restore(save_flags); |
3459 | __builtin_return_address(0)); | 3533 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); |
3460 | 3534 | ||
3461 | return ptr; | 3535 | return ptr; |
3462 | } | 3536 | } |
3537 | |||
3538 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | ||
3539 | { | ||
3540 | return __cache_alloc_node(cachep, flags, nodeid, | ||
3541 | __builtin_return_address(0)); | ||
3542 | } | ||
3463 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3543 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
3464 | 3544 | ||
3465 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3545 | static __always_inline void * |
3546 | __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) | ||
3466 | { | 3547 | { |
3467 | struct kmem_cache *cachep; | 3548 | struct kmem_cache *cachep; |
3468 | 3549 | ||
@@ -3471,8 +3552,29 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
3471 | return NULL; | 3552 | return NULL; |
3472 | return kmem_cache_alloc_node(cachep, flags, node); | 3553 | return kmem_cache_alloc_node(cachep, flags, node); |
3473 | } | 3554 | } |
3555 | |||
3556 | #ifdef CONFIG_DEBUG_SLAB | ||
3557 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | ||
3558 | { | ||
3559 | return __do_kmalloc_node(size, flags, node, | ||
3560 | __builtin_return_address(0)); | ||
3561 | } | ||
3474 | EXPORT_SYMBOL(__kmalloc_node); | 3562 | EXPORT_SYMBOL(__kmalloc_node); |
3475 | #endif | 3563 | |
3564 | void *__kmalloc_node_track_caller(size_t size, gfp_t flags, | ||
3565 | int node, void *caller) | ||
3566 | { | ||
3567 | return __do_kmalloc_node(size, flags, node, caller); | ||
3568 | } | ||
3569 | EXPORT_SYMBOL(__kmalloc_node_track_caller); | ||
3570 | #else | ||
3571 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | ||
3572 | { | ||
3573 | return __do_kmalloc_node(size, flags, node, NULL); | ||
3574 | } | ||
3575 | EXPORT_SYMBOL(__kmalloc_node); | ||
3576 | #endif /* CONFIG_DEBUG_SLAB */ | ||
3577 | #endif /* CONFIG_NUMA */ | ||
3476 | 3578 | ||
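With the reworked allocation path above, kmem_cache_alloc_node() now falls back to other nodes unless the caller explicitly forbids it. A hypothetical usage sketch (the cache and node variables are invented for illustration):

/* Prefer node `nid`, but accept off-node memory if it is exhausted. */
struct foo *f = kmem_cache_alloc_node(foo_cachep, GFP_KERNEL, nid);

/* Hard requirement: fail rather than allocate on another node. */
struct foo *g = kmem_cache_alloc_node(foo_cachep,
                                      GFP_KERNEL | __GFP_THISNODE, nid);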
3477 | /** | 3579 | /** |
3478 | * __do_kmalloc - allocate memory | 3580 | * __do_kmalloc - allocate memory |
@@ -3583,13 +3685,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
3583 | int node; | 3685 | int node; |
3584 | struct kmem_list3 *l3; | 3686 | struct kmem_list3 *l3; |
3585 | struct array_cache *new_shared; | 3687 | struct array_cache *new_shared; |
3586 | struct array_cache **new_alien; | 3688 | struct array_cache **new_alien = NULL; |
3587 | 3689 | ||
3588 | for_each_online_node(node) { | 3690 | for_each_online_node(node) { |
3589 | 3691 | ||
3590 | new_alien = alloc_alien_cache(node, cachep->limit); | 3692 | if (use_alien_caches) { |
3591 | if (!new_alien) | 3693 | new_alien = alloc_alien_cache(node, cachep->limit); |
3592 | goto fail; | 3694 | if (!new_alien) |
3695 | goto fail; | ||
3696 | } | ||
3593 | 3697 | ||
3594 | new_shared = alloc_arraycache(node, | 3698 | new_shared = alloc_arraycache(node, |
3595 | cachep->shared*cachep->batchcount, | 3699 | cachep->shared*cachep->batchcount, |
@@ -4038,7 +4142,7 @@ static int s_show(struct seq_file *m, void *p) | |||
4038 | * + further values on SMP and with statistics enabled | 4142 | * + further values on SMP and with statistics enabled |
4039 | */ | 4143 | */ |
4040 | 4144 | ||
4041 | struct seq_operations slabinfo_op = { | 4145 | const struct seq_operations slabinfo_op = { |
4042 | .start = s_start, | 4146 | .start = s_start, |
4043 | .next = s_next, | 4147 | .next = s_next, |
4044 | .stop = s_stop, | 4148 | .stop = s_stop, |
@@ -4236,7 +4340,7 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4236 | return 0; | 4340 | return 0; |
4237 | } | 4341 | } |
4238 | 4342 | ||
4239 | struct seq_operations slabstats_op = { | 4343 | const struct seq_operations slabstats_op = { |
4240 | .start = leaks_start, | 4344 | .start = leaks_start, |
4241 | .next = s_next, | 4345 | .next = s_next, |
4242 | .stop = s_stop, | 4346 | .stop = s_stop, |
diff --git a/mm/sparse.c b/mm/sparse.c index b3c82ba30012..ac26eb0d73cd 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -24,6 +24,25 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] | |||
24 | #endif | 24 | #endif |
25 | EXPORT_SYMBOL(mem_section); | 25 | EXPORT_SYMBOL(mem_section); |
26 | 26 | ||
27 | #ifdef NODE_NOT_IN_PAGE_FLAGS | ||
28 | /* | ||
29 | * If we did not store the node number in the page then we have to | ||
30 | * do a lookup in the section_to_node_table in order to find which | ||
31 | * node the page belongs to. | ||
32 | */ | ||
33 | #if MAX_NUMNODES <= 256 | ||
34 | static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; | ||
35 | #else | ||
36 | static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; | ||
37 | #endif | ||
38 | |||
39 | int page_to_nid(struct page *page) | ||
40 | { | ||
41 | return section_to_node_table[page_to_section(page)]; | ||
42 | } | ||
43 | EXPORT_SYMBOL(page_to_nid); | ||
44 | #endif | ||
45 | |||
27 | #ifdef CONFIG_SPARSEMEM_EXTREME | 46 | #ifdef CONFIG_SPARSEMEM_EXTREME |
28 | static struct mem_section *sparse_index_alloc(int nid) | 47 | static struct mem_section *sparse_index_alloc(int nid) |
29 | { | 48 | { |
@@ -49,6 +68,10 @@ static int sparse_index_init(unsigned long section_nr, int nid) | |||
49 | struct mem_section *section; | 68 | struct mem_section *section; |
50 | int ret = 0; | 69 | int ret = 0; |
51 | 70 | ||
71 | #ifdef NODE_NOT_IN_PAGE_FLAGS | ||
72 | section_to_node_table[section_nr] = nid; | ||
73 | #endif | ||
74 | |||
52 | if (mem_section[root]) | 75 | if (mem_section[root]) |
53 | return -EEXIST; | 76 | return -EEXIST; |
54 | 77 | ||
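When the node id no longer fits into page->flags, the section-to-node table added here is filled once per section at init time and consulted on every page_to_nid(). The lookup itself is a single array access; the sizes below are illustrative, since they depend on the architecture's section size:

/* page->flags -> section number -> node id.  With MAX_NUMNODES <= 256
 * each entry is a u8, so the table costs NR_MEM_SECTIONS bytes;
 * larger configurations pay twice that for u16 entries.
 */
int nid = section_to_node_table[page_to_section(page)];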
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -57,9 +57,9 @@ static void put_compound_page(struct page *page) | |||
57 | { | 57 | { |
58 | page = (struct page *)page_private(page); | 58 | page = (struct page *)page_private(page); |
59 | if (put_page_testzero(page)) { | 59 | if (put_page_testzero(page)) { |
60 | void (*dtor)(struct page *page); | 60 | compound_page_dtor *dtor; |
61 | 61 | ||
62 | dtor = (void (*)(struct page *))page[1].lru.next; | 62 | dtor = get_compound_page_dtor(page); |
63 | (*dtor)(page); | 63 | (*dtor)(page); |
64 | } | 64 | } |
65 | } | 65 | } |
@@ -514,5 +514,7 @@ void __init swap_setup(void) | |||
514 | * Right now other parts of the system means that we | 514 | * Right now other parts of the system means that we |
515 | * _really_ don't want to cluster much more | 515 | * _really_ don't want to cluster much more |
516 | */ | 516 | */ |
517 | #ifdef CONFIG_HOTPLUG_CPU | ||
517 | hotcpu_notifier(cpu_swap_callback, 0); | 518 | hotcpu_notifier(cpu_swap_callback, 0); |
519 | #endif | ||
518 | } | 520 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index a15def63f28f..c5431072f422 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -427,34 +427,48 @@ void free_swap_and_cache(swp_entry_t entry) | |||
427 | 427 | ||
428 | #ifdef CONFIG_SOFTWARE_SUSPEND | 428 | #ifdef CONFIG_SOFTWARE_SUSPEND |
429 | /* | 429 | /* |
430 | * Find the swap type that corresponds to given device (if any) | 430 | * Find the swap type that corresponds to given device (if any). |
431 | * | 431 | * |
432 | * This is needed for software suspend and is done in such a way that inode | 432 | * @offset - number of the PAGE_SIZE-sized block of the device, starting |
433 | * aliasing is allowed. | 433 | * from 0, in which the swap header is expected to be located. |
434 | * | ||
435 | * This is needed for the suspend to disk (aka swsusp). | ||
434 | */ | 436 | */ |
435 | int swap_type_of(dev_t device) | 437 | int swap_type_of(dev_t device, sector_t offset) |
436 | { | 438 | { |
439 | struct block_device *bdev = NULL; | ||
437 | int i; | 440 | int i; |
438 | 441 | ||
442 | if (device) | ||
443 | bdev = bdget(device); | ||
444 | |||
439 | spin_lock(&swap_lock); | 445 | spin_lock(&swap_lock); |
440 | for (i = 0; i < nr_swapfiles; i++) { | 446 | for (i = 0; i < nr_swapfiles; i++) { |
441 | struct inode *inode; | 447 | struct swap_info_struct *sis = swap_info + i; |
442 | 448 | ||
443 | if (!(swap_info[i].flags & SWP_WRITEOK)) | 449 | if (!(sis->flags & SWP_WRITEOK)) |
444 | continue; | 450 | continue; |
445 | 451 | ||
446 | if (!device) { | 452 | if (!bdev) { |
447 | spin_unlock(&swap_lock); | 453 | spin_unlock(&swap_lock); |
448 | return i; | 454 | return i; |
449 | } | 455 | } |
450 | inode = swap_info[i].swap_file->f_dentry->d_inode; | 456 | if (bdev == sis->bdev) { |
451 | if (S_ISBLK(inode->i_mode) && | 457 | struct swap_extent *se; |
452 | device == MKDEV(imajor(inode), iminor(inode))) { | 458 | |
453 | spin_unlock(&swap_lock); | 459 | se = list_entry(sis->extent_list.next, |
454 | return i; | 460 | struct swap_extent, list); |
461 | if (se->start_block == offset) { | ||
462 | spin_unlock(&swap_lock); | ||
463 | bdput(bdev); | ||
464 | return i; | ||
465 | } | ||
455 | } | 466 | } |
456 | } | 467 | } |
457 | spin_unlock(&swap_lock); | 468 | spin_unlock(&swap_lock); |
469 | if (bdev) | ||
470 | bdput(bdev); | ||
471 | |||
458 | return -ENODEV; | 472 | return -ENODEV; |
459 | } | 473 | } |
460 | 474 | ||
@@ -931,6 +945,23 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | |||
931 | } | 945 | } |
932 | } | 946 | } |
933 | 947 | ||
948 | #ifdef CONFIG_SOFTWARE_SUSPEND | ||
949 | /* | ||
950 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | ||
951 | * corresponding to given index in swap_info (swap type). | ||
952 | */ | ||
953 | sector_t swapdev_block(int swap_type, pgoff_t offset) | ||
954 | { | ||
955 | struct swap_info_struct *sis; | ||
956 | |||
957 | if (swap_type >= nr_swapfiles) | ||
958 | return 0; | ||
959 | |||
960 | sis = swap_info + swap_type; | ||
961 | return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; | ||
962 | } | ||
963 | #endif /* CONFIG_SOFTWARE_SUSPEND */ | ||
964 | |||
934 | /* | 965 | /* |
935 | * Free all of a swapdev's extent information | 966 | * Free all of a swapdev's extent information |
936 | */ | 967 | */ |
@@ -1274,10 +1305,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1274 | 1305 | ||
1275 | mutex_lock(&swapon_mutex); | 1306 | mutex_lock(&swapon_mutex); |
1276 | 1307 | ||
1308 | if (!l) | ||
1309 | return SEQ_START_TOKEN; | ||
1310 | |||
1277 | for (i = 0; i < nr_swapfiles; i++, ptr++) { | 1311 | for (i = 0; i < nr_swapfiles; i++, ptr++) { |
1278 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1312 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) |
1279 | continue; | 1313 | continue; |
1280 | if (!l--) | 1314 | if (!--l) |
1281 | return ptr; | 1315 | return ptr; |
1282 | } | 1316 | } |
1283 | 1317 | ||
@@ -1286,10 +1320,17 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1286 | 1320 | ||
1287 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) | 1321 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) |
1288 | { | 1322 | { |
1289 | struct swap_info_struct *ptr = v; | 1323 | struct swap_info_struct *ptr; |
1290 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; | 1324 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; |
1291 | 1325 | ||
1292 | for (++ptr; ptr < endptr; ptr++) { | 1326 | if (v == SEQ_START_TOKEN) |
1327 | ptr = swap_info; | ||
1328 | else { | ||
1329 | ptr = v; | ||
1330 | ptr++; | ||
1331 | } | ||
1332 | |||
1333 | for (; ptr < endptr; ptr++) { | ||
1293 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1334 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) |
1294 | continue; | 1335 | continue; |
1295 | ++*pos; | 1336 | ++*pos; |
@@ -1310,8 +1351,10 @@ static int swap_show(struct seq_file *swap, void *v) | |||
1310 | struct file *file; | 1351 | struct file *file; |
1311 | int len; | 1352 | int len; |
1312 | 1353 | ||
1313 | if (v == swap_info) | 1354 | if (ptr == SEQ_START_TOKEN) { |
1314 | seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); | 1355 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); |
1356 | return 0; | ||
1357 | } | ||
1315 | 1358 | ||
1316 | file = ptr->swap_file; | 1359 | file = ptr->swap_file; |
1317 | len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); | 1360 | len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); |
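The /proc/swaps iterator now follows the usual SEQ_START_TOKEN convention: ->start() returns the token for position 0, ->show() prints the header exactly once when it sees the token, and ->next() treats the token as sitting just before the first real element. A generic sketch of the idiom (the foo_* names and find_entry() are placeholders):

static void *foo_start(struct seq_file *m, loff_t *pos)
{
        return *pos == 0 ? SEQ_START_TOKEN : find_entry(*pos - 1);
}

static void *foo_next(struct seq_file *m, void *v, loff_t *pos)
{
        ++*pos;
        return find_entry(*pos - 1);    /* NULL once past the end */
}

static int foo_show(struct seq_file *m, void *v)
{
        if (v == SEQ_START_TOKEN) {
                seq_puts(m, "Header\n");
                return 0;
        }
        /* print one real entry here */
        return 0;
}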
@@ -1325,7 +1368,7 @@ static int swap_show(struct seq_file *swap, void *v) | |||
1325 | return 0; | 1368 | return 0; |
1326 | } | 1369 | } |
1327 | 1370 | ||
1328 | static struct seq_operations swaps_op = { | 1371 | static const struct seq_operations swaps_op = { |
1329 | .start = swap_start, | 1372 | .start = swap_start, |
1330 | .next = swap_next, | 1373 | .next = swap_next, |
1331 | .stop = swap_stop, | 1374 | .stop = swap_stop, |
@@ -1337,7 +1380,7 @@ static int swaps_open(struct inode *inode, struct file *file) | |||
1337 | return seq_open(file, &swaps_op); | 1380 | return seq_open(file, &swaps_op); |
1338 | } | 1381 | } |
1339 | 1382 | ||
1340 | static struct file_operations proc_swaps_operations = { | 1383 | static const struct file_operations proc_swaps_operations = { |
1341 | .open = swaps_open, | 1384 | .open = swaps_open, |
1342 | .read = seq_read, | 1385 | .read = seq_read, |
1343 | .llseek = seq_lseek, | 1386 | .llseek = seq_lseek, |
@@ -1540,6 +1583,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1540 | error = -EINVAL; | 1583 | error = -EINVAL; |
1541 | if (!maxpages) | 1584 | if (!maxpages) |
1542 | goto bad_swap; | 1585 | goto bad_swap; |
1586 | if (swapfilesize && maxpages > swapfilesize) { | ||
1587 | printk(KERN_WARNING | ||
1588 | "Swap area shorter than signature indicates\n"); | ||
1589 | goto bad_swap; | ||
1590 | } | ||
1543 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) | 1591 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) |
1544 | goto bad_swap; | 1592 | goto bad_swap; |
1545 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) | 1593 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) |
@@ -1567,12 +1615,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1567 | goto bad_swap; | 1615 | goto bad_swap; |
1568 | } | 1616 | } |
1569 | 1617 | ||
1570 | if (swapfilesize && maxpages > swapfilesize) { | ||
1571 | printk(KERN_WARNING | ||
1572 | "Swap area shorter than signature indicates\n"); | ||
1573 | error = -EINVAL; | ||
1574 | goto bad_swap; | ||
1575 | } | ||
1576 | if (nr_good_pages) { | 1618 | if (nr_good_pages) { |
1577 | p->swap_map[0] = SWAP_MAP_BAD; | 1619 | p->swap_map[0] = SWAP_MAP_BAD; |
1578 | p->max = maxpages; | 1620 | p->max = maxpages; |
diff --git a/mm/thrash.c b/mm/thrash.c index f4c560b4a2b7..9ef9071f99bc 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
@@ -7,100 +7,74 @@ | |||
7 | * | 7 | * |
8 | * Simple token based thrashing protection, using the algorithm | 8 | * Simple token based thrashing protection, using the algorithm |
9 | * described in: http://www.cs.wm.edu/~sjiang/token.pdf | 9 | * described in: http://www.cs.wm.edu/~sjiang/token.pdf |
10 | * | ||
11 | * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> | ||
12 | * Improved algorithm to pass token: | ||
13 | * Each task has a priority which is incremented if it contended | ||
14 | * for the token in an interval less than its previous attempt. | ||
15 | * If the token is acquired, that task's priority is boosted to prevent | ||
16 | * the token from bouncing around too often and to let the task make | ||
17 | * some progress in its execution. | ||
10 | */ | 18 | */ |
19 | |||
11 | #include <linux/jiffies.h> | 20 | #include <linux/jiffies.h> |
12 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
13 | #include <linux/sched.h> | 22 | #include <linux/sched.h> |
14 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
15 | 24 | ||
16 | static DEFINE_SPINLOCK(swap_token_lock); | 25 | static DEFINE_SPINLOCK(swap_token_lock); |
17 | static unsigned long swap_token_timeout; | 26 | struct mm_struct *swap_token_mm; |
18 | static unsigned long swap_token_check; | 27 | static unsigned int global_faults; |
19 | struct mm_struct * swap_token_mm = &init_mm; | ||
20 | |||
21 | #define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) | ||
22 | #define SWAP_TOKEN_TIMEOUT (300 * HZ) | ||
23 | /* | ||
24 | * Currently disabled; Needs further code to work at HZ * 300. | ||
25 | */ | ||
26 | unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT; | ||
27 | |||
28 | /* | ||
29 | * Take the token away if the process had no page faults | ||
30 | * in the last interval, or if it has held the token for | ||
31 | * too long. | ||
32 | */ | ||
33 | #define SWAP_TOKEN_ENOUGH_RSS 1 | ||
34 | #define SWAP_TOKEN_TIMED_OUT 2 | ||
35 | static int should_release_swap_token(struct mm_struct *mm) | ||
36 | { | ||
37 | int ret = 0; | ||
38 | if (!mm->recent_pagein) | ||
39 | ret = SWAP_TOKEN_ENOUGH_RSS; | ||
40 | else if (time_after(jiffies, swap_token_timeout)) | ||
41 | ret = SWAP_TOKEN_TIMED_OUT; | ||
42 | mm->recent_pagein = 0; | ||
43 | return ret; | ||
44 | } | ||
45 | 28 | ||
46 | /* | ||
47 | * Try to grab the swapout protection token. We only try to | ||
48 | * grab it once every TOKEN_CHECK_INTERVAL, both to prevent | ||
49 | * SMP lock contention and to check that the process that held | ||
50 | * the token before is no longer thrashing. | ||
51 | */ | ||
52 | void grab_swap_token(void) | 29 | void grab_swap_token(void) |
53 | { | 30 | { |
54 | struct mm_struct *mm; | 31 | int current_interval; |
55 | int reason; | ||
56 | 32 | ||
57 | /* We have the token. Let others know we still need it. */ | 33 | global_faults++; |
58 | if (has_swap_token(current->mm)) { | ||
59 | current->mm->recent_pagein = 1; | ||
60 | if (unlikely(!swap_token_default_timeout)) | ||
61 | disable_swap_token(); | ||
62 | return; | ||
63 | } | ||
64 | |||
65 | if (time_after(jiffies, swap_token_check)) { | ||
66 | 34 | ||
67 | if (!swap_token_default_timeout) { | 35 | current_interval = global_faults - current->mm->faultstamp; |
68 | swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; | ||
69 | return; | ||
70 | } | ||
71 | |||
72 | /* ... or if we recently held the token. */ | ||
73 | if (time_before(jiffies, current->mm->swap_token_time)) | ||
74 | return; | ||
75 | 36 | ||
76 | if (!spin_trylock(&swap_token_lock)) | 37 | if (!spin_trylock(&swap_token_lock)) |
77 | return; | 38 | return; |
78 | 39 | ||
79 | swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; | 40 | /* First come first served */ |
41 | if (swap_token_mm == NULL) { | ||
42 | current->mm->token_priority = current->mm->token_priority + 2; | ||
43 | swap_token_mm = current->mm; | ||
44 | goto out; | ||
45 | } | ||
80 | 46 | ||
81 | mm = swap_token_mm; | 47 | if (current->mm != swap_token_mm) { |
82 | if ((reason = should_release_swap_token(mm))) { | 48 | if (current_interval < current->mm->last_interval) |
83 | unsigned long eligible = jiffies; | 49 | current->mm->token_priority++; |
84 | if (reason == SWAP_TOKEN_TIMED_OUT) { | 50 | else { |
85 | eligible += swap_token_default_timeout; | 51 | current->mm->token_priority--; |
86 | } | 52 | if (unlikely(current->mm->token_priority < 0)) |
87 | mm->swap_token_time = eligible; | 53 | current->mm->token_priority = 0; |
88 | swap_token_timeout = jiffies + swap_token_default_timeout; | 54 | } |
55 | /* Check if we deserve the token */ | ||
56 | if (current->mm->token_priority > | ||
57 | swap_token_mm->token_priority) { | ||
58 | current->mm->token_priority += 2; | ||
89 | swap_token_mm = current->mm; | 59 | swap_token_mm = current->mm; |
90 | } | 60 | } |
91 | spin_unlock(&swap_token_lock); | 61 | } else { |
62 | /* Token holder came in again! */ | ||
63 | current->mm->token_priority += 2; | ||
92 | } | 64 | } |
93 | return; | 65 | |
66 | out: | ||
67 | current->mm->faultstamp = global_faults; | ||
68 | current->mm->last_interval = current_interval; | ||
69 | spin_unlock(&swap_token_lock); | ||
70 | return; | ||
94 | } | 71 | } |
95 | 72 | ||
96 | /* Called on process exit. */ | 73 | /* Called on process exit. */ |
97 | void __put_swap_token(struct mm_struct *mm) | 74 | void __put_swap_token(struct mm_struct *mm) |
98 | { | 75 | { |
99 | spin_lock(&swap_token_lock); | 76 | spin_lock(&swap_token_lock); |
100 | if (likely(mm == swap_token_mm)) { | 77 | if (likely(mm == swap_token_mm)) |
101 | mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL; | 78 | swap_token_mm = NULL; |
102 | swap_token_mm = &init_mm; | ||
103 | swap_token_check = jiffies; | ||
104 | } | ||
105 | spin_unlock(&swap_token_lock); | 79 | spin_unlock(&swap_token_lock); |
106 | } | 80 | } |
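The rewritten thrash.c replaces the fixed timeout with per-mm priorities driven by page-fault intervals. A small worked example of the hand-over rule, with illustrative numbers:

/* Say the holder H has token_priority 4.  A contender C that keeps
 * faulting at shorter and shorter intervals gains +1 per fault:
 * 2 -> 3 -> 4 -> 5.  On the fault that takes C past H (5 > 4) this
 * branch fires; C takes the token and gets a further +2 boost so the
 * token does not bounce straight back:
 */
if (current->mm->token_priority > swap_token_mm->token_priority) {
        current->mm->token_priority += 2;
        swap_token_mm = current->mm;
}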
diff --git a/mm/vmscan.c b/mm/vmscan.c index 518540a4a2a6..093f5fe6dd77 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/rwsem.h> | 36 | #include <linux/rwsem.h> |
37 | #include <linux/delay.h> | 37 | #include <linux/delay.h> |
38 | #include <linux/kthread.h> | 38 | #include <linux/kthread.h> |
39 | #include <linux/freezer.h> | ||
39 | 40 | ||
40 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
41 | #include <asm/div64.h> | 42 | #include <asm/div64.h> |
@@ -1172,11 +1173,12 @@ loop_again: | |||
1172 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1173 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
1173 | 0, 0)) { | 1174 | 0, 0)) { |
1174 | end_zone = i; | 1175 | end_zone = i; |
1175 | goto scan; | 1176 | break; |
1176 | } | 1177 | } |
1177 | } | 1178 | } |
1178 | goto out; | 1179 | if (i < 0) |
1179 | scan: | 1180 | goto out; |
1181 | |||
1180 | for (i = 0; i <= end_zone; i++) { | 1182 | for (i = 0; i <= end_zone; i++) { |
1181 | struct zone *zone = pgdat->node_zones + i; | 1183 | struct zone *zone = pgdat->node_zones + i; |
1182 | 1184 | ||
@@ -1259,6 +1261,9 @@ out: | |||
1259 | } | 1261 | } |
1260 | if (!all_zones_ok) { | 1262 | if (!all_zones_ok) { |
1261 | cond_resched(); | 1263 | cond_resched(); |
1264 | |||
1265 | try_to_freeze(); | ||
1266 | |||
1262 | goto loop_again; | 1267 | goto loop_again; |
1263 | } | 1268 | } |
1264 | 1269 | ||
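try_to_freeze() (provided by the <linux/freezer.h> include added above) is the standard point at which a kernel thread parks itself while the system suspends; calling it in the balance loop keeps kswapd from fighting the freezer during a long reclaim pass. The usual shape of a freezable kthread, as a sketch rather than kswapd's actual loop:

while (!kthread_should_stop()) {
        try_to_freeze();                        /* block if a freeze is underway */
        do_work();                              /* hypothetical work function */
        schedule_timeout_interruptible(HZ);     /* sleep until the next pass */
}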
@@ -1508,7 +1513,6 @@ out: | |||
1508 | } | 1513 | } |
1509 | #endif | 1514 | #endif |
1510 | 1515 | ||
1511 | #ifdef CONFIG_HOTPLUG_CPU | ||
1512 | /* It's optimal to keep kswapds on the same CPUs as their memory, but | 1516 | /* It's optimal to keep kswapds on the same CPUs as their memory, but |
1513 | not required for correctness. So if the last cpu in a node goes | 1517 | not required for correctness. So if the last cpu in a node goes |
1514 | away, we get changed to run anywhere: as the first one comes back, | 1518 | away, we get changed to run anywhere: as the first one comes back, |
@@ -1529,7 +1533,6 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
1529 | } | 1533 | } |
1530 | return NOTIFY_OK; | 1534 | return NOTIFY_OK; |
1531 | } | 1535 | } |
1532 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
1533 | 1536 | ||
1534 | /* | 1537 | /* |
1535 | * This kswapd start function will be called by init and node-hot-add. | 1538 | * This kswapd start function will be called by init and node-hot-add. |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 8614e8f6743b..dc005a0c96ae 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -430,7 +430,7 @@ static int frag_show(struct seq_file *m, void *arg) | |||
430 | return 0; | 430 | return 0; |
431 | } | 431 | } |
432 | 432 | ||
433 | struct seq_operations fragmentation_op = { | 433 | const struct seq_operations fragmentation_op = { |
434 | .start = frag_start, | 434 | .start = frag_start, |
435 | .next = frag_next, | 435 | .next = frag_next, |
436 | .stop = frag_stop, | 436 | .stop = frag_stop, |
@@ -452,7 +452,7 @@ struct seq_operations fragmentation_op = { | |||
452 | #define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \ | 452 | #define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \ |
453 | TEXT_FOR_HIGHMEM(xx) | 453 | TEXT_FOR_HIGHMEM(xx) |
454 | 454 | ||
455 | static char *vmstat_text[] = { | 455 | static const char * const vmstat_text[] = { |
456 | /* Zoned VM counters */ | 456 | /* Zoned VM counters */ |
457 | "nr_anon_pages", | 457 | "nr_anon_pages", |
458 | "nr_mapped", | 458 | "nr_mapped", |
@@ -597,7 +597,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
597 | return 0; | 597 | return 0; |
598 | } | 598 | } |
599 | 599 | ||
600 | struct seq_operations zoneinfo_op = { | 600 | const struct seq_operations zoneinfo_op = { |
601 | .start = frag_start, /* iterate over all zones. The same as in | 601 | .start = frag_start, /* iterate over all zones. The same as in |
602 | * fragmentation. */ | 602 | * fragmentation. */ |
603 | .next = frag_next, | 603 | .next = frag_next, |
@@ -660,7 +660,7 @@ static void vmstat_stop(struct seq_file *m, void *arg) | |||
660 | m->private = NULL; | 660 | m->private = NULL; |
661 | } | 661 | } |
662 | 662 | ||
663 | struct seq_operations vmstat_op = { | 663 | const struct seq_operations vmstat_op = { |
664 | .start = vmstat_start, | 664 | .start = vmstat_start, |
665 | .next = vmstat_next, | 665 | .next = vmstat_next, |
666 | .stop = vmstat_stop, | 666 | .stop = vmstat_stop, |
@@ -679,13 +679,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
679 | void *hcpu) | 679 | void *hcpu) |
680 | { | 680 | { |
681 | switch (action) { | 681 | switch (action) { |
682 | case CPU_UP_PREPARE: | 682 | case CPU_UP_PREPARE: |
683 | case CPU_UP_CANCELED: | 683 | case CPU_UP_CANCELED: |
684 | case CPU_DEAD: | 684 | case CPU_DEAD: |
685 | refresh_zone_stat_thresholds(); | 685 | refresh_zone_stat_thresholds(); |
686 | break; | 686 | break; |
687 | default: | 687 | default: |
688 | break; | 688 | break; |
689 | } | 689 | } |
690 | return NOTIFY_OK; | 690 | return NOTIFY_OK; |
691 | } | 691 | } |