aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memory.c
diff options
context:
space:
mode:
author: Hugh Dickins <hugh@veritas.com> 2008-02-05 01:28:40 -0500
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2008-02-05 12:44:14 -0500
commit: c4cc6d07b2f465fbf5efd99bbe772a49c515f3f2 (patch)
tree: 0aec353fce4ee6cd16e3051ecfdf55e8022aaa5a /mm/memory.c
parent: 75897d60a54ccee94253312107f941a83b5077cb (diff)
swapin_readahead: excise NUMA bogosity
For three years swapin_readahead has been cluttered with fanciful CONFIG_NUMA code, advancing addr, and stepping on to the next vma at the boundary, to line up the mempolicy for each page allocation.

It _might_ be a good idea to allocate swap more according to vma layout; but the fact is, that's not how we do it at all, 2.6 even less than 2.4: swap is allocated as needed for pages as they sink to the bottom of the inactive LRUs. Sometimes that may match vma layout, but not so often that it's worth going to these misleading vma->vm_next lengths: rip all that out.

Originally I intended to retain the incrementation of addr, but correct its initial value: valid_swaphandles generally supplies an offset below the target addr (this is readaround rather than readahead), but addr has not been adjusted accordingly, so in the interleave case it has usually been allocating the target page from the "wrong" node (though that may not matter very much).

But look at the equivalent shmem_swapin code: either by oversight or by design, though it has all the apparatus for choosing a new mempolicy per page, it uses the same idx throughout, choosing the same mempolicy and interleave node for each page of the cluster.

Which is actually a much better strategy: each node has its own LRUs and its own kswapd, so if you're betting on any particular relationship between swap and node, the best bet is that nearby swap entries belong to pages from the same node - even when the mempolicy of the target page is to interleave. And examining a map of nodes corresponding to swap entries on a numa=fake system bears this out. (We could later tweak swap allocation to make it even more likely, but this patch is merely about removing cruft.)

So, neither adjust nor increment addr in swapin_readahead, and then shmem_swapin can use it too; the pseudo-vma to pass policy need only be set up once per cluster, and so few fields of pvma are used, let's skip the memset - from shmem_alloc_page also.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- mm/memory.c 47
1 files changed, 14 insertions, 33 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 1b8ca160f1d0..1d803c2d0184 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1998,45 +1998,26 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1998 */ 1998 */
1999void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) 1999void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
2000{ 2000{
2001#ifdef CONFIG_NUMA 2001 int nr_pages;
2002 struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL; 2002 struct page *page;
2003#endif
2004 int i, num;
2005 struct page *new_page;
2006 unsigned long offset; 2003 unsigned long offset;
2004 unsigned long end_offset;
2007 2005
2008 /* 2006 /*
2009 * Get the number of handles we should do readahead io to. 2007 * Get starting offset for readaround, and number of pages to read.
2008 * Adjust starting address by readbehind (for NUMA interleave case)?
2009 * No, it's very unlikely that swap layout would follow vma layout,
2010 * more likely that neighbouring swap pages came from the same node:
2011 * so use the same "addr" to choose the same node for each swap read.
2010 */ 2012 */
2011 num = valid_swaphandles(entry, &offset); 2013 nr_pages = valid_swaphandles(entry, &offset);
2012 for (i = 0; i < num; offset++, i++) { 2014 for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
2013 /* Ok, do the async read-ahead now */ 2015 /* Ok, do the async read-ahead now */
2014 new_page = read_swap_cache_async(swp_entry(swp_type(entry), 2016 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
2015 offset), vma, addr); 2017 vma, addr);
2016 if (!new_page) 2018 if (!page)
2017 break; 2019 break;
2018 page_cache_release(new_page); 2020 page_cache_release(page);
2019#ifdef CONFIG_NUMA
2020 /*
2021 * Find the next applicable VMA for the NUMA policy.
2022 */
2023 addr += PAGE_SIZE;
2024 if (addr == 0)
2025 vma = NULL;
2026 if (vma) {
2027 if (addr >= vma->vm_end) {
2028 vma = next_vma;
2029 next_vma = vma ? vma->vm_next : NULL;
2030 }
2031 if (vma && addr < vma->vm_start)
2032 vma = NULL;
2033 } else {
2034 if (next_vma && addr >= next_vma->vm_start) {
2035 vma = next_vma;
2036 next_vma = vma->vm_next;
2037 }
2038 }
2039#endif
2040 } 2021 }
2041 lru_add_drain(); /* Push any new pages onto the LRU now */ 2022 lru_add_drain(); /* Push any new pages onto the LRU now */
2042} 2023}