author	Nick Piggin <npiggin@suse.de>	2008-02-05 01:29:10 -0500
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2008-02-05 12:44:17 -0500
commit	e2848a0efedef4dad52d1334d37f8719cd6268fd (patch)
tree	f5d2b600b1275793e7c490f34ae9ec902af138b5
parent	e31d9eb5c17ae3b80f9e9403f8a5eaf6dba879c9 (diff)
radix-tree: avoid atomic allocations for preloaded insertions
Most pagecache (and some other) radix tree insertions have the great
opportunity to preallocate a few nodes with relaxed gfp flags.  But the
preallocation is squandered when it comes time to allocate a node: we
default to first attempting a GFP_ATOMIC allocation.  That doesn't
normally fail, but it can eat into atomic memory reserves that we don't
need to be using.

Another upshot of this change is that it removes the sometimes highly
contended zone->lock from underneath tree_lock.  Pagecache insertions
are always performed with a radix tree preload, and after this change,
such insertions will never fall back to kmem_cache_alloc within
radix_tree_node_alloc.

David Miller reports seeing this allocation fail on a highly threaded
sparc64 system:

[527319.459981] dd: page allocation failure. order:0, mode:0x20
[527319.460403] Call Trace:
[527319.460568]  [00000000004b71e0] __slab_alloc+0x1b0/0x6a8
[527319.460636]  [00000000004b7bbc] kmem_cache_alloc+0x4c/0xa8
[527319.460698]  [000000000055309c] radix_tree_node_alloc+0x20/0x90
[527319.460763]  [0000000000553238] radix_tree_insert+0x12c/0x260
[527319.460830]  [0000000000495cd0] add_to_page_cache+0x38/0xb0
[527319.460893]  [00000000004e4794] mpage_readpages+0x6c/0x134
[527319.460955]  [000000000049c7fc] __do_page_cache_readahead+0x170/0x280
[527319.461028]  [000000000049cc88] ondemand_readahead+0x208/0x214
[527319.461094]  [0000000000496018] do_generic_mapping_read+0xe8/0x428
[527319.461152]  [0000000000497948] generic_file_aio_read+0x108/0x170
[527319.461217]  [00000000004badac] do_sync_read+0x88/0xd0
[527319.461292]  [00000000004bb5cc] vfs_read+0x78/0x10c
[527319.461361]  [00000000004bb920] sys_read+0x34/0x60
[527319.461424]  [0000000000406294] linux_sparc_syscall32+0x3c/0x40

The calltrace is significant: __do_page_cache_readahead allocates a
number of pages with GFP_KERNEL, and hence it should have reclaimed
sufficient memory to satisfy GFP_ATOMIC allocations.  However, after
the list of pages goes to mpage_readpages, there can be significant
intervals (including disk IO) before all the pages are inserted into
the radix-tree.  So the reserves can easily be depleted at that point.

The patch is confirmed to fix the problem.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
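For context, here is a minimal sketch of the preload pattern the message
refers to, using the radix tree API of this era (radix_tree_preload,
radix_tree_insert, radix_tree_preload_end); the helper name
example_add_page and its shape are illustrative, not part of the patch:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/radix-tree.h>

/* Illustrative add_to_page_cache-style helper, not from the patch. */
static int example_add_page(struct address_space *mapping,
			    struct page *page, pgoff_t offset)
{
	/* Preallocate nodes with a sleeping allocation, before tree_lock. */
	int error = radix_tree_preload(GFP_KERNEL);

	if (error)
		return error;

	write_lock_irq(&mapping->tree_lock);
	/*
	 * page_tree's root is initialized with GFP_ATOMIC (no __GFP_WAIT),
	 * so after this patch the node needed here comes from the per-CPU
	 * preload pool rather than a GFP_ATOMIC kmem_cache_alloc(): atomic
	 * reserves are untouched and zone->lock is never taken under
	 * tree_lock.
	 */
	error = radix_tree_insert(&mapping->page_tree, offset, page);
	write_unlock_irq(&mapping->tree_lock);

	radix_tree_preload_end();	/* re-enables preemption */
	return error;
}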
 lib/radix-tree.c | 15 +++++++++++----
 mm/filemap.c     |  1 -
 mm/rmap.c        |  1 -
 3 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 48c250fe2233..65f0e758ec38 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -95,14 +95,17 @@ static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
 static struct radix_tree_node *
 radix_tree_node_alloc(struct radix_tree_root *root)
 {
-	struct radix_tree_node *ret;
+	struct radix_tree_node *ret = NULL;
 	gfp_t gfp_mask = root_gfp_mask(root);
 
-	ret = kmem_cache_alloc(radix_tree_node_cachep,
-			set_migrateflags(gfp_mask, __GFP_RECLAIMABLE));
-	if (ret == NULL && !(gfp_mask & __GFP_WAIT)) {
+	if (!(gfp_mask & __GFP_WAIT)) {
 		struct radix_tree_preload *rtp;
 
+		/*
+		 * Provided the caller has preloaded here, we will always
+		 * succeed in getting a node here (and never reach
+		 * kmem_cache_alloc)
+		 */
 		rtp = &__get_cpu_var(radix_tree_preloads);
 		if (rtp->nr) {
 			ret = rtp->nodes[rtp->nr - 1];
@@ -110,6 +113,10 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 			rtp->nr--;
 		}
 	}
+	if (ret == NULL)
+		ret = kmem_cache_alloc(radix_tree_node_cachep,
+				set_migrateflags(gfp_mask, __GFP_RECLAIMABLE));
+
 	BUG_ON(radix_tree_is_indirect_ptr(ret));
 	return ret;
 }
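For readability, here is the resulting radix_tree_node_alloc after the
patch applies, reconstructed from the two hunks above; the single
context line elided between the hunks (clearing the consumed preload
slot) is filled in from the surrounding 2.6.24 source:

static struct radix_tree_node *
radix_tree_node_alloc(struct radix_tree_root *root)
{
	struct radix_tree_node *ret = NULL;
	gfp_t gfp_mask = root_gfp_mask(root);

	if (!(gfp_mask & __GFP_WAIT)) {
		struct radix_tree_preload *rtp;

		/*
		 * Provided the caller has preloaded here, we will always
		 * succeed in getting a node here (and never reach
		 * kmem_cache_alloc)
		 */
		rtp = &__get_cpu_var(radix_tree_preloads);
		if (rtp->nr) {
			ret = rtp->nodes[rtp->nr - 1];
			rtp->nodes[rtp->nr - 1] = NULL;
			rtp->nr--;
		}
	}
	if (ret == NULL)
		ret = kmem_cache_alloc(radix_tree_node_cachep,
				set_migrateflags(gfp_mask, __GFP_RECLAIMABLE));

	BUG_ON(radix_tree_is_indirect_ptr(ret));
	return ret;
}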
diff --git a/mm/filemap.c b/mm/filemap.c
index 76bea88cbebc..96920f840562 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -65,7 +65,6 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
- *          ->zone.lock
  *
  *  ->i_mutex
  *    ->i_mmap_lock		(truncate->unmap_mapping_range)
diff --git a/mm/rmap.c b/mm/rmap.c
index dbc2ca2057a5..0334c8f6b741 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,7 +36,6 @@
  *             mapping->tree_lock (widely used, in set_page_dirty,
  *                       in arch-dependent flush_dcache_mmap_lock,
  *                       within inode_lock in __sync_single_inode)
- *               zone->lock (within radix tree node alloc)
  */
 
 #include <linux/mm.h>