path: root/mm
author    Jeff Garzik <jgarzik@pobox.com>    2005-09-14 08:12:20 -0400
committer Jeff Garzik <jgarzik@pobox.com>    2005-09-14 08:12:20 -0400
commit    165415f700b0c77fa1f8db6198f48582639adf78 (patch)
tree      088e305b0b5b0c6753072e13be1177824c3ed59d /mm
parent    c324b44c34050cf2a9b58830e11c974806bd85d8 (diff)
parent    2f4ba45a75d6383b4a1201169a808ffea416ffa0 (diff)
Merge /spare/repo/linux-2.6/
Diffstat (limited to 'mm')
-rw-r--r--  mm/bootmem.c        |   14
-rw-r--r--  mm/filemap.c        |   17
-rw-r--r--  mm/memory.c         |    2
-rw-r--r--  mm/mempolicy.c      |    2
-rw-r--r--  mm/nommu.c          |   17
-rw-r--r--  mm/oom_kill.c       |    3
-rw-r--r--  mm/page-writeback.c |    6
-rw-r--r--  mm/page_alloc.c     |    2
-rw-r--r--  mm/shmem.c          |   21
-rw-r--r--  mm/slab.c           | 1138
-rw-r--r--  mm/swap_state.c     |    4
-rw-r--r--  mm/swapfile.c       |    3
-rw-r--r--  mm/vmalloc.c        |    7
13 files changed, 883 insertions, 353 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index c1330cc19783..8ec4e4c2a179 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -61,9 +61,17 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
61{ 61{
62 bootmem_data_t *bdata = pgdat->bdata; 62 bootmem_data_t *bdata = pgdat->bdata;
63 unsigned long mapsize = ((end - start)+7)/8; 63 unsigned long mapsize = ((end - start)+7)/8;
64 64 static struct pglist_data *pgdat_last;
65 pgdat->pgdat_next = pgdat_list; 65
66 pgdat_list = pgdat; 66 pgdat->pgdat_next = NULL;
67 /* Add new nodes last so that bootmem always starts
68 searching in the first nodes, not the last ones */
69 if (pgdat_last)
70 pgdat_last->pgdat_next = pgdat;
71 else {
72 pgdat_list = pgdat;
73 pgdat_last = pgdat;
74 }
67 75
68 mapsize = ALIGN(mapsize, sizeof(long)); 76 mapsize = ALIGN(mapsize, sizeof(long));
69 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); 77 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
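The bootmem hunk above switches node registration from head insertion to tail insertion, so boot-time allocations probe the earliest-registered nodes first. As a rough illustration of that tail-append idiom, here is a standalone userspace sketch with invented names; it is not the kernel code:

#include <stddef.h>
#include <stdio.h>

struct node {
    int id;
    struct node *next;
};

static struct node *node_list;          /* global list head */

static void register_node(struct node *n)
{
    static struct node *node_last;      /* remembers the current tail */

    n->next = NULL;
    if (node_last)
        node_last->next = n;            /* append after the current tail */
    else
        node_list = n;                  /* first registration becomes the head */
    node_last = n;
}

int main(void)
{
    struct node n0 = { .id = 0 }, n1 = { .id = 1 }, n2 = { .id = 2 };

    register_node(&n0);
    register_node(&n1);
    register_node(&n2);

    for (struct node *p = node_list; p; p = p->next)
        printf("node %d\n", p->id);     /* 0, 1, 2: first node is searched first */
    return 0;
}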
diff --git a/mm/filemap.c b/mm/filemap.c
index 88611928e71f..b5346576e58d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -37,6 +37,10 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38#include <asm/mman.h> 38#include <asm/mman.h>
39 39
40static ssize_t
41generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
42 loff_t offset, unsigned long nr_segs);
43
40/* 44/*
41 * Shared mappings implemented 30.11.1994. It's not fully working yet, 45 * Shared mappings implemented 30.11.1994. It's not fully working yet,
42 * though. 46 * though.
@@ -301,8 +305,9 @@ EXPORT_SYMBOL(sync_page_range);
301 * as it forces O_SYNC writers to different parts of the same file 305 * as it forces O_SYNC writers to different parts of the same file
302 * to be serialised right until io completion. 306 * to be serialised right until io completion.
303 */ 307 */
304int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, 308static int sync_page_range_nolock(struct inode *inode,
305 loff_t pos, size_t count) 309 struct address_space *mapping,
310 loff_t pos, size_t count)
306{ 311{
307 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 312 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
308 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 313 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -317,7 +322,6 @@ int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
317 ret = wait_on_page_writeback_range(mapping, start, end); 322 ret = wait_on_page_writeback_range(mapping, start, end);
318 return ret; 323 return ret;
319} 324}
320EXPORT_SYMBOL(sync_page_range_nolock);
321 325
322/** 326/**
323 * filemap_fdatawait - walk the list of under-writeback pages of the given 327 * filemap_fdatawait - walk the list of under-writeback pages of the given
@@ -2008,7 +2012,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2008} 2012}
2009EXPORT_SYMBOL(generic_file_buffered_write); 2013EXPORT_SYMBOL(generic_file_buffered_write);
2010 2014
2011ssize_t 2015static ssize_t
2012__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, 2016__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2013 unsigned long nr_segs, loff_t *ppos) 2017 unsigned long nr_segs, loff_t *ppos)
2014{ 2018{
@@ -2108,7 +2112,7 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2108 return ret; 2112 return ret;
2109} 2113}
2110 2114
2111ssize_t 2115static ssize_t
2112__generic_file_write_nolock(struct file *file, const struct iovec *iov, 2116__generic_file_write_nolock(struct file *file, const struct iovec *iov,
2113 unsigned long nr_segs, loff_t *ppos) 2117 unsigned long nr_segs, loff_t *ppos)
2114{ 2118{
@@ -2229,7 +2233,7 @@ EXPORT_SYMBOL(generic_file_writev);
2229 * Called under i_sem for writes to S_ISREG files. Returns -EIO if something 2233 * Called under i_sem for writes to S_ISREG files. Returns -EIO if something
2230 * went wrong during pagecache shootdown. 2234 * went wrong during pagecache shootdown.
2231 */ 2235 */
2232ssize_t 2236static ssize_t
2233generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 2237generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2234 loff_t offset, unsigned long nr_segs) 2238 loff_t offset, unsigned long nr_segs)
2235{ 2239{
@@ -2264,4 +2268,3 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2264 } 2268 }
2265 return retval; 2269 return retval;
2266} 2270}
2267EXPORT_SYMBOL_GPL(generic_file_direct_IO);
diff --git a/mm/memory.c b/mm/memory.c
index 788a62810340..ae8161f1f459 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2225,7 +2225,7 @@ void update_mem_hiwater(struct task_struct *tsk)
2225#if !defined(__HAVE_ARCH_GATE_AREA) 2225#if !defined(__HAVE_ARCH_GATE_AREA)
2226 2226
2227#if defined(AT_SYSINFO_EHDR) 2227#if defined(AT_SYSINFO_EHDR)
2228struct vm_area_struct gate_vma; 2228static struct vm_area_struct gate_vma;
2229 2229
2230static int __init gate_vma_init(void) 2230static int __init gate_vma_init(void)
2231{ 2231{
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 13492d66b7c8..afa06e184d88 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -88,7 +88,7 @@ static kmem_cache_t *sn_cache;
88 policied. */ 88 policied. */
89static int policy_zone; 89static int policy_zone;
90 90
91static struct mempolicy default_policy = { 91struct mempolicy default_policy = {
92 .refcnt = ATOMIC_INIT(1), /* never free it */ 92 .refcnt = ATOMIC_INIT(1), /* never free it */
93 .policy = MPOL_DEFAULT, 93 .policy = MPOL_DEFAULT,
94}; 94};
diff --git a/mm/nommu.c b/mm/nommu.c
index fd4e8df0f02d..064d70442895 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -57,6 +57,11 @@ DECLARE_RWSEM(nommu_vma_sem);
57struct vm_operations_struct generic_file_vm_ops = { 57struct vm_operations_struct generic_file_vm_ops = {
58}; 58};
59 59
60EXPORT_SYMBOL(vmalloc);
61EXPORT_SYMBOL(vfree);
62EXPORT_SYMBOL(vmalloc_to_page);
63EXPORT_SYMBOL(vmalloc_32);
64
60/* 65/*
61 * Handle all mappings that got truncated by a "truncate()" 66 * Handle all mappings that got truncated by a "truncate()"
62 * system call. 67 * system call.
@@ -142,6 +147,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
142 return(i); 147 return(i);
143} 148}
144 149
150EXPORT_SYMBOL(get_user_pages);
151
145DEFINE_RWLOCK(vmlist_lock); 152DEFINE_RWLOCK(vmlist_lock);
146struct vm_struct *vmlist; 153struct vm_struct *vmlist;
147 154
@@ -852,7 +859,7 @@ unsigned long do_mmap_pgoff(struct file *file,
852 error_getting_vma: 859 error_getting_vma:
853 up_write(&nommu_vma_sem); 860 up_write(&nommu_vma_sem);
854 kfree(vml); 861 kfree(vml);
855 printk("Allocation of vml for %lu byte allocation from process %d failed\n", 862 printk("Allocation of vma for %lu byte allocation from process %d failed\n",
856 len, current->pid); 863 len, current->pid);
857 show_free_areas(); 864 show_free_areas();
858 return -ENOMEM; 865 return -ENOMEM;
@@ -909,7 +916,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
909 916
910 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) 917 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next)
911 if ((*parent)->vma->vm_start == addr && 918 if ((*parent)->vma->vm_start == addr &&
912 (*parent)->vma->vm_end == end) 919 ((len == 0) || ((*parent)->vma->vm_end == end)))
913 goto found; 920 goto found;
914 921
915 printk("munmap of non-mmaped memory by process %d (%s): %p\n", 922 printk("munmap of non-mmaped memory by process %d (%s): %p\n",
@@ -1054,7 +1061,8 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
1054int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, 1061int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
1055 unsigned long to, unsigned long size, pgprot_t prot) 1062 unsigned long to, unsigned long size, pgprot_t prot)
1056{ 1063{
1057 return -EPERM; 1064 vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
1065 return 0;
1058} 1066}
1059 1067
1060void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 1068void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
@@ -1073,9 +1081,10 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1073 1081
1074void update_mem_hiwater(struct task_struct *tsk) 1082void update_mem_hiwater(struct task_struct *tsk)
1075{ 1083{
1076 unsigned long rss = get_mm_counter(tsk->mm, rss); 1084 unsigned long rss;
1077 1085
1078 if (likely(tsk->mm)) { 1086 if (likely(tsk->mm)) {
1087 rss = get_mm_counter(tsk->mm, rss);
1079 if (tsk->mm->hiwater_rss < rss) 1088 if (tsk->mm->hiwater_rss < rss)
1080 tsk->mm->hiwater_rss = rss; 1089 tsk->mm->hiwater_rss = rss;
1081 if (tsk->mm->hiwater_vm < tsk->mm->total_vm) 1090 if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
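The last nommu.c hunk defers the get_mm_counter() read until after the tsk->mm check, so a task without an mm is never dereferenced. A minimal sketch of that reordering, with made-up types standing in for the kernel ones:

#include <stddef.h>
#include <stdio.h>

struct mm_stats {
    unsigned long rss;
    unsigned long hiwater_rss;
};

struct task {
    struct mm_stats *mm;        /* may be NULL, e.g. for kernel threads */
};

/* Only read through tsk->mm once it is known to be non-NULL. */
static void update_hiwater(struct task *tsk)
{
    unsigned long rss;

    if (tsk->mm) {
        rss = tsk->mm->rss;     /* safe: mm has been checked */
        if (tsk->mm->hiwater_rss < rss)
            tsk->mm->hiwater_rss = rss;
    }
}

int main(void)
{
    struct mm_stats stats = { .rss = 42, .hiwater_rss = 10 };
    struct task with_mm = { .mm = &stats };
    struct task no_mm = { .mm = NULL };

    update_hiwater(&with_mm);   /* raises hiwater_rss to 42 */
    update_hiwater(&no_mm);     /* no dereference of a NULL mm */
    printf("hiwater_rss = %lu\n", stats.hiwater_rss);
    return 0;
}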
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5ec8da12cfd9..ac3bf33e5370 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -300,6 +300,5 @@ retry:
300 * Give "p" a good chance of killing itself before we 300 * Give "p" a good chance of killing itself before we
301 * retry to allocate memory. 301 * retry to allocate memory.
302 */ 302 */
303 __set_current_state(TASK_INTERRUPTIBLE); 303 schedule_timeout_interruptible(1);
304 schedule_timeout(1);
305} 304}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a6329fa8f862..0166ea15c9ee 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -368,10 +368,8 @@ int wakeup_pdflush(long nr_pages)
368static void wb_timer_fn(unsigned long unused); 368static void wb_timer_fn(unsigned long unused);
369static void laptop_timer_fn(unsigned long unused); 369static void laptop_timer_fn(unsigned long unused);
370 370
371static struct timer_list wb_timer = 371static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
372 TIMER_INITIALIZER(wb_timer_fn, 0, 0); 372static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
373static struct timer_list laptop_mode_wb_timer =
374 TIMER_INITIALIZER(laptop_timer_fn, 0, 0);
375 373
376/* 374/*
377 * Periodic writeback of "old" data. 375 * Periodic writeback of "old" data.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3974fd81d27c..c5823c395f71 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -335,7 +335,7 @@ static inline void free_pages_check(const char *function, struct page *page)
335/* 335/*
336 * Frees a list of pages. 336 * Frees a list of pages.
337 * Assumes all pages on list are in same zone, and of same order. 337 * Assumes all pages on list are in same zone, and of same order.
338 * count is the number of pages to free, or 0 for all on the list. 338 * count is the number of pages to free.
339 * 339 *
340 * If the zone was previously in an "all pages pinned" state then look to 340 * If the zone was previously in an "all pages pinned" state then look to
341 * see if this freeing clears that state. 341 * see if this freeing clears that state.
diff --git a/mm/shmem.c b/mm/shmem.c
index db2c9e8d9909..1f7aeb210c7b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -666,6 +666,7 @@ static void shmem_delete_inode(struct inode *inode)
666 struct shmem_inode_info *info = SHMEM_I(inode); 666 struct shmem_inode_info *info = SHMEM_I(inode);
667 667
668 if (inode->i_op->truncate == shmem_truncate) { 668 if (inode->i_op->truncate == shmem_truncate) {
669 truncate_inode_pages(inode->i_mapping, 0);
669 shmem_unacct_size(info->flags, inode->i_size); 670 shmem_unacct_size(info->flags, inode->i_size);
670 inode->i_size = 0; 671 inode->i_size = 0;
671 shmem_truncate(inode); 672 shmem_truncate(inode);
@@ -1607,6 +1608,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1607 int error = -ENOSPC; 1608 int error = -ENOSPC;
1608 1609
1609 if (inode) { 1610 if (inode) {
1611 error = security_inode_init_security(inode, dir, NULL, NULL,
1612 NULL);
1613 if (error) {
1614 if (error != -EOPNOTSUPP) {
1615 iput(inode);
1616 return error;
1617 }
1618 error = 0;
1619 }
1610 if (dir->i_mode & S_ISGID) { 1620 if (dir->i_mode & S_ISGID) {
1611 inode->i_gid = dir->i_gid; 1621 inode->i_gid = dir->i_gid;
1612 if (S_ISDIR(mode)) 1622 if (S_ISDIR(mode))
@@ -1616,7 +1626,6 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1616 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1626 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1617 d_instantiate(dentry, inode); 1627 d_instantiate(dentry, inode);
1618 dget(dentry); /* Extra count - pin the dentry in core */ 1628 dget(dentry); /* Extra count - pin the dentry in core */
1619 error = 0;
1620 } 1629 }
1621 return error; 1630 return error;
1622} 1631}
@@ -1746,6 +1755,16 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1746 if (!inode) 1755 if (!inode)
1747 return -ENOSPC; 1756 return -ENOSPC;
1748 1757
1758 error = security_inode_init_security(inode, dir, NULL, NULL,
1759 NULL);
1760 if (error) {
1761 if (error != -EOPNOTSUPP) {
1762 iput(inode);
1763 return error;
1764 }
1765 error = 0;
1766 }
1767
1749 info = SHMEM_I(inode); 1768 info = SHMEM_I(inode);
1750 inode->i_size = len-1; 1769 inode->i_size = len-1;
1751 if (len <= (char *)inode - (char *)info) { 1770 if (len <= (char *)inode - (char *)info) {
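Both shmem.c hunks add the same error-handling shape around the new security hook call: fail hard on real errors, but treat -EOPNOTSUPP (no security module implements the hook) as success. A small self-contained sketch of that pattern, using an invented stand-in rather than the real security_inode_init_security():

#include <errno.h>
#include <stdio.h>

/* Stand-in for a security hook that may simply be unimplemented. */
static int init_security_stub(int unsupported)
{
    return unsupported ? -EOPNOTSUPP : 0;
}

/* -EOPNOTSUPP means "no security module cares": continue as success. */
static int create_object(int hook_unsupported)
{
    int error = init_security_stub(hook_unsupported);

    if (error) {
        if (error != -EOPNOTSUPP)
            return error;       /* a real failure: undo and propagate */
        error = 0;              /* unsupported: not an error for us */
    }
    /* ... the rest of object creation would continue here ... */
    return error;
}

int main(void)
{
    printf("supported hook: %d\n", create_object(0));   /* 0 */
    printf("missing hook:   %d\n", create_object(1));   /* still 0 */
    return 0;
}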
diff --git a/mm/slab.c b/mm/slab.c
index d7c4443991fe..9e876d6dfad9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -75,6 +75,15 @@
75 * 75 *
76 * At present, each engine can be growing a cache. This should be blocked. 76 * At present, each engine can be growing a cache. This should be blocked.
77 * 77 *
78 * 15 March 2005. NUMA slab allocator.
79 * Shai Fultheim <shai@scalex86.org>.
80 * Shobhit Dayal <shobhit@calsoftinc.com>
81 * Alok N Kataria <alokk@calsoftinc.com>
82 * Christoph Lameter <christoph@lameter.com>
83 *
84 * Modified the slab allocator to be node aware on NUMA systems.
85 * Each node has its own list of partial, free and full slabs.
86 * All object allocations for a node occur from node specific slab lists.
78 */ 87 */
79 88
80#include <linux/config.h> 89#include <linux/config.h>
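The new header comment summarises the NUMA rework: each node keeps its own partial, free and full slab lists, and allocations for a node are served from that node's lists. A toy sketch of the per-node layout this implies (invented names, far simpler than the real kmem_list3/nodelists code):

#include <stdio.h>

#define MAX_NODES 4

/* One set of slab list counters per NUMA node (toy stand-in for kmem_list3). */
struct node_lists {
    int nr_partial;
    int nr_free;
    int nr_full;
};

struct toy_cache {
    const char *name;
    struct node_lists *nodelists[MAX_NODES];    /* indexed by node id */
};

int main(void)
{
    static struct node_lists lists[MAX_NODES];
    struct toy_cache cache = { .name = "toy" };

    for (int node = 0; node < MAX_NODES; node++)
        cache.nodelists[node] = &lists[node];

    /* An allocation on node 1 would only consult cache.nodelists[1]. */
    cache.nodelists[1]->nr_partial++;
    printf("%s: node 1 partial slabs = %d\n",
           cache.name, cache.nodelists[1]->nr_partial);
    return 0;
}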
@@ -93,6 +102,7 @@
93#include <linux/module.h> 102#include <linux/module.h>
94#include <linux/rcupdate.h> 103#include <linux/rcupdate.h>
95#include <linux/string.h> 104#include <linux/string.h>
105#include <linux/nodemask.h>
96 106
97#include <asm/uaccess.h> 107#include <asm/uaccess.h>
98#include <asm/cacheflush.h> 108#include <asm/cacheflush.h>
@@ -212,6 +222,7 @@ struct slab {
212 void *s_mem; /* including colour offset */ 222 void *s_mem; /* including colour offset */
213 unsigned int inuse; /* num of objs active in slab */ 223 unsigned int inuse; /* num of objs active in slab */
214 kmem_bufctl_t free; 224 kmem_bufctl_t free;
225 unsigned short nodeid;
215}; 226};
216 227
217/* 228/*
@@ -239,7 +250,6 @@ struct slab_rcu {
239/* 250/*
240 * struct array_cache 251 * struct array_cache
241 * 252 *
242 * Per cpu structures
243 * Purpose: 253 * Purpose:
244 * - LIFO ordering, to hand out cache-warm objects from _alloc 254 * - LIFO ordering, to hand out cache-warm objects from _alloc
245 * - reduce the number of linked list operations 255 * - reduce the number of linked list operations
@@ -254,6 +264,13 @@ struct array_cache {
254 unsigned int limit; 264 unsigned int limit;
255 unsigned int batchcount; 265 unsigned int batchcount;
256 unsigned int touched; 266 unsigned int touched;
267 spinlock_t lock;
268 void *entry[0]; /*
269 * Must have this definition in here for the proper
270 * alignment of array_cache. Also simplifies accessing
271 * the entries.
272 * [0] is for gcc 2.95. It should really be [].
273 */
257}; 274};
258 275
259/* bootstrap: The caches do not work without cpuarrays anymore, 276/* bootstrap: The caches do not work without cpuarrays anymore,
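The entry[0] member added above lets the cached object pointers live in the same allocation as the array_cache header, replacing the old ac_entry() pointer arithmetic. A minimal userspace sketch of that header-plus-trailing-array allocation, written with a modern flexible array member:

#include <stdio.h>
#include <stdlib.h>

/* Header followed, in the same allocation, by its entries. */
struct cache_array {
    unsigned int avail;
    unsigned int limit;
    void *entry[];              /* flexible array member ([] rather than [0]) */
};

static struct cache_array *alloc_cache_array(unsigned int entries)
{
    struct cache_array *ac;

    ac = malloc(sizeof(*ac) + entries * sizeof(void *));
    if (ac) {
        ac->avail = 0;
        ac->limit = entries;
    }
    return ac;
}

int main(void)
{
    struct cache_array *ac = alloc_cache_array(8);
    int obj = 42;

    if (!ac)
        return 1;
    ac->entry[ac->avail++] = &obj;      /* push a cached object */
    printf("avail=%u limit=%u obj=%d\n",
           ac->avail, ac->limit, *(int *)ac->entry[0]);
    free(ac);
    return 0;
}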
@@ -266,34 +283,83 @@ struct arraycache_init {
266}; 283};
267 284
268/* 285/*
269 * The slab lists of all objects. 286 * The slab lists for all objects.
270 * Hopefully reduce the internal fragmentation
271 * NUMA: The spinlock could be moved from the kmem_cache_t
272 * into this structure, too. Figure out what causes
273 * fewer cross-node spinlock operations.
274 */ 287 */
275struct kmem_list3 { 288struct kmem_list3 {
276 struct list_head slabs_partial; /* partial list first, better asm code */ 289 struct list_head slabs_partial; /* partial list first, better asm code */
277 struct list_head slabs_full; 290 struct list_head slabs_full;
278 struct list_head slabs_free; 291 struct list_head slabs_free;
279 unsigned long free_objects; 292 unsigned long free_objects;
280 int free_touched;
281 unsigned long next_reap; 293 unsigned long next_reap;
282 struct array_cache *shared; 294 int free_touched;
295 unsigned int free_limit;
296 spinlock_t list_lock;
297 struct array_cache *shared; /* shared per node */
298 struct array_cache **alien; /* on other nodes */
283}; 299};
284 300
285#define LIST3_INIT(parent) \ 301/*
286 { \ 302 * Need this for bootstrapping a per node allocator.
287 .slabs_full = LIST_HEAD_INIT(parent.slabs_full), \ 303 */
288 .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \ 304#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
289 .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \ 305struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
306#define CACHE_CACHE 0
307#define SIZE_AC 1
308#define SIZE_L3 (1 + MAX_NUMNODES)
309
310/*
311 * This function may be completely optimized away if
312 * a constant is passed to it. Mostly the same as
313 * what is in linux/slab.h except it returns an
314 * index.
315 */
316static inline int index_of(const size_t size)
317{
318 if (__builtin_constant_p(size)) {
319 int i = 0;
320
321#define CACHE(x) \
322 if (size <=x) \
323 return i; \
324 else \
325 i++;
326#include "linux/kmalloc_sizes.h"
327#undef CACHE
328 {
329 extern void __bad_size(void);
330 __bad_size();
331 }
290 } 332 }
291#define list3_data(cachep) \ 333 return 0;
292 (&(cachep)->lists) 334}
335
336#define INDEX_AC index_of(sizeof(struct arraycache_init))
337#define INDEX_L3 index_of(sizeof(struct kmem_list3))
338
339static inline void kmem_list3_init(struct kmem_list3 *parent)
340{
341 INIT_LIST_HEAD(&parent->slabs_full);
342 INIT_LIST_HEAD(&parent->slabs_partial);
343 INIT_LIST_HEAD(&parent->slabs_free);
344 parent->shared = NULL;
345 parent->alien = NULL;
346 spin_lock_init(&parent->list_lock);
347 parent->free_objects = 0;
348 parent->free_touched = 0;
349}
293 350
294/* NUMA: per-node */ 351#define MAKE_LIST(cachep, listp, slab, nodeid) \
295#define list3_data_ptr(cachep, ptr) \ 352 do { \
296 list3_data(cachep) 353 INIT_LIST_HEAD(listp); \
354 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
355 } while (0)
356
357#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
358 do { \
359 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
360 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
361 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
362 } while (0)
297 363
298/* 364/*
299 * kmem_cache_t 365 * kmem_cache_t
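index_of() above turns a size into the index of the matching general cache by walking the kmalloc size table with an if/else ladder; when the argument is a compile-time constant, the whole ladder folds to a constant. A reduced sketch of the same X-macro trick, with a tiny inline size table instead of linux/kmalloc_sizes.h:

#include <stddef.h>
#include <stdio.h>

/* A tiny stand-in for the kmalloc size table. */
#define FOR_EACH_SIZE(CACHE) \
    CACHE(32)   \
    CACHE(64)   \
    CACHE(128)  \
    CACHE(256)

/* Walk the table; the first size that fits gives the cache index. */
static inline int index_of(size_t size)
{
    int i = 0;

#define CACHE(x)            \
    if (size <= (x))        \
        return i;           \
    else                    \
        i++;
    FOR_EACH_SIZE(CACHE)
#undef CACHE
    return -1;              /* larger than every general cache */
}

int main(void)
{
    /* With a constant argument the compiler can reduce this to a constant. */
    printf("index_of(40)  = %d\n", index_of(40));
    printf("index_of(200) = %d\n", index_of(200));
    return 0;
}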
@@ -306,13 +372,12 @@ struct kmem_cache_s {
306 struct array_cache *array[NR_CPUS]; 372 struct array_cache *array[NR_CPUS];
307 unsigned int batchcount; 373 unsigned int batchcount;
308 unsigned int limit; 374 unsigned int limit;
309/* 2) touched by every alloc & free from the backend */ 375 unsigned int shared;
310 struct kmem_list3 lists;
311 /* NUMA: kmem_3list_t *nodelists[MAX_NUMNODES] */
312 unsigned int objsize; 376 unsigned int objsize;
377/* 2) touched by every alloc & free from the backend */
378 struct kmem_list3 *nodelists[MAX_NUMNODES];
313 unsigned int flags; /* constant flags */ 379 unsigned int flags; /* constant flags */
314 unsigned int num; /* # of objs per slab */ 380 unsigned int num; /* # of objs per slab */
315 unsigned int free_limit; /* upper limit of objects in the lists */
316 spinlock_t spinlock; 381 spinlock_t spinlock;
317 382
318/* 3) cache_grow/shrink */ 383/* 3) cache_grow/shrink */
@@ -349,6 +414,7 @@ struct kmem_cache_s {
349 unsigned long errors; 414 unsigned long errors;
350 unsigned long max_freeable; 415 unsigned long max_freeable;
351 unsigned long node_allocs; 416 unsigned long node_allocs;
417 unsigned long node_frees;
352 atomic_t allochit; 418 atomic_t allochit;
353 atomic_t allocmiss; 419 atomic_t allocmiss;
354 atomic_t freehit; 420 atomic_t freehit;
@@ -384,6 +450,7 @@ struct kmem_cache_s {
384 } while (0) 450 } while (0)
385#define STATS_INC_ERR(x) ((x)->errors++) 451#define STATS_INC_ERR(x) ((x)->errors++)
386#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 452#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
453#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
387#define STATS_SET_FREEABLE(x, i) \ 454#define STATS_SET_FREEABLE(x, i) \
388 do { if ((x)->max_freeable < i) \ 455 do { if ((x)->max_freeable < i) \
389 (x)->max_freeable = i; \ 456 (x)->max_freeable = i; \
@@ -402,6 +469,7 @@ struct kmem_cache_s {
402#define STATS_SET_HIGH(x) do { } while (0) 469#define STATS_SET_HIGH(x) do { } while (0)
403#define STATS_INC_ERR(x) do { } while (0) 470#define STATS_INC_ERR(x) do { } while (0)
404#define STATS_INC_NODEALLOCS(x) do { } while (0) 471#define STATS_INC_NODEALLOCS(x) do { } while (0)
472#define STATS_INC_NODEFREES(x) do { } while (0)
405#define STATS_SET_FREEABLE(x, i) \ 473#define STATS_SET_FREEABLE(x, i) \
406 do { } while (0) 474 do { } while (0)
407 475
@@ -534,9 +602,9 @@ static struct arraycache_init initarray_generic =
534 602
535/* internal cache of cache description objs */ 603/* internal cache of cache description objs */
536static kmem_cache_t cache_cache = { 604static kmem_cache_t cache_cache = {
537 .lists = LIST3_INIT(cache_cache.lists),
538 .batchcount = 1, 605 .batchcount = 1,
539 .limit = BOOT_CPUCACHE_ENTRIES, 606 .limit = BOOT_CPUCACHE_ENTRIES,
607 .shared = 1,
540 .objsize = sizeof(kmem_cache_t), 608 .objsize = sizeof(kmem_cache_t),
541 .flags = SLAB_NO_REAP, 609 .flags = SLAB_NO_REAP,
542 .spinlock = SPIN_LOCK_UNLOCKED, 610 .spinlock = SPIN_LOCK_UNLOCKED,
@@ -557,7 +625,6 @@ static struct list_head cache_chain;
557 * SLAB_RECLAIM_ACCOUNT turns this on per-slab 625 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
558 */ 626 */
559atomic_t slab_reclaim_pages; 627atomic_t slab_reclaim_pages;
560EXPORT_SYMBOL(slab_reclaim_pages);
561 628
562/* 629/*
563 * chicken and egg problem: delay the per-cpu array allocation 630 * chicken and egg problem: delay the per-cpu array allocation
@@ -565,7 +632,8 @@ EXPORT_SYMBOL(slab_reclaim_pages);
565 */ 632 */
566static enum { 633static enum {
567 NONE, 634 NONE,
568 PARTIAL, 635 PARTIAL_AC,
636 PARTIAL_L3,
569 FULL 637 FULL
570} g_cpucache_up; 638} g_cpucache_up;
571 639
@@ -574,11 +642,7 @@ static DEFINE_PER_CPU(struct work_struct, reap_work);
574static void free_block(kmem_cache_t* cachep, void** objpp, int len); 642static void free_block(kmem_cache_t* cachep, void** objpp, int len);
575static void enable_cpucache (kmem_cache_t *cachep); 643static void enable_cpucache (kmem_cache_t *cachep);
576static void cache_reap (void *unused); 644static void cache_reap (void *unused);
577 645static int __node_shrink(kmem_cache_t *cachep, int node);
578static inline void **ac_entry(struct array_cache *ac)
579{
580 return (void**)(ac+1);
581}
582 646
583static inline struct array_cache *ac_data(kmem_cache_t *cachep) 647static inline struct array_cache *ac_data(kmem_cache_t *cachep)
584{ 648{
@@ -676,48 +740,160 @@ static void __devinit start_cpu_timer(int cpu)
676 } 740 }
677} 741}
678 742
679static struct array_cache *alloc_arraycache(int cpu, int entries, 743static struct array_cache *alloc_arraycache(int node, int entries,
680 int batchcount) 744 int batchcount)
681{ 745{
682 int memsize = sizeof(void*)*entries+sizeof(struct array_cache); 746 int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
683 struct array_cache *nc = NULL; 747 struct array_cache *nc = NULL;
684 748
685 if (cpu == -1) 749 nc = kmalloc_node(memsize, GFP_KERNEL, node);
686 nc = kmalloc(memsize, GFP_KERNEL);
687 else
688 nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu));
689
690 if (nc) { 750 if (nc) {
691 nc->avail = 0; 751 nc->avail = 0;
692 nc->limit = entries; 752 nc->limit = entries;
693 nc->batchcount = batchcount; 753 nc->batchcount = batchcount;
694 nc->touched = 0; 754 nc->touched = 0;
755 spin_lock_init(&nc->lock);
695 } 756 }
696 return nc; 757 return nc;
697} 758}
698 759
760#ifdef CONFIG_NUMA
761static inline struct array_cache **alloc_alien_cache(int node, int limit)
762{
763 struct array_cache **ac_ptr;
764 int memsize = sizeof(void*)*MAX_NUMNODES;
765 int i;
766
767 if (limit > 1)
768 limit = 12;
769 ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
770 if (ac_ptr) {
771 for_each_node(i) {
772 if (i == node || !node_online(i)) {
773 ac_ptr[i] = NULL;
774 continue;
775 }
776 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
777 if (!ac_ptr[i]) {
778 for (i--; i <=0; i--)
779 kfree(ac_ptr[i]);
780 kfree(ac_ptr);
781 return NULL;
782 }
783 }
784 }
785 return ac_ptr;
786}
787
788static inline void free_alien_cache(struct array_cache **ac_ptr)
789{
790 int i;
791
792 if (!ac_ptr)
793 return;
794
795 for_each_node(i)
796 kfree(ac_ptr[i]);
797
798 kfree(ac_ptr);
799}
800
801static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node)
802{
803 struct kmem_list3 *rl3 = cachep->nodelists[node];
804
805 if (ac->avail) {
806 spin_lock(&rl3->list_lock);
807 free_block(cachep, ac->entry, ac->avail);
808 ac->avail = 0;
809 spin_unlock(&rl3->list_lock);
810 }
811}
812
813static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
814{
815 int i=0;
816 struct array_cache *ac;
817 unsigned long flags;
818
819 for_each_online_node(i) {
820 ac = l3->alien[i];
821 if (ac) {
822 spin_lock_irqsave(&ac->lock, flags);
823 __drain_alien_cache(cachep, ac, i);
824 spin_unlock_irqrestore(&ac->lock, flags);
825 }
826 }
827}
828#else
829#define alloc_alien_cache(node, limit) do { } while (0)
830#define free_alien_cache(ac_ptr) do { } while (0)
831#define drain_alien_cache(cachep, l3) do { } while (0)
832#endif
833
699static int __devinit cpuup_callback(struct notifier_block *nfb, 834static int __devinit cpuup_callback(struct notifier_block *nfb,
700 unsigned long action, void *hcpu) 835 unsigned long action, void *hcpu)
701{ 836{
702 long cpu = (long)hcpu; 837 long cpu = (long)hcpu;
703 kmem_cache_t* cachep; 838 kmem_cache_t* cachep;
839 struct kmem_list3 *l3 = NULL;
840 int node = cpu_to_node(cpu);
841 int memsize = sizeof(struct kmem_list3);
842 struct array_cache *nc = NULL;
704 843
705 switch (action) { 844 switch (action) {
706 case CPU_UP_PREPARE: 845 case CPU_UP_PREPARE:
707 down(&cache_chain_sem); 846 down(&cache_chain_sem);
847 /* we need to do this right in the beginning since
848 * alloc_arraycache's are going to use this list.
849 * kmalloc_node allows us to add the slab to the right
850 * kmem_list3 and not this cpu's kmem_list3
851 */
852
708 list_for_each_entry(cachep, &cache_chain, next) { 853 list_for_each_entry(cachep, &cache_chain, next) {
709 struct array_cache *nc; 854 /* setup the size64 kmemlist for cpu before we can
855 * begin anything. Make sure some other cpu on this
856 * node has not already allocated this
857 */
858 if (!cachep->nodelists[node]) {
859 if (!(l3 = kmalloc_node(memsize,
860 GFP_KERNEL, node)))
861 goto bad;
862 kmem_list3_init(l3);
863 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
864 ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
865
866 cachep->nodelists[node] = l3;
867 }
868
869 spin_lock_irq(&cachep->nodelists[node]->list_lock);
870 cachep->nodelists[node]->free_limit =
871 (1 + nr_cpus_node(node)) *
872 cachep->batchcount + cachep->num;
873 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
874 }
710 875
711 nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount); 876 /* Now we can go ahead with allocating the shared array's
877 & array cache's */
878 list_for_each_entry(cachep, &cache_chain, next) {
879 nc = alloc_arraycache(node, cachep->limit,
880 cachep->batchcount);
712 if (!nc) 881 if (!nc)
713 goto bad; 882 goto bad;
714
715 spin_lock_irq(&cachep->spinlock);
716 cachep->array[cpu] = nc; 883 cachep->array[cpu] = nc;
717 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
718 + cachep->num;
719 spin_unlock_irq(&cachep->spinlock);
720 884
885 l3 = cachep->nodelists[node];
886 BUG_ON(!l3);
887 if (!l3->shared) {
888 if (!(nc = alloc_arraycache(node,
889 cachep->shared*cachep->batchcount,
890 0xbaadf00d)))
891 goto bad;
892
893 /* we are serialised from CPU_DEAD or
894 CPU_UP_CANCELLED by the cpucontrol lock */
895 l3->shared = nc;
896 }
721 } 897 }
722 up(&cache_chain_sem); 898 up(&cache_chain_sem);
723 break; 899 break;
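The alien-cache helpers introduced above build, for each node, a MAX_NUMNODES-sized table holding one small cache per remote node (and NULL for the local slot), so objects freed on the wrong node can be batched and drained back later. A rough userspace sketch of that table shape, with invented types and simplified error unwinding:

#include <stdio.h>
#include <stdlib.h>

#define MAX_NODES 4

struct remote_cache {
    int node;       /* node the cached objects belong to */
    int count;
};

/*
 * Build a MAX_NODES-sized table for 'node': one small cache per remote
 * node, NULL for the local slot. Roughly the shape of the alien table.
 */
static struct remote_cache **alloc_alien_table(int node)
{
    struct remote_cache **tbl = calloc(MAX_NODES, sizeof(*tbl));

    if (!tbl)
        return NULL;
    for (int i = 0; i < MAX_NODES; i++) {
        if (i == node)
            continue;           /* local frees never go through here */
        tbl[i] = calloc(1, sizeof(**tbl));
        if (!tbl[i]) {
            while (--i >= 0)    /* unwind the slots allocated so far */
                free(tbl[i]);
            free(tbl);
            return NULL;
        }
        tbl[i]->node = i;
    }
    return tbl;
}

int main(void)
{
    struct remote_cache **alien = alloc_alien_table(1);

    if (!alien)
        return 1;
    alien[3]->count++;          /* a node-3 object freed while running on node 1 */
    printf("pending for node 3: %d (local slot %p)\n",
           alien[3]->count, (void *)alien[1]);
    for (int i = 0; i < MAX_NODES; i++)
        free(alien[i]);
    free(alien);
    return 0;
}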
@@ -732,13 +908,51 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
732 908
733 list_for_each_entry(cachep, &cache_chain, next) { 909 list_for_each_entry(cachep, &cache_chain, next) {
734 struct array_cache *nc; 910 struct array_cache *nc;
911 cpumask_t mask;
735 912
913 mask = node_to_cpumask(node);
736 spin_lock_irq(&cachep->spinlock); 914 spin_lock_irq(&cachep->spinlock);
737 /* cpu is dead; no one can alloc from it. */ 915 /* cpu is dead; no one can alloc from it. */
738 nc = cachep->array[cpu]; 916 nc = cachep->array[cpu];
739 cachep->array[cpu] = NULL; 917 cachep->array[cpu] = NULL;
740 cachep->free_limit -= cachep->batchcount; 918 l3 = cachep->nodelists[node];
741 free_block(cachep, ac_entry(nc), nc->avail); 919
920 if (!l3)
921 goto unlock_cache;
922
923 spin_lock(&l3->list_lock);
924
925 /* Free limit for this kmem_list3 */
926 l3->free_limit -= cachep->batchcount;
927 if (nc)
928 free_block(cachep, nc->entry, nc->avail);
929
930 if (!cpus_empty(mask)) {
931 spin_unlock(&l3->list_lock);
932 goto unlock_cache;
933 }
934
935 if (l3->shared) {
936 free_block(cachep, l3->shared->entry,
937 l3->shared->avail);
938 kfree(l3->shared);
939 l3->shared = NULL;
940 }
941 if (l3->alien) {
942 drain_alien_cache(cachep, l3);
943 free_alien_cache(l3->alien);
944 l3->alien = NULL;
945 }
946
947 /* free slabs belonging to this node */
948 if (__node_shrink(cachep, node)) {
949 cachep->nodelists[node] = NULL;
950 spin_unlock(&l3->list_lock);
951 kfree(l3);
952 } else {
953 spin_unlock(&l3->list_lock);
954 }
955unlock_cache:
742 spin_unlock_irq(&cachep->spinlock); 956 spin_unlock_irq(&cachep->spinlock);
743 kfree(nc); 957 kfree(nc);
744 } 958 }
@@ -754,6 +968,25 @@ bad:
754 968
755static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; 969static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
756 970
971/*
972 * swap the static kmem_list3 with kmalloced memory
973 */
974static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list,
975 int nodeid)
976{
977 struct kmem_list3 *ptr;
978
979 BUG_ON(cachep->nodelists[nodeid] != list);
980 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
981 BUG_ON(!ptr);
982
983 local_irq_disable();
984 memcpy(ptr, list, sizeof(struct kmem_list3));
985 MAKE_ALL_LISTS(cachep, ptr, nodeid);
986 cachep->nodelists[nodeid] = ptr;
987 local_irq_enable();
988}
989
757/* Initialisation. 990/* Initialisation.
758 * Called after the gfp() functions have been enabled, and before smp_init(). 991 * Called after the gfp() functions have been enabled, and before smp_init().
759 */ 992 */
@@ -762,6 +995,13 @@ void __init kmem_cache_init(void)
762 size_t left_over; 995 size_t left_over;
763 struct cache_sizes *sizes; 996 struct cache_sizes *sizes;
764 struct cache_names *names; 997 struct cache_names *names;
998 int i;
999
1000 for (i = 0; i < NUM_INIT_LISTS; i++) {
1001 kmem_list3_init(&initkmem_list3[i]);
1002 if (i < MAX_NUMNODES)
1003 cache_cache.nodelists[i] = NULL;
1004 }
765 1005
766 /* 1006 /*
767 * Fragmentation resistance on low memory - only use bigger 1007 * Fragmentation resistance on low memory - only use bigger
@@ -770,21 +1010,24 @@ void __init kmem_cache_init(void)
770 if (num_physpages > (32 << 20) >> PAGE_SHIFT) 1010 if (num_physpages > (32 << 20) >> PAGE_SHIFT)
771 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 1011 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
772 1012
773
774 /* Bootstrap is tricky, because several objects are allocated 1013 /* Bootstrap is tricky, because several objects are allocated
775 * from caches that do not exist yet: 1014 * from caches that do not exist yet:
776 * 1) initialize the cache_cache cache: it contains the kmem_cache_t 1015 * 1) initialize the cache_cache cache: it contains the kmem_cache_t
777 * structures of all caches, except cache_cache itself: cache_cache 1016 * structures of all caches, except cache_cache itself: cache_cache
778 * is statically allocated. 1017 * is statically allocated.
779 * Initially an __init data area is used for the head array, it's 1018 * Initially an __init data area is used for the head array and the
780 * replaced with a kmalloc allocated array at the end of the bootstrap. 1019 * kmem_list3 structures, it's replaced with a kmalloc allocated
1020 * array at the end of the bootstrap.
781 * 2) Create the first kmalloc cache. 1021 * 2) Create the first kmalloc cache.
782 * The kmem_cache_t for the new cache is allocated normally. An __init 1022 * The kmem_cache_t for the new cache is allocated normally.
783 * data area is used for the head array. 1023 * An __init data area is used for the head array.
784 * 3) Create the remaining kmalloc caches, with minimally sized head arrays. 1024 * 3) Create the remaining kmalloc caches, with minimally sized
1025 * head arrays.
785 * 4) Replace the __init data head arrays for cache_cache and the first 1026 * 4) Replace the __init data head arrays for cache_cache and the first
786 * kmalloc cache with kmalloc allocated arrays. 1027 * kmalloc cache with kmalloc allocated arrays.
787 * 5) Resize the head arrays of the kmalloc caches to their final sizes. 1028 * 5) Replace the __init data for kmem_list3 for cache_cache and
1029 * the other cache's with kmalloc allocated memory.
1030 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
788 */ 1031 */
789 1032
790 /* 1) create the cache_cache */ 1033 /* 1) create the cache_cache */
@@ -793,6 +1036,7 @@ void __init kmem_cache_init(void)
793 list_add(&cache_cache.next, &cache_chain); 1036 list_add(&cache_cache.next, &cache_chain);
794 cache_cache.colour_off = cache_line_size(); 1037 cache_cache.colour_off = cache_line_size();
795 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1038 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1039 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
796 1040
797 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); 1041 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
798 1042
@@ -810,15 +1054,33 @@ void __init kmem_cache_init(void)
810 sizes = malloc_sizes; 1054 sizes = malloc_sizes;
811 names = cache_names; 1055 names = cache_names;
812 1056
1057 /* Initialize the caches that provide memory for the array cache
1058 * and the kmem_list3 structures first.
1059 * Without this, further allocations will bug
1060 */
1061
1062 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1063 sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN,
1064 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
1065
1066 if (INDEX_AC != INDEX_L3)
1067 sizes[INDEX_L3].cs_cachep =
1068 kmem_cache_create(names[INDEX_L3].name,
1069 sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN,
1070 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
1071
813 while (sizes->cs_size != ULONG_MAX) { 1072 while (sizes->cs_size != ULONG_MAX) {
814 /* For performance, all the general caches are L1 aligned. 1073 /*
1074 * For performance, all the general caches are L1 aligned.
815 * This should be particularly beneficial on SMP boxes, as it 1075 * This should be particularly beneficial on SMP boxes, as it
816 * eliminates "false sharing". 1076 * eliminates "false sharing".
817 * Note for systems short on memory removing the alignment will 1077 * Note for systems short on memory removing the alignment will
818 * allow tighter packing of the smaller caches. */ 1078 * allow tighter packing of the smaller caches.
819 sizes->cs_cachep = kmem_cache_create(names->name, 1079 */
820 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1080 if(!sizes->cs_cachep)
821 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1081 sizes->cs_cachep = kmem_cache_create(names->name,
1082 sizes->cs_size, ARCH_KMALLOC_MINALIGN,
1083 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
822 1084
823 /* Inc off-slab bufctl limit until the ceiling is hit. */ 1085 /* Inc off-slab bufctl limit until the ceiling is hit. */
824 if (!(OFF_SLAB(sizes->cs_cachep))) { 1086 if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -837,24 +1099,47 @@ void __init kmem_cache_init(void)
837 /* 4) Replace the bootstrap head arrays */ 1099 /* 4) Replace the bootstrap head arrays */
838 { 1100 {
839 void * ptr; 1101 void * ptr;
840 1102
841 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1103 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1104
842 local_irq_disable(); 1105 local_irq_disable();
843 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); 1106 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
844 memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init)); 1107 memcpy(ptr, ac_data(&cache_cache),
1108 sizeof(struct arraycache_init));
845 cache_cache.array[smp_processor_id()] = ptr; 1109 cache_cache.array[smp_processor_id()] = ptr;
846 local_irq_enable(); 1110 local_irq_enable();
847 1111
848 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1112 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1113
849 local_irq_disable(); 1114 local_irq_disable();
850 BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache); 1115 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
851 memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep), 1116 != &initarray_generic.cache);
1117 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
852 sizeof(struct arraycache_init)); 1118 sizeof(struct arraycache_init));
853 malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr; 1119 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1120 ptr;
854 local_irq_enable(); 1121 local_irq_enable();
855 } 1122 }
1123 /* 5) Replace the bootstrap kmem_list3's */
1124 {
1125 int node;
1126 /* Replace the static kmem_list3 structures for the boot cpu */
1127 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
1128 numa_node_id());
1129
1130 for_each_online_node(node) {
1131 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1132 &initkmem_list3[SIZE_AC+node], node);
1133
1134 if (INDEX_AC != INDEX_L3) {
1135 init_list(malloc_sizes[INDEX_L3].cs_cachep,
1136 &initkmem_list3[SIZE_L3+node],
1137 node);
1138 }
1139 }
1140 }
856 1141
857 /* 5) resize the head arrays to their final sizes */ 1142 /* 6) resize the head arrays to their final sizes */
858 { 1143 {
859 kmem_cache_t *cachep; 1144 kmem_cache_t *cachep;
860 down(&cache_chain_sem); 1145 down(&cache_chain_sem);
@@ -870,7 +1155,6 @@ void __init kmem_cache_init(void)
870 * that initializes ac_data for all new cpus 1155 * that initializes ac_data for all new cpus
871 */ 1156 */
872 register_cpu_notifier(&cpucache_notifier); 1157 register_cpu_notifier(&cpucache_notifier);
873
874 1158
875 /* The reap timers are started later, with a module init call: 1159 /* The reap timers are started later, with a module init call:
876 * That part of the kernel is not yet operational. 1160 * That part of the kernel is not yet operational.
@@ -885,10 +1169,8 @@ static int __init cpucache_init(void)
885 * Register the timers that return unneeded 1169 * Register the timers that return unneeded
886 * pages to gfp. 1170 * pages to gfp.
887 */ 1171 */
888 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1172 for_each_online_cpu(cpu)
889 if (cpu_online(cpu)) 1173 start_cpu_timer(cpu);
890 start_cpu_timer(cpu);
891 }
892 1174
893 return 0; 1175 return 0;
894} 1176}
@@ -1167,6 +1449,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1167 } 1449 }
1168} 1450}
1169 1451
1452/* For setting up all the kmem_list3s for cache whose objsize is same
1453 as size of kmem_list3. */
1454static inline void set_up_list3s(kmem_cache_t *cachep, int index)
1455{
1456 int node;
1457
1458 for_each_online_node(node) {
1459 cachep->nodelists[node] = &initkmem_list3[index+node];
1460 cachep->nodelists[node]->next_reap = jiffies +
1461 REAPTIMEOUT_LIST3 +
1462 ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
1463 }
1464}
1465
1170/** 1466/**
1171 * kmem_cache_create - Create a cache. 1467 * kmem_cache_create - Create a cache.
1172 * @name: A string which is used in /proc/slabinfo to identify this cache. 1468 * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -1320,7 +1616,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1320 size += BYTES_PER_WORD; 1616 size += BYTES_PER_WORD;
1321 } 1617 }
1322#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 1618#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
1323 if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { 1619 if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
1324 cachep->dbghead += PAGE_SIZE - size; 1620 cachep->dbghead += PAGE_SIZE - size;
1325 size = PAGE_SIZE; 1621 size = PAGE_SIZE;
1326 } 1622 }
@@ -1422,13 +1718,9 @@ next:
1422 cachep->gfpflags |= GFP_DMA; 1718 cachep->gfpflags |= GFP_DMA;
1423 spin_lock_init(&cachep->spinlock); 1719 spin_lock_init(&cachep->spinlock);
1424 cachep->objsize = size; 1720 cachep->objsize = size;
1425 /* NUMA */
1426 INIT_LIST_HEAD(&cachep->lists.slabs_full);
1427 INIT_LIST_HEAD(&cachep->lists.slabs_partial);
1428 INIT_LIST_HEAD(&cachep->lists.slabs_free);
1429 1721
1430 if (flags & CFLGS_OFF_SLAB) 1722 if (flags & CFLGS_OFF_SLAB)
1431 cachep->slabp_cache = kmem_find_general_cachep(slab_size,0); 1723 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
1432 cachep->ctor = ctor; 1724 cachep->ctor = ctor;
1433 cachep->dtor = dtor; 1725 cachep->dtor = dtor;
1434 cachep->name = name; 1726 cachep->name = name;
@@ -1444,11 +1736,43 @@ next:
1444 * the cache that's used by kmalloc(24), otherwise 1736 * the cache that's used by kmalloc(24), otherwise
1445 * the creation of further caches will BUG(). 1737 * the creation of further caches will BUG().
1446 */ 1738 */
1447 cachep->array[smp_processor_id()] = &initarray_generic.cache; 1739 cachep->array[smp_processor_id()] =
1448 g_cpucache_up = PARTIAL; 1740 &initarray_generic.cache;
1741
1742 /* If the cache that's used by
1743 * kmalloc(sizeof(kmem_list3)) is the first cache,
1744 * then we need to set up all its list3s, otherwise
1745 * the creation of further caches will BUG().
1746 */
1747 set_up_list3s(cachep, SIZE_AC);
1748 if (INDEX_AC == INDEX_L3)
1749 g_cpucache_up = PARTIAL_L3;
1750 else
1751 g_cpucache_up = PARTIAL_AC;
1449 } else { 1752 } else {
1450 cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL); 1753 cachep->array[smp_processor_id()] =
1754 kmalloc(sizeof(struct arraycache_init),
1755 GFP_KERNEL);
1756
1757 if (g_cpucache_up == PARTIAL_AC) {
1758 set_up_list3s(cachep, SIZE_L3);
1759 g_cpucache_up = PARTIAL_L3;
1760 } else {
1761 int node;
1762 for_each_online_node(node) {
1763
1764 cachep->nodelists[node] =
1765 kmalloc_node(sizeof(struct kmem_list3),
1766 GFP_KERNEL, node);
1767 BUG_ON(!cachep->nodelists[node]);
1768 kmem_list3_init(cachep->nodelists[node]);
1769 }
1770 }
1451 } 1771 }
1772 cachep->nodelists[numa_node_id()]->next_reap =
1773 jiffies + REAPTIMEOUT_LIST3 +
1774 ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
1775
1452 BUG_ON(!ac_data(cachep)); 1776 BUG_ON(!ac_data(cachep));
1453 ac_data(cachep)->avail = 0; 1777 ac_data(cachep)->avail = 0;
1454 ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 1778 ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
@@ -1456,13 +1780,8 @@ next:
1456 ac_data(cachep)->touched = 0; 1780 ac_data(cachep)->touched = 0;
1457 cachep->batchcount = 1; 1781 cachep->batchcount = 1;
1458 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1782 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1459 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
1460 + cachep->num;
1461 } 1783 }
1462 1784
1463 cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
1464 ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
1465
1466 /* Need the semaphore to access the chain. */ 1785 /* Need the semaphore to access the chain. */
1467 down(&cache_chain_sem); 1786 down(&cache_chain_sem);
1468 { 1787 {
@@ -1519,13 +1838,23 @@ static void check_spinlock_acquired(kmem_cache_t *cachep)
1519{ 1838{
1520#ifdef CONFIG_SMP 1839#ifdef CONFIG_SMP
1521 check_irq_off(); 1840 check_irq_off();
1522 BUG_ON(spin_trylock(&cachep->spinlock)); 1841 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
1523#endif 1842#endif
1524} 1843}
1844
1845static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
1846{
1847#ifdef CONFIG_SMP
1848 check_irq_off();
1849 assert_spin_locked(&cachep->nodelists[node]->list_lock);
1850#endif
1851}
1852
1525#else 1853#else
1526#define check_irq_off() do { } while(0) 1854#define check_irq_off() do { } while(0)
1527#define check_irq_on() do { } while(0) 1855#define check_irq_on() do { } while(0)
1528#define check_spinlock_acquired(x) do { } while(0) 1856#define check_spinlock_acquired(x) do { } while(0)
1857#define check_spinlock_acquired_node(x, y) do { } while(0)
1529#endif 1858#endif
1530 1859
1531/* 1860/*
@@ -1547,7 +1876,7 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
1547} 1876}
1548 1877
1549static void drain_array_locked(kmem_cache_t* cachep, 1878static void drain_array_locked(kmem_cache_t* cachep,
1550 struct array_cache *ac, int force); 1879 struct array_cache *ac, int force, int node);
1551 1880
1552static void do_drain(void *arg) 1881static void do_drain(void *arg)
1553{ 1882{
@@ -1556,59 +1885,82 @@ static void do_drain(void *arg)
1556 1885
1557 check_irq_off(); 1886 check_irq_off();
1558 ac = ac_data(cachep); 1887 ac = ac_data(cachep);
1559 spin_lock(&cachep->spinlock); 1888 spin_lock(&cachep->nodelists[numa_node_id()]->list_lock);
1560 free_block(cachep, &ac_entry(ac)[0], ac->avail); 1889 free_block(cachep, ac->entry, ac->avail);
1561 spin_unlock(&cachep->spinlock); 1890 spin_unlock(&cachep->nodelists[numa_node_id()]->list_lock);
1562 ac->avail = 0; 1891 ac->avail = 0;
1563} 1892}
1564 1893
1565static void drain_cpu_caches(kmem_cache_t *cachep) 1894static void drain_cpu_caches(kmem_cache_t *cachep)
1566{ 1895{
1896 struct kmem_list3 *l3;
1897 int node;
1898
1567 smp_call_function_all_cpus(do_drain, cachep); 1899 smp_call_function_all_cpus(do_drain, cachep);
1568 check_irq_on(); 1900 check_irq_on();
1569 spin_lock_irq(&cachep->spinlock); 1901 spin_lock_irq(&cachep->spinlock);
1570 if (cachep->lists.shared) 1902 for_each_online_node(node) {
1571 drain_array_locked(cachep, cachep->lists.shared, 1); 1903 l3 = cachep->nodelists[node];
1904 if (l3) {
1905 spin_lock(&l3->list_lock);
1906 drain_array_locked(cachep, l3->shared, 1, node);
1907 spin_unlock(&l3->list_lock);
1908 if (l3->alien)
1909 drain_alien_cache(cachep, l3);
1910 }
1911 }
1572 spin_unlock_irq(&cachep->spinlock); 1912 spin_unlock_irq(&cachep->spinlock);
1573} 1913}
1574 1914
1575 1915static int __node_shrink(kmem_cache_t *cachep, int node)
1576/* NUMA shrink all list3s */
1577static int __cache_shrink(kmem_cache_t *cachep)
1578{ 1916{
1579 struct slab *slabp; 1917 struct slab *slabp;
1918 struct kmem_list3 *l3 = cachep->nodelists[node];
1580 int ret; 1919 int ret;
1581 1920
1582 drain_cpu_caches(cachep); 1921 for (;;) {
1583
1584 check_irq_on();
1585 spin_lock_irq(&cachep->spinlock);
1586
1587 for(;;) {
1588 struct list_head *p; 1922 struct list_head *p;
1589 1923
1590 p = cachep->lists.slabs_free.prev; 1924 p = l3->slabs_free.prev;
1591 if (p == &cachep->lists.slabs_free) 1925 if (p == &l3->slabs_free)
1592 break; 1926 break;
1593 1927
1594 slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list); 1928 slabp = list_entry(l3->slabs_free.prev, struct slab, list);
1595#if DEBUG 1929#if DEBUG
1596 if (slabp->inuse) 1930 if (slabp->inuse)
1597 BUG(); 1931 BUG();
1598#endif 1932#endif
1599 list_del(&slabp->list); 1933 list_del(&slabp->list);
1600 1934
1601 cachep->lists.free_objects -= cachep->num; 1935 l3->free_objects -= cachep->num;
1602 spin_unlock_irq(&cachep->spinlock); 1936 spin_unlock_irq(&l3->list_lock);
1603 slab_destroy(cachep, slabp); 1937 slab_destroy(cachep, slabp);
1604 spin_lock_irq(&cachep->spinlock); 1938 spin_lock_irq(&l3->list_lock);
1605 } 1939 }
1606 ret = !list_empty(&cachep->lists.slabs_full) || 1940 ret = !list_empty(&l3->slabs_full) ||
1607 !list_empty(&cachep->lists.slabs_partial); 1941 !list_empty(&l3->slabs_partial);
1608 spin_unlock_irq(&cachep->spinlock);
1609 return ret; 1942 return ret;
1610} 1943}
1611 1944
1945static int __cache_shrink(kmem_cache_t *cachep)
1946{
1947 int ret = 0, i = 0;
1948 struct kmem_list3 *l3;
1949
1950 drain_cpu_caches(cachep);
1951
1952 check_irq_on();
1953 for_each_online_node(i) {
1954 l3 = cachep->nodelists[i];
1955 if (l3) {
1956 spin_lock_irq(&l3->list_lock);
1957 ret += __node_shrink(cachep, i);
1958 spin_unlock_irq(&l3->list_lock);
1959 }
1960 }
1961 return (ret ? 1 : 0);
1962}
1963
1612/** 1964/**
1613 * kmem_cache_shrink - Shrink a cache. 1965 * kmem_cache_shrink - Shrink a cache.
1614 * @cachep: The cache to shrink. 1966 * @cachep: The cache to shrink.
@@ -1645,6 +1997,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
1645int kmem_cache_destroy(kmem_cache_t * cachep) 1997int kmem_cache_destroy(kmem_cache_t * cachep)
1646{ 1998{
1647 int i; 1999 int i;
2000 struct kmem_list3 *l3;
1648 2001
1649 if (!cachep || in_interrupt()) 2002 if (!cachep || in_interrupt())
1650 BUG(); 2003 BUG();
@@ -1672,15 +2025,17 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
1672 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2025 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
1673 synchronize_rcu(); 2026 synchronize_rcu();
1674 2027
1675 /* no cpu_online check required here since we clear the percpu 2028 for_each_online_cpu(i)
1676 * array on cpu offline and set this to NULL.
1677 */
1678 for (i = 0; i < NR_CPUS; i++)
1679 kfree(cachep->array[i]); 2029 kfree(cachep->array[i]);
1680 2030
1681 /* NUMA: free the list3 structures */ 2031 /* NUMA: free the list3 structures */
1682 kfree(cachep->lists.shared); 2032 for_each_online_node(i) {
1683 cachep->lists.shared = NULL; 2033 if ((l3 = cachep->nodelists[i])) {
2034 kfree(l3->shared);
2035 free_alien_cache(l3->alien);
2036 kfree(l3);
2037 }
2038 }
1684 kmem_cache_free(&cache_cache, cachep); 2039 kmem_cache_free(&cache_cache, cachep);
1685 2040
1686 unlock_cpu_hotplug(); 2041 unlock_cpu_hotplug();
@@ -1690,8 +2045,8 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
1690EXPORT_SYMBOL(kmem_cache_destroy); 2045EXPORT_SYMBOL(kmem_cache_destroy);
1691 2046
1692/* Get the memory for a slab management obj. */ 2047/* Get the memory for a slab management obj. */
1693static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, 2048static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
1694 void *objp, int colour_off, unsigned int __nocast local_flags) 2049 int colour_off, unsigned int __nocast local_flags)
1695{ 2050{
1696 struct slab *slabp; 2051 struct slab *slabp;
1697 2052
@@ -1722,7 +2077,7 @@ static void cache_init_objs(kmem_cache_t *cachep,
1722 int i; 2077 int i;
1723 2078
1724 for (i = 0; i < cachep->num; i++) { 2079 for (i = 0; i < cachep->num; i++) {
1725 void* objp = slabp->s_mem+cachep->objsize*i; 2080 void *objp = slabp->s_mem+cachep->objsize*i;
1726#if DEBUG 2081#if DEBUG
1727 /* need to poison the objs? */ 2082 /* need to poison the objs? */
1728 if (cachep->flags & SLAB_POISON) 2083 if (cachep->flags & SLAB_POISON)
@@ -1799,6 +2154,7 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
1799 size_t offset; 2154 size_t offset;
1800 unsigned int local_flags; 2155 unsigned int local_flags;
1801 unsigned long ctor_flags; 2156 unsigned long ctor_flags;
2157 struct kmem_list3 *l3;
1802 2158
1803 /* Be lazy and only check for valid flags here, 2159 /* Be lazy and only check for valid flags here,
1804 * keeping it out of the critical path in kmem_cache_alloc(). 2160 * keeping it out of the critical path in kmem_cache_alloc().
@@ -1830,6 +2186,7 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
1830 2186
1831 spin_unlock(&cachep->spinlock); 2187 spin_unlock(&cachep->spinlock);
1832 2188
2189 check_irq_off();
1833 if (local_flags & __GFP_WAIT) 2190 if (local_flags & __GFP_WAIT)
1834 local_irq_enable(); 2191 local_irq_enable();
1835 2192
@@ -1841,8 +2198,9 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
1841 */ 2198 */
1842 kmem_flagcheck(cachep, flags); 2199 kmem_flagcheck(cachep, flags);
1843 2200
1844 2201 /* Get mem for the objs.
1845 /* Get mem for the objs. */ 2202 * Attempt to allocate a physical page from 'nodeid',
2203 */
1846 if (!(objp = kmem_getpages(cachep, flags, nodeid))) 2204 if (!(objp = kmem_getpages(cachep, flags, nodeid)))
1847 goto failed; 2205 goto failed;
1848 2206
@@ -1850,6 +2208,7 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
1850 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) 2208 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
1851 goto opps1; 2209 goto opps1;
1852 2210
2211 slabp->nodeid = nodeid;
1853 set_slab_attr(cachep, slabp, objp); 2212 set_slab_attr(cachep, slabp, objp);
1854 2213
1855 cache_init_objs(cachep, slabp, ctor_flags); 2214 cache_init_objs(cachep, slabp, ctor_flags);
@@ -1857,13 +2216,14 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
1857 if (local_flags & __GFP_WAIT) 2216 if (local_flags & __GFP_WAIT)
1858 local_irq_disable(); 2217 local_irq_disable();
1859 check_irq_off(); 2218 check_irq_off();
1860 spin_lock(&cachep->spinlock); 2219 l3 = cachep->nodelists[nodeid];
2220 spin_lock(&l3->list_lock);
1861 2221
1862 /* Make slab active. */ 2222 /* Make slab active. */
1863 list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free)); 2223 list_add_tail(&slabp->list, &(l3->slabs_free));
1864 STATS_INC_GROWN(cachep); 2224 STATS_INC_GROWN(cachep);
1865 list3_data(cachep)->free_objects += cachep->num; 2225 l3->free_objects += cachep->num;
1866 spin_unlock(&cachep->spinlock); 2226 spin_unlock(&l3->list_lock);
1867 return 1; 2227 return 1;
1868opps1: 2228opps1:
1869 kmem_freepages(cachep, objp); 2229 kmem_freepages(cachep, objp);
@@ -1969,7 +2329,6 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
1969 kmem_bufctl_t i; 2329 kmem_bufctl_t i;
1970 int entries = 0; 2330 int entries = 0;
1971 2331
1972 check_spinlock_acquired(cachep);
1973 /* Check slab's freelist to see if this obj is there. */ 2332 /* Check slab's freelist to see if this obj is there. */
1974 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2333 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
1975 entries++; 2334 entries++;
@@ -2012,10 +2371,11 @@ retry:
2012 */ 2371 */
2013 batchcount = BATCHREFILL_LIMIT; 2372 batchcount = BATCHREFILL_LIMIT;
2014 } 2373 }
2015 l3 = list3_data(cachep); 2374 l3 = cachep->nodelists[numa_node_id()];
2375
2376 BUG_ON(ac->avail > 0 || !l3);
2377 spin_lock(&l3->list_lock);
2016 2378
2017 BUG_ON(ac->avail > 0);
2018 spin_lock(&cachep->spinlock);
2019 if (l3->shared) { 2379 if (l3->shared) {
2020 struct array_cache *shared_array = l3->shared; 2380 struct array_cache *shared_array = l3->shared;
2021 if (shared_array->avail) { 2381 if (shared_array->avail) {
@@ -2023,8 +2383,9 @@ retry:
2023 batchcount = shared_array->avail; 2383 batchcount = shared_array->avail;
2024 shared_array->avail -= batchcount; 2384 shared_array->avail -= batchcount;
2025 ac->avail = batchcount; 2385 ac->avail = batchcount;
2026 memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail], 2386 memcpy(ac->entry,
2027 sizeof(void*)*batchcount); 2387 &(shared_array->entry[shared_array->avail]),
2388 sizeof(void*)*batchcount);
2028 shared_array->touched = 1; 2389 shared_array->touched = 1;
2029 goto alloc_done; 2390 goto alloc_done;
2030 } 2391 }
@@ -2051,7 +2412,8 @@ retry:
2051 STATS_SET_HIGH(cachep); 2412 STATS_SET_HIGH(cachep);
2052 2413
2053 /* get obj pointer */ 2414 /* get obj pointer */
2054 ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize; 2415 ac->entry[ac->avail++] = slabp->s_mem +
2416 slabp->free*cachep->objsize;
2055 2417
2056 slabp->inuse++; 2418 slabp->inuse++;
2057 next = slab_bufctl(slabp)[slabp->free]; 2419 next = slab_bufctl(slabp)[slabp->free];
@@ -2073,12 +2435,12 @@ retry:
2073must_grow: 2435must_grow:
2074 l3->free_objects -= ac->avail; 2436 l3->free_objects -= ac->avail;
2075alloc_done: 2437alloc_done:
2076 spin_unlock(&cachep->spinlock); 2438 spin_unlock(&l3->list_lock);
2077 2439
2078 if (unlikely(!ac->avail)) { 2440 if (unlikely(!ac->avail)) {
2079 int x; 2441 int x;
2080 x = cache_grow(cachep, flags, -1); 2442 x = cache_grow(cachep, flags, numa_node_id());
2081 2443
2082 // cache_grow can reenable interrupts, then ac could change. 2444 // cache_grow can reenable interrupts, then ac could change.
2083 ac = ac_data(cachep); 2445 ac = ac_data(cachep);
2084 if (!x && ac->avail == 0) // no objects in sight? abort 2446 if (!x && ac->avail == 0) // no objects in sight? abort
@@ -2088,7 +2450,7 @@ alloc_done:
2088 goto retry; 2450 goto retry;
2089 } 2451 }
2090 ac->touched = 1; 2452 ac->touched = 1;
2091 return ac_entry(ac)[--ac->avail]; 2453 return ac->entry[--ac->avail];
2092} 2454}
2093 2455
2094static inline void 2456static inline void
@@ -2160,7 +2522,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast fl
2160 if (likely(ac->avail)) { 2522 if (likely(ac->avail)) {
2161 STATS_INC_ALLOCHIT(cachep); 2523 STATS_INC_ALLOCHIT(cachep);
2162 ac->touched = 1; 2524 ac->touched = 1;
2163 objp = ac_entry(ac)[--ac->avail]; 2525 objp = ac->entry[--ac->avail];
2164 } else { 2526 } else {
2165 STATS_INC_ALLOCMISS(cachep); 2527 STATS_INC_ALLOCMISS(cachep);
2166 objp = cache_alloc_refill(cachep, flags); 2528 objp = cache_alloc_refill(cachep, flags);
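The fast path above pops the most recently freed pointer from the per-cpu array cache; on a miss, cache_alloc_refill() prefers to top the array up with a single memcpy() of batchcount pointers taken from the node's shared array. A toy model of that array cache, assuming nothing about the real struct layout (toy_array_cache, AC_LIMIT and the helper names are invented, and batchcount is assumed to be at most AC_LIMIT):

#include <stddef.h>
#include <string.h>

#define AC_LIMIT 32

struct toy_array_cache {
    unsigned int avail;               /* number of cached pointers */
    unsigned int batchcount;          /* transfer size on refill/flush */
    void *entry[AC_LIMIT];            /* LIFO stack of object pointers */
};

/* Allocation fast path: reuse the most recently freed (cache-warm) object. */
static void *toy_alloc(struct toy_array_cache *ac)
{
    return ac->avail ? ac->entry[--ac->avail] : NULL;  /* NULL => refill needed */
}

/* Refill: move up to batchcount pointers from a shared cache in one memcpy,
 * the same bulk transfer cache_alloc_refill() performs above. */
static unsigned int toy_refill(struct toy_array_cache *ac,
                               struct toy_array_cache *shared)
{
    unsigned int batch = ac->batchcount;

    if (batch > shared->avail)
        batch = shared->avail;
    shared->avail -= batch;
    memcpy(ac->entry, &shared->entry[shared->avail], sizeof(void *) * batch);
    ac->avail = batch;
    return batch;
}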
@@ -2172,33 +2534,104 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast fl
2172 return objp; 2534 return objp;
2173} 2535}
2174 2536
2175/* 2537#ifdef CONFIG_NUMA
2176 * NUMA: different approach needed if the spinlock is moved into 2539 * An interface to enable slab creation on nodeid
2177 * the l3 structure 2539 * A interface to enable slab creation on nodeid
2178 */ 2540 */
2541static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
2542{
2543 struct list_head *entry;
2544 struct slab *slabp;
2545 struct kmem_list3 *l3;
2546 void *obj;
2547 kmem_bufctl_t next;
2548 int x;
2549
2550 l3 = cachep->nodelists[nodeid];
2551 BUG_ON(!l3);
2552
2553retry:
2554 spin_lock(&l3->list_lock);
2555 entry = l3->slabs_partial.next;
2556 if (entry == &l3->slabs_partial) {
2557 l3->free_touched = 1;
2558 entry = l3->slabs_free.next;
2559 if (entry == &l3->slabs_free)
2560 goto must_grow;
2561 }
2562
2563 slabp = list_entry(entry, struct slab, list);
2564 check_spinlock_acquired_node(cachep, nodeid);
2565 check_slabp(cachep, slabp);
2566
2567 STATS_INC_NODEALLOCS(cachep);
2568 STATS_INC_ACTIVE(cachep);
2569 STATS_SET_HIGH(cachep);
2570
2571 BUG_ON(slabp->inuse == cachep->num);
2572
2573 /* get obj pointer */
2574 obj = slabp->s_mem + slabp->free*cachep->objsize;
2575 slabp->inuse++;
2576 next = slab_bufctl(slabp)[slabp->free];
2577#if DEBUG
2578 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2579#endif
2580 slabp->free = next;
2581 check_slabp(cachep, slabp);
2582 l3->free_objects--;
2583 /* move slabp to correct slabp list: */
2584 list_del(&slabp->list);
2179 2585
2586 if (slabp->free == BUFCTL_END) {
2587 list_add(&slabp->list, &l3->slabs_full);
2588 } else {
2589 list_add(&slabp->list, &l3->slabs_partial);
2590 }
2591
2592 spin_unlock(&l3->list_lock);
2593 goto done;
2594
2595must_grow:
2596 spin_unlock(&l3->list_lock);
2597 x = cache_grow(cachep, flags, nodeid);
2598
2599 if (!x)
2600 return NULL;
2601
2602 goto retry;
2603done:
2604 return obj;
2605}
2606#endif
2607
2608/*
2609 * Caller needs to acquire correct kmem_list's list_lock
2610 */
2180static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects) 2611static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
2181{ 2612{
2182 int i; 2613 int i;
2183 2614 struct kmem_list3 *l3;
2184 check_spinlock_acquired(cachep);
2185
2186 /* NUMA: move add into loop */
2187 cachep->lists.free_objects += nr_objects;
2188 2615
2189 for (i = 0; i < nr_objects; i++) { 2616 for (i = 0; i < nr_objects; i++) {
2190 void *objp = objpp[i]; 2617 void *objp = objpp[i];
2191 struct slab *slabp; 2618 struct slab *slabp;
2192 unsigned int objnr; 2619 unsigned int objnr;
2620 int nodeid = 0;
2193 2621
2194 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2622 slabp = GET_PAGE_SLAB(virt_to_page(objp));
2623 nodeid = slabp->nodeid;
2624 l3 = cachep->nodelists[nodeid];
2195 list_del(&slabp->list); 2625 list_del(&slabp->list);
2196 objnr = (objp - slabp->s_mem) / cachep->objsize; 2626 objnr = (objp - slabp->s_mem) / cachep->objsize;
2627 check_spinlock_acquired_node(cachep, nodeid);
2197 check_slabp(cachep, slabp); 2628 check_slabp(cachep, slabp);
2629
2630
2198#if DEBUG 2631#if DEBUG
2199 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2632 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2200 printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n", 2633 printk(KERN_ERR "slab: double free detected in cache "
2201 cachep->name, objp); 2634 "'%s', objp %p\n", cachep->name, objp);
2202 BUG(); 2635 BUG();
2203 } 2636 }
2204#endif 2637#endif
@@ -2206,24 +2639,23 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
2206 slabp->free = objnr; 2639 slabp->free = objnr;
2207 STATS_DEC_ACTIVE(cachep); 2640 STATS_DEC_ACTIVE(cachep);
2208 slabp->inuse--; 2641 slabp->inuse--;
2642 l3->free_objects++;
2209 check_slabp(cachep, slabp); 2643 check_slabp(cachep, slabp);
2210 2644
2211 /* fixup slab chains */ 2645 /* fixup slab chains */
2212 if (slabp->inuse == 0) { 2646 if (slabp->inuse == 0) {
2213 if (cachep->lists.free_objects > cachep->free_limit) { 2647 if (l3->free_objects > l3->free_limit) {
2214 cachep->lists.free_objects -= cachep->num; 2648 l3->free_objects -= cachep->num;
2215 slab_destroy(cachep, slabp); 2649 slab_destroy(cachep, slabp);
2216 } else { 2650 } else {
2217 list_add(&slabp->list, 2651 list_add(&slabp->list, &l3->slabs_free);
2218 &list3_data_ptr(cachep, objp)->slabs_free);
2219 } 2652 }
2220 } else { 2653 } else {
2221 /* Unconditionally move a slab to the end of the 2654 /* Unconditionally move a slab to the end of the
2222 * partial list on free - maximum time for the 2655 * partial list on free - maximum time for the
2223 * other objects to be freed, too. 2656 * other objects to be freed, too.
2224 */ 2657 */
2225 list_add_tail(&slabp->list, 2658 list_add_tail(&slabp->list, &l3->slabs_partial);
2226 &list3_data_ptr(cachep, objp)->slabs_partial);
2227 } 2659 }
2228 } 2660 }
2229} 2661}
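free_block() now derives the owning node from slabp->nodeid and must be entered with that node's list_lock already held; after putting each object back it re-files the slab according to how many objects remain in use. That placement rule as a stand-alone decision function (the enum and names are invented for the sketch):

enum slab_dest { DESTROY_SLAB, TO_FREE_LIST, TAIL_OF_PARTIAL };

static enum slab_dest place_after_free(unsigned int inuse_after_free,
                                       unsigned long node_free_objects,
                                       unsigned long node_free_limit)
{
    if (inuse_after_free == 0) {
        /* Completely free slab: keep it only while the node is under its
         * free_limit, otherwise hand the pages back to the page allocator. */
        if (node_free_objects > node_free_limit)
            return DESTROY_SLAB;
        return TO_FREE_LIST;
    }
    /* A slab with live objects goes to the tail of the partial list, giving
     * the remaining objects maximum time to be freed as well. */
    return TAIL_OF_PARTIAL;
}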
@@ -2231,36 +2663,38 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
2231static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) 2663static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2232{ 2664{
2233 int batchcount; 2665 int batchcount;
2666 struct kmem_list3 *l3;
2234 2667
2235 batchcount = ac->batchcount; 2668 batchcount = ac->batchcount;
2236#if DEBUG 2669#if DEBUG
2237 BUG_ON(!batchcount || batchcount > ac->avail); 2670 BUG_ON(!batchcount || batchcount > ac->avail);
2238#endif 2671#endif
2239 check_irq_off(); 2672 check_irq_off();
2240 spin_lock(&cachep->spinlock); 2673 l3 = cachep->nodelists[numa_node_id()];
2241 if (cachep->lists.shared) { 2674 spin_lock(&l3->list_lock);
2242 struct array_cache *shared_array = cachep->lists.shared; 2675 if (l3->shared) {
2676 struct array_cache *shared_array = l3->shared;
2243 int max = shared_array->limit-shared_array->avail; 2677 int max = shared_array->limit-shared_array->avail;
2244 if (max) { 2678 if (max) {
2245 if (batchcount > max) 2679 if (batchcount > max)
2246 batchcount = max; 2680 batchcount = max;
2247 memcpy(&ac_entry(shared_array)[shared_array->avail], 2681 memcpy(&(shared_array->entry[shared_array->avail]),
2248 &ac_entry(ac)[0], 2682 ac->entry,
2249 sizeof(void*)*batchcount); 2683 sizeof(void*)*batchcount);
2250 shared_array->avail += batchcount; 2684 shared_array->avail += batchcount;
2251 goto free_done; 2685 goto free_done;
2252 } 2686 }
2253 } 2687 }
2254 2688
2255 free_block(cachep, &ac_entry(ac)[0], batchcount); 2689 free_block(cachep, ac->entry, batchcount);
2256free_done: 2690free_done:
2257#if STATS 2691#if STATS
2258 { 2692 {
2259 int i = 0; 2693 int i = 0;
2260 struct list_head *p; 2694 struct list_head *p;
2261 2695
2262 p = list3_data(cachep)->slabs_free.next; 2696 p = l3->slabs_free.next;
2263 while (p != &(list3_data(cachep)->slabs_free)) { 2697 while (p != &(l3->slabs_free)) {
2264 struct slab *slabp; 2698 struct slab *slabp;
2265 2699
2266 slabp = list_entry(p, struct slab, list); 2700 slabp = list_entry(p, struct slab, list);
@@ -2272,12 +2706,13 @@ free_done:
2272 STATS_SET_FREEABLE(cachep, i); 2706 STATS_SET_FREEABLE(cachep, i);
2273 } 2707 }
2274#endif 2708#endif
2275 spin_unlock(&cachep->spinlock); 2709 spin_unlock(&l3->list_lock);
2276 ac->avail -= batchcount; 2710 ac->avail -= batchcount;
2277 memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount], 2711 memmove(ac->entry, &(ac->entry[batchcount]),
2278 sizeof(void*)*ac->avail); 2712 sizeof(void*)*ac->avail);
2279} 2713}
2280 2714
2715
2281/* 2716/*
2282 * __cache_free 2717 * __cache_free
2283 * Release an obj back to its cache. If the obj has a constructed 2718 * Release an obj back to its cache. If the obj has a constructed
@@ -2292,14 +2727,46 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2292 check_irq_off(); 2727 check_irq_off();
2293 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 2728 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
2294 2729
2730 /* Make sure we are not freeing an object from another
2731 * node to the array cache on this cpu.
2732 */
2733#ifdef CONFIG_NUMA
2734 {
2735 struct slab *slabp;
2736 slabp = GET_PAGE_SLAB(virt_to_page(objp));
2737 if (unlikely(slabp->nodeid != numa_node_id())) {
2738 struct array_cache *alien = NULL;
2739 int nodeid = slabp->nodeid;
2740 struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()];
2741
2742 STATS_INC_NODEFREES(cachep);
2743 if (l3->alien && l3->alien[nodeid]) {
2744 alien = l3->alien[nodeid];
2745 spin_lock(&alien->lock);
2746 if (unlikely(alien->avail == alien->limit))
2747 __drain_alien_cache(cachep,
2748 alien, nodeid);
2749 alien->entry[alien->avail++] = objp;
2750 spin_unlock(&alien->lock);
2751 } else {
2752 spin_lock(&(cachep->nodelists[nodeid])->
2753 list_lock);
2754 free_block(cachep, &objp, 1);
2755 spin_unlock(&(cachep->nodelists[nodeid])->
2756 list_lock);
2757 }
2758 return;
2759 }
2760 }
2761#endif
2295 if (likely(ac->avail < ac->limit)) { 2762 if (likely(ac->avail < ac->limit)) {
2296 STATS_INC_FREEHIT(cachep); 2763 STATS_INC_FREEHIT(cachep);
2297 ac_entry(ac)[ac->avail++] = objp; 2764 ac->entry[ac->avail++] = objp;
2298 return; 2765 return;
2299 } else { 2766 } else {
2300 STATS_INC_FREEMISS(cachep); 2767 STATS_INC_FREEMISS(cachep);
2301 cache_flusharray(cachep, ac); 2768 cache_flusharray(cachep, ac);
2302 ac_entry(ac)[ac->avail++] = objp; 2769 ac->entry[ac->avail++] = objp;
2303 } 2770 }
2304} 2771}
2305 2772
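On NUMA, __cache_free() above first checks whether the object belongs to the local node. Remote objects are either parked in the local node's alien cache for the owning node (drained in a batch when it fills up) or, if no alien cache exists, freed immediately under the owning node's list_lock. The routing decision reduced to a sketch; toy_alien and the enum names are invented:

struct toy_alien { unsigned int avail, limit; void **entry; };

enum free_path { LOCAL_ARRAY, ALIEN_CACHE, REMOTE_FREE_BLOCK };

static enum free_path route_free(int obj_node, int this_node,
                                 struct toy_alien *alien_for_node)
{
    if (obj_node == this_node)
        return LOCAL_ARRAY;           /* normal per-cpu fast path */
    if (alien_for_node)
        return ALIEN_CACHE;           /* batch it, drain when avail == limit */
    return REMOTE_FREE_BLOCK;         /* take the owning node's list_lock */
}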
@@ -2369,81 +2836,30 @@ out:
2369 * Identical to kmem_cache_alloc, except that this function is slow 2836 * Identical to kmem_cache_alloc, except that this function is slow
2370 * and can sleep. And it will allocate memory on the given node, which 2837 * and can sleep. And it will allocate memory on the given node, which
2371 * can improve the performance for cpu bound structures. 2838 * can improve the performance for cpu bound structures.
2839 * New and improved: it will now make sure that the object gets
2840 * put on the correct node list so that there is no false sharing.
2372 */ 2841 */
2373void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid) 2842void *kmem_cache_alloc_node(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
2374{ 2843{
2375 int loop; 2844 unsigned long save_flags;
2376 void *objp; 2845 void *ptr;
2377 struct slab *slabp;
2378 kmem_bufctl_t next;
2379
2380 if (nodeid == -1)
2381 return kmem_cache_alloc(cachep, flags);
2382
2383 for (loop = 0;;loop++) {
2384 struct list_head *q;
2385
2386 objp = NULL;
2387 check_irq_on();
2388 spin_lock_irq(&cachep->spinlock);
2389 /* walk through all partial and empty slab and find one
2390 * from the right node */
2391 list_for_each(q,&cachep->lists.slabs_partial) {
2392 slabp = list_entry(q, struct slab, list);
2393
2394 if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
2395 loop > 2)
2396 goto got_slabp;
2397 }
2398 list_for_each(q, &cachep->lists.slabs_free) {
2399 slabp = list_entry(q, struct slab, list);
2400 2846
2401 if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid || 2847 if (nodeid == numa_node_id() || nodeid == -1)
2402 loop > 2) 2848 return __cache_alloc(cachep, flags);
2403 goto got_slabp;
2404 }
2405 spin_unlock_irq(&cachep->spinlock);
2406 2849
2407 local_irq_disable(); 2850 if (unlikely(!cachep->nodelists[nodeid])) {
2408 if (!cache_grow(cachep, flags, nodeid)) { 2851 /* Fall back to __cache_alloc if we run into trouble */
2409 local_irq_enable(); 2852 printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name);
2410 return NULL; 2853 return __cache_alloc(cachep, flags);
2411 }
2412 local_irq_enable();
2413 } 2854 }
2414got_slabp:
2415 /* found one: allocate object */
2416 check_slabp(cachep, slabp);
2417 check_spinlock_acquired(cachep);
2418
2419 STATS_INC_ALLOCED(cachep);
2420 STATS_INC_ACTIVE(cachep);
2421 STATS_SET_HIGH(cachep);
2422 STATS_INC_NODEALLOCS(cachep);
2423 2855
2424 objp = slabp->s_mem + slabp->free*cachep->objsize; 2856 cache_alloc_debugcheck_before(cachep, flags);
2425 2857 local_irq_save(save_flags);
2426 slabp->inuse++; 2858 ptr = __cache_alloc_node(cachep, flags, nodeid);
2427 next = slab_bufctl(slabp)[slabp->free]; 2859 local_irq_restore(save_flags);
2428#if DEBUG 2860 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0));
2429 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2430#endif
2431 slabp->free = next;
2432 check_slabp(cachep, slabp);
2433
2434 /* move slabp to correct slabp list: */
2435 list_del(&slabp->list);
2436 if (slabp->free == BUFCTL_END)
2437 list_add(&slabp->list, &cachep->lists.slabs_full);
2438 else
2439 list_add(&slabp->list, &cachep->lists.slabs_partial);
2440
2441 list3_data(cachep)->free_objects--;
2442 spin_unlock_irq(&cachep->spinlock);
2443 2861
2444 objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp, 2862 return ptr;
2445 __builtin_return_address(0));
2446 return objp;
2447} 2863}
2448EXPORT_SYMBOL(kmem_cache_alloc_node); 2864EXPORT_SYMBOL(kmem_cache_alloc_node);
2449 2865
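The rewritten kmem_cache_alloc_node() is essentially a dispatcher: the local node (or nodeid == -1) takes the ordinary fast path, an uninitialised node falls back with a warning, and everything else goes through the per-node slab lists. A compact restatement of that dispatch; the enum and function names are invented:

enum alloc_path { LOCAL_FAST_PATH, FALLBACK_WITH_WARNING, PER_NODE_LISTS };

static enum alloc_path pick_alloc_path(int nodeid, int local_node,
                                       int node_initialised)
{
    if (nodeid == local_node || nodeid == -1)
        return LOCAL_FAST_PATH;       /* plain __cache_alloc() */
    if (!node_initialised)
        return FALLBACK_WITH_WARNING; /* cachep->nodelists[nodeid] == NULL */
    return PER_NODE_LISTS;            /* __cache_alloc_node() */
}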
@@ -2513,11 +2929,18 @@ void *__alloc_percpu(size_t size, size_t align)
2513 if (!pdata) 2929 if (!pdata)
2514 return NULL; 2930 return NULL;
2515 2931
2516 for (i = 0; i < NR_CPUS; i++) { 2932 /*
2517 if (!cpu_possible(i)) 2933 * Cannot use for_each_online_cpu since a cpu may come online
2518 continue; 2934 * and we have no way of figuring out how to fix the array
2519 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, 2935 * that we have allocated then....
2520 cpu_to_node(i)); 2936 */
2937 for_each_cpu(i) {
2938 int node = cpu_to_node(i);
2939
2940 if (node_online(node))
2941 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
2942 else
2943 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
2521 2944
2522 if (!pdata->ptrs[i]) 2945 if (!pdata->ptrs[i])
2523 goto unwind_oom; 2946 goto unwind_oom;
@@ -2575,6 +2998,8 @@ EXPORT_SYMBOL(kzalloc);
2575 * kfree - free previously allocated memory 2998 * kfree - free previously allocated memory
2576 * @objp: pointer returned by kmalloc. 2999 * @objp: pointer returned by kmalloc.
2577 * 3000 *
3001 * If @objp is NULL, no operation is performed.
3002 *
2578 * Don't free memory not originally allocated by kmalloc() 3003 * Don't free memory not originally allocated by kmalloc()
2579 * or you will run into trouble. 3004 * or you will run into trouble.
2580 */ 3005 */
@@ -2607,11 +3032,11 @@ free_percpu(const void *objp)
2607 int i; 3032 int i;
2608 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); 3033 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
2609 3034
2610 for (i = 0; i < NR_CPUS; i++) { 3035 /*
2611 if (!cpu_possible(i)) 3036 * We allocated for all cpus, so we cannot iterate over only the online cpus here.
2612 continue; 3037 */
3038 for_each_cpu(i)
2613 kfree(p->ptrs[i]); 3039 kfree(p->ptrs[i]);
2614 }
2615 kfree(p); 3040 kfree(p);
2616} 3041}
2617EXPORT_SYMBOL(free_percpu); 3042EXPORT_SYMBOL(free_percpu);
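Both __alloc_percpu() and free_percpu() now walk every possible cpu rather than only the online ones, so a cpu that comes online later cannot find an unallocated slot. A userspace model of that pairing, with malloc/free standing in for kmalloc_node()/kfree() and an invented NR_TOY_CPUS:

#include <stdlib.h>

#define NR_TOY_CPUS 8                     /* "possible" cpus in this model */

struct toy_percpu { void *ptrs[NR_TOY_CPUS]; };

static struct toy_percpu *toy_alloc_percpu(size_t size)
{
    struct toy_percpu *p = calloc(1, sizeof(*p));
    int i;

    if (!p)
        return NULL;
    for (i = 0; i < NR_TOY_CPUS; i++) {   /* every possible cpu gets a buffer */
        p->ptrs[i] = malloc(size);
        if (!p->ptrs[i])
            goto unwind_oom;
    }
    return p;

unwind_oom:
    while (--i >= 0)                      /* free only what was already allocated */
        free(p->ptrs[i]);
    free(p);
    return NULL;
}

static void toy_free_percpu(struct toy_percpu *p)
{
    int i;

    for (i = 0; i < NR_TOY_CPUS; i++)     /* must match: all possible cpus */
        free(p->ptrs[i]);
    free(p);
}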
@@ -2629,6 +3054,64 @@ const char *kmem_cache_name(kmem_cache_t *cachep)
2629} 3054}
2630EXPORT_SYMBOL_GPL(kmem_cache_name); 3055EXPORT_SYMBOL_GPL(kmem_cache_name);
2631 3056
3057/*
3058 * This initializes kmem_list3 for all nodes.
3059 */
3060static int alloc_kmemlist(kmem_cache_t *cachep)
3061{
3062 int node;
3063 struct kmem_list3 *l3;
3064 int err = 0;
3065
3066 for_each_online_node(node) {
3067 struct array_cache *nc = NULL, *new;
3068 struct array_cache **new_alien = NULL;
3069#ifdef CONFIG_NUMA
3070 if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
3071 goto fail;
3072#endif
3073 if (!(new = alloc_arraycache(node, (cachep->shared*
3074 cachep->batchcount), 0xbaadf00d)))
3075 goto fail;
3076 if ((l3 = cachep->nodelists[node])) {
3077
3078 spin_lock_irq(&l3->list_lock);
3079
3080 if ((nc = cachep->nodelists[node]->shared))
3081 free_block(cachep, nc->entry,
3082 nc->avail);
3083
3084 l3->shared = new;
3085 if (!cachep->nodelists[node]->alien) {
3086 l3->alien = new_alien;
3087 new_alien = NULL;
3088 }
3089 l3->free_limit = (1 + nr_cpus_node(node))*
3090 cachep->batchcount + cachep->num;
3091 spin_unlock_irq(&l3->list_lock);
3092 kfree(nc);
3093 free_alien_cache(new_alien);
3094 continue;
3095 }
3096 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
3097 GFP_KERNEL, node)))
3098 goto fail;
3099
3100 kmem_list3_init(l3);
3101 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3102 ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
3103 l3->shared = new;
3104 l3->alien = new_alien;
3105 l3->free_limit = (1 + nr_cpus_node(node))*
3106 cachep->batchcount + cachep->num;
3107 cachep->nodelists[node] = l3;
3108 }
3109 return err;
3110fail:
3111 err = -ENOMEM;
3112 return err;
3113}
3114
2632struct ccupdate_struct { 3115struct ccupdate_struct {
2633 kmem_cache_t *cachep; 3116 kmem_cache_t *cachep;
2634 struct array_cache *new[NR_CPUS]; 3117 struct array_cache *new[NR_CPUS];
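alloc_kmemlist() sizes each node's free_limit as (1 + cpus on the node) * batchcount + objects per slab. The same arithmetic as a worked example; the numbers are invented:

static unsigned long node_free_limit(int cpus_on_node,
                                     unsigned int batchcount,
                                     unsigned int objs_per_slab)
{
    /* e.g. 4 cpus on the node, batchcount 16, 30 objects per slab:
     * (1 + 4) * 16 + 30 = 110 free objects may accumulate on the node
     * before completely free slabs start being destroyed. */
    return (1 + cpus_on_node) * batchcount + objs_per_slab;
}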
@@ -2641,7 +3124,7 @@ static void do_ccupdate_local(void *info)
2641 3124
2642 check_irq_off(); 3125 check_irq_off();
2643 old = ac_data(new->cachep); 3126 old = ac_data(new->cachep);
2644 3127
2645 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3128 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
2646 new->new[smp_processor_id()] = old; 3129 new->new[smp_processor_id()] = old;
2647} 3130}
@@ -2651,54 +3134,43 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
2651 int shared) 3134 int shared)
2652{ 3135{
2653 struct ccupdate_struct new; 3136 struct ccupdate_struct new;
2654 struct array_cache *new_shared; 3137 int i, err;
2655 int i;
2656 3138
2657 memset(&new.new,0,sizeof(new.new)); 3139 memset(&new.new,0,sizeof(new.new));
2658 for (i = 0; i < NR_CPUS; i++) { 3140 for_each_online_cpu(i) {
2659 if (cpu_online(i)) { 3141 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount);
2660 new.new[i] = alloc_arraycache(i, limit, batchcount); 3142 if (!new.new[i]) {
2661 if (!new.new[i]) { 3143 for (i--; i >= 0; i--) kfree(new.new[i]);
2662 for (i--; i >= 0; i--) kfree(new.new[i]); 3144 return -ENOMEM;
2663 return -ENOMEM;
2664 }
2665 } else {
2666 new.new[i] = NULL;
2667 } 3145 }
2668 } 3146 }
2669 new.cachep = cachep; 3147 new.cachep = cachep;
2670 3148
2671 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 3149 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
2672 3150
2673 check_irq_on(); 3151 check_irq_on();
2674 spin_lock_irq(&cachep->spinlock); 3152 spin_lock_irq(&cachep->spinlock);
2675 cachep->batchcount = batchcount; 3153 cachep->batchcount = batchcount;
2676 cachep->limit = limit; 3154 cachep->limit = limit;
2677 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num; 3155 cachep->shared = shared;
2678 spin_unlock_irq(&cachep->spinlock); 3156 spin_unlock_irq(&cachep->spinlock);
2679 3157
2680 for (i = 0; i < NR_CPUS; i++) { 3158 for_each_online_cpu(i) {
2681 struct array_cache *ccold = new.new[i]; 3159 struct array_cache *ccold = new.new[i];
2682 if (!ccold) 3160 if (!ccold)
2683 continue; 3161 continue;
2684 spin_lock_irq(&cachep->spinlock); 3162 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
2685 free_block(cachep, ac_entry(ccold), ccold->avail); 3163 free_block(cachep, ccold->entry, ccold->avail);
2686 spin_unlock_irq(&cachep->spinlock); 3164 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
2687 kfree(ccold); 3165 kfree(ccold);
2688 } 3166 }
2689 new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
2690 if (new_shared) {
2691 struct array_cache *old;
2692 3167
2693 spin_lock_irq(&cachep->spinlock); 3168 err = alloc_kmemlist(cachep);
2694 old = cachep->lists.shared; 3169 if (err) {
2695 cachep->lists.shared = new_shared; 3170 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
2696 if (old) 3171 cachep->name, -err);
2697 free_block(cachep, ac_entry(old), old->avail); 3172 BUG();
2698 spin_unlock_irq(&cachep->spinlock);
2699 kfree(old);
2700 } 3173 }
2701
2702 return 0; 3174 return 0;
2703} 3175}
2704 3176
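do_tune_cpucache() follows a build/swap/drain pattern: allocate all the replacement per-cpu caches first, swap the pointers on each cpu via smp_call_function_all_cpus(), then drain and free whatever was swapped out, taking the matching node's list_lock for the drain. The pointer swap at the centre of that pattern, sketched with invented names:

struct toy_ac { unsigned int avail; void *entry[32]; };

/* Runs in the context of the cpu that owns *slot (an IPI in the kernel).
 * The displaced cache is handed back through *newp so the caller can
 * drain and free it afterwards. */
static void swap_in_new_cache(struct toy_ac **slot, struct toy_ac **newp)
{
    struct toy_ac *old = *slot;

    *slot = *newp;
    *newp = old;
}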
@@ -2756,11 +3228,11 @@ static void enable_cpucache(kmem_cache_t *cachep)
2756} 3228}
2757 3229
2758static void drain_array_locked(kmem_cache_t *cachep, 3230static void drain_array_locked(kmem_cache_t *cachep,
2759 struct array_cache *ac, int force) 3231 struct array_cache *ac, int force, int node)
2760{ 3232{
2761 int tofree; 3233 int tofree;
2762 3234
2763 check_spinlock_acquired(cachep); 3235 check_spinlock_acquired_node(cachep, node);
2764 if (ac->touched && !force) { 3236 if (ac->touched && !force) {
2765 ac->touched = 0; 3237 ac->touched = 0;
2766 } else if (ac->avail) { 3238 } else if (ac->avail) {
@@ -2768,9 +3240,9 @@ static void drain_array_locked(kmem_cache_t *cachep,
2768 if (tofree > ac->avail) { 3240 if (tofree > ac->avail) {
2769 tofree = (ac->avail+1)/2; 3241 tofree = (ac->avail+1)/2;
2770 } 3242 }
2771 free_block(cachep, ac_entry(ac), tofree); 3243 free_block(cachep, ac->entry, tofree);
2772 ac->avail -= tofree; 3244 ac->avail -= tofree;
2773 memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree], 3245 memmove(ac->entry, &(ac->entry[tofree]),
2774 sizeof(void*)*ac->avail); 3246 sizeof(void*)*ac->avail);
2775 } 3247 }
2776} 3248}
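drain_array_locked() never drains more than about half of what an array cache currently holds, and it then compacts the surviving entries to the front of the array with a single memmove(). Both steps as a small sketch (helper names invented):

#include <string.h>

static unsigned int clamp_tofree(unsigned int tofree, unsigned int avail)
{
    if (tofree > avail)
        tofree = (avail + 1) / 2;     /* e.g. avail = 7 -> drain at most 4 */
    return tofree;
}

/* After freeing the first tofree pointers, slide the remainder down. */
static void drop_head(void **entry, unsigned int *avail, unsigned int tofree)
{
    *avail -= tofree;
    memmove(entry, &entry[tofree], sizeof(void *) * (*avail));
}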
@@ -2789,6 +3261,7 @@ static void drain_array_locked(kmem_cache_t *cachep,
2789static void cache_reap(void *unused) 3261static void cache_reap(void *unused)
2790{ 3262{
2791 struct list_head *walk; 3263 struct list_head *walk;
3264 struct kmem_list3 *l3;
2792 3265
2793 if (down_trylock(&cache_chain_sem)) { 3266 if (down_trylock(&cache_chain_sem)) {
2794 /* Give up. Setup the next iteration. */ 3267 /* Give up. Setup the next iteration. */
@@ -2809,27 +3282,32 @@ static void cache_reap(void *unused)
2809 3282
2810 check_irq_on(); 3283 check_irq_on();
2811 3284
2812 spin_lock_irq(&searchp->spinlock); 3285 l3 = searchp->nodelists[numa_node_id()];
3286 if (l3->alien)
3287 drain_alien_cache(searchp, l3);
3288 spin_lock_irq(&l3->list_lock);
2813 3289
2814 drain_array_locked(searchp, ac_data(searchp), 0); 3290 drain_array_locked(searchp, ac_data(searchp), 0,
3291 numa_node_id());
2815 3292
2816 if(time_after(searchp->lists.next_reap, jiffies)) 3293 if (time_after(l3->next_reap, jiffies))
2817 goto next_unlock; 3294 goto next_unlock;
2818 3295
2819 searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3; 3296 l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
2820 3297
2821 if (searchp->lists.shared) 3298 if (l3->shared)
2822 drain_array_locked(searchp, searchp->lists.shared, 0); 3299 drain_array_locked(searchp, l3->shared, 0,
3300 numa_node_id());
2823 3301
2824 if (searchp->lists.free_touched) { 3302 if (l3->free_touched) {
2825 searchp->lists.free_touched = 0; 3303 l3->free_touched = 0;
2826 goto next_unlock; 3304 goto next_unlock;
2827 } 3305 }
2828 3306
2829 tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num); 3307 tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num);
2830 do { 3308 do {
2831 p = list3_data(searchp)->slabs_free.next; 3309 p = l3->slabs_free.next;
2832 if (p == &(list3_data(searchp)->slabs_free)) 3310 if (p == &(l3->slabs_free))
2833 break; 3311 break;
2834 3312
2835 slabp = list_entry(p, struct slab, list); 3313 slabp = list_entry(p, struct slab, list);
@@ -2842,13 +3320,13 @@ static void cache_reap(void *unused)
2842 * searchp cannot disappear, we hold 3320 * searchp cannot disappear, we hold
2843 * cache_chain_lock 3321 * cache_chain_lock
2844 */ 3322 */
2845 searchp->lists.free_objects -= searchp->num; 3323 l3->free_objects -= searchp->num;
2846 spin_unlock_irq(&searchp->spinlock); 3324 spin_unlock_irq(&l3->list_lock);
2847 slab_destroy(searchp, slabp); 3325 slab_destroy(searchp, slabp);
2848 spin_lock_irq(&searchp->spinlock); 3326 spin_lock_irq(&l3->list_lock);
2849 } while(--tofree > 0); 3327 } while(--tofree > 0);
2850next_unlock: 3328next_unlock:
2851 spin_unlock_irq(&searchp->spinlock); 3329 spin_unlock_irq(&l3->list_lock);
2852next: 3330next:
2853 cond_resched(); 3331 cond_resched();
2854 } 3332 }
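cache_reap() caps the number of completely free slabs it destroys per pass at roughly a fifth of the node's free_limit, rounded up to whole slabs. The formula as a worked example with invented numbers:

static unsigned int slabs_to_reap(unsigned long free_limit,
                                  unsigned int objs_per_slab)
{
    /* free_limit = 110, 30 objects per slab:
     * (110 + 5*30 - 1) / (5*30) = 259 / 150 = 1 slab per pass. */
    return (free_limit + 5 * objs_per_slab - 1) / (5 * objs_per_slab);
}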
@@ -2882,7 +3360,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
2882 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 3360 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
2883#if STATS 3361#if STATS
2884 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" 3362 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
2885 " <error> <maxfreeable> <freelimit> <nodeallocs>"); 3363 " <error> <maxfreeable> <nodeallocs> <remotefrees>");
2886 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 3364 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
2887#endif 3365#endif
2888 seq_putc(m, '\n'); 3366 seq_putc(m, '\n');
@@ -2917,39 +3395,53 @@ static int s_show(struct seq_file *m, void *p)
2917 unsigned long active_objs; 3395 unsigned long active_objs;
2918 unsigned long num_objs; 3396 unsigned long num_objs;
2919 unsigned long active_slabs = 0; 3397 unsigned long active_slabs = 0;
2920 unsigned long num_slabs; 3398 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
2921 const char *name; 3399 const char *name;
2922 char *error = NULL; 3400 char *error = NULL;
3401 int node;
3402 struct kmem_list3 *l3;
2923 3403
2924 check_irq_on(); 3404 check_irq_on();
2925 spin_lock_irq(&cachep->spinlock); 3405 spin_lock_irq(&cachep->spinlock);
2926 active_objs = 0; 3406 active_objs = 0;
2927 num_slabs = 0; 3407 num_slabs = 0;
2928 list_for_each(q,&cachep->lists.slabs_full) { 3408 for_each_online_node(node) {
2929 slabp = list_entry(q, struct slab, list); 3409 l3 = cachep->nodelists[node];
2930 if (slabp->inuse != cachep->num && !error) 3410 if (!l3)
2931 error = "slabs_full accounting error"; 3411 continue;
2932 active_objs += cachep->num; 3412
2933 active_slabs++; 3413 spin_lock(&l3->list_lock);
2934 } 3414
2935 list_for_each(q,&cachep->lists.slabs_partial) { 3415 list_for_each(q,&l3->slabs_full) {
2936 slabp = list_entry(q, struct slab, list); 3416 slabp = list_entry(q, struct slab, list);
2937 if (slabp->inuse == cachep->num && !error) 3417 if (slabp->inuse != cachep->num && !error)
2938 error = "slabs_partial inuse accounting error"; 3418 error = "slabs_full accounting error";
2939 if (!slabp->inuse && !error) 3419 active_objs += cachep->num;
2940 error = "slabs_partial/inuse accounting error"; 3420 active_slabs++;
2941 active_objs += slabp->inuse; 3421 }
2942 active_slabs++; 3422 list_for_each(q,&l3->slabs_partial) {
2943 } 3423 slabp = list_entry(q, struct slab, list);
2944 list_for_each(q,&cachep->lists.slabs_free) { 3424 if (slabp->inuse == cachep->num && !error)
2945 slabp = list_entry(q, struct slab, list); 3425 error = "slabs_partial inuse accounting error";
2946 if (slabp->inuse && !error) 3426 if (!slabp->inuse && !error)
2947 error = "slabs_free/inuse accounting error"; 3427 error = "slabs_partial/inuse accounting error";
2948 num_slabs++; 3428 active_objs += slabp->inuse;
3429 active_slabs++;
3430 }
3431 list_for_each(q,&l3->slabs_free) {
3432 slabp = list_entry(q, struct slab, list);
3433 if (slabp->inuse && !error)
3434 error = "slabs_free/inuse accounting error";
3435 num_slabs++;
3436 }
3437 free_objects += l3->free_objects;
3438 shared_avail += l3->shared->avail;
3439
3440 spin_unlock(&l3->list_lock);
2949 } 3441 }
2950 num_slabs+=active_slabs; 3442 num_slabs+=active_slabs;
2951 num_objs = num_slabs*cachep->num; 3443 num_objs = num_slabs*cachep->num;
2952 if (num_objs - active_objs != cachep->lists.free_objects && !error) 3444 if (num_objs - active_objs != free_objects && !error)
2953 error = "free_objects accounting error"; 3445 error = "free_objects accounting error";
2954 3446
2955 name = cachep->name; 3447 name = cachep->name;
@@ -2961,9 +3453,9 @@ static int s_show(struct seq_file *m, void *p)
2961 cachep->num, (1<<cachep->gfporder)); 3453 cachep->num, (1<<cachep->gfporder));
2962 seq_printf(m, " : tunables %4u %4u %4u", 3454 seq_printf(m, " : tunables %4u %4u %4u",
2963 cachep->limit, cachep->batchcount, 3455 cachep->limit, cachep->batchcount,
2964 cachep->lists.shared->limit/cachep->batchcount); 3456 cachep->shared);
2965 seq_printf(m, " : slabdata %6lu %6lu %6u", 3457 seq_printf(m, " : slabdata %6lu %6lu %6lu",
2966 active_slabs, num_slabs, cachep->lists.shared->avail); 3458 active_slabs, num_slabs, shared_avail);
2967#if STATS 3459#if STATS
2968 { /* list3 stats */ 3460 { /* list3 stats */
2969 unsigned long high = cachep->high_mark; 3461 unsigned long high = cachep->high_mark;
@@ -2972,12 +3464,13 @@ static int s_show(struct seq_file *m, void *p)
2972 unsigned long reaped = cachep->reaped; 3464 unsigned long reaped = cachep->reaped;
2973 unsigned long errors = cachep->errors; 3465 unsigned long errors = cachep->errors;
2974 unsigned long max_freeable = cachep->max_freeable; 3466 unsigned long max_freeable = cachep->max_freeable;
2975 unsigned long free_limit = cachep->free_limit;
2976 unsigned long node_allocs = cachep->node_allocs; 3467 unsigned long node_allocs = cachep->node_allocs;
3468 unsigned long node_frees = cachep->node_frees;
2977 3469
2978 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu", 3470 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
2979 allocs, high, grown, reaped, errors, 3471 %4lu %4lu %4lu %4lu",
2980 max_freeable, free_limit, node_allocs); 3472 allocs, high, grown, reaped, errors,
3473 max_freeable, node_allocs, node_frees);
2981 } 3474 }
2982 /* cpu stats */ 3475 /* cpu stats */
2983 { 3476 {
@@ -3056,9 +3549,10 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
3056 batchcount < 1 || 3549 batchcount < 1 ||
3057 batchcount > limit || 3550 batchcount > limit ||
3058 shared < 0) { 3551 shared < 0) {
3059 res = -EINVAL; 3552 res = 0;
3060 } else { 3553 } else {
3061 res = do_tune_cpucache(cachep, limit, batchcount, shared); 3554 res = do_tune_cpucache(cachep, limit,
3555 batchcount, shared);
3062 } 3556 }
3063 break; 3557 break;
3064 } 3558 }
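slabinfo_write() parses a line of the form "name limit batchcount shared" and hands the values to do_tune_cpucache(). A userspace example of driving that interface; the cache name and tunable values below are only illustrative, and the write needs root:

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/slabinfo", "w");

    if (!f) {
        perror("/proc/slabinfo");
        return 1;
    }
    /* limit = 256 objects per cpu cache, batchcount = 32, shared = 8 */
    fprintf(f, "dentry_cache 256 32 8\n");
    return fclose(f) ? 1 : 0;
}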
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 029e56eb5e77..adbc2b426c2f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -67,8 +67,8 @@ void show_swap_cache_info(void)
67 * __add_to_swap_cache resembles add_to_page_cache on swapper_space, 67 * __add_to_swap_cache resembles add_to_page_cache on swapper_space,
68 * but sets SwapCache flag and private instead of mapping and index. 68 * but sets SwapCache flag and private instead of mapping and index.
69 */ 69 */
70static int __add_to_swap_cache(struct page *page, 70static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
71 swp_entry_t entry, int gfp_mask) 71 unsigned int __nocast gfp_mask)
72{ 72{
73 int error; 73 int error;
74 74
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4b6e8bf986bc..0184f510aace 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1153,8 +1153,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1153 p->highest_bit = 0; /* cuts scans short */ 1153 p->highest_bit = 0; /* cuts scans short */
1154 while (p->flags >= SWP_SCANNING) { 1154 while (p->flags >= SWP_SCANNING) {
1155 spin_unlock(&swap_lock); 1155 spin_unlock(&swap_lock);
1156 set_current_state(TASK_UNINTERRUPTIBLE); 1156 schedule_timeout_uninterruptible(1);
1157 schedule_timeout(1);
1158 spin_lock(&swap_lock); 1157 spin_lock(&swap_lock);
1159 } 1158 }
1160 1159
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 67b358e57ef6..13c3d82968ae 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -332,9 +332,10 @@ void __vunmap(void *addr, int deallocate_pages)
332 * @addr: memory base address 332 * @addr: memory base address
333 * 333 *
334 * Free the virtually contiguous memory area starting at @addr, as 334 * Free the virtually contiguous memory area starting at @addr, as
335 * obtained from vmalloc(), vmalloc_32() or __vmalloc(). 335 * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
336 * NULL, no operation is performed.
336 * 337 *
337 * May not be called in interrupt context. 338 * Must not be called in interrupt context.
338 */ 339 */
339void vfree(void *addr) 340void vfree(void *addr)
340{ 341{
@@ -352,7 +353,7 @@ EXPORT_SYMBOL(vfree);
352 * Free the virtually contiguous memory area starting at @addr, 353 * Free the virtually contiguous memory area starting at @addr,
353 * which was created from the page array passed to vmap(). 354 * which was created from the page array passed to vmap().
354 * 355 *
355 * May not be called in interrupt context. 356 * Must not be called in interrupt context.
356 */ 357 */
357void vunmap(void *addr) 358void vunmap(void *addr)
358{ 359{