Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          7
-rw-r--r--  mm/Makefile         6
-rw-r--r--  mm/fadvise.c        5
-rw-r--r--  mm/filemap.c       48
-rw-r--r--  mm/hugetlb.c        4
-rw-r--r--  mm/memory.c         2
-rw-r--r--  mm/mempolicy.c    561
-rw-r--r--  mm/oom_kill.c       5
-rw-r--r--  mm/page_alloc.c   129
-rw-r--r--  mm/pdflush.c        2
-rw-r--r--  mm/rmap.c           7
-rw-r--r--  mm/slab.c        1139
-rw-r--r--  mm/slob.c         385
-rw-r--r--  mm/sparse.c         4
-rw-r--r--  mm/swap_state.c     4
-rw-r--r--  mm/swapfile.c      12
-rw-r--r--  mm/truncate.c       1
-rw-r--r--  mm/util.c          39
-rw-r--r--  mm/vmscan.c       343
19 files changed, 1934 insertions, 769 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b3db11f137e0..a9cb80ae6409 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -132,3 +132,10 @@ config SPLIT_PTLOCK_CPUS
132 default "4096" if ARM && !CPU_CACHE_VIPT 132 default "4096" if ARM && !CPU_CACHE_VIPT
133 default "4096" if PARISC && !PA20 133 default "4096" if PARISC && !PA20
134 default "4" 134 default "4"
135
136#
137# support for page migration
138#
139config MIGRATION
140 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
141 depends on SWAP
diff --git a/mm/Makefile b/mm/Makefile
index 2fa6d2ca9f28..9aa03fa1dcc3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,8 +9,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 page_alloc.o page-writeback.o pdflush.o \ 11 page_alloc.o page-writeback.o pdflush.o \
12 readahead.o slab.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o $(mmu-y) 13 prio_tree.o util.o $(mmu-y)
14 14
15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
16obj-$(CONFIG_HUGETLBFS) += hugetlb.o 16obj-$(CONFIG_HUGETLBFS) += hugetlb.o
@@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o
18obj-$(CONFIG_SPARSEMEM) += sparse.o 18obj-$(CONFIG_SPARSEMEM) += sparse.o
19obj-$(CONFIG_SHMEM) += shmem.o 19obj-$(CONFIG_SHMEM) += shmem.o
20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
21obj-$(CONFIG_SLOB) += slob.o
22obj-$(CONFIG_SLAB) += slab.o
21obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
22obj-$(CONFIG_FS_XIP) += filemap_xip.o 24obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 5f19e87bc5af..d257c89e7704 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -37,6 +37,11 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
37 if (!file) 37 if (!file)
38 return -EBADF; 38 return -EBADF;
39 39
40 if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
41 ret = -ESPIPE;
42 goto out;
43 }
44
40 mapping = file->f_mapping; 45 mapping = file->f_mapping;
41 if (!mapping || len < 0) { 46 if (!mapping || len < 0) {
42 ret = -EINVAL; 47 ret = -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ef24a397684..478f4c74cc31 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -280,7 +280,7 @@ static int wait_on_page_writeback_range(struct address_space *mapping,
280 * it is otherwise livelockable. 280 * it is otherwise livelockable.
281 */ 281 */
282int sync_page_range(struct inode *inode, struct address_space *mapping, 282int sync_page_range(struct inode *inode, struct address_space *mapping,
283 loff_t pos, size_t count) 283 loff_t pos, loff_t count)
284{ 284{
285 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 285 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
286 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 286 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -305,9 +305,8 @@ EXPORT_SYMBOL(sync_page_range);
305 * as it forces O_SYNC writers to different parts of the same file 305 * as it forces O_SYNC writers to different parts of the same file
306 * to be serialised right until io completion. 306 * to be serialised right until io completion.
307 */ 307 */
308static int sync_page_range_nolock(struct inode *inode, 308int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
309 struct address_space *mapping, 309 loff_t pos, loff_t count)
310 loff_t pos, size_t count)
311{ 310{
312 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 311 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
313 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 312 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -322,6 +321,7 @@ static int sync_page_range_nolock(struct inode *inode,
322 ret = wait_on_page_writeback_range(mapping, start, end); 321 ret = wait_on_page_writeback_range(mapping, start, end);
323 return ret; 322 return ret;
324} 323}
324EXPORT_SYMBOL(sync_page_range_nolock);
325 325
326/** 326/**
327 * filemap_fdatawait - walk the list of under-writeback pages of the given 327 * filemap_fdatawait - walk the list of under-writeback pages of the given
@@ -343,30 +343,44 @@ EXPORT_SYMBOL(filemap_fdatawait);
343 343
344int filemap_write_and_wait(struct address_space *mapping) 344int filemap_write_and_wait(struct address_space *mapping)
345{ 345{
346 int retval = 0; 346 int err = 0;
347 347
348 if (mapping->nrpages) { 348 if (mapping->nrpages) {
349 retval = filemap_fdatawrite(mapping); 349 err = filemap_fdatawrite(mapping);
350 if (retval == 0) 350 /*
351 retval = filemap_fdatawait(mapping); 351 * Even if the above returned error, the pages may be
352 * written partially (e.g. -ENOSPC), so we wait for it.
 353 * But -EIO is a special case: it may indicate that the worst
 354 * thing (e.g. a bug) happened, so we avoid waiting for it.
355 */
356 if (err != -EIO) {
357 int err2 = filemap_fdatawait(mapping);
358 if (!err)
359 err = err2;
360 }
352 } 361 }
353 return retval; 362 return err;
354} 363}
364EXPORT_SYMBOL(filemap_write_and_wait);
355 365
356int filemap_write_and_wait_range(struct address_space *mapping, 366int filemap_write_and_wait_range(struct address_space *mapping,
357 loff_t lstart, loff_t lend) 367 loff_t lstart, loff_t lend)
358{ 368{
359 int retval = 0; 369 int err = 0;
360 370
361 if (mapping->nrpages) { 371 if (mapping->nrpages) {
362 retval = __filemap_fdatawrite_range(mapping, lstart, lend, 372 err = __filemap_fdatawrite_range(mapping, lstart, lend,
363 WB_SYNC_ALL); 373 WB_SYNC_ALL);
364 if (retval == 0) 374 /* See comment of filemap_write_and_wait() */
365 retval = wait_on_page_writeback_range(mapping, 375 if (err != -EIO) {
366 lstart >> PAGE_CACHE_SHIFT, 376 int err2 = wait_on_page_writeback_range(mapping,
367 lend >> PAGE_CACHE_SHIFT); 377 lstart >> PAGE_CACHE_SHIFT,
378 lend >> PAGE_CACHE_SHIFT);
379 if (!err)
380 err = err2;
381 }
368 } 382 }
369 return retval; 383 return err;
370} 384}
371 385
372/* 386/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f4c43d7980ba..b21d78c941b5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -12,6 +12,7 @@
12#include <linux/nodemask.h> 12#include <linux/nodemask.h>
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
15#include <linux/cpuset.h>
15 16
16#include <asm/page.h> 17#include <asm/page.h>
17#include <asm/pgtable.h> 18#include <asm/pgtable.h>
@@ -48,7 +49,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
48 49
49 for (z = zonelist->zones; *z; z++) { 50 for (z = zonelist->zones; *z; z++) {
50 nid = (*z)->zone_pgdat->node_id; 51 nid = (*z)->zone_pgdat->node_id;
51 if (!list_empty(&hugepage_freelists[nid])) 52 if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
53 !list_empty(&hugepage_freelists[nid]))
52 break; 54 break;
53 } 55 }
54 56
diff --git a/mm/memory.c b/mm/memory.c
index 7197f9bcd384..3944fec38012 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2267,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2267 return handle_pte_fault(mm, vma, address, pte, pmd, write_access); 2267 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2268} 2268}
2269 2269
2270EXPORT_SYMBOL_GPL(__handle_mm_fault);
2271
2270#ifndef __PAGETABLE_PUD_FOLDED 2272#ifndef __PAGETABLE_PUD_FOLDED
2271/* 2273/*
2272 * Allocate page upper directory. 2274 * Allocate page upper directory.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0f1d2b8a952b..1850d0aef4ac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,18 @@
83#include <linux/init.h> 83#include <linux/init.h>
84#include <linux/compat.h> 84#include <linux/compat.h>
85#include <linux/mempolicy.h> 85#include <linux/mempolicy.h>
86#include <linux/swap.h>
87#include <linux/seq_file.h>
88#include <linux/proc_fs.h>
89
86#include <asm/tlbflush.h> 90#include <asm/tlbflush.h>
87#include <asm/uaccess.h> 91#include <asm/uaccess.h>
88 92
93/* Internal flags */
94#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
97
89static kmem_cache_t *policy_cache; 98static kmem_cache_t *policy_cache;
90static kmem_cache_t *sn_cache; 99static kmem_cache_t *sn_cache;
91 100
@@ -171,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
171 break; 180 break;
172 } 181 }
173 policy->policy = mode; 182 policy->policy = mode;
183 policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
174 return policy; 184 return policy;
175} 185}
176 186
177/* Ensure all existing pages follow the policy. */ 187static void gather_stats(struct page *, void *);
188static void migrate_page_add(struct vm_area_struct *vma,
189 struct page *page, struct list_head *pagelist, unsigned long flags);
190
191/* Scan through pages checking if pages follow certain conditions. */
178static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 192static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
179 unsigned long addr, unsigned long end, nodemask_t *nodes) 193 unsigned long addr, unsigned long end,
194 const nodemask_t *nodes, unsigned long flags,
195 void *private)
180{ 196{
181 pte_t *orig_pte; 197 pte_t *orig_pte;
182 pte_t *pte; 198 pte_t *pte;
@@ -193,7 +209,17 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
193 if (!page) 209 if (!page)
194 continue; 210 continue;
195 nid = page_to_nid(page); 211 nid = page_to_nid(page);
196 if (!node_isset(nid, *nodes)) 212 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
213 continue;
214
215 if (flags & MPOL_MF_STATS)
216 gather_stats(page, private);
217 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
218 spin_unlock(ptl);
219 migrate_page_add(vma, page, private, flags);
220 spin_lock(ptl);
221 }
222 else
197 break; 223 break;
198 } while (pte++, addr += PAGE_SIZE, addr != end); 224 } while (pte++, addr += PAGE_SIZE, addr != end);
199 pte_unmap_unlock(orig_pte, ptl); 225 pte_unmap_unlock(orig_pte, ptl);
@@ -201,7 +227,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
201} 227}
202 228
203static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, 229static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
204 unsigned long addr, unsigned long end, nodemask_t *nodes) 230 unsigned long addr, unsigned long end,
231 const nodemask_t *nodes, unsigned long flags,
232 void *private)
205{ 233{
206 pmd_t *pmd; 234 pmd_t *pmd;
207 unsigned long next; 235 unsigned long next;
@@ -211,14 +239,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
211 next = pmd_addr_end(addr, end); 239 next = pmd_addr_end(addr, end);
212 if (pmd_none_or_clear_bad(pmd)) 240 if (pmd_none_or_clear_bad(pmd))
213 continue; 241 continue;
214 if (check_pte_range(vma, pmd, addr, next, nodes)) 242 if (check_pte_range(vma, pmd, addr, next, nodes,
243 flags, private))
215 return -EIO; 244 return -EIO;
216 } while (pmd++, addr = next, addr != end); 245 } while (pmd++, addr = next, addr != end);
217 return 0; 246 return 0;
218} 247}
219 248
220static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 249static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
221 unsigned long addr, unsigned long end, nodemask_t *nodes) 250 unsigned long addr, unsigned long end,
251 const nodemask_t *nodes, unsigned long flags,
252 void *private)
222{ 253{
223 pud_t *pud; 254 pud_t *pud;
224 unsigned long next; 255 unsigned long next;
@@ -228,14 +259,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
228 next = pud_addr_end(addr, end); 259 next = pud_addr_end(addr, end);
229 if (pud_none_or_clear_bad(pud)) 260 if (pud_none_or_clear_bad(pud))
230 continue; 261 continue;
231 if (check_pmd_range(vma, pud, addr, next, nodes)) 262 if (check_pmd_range(vma, pud, addr, next, nodes,
263 flags, private))
232 return -EIO; 264 return -EIO;
233 } while (pud++, addr = next, addr != end); 265 } while (pud++, addr = next, addr != end);
234 return 0; 266 return 0;
235} 267}
236 268
237static inline int check_pgd_range(struct vm_area_struct *vma, 269static inline int check_pgd_range(struct vm_area_struct *vma,
238 unsigned long addr, unsigned long end, nodemask_t *nodes) 270 unsigned long addr, unsigned long end,
271 const nodemask_t *nodes, unsigned long flags,
272 void *private)
239{ 273{
240 pgd_t *pgd; 274 pgd_t *pgd;
241 unsigned long next; 275 unsigned long next;
@@ -245,16 +279,30 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
245 next = pgd_addr_end(addr, end); 279 next = pgd_addr_end(addr, end);
246 if (pgd_none_or_clear_bad(pgd)) 280 if (pgd_none_or_clear_bad(pgd))
247 continue; 281 continue;
248 if (check_pud_range(vma, pgd, addr, next, nodes)) 282 if (check_pud_range(vma, pgd, addr, next, nodes,
283 flags, private))
249 return -EIO; 284 return -EIO;
250 } while (pgd++, addr = next, addr != end); 285 } while (pgd++, addr = next, addr != end);
251 return 0; 286 return 0;
252} 287}
253 288
254/* Step 1: check the range */ 289/* Check if a vma is migratable */
290static inline int vma_migratable(struct vm_area_struct *vma)
291{
292 if (vma->vm_flags & (
293 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
294 return 0;
295 return 1;
296}
297
298/*
299 * Check if all pages in a range are on a set of nodes.
300 * If pagelist != NULL then isolate pages from the LRU and
301 * put them on the pagelist.
302 */
255static struct vm_area_struct * 303static struct vm_area_struct *
256check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 304check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
257 nodemask_t *nodes, unsigned long flags) 305 const nodemask_t *nodes, unsigned long flags, void *private)
258{ 306{
259 int err; 307 int err;
260 struct vm_area_struct *first, *vma, *prev; 308 struct vm_area_struct *first, *vma, *prev;
@@ -264,17 +312,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
264 return ERR_PTR(-EFAULT); 312 return ERR_PTR(-EFAULT);
265 prev = NULL; 313 prev = NULL;
266 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 314 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
267 if (!vma->vm_next && vma->vm_end < end) 315 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
268 return ERR_PTR(-EFAULT); 316 if (!vma->vm_next && vma->vm_end < end)
269 if (prev && prev->vm_end < vma->vm_start) 317 return ERR_PTR(-EFAULT);
270 return ERR_PTR(-EFAULT); 318 if (prev && prev->vm_end < vma->vm_start)
271 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { 319 return ERR_PTR(-EFAULT);
320 }
321 if (!is_vm_hugetlb_page(vma) &&
322 ((flags & MPOL_MF_STRICT) ||
323 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
324 vma_migratable(vma)))) {
272 unsigned long endvma = vma->vm_end; 325 unsigned long endvma = vma->vm_end;
326
273 if (endvma > end) 327 if (endvma > end)
274 endvma = end; 328 endvma = end;
275 if (vma->vm_start > start) 329 if (vma->vm_start > start)
276 start = vma->vm_start; 330 start = vma->vm_start;
277 err = check_pgd_range(vma, start, endvma, nodes); 331 err = check_pgd_range(vma, start, endvma, nodes,
332 flags, private);
278 if (err) { 333 if (err) {
279 first = ERR_PTR(err); 334 first = ERR_PTR(err);
280 break; 335 break;
@@ -333,51 +388,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
333 if (!nodes) 388 if (!nodes)
334 return 0; 389 return 0;
335 390
336 /* Update current mems_allowed */ 391 cpuset_update_task_memory_state();
337 cpuset_update_current_mems_allowed(); 392 if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
338 /* Ignore nodes not set in current->mems_allowed */
339 cpuset_restrict_to_mems_allowed(nodes->bits);
340 return mpol_check_policy(mode, nodes);
341}
342
343long do_mbind(unsigned long start, unsigned long len,
344 unsigned long mode, nodemask_t *nmask, unsigned long flags)
345{
346 struct vm_area_struct *vma;
347 struct mm_struct *mm = current->mm;
348 struct mempolicy *new;
349 unsigned long end;
350 int err;
351
352 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
353 return -EINVAL;
354 if (start & ~PAGE_MASK)
355 return -EINVAL;
356 if (mode == MPOL_DEFAULT)
357 flags &= ~MPOL_MF_STRICT;
358 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
359 end = start + len;
360 if (end < start)
361 return -EINVAL; 393 return -EINVAL;
362 if (end == start) 394 return mpol_check_policy(mode, nodes);
363 return 0;
364 if (mpol_check_policy(mode, nmask))
365 return -EINVAL;
366 new = mpol_new(mode, nmask);
367 if (IS_ERR(new))
368 return PTR_ERR(new);
369
370 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
371 mode,nodes_addr(nodes)[0]);
372
373 down_write(&mm->mmap_sem);
374 vma = check_range(mm, start, end, nmask, flags);
375 err = PTR_ERR(vma);
376 if (!IS_ERR(vma))
377 err = mbind_range(vma, start, end, new);
378 up_write(&mm->mmap_sem);
379 mpol_free(new);
380 return err;
381} 395}
382 396
383/* Set the process memory policy */ 397/* Set the process memory policy */
@@ -448,7 +462,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
448 struct vm_area_struct *vma = NULL; 462 struct vm_area_struct *vma = NULL;
449 struct mempolicy *pol = current->mempolicy; 463 struct mempolicy *pol = current->mempolicy;
450 464
451 cpuset_update_current_mems_allowed(); 465 cpuset_update_task_memory_state();
452 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) 466 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
453 return -EINVAL; 467 return -EINVAL;
454 if (flags & MPOL_F_ADDR) { 468 if (flags & MPOL_F_ADDR) {
@@ -500,11 +514,177 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
500} 514}
501 515
502/* 516/*
517 * page migration
518 */
519
520/* Check if we are the only process mapping the page in question */
521static inline int single_mm_mapping(struct mm_struct *mm,
522 struct address_space *mapping)
523{
524 struct vm_area_struct *vma;
525 struct prio_tree_iter iter;
526 int rc = 1;
527
528 spin_lock(&mapping->i_mmap_lock);
529 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
530 if (mm != vma->vm_mm) {
531 rc = 0;
532 goto out;
533 }
534 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
535 if (mm != vma->vm_mm) {
536 rc = 0;
537 goto out;
538 }
539out:
540 spin_unlock(&mapping->i_mmap_lock);
541 return rc;
542}
543
544/*
545 * Add a page to be migrated to the pagelist
546 */
547static void migrate_page_add(struct vm_area_struct *vma,
548 struct page *page, struct list_head *pagelist, unsigned long flags)
549{
550 /*
551 * Avoid migrating a page that is shared by others and not writable.
552 */
553 if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
554 mapping_writably_mapped(page->mapping) ||
555 single_mm_mapping(vma->vm_mm, page->mapping)) {
556 int rc = isolate_lru_page(page);
557
558 if (rc == 1)
559 list_add(&page->lru, pagelist);
560 /*
561 * If the isolate attempt was not successful then we just
562 * encountered an unswappable page. Something must be wrong.
563 */
564 WARN_ON(rc == 0);
565 }
566}
567
568static int swap_pages(struct list_head *pagelist)
569{
570 LIST_HEAD(moved);
571 LIST_HEAD(failed);
572 int n;
573
574 n = migrate_pages(pagelist, NULL, &moved, &failed);
575 putback_lru_pages(&failed);
576 putback_lru_pages(&moved);
577
578 return n;
579}
580
581/*
582 * For now migrate_pages simply swaps out the pages from nodes that are in
583 * the source set but not in the target set. In the future, we would
584 * want a function that moves pages between the two nodesets in such
585 * a way as to preserve the physical layout as much as possible.
586 *
 587 * Returns the number of pages that could not be moved.
588 */
589int do_migrate_pages(struct mm_struct *mm,
590 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
591{
592 LIST_HEAD(pagelist);
593 int count = 0;
594 nodemask_t nodes;
595
596 nodes_andnot(nodes, *from_nodes, *to_nodes);
597
598 down_read(&mm->mmap_sem);
599 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
600 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
601
602 if (!list_empty(&pagelist)) {
603 count = swap_pages(&pagelist);
604 putback_lru_pages(&pagelist);
605 }
606
607 up_read(&mm->mmap_sem);
608 return count;
609}
610
611long do_mbind(unsigned long start, unsigned long len,
612 unsigned long mode, nodemask_t *nmask, unsigned long flags)
613{
614 struct vm_area_struct *vma;
615 struct mm_struct *mm = current->mm;
616 struct mempolicy *new;
617 unsigned long end;
618 int err;
619 LIST_HEAD(pagelist);
620
621 if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
622 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
623 || mode > MPOL_MAX)
624 return -EINVAL;
625 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
626 return -EPERM;
627
628 if (start & ~PAGE_MASK)
629 return -EINVAL;
630
631 if (mode == MPOL_DEFAULT)
632 flags &= ~MPOL_MF_STRICT;
633
634 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
635 end = start + len;
636
637 if (end < start)
638 return -EINVAL;
639 if (end == start)
640 return 0;
641
642 if (mpol_check_policy(mode, nmask))
643 return -EINVAL;
644
645 new = mpol_new(mode, nmask);
646 if (IS_ERR(new))
647 return PTR_ERR(new);
648
649 /*
650 * If we are using the default policy then operation
651 * on discontinuous address spaces is okay after all
652 */
653 if (!new)
654 flags |= MPOL_MF_DISCONTIG_OK;
655
656 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
657 mode,nodes_addr(nodes)[0]);
658
659 down_write(&mm->mmap_sem);
660 vma = check_range(mm, start, end, nmask,
661 flags | MPOL_MF_INVERT, &pagelist);
662
663 err = PTR_ERR(vma);
664 if (!IS_ERR(vma)) {
665 int nr_failed = 0;
666
667 err = mbind_range(vma, start, end, new);
668 if (!list_empty(&pagelist))
669 nr_failed = swap_pages(&pagelist);
670
671 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
672 err = -EIO;
673 }
674 if (!list_empty(&pagelist))
675 putback_lru_pages(&pagelist);
676
677 up_write(&mm->mmap_sem);
678 mpol_free(new);
679 return err;
680}
681
682/*
503 * User space interface with variable sized bitmaps for nodelists. 683 * User space interface with variable sized bitmaps for nodelists.
504 */ 684 */
505 685
506/* Copy a node mask from user space. */ 686/* Copy a node mask from user space. */
507static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, 687static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
508 unsigned long maxnode) 688 unsigned long maxnode)
509{ 689{
510 unsigned long k; 690 unsigned long k;
@@ -593,6 +773,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
593 return do_set_mempolicy(mode, &nodes); 773 return do_set_mempolicy(mode, &nodes);
594} 774}
595 775
776asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
777 const unsigned long __user *old_nodes,
778 const unsigned long __user *new_nodes)
779{
780 struct mm_struct *mm;
781 struct task_struct *task;
782 nodemask_t old;
783 nodemask_t new;
784 nodemask_t task_nodes;
785 int err;
786
787 err = get_nodes(&old, old_nodes, maxnode);
788 if (err)
789 return err;
790
791 err = get_nodes(&new, new_nodes, maxnode);
792 if (err)
793 return err;
794
795 /* Find the mm_struct */
796 read_lock(&tasklist_lock);
797 task = pid ? find_task_by_pid(pid) : current;
798 if (!task) {
799 read_unlock(&tasklist_lock);
800 return -ESRCH;
801 }
802 mm = get_task_mm(task);
803 read_unlock(&tasklist_lock);
804
805 if (!mm)
806 return -EINVAL;
807
808 /*
809 * Check if this process has the right to modify the specified
810 * process. The right exists if the process has administrative
 811 * capabilities, superuser privileges or the same
812 * userid as the target process.
813 */
814 if ((current->euid != task->suid) && (current->euid != task->uid) &&
815 (current->uid != task->suid) && (current->uid != task->uid) &&
816 !capable(CAP_SYS_ADMIN)) {
817 err = -EPERM;
818 goto out;
819 }
820
821 task_nodes = cpuset_mems_allowed(task);
822 /* Is the user allowed to access the target nodes? */
823 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
824 err = -EPERM;
825 goto out;
826 }
827
828 err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
829out:
830 mmput(mm);
831 return err;
832}
833
834
596/* Retrieve NUMA policy */ 835/* Retrieve NUMA policy */
597asmlinkage long sys_get_mempolicy(int __user *policy, 836asmlinkage long sys_get_mempolicy(int __user *policy,
598 unsigned long __user *nmask, 837 unsigned long __user *nmask,
@@ -699,8 +938,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
699#endif 938#endif
700 939
701/* Return effective policy for a VMA */ 940/* Return effective policy for a VMA */
702struct mempolicy * 941static struct mempolicy * get_vma_policy(struct task_struct *task,
703get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) 942 struct vm_area_struct *vma, unsigned long addr)
704{ 943{
705 struct mempolicy *pol = task->mempolicy; 944 struct mempolicy *pol = task->mempolicy;
706 945
@@ -848,7 +1087,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
848{ 1087{
849 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1088 struct mempolicy *pol = get_vma_policy(current, vma, addr);
850 1089
851 cpuset_update_current_mems_allowed(); 1090 cpuset_update_task_memory_state();
852 1091
853 if (unlikely(pol->policy == MPOL_INTERLEAVE)) { 1092 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
854 unsigned nid; 1093 unsigned nid;
@@ -874,7 +1113,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
874 * interrupt context and apply the current process NUMA policy. 1113 * interrupt context and apply the current process NUMA policy.
875 * Returns NULL when no page can be allocated. 1114 * Returns NULL when no page can be allocated.
876 * 1115 *
877 * Don't call cpuset_update_current_mems_allowed() unless 1116 * Don't call cpuset_update_task_memory_state() unless
878 * 1) it's ok to take cpuset_sem (can WAIT), and 1117 * 1) it's ok to take cpuset_sem (can WAIT), and
879 * 2) allocating for current task (not interrupt). 1118 * 2) allocating for current task (not interrupt).
880 */ 1119 */
@@ -883,7 +1122,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
883 struct mempolicy *pol = current->mempolicy; 1122 struct mempolicy *pol = current->mempolicy;
884 1123
885 if ((gfp & __GFP_WAIT) && !in_interrupt()) 1124 if ((gfp & __GFP_WAIT) && !in_interrupt())
886 cpuset_update_current_mems_allowed(); 1125 cpuset_update_task_memory_state();
887 if (!pol || in_interrupt()) 1126 if (!pol || in_interrupt())
888 pol = &default_policy; 1127 pol = &default_policy;
889 if (pol->policy == MPOL_INTERLEAVE) 1128 if (pol->policy == MPOL_INTERLEAVE)
@@ -892,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
892} 1131}
893EXPORT_SYMBOL(alloc_pages_current); 1132EXPORT_SYMBOL(alloc_pages_current);
894 1133
1134/*
1135 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 1136 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1137 * with the mems_allowed returned by cpuset_mems_allowed(). This
1138 * keeps mempolicies cpuset relative after its cpuset moves. See
1139 * further kernel/cpuset.c update_nodemask().
1140 */
1141void *cpuset_being_rebound;
1142
895/* Slow path of a mempolicy copy */ 1143/* Slow path of a mempolicy copy */
896struct mempolicy *__mpol_copy(struct mempolicy *old) 1144struct mempolicy *__mpol_copy(struct mempolicy *old)
897{ 1145{
@@ -899,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
899 1147
900 if (!new) 1148 if (!new)
901 return ERR_PTR(-ENOMEM); 1149 return ERR_PTR(-ENOMEM);
1150 if (current_cpuset_is_being_rebound()) {
1151 nodemask_t mems = cpuset_mems_allowed(current);
1152 mpol_rebind_policy(old, &mems);
1153 }
902 *new = *old; 1154 *new = *old;
903 atomic_set(&new->refcnt, 1); 1155 atomic_set(&new->refcnt, 1);
904 if (new->policy == MPOL_BIND) { 1156 if (new->policy == MPOL_BIND) {
@@ -1173,25 +1425,31 @@ void numa_default_policy(void)
1173} 1425}
1174 1426
1175/* Migrate a policy to a different set of nodes */ 1427/* Migrate a policy to a different set of nodes */
1176static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, 1428void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1177 const nodemask_t *new)
1178{ 1429{
1430 nodemask_t *mpolmask;
1179 nodemask_t tmp; 1431 nodemask_t tmp;
1180 1432
1181 if (!pol) 1433 if (!pol)
1182 return; 1434 return;
1435 mpolmask = &pol->cpuset_mems_allowed;
1436 if (nodes_equal(*mpolmask, *newmask))
1437 return;
1183 1438
1184 switch (pol->policy) { 1439 switch (pol->policy) {
1185 case MPOL_DEFAULT: 1440 case MPOL_DEFAULT:
1186 break; 1441 break;
1187 case MPOL_INTERLEAVE: 1442 case MPOL_INTERLEAVE:
1188 nodes_remap(tmp, pol->v.nodes, *old, *new); 1443 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1189 pol->v.nodes = tmp; 1444 pol->v.nodes = tmp;
1190 current->il_next = node_remap(current->il_next, *old, *new); 1445 *mpolmask = *newmask;
1446 current->il_next = node_remap(current->il_next,
1447 *mpolmask, *newmask);
1191 break; 1448 break;
1192 case MPOL_PREFERRED: 1449 case MPOL_PREFERRED:
1193 pol->v.preferred_node = node_remap(pol->v.preferred_node, 1450 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1194 *old, *new); 1451 *mpolmask, *newmask);
1452 *mpolmask = *newmask;
1195 break; 1453 break;
1196 case MPOL_BIND: { 1454 case MPOL_BIND: {
1197 nodemask_t nodes; 1455 nodemask_t nodes;
@@ -1201,7 +1459,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1201 nodes_clear(nodes); 1459 nodes_clear(nodes);
1202 for (z = pol->v.zonelist->zones; *z; z++) 1460 for (z = pol->v.zonelist->zones; *z; z++)
1203 node_set((*z)->zone_pgdat->node_id, nodes); 1461 node_set((*z)->zone_pgdat->node_id, nodes);
1204 nodes_remap(tmp, nodes, *old, *new); 1462 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1205 nodes = tmp; 1463 nodes = tmp;
1206 1464
1207 zonelist = bind_zonelist(&nodes); 1465 zonelist = bind_zonelist(&nodes);
@@ -1216,6 +1474,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1216 kfree(pol->v.zonelist); 1474 kfree(pol->v.zonelist);
1217 pol->v.zonelist = zonelist; 1475 pol->v.zonelist = zonelist;
1218 } 1476 }
1477 *mpolmask = *newmask;
1219 break; 1478 break;
1220 } 1479 }
1221 default: 1480 default:
@@ -1225,12 +1484,156 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1225} 1484}
1226 1485
1227/* 1486/*
1228 * Someone moved this task to different nodes. Fixup mempolicies. 1487 * Wrapper for mpol_rebind_policy() that just requires task
1488 * pointer, and updates task mempolicy.
1489 */
1490
1491void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1492{
1493 mpol_rebind_policy(tsk->mempolicy, new);
1494}
1495
1496/*
1497 * Rebind each vma in mm to new nodemask.
1229 * 1498 *
1230 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, 1499 * Call holding a reference to mm. Takes mm->mmap_sem during call.
1231 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1232 */ 1500 */
1233void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) 1501
1502void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1234{ 1503{
1235 rebind_policy(current->mempolicy, old, new); 1504 struct vm_area_struct *vma;
1505
1506 down_write(&mm->mmap_sem);
1507 for (vma = mm->mmap; vma; vma = vma->vm_next)
1508 mpol_rebind_policy(vma->vm_policy, new);
1509 up_write(&mm->mmap_sem);
1236} 1510}
1511
1512/*
1513 * Display pages allocated per node and memory policy via /proc.
1514 */
1515
1516static const char *policy_types[] = { "default", "prefer", "bind",
1517 "interleave" };
1518
1519/*
1520 * Convert a mempolicy into a string.
1521 * Returns the number of characters in buffer (if positive)
1522 * or an error (negative)
1523 */
1524static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1525{
1526 char *p = buffer;
1527 int l;
1528 nodemask_t nodes;
1529 int mode = pol ? pol->policy : MPOL_DEFAULT;
1530
1531 switch (mode) {
1532 case MPOL_DEFAULT:
1533 nodes_clear(nodes);
1534 break;
1535
1536 case MPOL_PREFERRED:
1537 nodes_clear(nodes);
1538 node_set(pol->v.preferred_node, nodes);
1539 break;
1540
1541 case MPOL_BIND:
1542 get_zonemask(pol, &nodes);
1543 break;
1544
1545 case MPOL_INTERLEAVE:
1546 nodes = pol->v.nodes;
1547 break;
1548
1549 default:
1550 BUG();
1551 return -EFAULT;
1552 }
1553
1554 l = strlen(policy_types[mode]);
1555 if (buffer + maxlen < p + l + 1)
1556 return -ENOSPC;
1557
1558 strcpy(p, policy_types[mode]);
1559 p += l;
1560
1561 if (!nodes_empty(nodes)) {
1562 if (buffer + maxlen < p + 2)
1563 return -ENOSPC;
1564 *p++ = '=';
1565 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1566 }
1567 return p - buffer;
1568}
1569
1570struct numa_maps {
1571 unsigned long pages;
1572 unsigned long anon;
1573 unsigned long mapped;
1574 unsigned long mapcount_max;
1575 unsigned long node[MAX_NUMNODES];
1576};
1577
1578static void gather_stats(struct page *page, void *private)
1579{
1580 struct numa_maps *md = private;
1581 int count = page_mapcount(page);
1582
1583 if (count)
1584 md->mapped++;
1585
1586 if (count > md->mapcount_max)
1587 md->mapcount_max = count;
1588
1589 md->pages++;
1590
1591 if (PageAnon(page))
1592 md->anon++;
1593
1594 md->node[page_to_nid(page)]++;
1595 cond_resched();
1596}
1597
1598int show_numa_map(struct seq_file *m, void *v)
1599{
1600 struct task_struct *task = m->private;
1601 struct vm_area_struct *vma = v;
1602 struct numa_maps *md;
1603 int n;
1604 char buffer[50];
1605
1606 if (!vma->vm_mm)
1607 return 0;
1608
1609 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1610 if (!md)
1611 return 0;
1612
1613 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1614 &node_online_map, MPOL_MF_STATS, md);
1615
1616 if (md->pages) {
1617 mpol_to_str(buffer, sizeof(buffer),
1618 get_vma_policy(task, vma, vma->vm_start));
1619
1620 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1621 vma->vm_start, buffer, md->pages,
1622 md->mapped, md->mapcount_max);
1623
1624 if (md->anon)
1625 seq_printf(m," anon=%lu",md->anon);
1626
1627 for_each_online_node(n)
1628 if (md->node[n])
1629 seq_printf(m, " N%d=%lu", n, md->node[n]);
1630
1631 seq_putc(m, '\n');
1632 }
1633 kfree(md);
1634
1635 if (m->count < m->size)
1636 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1637 return 0;
1638}
1639
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d348b9035955..4748b906aff2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -298,7 +298,8 @@ retry:
298 298
299 /* 299 /*
300 * Give "p" a good chance of killing itself before we 300 * Give "p" a good chance of killing itself before we
301 * retry to allocate memory. 301 * retry to allocate memory unless "p" is current
302 */ 302 */
303 schedule_timeout_interruptible(1); 303 if (!test_thread_flag(TIF_MEMDIE))
304 schedule_timeout_interruptible(1);
304} 305}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fd47494cb989..e0e84924171b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly;
53unsigned long totalram_pages __read_mostly; 53unsigned long totalram_pages __read_mostly;
54unsigned long totalhigh_pages __read_mostly; 54unsigned long totalhigh_pages __read_mostly;
55long nr_swap_pages; 55long nr_swap_pages;
56int percpu_pagelist_fraction;
56 57
57static void fastcall free_hot_cold_page(struct page *page, int cold); 58static void fastcall free_hot_cold_page(struct page *page, int cold);
58 59
@@ -307,7 +308,7 @@ static inline int page_is_buddy(struct page *page, int order)
307 * -- wli 308 * -- wli
308 */ 309 */
309 310
310static inline void __free_pages_bulk (struct page *page, 311static inline void __free_one_page(struct page *page,
311 struct zone *zone, unsigned int order) 312 struct zone *zone, unsigned int order)
312{ 313{
313 unsigned long page_idx; 314 unsigned long page_idx;
@@ -382,40 +383,42 @@ static inline int free_pages_check(struct page *page)
382 * And clear the zone's pages_scanned counter, to hold off the "all pages are 383 * And clear the zone's pages_scanned counter, to hold off the "all pages are
383 * pinned" detection logic. 384 * pinned" detection logic.
384 */ 385 */
385static int 386static void free_pages_bulk(struct zone *zone, int count,
386free_pages_bulk(struct zone *zone, int count, 387 struct list_head *list, int order)
387 struct list_head *list, unsigned int order)
388{ 388{
389 struct page *page = NULL;
390 int ret = 0;
391
392 spin_lock(&zone->lock); 389 spin_lock(&zone->lock);
393 zone->all_unreclaimable = 0; 390 zone->all_unreclaimable = 0;
394 zone->pages_scanned = 0; 391 zone->pages_scanned = 0;
395 while (!list_empty(list) && count--) { 392 while (count--) {
393 struct page *page;
394
395 BUG_ON(list_empty(list));
396 page = list_entry(list->prev, struct page, lru); 396 page = list_entry(list->prev, struct page, lru);
397 /* have to delete it as __free_pages_bulk list manipulates */ 397 /* have to delete it as __free_one_page list manipulates */
398 list_del(&page->lru); 398 list_del(&page->lru);
399 __free_pages_bulk(page, zone, order); 399 __free_one_page(page, zone, order);
400 ret++;
401 } 400 }
402 spin_unlock(&zone->lock); 401 spin_unlock(&zone->lock);
403 return ret;
404} 402}
405 403
406void __free_pages_ok(struct page *page, unsigned int order) 404static void free_one_page(struct zone *zone, struct page *page, int order)
407{ 405{
408 unsigned long flags;
409 LIST_HEAD(list); 406 LIST_HEAD(list);
407 list_add(&page->lru, &list);
408 free_pages_bulk(zone, 1, &list, order);
409}
410
411static void __free_pages_ok(struct page *page, unsigned int order)
412{
413 unsigned long flags;
410 int i; 414 int i;
411 int reserved = 0; 415 int reserved = 0;
412 416
413 arch_free_page(page, order); 417 arch_free_page(page, order);
414 418
415#ifndef CONFIG_MMU 419#ifndef CONFIG_MMU
416 if (order > 0) 420 for (i = 1 ; i < (1 << order) ; ++i)
417 for (i = 1 ; i < (1 << order) ; ++i) 421 __put_page(page + i);
418 __put_page(page + i);
419#endif 422#endif
420 423
421 for (i = 0 ; i < (1 << order) ; ++i) 424 for (i = 0 ; i < (1 << order) ; ++i)
@@ -423,11 +426,10 @@ void __free_pages_ok(struct page *page, unsigned int order)
423 if (reserved) 426 if (reserved)
424 return; 427 return;
425 428
426 list_add(&page->lru, &list); 429 kernel_map_pages(page, 1 << order, 0);
427 kernel_map_pages(page, 1<<order, 0);
428 local_irq_save(flags); 430 local_irq_save(flags);
429 __mod_page_state(pgfree, 1 << order); 431 __mod_page_state(pgfree, 1 << order);
430 free_pages_bulk(page_zone(page), 1, &list, order); 432 free_one_page(page_zone(page), page, order);
431 local_irq_restore(flags); 433 local_irq_restore(flags);
432} 434}
433 435
@@ -596,14 +598,13 @@ void drain_remote_pages(void)
596 if (zone->zone_pgdat->node_id == numa_node_id()) 598 if (zone->zone_pgdat->node_id == numa_node_id())
597 continue; 599 continue;
598 600
599 pset = zone->pageset[smp_processor_id()]; 601 pset = zone_pcp(zone, smp_processor_id());
600 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 602 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
601 struct per_cpu_pages *pcp; 603 struct per_cpu_pages *pcp;
602 604
603 pcp = &pset->pcp[i]; 605 pcp = &pset->pcp[i];
604 if (pcp->count) 606 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
605 pcp->count -= free_pages_bulk(zone, pcp->count, 607 pcp->count = 0;
606 &pcp->list, 0);
607 } 608 }
608 } 609 }
609 local_irq_restore(flags); 610 local_irq_restore(flags);
@@ -626,8 +627,8 @@ static void __drain_pages(unsigned int cpu)
626 627
627 pcp = &pset->pcp[i]; 628 pcp = &pset->pcp[i];
628 local_irq_save(flags); 629 local_irq_save(flags);
629 pcp->count -= free_pages_bulk(zone, pcp->count, 630 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
630 &pcp->list, 0); 631 pcp->count = 0;
631 local_irq_restore(flags); 632 local_irq_restore(flags);
632 } 633 }
633 } 634 }
@@ -718,8 +719,10 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
718 __inc_page_state(pgfree); 719 __inc_page_state(pgfree);
719 list_add(&page->lru, &pcp->list); 720 list_add(&page->lru, &pcp->list);
720 pcp->count++; 721 pcp->count++;
721 if (pcp->count >= pcp->high) 722 if (pcp->count >= pcp->high) {
722 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 723 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
724 pcp->count -= pcp->batch;
725 }
723 local_irq_restore(flags); 726 local_irq_restore(flags);
724 put_cpu(); 727 put_cpu();
725} 728}
@@ -758,7 +761,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist,
758 761
759again: 762again:
760 cpu = get_cpu(); 763 cpu = get_cpu();
761 if (order == 0) { 764 if (likely(order == 0)) {
762 struct per_cpu_pages *pcp; 765 struct per_cpu_pages *pcp;
763 766
764 pcp = &zone_pcp(zone, cpu)->pcp[cold]; 767 pcp = &zone_pcp(zone, cpu)->pcp[cold];
@@ -973,6 +976,7 @@ rebalance:
973 cond_resched(); 976 cond_resched();
974 977
975 /* We now go into synchronous reclaim */ 978 /* We now go into synchronous reclaim */
979 cpuset_memory_pressure_bump();
976 p->flags |= PF_MEMALLOC; 980 p->flags |= PF_MEMALLOC;
977 reclaim_state.reclaimed_slab = 0; 981 reclaim_state.reclaimed_slab = 0;
978 p->reclaim_state = &reclaim_state; 982 p->reclaim_state = &reclaim_state;
@@ -1204,6 +1208,7 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1204 int cpu = 0; 1208 int cpu = 0;
1205 1209
1206 memset(ret, 0, sizeof(*ret)); 1210 memset(ret, 0, sizeof(*ret));
1211 cpus_and(*cpumask, *cpumask, cpu_online_map);
1207 1212
1208 cpu = first_cpu(*cpumask); 1213 cpu = first_cpu(*cpumask);
1209 while (cpu < NR_CPUS) { 1214 while (cpu < NR_CPUS) {
@@ -1256,7 +1261,7 @@ unsigned long read_page_state_offset(unsigned long offset)
1256 unsigned long ret = 0; 1261 unsigned long ret = 0;
1257 int cpu; 1262 int cpu;
1258 1263
1259 for_each_cpu(cpu) { 1264 for_each_online_cpu(cpu) {
1260 unsigned long in; 1265 unsigned long in;
1261 1266
1262 in = (unsigned long)&per_cpu(page_states, cpu) + offset; 1267 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
@@ -1830,6 +1835,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1830 INIT_LIST_HEAD(&pcp->list); 1835 INIT_LIST_HEAD(&pcp->list);
1831} 1836}
1832 1837
1838/*
1839 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
1840 * to the value high for the pageset p.
1841 */
1842
1843static void setup_pagelist_highmark(struct per_cpu_pageset *p,
1844 unsigned long high)
1845{
1846 struct per_cpu_pages *pcp;
1847
1848 pcp = &p->pcp[0]; /* hot list */
1849 pcp->high = high;
1850 pcp->batch = max(1UL, high/4);
1851 if ((high/4) > (PAGE_SHIFT * 8))
1852 pcp->batch = PAGE_SHIFT * 8;
1853}
1854
1855
1833#ifdef CONFIG_NUMA 1856#ifdef CONFIG_NUMA
1834/* 1857/*
1835 * Boot pageset table. One per cpu which is going to be used for all 1858 * Boot pageset table. One per cpu which is going to be used for all
@@ -1861,12 +1884,16 @@ static int __devinit process_zones(int cpu)
1861 1884
1862 for_each_zone(zone) { 1885 for_each_zone(zone) {
1863 1886
1864 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), 1887 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1865 GFP_KERNEL, cpu_to_node(cpu)); 1888 GFP_KERNEL, cpu_to_node(cpu));
1866 if (!zone->pageset[cpu]) 1889 if (!zone_pcp(zone, cpu))
1867 goto bad; 1890 goto bad;
1868 1891
1869 setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); 1892 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
1893
1894 if (percpu_pagelist_fraction)
1895 setup_pagelist_highmark(zone_pcp(zone, cpu),
1896 (zone->present_pages / percpu_pagelist_fraction));
1870 } 1897 }
1871 1898
1872 return 0; 1899 return 0;
@@ -1874,15 +1901,14 @@ bad:
1874 for_each_zone(dzone) { 1901 for_each_zone(dzone) {
1875 if (dzone == zone) 1902 if (dzone == zone)
1876 break; 1903 break;
1877 kfree(dzone->pageset[cpu]); 1904 kfree(zone_pcp(dzone, cpu));
1878 dzone->pageset[cpu] = NULL; 1905 zone_pcp(dzone, cpu) = NULL;
1879 } 1906 }
1880 return -ENOMEM; 1907 return -ENOMEM;
1881} 1908}
1882 1909
1883static inline void free_zone_pagesets(int cpu) 1910static inline void free_zone_pagesets(int cpu)
1884{ 1911{
1885#ifdef CONFIG_NUMA
1886 struct zone *zone; 1912 struct zone *zone;
1887 1913
1888 for_each_zone(zone) { 1914 for_each_zone(zone) {
@@ -1891,7 +1917,6 @@ static inline void free_zone_pagesets(int cpu)
1891 zone_pcp(zone, cpu) = NULL; 1917 zone_pcp(zone, cpu) = NULL;
1892 kfree(pset); 1918 kfree(pset);
1893 } 1919 }
1894#endif
1895} 1920}
1896 1921
1897static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, 1922static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
@@ -1962,7 +1987,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
1962 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1987 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1963#ifdef CONFIG_NUMA 1988#ifdef CONFIG_NUMA
1964 /* Early boot. Slab allocator not functional yet */ 1989 /* Early boot. Slab allocator not functional yet */
1965 zone->pageset[cpu] = &boot_pageset[cpu]; 1990 zone_pcp(zone, cpu) = &boot_pageset[cpu];
1966 setup_pageset(&boot_pageset[cpu],0); 1991 setup_pageset(&boot_pageset[cpu],0);
1967#else 1992#else
1968 setup_pageset(zone_pcp(zone,cpu), batch); 1993 setup_pageset(zone_pcp(zone,cpu), batch);
@@ -2205,7 +2230,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2205 seq_printf(m, 2230 seq_printf(m,
2206 ")" 2231 ")"
2207 "\n pagesets"); 2232 "\n pagesets");
2208 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { 2233 for_each_online_cpu(i) {
2209 struct per_cpu_pageset *pageset; 2234 struct per_cpu_pageset *pageset;
2210 int j; 2235 int j;
2211 2236
@@ -2568,6 +2593,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
2568 return 0; 2593 return 0;
2569} 2594}
2570 2595
2596/*
2597 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
2598 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
2599 * can have before it gets flushed back to buddy allocator.
2600 */
2601
2602int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
2603 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2604{
2605 struct zone *zone;
2606 unsigned int cpu;
2607 int ret;
2608
2609 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2610 if (!write || (ret == -EINVAL))
2611 return ret;
2612 for_each_zone(zone) {
2613 for_each_online_cpu(cpu) {
2614 unsigned long high;
2615 high = zone->present_pages / percpu_pagelist_fraction;
2616 setup_pagelist_highmark(zone_pcp(zone, cpu), high);
2617 }
2618 }
2619 return 0;
2620}
2621
2571__initdata int hashdist = HASHDIST_DEFAULT; 2622__initdata int hashdist = HASHDIST_DEFAULT;
2572 2623
2573#ifdef CONFIG_NUMA 2624#ifdef CONFIG_NUMA
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 52822c98c489..c4b6d0afd736 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -90,7 +90,7 @@ struct pdflush_work {
90 90
91static int __pdflush(struct pdflush_work *my_work) 91static int __pdflush(struct pdflush_work *my_work)
92{ 92{
93 current->flags |= PF_FLUSHER; 93 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
94 my_work->fn = NULL; 94 my_work->fn = NULL;
95 my_work->who = current; 95 my_work->who = current;
96 INIT_LIST_HEAD(&my_work->list); 96 INIT_LIST_HEAD(&my_work->list);
diff --git a/mm/rmap.c b/mm/rmap.c
index 6f3f7db27128..66ec43053a4d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -514,6 +514,13 @@ void page_add_file_rmap(struct page *page)
514void page_remove_rmap(struct page *page) 514void page_remove_rmap(struct page *page)
515{ 515{
516 if (atomic_add_negative(-1, &page->_mapcount)) { 516 if (atomic_add_negative(-1, &page->_mapcount)) {
517 if (page_mapcount(page) < 0) {
518 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
519 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
520 printk (KERN_EMERG " page->count = %x\n", page_count(page));
521 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
522 }
523
517 BUG_ON(page_mapcount(page) < 0); 524 BUG_ON(page_mapcount(page) < 0);
518 /* 525 /*
519 * It would be tidy to reset the PageAnon mapping here, 526 * It would be tidy to reset the PageAnon mapping here,
diff --git a/mm/slab.c b/mm/slab.c
index e5ec26e0c460..1c46c6383552 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -130,7 +130,6 @@
130#define FORCED_DEBUG 0 130#define FORCED_DEBUG 0
131#endif 131#endif
132 132
133
134/* Shouldn't this be in a header file somewhere? */ 133/* Shouldn't this be in a header file somewhere? */
135#define BYTES_PER_WORD sizeof(void *) 134#define BYTES_PER_WORD sizeof(void *)
136 135
@@ -217,12 +216,12 @@ static unsigned long offslab_limit;
217 * Slabs are chained into three list: fully used, partial, fully free slabs. 216 * Slabs are chained into three list: fully used, partial, fully free slabs.
218 */ 217 */
219struct slab { 218struct slab {
220 struct list_head list; 219 struct list_head list;
221 unsigned long colouroff; 220 unsigned long colouroff;
222 void *s_mem; /* including colour offset */ 221 void *s_mem; /* including colour offset */
223 unsigned int inuse; /* num of objs active in slab */ 222 unsigned int inuse; /* num of objs active in slab */
224 kmem_bufctl_t free; 223 kmem_bufctl_t free;
225 unsigned short nodeid; 224 unsigned short nodeid;
226}; 225};
227 226
228/* 227/*
@@ -242,9 +241,9 @@ struct slab {
242 * We assume struct slab_rcu can overlay struct slab when destroying. 241 * We assume struct slab_rcu can overlay struct slab when destroying.
243 */ 242 */
244struct slab_rcu { 243struct slab_rcu {
245 struct rcu_head head; 244 struct rcu_head head;
246 kmem_cache_t *cachep; 245 kmem_cache_t *cachep;
247 void *addr; 246 void *addr;
248}; 247};
249 248
250/* 249/*
@@ -279,23 +278,23 @@ struct array_cache {
279#define BOOT_CPUCACHE_ENTRIES 1 278#define BOOT_CPUCACHE_ENTRIES 1
280struct arraycache_init { 279struct arraycache_init {
281 struct array_cache cache; 280 struct array_cache cache;
282 void * entries[BOOT_CPUCACHE_ENTRIES]; 281 void *entries[BOOT_CPUCACHE_ENTRIES];
283}; 282};
284 283
285/* 284/*
286 * The slab lists for all objects. 285 * The slab lists for all objects.
287 */ 286 */
288struct kmem_list3 { 287struct kmem_list3 {
289 struct list_head slabs_partial; /* partial list first, better asm code */ 288 struct list_head slabs_partial; /* partial list first, better asm code */
290 struct list_head slabs_full; 289 struct list_head slabs_full;
291 struct list_head slabs_free; 290 struct list_head slabs_free;
292 unsigned long free_objects; 291 unsigned long free_objects;
293 unsigned long next_reap; 292 unsigned long next_reap;
294 int free_touched; 293 int free_touched;
295 unsigned int free_limit; 294 unsigned int free_limit;
296 spinlock_t list_lock; 295 spinlock_t list_lock;
297 struct array_cache *shared; /* shared per node */ 296 struct array_cache *shared; /* shared per node */
298 struct array_cache **alien; /* on other nodes */ 297 struct array_cache **alien; /* on other nodes */
299}; 298};
300 299
301/* 300/*
@@ -367,63 +366,63 @@ static inline void kmem_list3_init(struct kmem_list3 *parent)
367 * 366 *
368 * manages a cache. 367 * manages a cache.
369 */ 368 */
370 369
371struct kmem_cache { 370struct kmem_cache {
372/* 1) per-cpu data, touched during every alloc/free */ 371/* 1) per-cpu data, touched during every alloc/free */
373 struct array_cache *array[NR_CPUS]; 372 struct array_cache *array[NR_CPUS];
374 unsigned int batchcount; 373 unsigned int batchcount;
375 unsigned int limit; 374 unsigned int limit;
376 unsigned int shared; 375 unsigned int shared;
377 unsigned int objsize; 376 unsigned int objsize;
378/* 2) touched by every alloc & free from the backend */ 377/* 2) touched by every alloc & free from the backend */
379 struct kmem_list3 *nodelists[MAX_NUMNODES]; 378 struct kmem_list3 *nodelists[MAX_NUMNODES];
380 unsigned int flags; /* constant flags */ 379 unsigned int flags; /* constant flags */
381 unsigned int num; /* # of objs per slab */ 380 unsigned int num; /* # of objs per slab */
382 spinlock_t spinlock; 381 spinlock_t spinlock;
383 382
384/* 3) cache_grow/shrink */ 383/* 3) cache_grow/shrink */
385 /* order of pgs per slab (2^n) */ 384 /* order of pgs per slab (2^n) */
386 unsigned int gfporder; 385 unsigned int gfporder;
387 386
388 /* force GFP flags, e.g. GFP_DMA */ 387 /* force GFP flags, e.g. GFP_DMA */
389 gfp_t gfpflags; 388 gfp_t gfpflags;
390 389
391 size_t colour; /* cache colouring range */ 390 size_t colour; /* cache colouring range */
392 unsigned int colour_off; /* colour offset */ 391 unsigned int colour_off; /* colour offset */
393 unsigned int colour_next; /* cache colouring */ 392 unsigned int colour_next; /* cache colouring */
394 kmem_cache_t *slabp_cache; 393 kmem_cache_t *slabp_cache;
395 unsigned int slab_size; 394 unsigned int slab_size;
396 unsigned int dflags; /* dynamic flags */ 395 unsigned int dflags; /* dynamic flags */
397 396
398 /* constructor func */ 397 /* constructor func */
399 void (*ctor)(void *, kmem_cache_t *, unsigned long); 398 void (*ctor) (void *, kmem_cache_t *, unsigned long);
400 399
401 /* de-constructor func */ 400 /* de-constructor func */
402 void (*dtor)(void *, kmem_cache_t *, unsigned long); 401 void (*dtor) (void *, kmem_cache_t *, unsigned long);
403 402
404/* 4) cache creation/removal */ 403/* 4) cache creation/removal */
405 const char *name; 404 const char *name;
406 struct list_head next; 405 struct list_head next;
407 406
408/* 5) statistics */ 407/* 5) statistics */
409#if STATS 408#if STATS
410 unsigned long num_active; 409 unsigned long num_active;
411 unsigned long num_allocations; 410 unsigned long num_allocations;
412 unsigned long high_mark; 411 unsigned long high_mark;
413 unsigned long grown; 412 unsigned long grown;
414 unsigned long reaped; 413 unsigned long reaped;
415 unsigned long errors; 414 unsigned long errors;
416 unsigned long max_freeable; 415 unsigned long max_freeable;
417 unsigned long node_allocs; 416 unsigned long node_allocs;
418 unsigned long node_frees; 417 unsigned long node_frees;
419 atomic_t allochit; 418 atomic_t allochit;
420 atomic_t allocmiss; 419 atomic_t allocmiss;
421 atomic_t freehit; 420 atomic_t freehit;
422 atomic_t freemiss; 421 atomic_t freemiss;
423#endif 422#endif
424#if DEBUG 423#if DEBUG
425 int dbghead; 424 int dbghead;
426 int reallen; 425 int reallen;
427#endif 426#endif
428}; 427};
429 428
@@ -523,14 +522,15 @@ static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
523{ 522{
524 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 523 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
525 if (cachep->flags & SLAB_STORE_USER) 524 if (cachep->flags & SLAB_STORE_USER)
526 return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD); 525 return (unsigned long *)(objp + cachep->objsize -
527 return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD); 526 2 * BYTES_PER_WORD);
527 return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD);
528} 528}
529 529
530static void **dbg_userword(kmem_cache_t *cachep, void *objp) 530static void **dbg_userword(kmem_cache_t *cachep, void *objp)
531{ 531{
532 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 532 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
533 return (void**)(objp+cachep->objsize-BYTES_PER_WORD); 533 return (void **)(objp + cachep->objsize - BYTES_PER_WORD);
534} 534}
535 535
536#else 536#else
@@ -607,31 +607,31 @@ struct cache_names {
607static struct cache_names __initdata cache_names[] = { 607static struct cache_names __initdata cache_names[] = {
608#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, 608#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
609#include <linux/kmalloc_sizes.h> 609#include <linux/kmalloc_sizes.h>
610 { NULL, } 610 {NULL,}
611#undef CACHE 611#undef CACHE
612}; 612};
613 613
614static struct arraycache_init initarray_cache __initdata = 614static struct arraycache_init initarray_cache __initdata =
615 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 615 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
616static struct arraycache_init initarray_generic = 616static struct arraycache_init initarray_generic =
617 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 617 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
618 618
619/* internal cache of cache description objs */ 619/* internal cache of cache description objs */
620static kmem_cache_t cache_cache = { 620static kmem_cache_t cache_cache = {
621 .batchcount = 1, 621 .batchcount = 1,
622 .limit = BOOT_CPUCACHE_ENTRIES, 622 .limit = BOOT_CPUCACHE_ENTRIES,
623 .shared = 1, 623 .shared = 1,
624 .objsize = sizeof(kmem_cache_t), 624 .objsize = sizeof(kmem_cache_t),
625 .flags = SLAB_NO_REAP, 625 .flags = SLAB_NO_REAP,
626 .spinlock = SPIN_LOCK_UNLOCKED, 626 .spinlock = SPIN_LOCK_UNLOCKED,
627 .name = "kmem_cache", 627 .name = "kmem_cache",
628#if DEBUG 628#if DEBUG
629 .reallen = sizeof(kmem_cache_t), 629 .reallen = sizeof(kmem_cache_t),
630#endif 630#endif
631}; 631};
632 632
633/* Guard access to the cache-chain. */ 633/* Guard access to the cache-chain. */
634static struct semaphore cache_chain_sem; 634static struct semaphore cache_chain_sem;
635static struct list_head cache_chain; 635static struct list_head cache_chain;
636 636
637/* 637/*
@@ -655,9 +655,9 @@ static enum {
655 655
656static DEFINE_PER_CPU(struct work_struct, reap_work); 656static DEFINE_PER_CPU(struct work_struct, reap_work);
657 657
658static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node); 658static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node);
659static void enable_cpucache (kmem_cache_t *cachep); 659static void enable_cpucache(kmem_cache_t *cachep);
660static void cache_reap (void *unused); 660static void cache_reap(void *unused);
661static int __node_shrink(kmem_cache_t *cachep, int node); 661static int __node_shrink(kmem_cache_t *cachep, int node);
662 662
663static inline struct array_cache *ac_data(kmem_cache_t *cachep) 663static inline struct array_cache *ac_data(kmem_cache_t *cachep)
@@ -671,9 +671,9 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
671 671
672#if DEBUG 672#if DEBUG
673 /* This happens if someone tries to call 673 /* This happens if someone tries to call
674 * kmem_cache_create(), or __kmalloc(), before 674 * kmem_cache_create(), or __kmalloc(), before
675 * the generic caches are initialized. 675 * the generic caches are initialized.
676 */ 676 */
677 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 677 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
678#endif 678#endif
679 while (size > csizep->cs_size) 679 while (size > csizep->cs_size)
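
__find_general_cachep() in the hunk above walks the ascending malloc_sizes[] table until it reaches the first cache large enough for the request. A user-space sketch of that lookup; the size table here is an illustrative assumption, not the kernel's kmalloc_sizes.h.

#include <stdio.h>
#include <stddef.h>

/* Ascending bucket sizes, 0-terminated (assumed values). */
static const size_t cs_sizes[] = { 32, 64, 96, 128, 192, 256, 512, 1024, 0 };

static size_t find_general_bucket(size_t size)
{
	const size_t *csizep = cs_sizes;

	/* scan until the bucket is big enough, as in the loop above */
	while (*csizep && size > *csizep)
		csizep++;
	return *csizep;		/* 0 means larger than any bucket here */
}

int main(void)
{
	printf("100 bytes -> %zu-byte bucket\n", find_general_bucket(100));
	printf("600 bytes -> %zu-byte bucket\n", find_general_bucket(600));
	return 0;
}
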
@@ -697,10 +697,10 @@ EXPORT_SYMBOL(kmem_find_general_cachep);
697 697
 698/* Calculate the number of objects, wastage, and bytes left over for a given slab size. */	698/* Calculate the number of objects, wastage, and bytes left over for a given slab size. */
699static void cache_estimate(unsigned long gfporder, size_t size, size_t align, 699static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
700 int flags, size_t *left_over, unsigned int *num) 700 int flags, size_t *left_over, unsigned int *num)
701{ 701{
702 int i; 702 int i;
703 size_t wastage = PAGE_SIZE<<gfporder; 703 size_t wastage = PAGE_SIZE << gfporder;
704 size_t extra = 0; 704 size_t extra = 0;
705 size_t base = 0; 705 size_t base = 0;
706 706
@@ -709,7 +709,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
709 extra = sizeof(kmem_bufctl_t); 709 extra = sizeof(kmem_bufctl_t);
710 } 710 }
711 i = 0; 711 i = 0;
712 while (i*size + ALIGN(base+i*extra, align) <= wastage) 712 while (i * size + ALIGN(base + i * extra, align) <= wastage)
713 i++; 713 i++;
714 if (i > 0) 714 if (i > 0)
715 i--; 715 i--;
@@ -718,8 +718,8 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
718 i = SLAB_LIMIT; 718 i = SLAB_LIMIT;
719 719
720 *num = i; 720 *num = i;
721 wastage -= i*size; 721 wastage -= i * size;
722 wastage -= ALIGN(base+i*extra, align); 722 wastage -= ALIGN(base + i * extra, align);
723 *left_over = wastage; 723 *left_over = wastage;
724} 724}
725 725
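
cache_estimate(), tidied in the hunks above, packs as many objects as fit into PAGE_SIZE << gfporder bytes, counting a per-object bufctl plus the slab header when management is on-slab, and reports the leftover bytes. A user-space sketch of the same arithmetic; PAGE_SIZE_SK and the header/bufctl sizes passed in main() are assumed constants, not the kernel's values.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE_SK   4096UL
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

/* How many objects of 'size' bytes fit in 2^order pages when each
 * object also needs 'extra' bookkeeping bytes and a header of 'base'
 * bytes shares the same pages. */
static void estimate(unsigned order, size_t size, size_t align,
		     size_t base, size_t extra,
		     unsigned *num, size_t *left_over)
{
	size_t wastage = PAGE_SIZE_SK << order;
	unsigned i = 0;

	while (i * size + ALIGN_UP(base + i * extra, align) <= wastage)
		i++;
	if (i > 0)
		i--;

	*num = i;
	wastage -= i * size;
	wastage -= ALIGN_UP(base + i * extra, align);
	*left_over = wastage;
}

int main(void)
{
	unsigned num;
	size_t left;

	/* 200-byte objects, 32-byte header, 4-byte bufctl per object
	 * (illustrative numbers) */
	estimate(0, 200, sizeof(void *), 32, 4, &num, &left);
	printf("%u objects per slab, %zu bytes left over\n", num, left);
	return 0;
}
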
@@ -728,7 +728,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
728static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) 728static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
729{ 729{
730 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 730 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
731 function, cachep->name, msg); 731 function, cachep->name, msg);
732 dump_stack(); 732 dump_stack();
733} 733}
734 734
@@ -755,9 +755,9 @@ static void __devinit start_cpu_timer(int cpu)
755} 755}
756 756
757static struct array_cache *alloc_arraycache(int node, int entries, 757static struct array_cache *alloc_arraycache(int node, int entries,
758 int batchcount) 758 int batchcount)
759{ 759{
760 int memsize = sizeof(void*)*entries+sizeof(struct array_cache); 760 int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
761 struct array_cache *nc = NULL; 761 struct array_cache *nc = NULL;
762 762
763 nc = kmalloc_node(memsize, GFP_KERNEL, node); 763 nc = kmalloc_node(memsize, GFP_KERNEL, node);
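
alloc_arraycache() above sizes one allocation as a small header plus 'entries' object pointers. A user-space sketch of that layout using a flexible array member; the field set is an assumption based only on the fields this file touches.

#include <stdio.h>
#include <stdlib.h>

struct array_cache_sk {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	void *entry[];		/* per-CPU stack of free object pointers */
};

static struct array_cache_sk *alloc_arraycache_sk(int entries, int batchcount)
{
	size_t memsize = sizeof(void *) * entries +
			 sizeof(struct array_cache_sk);
	struct array_cache_sk *nc = malloc(memsize);

	if (nc) {
		nc->avail = 0;
		nc->limit = entries;
		nc->batchcount = batchcount;
		nc->touched = 0;
	}
	return nc;
}

int main(void)
{
	struct array_cache_sk *nc = alloc_arraycache_sk(16, 8);

	if (!nc)
		return 1;
	printf("limit=%u batchcount=%u\n", nc->limit, nc->batchcount);
	free(nc);
	return 0;
}
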
@@ -775,7 +775,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
775static inline struct array_cache **alloc_alien_cache(int node, int limit) 775static inline struct array_cache **alloc_alien_cache(int node, int limit)
776{ 776{
777 struct array_cache **ac_ptr; 777 struct array_cache **ac_ptr;
778 int memsize = sizeof(void*)*MAX_NUMNODES; 778 int memsize = sizeof(void *) * MAX_NUMNODES;
779 int i; 779 int i;
780 780
781 if (limit > 1) 781 if (limit > 1)
@@ -789,7 +789,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit)
789 } 789 }
790 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 790 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
791 if (!ac_ptr[i]) { 791 if (!ac_ptr[i]) {
792 for (i--; i <=0; i--) 792 for (i--; i <= 0; i--)
793 kfree(ac_ptr[i]); 793 kfree(ac_ptr[i]);
794 kfree(ac_ptr); 794 kfree(ac_ptr);
795 return NULL; 795 return NULL;
@@ -807,12 +807,13 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
807 return; 807 return;
808 808
809 for_each_node(i) 809 for_each_node(i)
810 kfree(ac_ptr[i]); 810 kfree(ac_ptr[i]);
811 811
812 kfree(ac_ptr); 812 kfree(ac_ptr);
813} 813}
814 814
815static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node) 815static inline void __drain_alien_cache(kmem_cache_t *cachep,
816 struct array_cache *ac, int node)
816{ 817{
817 struct kmem_list3 *rl3 = cachep->nodelists[node]; 818 struct kmem_list3 *rl3 = cachep->nodelists[node];
818 819
@@ -826,7 +827,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache
826 827
827static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) 828static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
828{ 829{
829 int i=0; 830 int i = 0;
830 struct array_cache *ac; 831 struct array_cache *ac;
831 unsigned long flags; 832 unsigned long flags;
832 833
@@ -846,14 +847,13 @@ static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
846#endif 847#endif
847 848
848static int __devinit cpuup_callback(struct notifier_block *nfb, 849static int __devinit cpuup_callback(struct notifier_block *nfb,
849 unsigned long action, void *hcpu) 850 unsigned long action, void *hcpu)
850{ 851{
851 long cpu = (long)hcpu; 852 long cpu = (long)hcpu;
852 kmem_cache_t* cachep; 853 kmem_cache_t *cachep;
853 struct kmem_list3 *l3 = NULL; 854 struct kmem_list3 *l3 = NULL;
854 int node = cpu_to_node(cpu); 855 int node = cpu_to_node(cpu);
855 int memsize = sizeof(struct kmem_list3); 856 int memsize = sizeof(struct kmem_list3);
856 struct array_cache *nc = NULL;
857 857
858 switch (action) { 858 switch (action) {
859 case CPU_UP_PREPARE: 859 case CPU_UP_PREPARE:
@@ -871,27 +871,29 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
871 */ 871 */
872 if (!cachep->nodelists[node]) { 872 if (!cachep->nodelists[node]) {
873 if (!(l3 = kmalloc_node(memsize, 873 if (!(l3 = kmalloc_node(memsize,
874 GFP_KERNEL, node))) 874 GFP_KERNEL, node)))
875 goto bad; 875 goto bad;
876 kmem_list3_init(l3); 876 kmem_list3_init(l3);
877 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 877 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
878 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 878 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
879 879
880 cachep->nodelists[node] = l3; 880 cachep->nodelists[node] = l3;
881 } 881 }
882 882
883 spin_lock_irq(&cachep->nodelists[node]->list_lock); 883 spin_lock_irq(&cachep->nodelists[node]->list_lock);
884 cachep->nodelists[node]->free_limit = 884 cachep->nodelists[node]->free_limit =
885 (1 + nr_cpus_node(node)) * 885 (1 + nr_cpus_node(node)) *
886 cachep->batchcount + cachep->num; 886 cachep->batchcount + cachep->num;
887 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 887 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
888 } 888 }
889 889
 890	/* Now we can go ahead with allocating the shared arrays	890		/* Now we can go ahead with allocating the shared arrays
 891	   & array caches */	891		   & array caches */
892 list_for_each_entry(cachep, &cache_chain, next) { 892 list_for_each_entry(cachep, &cache_chain, next) {
893 struct array_cache *nc;
894
893 nc = alloc_arraycache(node, cachep->limit, 895 nc = alloc_arraycache(node, cachep->limit,
894 cachep->batchcount); 896 cachep->batchcount);
895 if (!nc) 897 if (!nc)
896 goto bad; 898 goto bad;
897 cachep->array[cpu] = nc; 899 cachep->array[cpu] = nc;
@@ -900,12 +902,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
900 BUG_ON(!l3); 902 BUG_ON(!l3);
901 if (!l3->shared) { 903 if (!l3->shared) {
902 if (!(nc = alloc_arraycache(node, 904 if (!(nc = alloc_arraycache(node,
903 cachep->shared*cachep->batchcount, 905 cachep->shared *
904 0xbaadf00d))) 906 cachep->batchcount,
905 goto bad; 907 0xbaadf00d)))
908 goto bad;
906 909
907 /* we are serialised from CPU_DEAD or 910 /* we are serialised from CPU_DEAD or
908 CPU_UP_CANCELLED by the cpucontrol lock */ 911 CPU_UP_CANCELLED by the cpucontrol lock */
909 l3->shared = nc; 912 l3->shared = nc;
910 } 913 }
911 } 914 }
@@ -942,13 +945,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
942 free_block(cachep, nc->entry, nc->avail, node); 945 free_block(cachep, nc->entry, nc->avail, node);
943 946
944 if (!cpus_empty(mask)) { 947 if (!cpus_empty(mask)) {
945 spin_unlock(&l3->list_lock); 948 spin_unlock(&l3->list_lock);
946 goto unlock_cache; 949 goto unlock_cache;
947 } 950 }
948 951
949 if (l3->shared) { 952 if (l3->shared) {
950 free_block(cachep, l3->shared->entry, 953 free_block(cachep, l3->shared->entry,
951 l3->shared->avail, node); 954 l3->shared->avail, node);
952 kfree(l3->shared); 955 kfree(l3->shared);
953 l3->shared = NULL; 956 l3->shared = NULL;
954 } 957 }
@@ -966,7 +969,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
966 } else { 969 } else {
967 spin_unlock(&l3->list_lock); 970 spin_unlock(&l3->list_lock);
968 } 971 }
969unlock_cache: 972 unlock_cache:
970 spin_unlock_irq(&cachep->spinlock); 973 spin_unlock_irq(&cachep->spinlock);
971 kfree(nc); 974 kfree(nc);
972 } 975 }
@@ -975,7 +978,7 @@ unlock_cache:
975#endif 978#endif
976 } 979 }
977 return NOTIFY_OK; 980 return NOTIFY_OK;
978bad: 981 bad:
979 up(&cache_chain_sem); 982 up(&cache_chain_sem);
980 return NOTIFY_BAD; 983 return NOTIFY_BAD;
981} 984}
@@ -985,8 +988,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
985/* 988/*
986 * swap the static kmem_list3 with kmalloced memory 989 * swap the static kmem_list3 with kmalloced memory
987 */ 990 */
988static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, 991static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid)
989 int nodeid)
990{ 992{
991 struct kmem_list3 *ptr; 993 struct kmem_list3 *ptr;
992 994
@@ -1055,14 +1057,14 @@ void __init kmem_cache_init(void)
1055 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); 1057 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
1056 1058
1057 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, 1059 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
1058 &left_over, &cache_cache.num); 1060 &left_over, &cache_cache.num);
1059 if (!cache_cache.num) 1061 if (!cache_cache.num)
1060 BUG(); 1062 BUG();
1061 1063
1062 cache_cache.colour = left_over/cache_cache.colour_off; 1064 cache_cache.colour = left_over / cache_cache.colour_off;
1063 cache_cache.colour_next = 0; 1065 cache_cache.colour_next = 0;
1064 cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + 1066 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1065 sizeof(struct slab), cache_line_size()); 1067 sizeof(struct slab), cache_line_size());
1066 1068
1067 /* 2+3) create the kmalloc caches */ 1069 /* 2+3) create the kmalloc caches */
1068 sizes = malloc_sizes; 1070 sizes = malloc_sizes;
@@ -1074,14 +1076,18 @@ void __init kmem_cache_init(void)
1074 */ 1076 */
1075 1077
1076 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1078 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1077 sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, 1079 sizes[INDEX_AC].cs_size,
1078 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1080 ARCH_KMALLOC_MINALIGN,
1081 (ARCH_KMALLOC_FLAGS |
1082 SLAB_PANIC), NULL, NULL);
1079 1083
1080 if (INDEX_AC != INDEX_L3) 1084 if (INDEX_AC != INDEX_L3)
1081 sizes[INDEX_L3].cs_cachep = 1085 sizes[INDEX_L3].cs_cachep =
1082 kmem_cache_create(names[INDEX_L3].name, 1086 kmem_cache_create(names[INDEX_L3].name,
1083 sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, 1087 sizes[INDEX_L3].cs_size,
1084 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1088 ARCH_KMALLOC_MINALIGN,
1089 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
1090 NULL);
1085 1091
1086 while (sizes->cs_size != ULONG_MAX) { 1092 while (sizes->cs_size != ULONG_MAX) {
1087 /* 1093 /*
@@ -1091,35 +1097,41 @@ void __init kmem_cache_init(void)
1091 * Note for systems short on memory removing the alignment will 1097 * Note for systems short on memory removing the alignment will
1092 * allow tighter packing of the smaller caches. 1098 * allow tighter packing of the smaller caches.
1093 */ 1099 */
1094 if(!sizes->cs_cachep) 1100 if (!sizes->cs_cachep)
1095 sizes->cs_cachep = kmem_cache_create(names->name, 1101 sizes->cs_cachep = kmem_cache_create(names->name,
1096 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1102 sizes->cs_size,
1097 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1103 ARCH_KMALLOC_MINALIGN,
1104 (ARCH_KMALLOC_FLAGS
1105 | SLAB_PANIC),
1106 NULL, NULL);
1098 1107
1099 /* Inc off-slab bufctl limit until the ceiling is hit. */ 1108 /* Inc off-slab bufctl limit until the ceiling is hit. */
1100 if (!(OFF_SLAB(sizes->cs_cachep))) { 1109 if (!(OFF_SLAB(sizes->cs_cachep))) {
1101 offslab_limit = sizes->cs_size-sizeof(struct slab); 1110 offslab_limit = sizes->cs_size - sizeof(struct slab);
1102 offslab_limit /= sizeof(kmem_bufctl_t); 1111 offslab_limit /= sizeof(kmem_bufctl_t);
1103 } 1112 }
1104 1113
1105 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1114 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1106 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1115 sizes->cs_size,
1107 (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), 1116 ARCH_KMALLOC_MINALIGN,
1108 NULL, NULL); 1117 (ARCH_KMALLOC_FLAGS |
1118 SLAB_CACHE_DMA |
1119 SLAB_PANIC), NULL,
1120 NULL);
1109 1121
1110 sizes++; 1122 sizes++;
1111 names++; 1123 names++;
1112 } 1124 }
1113 /* 4) Replace the bootstrap head arrays */ 1125 /* 4) Replace the bootstrap head arrays */
1114 { 1126 {
1115 void * ptr; 1127 void *ptr;
1116 1128
1117 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1129 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1118 1130
1119 local_irq_disable(); 1131 local_irq_disable();
1120 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); 1132 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
1121 memcpy(ptr, ac_data(&cache_cache), 1133 memcpy(ptr, ac_data(&cache_cache),
1122 sizeof(struct arraycache_init)); 1134 sizeof(struct arraycache_init));
1123 cache_cache.array[smp_processor_id()] = ptr; 1135 cache_cache.array[smp_processor_id()] = ptr;
1124 local_irq_enable(); 1136 local_irq_enable();
1125 1137
@@ -1127,11 +1139,11 @@ void __init kmem_cache_init(void)
1127 1139
1128 local_irq_disable(); 1140 local_irq_disable();
1129 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) 1141 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
1130 != &initarray_generic.cache); 1142 != &initarray_generic.cache);
1131 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), 1143 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
1132 sizeof(struct arraycache_init)); 1144 sizeof(struct arraycache_init));
1133 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1145 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1134 ptr; 1146 ptr;
1135 local_irq_enable(); 1147 local_irq_enable();
1136 } 1148 }
1137 /* 5) Replace the bootstrap kmem_list3's */ 1149 /* 5) Replace the bootstrap kmem_list3's */
@@ -1139,16 +1151,16 @@ void __init kmem_cache_init(void)
1139 int node; 1151 int node;
1140 /* Replace the static kmem_list3 structures for the boot cpu */ 1152 /* Replace the static kmem_list3 structures for the boot cpu */
1141 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], 1153 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
1142 numa_node_id()); 1154 numa_node_id());
1143 1155
1144 for_each_online_node(node) { 1156 for_each_online_node(node) {
1145 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1157 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1146 &initkmem_list3[SIZE_AC+node], node); 1158 &initkmem_list3[SIZE_AC + node], node);
1147 1159
1148 if (INDEX_AC != INDEX_L3) { 1160 if (INDEX_AC != INDEX_L3) {
1149 init_list(malloc_sizes[INDEX_L3].cs_cachep, 1161 init_list(malloc_sizes[INDEX_L3].cs_cachep,
1150 &initkmem_list3[SIZE_L3+node], 1162 &initkmem_list3[SIZE_L3 + node],
1151 node); 1163 node);
1152 } 1164 }
1153 } 1165 }
1154 } 1166 }
@@ -1158,7 +1170,7 @@ void __init kmem_cache_init(void)
1158 kmem_cache_t *cachep; 1170 kmem_cache_t *cachep;
1159 down(&cache_chain_sem); 1171 down(&cache_chain_sem);
1160 list_for_each_entry(cachep, &cache_chain, next) 1172 list_for_each_entry(cachep, &cache_chain, next)
1161 enable_cpucache(cachep); 1173 enable_cpucache(cachep);
1162 up(&cache_chain_sem); 1174 up(&cache_chain_sem);
1163 } 1175 }
1164 1176
@@ -1184,7 +1196,7 @@ static int __init cpucache_init(void)
1184 * pages to gfp. 1196 * pages to gfp.
1185 */ 1197 */
1186 for_each_online_cpu(cpu) 1198 for_each_online_cpu(cpu)
1187 start_cpu_timer(cpu); 1199 start_cpu_timer(cpu);
1188 1200
1189 return 0; 1201 return 0;
1190} 1202}
@@ -1226,7 +1238,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
1226 */ 1238 */
1227static void kmem_freepages(kmem_cache_t *cachep, void *addr) 1239static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1228{ 1240{
1229 unsigned long i = (1<<cachep->gfporder); 1241 unsigned long i = (1 << cachep->gfporder);
1230 struct page *page = virt_to_page(addr); 1242 struct page *page = virt_to_page(addr);
1231 const unsigned long nr_freed = i; 1243 const unsigned long nr_freed = i;
1232 1244
@@ -1239,13 +1251,13 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1239 if (current->reclaim_state) 1251 if (current->reclaim_state)
1240 current->reclaim_state->reclaimed_slab += nr_freed; 1252 current->reclaim_state->reclaimed_slab += nr_freed;
1241 free_pages((unsigned long)addr, cachep->gfporder); 1253 free_pages((unsigned long)addr, cachep->gfporder);
1242 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1254 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1243 atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages); 1255 atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1244} 1256}
1245 1257
1246static void kmem_rcu_free(struct rcu_head *head) 1258static void kmem_rcu_free(struct rcu_head *head)
1247{ 1259{
1248 struct slab_rcu *slab_rcu = (struct slab_rcu *) head; 1260 struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1249 kmem_cache_t *cachep = slab_rcu->cachep; 1261 kmem_cache_t *cachep = slab_rcu->cachep;
1250 1262
1251 kmem_freepages(cachep, slab_rcu->addr); 1263 kmem_freepages(cachep, slab_rcu->addr);
@@ -1257,19 +1269,19 @@ static void kmem_rcu_free(struct rcu_head *head)
1257 1269
1258#ifdef CONFIG_DEBUG_PAGEALLOC 1270#ifdef CONFIG_DEBUG_PAGEALLOC
1259static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, 1271static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1260 unsigned long caller) 1272 unsigned long caller)
1261{ 1273{
1262 int size = obj_reallen(cachep); 1274 int size = obj_reallen(cachep);
1263 1275
1264 addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; 1276 addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)];
1265 1277
1266 if (size < 5*sizeof(unsigned long)) 1278 if (size < 5 * sizeof(unsigned long))
1267 return; 1279 return;
1268 1280
1269 *addr++=0x12345678; 1281 *addr++ = 0x12345678;
1270 *addr++=caller; 1282 *addr++ = caller;
1271 *addr++=smp_processor_id(); 1283 *addr++ = smp_processor_id();
1272 size -= 3*sizeof(unsigned long); 1284 size -= 3 * sizeof(unsigned long);
1273 { 1285 {
1274 unsigned long *sptr = &caller; 1286 unsigned long *sptr = &caller;
1275 unsigned long svalue; 1287 unsigned long svalue;
@@ -1277,7 +1289,7 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1277 while (!kstack_end(sptr)) { 1289 while (!kstack_end(sptr)) {
1278 svalue = *sptr++; 1290 svalue = *sptr++;
1279 if (kernel_text_address(svalue)) { 1291 if (kernel_text_address(svalue)) {
1280 *addr++=svalue; 1292 *addr++ = svalue;
1281 size -= sizeof(unsigned long); 1293 size -= sizeof(unsigned long);
1282 if (size <= sizeof(unsigned long)) 1294 if (size <= sizeof(unsigned long))
1283 break; 1295 break;
@@ -1285,25 +1297,25 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1285 } 1297 }
1286 1298
1287 } 1299 }
1288 *addr++=0x87654321; 1300 *addr++ = 0x87654321;
1289} 1301}
1290#endif 1302#endif
1291 1303
1292static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) 1304static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
1293{ 1305{
1294 int size = obj_reallen(cachep); 1306 int size = obj_reallen(cachep);
1295 addr = &((char*)addr)[obj_dbghead(cachep)]; 1307 addr = &((char *)addr)[obj_dbghead(cachep)];
1296 1308
1297 memset(addr, val, size); 1309 memset(addr, val, size);
1298 *(unsigned char *)(addr+size-1) = POISON_END; 1310 *(unsigned char *)(addr + size - 1) = POISON_END;
1299} 1311}
1300 1312
1301static void dump_line(char *data, int offset, int limit) 1313static void dump_line(char *data, int offset, int limit)
1302{ 1314{
1303 int i; 1315 int i;
1304 printk(KERN_ERR "%03x:", offset); 1316 printk(KERN_ERR "%03x:", offset);
1305 for (i=0;i<limit;i++) { 1317 for (i = 0; i < limit; i++) {
1306 printk(" %02x", (unsigned char)data[offset+i]); 1318 printk(" %02x", (unsigned char)data[offset + i]);
1307 } 1319 }
1308 printk("\n"); 1320 printk("\n");
1309} 1321}
@@ -1318,24 +1330,24 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
1318 1330
1319 if (cachep->flags & SLAB_RED_ZONE) { 1331 if (cachep->flags & SLAB_RED_ZONE) {
1320 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1332 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1321 *dbg_redzone1(cachep, objp), 1333 *dbg_redzone1(cachep, objp),
1322 *dbg_redzone2(cachep, objp)); 1334 *dbg_redzone2(cachep, objp));
1323 } 1335 }
1324 1336
1325 if (cachep->flags & SLAB_STORE_USER) { 1337 if (cachep->flags & SLAB_STORE_USER) {
1326 printk(KERN_ERR "Last user: [<%p>]", 1338 printk(KERN_ERR "Last user: [<%p>]",
1327 *dbg_userword(cachep, objp)); 1339 *dbg_userword(cachep, objp));
1328 print_symbol("(%s)", 1340 print_symbol("(%s)",
1329 (unsigned long)*dbg_userword(cachep, objp)); 1341 (unsigned long)*dbg_userword(cachep, objp));
1330 printk("\n"); 1342 printk("\n");
1331 } 1343 }
1332 realobj = (char*)objp+obj_dbghead(cachep); 1344 realobj = (char *)objp + obj_dbghead(cachep);
1333 size = obj_reallen(cachep); 1345 size = obj_reallen(cachep);
1334 for (i=0; i<size && lines;i+=16, lines--) { 1346 for (i = 0; i < size && lines; i += 16, lines--) {
1335 int limit; 1347 int limit;
1336 limit = 16; 1348 limit = 16;
1337 if (i+limit > size) 1349 if (i + limit > size)
1338 limit = size-i; 1350 limit = size - i;
1339 dump_line(realobj, i, limit); 1351 dump_line(realobj, i, limit);
1340 } 1352 }
1341} 1353}
@@ -1346,27 +1358,28 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1346 int size, i; 1358 int size, i;
1347 int lines = 0; 1359 int lines = 0;
1348 1360
1349 realobj = (char*)objp+obj_dbghead(cachep); 1361 realobj = (char *)objp + obj_dbghead(cachep);
1350 size = obj_reallen(cachep); 1362 size = obj_reallen(cachep);
1351 1363
1352 for (i=0;i<size;i++) { 1364 for (i = 0; i < size; i++) {
1353 char exp = POISON_FREE; 1365 char exp = POISON_FREE;
1354 if (i == size-1) 1366 if (i == size - 1)
1355 exp = POISON_END; 1367 exp = POISON_END;
1356 if (realobj[i] != exp) { 1368 if (realobj[i] != exp) {
1357 int limit; 1369 int limit;
1358 /* Mismatch ! */ 1370 /* Mismatch ! */
1359 /* Print header */ 1371 /* Print header */
1360 if (lines == 0) { 1372 if (lines == 0) {
1361 printk(KERN_ERR "Slab corruption: start=%p, len=%d\n", 1373 printk(KERN_ERR
1362 realobj, size); 1374 "Slab corruption: start=%p, len=%d\n",
1375 realobj, size);
1363 print_objinfo(cachep, objp, 0); 1376 print_objinfo(cachep, objp, 0);
1364 } 1377 }
1365 /* Hexdump the affected line */ 1378 /* Hexdump the affected line */
1366 i = (i/16)*16; 1379 i = (i / 16) * 16;
1367 limit = 16; 1380 limit = 16;
1368 if (i+limit > size) 1381 if (i + limit > size)
1369 limit = size-i; 1382 limit = size - i;
1370 dump_line(realobj, i, limit); 1383 dump_line(realobj, i, limit);
1371 i += 16; 1384 i += 16;
1372 lines++; 1385 lines++;
@@ -1382,19 +1395,19 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1382 struct slab *slabp = page_get_slab(virt_to_page(objp)); 1395 struct slab *slabp = page_get_slab(virt_to_page(objp));
1383 int objnr; 1396 int objnr;
1384 1397
1385 objnr = (objp-slabp->s_mem)/cachep->objsize; 1398 objnr = (objp - slabp->s_mem) / cachep->objsize;
1386 if (objnr) { 1399 if (objnr) {
1387 objp = slabp->s_mem+(objnr-1)*cachep->objsize; 1400 objp = slabp->s_mem + (objnr - 1) * cachep->objsize;
1388 realobj = (char*)objp+obj_dbghead(cachep); 1401 realobj = (char *)objp + obj_dbghead(cachep);
1389 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1402 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1390 realobj, size); 1403 realobj, size);
1391 print_objinfo(cachep, objp, 2); 1404 print_objinfo(cachep, objp, 2);
1392 } 1405 }
1393 if (objnr+1 < cachep->num) { 1406 if (objnr + 1 < cachep->num) {
1394 objp = slabp->s_mem+(objnr+1)*cachep->objsize; 1407 objp = slabp->s_mem + (objnr + 1) * cachep->objsize;
1395 realobj = (char*)objp+obj_dbghead(cachep); 1408 realobj = (char *)objp + obj_dbghead(cachep);
1396 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1409 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1397 realobj, size); 1410 realobj, size);
1398 print_objinfo(cachep, objp, 2); 1411 print_objinfo(cachep, objp, 2);
1399 } 1412 }
1400 } 1413 }
@@ -1405,7 +1418,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1405 * Before calling, the slab must have been unlinked from the cache.	1418 * Before calling, the slab must have been unlinked from the cache.
1406 * The cache-lock is not held/needed. 1419 * The cache-lock is not held/needed.
1407 */ 1420 */
1408static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) 1421static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
1409{ 1422{
1410 void *addr = slabp->s_mem - slabp->colouroff; 1423 void *addr = slabp->s_mem - slabp->colouroff;
1411 1424
@@ -1416,8 +1429,11 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1416 1429
1417 if (cachep->flags & SLAB_POISON) { 1430 if (cachep->flags & SLAB_POISON) {
1418#ifdef CONFIG_DEBUG_PAGEALLOC 1431#ifdef CONFIG_DEBUG_PAGEALLOC
1419 if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) 1432 if ((cachep->objsize % PAGE_SIZE) == 0
1420 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); 1433 && OFF_SLAB(cachep))
1434 kernel_map_pages(virt_to_page(objp),
1435 cachep->objsize / PAGE_SIZE,
1436 1);
1421 else 1437 else
1422 check_poison_obj(cachep, objp); 1438 check_poison_obj(cachep, objp);
1423#else 1439#else
@@ -1427,20 +1443,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1427 if (cachep->flags & SLAB_RED_ZONE) { 1443 if (cachep->flags & SLAB_RED_ZONE) {
1428 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1444 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1429 slab_error(cachep, "start of a freed object " 1445 slab_error(cachep, "start of a freed object "
1430 "was overwritten"); 1446 "was overwritten");
1431 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1447 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1432 slab_error(cachep, "end of a freed object " 1448 slab_error(cachep, "end of a freed object "
1433 "was overwritten"); 1449 "was overwritten");
1434 } 1450 }
1435 if (cachep->dtor && !(cachep->flags & SLAB_POISON)) 1451 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1436 (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); 1452 (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0);
1437 } 1453 }
1438#else 1454#else
1439 if (cachep->dtor) { 1455 if (cachep->dtor) {
1440 int i; 1456 int i;
1441 for (i = 0; i < cachep->num; i++) { 1457 for (i = 0; i < cachep->num; i++) {
1442 void* objp = slabp->s_mem+cachep->objsize*i; 1458 void *objp = slabp->s_mem + cachep->objsize * i;
1443 (cachep->dtor)(objp, cachep, 0); 1459 (cachep->dtor) (objp, cachep, 0);
1444 } 1460 }
1445 } 1461 }
1446#endif 1462#endif
@@ -1448,7 +1464,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1448 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1464 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1449 struct slab_rcu *slab_rcu; 1465 struct slab_rcu *slab_rcu;
1450 1466
1451 slab_rcu = (struct slab_rcu *) slabp; 1467 slab_rcu = (struct slab_rcu *)slabp;
1452 slab_rcu->cachep = cachep; 1468 slab_rcu->cachep = cachep;
1453 slab_rcu->addr = addr; 1469 slab_rcu->addr = addr;
1454 call_rcu(&slab_rcu->head, kmem_rcu_free); 1470 call_rcu(&slab_rcu->head, kmem_rcu_free);
@@ -1466,11 +1482,58 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index)
1466 int node; 1482 int node;
1467 1483
1468 for_each_online_node(node) { 1484 for_each_online_node(node) {
1469 cachep->nodelists[node] = &initkmem_list3[index+node]; 1485 cachep->nodelists[node] = &initkmem_list3[index + node];
1470 cachep->nodelists[node]->next_reap = jiffies + 1486 cachep->nodelists[node]->next_reap = jiffies +
1471 REAPTIMEOUT_LIST3 + 1487 REAPTIMEOUT_LIST3 +
1472 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 1488 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1489 }
1490}
1491
1492/**
1493 * calculate_slab_order - calculate size (page order) of slabs and the number
1494 * of objects per slab.
1495 *
1496 * This could be made much more intelligent. For now, try to avoid using
1497 * high order pages for slabs. When the gfp() functions are more friendly
1498 * towards high-order requests, this should be changed.
1499 */
1500static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
1501 size_t align, gfp_t flags)
1502{
1503 size_t left_over = 0;
1504
1505 for (;; cachep->gfporder++) {
1506 unsigned int num;
1507 size_t remainder;
1508
1509 if (cachep->gfporder > MAX_GFP_ORDER) {
1510 cachep->num = 0;
1511 break;
1512 }
1513
1514 cache_estimate(cachep->gfporder, size, align, flags,
1515 &remainder, &num);
1516 if (!num)
1517 continue;
1518 /* More than offslab_limit objects will cause problems */
1519 if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
1520 break;
1521
1522 cachep->num = num;
1523 left_over = remainder;
1524
1525 /*
1526 * Large number of objects is good, but very large slabs are
1527 * currently bad for the gfp()s.
1528 */
1529 if (cachep->gfporder >= slab_break_gfp_order)
1530 break;
1531
1532 if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
1533 /* Acceptable internal fragmentation */
1534 break;
1473 } 1535 }
1536 return left_over;
1474} 1537}
1475 1538
1476/** 1539/**
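
The new calculate_slab_order() helper above replaces the open-coded loop deleted from kmem_cache_create() further down: it raises the page order until at least one object fits, then stops as soon as the order reaches slab_break_gfp_order or the leftover space drops to at most 1/8 of the slab. A compressed user-space sketch of that policy; the _SK constants stand in for PAGE_SIZE, MAX_GFP_ORDER and slab_break_gfp_order, and slab-management overhead is ignored for brevity.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE_SK	4096UL	/* assumed page size */
#define MAX_ORDER_SK	5	/* stands in for MAX_GFP_ORDER */
#define BREAK_ORDER_SK	1	/* stands in for slab_break_gfp_order */

static unsigned pick_order(size_t size, unsigned *objs, size_t *left)
{
	unsigned order;

	*objs = 0;
	*left = 0;
	for (order = 0; order <= MAX_ORDER_SK; order++) {
		size_t slab_bytes = PAGE_SIZE_SK << order;
		unsigned num = slab_bytes / size;

		if (!num)
			continue;	/* object does not fit yet */

		*objs = num;
		*left = slab_bytes - num * size;

		if (order >= BREAK_ORDER_SK)
			break;		/* avoid higher-order pages */
		if (*left * 8 <= slab_bytes)
			break;		/* fragmentation is acceptable */
	}
	return order;
}

int main(void)
{
	unsigned objs;
	size_t left;
	unsigned order = pick_order(1500, &objs, &left);

	printf("order %u: %u objects, %zu bytes left over\n",
	       order, objs, left);
	return 0;
}
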
@@ -1519,14 +1582,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1519 * Sanity checks... these are all serious usage bugs. 1582 * Sanity checks... these are all serious usage bugs.
1520 */ 1583 */
1521 if ((!name) || 1584 if ((!name) ||
1522 in_interrupt() || 1585 in_interrupt() ||
1523 (size < BYTES_PER_WORD) || 1586 (size < BYTES_PER_WORD) ||
1524 (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) || 1587 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1525 (dtor && !ctor)) { 1588 printk(KERN_ERR "%s: Early error in slab %s\n",
1526 printk(KERN_ERR "%s: Early error in slab %s\n", 1589 __FUNCTION__, name);
1527 __FUNCTION__, name); 1590 BUG();
1528 BUG(); 1591 }
1529 }
1530 1592
1531 down(&cache_chain_sem); 1593 down(&cache_chain_sem);
1532 1594
@@ -1546,11 +1608,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1546 set_fs(old_fs); 1608 set_fs(old_fs);
1547 if (res) { 1609 if (res) {
1548 printk("SLAB: cache with size %d has lost its name\n", 1610 printk("SLAB: cache with size %d has lost its name\n",
1549 pc->objsize); 1611 pc->objsize);
1550 continue; 1612 continue;
1551 } 1613 }
1552 1614
1553 if (!strcmp(pc->name,name)) { 1615 if (!strcmp(pc->name, name)) {
1554 printk("kmem_cache_create: duplicate cache %s\n", name); 1616 printk("kmem_cache_create: duplicate cache %s\n", name);
1555 dump_stack(); 1617 dump_stack();
1556 goto oops; 1618 goto oops;
@@ -1562,10 +1624,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1562 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { 1624 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
1563		/* No constructor, but initial state check requested */	1625		/* No constructor, but initial state check requested */
1564 printk(KERN_ERR "%s: No con, but init state check " 1626 printk(KERN_ERR "%s: No con, but init state check "
1565 "requested - %s\n", __FUNCTION__, name); 1627 "requested - %s\n", __FUNCTION__, name);
1566 flags &= ~SLAB_DEBUG_INITIAL; 1628 flags &= ~SLAB_DEBUG_INITIAL;
1567 } 1629 }
1568
1569#if FORCED_DEBUG 1630#if FORCED_DEBUG
1570 /* 1631 /*
1571 * Enable redzoning and last user accounting, except for caches with 1632 * Enable redzoning and last user accounting, except for caches with
@@ -1573,8 +1634,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1573 * above the next power of two: caches with object sizes just above a 1634 * above the next power of two: caches with object sizes just above a
1574 * power of two have a significant amount of internal fragmentation. 1635 * power of two have a significant amount of internal fragmentation.
1575 */ 1636 */
1576 if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) 1637 if ((size < 4096
1577 flags |= SLAB_RED_ZONE|SLAB_STORE_USER; 1638 || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
1639 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1578 if (!(flags & SLAB_DESTROY_BY_RCU)) 1640 if (!(flags & SLAB_DESTROY_BY_RCU))
1579 flags |= SLAB_POISON; 1641 flags |= SLAB_POISON;
1580#endif 1642#endif
@@ -1595,9 +1657,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1595 * unaligned accesses for some archs when redzoning is used, and makes 1657 * unaligned accesses for some archs when redzoning is used, and makes
1596 * sure any on-slab bufctl's are also correctly aligned. 1658 * sure any on-slab bufctl's are also correctly aligned.
1597 */ 1659 */
1598 if (size & (BYTES_PER_WORD-1)) { 1660 if (size & (BYTES_PER_WORD - 1)) {
1599 size += (BYTES_PER_WORD-1); 1661 size += (BYTES_PER_WORD - 1);
1600 size &= ~(BYTES_PER_WORD-1); 1662 size &= ~(BYTES_PER_WORD - 1);
1601 } 1663 }
1602 1664
1603 /* calculate out the final buffer alignment: */ 1665 /* calculate out the final buffer alignment: */
@@ -1608,7 +1670,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1608 * objects into one cacheline. 1670 * objects into one cacheline.
1609 */ 1671 */
1610 ralign = cache_line_size(); 1672 ralign = cache_line_size();
1611 while (size <= ralign/2) 1673 while (size <= ralign / 2)
1612 ralign /= 2; 1674 ralign /= 2;
1613 } else { 1675 } else {
1614 ralign = BYTES_PER_WORD; 1676 ralign = BYTES_PER_WORD;
@@ -1617,13 +1679,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1617 if (ralign < ARCH_SLAB_MINALIGN) { 1679 if (ralign < ARCH_SLAB_MINALIGN) {
1618 ralign = ARCH_SLAB_MINALIGN; 1680 ralign = ARCH_SLAB_MINALIGN;
1619 if (ralign > BYTES_PER_WORD) 1681 if (ralign > BYTES_PER_WORD)
1620 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); 1682 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1621 } 1683 }
1622 /* 3) caller mandated alignment: disables debug if necessary */ 1684 /* 3) caller mandated alignment: disables debug if necessary */
1623 if (ralign < align) { 1685 if (ralign < align) {
1624 ralign = align; 1686 ralign = align;
1625 if (ralign > BYTES_PER_WORD) 1687 if (ralign > BYTES_PER_WORD)
1626 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); 1688 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1627 } 1689 }
1628 /* 4) Store it. Note that the debug code below can reduce 1690 /* 4) Store it. Note that the debug code below can reduce
1629 * the alignment to BYTES_PER_WORD. 1691 * the alignment to BYTES_PER_WORD.
@@ -1645,7 +1707,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1645 1707
1646 /* add space for red zone words */ 1708 /* add space for red zone words */
1647 cachep->dbghead += BYTES_PER_WORD; 1709 cachep->dbghead += BYTES_PER_WORD;
1648 size += 2*BYTES_PER_WORD; 1710 size += 2 * BYTES_PER_WORD;
1649 } 1711 }
1650 if (flags & SLAB_STORE_USER) { 1712 if (flags & SLAB_STORE_USER) {
1651 /* user store requires word alignment and 1713 /* user store requires word alignment and
@@ -1656,7 +1718,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1656 size += BYTES_PER_WORD; 1718 size += BYTES_PER_WORD;
1657 } 1719 }
1658#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 1720#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
1659 if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { 1721 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
1722 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
1660 cachep->dbghead += PAGE_SIZE - size; 1723 cachep->dbghead += PAGE_SIZE - size;
1661 size = PAGE_SIZE; 1724 size = PAGE_SIZE;
1662 } 1725 }
@@ -1664,7 +1727,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1664#endif 1727#endif
1665 1728
1666 /* Determine if the slab management is 'on' or 'off' slab. */ 1729 /* Determine if the slab management is 'on' or 'off' slab. */
1667 if (size >= (PAGE_SIZE>>3)) 1730 if (size >= (PAGE_SIZE >> 3))
1668 /* 1731 /*
1669 * Size is large, assume best to place the slab management obj 1732 * Size is large, assume best to place the slab management obj
1670 * off-slab (should allow better packing of objs). 1733 * off-slab (should allow better packing of objs).
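
The hunk above keeps the rule that objects of at least PAGE_SIZE/8 get their slab management structure allocated off-slab, so the payload pages pack better. A one-function sketch of that threshold; PAGE_SIZE_SK is an assumed 4096-byte page.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE_SK 4096UL

/* Large objects get their struct slab + bufctl array kept in a
 * separate allocation instead of the slab's own pages. */
static int mgmt_off_slab(size_t objsize)
{
	return objsize >= (PAGE_SIZE_SK >> 3);	/* >= 512 bytes here */
}

int main(void)
{
	printf("256 bytes: %s, 1024 bytes: %s\n",
	       mgmt_off_slab(256) ? "off-slab" : "on-slab",
	       mgmt_off_slab(1024) ? "off-slab" : "on-slab");
	return 0;
}
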
@@ -1681,47 +1744,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1681 */ 1744 */
1682 cachep->gfporder = 0; 1745 cachep->gfporder = 0;
1683 cache_estimate(cachep->gfporder, size, align, flags, 1746 cache_estimate(cachep->gfporder, size, align, flags,
1684 &left_over, &cachep->num); 1747 &left_over, &cachep->num);
1685 } else { 1748 } else
1686 /* 1749 left_over = calculate_slab_order(cachep, size, align, flags);
1687 * Calculate size (in pages) of slabs, and the num of objs per
1688 * slab. This could be made much more intelligent. For now,
1689 * try to avoid using high page-orders for slabs. When the
1690 * gfp() funcs are more friendly towards high-order requests,
1691 * this should be changed.
1692 */
1693 do {
1694 unsigned int break_flag = 0;
1695cal_wastage:
1696 cache_estimate(cachep->gfporder, size, align, flags,
1697 &left_over, &cachep->num);
1698 if (break_flag)
1699 break;
1700 if (cachep->gfporder >= MAX_GFP_ORDER)
1701 break;
1702 if (!cachep->num)
1703 goto next;
1704 if (flags & CFLGS_OFF_SLAB &&
1705 cachep->num > offslab_limit) {
1706 /* This num of objs will cause problems. */
1707 cachep->gfporder--;
1708 break_flag++;
1709 goto cal_wastage;
1710 }
1711
1712 /*
1713 * Large num of objs is good, but v. large slabs are
1714 * currently bad for the gfp()s.
1715 */
1716 if (cachep->gfporder >= slab_break_gfp_order)
1717 break;
1718
1719 if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
1720 break; /* Acceptable internal fragmentation. */
1721next:
1722 cachep->gfporder++;
1723 } while (1);
1724 }
1725 1750
1726 if (!cachep->num) { 1751 if (!cachep->num) {
1727 printk("kmem_cache_create: couldn't create cache %s.\n", name); 1752 printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -1729,8 +1754,8 @@ next:
1729 cachep = NULL; 1754 cachep = NULL;
1730 goto oops; 1755 goto oops;
1731 } 1756 }
1732 slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) 1757 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
1733 + sizeof(struct slab), align); 1758 + sizeof(struct slab), align);
1734 1759
1735 /* 1760 /*
1736 * If the slab has been placed off-slab, and we have enough space then 1761 * If the slab has been placed off-slab, and we have enough space then
@@ -1743,14 +1768,15 @@ next:
1743 1768
1744 if (flags & CFLGS_OFF_SLAB) { 1769 if (flags & CFLGS_OFF_SLAB) {
1745 /* really off slab. No need for manual alignment */ 1770 /* really off slab. No need for manual alignment */
1746 slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); 1771 slab_size =
1772 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
1747 } 1773 }
1748 1774
1749 cachep->colour_off = cache_line_size(); 1775 cachep->colour_off = cache_line_size();
1750 /* Offset must be a multiple of the alignment. */ 1776 /* Offset must be a multiple of the alignment. */
1751 if (cachep->colour_off < align) 1777 if (cachep->colour_off < align)
1752 cachep->colour_off = align; 1778 cachep->colour_off = align;
1753 cachep->colour = left_over/cachep->colour_off; 1779 cachep->colour = left_over / cachep->colour_off;
1754 cachep->slab_size = slab_size; 1780 cachep->slab_size = slab_size;
1755 cachep->flags = flags; 1781 cachep->flags = flags;
1756 cachep->gfpflags = 0; 1782 cachep->gfpflags = 0;
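
The hunk above derives the colouring range: left_over / colour_off distinct starting offsets, one colour unit per cache line. A user-space sketch of how successive slabs cycle through those offsets so their objects land on different cache lines; the leftover, the 64-byte line size, and the cycling loop are assumptions for illustration, not taken from this hunk.

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	size_t left_over = 200;	/* assumed leftover from cache_estimate */
	size_t colour_off = 64;	/* assumed cache_line_size() */
	size_t colour = left_over / colour_off;	/* distinct offsets */
	size_t colour_next = 0;
	int slab;

	for (slab = 0; slab < 6; slab++) {
		size_t offset = colour_next * colour_off;

		printf("slab %d starts at colour offset %zu\n", slab, offset);
		if (++colour_next >= colour)
			colour_next = 0;	/* wrap and reuse offsets */
	}
	return 0;
}
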
@@ -1777,7 +1803,7 @@ next:
1777 * the creation of further caches will BUG(). 1803 * the creation of further caches will BUG().
1778 */ 1804 */
1779 cachep->array[smp_processor_id()] = 1805 cachep->array[smp_processor_id()] =
1780 &initarray_generic.cache; 1806 &initarray_generic.cache;
1781 1807
1782 /* If the cache that's used by 1808 /* If the cache that's used by
1783 * kmalloc(sizeof(kmem_list3)) is the first cache, 1809 * kmalloc(sizeof(kmem_list3)) is the first cache,
@@ -1791,8 +1817,7 @@ next:
1791 g_cpucache_up = PARTIAL_AC; 1817 g_cpucache_up = PARTIAL_AC;
1792 } else { 1818 } else {
1793 cachep->array[smp_processor_id()] = 1819 cachep->array[smp_processor_id()] =
1794 kmalloc(sizeof(struct arraycache_init), 1820 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1795 GFP_KERNEL);
1796 1821
1797 if (g_cpucache_up == PARTIAL_AC) { 1822 if (g_cpucache_up == PARTIAL_AC) {
1798 set_up_list3s(cachep, SIZE_L3); 1823 set_up_list3s(cachep, SIZE_L3);
@@ -1802,16 +1827,18 @@ next:
1802 for_each_online_node(node) { 1827 for_each_online_node(node) {
1803 1828
1804 cachep->nodelists[node] = 1829 cachep->nodelists[node] =
1805 kmalloc_node(sizeof(struct kmem_list3), 1830 kmalloc_node(sizeof
1806 GFP_KERNEL, node); 1831 (struct kmem_list3),
1832 GFP_KERNEL, node);
1807 BUG_ON(!cachep->nodelists[node]); 1833 BUG_ON(!cachep->nodelists[node]);
1808 kmem_list3_init(cachep->nodelists[node]); 1834 kmem_list3_init(cachep->
1835 nodelists[node]);
1809 } 1836 }
1810 } 1837 }
1811 } 1838 }
1812 cachep->nodelists[numa_node_id()]->next_reap = 1839 cachep->nodelists[numa_node_id()]->next_reap =
1813 jiffies + REAPTIMEOUT_LIST3 + 1840 jiffies + REAPTIMEOUT_LIST3 +
1814 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 1841 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1815 1842
1816 BUG_ON(!ac_data(cachep)); 1843 BUG_ON(!ac_data(cachep));
1817 ac_data(cachep)->avail = 0; 1844 ac_data(cachep)->avail = 0;
@@ -1820,15 +1847,15 @@ next:
1820 ac_data(cachep)->touched = 0; 1847 ac_data(cachep)->touched = 0;
1821 cachep->batchcount = 1; 1848 cachep->batchcount = 1;
1822 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1849 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1823 } 1850 }
1824 1851
1825 /* cache setup completed, link it into the list */ 1852 /* cache setup completed, link it into the list */
1826 list_add(&cachep->next, &cache_chain); 1853 list_add(&cachep->next, &cache_chain);
1827 unlock_cpu_hotplug(); 1854 unlock_cpu_hotplug();
1828oops: 1855 oops:
1829 if (!cachep && (flags & SLAB_PANIC)) 1856 if (!cachep && (flags & SLAB_PANIC))
1830 panic("kmem_cache_create(): failed to create slab `%s'\n", 1857 panic("kmem_cache_create(): failed to create slab `%s'\n",
1831 name); 1858 name);
1832 up(&cache_chain_sem); 1859 up(&cache_chain_sem);
1833 return cachep; 1860 return cachep;
1834} 1861}
@@ -1871,7 +1898,7 @@ static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
1871/* 1898/*
1872 * Waits for all CPUs to execute func(). 1899 * Waits for all CPUs to execute func().
1873 */ 1900 */
1874static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) 1901static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
1875{ 1902{
1876 check_irq_on(); 1903 check_irq_on();
1877 preempt_disable(); 1904 preempt_disable();
@@ -1886,12 +1913,12 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
1886 preempt_enable(); 1913 preempt_enable();
1887} 1914}
1888 1915
1889static void drain_array_locked(kmem_cache_t* cachep, 1916static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
1890 struct array_cache *ac, int force, int node); 1917 int force, int node);
1891 1918
1892static void do_drain(void *arg) 1919static void do_drain(void *arg)
1893{ 1920{
1894 kmem_cache_t *cachep = (kmem_cache_t*)arg; 1921 kmem_cache_t *cachep = (kmem_cache_t *) arg;
1895 struct array_cache *ac; 1922 struct array_cache *ac;
1896 int node = numa_node_id(); 1923 int node = numa_node_id();
1897 1924
@@ -1911,7 +1938,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep)
1911 smp_call_function_all_cpus(do_drain, cachep); 1938 smp_call_function_all_cpus(do_drain, cachep);
1912 check_irq_on(); 1939 check_irq_on();
1913 spin_lock_irq(&cachep->spinlock); 1940 spin_lock_irq(&cachep->spinlock);
1914 for_each_online_node(node) { 1941 for_each_online_node(node) {
1915 l3 = cachep->nodelists[node]; 1942 l3 = cachep->nodelists[node];
1916 if (l3) { 1943 if (l3) {
1917 spin_lock(&l3->list_lock); 1944 spin_lock(&l3->list_lock);
@@ -1949,8 +1976,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node)
1949 slab_destroy(cachep, slabp); 1976 slab_destroy(cachep, slabp);
1950 spin_lock_irq(&l3->list_lock); 1977 spin_lock_irq(&l3->list_lock);
1951 } 1978 }
1952 ret = !list_empty(&l3->slabs_full) || 1979 ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial);
1953 !list_empty(&l3->slabs_partial);
1954 return ret; 1980 return ret;
1955} 1981}
1956 1982
@@ -2006,7 +2032,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2006 * The caller must guarantee that no one will allocate memory from the cache	2032 * The caller must guarantee that no one will allocate memory from the cache
2007 * during the kmem_cache_destroy(). 2033 * during the kmem_cache_destroy().
2008 */ 2034 */
2009int kmem_cache_destroy(kmem_cache_t * cachep) 2035int kmem_cache_destroy(kmem_cache_t *cachep)
2010{ 2036{
2011 int i; 2037 int i;
2012 struct kmem_list3 *l3; 2038 struct kmem_list3 *l3;
@@ -2028,7 +2054,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2028 if (__cache_shrink(cachep)) { 2054 if (__cache_shrink(cachep)) {
2029 slab_error(cachep, "Can't free all objects"); 2055 slab_error(cachep, "Can't free all objects");
2030 down(&cache_chain_sem); 2056 down(&cache_chain_sem);
2031 list_add(&cachep->next,&cache_chain); 2057 list_add(&cachep->next, &cache_chain);
2032 up(&cache_chain_sem); 2058 up(&cache_chain_sem);
2033 unlock_cpu_hotplug(); 2059 unlock_cpu_hotplug();
2034 return 1; 2060 return 1;
@@ -2038,7 +2064,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2038 synchronize_rcu(); 2064 synchronize_rcu();
2039 2065
2040 for_each_online_cpu(i) 2066 for_each_online_cpu(i)
2041 kfree(cachep->array[i]); 2067 kfree(cachep->array[i]);
2042 2068
2043 /* NUMA: free the list3 structures */ 2069 /* NUMA: free the list3 structures */
2044 for_each_online_node(i) { 2070 for_each_online_node(i) {
@@ -2057,39 +2083,39 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2057EXPORT_SYMBOL(kmem_cache_destroy); 2083EXPORT_SYMBOL(kmem_cache_destroy);
2058 2084
2059/* Get the memory for a slab management obj. */ 2085/* Get the memory for a slab management obj. */
2060static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp, 2086static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
2061 int colour_off, gfp_t local_flags) 2087 int colour_off, gfp_t local_flags)
2062{ 2088{
2063 struct slab *slabp; 2089 struct slab *slabp;
2064 2090
2065 if (OFF_SLAB(cachep)) { 2091 if (OFF_SLAB(cachep)) {
2066 /* Slab management obj is off-slab. */ 2092 /* Slab management obj is off-slab. */
2067 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); 2093 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
2068 if (!slabp) 2094 if (!slabp)
2069 return NULL; 2095 return NULL;
2070 } else { 2096 } else {
2071 slabp = objp+colour_off; 2097 slabp = objp + colour_off;
2072 colour_off += cachep->slab_size; 2098 colour_off += cachep->slab_size;
2073 } 2099 }
2074 slabp->inuse = 0; 2100 slabp->inuse = 0;
2075 slabp->colouroff = colour_off; 2101 slabp->colouroff = colour_off;
2076 slabp->s_mem = objp+colour_off; 2102 slabp->s_mem = objp + colour_off;
2077 2103
2078 return slabp; 2104 return slabp;
2079} 2105}
2080 2106
2081static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2107static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2082{ 2108{
2083 return (kmem_bufctl_t *)(slabp+1); 2109 return (kmem_bufctl_t *) (slabp + 1);
2084} 2110}
2085 2111
2086static void cache_init_objs(kmem_cache_t *cachep, 2112static void cache_init_objs(kmem_cache_t *cachep,
2087 struct slab *slabp, unsigned long ctor_flags) 2113 struct slab *slabp, unsigned long ctor_flags)
2088{ 2114{
2089 int i; 2115 int i;
2090 2116
2091 for (i = 0; i < cachep->num; i++) { 2117 for (i = 0; i < cachep->num; i++) {
2092 void *objp = slabp->s_mem+cachep->objsize*i; 2118 void *objp = slabp->s_mem + cachep->objsize * i;
2093#if DEBUG 2119#if DEBUG
2094 /* need to poison the objs? */ 2120 /* need to poison the objs? */
2095 if (cachep->flags & SLAB_POISON) 2121 if (cachep->flags & SLAB_POISON)
@@ -2107,25 +2133,28 @@ static void cache_init_objs(kmem_cache_t *cachep,
2107 * Otherwise, deadlock. They must also be threaded. 2133 * Otherwise, deadlock. They must also be threaded.
2108 */ 2134 */
2109 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2135 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2110 cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); 2136 cachep->ctor(objp + obj_dbghead(cachep), cachep,
2137 ctor_flags);
2111 2138
2112 if (cachep->flags & SLAB_RED_ZONE) { 2139 if (cachep->flags & SLAB_RED_ZONE) {
2113 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2140 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2114 slab_error(cachep, "constructor overwrote the" 2141 slab_error(cachep, "constructor overwrote the"
2115 " end of an object"); 2142 " end of an object");
2116 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2143 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2117 slab_error(cachep, "constructor overwrote the" 2144 slab_error(cachep, "constructor overwrote the"
2118 " start of an object"); 2145 " start of an object");
2119 } 2146 }
2120 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2147 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
2121 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2148 && cachep->flags & SLAB_POISON)
2149 kernel_map_pages(virt_to_page(objp),
2150 cachep->objsize / PAGE_SIZE, 0);
2122#else 2151#else
2123 if (cachep->ctor) 2152 if (cachep->ctor)
2124 cachep->ctor(objp, cachep, ctor_flags); 2153 cachep->ctor(objp, cachep, ctor_flags);
2125#endif 2154#endif
2126 slab_bufctl(slabp)[i] = i+1; 2155 slab_bufctl(slabp)[i] = i + 1;
2127 } 2156 }
2128 slab_bufctl(slabp)[i-1] = BUFCTL_END; 2157 slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2129 slabp->free = 0; 2158 slabp->free = 0;
2130} 2159}
2131 2160
@@ -2161,17 +2190,17 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
2161 */ 2190 */
2162static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2191static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2163{ 2192{
2164 struct slab *slabp; 2193 struct slab *slabp;
2165 void *objp; 2194 void *objp;
2166 size_t offset; 2195 size_t offset;
2167 gfp_t local_flags; 2196 gfp_t local_flags;
2168 unsigned long ctor_flags; 2197 unsigned long ctor_flags;
2169 struct kmem_list3 *l3; 2198 struct kmem_list3 *l3;
2170 2199
2171 /* Be lazy and only check for valid flags here, 2200 /* Be lazy and only check for valid flags here,
2172 * keeping it out of the critical path in kmem_cache_alloc(). 2201 * keeping it out of the critical path in kmem_cache_alloc().
2173 */ 2202 */
2174 if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) 2203 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
2175 BUG(); 2204 BUG();
2176 if (flags & SLAB_NO_GROW) 2205 if (flags & SLAB_NO_GROW)
2177 return 0; 2206 return 0;
@@ -2237,9 +2266,9 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2237 l3->free_objects += cachep->num; 2266 l3->free_objects += cachep->num;
2238 spin_unlock(&l3->list_lock); 2267 spin_unlock(&l3->list_lock);
2239 return 1; 2268 return 1;
2240opps1: 2269 opps1:
2241 kmem_freepages(cachep, objp); 2270 kmem_freepages(cachep, objp);
2242failed: 2271 failed:
2243 if (local_flags & __GFP_WAIT) 2272 if (local_flags & __GFP_WAIT)
2244 local_irq_disable(); 2273 local_irq_disable();
2245 return 0; 2274 return 0;
@@ -2259,18 +2288,19 @@ static void kfree_debugcheck(const void *objp)
2259 2288
2260 if (!virt_addr_valid(objp)) { 2289 if (!virt_addr_valid(objp)) {
2261 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2290 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2262 (unsigned long)objp); 2291 (unsigned long)objp);
2263 BUG(); 2292 BUG();
2264 } 2293 }
2265 page = virt_to_page(objp); 2294 page = virt_to_page(objp);
2266 if (!PageSlab(page)) { 2295 if (!PageSlab(page)) {
2267 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); 2296 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2297 (unsigned long)objp);
2268 BUG(); 2298 BUG();
2269 } 2299 }
2270} 2300}
2271 2301
2272static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, 2302static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2273 void *caller) 2303 void *caller)
2274{ 2304{
2275 struct page *page; 2305 struct page *page;
2276 unsigned int objnr; 2306 unsigned int objnr;
@@ -2281,20 +2311,26 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2281 page = virt_to_page(objp); 2311 page = virt_to_page(objp);
2282 2312
2283 if (page_get_cache(page) != cachep) { 2313 if (page_get_cache(page) != cachep) {
2284 printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2314 printk(KERN_ERR
2285 page_get_cache(page),cachep); 2315 "mismatch in kmem_cache_free: expected cache %p, got %p\n",
2316 page_get_cache(page), cachep);
2286 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2317 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2287 printk(KERN_ERR "%p is %s.\n", page_get_cache(page), page_get_cache(page)->name); 2318 printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
2319 page_get_cache(page)->name);
2288 WARN_ON(1); 2320 WARN_ON(1);
2289 } 2321 }
2290 slabp = page_get_slab(page); 2322 slabp = page_get_slab(page);
2291 2323
2292 if (cachep->flags & SLAB_RED_ZONE) { 2324 if (cachep->flags & SLAB_RED_ZONE) {
2293 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2325 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
2294 slab_error(cachep, "double free, or memory outside" 2326 || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2295 " object was overwritten"); 2327 slab_error(cachep,
2296 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2328 "double free, or memory outside"
2297 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2329 " object was overwritten");
2330 printk(KERN_ERR
2331 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2332 objp, *dbg_redzone1(cachep, objp),
2333 *dbg_redzone2(cachep, objp));
2298 } 2334 }
2299 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2335 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2300 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2336 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
@@ -2302,30 +2338,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2302 if (cachep->flags & SLAB_STORE_USER) 2338 if (cachep->flags & SLAB_STORE_USER)
2303 *dbg_userword(cachep, objp) = caller; 2339 *dbg_userword(cachep, objp) = caller;
2304 2340
2305 objnr = (objp-slabp->s_mem)/cachep->objsize; 2341 objnr = (objp - slabp->s_mem) / cachep->objsize;
2306 2342
2307 BUG_ON(objnr >= cachep->num); 2343 BUG_ON(objnr >= cachep->num);
2308 BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); 2344 BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize);
2309 2345
2310 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2346 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2311 /* Need to call the slab's constructor so the 2347 /* Need to call the slab's constructor so the
2312 * caller can perform a verify of its state (debugging). 2348 * caller can perform a verify of its state (debugging).
2313 * Called without the cache-lock held. 2349 * Called without the cache-lock held.
2314 */ 2350 */
2315 cachep->ctor(objp+obj_dbghead(cachep), 2351 cachep->ctor(objp + obj_dbghead(cachep),
2316 cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); 2352 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2317 } 2353 }
2318 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2354 if (cachep->flags & SLAB_POISON && cachep->dtor) {
2319 /* we want to cache poison the object, 2355 /* we want to cache poison the object,
2320 * call the destruction callback 2356 * call the destruction callback
2321 */ 2357 */
2322 cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); 2358 cachep->dtor(objp + obj_dbghead(cachep), cachep, 0);
2323 } 2359 }
2324 if (cachep->flags & SLAB_POISON) { 2360 if (cachep->flags & SLAB_POISON) {
2325#ifdef CONFIG_DEBUG_PAGEALLOC 2361#ifdef CONFIG_DEBUG_PAGEALLOC
2326 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2362 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
2327 store_stackinfo(cachep, objp, (unsigned long)caller); 2363 store_stackinfo(cachep, objp, (unsigned long)caller);
2328 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2364 kernel_map_pages(virt_to_page(objp),
2365 cachep->objsize / PAGE_SIZE, 0);
2329 } else { 2366 } else {
2330 poison_obj(cachep, objp, POISON_FREE); 2367 poison_obj(cachep, objp, POISON_FREE);
2331 } 2368 }
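The SLAB_POISON branch above fills a freed object with a known byte pattern (poison_obj(..., POISON_FREE)) so that the matching check on the next allocation (check_poison_obj(), visible in a later hunk) can detect writes to freed memory. A standalone sketch of that idea follows; the poison byte and helper names here are illustrative stand-ins, not the kernel's definitions.

#include <stdio.h>
#include <string.h>

#define POISON_BYTE 0x6b	/* illustrative; the kernel defines POISON_FREE */

/* Fill a dead object with a known pattern, as the free path does. */
static void poison_demo(void *obj, size_t size)
{
	memset(obj, POISON_BYTE, size);
}

/* On the next allocation, any changed byte means a write after free. */
static int check_poison_demo(const void *obj, size_t size)
{
	const unsigned char *p = obj;
	size_t i;

	for (i = 0; i < size; i++)
		if (p[i] != POISON_BYTE)
			return -1;	/* corruption detected */
	return 0;
}

int main(void)
{
	char obj[32];

	poison_demo(obj, sizeof(obj));
	obj[5] = 0;			/* simulate a use after free */
	printf("poison intact: %s\n",
	       check_poison_demo(obj, sizeof(obj)) ? "no" : "yes");
	return 0;
}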
@@ -2340,7 +2377,7 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2340{ 2377{
2341 kmem_bufctl_t i; 2378 kmem_bufctl_t i;
2342 int entries = 0; 2379 int entries = 0;
2343 2380
2344 /* Check slab's freelist to see if this obj is there. */ 2381 /* Check slab's freelist to see if this obj is there. */
2345 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2382 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2346 entries++; 2383 entries++;
@@ -2348,13 +2385,16 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2348 goto bad; 2385 goto bad;
2349 } 2386 }
2350 if (entries != cachep->num - slabp->inuse) { 2387 if (entries != cachep->num - slabp->inuse) {
2351bad: 2388 bad:
2352 printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2389 printk(KERN_ERR
2353 cachep->name, cachep->num, slabp, slabp->inuse); 2390 "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2354 for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) { 2391 cachep->name, cachep->num, slabp, slabp->inuse);
2355 if ((i%16)==0) 2392 for (i = 0;
2393 i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
2394 i++) {
2395 if ((i % 16) == 0)
2356 printk("\n%03x:", i); 2396 printk("\n%03x:", i);
2357 printk(" %02x", ((unsigned char*)slabp)[i]); 2397 printk(" %02x", ((unsigned char *)slabp)[i]);
2358 } 2398 }
2359 printk("\n"); 2399 printk("\n");
2360 BUG(); 2400 BUG();
@@ -2374,7 +2414,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2374 2414
2375 check_irq_off(); 2415 check_irq_off();
2376 ac = ac_data(cachep); 2416 ac = ac_data(cachep);
2377retry: 2417 retry:
2378 batchcount = ac->batchcount; 2418 batchcount = ac->batchcount;
2379 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2419 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2380 /* if there was little recent activity on this 2420 /* if there was little recent activity on this
@@ -2396,8 +2436,8 @@ retry:
2396 shared_array->avail -= batchcount; 2436 shared_array->avail -= batchcount;
2397 ac->avail = batchcount; 2437 ac->avail = batchcount;
2398 memcpy(ac->entry, 2438 memcpy(ac->entry,
2399 &(shared_array->entry[shared_array->avail]), 2439 &(shared_array->entry[shared_array->avail]),
2400 sizeof(void*)*batchcount); 2440 sizeof(void *) * batchcount);
2401 shared_array->touched = 1; 2441 shared_array->touched = 1;
2402 goto alloc_done; 2442 goto alloc_done;
2403 } 2443 }
@@ -2425,7 +2465,7 @@ retry:
2425 2465
2426 /* get obj pointer */ 2466 /* get obj pointer */
2427 ac->entry[ac->avail++] = slabp->s_mem + 2467 ac->entry[ac->avail++] = slabp->s_mem +
2428 slabp->free*cachep->objsize; 2468 slabp->free * cachep->objsize;
2429 2469
2430 slabp->inuse++; 2470 slabp->inuse++;
2431 next = slab_bufctl(slabp)[slabp->free]; 2471 next = slab_bufctl(slabp)[slabp->free];
@@ -2433,7 +2473,7 @@ retry:
2433 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2473 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2434 WARN_ON(numa_node_id() != slabp->nodeid); 2474 WARN_ON(numa_node_id() != slabp->nodeid);
2435#endif 2475#endif
2436 slabp->free = next; 2476 slabp->free = next;
2437 } 2477 }
2438 check_slabp(cachep, slabp); 2478 check_slabp(cachep, slabp);
2439 2479
@@ -2445,9 +2485,9 @@ retry:
2445 list_add(&slabp->list, &l3->slabs_partial); 2485 list_add(&slabp->list, &l3->slabs_partial);
2446 } 2486 }
2447 2487
2448must_grow: 2488 must_grow:
2449 l3->free_objects -= ac->avail; 2489 l3->free_objects -= ac->avail;
2450alloc_done: 2490 alloc_done:
2451 spin_unlock(&l3->list_lock); 2491 spin_unlock(&l3->list_lock);
2452 2492
2453 if (unlikely(!ac->avail)) { 2493 if (unlikely(!ac->avail)) {
@@ -2459,7 +2499,7 @@ alloc_done:
2459 if (!x && ac->avail == 0) // no objects in sight? abort 2499 if (!x && ac->avail == 0) // no objects in sight? abort
2460 return NULL; 2500 return NULL;
2461 2501
2462 if (!ac->avail) // objects refilled by interrupt? 2502 if (!ac->avail) // objects refilled by interrupt?
2463 goto retry; 2503 goto retry;
2464 } 2504 }
2465 ac->touched = 1; 2505 ac->touched = 1;
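The refill path above first tries to move a whole batch of object pointers from the node's shared array into the per-CPU array_cache with a single memcpy(), and only falls back to walking the slab lists when the shared array is empty. A small userspace model of that transfer, with the structures reduced to just what the copy needs:

#include <stdio.h>
#include <string.h>

struct array_cache {
	unsigned int avail;	/* number of cached object pointers */
	void *entry[16];
};

/* Move up to batchcount pointers from the shared array to the per-CPU one. */
static unsigned int refill_from_shared(struct array_cache *ac,
				       struct array_cache *shared,
				       unsigned int batchcount)
{
	if (batchcount > shared->avail)
		batchcount = shared->avail;
	if (!batchcount)
		return 0;
	shared->avail -= batchcount;
	memcpy(ac->entry, &shared->entry[shared->avail],
	       sizeof(void *) * batchcount);
	ac->avail = batchcount;
	return batchcount;
}

int main(void)
{
	static int objs[5];
	struct array_cache shared = { .avail = 5 }, ac = { .avail = 0 };
	unsigned int i;

	for (i = 0; i < 5; i++)
		shared.entry[i] = &objs[i];
	printf("moved %u, shared left %u\n",
	       refill_from_shared(&ac, &shared, 3), shared.avail);
	return 0;
}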
@@ -2476,16 +2516,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
2476} 2516}
2477 2517
2478#if DEBUG 2518#if DEBUG
2479static void * 2519static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
2480cache_alloc_debugcheck_after(kmem_cache_t *cachep, 2520 void *objp, void *caller)
2481 gfp_t flags, void *objp, void *caller)
2482{ 2521{
2483 if (!objp) 2522 if (!objp)
2484 return objp; 2523 return objp;
2485 if (cachep->flags & SLAB_POISON) { 2524 if (cachep->flags & SLAB_POISON) {
2486#ifdef CONFIG_DEBUG_PAGEALLOC 2525#ifdef CONFIG_DEBUG_PAGEALLOC
2487 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2526 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2488 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); 2527 kernel_map_pages(virt_to_page(objp),
2528 cachep->objsize / PAGE_SIZE, 1);
2489 else 2529 else
2490 check_poison_obj(cachep, objp); 2530 check_poison_obj(cachep, objp);
2491#else 2531#else
@@ -2497,24 +2537,28 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
2497 *dbg_userword(cachep, objp) = caller; 2537 *dbg_userword(cachep, objp) = caller;
2498 2538
2499 if (cachep->flags & SLAB_RED_ZONE) { 2539 if (cachep->flags & SLAB_RED_ZONE) {
2500 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2540 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
2501 slab_error(cachep, "double free, or memory outside" 2541 || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2502 " object was overwritten"); 2542 slab_error(cachep,
2503 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2543 "double free, or memory outside"
2504 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2544 " object was overwritten");
2545 printk(KERN_ERR
2546 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2547 objp, *dbg_redzone1(cachep, objp),
2548 *dbg_redzone2(cachep, objp));
2505 } 2549 }
2506 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2550 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2507 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2551 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2508 } 2552 }
2509 objp += obj_dbghead(cachep); 2553 objp += obj_dbghead(cachep);
2510 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2554 if (cachep->ctor && cachep->flags & SLAB_POISON) {
2511 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2555 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2512 2556
2513 if (!(flags & __GFP_WAIT)) 2557 if (!(flags & __GFP_WAIT))
2514 ctor_flags |= SLAB_CTOR_ATOMIC; 2558 ctor_flags |= SLAB_CTOR_ATOMIC;
2515 2559
2516 cachep->ctor(objp, cachep, ctor_flags); 2560 cachep->ctor(objp, cachep, ctor_flags);
2517 } 2561 }
2518 return objp; 2562 return objp;
2519} 2563}
2520#else 2564#else
@@ -2523,7 +2567,7 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
2523 2567
2524static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2568static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2525{ 2569{
2526 void* objp; 2570 void *objp;
2527 struct array_cache *ac; 2571 struct array_cache *ac;
2528 2572
2529 check_irq_off(); 2573 check_irq_off();
@@ -2542,7 +2586,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2542static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2586static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2543{ 2587{
2544 unsigned long save_flags; 2588 unsigned long save_flags;
2545 void* objp; 2589 void *objp;
2546 2590
2547 cache_alloc_debugcheck_before(cachep, flags); 2591 cache_alloc_debugcheck_before(cachep, flags);
2548 2592
@@ -2550,7 +2594,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2550 objp = ____cache_alloc(cachep, flags); 2594 objp = ____cache_alloc(cachep, flags);
2551 local_irq_restore(save_flags); 2595 local_irq_restore(save_flags);
2552 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2596 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2553 __builtin_return_address(0)); 2597 __builtin_return_address(0));
2554 prefetchw(objp); 2598 prefetchw(objp);
2555 return objp; 2599 return objp;
2556} 2600}
@@ -2562,74 +2606,75 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2562static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2606static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2563{ 2607{
2564 struct list_head *entry; 2608 struct list_head *entry;
2565 struct slab *slabp; 2609 struct slab *slabp;
2566 struct kmem_list3 *l3; 2610 struct kmem_list3 *l3;
2567 void *obj; 2611 void *obj;
2568 kmem_bufctl_t next; 2612 kmem_bufctl_t next;
2569 int x; 2613 int x;
2570 2614
2571 l3 = cachep->nodelists[nodeid]; 2615 l3 = cachep->nodelists[nodeid];
2572 BUG_ON(!l3); 2616 BUG_ON(!l3);
2573 2617
2574retry: 2618 retry:
2575 spin_lock(&l3->list_lock); 2619 spin_lock(&l3->list_lock);
2576 entry = l3->slabs_partial.next; 2620 entry = l3->slabs_partial.next;
2577 if (entry == &l3->slabs_partial) { 2621 if (entry == &l3->slabs_partial) {
2578 l3->free_touched = 1; 2622 l3->free_touched = 1;
2579 entry = l3->slabs_free.next; 2623 entry = l3->slabs_free.next;
2580 if (entry == &l3->slabs_free) 2624 if (entry == &l3->slabs_free)
2581 goto must_grow; 2625 goto must_grow;
2582 } 2626 }
2583 2627
2584 slabp = list_entry(entry, struct slab, list); 2628 slabp = list_entry(entry, struct slab, list);
2585 check_spinlock_acquired_node(cachep, nodeid); 2629 check_spinlock_acquired_node(cachep, nodeid);
2586 check_slabp(cachep, slabp); 2630 check_slabp(cachep, slabp);
2587 2631
2588 STATS_INC_NODEALLOCS(cachep); 2632 STATS_INC_NODEALLOCS(cachep);
2589 STATS_INC_ACTIVE(cachep); 2633 STATS_INC_ACTIVE(cachep);
2590 STATS_SET_HIGH(cachep); 2634 STATS_SET_HIGH(cachep);
2591 2635
2592 BUG_ON(slabp->inuse == cachep->num); 2636 BUG_ON(slabp->inuse == cachep->num);
2593 2637
2594 /* get obj pointer */ 2638 /* get obj pointer */
2595 obj = slabp->s_mem + slabp->free*cachep->objsize; 2639 obj = slabp->s_mem + slabp->free * cachep->objsize;
2596 slabp->inuse++; 2640 slabp->inuse++;
2597 next = slab_bufctl(slabp)[slabp->free]; 2641 next = slab_bufctl(slabp)[slabp->free];
2598#if DEBUG 2642#if DEBUG
2599 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2643 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2600#endif 2644#endif
2601 slabp->free = next; 2645 slabp->free = next;
2602 check_slabp(cachep, slabp); 2646 check_slabp(cachep, slabp);
2603 l3->free_objects--; 2647 l3->free_objects--;
2604 /* move slabp to correct slabp list: */ 2648 /* move slabp to correct slabp list: */
2605 list_del(&slabp->list); 2649 list_del(&slabp->list);
2606 2650
2607 if (slabp->free == BUFCTL_END) { 2651 if (slabp->free == BUFCTL_END) {
2608 list_add(&slabp->list, &l3->slabs_full); 2652 list_add(&slabp->list, &l3->slabs_full);
2609 } else { 2653 } else {
2610 list_add(&slabp->list, &l3->slabs_partial); 2654 list_add(&slabp->list, &l3->slabs_partial);
2611 } 2655 }
2612 2656
2613 spin_unlock(&l3->list_lock); 2657 spin_unlock(&l3->list_lock);
2614 goto done; 2658 goto done;
2615 2659
2616must_grow: 2660 must_grow:
2617 spin_unlock(&l3->list_lock); 2661 spin_unlock(&l3->list_lock);
2618 x = cache_grow(cachep, flags, nodeid); 2662 x = cache_grow(cachep, flags, nodeid);
2619 2663
2620 if (!x) 2664 if (!x)
2621 return NULL; 2665 return NULL;
2622 2666
2623 goto retry; 2667 goto retry;
2624done: 2668 done:
2625 return obj; 2669 return obj;
2626} 2670}
2627#endif 2671#endif
2628 2672
2629/* 2673/*
2630 * Caller needs to acquire correct kmem_list's list_lock 2674 * Caller needs to acquire correct kmem_list's list_lock
2631 */ 2675 */
2632static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node) 2676static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
2677 int node)
2633{ 2678{
2634 int i; 2679 int i;
2635 struct kmem_list3 *l3; 2680 struct kmem_list3 *l3;
@@ -2652,7 +2697,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
2652 2697
2653 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2698 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2654 printk(KERN_ERR "slab: double free detected in cache " 2699 printk(KERN_ERR "slab: double free detected in cache "
2655 "'%s', objp %p\n", cachep->name, objp); 2700 "'%s', objp %p\n", cachep->name, objp);
2656 BUG(); 2701 BUG();
2657 } 2702 }
2658#endif 2703#endif
@@ -2696,20 +2741,19 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2696 spin_lock(&l3->list_lock); 2741 spin_lock(&l3->list_lock);
2697 if (l3->shared) { 2742 if (l3->shared) {
2698 struct array_cache *shared_array = l3->shared; 2743 struct array_cache *shared_array = l3->shared;
2699 int max = shared_array->limit-shared_array->avail; 2744 int max = shared_array->limit - shared_array->avail;
2700 if (max) { 2745 if (max) {
2701 if (batchcount > max) 2746 if (batchcount > max)
2702 batchcount = max; 2747 batchcount = max;
2703 memcpy(&(shared_array->entry[shared_array->avail]), 2748 memcpy(&(shared_array->entry[shared_array->avail]),
2704 ac->entry, 2749 ac->entry, sizeof(void *) * batchcount);
2705 sizeof(void*)*batchcount);
2706 shared_array->avail += batchcount; 2750 shared_array->avail += batchcount;
2707 goto free_done; 2751 goto free_done;
2708 } 2752 }
2709 } 2753 }
2710 2754
2711 free_block(cachep, ac->entry, batchcount, node); 2755 free_block(cachep, ac->entry, batchcount, node);
2712free_done: 2756 free_done:
2713#if STATS 2757#if STATS
2714 { 2758 {
2715 int i = 0; 2759 int i = 0;
@@ -2731,10 +2775,9 @@ free_done:
2731 spin_unlock(&l3->list_lock); 2775 spin_unlock(&l3->list_lock);
2732 ac->avail -= batchcount; 2776 ac->avail -= batchcount;
2733 memmove(ac->entry, &(ac->entry[batchcount]), 2777 memmove(ac->entry, &(ac->entry[batchcount]),
2734 sizeof(void*)*ac->avail); 2778 sizeof(void *) * ac->avail);
2735} 2779}
2736 2780
2737
2738/* 2781/*
2739 * __cache_free 2782 * __cache_free
2740 * Release an obj back to its cache. If the obj has a constructed 2783 * Release an obj back to its cache. If the obj has a constructed
@@ -2759,7 +2802,8 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2759 if (unlikely(slabp->nodeid != numa_node_id())) { 2802 if (unlikely(slabp->nodeid != numa_node_id())) {
2760 struct array_cache *alien = NULL; 2803 struct array_cache *alien = NULL;
2761 int nodeid = slabp->nodeid; 2804 int nodeid = slabp->nodeid;
2762 struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()]; 2805 struct kmem_list3 *l3 =
2806 cachep->nodelists[numa_node_id()];
2763 2807
2764 STATS_INC_NODEFREES(cachep); 2808 STATS_INC_NODEFREES(cachep);
2765 if (l3->alien && l3->alien[nodeid]) { 2809 if (l3->alien && l3->alien[nodeid]) {
@@ -2767,15 +2811,15 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2767 spin_lock(&alien->lock); 2811 spin_lock(&alien->lock);
2768 if (unlikely(alien->avail == alien->limit)) 2812 if (unlikely(alien->avail == alien->limit))
2769 __drain_alien_cache(cachep, 2813 __drain_alien_cache(cachep,
2770 alien, nodeid); 2814 alien, nodeid);
2771 alien->entry[alien->avail++] = objp; 2815 alien->entry[alien->avail++] = objp;
2772 spin_unlock(&alien->lock); 2816 spin_unlock(&alien->lock);
2773 } else { 2817 } else {
2774 spin_lock(&(cachep->nodelists[nodeid])-> 2818 spin_lock(&(cachep->nodelists[nodeid])->
2775 list_lock); 2819 list_lock);
2776 free_block(cachep, &objp, 1, nodeid); 2820 free_block(cachep, &objp, 1, nodeid);
2777 spin_unlock(&(cachep->nodelists[nodeid])-> 2821 spin_unlock(&(cachep->nodelists[nodeid])->
2778 list_lock); 2822 list_lock);
2779 } 2823 }
2780 return; 2824 return;
2781 } 2825 }
@@ -2822,9 +2866,9 @@ EXPORT_SYMBOL(kmem_cache_alloc);
2822 */ 2866 */
2823int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 2867int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2824{ 2868{
2825 unsigned long addr = (unsigned long) ptr; 2869 unsigned long addr = (unsigned long)ptr;
2826 unsigned long min_addr = PAGE_OFFSET; 2870 unsigned long min_addr = PAGE_OFFSET;
2827 unsigned long align_mask = BYTES_PER_WORD-1; 2871 unsigned long align_mask = BYTES_PER_WORD - 1;
2828 unsigned long size = cachep->objsize; 2872 unsigned long size = cachep->objsize;
2829 struct page *page; 2873 struct page *page;
2830 2874
@@ -2844,7 +2888,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2844 if (unlikely(page_get_cache(page) != cachep)) 2888 if (unlikely(page_get_cache(page) != cachep))
2845 goto out; 2889 goto out;
2846 return 1; 2890 return 1;
2847out: 2891 out:
2848 return 0; 2892 return 0;
2849} 2893}
2850 2894
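kmem_ptr_validate() above applies cheap sanity checks to a caller-supplied pointer: a minimum address, word alignment via align_mask, and, in the following hunk, that the backing page's cache matches (the page_get_cache() comparison). A userspace stand-in for the same style of check against a fixed pool, purely illustrative:

#include <stdio.h>
#include <stdint.h>

#define OBJ_SIZE  32
#define POOL_OBJS 8

/* Aligned backing store standing in for a slab's objects. */
static long pool[POOL_OBJS][OBJ_SIZE / sizeof(long)];

/*
 * In range, word aligned, and with room for a whole object; this is a
 * userspace sketch, not the kernel's page and cache checks.
 */
static int ptr_validate(const void *ptr)
{
	uintptr_t addr = (uintptr_t)ptr;
	uintptr_t base = (uintptr_t)pool;
	uintptr_t align_mask = sizeof(void *) - 1;	/* like BYTES_PER_WORD - 1 */

	if (addr < base || addr + OBJ_SIZE > base + sizeof(pool))
		return 0;
	if (addr & align_mask)
		return 0;
	return 1;
}

int main(void)
{
	const void *good = &pool[2];
	const void *bad = (const char *)&pool[2] + 1;	/* misaligned */

	printf("%d %d\n", ptr_validate(good), ptr_validate(bad));	/* 1 0 */
	return 0;
}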
@@ -2871,8 +2915,10 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2871 2915
2872 if (unlikely(!cachep->nodelists[nodeid])) { 2916 if (unlikely(!cachep->nodelists[nodeid])) {
2873 /* Fall back to __cache_alloc if we run into trouble */ 2917 /* Fall back to __cache_alloc if we run into trouble */
2874 printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name); 2918 printk(KERN_WARNING
2875 return __cache_alloc(cachep,flags); 2919 "slab: not allocating in inactive node %d for cache %s\n",
2920 nodeid, cachep->name);
2921 return __cache_alloc(cachep, flags);
2876 } 2922 }
2877 2923
2878 cache_alloc_debugcheck_before(cachep, flags); 2924 cache_alloc_debugcheck_before(cachep, flags);
@@ -2882,7 +2928,9 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2882 else 2928 else
2883 ptr = __cache_alloc_node(cachep, flags, nodeid); 2929 ptr = __cache_alloc_node(cachep, flags, nodeid);
2884 local_irq_restore(save_flags); 2930 local_irq_restore(save_flags);
2885 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); 2931 ptr =
2932 cache_alloc_debugcheck_after(cachep, flags, ptr,
2933 __builtin_return_address(0));
2886 2934
2887 return ptr; 2935 return ptr;
2888} 2936}
@@ -2944,12 +2992,11 @@ EXPORT_SYMBOL(__kmalloc);
2944 * Objects should be dereferenced using the per_cpu_ptr macro only. 2992 * Objects should be dereferenced using the per_cpu_ptr macro only.
2945 * 2993 *
2946 * @size: how many bytes of memory are required. 2994 * @size: how many bytes of memory are required.
2947 * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
2948 */ 2995 */
2949void *__alloc_percpu(size_t size, size_t align) 2996void *__alloc_percpu(size_t size)
2950{ 2997{
2951 int i; 2998 int i;
2952 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); 2999 struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
2953 3000
2954 if (!pdata) 3001 if (!pdata)
2955 return NULL; 3002 return NULL;
@@ -2973,9 +3020,9 @@ void *__alloc_percpu(size_t size, size_t align)
2973 } 3020 }
2974 3021
2975 /* Catch derefs w/o wrappers */ 3022 /* Catch derefs w/o wrappers */
2976 return (void *) (~(unsigned long) pdata); 3023 return (void *)(~(unsigned long)pdata);
2977 3024
2978unwind_oom: 3025 unwind_oom:
2979 while (--i >= 0) { 3026 while (--i >= 0) {
2980 if (!cpu_possible(i)) 3027 if (!cpu_possible(i))
2981 continue; 3028 continue;
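The __alloc_percpu() change above drops the unused alignment argument; the helper still allocates one zeroed buffer per possible CPU inside a struct percpu_data and returns the bitwise complement of that pointer, so any dereference that bypasses the per_cpu_ptr() wrapper is caught immediately. A userspace sketch of the same pattern follows; NR_CPUS, the demo names, and malloc()/calloc() are stand-ins for the kernel allocators, not kernel API.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define NR_CPUS 4

struct percpu_data {
	void *ptrs[NR_CPUS];
};

/* One zeroed buffer per CPU, handle complemented to catch raw derefs. */
static void *alloc_percpu_demo(size_t size)
{
	struct percpu_data *pdata = malloc(sizeof(*pdata));
	int i;

	if (!pdata)
		return NULL;
	for (i = 0; i < NR_CPUS; i++) {
		pdata->ptrs[i] = calloc(1, size);
		if (!pdata->ptrs[i])
			goto unwind;
	}
	return (void *)(~(uintptr_t)pdata);	/* catch derefs w/o wrappers */

unwind:
	while (--i >= 0)
		free(pdata->ptrs[i]);
	free(pdata);
	return NULL;
}

/* Analogue of the per_cpu_ptr() wrapper: undo the complement, pick a slot. */
static void *percpu_ptr_demo(void *handle, int cpu)
{
	struct percpu_data *pdata = (struct percpu_data *)(~(uintptr_t)handle);

	return pdata->ptrs[cpu];
}

int main(void)
{
	void *counters = alloc_percpu_demo(sizeof(long));

	if (!counters)
		return 1;
	*(long *)percpu_ptr_demo(counters, 2) = 42;
	printf("cpu2 counter = %ld\n", *(long *)percpu_ptr_demo(counters, 2));
	return 0;
}

free_percpu(), further down in this diff, undoes the same complement and releases each per-CPU buffer.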
@@ -3006,20 +3053,6 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp)
3006EXPORT_SYMBOL(kmem_cache_free); 3053EXPORT_SYMBOL(kmem_cache_free);
3007 3054
3008/** 3055/**
3009 * kzalloc - allocate memory. The memory is set to zero.
3010 * @size: how many bytes of memory are required.
3011 * @flags: the type of memory to allocate.
3012 */
3013void *kzalloc(size_t size, gfp_t flags)
3014{
3015 void *ret = kmalloc(size, flags);
3016 if (ret)
3017 memset(ret, 0, size);
3018 return ret;
3019}
3020EXPORT_SYMBOL(kzalloc);
3021
3022/**
3023 * kfree - free previously allocated memory 3056 * kfree - free previously allocated memory
3024 * @objp: pointer returned by kmalloc. 3057 * @objp: pointer returned by kmalloc.
3025 * 3058 *
@@ -3038,7 +3071,7 @@ void kfree(const void *objp)
3038 local_irq_save(flags); 3071 local_irq_save(flags);
3039 kfree_debugcheck(objp); 3072 kfree_debugcheck(objp);
3040 c = page_get_cache(virt_to_page(objp)); 3073 c = page_get_cache(virt_to_page(objp));
3041 __cache_free(c, (void*)objp); 3074 __cache_free(c, (void *)objp);
3042 local_irq_restore(flags); 3075 local_irq_restore(flags);
3043} 3076}
3044EXPORT_SYMBOL(kfree); 3077EXPORT_SYMBOL(kfree);
@@ -3051,17 +3084,16 @@ EXPORT_SYMBOL(kfree);
3051 * Don't free memory not originally allocated by alloc_percpu() 3084 * Don't free memory not originally allocated by alloc_percpu()
3052 * The complemented objp is to check for that. 3085 * The complemented objp is to check for that.
3053 */ 3086 */
3054void 3087void free_percpu(const void *objp)
3055free_percpu(const void *objp)
3056{ 3088{
3057 int i; 3089 int i;
3058 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); 3090 struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3059 3091
3060 /* 3092 /*
3061 * We allocate for all cpus so we cannot use for online cpu here. 3093 * We allocate for all cpus so we cannot use for online cpu here.
3062 */ 3094 */
3063 for_each_cpu(i) 3095 for_each_cpu(i)
3064 kfree(p->ptrs[i]); 3096 kfree(p->ptrs[i]);
3065 kfree(p); 3097 kfree(p);
3066} 3098}
3067EXPORT_SYMBOL(free_percpu); 3099EXPORT_SYMBOL(free_percpu);
@@ -3095,44 +3127,44 @@ static int alloc_kmemlist(kmem_cache_t *cachep)
3095 if (!(new_alien = alloc_alien_cache(node, cachep->limit))) 3127 if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
3096 goto fail; 3128 goto fail;
3097#endif 3129#endif
3098 if (!(new = alloc_arraycache(node, (cachep->shared* 3130 if (!(new = alloc_arraycache(node, (cachep->shared *
3099 cachep->batchcount), 0xbaadf00d))) 3131 cachep->batchcount),
3132 0xbaadf00d)))
3100 goto fail; 3133 goto fail;
3101 if ((l3 = cachep->nodelists[node])) { 3134 if ((l3 = cachep->nodelists[node])) {
3102 3135
3103 spin_lock_irq(&l3->list_lock); 3136 spin_lock_irq(&l3->list_lock);
3104 3137
3105 if ((nc = cachep->nodelists[node]->shared)) 3138 if ((nc = cachep->nodelists[node]->shared))
3106 free_block(cachep, nc->entry, 3139 free_block(cachep, nc->entry, nc->avail, node);
3107 nc->avail, node);
3108 3140
3109 l3->shared = new; 3141 l3->shared = new;
3110 if (!cachep->nodelists[node]->alien) { 3142 if (!cachep->nodelists[node]->alien) {
3111 l3->alien = new_alien; 3143 l3->alien = new_alien;
3112 new_alien = NULL; 3144 new_alien = NULL;
3113 } 3145 }
3114 l3->free_limit = (1 + nr_cpus_node(node))* 3146 l3->free_limit = (1 + nr_cpus_node(node)) *
3115 cachep->batchcount + cachep->num; 3147 cachep->batchcount + cachep->num;
3116 spin_unlock_irq(&l3->list_lock); 3148 spin_unlock_irq(&l3->list_lock);
3117 kfree(nc); 3149 kfree(nc);
3118 free_alien_cache(new_alien); 3150 free_alien_cache(new_alien);
3119 continue; 3151 continue;
3120 } 3152 }
3121 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3153 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
3122 GFP_KERNEL, node))) 3154 GFP_KERNEL, node)))
3123 goto fail; 3155 goto fail;
3124 3156
3125 kmem_list3_init(l3); 3157 kmem_list3_init(l3);
3126 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3158 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3127 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 3159 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3128 l3->shared = new; 3160 l3->shared = new;
3129 l3->alien = new_alien; 3161 l3->alien = new_alien;
3130 l3->free_limit = (1 + nr_cpus_node(node))* 3162 l3->free_limit = (1 + nr_cpus_node(node)) *
3131 cachep->batchcount + cachep->num; 3163 cachep->batchcount + cachep->num;
3132 cachep->nodelists[node] = l3; 3164 cachep->nodelists[node] = l3;
3133 } 3165 }
3134 return err; 3166 return err;
3135fail: 3167 fail:
3136 err = -ENOMEM; 3168 err = -ENOMEM;
3137 return err; 3169 return err;
3138} 3170}
@@ -3154,18 +3186,19 @@ static void do_ccupdate_local(void *info)
3154 new->new[smp_processor_id()] = old; 3186 new->new[smp_processor_id()] = old;
3155} 3187}
3156 3188
3157
3158static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, 3189static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3159 int shared) 3190 int shared)
3160{ 3191{
3161 struct ccupdate_struct new; 3192 struct ccupdate_struct new;
3162 int i, err; 3193 int i, err;
3163 3194
3164 memset(&new.new,0,sizeof(new.new)); 3195 memset(&new.new, 0, sizeof(new.new));
3165 for_each_online_cpu(i) { 3196 for_each_online_cpu(i) {
3166 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); 3197 new.new[i] =
3198 alloc_arraycache(cpu_to_node(i), limit, batchcount);
3167 if (!new.new[i]) { 3199 if (!new.new[i]) {
3168 for (i--; i >= 0; i--) kfree(new.new[i]); 3200 for (i--; i >= 0; i--)
3201 kfree(new.new[i]);
3169 return -ENOMEM; 3202 return -ENOMEM;
3170 } 3203 }
3171 } 3204 }
@@ -3193,13 +3226,12 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3193 err = alloc_kmemlist(cachep); 3226 err = alloc_kmemlist(cachep);
3194 if (err) { 3227 if (err) {
3195 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", 3228 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3196 cachep->name, -err); 3229 cachep->name, -err);
3197 BUG(); 3230 BUG();
3198 } 3231 }
3199 return 0; 3232 return 0;
3200} 3233}
3201 3234
3202
3203static void enable_cpucache(kmem_cache_t *cachep) 3235static void enable_cpucache(kmem_cache_t *cachep)
3204{ 3236{
3205 int err; 3237 int err;
@@ -3246,14 +3278,14 @@ static void enable_cpucache(kmem_cache_t *cachep)
3246 if (limit > 32) 3278 if (limit > 32)
3247 limit = 32; 3279 limit = 32;
3248#endif 3280#endif
3249 err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); 3281 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3250 if (err) 3282 if (err)
3251 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3283 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3252 cachep->name, -err); 3284 cachep->name, -err);
3253} 3285}
3254 3286
3255static void drain_array_locked(kmem_cache_t *cachep, 3287static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
3256 struct array_cache *ac, int force, int node) 3288 int force, int node)
3257{ 3289{
3258 int tofree; 3290 int tofree;
3259 3291
@@ -3261,14 +3293,14 @@ static void drain_array_locked(kmem_cache_t *cachep,
3261 if (ac->touched && !force) { 3293 if (ac->touched && !force) {
3262 ac->touched = 0; 3294 ac->touched = 0;
3263 } else if (ac->avail) { 3295 } else if (ac->avail) {
3264 tofree = force ? ac->avail : (ac->limit+4)/5; 3296 tofree = force ? ac->avail : (ac->limit + 4) / 5;
3265 if (tofree > ac->avail) { 3297 if (tofree > ac->avail) {
3266 tofree = (ac->avail+1)/2; 3298 tofree = (ac->avail + 1) / 2;
3267 } 3299 }
3268 free_block(cachep, ac->entry, tofree, node); 3300 free_block(cachep, ac->entry, tofree, node);
3269 ac->avail -= tofree; 3301 ac->avail -= tofree;
3270 memmove(ac->entry, &(ac->entry[tofree]), 3302 memmove(ac->entry, &(ac->entry[tofree]),
3271 sizeof(void*)*ac->avail); 3303 sizeof(void *) * ac->avail);
3272 } 3304 }
3273} 3305}
3274 3306
@@ -3291,13 +3323,14 @@ static void cache_reap(void *unused)
3291 3323
3292 if (down_trylock(&cache_chain_sem)) { 3324 if (down_trylock(&cache_chain_sem)) {
3293 /* Give up. Setup the next iteration. */ 3325 /* Give up. Setup the next iteration. */
3294 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3326 schedule_delayed_work(&__get_cpu_var(reap_work),
3327 REAPTIMEOUT_CPUC);
3295 return; 3328 return;
3296 } 3329 }
3297 3330
3298 list_for_each(walk, &cache_chain) { 3331 list_for_each(walk, &cache_chain) {
3299 kmem_cache_t *searchp; 3332 kmem_cache_t *searchp;
3300 struct list_head* p; 3333 struct list_head *p;
3301 int tofree; 3334 int tofree;
3302 struct slab *slabp; 3335 struct slab *slabp;
3303 3336
@@ -3314,7 +3347,7 @@ static void cache_reap(void *unused)
3314 spin_lock_irq(&l3->list_lock); 3347 spin_lock_irq(&l3->list_lock);
3315 3348
3316 drain_array_locked(searchp, ac_data(searchp), 0, 3349 drain_array_locked(searchp, ac_data(searchp), 0,
3317 numa_node_id()); 3350 numa_node_id());
3318 3351
3319 if (time_after(l3->next_reap, jiffies)) 3352 if (time_after(l3->next_reap, jiffies))
3320 goto next_unlock; 3353 goto next_unlock;
@@ -3323,14 +3356,16 @@ static void cache_reap(void *unused)
3323 3356
3324 if (l3->shared) 3357 if (l3->shared)
3325 drain_array_locked(searchp, l3->shared, 0, 3358 drain_array_locked(searchp, l3->shared, 0,
3326 numa_node_id()); 3359 numa_node_id());
3327 3360
3328 if (l3->free_touched) { 3361 if (l3->free_touched) {
3329 l3->free_touched = 0; 3362 l3->free_touched = 0;
3330 goto next_unlock; 3363 goto next_unlock;
3331 } 3364 }
3332 3365
3333 tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num); 3366 tofree =
3367 (l3->free_limit + 5 * searchp->num -
3368 1) / (5 * searchp->num);
3334 do { 3369 do {
3335 p = l3->slabs_free.next; 3370 p = l3->slabs_free.next;
3336 if (p == &(l3->slabs_free)) 3371 if (p == &(l3->slabs_free))
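The tofree computation above is a ceiling division: each cache_reap() pass frees at most roughly one fifth of the node's free_limit, rounded up to whole slabs of searchp->num objects. A worked example with illustrative numbers:

#include <stdio.h>

int main(void)
{
	/* Illustrative values: free_limit objects allowed to sit free on the
	 * node, num objects per slab. */
	unsigned int free_limit = 124, num = 8;
	unsigned int tofree = (free_limit + 5 * num - 1) / (5 * num);

	printf("reap up to %u free slabs per pass\n", tofree);	/* 4 */
	return 0;
}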
@@ -3350,10 +3385,10 @@ static void cache_reap(void *unused)
3350 spin_unlock_irq(&l3->list_lock); 3385 spin_unlock_irq(&l3->list_lock);
3351 slab_destroy(searchp, slabp); 3386 slab_destroy(searchp, slabp);
3352 spin_lock_irq(&l3->list_lock); 3387 spin_lock_irq(&l3->list_lock);
3353 } while(--tofree > 0); 3388 } while (--tofree > 0);
3354next_unlock: 3389 next_unlock:
3355 spin_unlock_irq(&l3->list_lock); 3390 spin_unlock_irq(&l3->list_lock);
3356next: 3391 next:
3357 cond_resched(); 3392 cond_resched();
3358 } 3393 }
3359 check_irq_on(); 3394 check_irq_on();
@@ -3365,32 +3400,37 @@ next:
3365 3400
3366#ifdef CONFIG_PROC_FS 3401#ifdef CONFIG_PROC_FS
3367 3402
3368static void *s_start(struct seq_file *m, loff_t *pos) 3403static void print_slabinfo_header(struct seq_file *m)
3369{ 3404{
3370 loff_t n = *pos; 3405 /*
3371 struct list_head *p; 3406 * Output format version, so at least we can change it
3372 3407 * without _too_ many complaints.
3373 down(&cache_chain_sem); 3408 */
3374 if (!n) {
3375 /*
3376 * Output format version, so at least we can change it
3377 * without _too_ many complaints.
3378 */
3379#if STATS 3409#if STATS
3380 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 3410 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
3381#else 3411#else
3382 seq_puts(m, "slabinfo - version: 2.1\n"); 3412 seq_puts(m, "slabinfo - version: 2.1\n");
3383#endif 3413#endif
3384 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); 3414 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
3385 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 3415 "<objperslab> <pagesperslab>");
3386 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 3416 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3417 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3387#if STATS 3418#if STATS
3388 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" 3419 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
3389 " <error> <maxfreeable> <nodeallocs> <remotefrees>"); 3420 "<error> <maxfreeable> <nodeallocs> <remotefrees>");
3390 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 3421 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
3391#endif 3422#endif
3392 seq_putc(m, '\n'); 3423 seq_putc(m, '\n');
3393 } 3424}
3425
3426static void *s_start(struct seq_file *m, loff_t *pos)
3427{
3428 loff_t n = *pos;
3429 struct list_head *p;
3430
3431 down(&cache_chain_sem);
3432 if (!n)
3433 print_slabinfo_header(m);
3394 p = cache_chain.next; 3434 p = cache_chain.next;
3395 while (n--) { 3435 while (n--) {
3396 p = p->next; 3436 p = p->next;
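print_slabinfo_header(), factored out above, documents the /proc/slabinfo layout: a name column, object and slab counts, then the tunables and slabdata groups (plus statistics when STATS is set). The snippet below reuses the row format strings from s_show() later in this diff to render one sample line; the cache name and every number are made up purely for illustration.

#include <stdio.h>

int main(void)
{
	/* Illustrative values only; real rows come from s_show(). */
	const char *name = "demo_cache";
	unsigned long active_objs = 1200, num_objs = 1500;
	unsigned int objsize = 136, objperslab = 29;
	int pagesperslab = 1;
	unsigned int limit = 120, batchcount = 60, shared = 8;
	unsigned long active_slabs = 45, num_slabs = 52, shared_avail = 0;

	printf("%-17s %6lu %6lu %6u %4u %4d", name, active_objs, num_objs,
	       objsize, objperslab, pagesperslab);
	printf(" : tunables %4u %4u %4u", limit, batchcount, shared);
	printf(" : slabdata %6lu %6lu %6lu\n",
	       active_slabs, num_slabs, shared_avail);
	return 0;
}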
@@ -3405,7 +3445,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3405 kmem_cache_t *cachep = p; 3445 kmem_cache_t *cachep = p;
3406 ++*pos; 3446 ++*pos;
3407 return cachep->next.next == &cache_chain ? NULL 3447 return cachep->next.next == &cache_chain ? NULL
3408 : list_entry(cachep->next.next, kmem_cache_t, next); 3448 : list_entry(cachep->next.next, kmem_cache_t, next);
3409} 3449}
3410 3450
3411static void s_stop(struct seq_file *m, void *p) 3451static void s_stop(struct seq_file *m, void *p)
@@ -3417,11 +3457,11 @@ static int s_show(struct seq_file *m, void *p)
3417{ 3457{
3418 kmem_cache_t *cachep = p; 3458 kmem_cache_t *cachep = p;
3419 struct list_head *q; 3459 struct list_head *q;
3420 struct slab *slabp; 3460 struct slab *slabp;
3421 unsigned long active_objs; 3461 unsigned long active_objs;
3422 unsigned long num_objs; 3462 unsigned long num_objs;
3423 unsigned long active_slabs = 0; 3463 unsigned long active_slabs = 0;
3424 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3464 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
3425 const char *name; 3465 const char *name;
3426 char *error = NULL; 3466 char *error = NULL;
3427 int node; 3467 int node;
@@ -3438,14 +3478,14 @@ static int s_show(struct seq_file *m, void *p)
3438 3478
3439 spin_lock(&l3->list_lock); 3479 spin_lock(&l3->list_lock);
3440 3480
3441 list_for_each(q,&l3->slabs_full) { 3481 list_for_each(q, &l3->slabs_full) {
3442 slabp = list_entry(q, struct slab, list); 3482 slabp = list_entry(q, struct slab, list);
3443 if (slabp->inuse != cachep->num && !error) 3483 if (slabp->inuse != cachep->num && !error)
3444 error = "slabs_full accounting error"; 3484 error = "slabs_full accounting error";
3445 active_objs += cachep->num; 3485 active_objs += cachep->num;
3446 active_slabs++; 3486 active_slabs++;
3447 } 3487 }
3448 list_for_each(q,&l3->slabs_partial) { 3488 list_for_each(q, &l3->slabs_partial) {
3449 slabp = list_entry(q, struct slab, list); 3489 slabp = list_entry(q, struct slab, list);
3450 if (slabp->inuse == cachep->num && !error) 3490 if (slabp->inuse == cachep->num && !error)
3451 error = "slabs_partial inuse accounting error"; 3491 error = "slabs_partial inuse accounting error";
@@ -3454,7 +3494,7 @@ static int s_show(struct seq_file *m, void *p)
3454 active_objs += slabp->inuse; 3494 active_objs += slabp->inuse;
3455 active_slabs++; 3495 active_slabs++;
3456 } 3496 }
3457 list_for_each(q,&l3->slabs_free) { 3497 list_for_each(q, &l3->slabs_free) {
3458 slabp = list_entry(q, struct slab, list); 3498 slabp = list_entry(q, struct slab, list);
3459 if (slabp->inuse && !error) 3499 if (slabp->inuse && !error)
3460 error = "slabs_free/inuse accounting error"; 3500 error = "slabs_free/inuse accounting error";
@@ -3465,25 +3505,24 @@ static int s_show(struct seq_file *m, void *p)
3465 3505
3466 spin_unlock(&l3->list_lock); 3506 spin_unlock(&l3->list_lock);
3467 } 3507 }
3468 num_slabs+=active_slabs; 3508 num_slabs += active_slabs;
3469 num_objs = num_slabs*cachep->num; 3509 num_objs = num_slabs * cachep->num;
3470 if (num_objs - active_objs != free_objects && !error) 3510 if (num_objs - active_objs != free_objects && !error)
3471 error = "free_objects accounting error"; 3511 error = "free_objects accounting error";
3472 3512
3473 name = cachep->name; 3513 name = cachep->name;
3474 if (error) 3514 if (error)
3475 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3515 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3476 3516
3477 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3517 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3478 name, active_objs, num_objs, cachep->objsize, 3518 name, active_objs, num_objs, cachep->objsize,
3479 cachep->num, (1<<cachep->gfporder)); 3519 cachep->num, (1 << cachep->gfporder));
3480 seq_printf(m, " : tunables %4u %4u %4u", 3520 seq_printf(m, " : tunables %4u %4u %4u",
3481 cachep->limit, cachep->batchcount, 3521 cachep->limit, cachep->batchcount, cachep->shared);
3482 cachep->shared);
3483 seq_printf(m, " : slabdata %6lu %6lu %6lu", 3522 seq_printf(m, " : slabdata %6lu %6lu %6lu",
3484 active_slabs, num_slabs, shared_avail); 3523 active_slabs, num_slabs, shared_avail);
3485#if STATS 3524#if STATS
3486 { /* list3 stats */ 3525 { /* list3 stats */
3487 unsigned long high = cachep->high_mark; 3526 unsigned long high = cachep->high_mark;
3488 unsigned long allocs = cachep->num_allocations; 3527 unsigned long allocs = cachep->num_allocations;
3489 unsigned long grown = cachep->grown; 3528 unsigned long grown = cachep->grown;
@@ -3494,9 +3533,7 @@ static int s_show(struct seq_file *m, void *p)
3494 unsigned long node_frees = cachep->node_frees; 3533 unsigned long node_frees = cachep->node_frees;
3495 3534
3496 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3535 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
3497 %4lu %4lu %4lu %4lu", 3536 %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
3498 allocs, high, grown, reaped, errors,
3499 max_freeable, node_allocs, node_frees);
3500 } 3537 }
3501 /* cpu stats */ 3538 /* cpu stats */
3502 { 3539 {
@@ -3506,7 +3543,7 @@ static int s_show(struct seq_file *m, void *p)
3506 unsigned long freemiss = atomic_read(&cachep->freemiss); 3543 unsigned long freemiss = atomic_read(&cachep->freemiss);
3507 3544
3508 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 3545 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
3509 allochit, allocmiss, freehit, freemiss); 3546 allochit, allocmiss, freehit, freemiss);
3510 } 3547 }
3511#endif 3548#endif
3512 seq_putc(m, '\n'); 3549 seq_putc(m, '\n');
@@ -3529,10 +3566,10 @@ static int s_show(struct seq_file *m, void *p)
3529 */ 3566 */
3530 3567
3531struct seq_operations slabinfo_op = { 3568struct seq_operations slabinfo_op = {
3532 .start = s_start, 3569 .start = s_start,
3533 .next = s_next, 3570 .next = s_next,
3534 .stop = s_stop, 3571 .stop = s_stop,
3535 .show = s_show, 3572 .show = s_show,
3536}; 3573};
3537 3574
3538#define MAX_SLABINFO_WRITE 128 3575#define MAX_SLABINFO_WRITE 128
@@ -3543,18 +3580,18 @@ struct seq_operations slabinfo_op = {
3543 * @count: data length 3580 * @count: data length
3544 * @ppos: unused 3581 * @ppos: unused
3545 */ 3582 */
3546ssize_t slabinfo_write(struct file *file, const char __user *buffer, 3583ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3547 size_t count, loff_t *ppos) 3584 size_t count, loff_t *ppos)
3548{ 3585{
3549 char kbuf[MAX_SLABINFO_WRITE+1], *tmp; 3586 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3550 int limit, batchcount, shared, res; 3587 int limit, batchcount, shared, res;
3551 struct list_head *p; 3588 struct list_head *p;
3552 3589
3553 if (count > MAX_SLABINFO_WRITE) 3590 if (count > MAX_SLABINFO_WRITE)
3554 return -EINVAL; 3591 return -EINVAL;
3555 if (copy_from_user(&kbuf, buffer, count)) 3592 if (copy_from_user(&kbuf, buffer, count))
3556 return -EFAULT; 3593 return -EFAULT;
3557 kbuf[MAX_SLABINFO_WRITE] = '\0'; 3594 kbuf[MAX_SLABINFO_WRITE] = '\0';
3558 3595
3559 tmp = strchr(kbuf, ' '); 3596 tmp = strchr(kbuf, ' ');
3560 if (!tmp) 3597 if (!tmp)
@@ -3567,18 +3604,17 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
3567 /* Find the cache in the chain of caches. */ 3604 /* Find the cache in the chain of caches. */
3568 down(&cache_chain_sem); 3605 down(&cache_chain_sem);
3569 res = -EINVAL; 3606 res = -EINVAL;
3570 list_for_each(p,&cache_chain) { 3607 list_for_each(p, &cache_chain) {
3571 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 3608 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
3572 3609
3573 if (!strcmp(cachep->name, kbuf)) { 3610 if (!strcmp(cachep->name, kbuf)) {
3574 if (limit < 1 || 3611 if (limit < 1 ||
3575 batchcount < 1 || 3612 batchcount < 1 ||
3576 batchcount > limit || 3613 batchcount > limit || shared < 0) {
3577 shared < 0) {
3578 res = 0; 3614 res = 0;
3579 } else { 3615 } else {
3580 res = do_tune_cpucache(cachep, limit, 3616 res = do_tune_cpucache(cachep, limit,
3581 batchcount, shared); 3617 batchcount, shared);
3582 } 3618 }
3583 break; 3619 break;
3584 } 3620 }
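slabinfo_write() above accepts a single line of the form "<cache-name> <limit> <batchcount> <shared>", looks the cache up on cache_chain, validates the three values, and applies them with do_tune_cpucache(). A hedged userspace example of driving that interface; it needs root, and the cache name here is only an illustration of the expected format, not a guaranteed cache.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/slabinfo", "w");

	if (!f) {
		perror("open /proc/slabinfo");
		return 1;
	}
	/* Format expected by slabinfo_write(): "<name> <limit> <batchcount> <shared>" */
	fprintf(f, "demo_cache 120 60 8\n");
	fclose(f);
	return 0;
}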
@@ -3609,26 +3645,3 @@ unsigned int ksize(const void *objp)
3609 3645
3610 return obj_reallen(page_get_cache(virt_to_page(objp))); 3646 return obj_reallen(page_get_cache(virt_to_page(objp)));
3611} 3647}
3612
3613
3614/*
3615 * kstrdup - allocate space for and copy an existing string
3616 *
3617 * @s: the string to duplicate
3618 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
3619 */
3620char *kstrdup(const char *s, gfp_t gfp)
3621{
3622 size_t len;
3623 char *buf;
3624
3625 if (!s)
3626 return NULL;
3627
3628 len = strlen(s) + 1;
3629 buf = kmalloc(len, gfp);
3630 if (buf)
3631 memcpy(buf, s, len);
3632 return buf;
3633}
3634EXPORT_SYMBOL(kstrdup);
diff --git a/mm/slob.c b/mm/slob.c
new file mode 100644
index 000000000000..1c240c4b71d9
--- /dev/null
+++ b/mm/slob.c
@@ -0,0 +1,385 @@
1/*
2 * SLOB Allocator: Simple List Of Blocks
3 *
4 * Matt Mackall <mpm@selenic.com> 12/30/03
5 *
6 * How SLOB works:
7 *
8 * The core of SLOB is a traditional K&R style heap allocator, with
9 * support for returning aligned objects. The granularity of this
10 * allocator is 8 bytes on x86, though it's perhaps possible to reduce
11 * this to 4 if it's deemed worth the effort. The slob heap is a
12 * singly-linked list of pages from __get_free_page, grown on demand
13 * and allocation from the heap is currently first-fit.
14 *
15 * Above this is an implementation of kmalloc/kfree. Blocks returned
16 * from kmalloc are 8-byte aligned and prepended with an 8-byte header.
17 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
18 * __get_free_pages directly so that it can return page-aligned blocks
19 * and keeps a linked list of such pages and their orders. These
20 * objects are detected in kfree() by their page alignment.
21 *
22 * SLAB is emulated on top of SLOB by simply calling constructors and
23 * destructors for every SLAB allocation. Objects are returned with
24 * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is
25 * set, in which case the low-level allocator will fragment blocks to
26 * create the proper alignment. Again, objects of page-size or greater
27 * are allocated by calling __get_free_pages. As SLAB objects know
28 * their size, no separate size bookkeeping is necessary and there is
29 * essentially no allocation space overhead.
30 */
31
32#include <linux/config.h>
33#include <linux/slab.h>
34#include <linux/mm.h>
35#include <linux/cache.h>
36#include <linux/init.h>
37#include <linux/module.h>
38#include <linux/timer.h>
39
40struct slob_block {
41 int units;
42 struct slob_block *next;
43};
44typedef struct slob_block slob_t;
45
46#define SLOB_UNIT sizeof(slob_t)
47#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
48#define SLOB_ALIGN L1_CACHE_BYTES
49
50struct bigblock {
51 int order;
52 void *pages;
53 struct bigblock *next;
54};
55typedef struct bigblock bigblock_t;
56
57static slob_t arena = { .next = &arena, .units = 1 };
58static slob_t *slobfree = &arena;
59static bigblock_t *bigblocks;
60static DEFINE_SPINLOCK(slob_lock);
61static DEFINE_SPINLOCK(block_lock);
62
63static void slob_free(void *b, int size);
64
65static void *slob_alloc(size_t size, gfp_t gfp, int align)
66{
67 slob_t *prev, *cur, *aligned = 0;
68 int delta = 0, units = SLOB_UNITS(size);
69 unsigned long flags;
70
71 spin_lock_irqsave(&slob_lock, flags);
72 prev = slobfree;
73 for (cur = prev->next; ; prev = cur, cur = cur->next) {
74 if (align) {
75 aligned = (slob_t *)ALIGN((unsigned long)cur, align);
76 delta = aligned - cur;
77 }
78 if (cur->units >= units + delta) { /* room enough? */
79 if (delta) { /* need to fragment head to align? */
80 aligned->units = cur->units - delta;
81 aligned->next = cur->next;
82 cur->next = aligned;
83 cur->units = delta;
84 prev = cur;
85 cur = aligned;
86 }
87
88 if (cur->units == units) /* exact fit? */
89 prev->next = cur->next; /* unlink */
90 else { /* fragment */
91 prev->next = cur + units;
92 prev->next->units = cur->units - units;
93 prev->next->next = cur->next;
94 cur->units = units;
95 }
96
97 slobfree = prev;
98 spin_unlock_irqrestore(&slob_lock, flags);
99 return cur;
100 }
101 if (cur == slobfree) {
102 spin_unlock_irqrestore(&slob_lock, flags);
103
104 if (size == PAGE_SIZE) /* trying to shrink arena? */
105 return 0;
106
107 cur = (slob_t *)__get_free_page(gfp);
108 if (!cur)
109 return 0;
110
111 slob_free(cur, PAGE_SIZE);
112 spin_lock_irqsave(&slob_lock, flags);
113 cur = slobfree;
114 }
115 }
116}
117
118static void slob_free(void *block, int size)
119{
120 slob_t *cur, *b = (slob_t *)block;
121 unsigned long flags;
122
123 if (!block)
124 return;
125
126 if (size)
127 b->units = SLOB_UNITS(size);
128
129 /* Find reinsertion point */
130 spin_lock_irqsave(&slob_lock, flags);
131 for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
132 if (cur >= cur->next && (b > cur || b < cur->next))
133 break;
134
135 if (b + b->units == cur->next) {
136 b->units += cur->next->units;
137 b->next = cur->next->next;
138 } else
139 b->next = cur->next;
140
141 if (cur + cur->units == b) {
142 cur->units += b->units;
143 cur->next = b->next;
144 } else
145 cur->next = b;
146
147 slobfree = cur;
148
149 spin_unlock_irqrestore(&slob_lock, flags);
150}
151
152static int FASTCALL(find_order(int size));
153static int fastcall find_order(int size)
154{
155 int order = 0;
156 for ( ; size > 4096 ; size >>=1)
157 order++;
158 return order;
159}
160
161void *kmalloc(size_t size, gfp_t gfp)
162{
163 slob_t *m;
164 bigblock_t *bb;
165 unsigned long flags;
166
167 if (size < PAGE_SIZE - SLOB_UNIT) {
168 m = slob_alloc(size + SLOB_UNIT, gfp, 0);
169 return m ? (void *)(m + 1) : 0;
170 }
171
172 bb = slob_alloc(sizeof(bigblock_t), gfp, 0);
173 if (!bb)
174 return 0;
175
176 bb->order = find_order(size);
177 bb->pages = (void *)__get_free_pages(gfp, bb->order);
178
179 if (bb->pages) {
180 spin_lock_irqsave(&block_lock, flags);
181 bb->next = bigblocks;
182 bigblocks = bb;
183 spin_unlock_irqrestore(&block_lock, flags);
184 return bb->pages;
185 }
186
187 slob_free(bb, sizeof(bigblock_t));
188 return 0;
189}
190
191EXPORT_SYMBOL(kmalloc);
192
193void kfree(const void *block)
194{
195 bigblock_t *bb, **last = &bigblocks;
196 unsigned long flags;
197
198 if (!block)
199 return;
200
201 if (!((unsigned long)block & (PAGE_SIZE-1))) {
202 /* might be on the big block list */
203 spin_lock_irqsave(&block_lock, flags);
204 for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
205 if (bb->pages == block) {
206 *last = bb->next;
207 spin_unlock_irqrestore(&block_lock, flags);
208 free_pages((unsigned long)block, bb->order);
209 slob_free(bb, sizeof(bigblock_t));
210 return;
211 }
212 }
213 spin_unlock_irqrestore(&block_lock, flags);
214 }
215
216 slob_free((slob_t *)block - 1, 0);
217 return;
218}
219
220EXPORT_SYMBOL(kfree);
221
222unsigned int ksize(const void *block)
223{
224 bigblock_t *bb;
225 unsigned long flags;
226
227 if (!block)
228 return 0;
229
230 if (!((unsigned long)block & (PAGE_SIZE-1))) {
231 spin_lock_irqsave(&block_lock, flags);
232 for (bb = bigblocks; bb; bb = bb->next)
233 if (bb->pages == block) {
234 spin_unlock_irqrestore(&block_lock, flags);
235 return PAGE_SIZE << bb->order;
236 }
237 spin_unlock_irqrestore(&block_lock, flags);
238 }
239
240 return ((slob_t *)block - 1)->units * SLOB_UNIT;
241}
242
243struct kmem_cache {
244 unsigned int size, align;
245 const char *name;
246 void (*ctor)(void *, struct kmem_cache *, unsigned long);
247 void (*dtor)(void *, struct kmem_cache *, unsigned long);
248};
249
250struct kmem_cache *kmem_cache_create(const char *name, size_t size,
251 size_t align, unsigned long flags,
252 void (*ctor)(void*, struct kmem_cache *, unsigned long),
253 void (*dtor)(void*, struct kmem_cache *, unsigned long))
254{
255 struct kmem_cache *c;
256
257 c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
258
259 if (c) {
260 c->name = name;
261 c->size = size;
262 c->ctor = ctor;
263 c->dtor = dtor;
264 /* ignore alignment unless it's forced */
265 c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
266 if (c->align < align)
267 c->align = align;
268 }
269
270 return c;
271}
272EXPORT_SYMBOL(kmem_cache_create);
273
274int kmem_cache_destroy(struct kmem_cache *c)
275{
276 slob_free(c, sizeof(struct kmem_cache));
277 return 0;
278}
279EXPORT_SYMBOL(kmem_cache_destroy);
280
281void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
282{
283 void *b;
284
285 if (c->size < PAGE_SIZE)
286 b = slob_alloc(c->size, flags, c->align);
287 else
288 b = (void *)__get_free_pages(flags, find_order(c->size));
289
290 if (c->ctor)
291 c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR);
292
293 return b;
294}
295EXPORT_SYMBOL(kmem_cache_alloc);
296
297void kmem_cache_free(struct kmem_cache *c, void *b)
298{
299 if (c->dtor)
300 c->dtor(b, c, 0);
301
302 if (c->size < PAGE_SIZE)
303 slob_free(b, c->size);
304 else
305 free_pages((unsigned long)b, find_order(c->size));
306}
307EXPORT_SYMBOL(kmem_cache_free);
308
309unsigned int kmem_cache_size(struct kmem_cache *c)
310{
311 return c->size;
312}
313EXPORT_SYMBOL(kmem_cache_size);
314
315const char *kmem_cache_name(struct kmem_cache *c)
316{
317 return c->name;
318}
319EXPORT_SYMBOL(kmem_cache_name);
320
321static struct timer_list slob_timer = TIMER_INITIALIZER(
322 (void (*)(unsigned long))kmem_cache_init, 0, 0);
323
324void kmem_cache_init(void)
325{
326 void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
327
328 if (p)
329 free_page((unsigned long)p);
330
331 mod_timer(&slob_timer, jiffies + HZ);
332}
333
334atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
335EXPORT_SYMBOL(slab_reclaim_pages);
336
337#ifdef CONFIG_SMP
338
339void *__alloc_percpu(size_t size, size_t align)
340{
341 int i;
342 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
343
344 if (!pdata)
345 return NULL;
346
347 for (i = 0; i < NR_CPUS; i++) {
348 if (!cpu_possible(i))
349 continue;
350 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
351 if (!pdata->ptrs[i])
352 goto unwind_oom;
353 memset(pdata->ptrs[i], 0, size);
354 }
355
356 /* Catch derefs w/o wrappers */
357 return (void *) (~(unsigned long) pdata);
358
359unwind_oom:
360 while (--i >= 0) {
361 if (!cpu_possible(i))
362 continue;
363 kfree(pdata->ptrs[i]);
364 }
365 kfree(pdata);
366 return NULL;
367}
368EXPORT_SYMBOL(__alloc_percpu);
369
370void
371free_percpu(const void *objp)
372{
373 int i;
374 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
375
376 for (i = 0; i < NR_CPUS; i++) {
377 if (!cpu_possible(i))
378 continue;
379 kfree(p->ptrs[i]);
380 }
381 kfree(p);
382}
383EXPORT_SYMBOL(free_percpu);
384
385#endif
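To make the "How SLOB works" header at the top of mm/slob.c concrete: kmalloc() requests size + SLOB_UNIT from slob_alloc() and returns the address just past a one-unit header, while ksize() steps back over that header to read the recorded unit count. The userspace model below mirrors only that layout and the SLOB_UNITS() rounding; it is not the allocator itself, and malloc() stands in for the slob heap.

#include <stdio.h>
#include <stdlib.h>

/* Loose model of slob_t: a unit count plus a link. */
struct slob_block {
	int units;
	struct slob_block *next;
};

#define SLOB_UNIT  sizeof(struct slob_block)
#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1) / SLOB_UNIT)

/* kmalloc()-style layout: a one-unit header, then the payload. */
static void *demo_kmalloc(size_t size)
{
	struct slob_block *m = malloc(SLOB_UNITS(size + SLOB_UNIT) * SLOB_UNIT);

	if (!m)
		return NULL;
	m->units = (int)SLOB_UNITS(size + SLOB_UNIT);
	return m + 1;			/* caller sees only the payload */
}

/* ksize()-style query: step back one unit and read the header. */
static size_t demo_ksize(const void *block)
{
	return ((const struct slob_block *)block - 1)->units * SLOB_UNIT;
}

int main(void)
{
	void *p = demo_kmalloc(100);

	if (!p)
		return 1;
	printf("SLOB_UNIT=%zu, 100 bytes rounds to %zu units, ksize=%zu\n",
	       SLOB_UNIT, SLOB_UNITS(100), demo_ksize(p));
	free((struct slob_block *)p - 1);
	return 0;
}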
diff --git a/mm/sparse.c b/mm/sparse.c
index 72079b538e2d..0a51f36ba3a1 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -18,10 +18,10 @@
18 */ 18 */
19#ifdef CONFIG_SPARSEMEM_EXTREME 19#ifdef CONFIG_SPARSEMEM_EXTREME
20struct mem_section *mem_section[NR_SECTION_ROOTS] 20struct mem_section *mem_section[NR_SECTION_ROOTS]
21 ____cacheline_maxaligned_in_smp; 21 ____cacheline_internodealigned_in_smp;
22#else 22#else
23struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] 23struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
24 ____cacheline_maxaligned_in_smp; 24 ____cacheline_internodealigned_in_smp;
25#endif 25#endif
26EXPORT_SYMBOL(mem_section); 26EXPORT_SYMBOL(mem_section);
27 27
diff --git a/mm/swap_state.c b/mm/swap_state.c
index fc2aecb70a95..7b09ac503fec 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -141,7 +141,7 @@ void __delete_from_swap_cache(struct page *page)
141 * Allocate swap space for the page and add the page to the 141 * Allocate swap space for the page and add the page to the
142 * swap cache. Caller needs to hold the page lock. 142 * swap cache. Caller needs to hold the page lock.
143 */ 143 */
144int add_to_swap(struct page * page) 144int add_to_swap(struct page * page, gfp_t gfp_mask)
145{ 145{
146 swp_entry_t entry; 146 swp_entry_t entry;
147 int err; 147 int err;
@@ -166,7 +166,7 @@ int add_to_swap(struct page * page)
166 * Add it to the swap cache and mark it dirty 166 * Add it to the swap cache and mark it dirty
167 */ 167 */
168 err = __add_to_swap_cache(page, entry, 168 err = __add_to_swap_cache(page, entry,
169 GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN); 169 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
170 170
171 switch (err) { 171 switch (err) {
172 case 0: /* Success */ 172 case 0: /* Success */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6da4b28b896b..80f948a2028b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1493,7 +1493,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1493 goto bad_swap; 1493 goto bad_swap;
1494 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1494 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1495 goto bad_swap; 1495 goto bad_swap;
1496 1496
1497 /* OK, set up the swap map and apply the bad block list */ 1497 /* OK, set up the swap map and apply the bad block list */
1498 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1498 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
1499 error = -ENOMEM; 1499 error = -ENOMEM;
@@ -1502,17 +1502,17 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1502 1502
1503 error = 0; 1503 error = 0;
1504 memset(p->swap_map, 0, maxpages * sizeof(short)); 1504 memset(p->swap_map, 0, maxpages * sizeof(short));
1505 for (i=0; i<swap_header->info.nr_badpages; i++) { 1505 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1506 int page = swap_header->info.badpages[i]; 1506 int page_nr = swap_header->info.badpages[i];
1507 if (page <= 0 || page >= swap_header->info.last_page) 1507 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1508 error = -EINVAL; 1508 error = -EINVAL;
1509 else 1509 else
1510 p->swap_map[page] = SWAP_MAP_BAD; 1510 p->swap_map[page_nr] = SWAP_MAP_BAD;
1511 } 1511 }
1512 nr_good_pages = swap_header->info.last_page - 1512 nr_good_pages = swap_header->info.last_page -
1513 swap_header->info.nr_badpages - 1513 swap_header->info.nr_badpages -
1514 1 /* header page */; 1514 1 /* header page */;
1515 if (error) 1515 if (error)
1516 goto bad_swap; 1516 goto bad_swap;
1517 } 1517 }
1518 1518
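
A worked example of the accounting above, with illustrative numbers not taken from the patch: a swap header reporting last_page = 10000 and nr_badpages = 3 yields nr_good_pages = 10000 - 3 - 1 = 9996; the header page is never usable, and the three slots marked SWAP_MAP_BAD are skipped by the allocator.
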
diff --git a/mm/truncate.c b/mm/truncate.c
index 7dee32745901..b1a463d0fe71 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -249,7 +249,6 @@ unlock:
249 break; 249 break;
250 } 250 }
251 pagevec_release(&pvec); 251 pagevec_release(&pvec);
252 cond_resched();
253 } 252 }
254 return ret; 253 return ret;
255} 254}
diff --git a/mm/util.c b/mm/util.c
new file mode 100644
index 000000000000..5f4bb59da63c
--- /dev/null
+++ b/mm/util.c
@@ -0,0 +1,39 @@
1#include <linux/slab.h>
2#include <linux/string.h>
3#include <linux/module.h>
4
5/**
6 * kzalloc - allocate memory. The memory is set to zero.
7 * @size: how many bytes of memory are required.
8 * @flags: the type of memory to allocate.
9 */
10void *kzalloc(size_t size, gfp_t flags)
11{
12 void *ret = kmalloc(size, flags);
13 if (ret)
14 memset(ret, 0, size);
15 return ret;
16}
17EXPORT_SYMBOL(kzalloc);
18
19/**
20 * kstrdup - allocate space for and copy an existing string
21 *
22 * @s: the string to duplicate
23 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
24 */
25char *kstrdup(const char *s, gfp_t gfp)
26{
27 size_t len;
28 char *buf;
29
30 if (!s)
31 return NULL;
32
33 len = strlen(s) + 1;
34 buf = kmalloc(len, gfp);
35 if (buf)
36 memcpy(buf, s, len);
37 return buf;
38}
39EXPORT_SYMBOL(kstrdup);
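
A minimal sketch of how the two new helpers are typically combined (illustrative only; the structure and function names are hypothetical):

        struct foo_config {
                char *name;
                int threshold;
        };

        static struct foo_config *foo_config_create(const char *name)
        {
                struct foo_config *cfg;

                cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);        /* zeroed: threshold starts at 0 */
                if (!cfg)
                        return NULL;

                cfg->name = kstrdup(name, GFP_KERNEL);          /* private, kfree()-able copy */
                if (!cfg->name) {
                        kfree(cfg);
                        return NULL;
                }
                return cfg;
        }
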
diff --git a/mm/vmscan.c b/mm/vmscan.c
index be8235fb1939..bf903b2d198f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -180,8 +180,7 @@ EXPORT_SYMBOL(remove_shrinker);
180 * 180 *
181 * Returns the number of slab objects which we shrunk. 181 * Returns the number of slab objects which we shrunk.
182 */ 182 */
183static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, 183int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
184 unsigned long lru_pages)
185{ 184{
186 struct shrinker *shrinker; 185 struct shrinker *shrinker;
187 int ret = 0; 186 int ret = 0;
@@ -269,9 +268,7 @@ static inline int is_page_cache_freeable(struct page *page)
269 268
270static int may_write_to_queue(struct backing_dev_info *bdi) 269static int may_write_to_queue(struct backing_dev_info *bdi)
271{ 270{
272 if (current_is_kswapd()) 271 if (current->flags & PF_SWAPWRITE)
273 return 1;
274 if (current_is_pdflush()) /* This is unlikely, but why not... */
275 return 1; 272 return 1;
276 if (!bdi_write_congested(bdi)) 273 if (!bdi_write_congested(bdi))
277 return 1; 274 return 1;
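
The check now keys off the PF_SWAPWRITE task flag rather than recognising kswapd or pdflush by name: kswapd gains the flag at the end of this patch, and migrate_pages() sets it temporarily around its write-out loop. The set/restore pattern, as used there:

        int swapwrite = current->flags & PF_SWAPWRITE;

        if (!swapwrite)
                current->flags |= PF_SWAPWRITE;         /* allowed to write to congested queues */

        /* ... issue swap/pageout writes ... */

        if (!swapwrite)
                current->flags &= ~PF_SWAPWRITE;        /* restore the caller's state */
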
@@ -376,6 +373,43 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
376 return PAGE_CLEAN; 373 return PAGE_CLEAN;
377} 374}
378 375
376static int remove_mapping(struct address_space *mapping, struct page *page)
377{
378 if (!mapping)
379 return 0; /* truncate got there first */
380
381 write_lock_irq(&mapping->tree_lock);
382
383 /*
384 * The non-racy check for busy page. It is critical to check
385 * PageDirty _after_ making sure that the page is freeable and
386 * not in use by anybody. (pagecache + us == 2)
387 */
388 if (unlikely(page_count(page) != 2))
389 goto cannot_free;
390 smp_rmb();
391 if (unlikely(PageDirty(page)))
392 goto cannot_free;
393
394 if (PageSwapCache(page)) {
395 swp_entry_t swap = { .val = page_private(page) };
396 __delete_from_swap_cache(page);
397 write_unlock_irq(&mapping->tree_lock);
398 swap_free(swap);
399 __put_page(page); /* The pagecache ref */
400 return 1;
401 }
402
403 __remove_from_page_cache(page);
404 write_unlock_irq(&mapping->tree_lock);
405 __put_page(page);
406 return 1;
407
408cannot_free:
409 write_unlock_irq(&mapping->tree_lock);
410 return 0;
411}
412
379/* 413/*
380 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed 414 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
381 */ 415 */
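
The page_count(page) != 2 test above encodes the caller contract: the only legitimate references at that point are the pagecache's and the locked, isolated caller's, so any extra reference indicates a racing user and the page must be kept. A caller therefore looks roughly like the shrink_list() hunk below:

        /* page is locked and we hold one reference of our own */
        if (!remove_mapping(mapping, page))
                goto keep_locked;       /* truncated, redirtied, or grabbed by someone else */
        /* page is off the radix tree; ours is now the only reference */
        unlock_page(page);
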
@@ -424,7 +458,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
424 * Try to allocate it some swap space here. 458 * Try to allocate it some swap space here.
425 */ 459 */
426 if (PageAnon(page) && !PageSwapCache(page)) { 460 if (PageAnon(page) && !PageSwapCache(page)) {
427 if (!add_to_swap(page)) 461 if (!add_to_swap(page, GFP_ATOMIC))
428 goto activate_locked; 462 goto activate_locked;
429 } 463 }
430#endif /* CONFIG_SWAP */ 464#endif /* CONFIG_SWAP */
@@ -507,36 +541,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
507 goto free_it; 541 goto free_it;
508 } 542 }
509 543
510 if (!mapping) 544 if (!remove_mapping(mapping, page))
511 goto keep_locked; /* truncate got there first */ 545 goto keep_locked;
512
513 write_lock_irq(&mapping->tree_lock);
514
515 /*
516 * The non-racy check for busy page. It is critical to check
517 * PageDirty _after_ making sure that the page is freeable and
518 * not in use by anybody. (pagecache + us == 2)
519 */
520 if (unlikely(page_count(page) != 2))
521 goto cannot_free;
522 smp_rmb();
523 if (unlikely(PageDirty(page)))
524 goto cannot_free;
525
526#ifdef CONFIG_SWAP
527 if (PageSwapCache(page)) {
528 swp_entry_t swap = { .val = page_private(page) };
529 __delete_from_swap_cache(page);
530 write_unlock_irq(&mapping->tree_lock);
531 swap_free(swap);
532 __put_page(page); /* The pagecache ref */
533 goto free_it;
534 }
535#endif /* CONFIG_SWAP */
536
537 __remove_from_page_cache(page);
538 write_unlock_irq(&mapping->tree_lock);
539 __put_page(page);
540 546
541free_it: 547free_it:
542 unlock_page(page); 548 unlock_page(page);
@@ -545,10 +551,6 @@ free_it:
545 __pagevec_release_nonlru(&freed_pvec); 551 __pagevec_release_nonlru(&freed_pvec);
546 continue; 552 continue;
547 553
548cannot_free:
549 write_unlock_irq(&mapping->tree_lock);
550 goto keep_locked;
551
552activate_locked: 554activate_locked:
553 SetPageActive(page); 555 SetPageActive(page);
554 pgactivate++; 556 pgactivate++;
@@ -566,6 +568,241 @@ keep:
566 return reclaimed; 568 return reclaimed;
567} 569}
568 570
571#ifdef CONFIG_MIGRATION
572static inline void move_to_lru(struct page *page)
573{
574 list_del(&page->lru);
575 if (PageActive(page)) {
576 /*
577 * lru_cache_add_active checks that
578 * the PG_active bit is off.
579 */
580 ClearPageActive(page);
581 lru_cache_add_active(page);
582 } else {
583 lru_cache_add(page);
584 }
585 put_page(page);
586}
587
588/*
589 * Add isolated pages on the list back to the LRU
590 *
591 * returns the number of pages put back.
592 */
593int putback_lru_pages(struct list_head *l)
594{
595 struct page *page;
596 struct page *page2;
597 int count = 0;
598
599 list_for_each_entry_safe(page, page2, l, lru) {
600 move_to_lru(page);
601 count++;
602 }
603 return count;
604}
605
606/*
607 * swapout a single page
608 * page is locked upon entry, unlocked on exit
609 */
610static int swap_page(struct page *page)
611{
612 struct address_space *mapping = page_mapping(page);
613
614 if (page_mapped(page) && mapping)
615 if (try_to_unmap(page) != SWAP_SUCCESS)
616 goto unlock_retry;
617
618 if (PageDirty(page)) {
619 /* Page is dirty, try to write it out here */
620 switch(pageout(page, mapping)) {
621 case PAGE_KEEP:
622 case PAGE_ACTIVATE:
623 goto unlock_retry;
624
625 case PAGE_SUCCESS:
626 goto retry;
627
628 case PAGE_CLEAN:
629 ; /* try to free the page below */
630 }
631 }
632
633 if (PagePrivate(page)) {
634 if (!try_to_release_page(page, GFP_KERNEL) ||
635 (!mapping && page_count(page) == 1))
636 goto unlock_retry;
637 }
638
639 if (remove_mapping(mapping, page)) {
640 /* Success */
641 unlock_page(page);
642 return 0;
643 }
644
645unlock_retry:
646 unlock_page(page);
647
648retry:
649 return -EAGAIN;
650}
651/*
652 * migrate_pages
653 *
654 * Two lists are passed to this function. The first list
655 * contains the pages isolated from the LRU to be migrated.
656 * The second list contains new pages that the pages isolated
657 * can be moved to. If the second list is NULL then all
658 * pages are swapped out.
659 *
660 * The function returns after 10 attempts or if no pages
661 * are movable anymore because the "from" list has become empty
662 * or no retryable pages exist anymore.
663 *
664 * SIMPLIFIED VERSION: This implementation of migrate_pages
665 * is only swapping out pages and never touches the second
666 * list. The direct migration patchset
667 * extends this function to avoid the use of swap.
668 *
670 * Return: Number of pages not migrated (permanent failures plus pages still awaiting retry).
670 */
671int migrate_pages(struct list_head *from, struct list_head *to,
672 struct list_head *moved, struct list_head *failed)
673{
674 int retry;
675 int nr_failed = 0;
676 int pass = 0;
677 struct page *page;
678 struct page *page2;
679 int swapwrite = current->flags & PF_SWAPWRITE;
680 int rc;
681
682 if (!swapwrite)
683 current->flags |= PF_SWAPWRITE;
684
685redo:
686 retry = 0;
687
688 list_for_each_entry_safe(page, page2, from, lru) {
689 cond_resched();
690
691 rc = 0;
692 if (page_count(page) == 1)
693 /* page was freed from under us. So we are done. */
694 goto next;
695
696 /*
697 * Skip locked pages during the first three passes to give the
698 * functions holding the lock time to release the page. Later we
699 * use lock_page() to have a higher chance of acquiring the
700 * lock.
701 */
702 rc = -EAGAIN;
703 if (pass > 2)
704 lock_page(page);
705 else
706 if (TestSetPageLocked(page))
707 goto next;
708
709 /*
710 * Only wait on writeback if we have already done a pass where
711 * we may have triggered writeouts for lots of pages.
712 */
713 if (pass > 0) {
714 wait_on_page_writeback(page);
715 } else {
716 if (PageWriteback(page))
717 goto unlock_page;
718 }
719
720 /*
721 * Anonymous pages must have swap cache references otherwise
722 * the information contained in the page maps cannot be
723 * preserved.
724 */
725 if (PageAnon(page) && !PageSwapCache(page)) {
726 if (!add_to_swap(page, GFP_KERNEL)) {
727 rc = -ENOMEM;
728 goto unlock_page;
729 }
730 }
731
732 /*
733 * Page is properly locked and writeback is complete.
734 * Try to migrate the page.
735 */
736 rc = swap_page(page);
737 goto next;
738
739unlock_page:
740 unlock_page(page);
741
742next:
743 if (rc == -EAGAIN) {
744 retry++;
745 } else if (rc) {
746 /* Permanent failure */
747 list_move(&page->lru, failed);
748 nr_failed++;
749 } else {
750 /* Success */
751 list_move(&page->lru, moved);
752 }
753 }
754 if (retry && pass++ < 10)
755 goto redo;
756
757 if (!swapwrite)
758 current->flags &= ~PF_SWAPWRITE;
759
760 return nr_failed + retry;
761}
762
763static void lru_add_drain_per_cpu(void *dummy)
764{
765 lru_add_drain();
766}
767
768/*
769 * Isolate one page from the LRU lists so that the caller can move
770 * it to a private list. Do the necessary pagevec draining if the
771 * page is not on the LRU lists yet.
772 *
773 * Result:
774 * 0 = page not on LRU list
775 * 1 = page removed from the LRU list; the caller holds the reference.
776 * -ENOENT = page is being freed elsewhere.
777 */
778int isolate_lru_page(struct page *page)
779{
780 int rc = 0;
781 struct zone *zone = page_zone(page);
782
783redo:
784 spin_lock_irq(&zone->lru_lock);
785 rc = __isolate_lru_page(page);
786 if (rc == 1) {
787 if (PageActive(page))
788 del_page_from_active_list(zone, page);
789 else
790 del_page_from_inactive_list(zone, page);
791 }
792 spin_unlock_irq(&zone->lru_lock);
793 if (rc == 0) {
794 /*
795 * Maybe this page is still sitting in a per-cpu pagevec,
796 * waiting for its cpu to drain it onto the LRU?
797 */
798 rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
799 if (rc == 0 && PageLRU(page))
800 goto redo;
801 }
802 return rc;
803}
804#endif
805
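
None of these CONFIG_MIGRATION entry points gain a caller in this file; they are driven from elsewhere in the series (the mempolicy-based migration code, for example). A hedged sketch of the intended calling sequence, with the page-selection loop left abstract and error handling trimmed:

        /* editorial sketch -- hypothetical caller of the new API */
        LIST_HEAD(pagelist);
        LIST_HEAD(moved);
        LIST_HEAD(failed);
        int not_migrated;

        /* 1. Pull candidate pages off the LRU onto a private list. */
        for_each_candidate_page(page)                   /* placeholder, not a real iterator */
                if (isolate_lru_page(page) == 1)
                        list_add_tail(&page->lru, &pagelist);

        /*
         * 2. "Migrate" them. In this simplified version that means swapping
         *    them out, so the "to" list of target pages may be NULL.
         */
        not_migrated = migrate_pages(&pagelist, NULL, &moved, &failed);

        /* 3. Put everything back on the LRU and drop the isolation references. */
        putback_lru_pages(&pagelist);   /* pages still awaiting retry */
        putback_lru_pages(&failed);     /* permanent failures */
        putback_lru_pages(&moved);      /* successfully swapped out; now clean */
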
569/* 806/*
570 * zone->lru_lock is heavily contended. Some of the functions that 807 * zone->lru_lock is heavily contended. Some of the functions that
571 * shrink the lists perform better by taking out a batch of pages 808 * shrink the lists perform better by taking out a batch of pages
@@ -594,20 +831,18 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
594 page = lru_to_page(src); 831 page = lru_to_page(src);
595 prefetchw_prev_lru_page(page, src, flags); 832 prefetchw_prev_lru_page(page, src, flags);
596 833
597 if (!TestClearPageLRU(page)) 834 switch (__isolate_lru_page(page)) {
598 BUG(); 835 case 1:
599 list_del(&page->lru); 836 /* Succeeded to isolate page */
600 if (get_page_testone(page)) { 837 list_move(&page->lru, dst);
601 /*
602 * It is being freed elsewhere
603 */
604 __put_page(page);
605 SetPageLRU(page);
606 list_add(&page->lru, src);
607 continue;
608 } else {
609 list_add(&page->lru, dst);
610 nr_taken++; 838 nr_taken++;
839 break;
840 case -ENOENT:
841 /* Not possible to isolate */
842 list_move(&page->lru, src);
843 break;
844 default:
845 BUG();
611 } 846 }
612 } 847 }
613 848
@@ -1226,7 +1461,7 @@ static int kswapd(void *p)
1226 * us from recursively trying to free more memory as we're 1461 * us from recursively trying to free more memory as we're
1227 * trying to free the first piece of memory in the first place). 1462 * trying to free the first piece of memory in the first place).
1228 */ 1463 */
1229 tsk->flags |= PF_MEMALLOC|PF_KSWAPD; 1464 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
1230 1465
1231 order = 0; 1466 order = 0;
1232 for ( ; ; ) { 1467 for ( ; ; ) {