Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |    2
-rw-r--r--  mm/filemap.c         |  101
-rw-r--r--  mm/filemap_xip.c     |    8
-rw-r--r--  mm/hugetlb.c         |   33
-rw-r--r--  mm/madvise.c         |    1
-rw-r--r--  mm/memory.c          |    2
-rw-r--r--  mm/memory_hotplug.c  |    2
-rw-r--r--  mm/mlock.c           |   11
-rw-r--r--  mm/mmap.c            |    3
-rw-r--r--  mm/msync.c           |    1
-rw-r--r--  mm/nommu.c           |    8
-rw-r--r--  mm/page-writeback.c  |   64
-rw-r--r--  mm/page_alloc.c      |  127
-rw-r--r--  mm/rmap.c            |   67
-rw-r--r--  mm/shmem.c           |    8
-rw-r--r--  mm/slab.c            |  152
-rw-r--r--  mm/slob.c            |   53
-rw-r--r--  mm/slub.c            | 1361
-rw-r--r--  mm/sparse.c          |   23
-rw-r--r--  mm/swap.c            |    2
-rw-r--r--  mm/thrash.c          |    5
-rw-r--r--  mm/truncate.c        |    3
-rw-r--r--  mm/vmalloc.c         |    9
-rw-r--r--  mm/vmscan.c          |   10
-rw-r--r--  mm/vmstat.c          |   96
25 files changed, 1272 insertions(+), 880 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 1ac718f636ec..8ac412b45f18 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -166,5 +166,5 @@ config ZONE_DMA_FLAG
166config NR_QUICK 166config NR_QUICK
167 int 167 int
168 depends on QUICKLIST 168 depends on QUICKLIST
169 default "2" if (SUPERH && !SUPERH64)
169 default "1" 170 default "1"
170
diff --git a/mm/filemap.c b/mm/filemap.c
index 5631d6b2a62d..edb1b0b5cc8d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -670,7 +670,8 @@ repeat:
670 page = find_lock_page(mapping, index); 670 page = find_lock_page(mapping, index);
671 if (!page) { 671 if (!page) {
672 if (!cached_page) { 672 if (!cached_page) {
673 cached_page = alloc_page(gfp_mask); 673 cached_page =
674 __page_cache_alloc(gfp_mask);
674 if (!cached_page) 675 if (!cached_page)
675 return NULL; 676 return NULL;
676 } 677 }
@@ -750,6 +751,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
750 read_unlock_irq(&mapping->tree_lock); 751 read_unlock_irq(&mapping->tree_lock);
751 return i; 752 return i;
752} 753}
754EXPORT_SYMBOL(find_get_pages_contig);
753 755
754/** 756/**
755 * find_get_pages_tag - find and return pages that match @tag 757 * find_get_pages_tag - find and return pages that match @tag
@@ -778,6 +780,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
778 read_unlock_irq(&mapping->tree_lock); 780 read_unlock_irq(&mapping->tree_lock);
779 return ret; 781 return ret;
780} 782}
783EXPORT_SYMBOL(find_get_pages_tag);
781 784
782/** 785/**
783 * grab_cache_page_nowait - returns locked page at given index in given cache 786 * grab_cache_page_nowait - returns locked page at given index in given cache
@@ -1110,6 +1113,45 @@ success:
1110 return size; 1113 return size;
1111} 1114}
1112 1115
1116/*
1117 * Performs necessary checks before doing a write
1118 * @iov: io vector request
1119 * @nr_segs: number of segments in the iovec
1120 * @count: number of bytes to write
1121 * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
1122 *
1123 * Adjust number of segments and amount of bytes to write (nr_segs should be
1124 * properly initialized first). Returns appropriate error code that caller
1125 * should return or zero in case that write should be allowed.
1126 */
1127int generic_segment_checks(const struct iovec *iov,
1128 unsigned long *nr_segs, size_t *count, int access_flags)
1129{
1130 unsigned long seg;
1131 size_t cnt = 0;
1132 for (seg = 0; seg < *nr_segs; seg++) {
1133 const struct iovec *iv = &iov[seg];
1134
1135 /*
1136 * If any segment has a negative length, or the cumulative
1137 * length ever wraps negative then return -EINVAL.
1138 */
1139 cnt += iv->iov_len;
1140 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1141 return -EINVAL;
1142 if (access_ok(access_flags, iv->iov_base, iv->iov_len))
1143 continue;
1144 if (seg == 0)
1145 return -EFAULT;
1146 *nr_segs = seg;
1147 cnt -= iv->iov_len; /* This segment is no good */
1148 break;
1149 }
1150 *count = cnt;
1151 return 0;
1152}
1153EXPORT_SYMBOL(generic_segment_checks);
1154
1113/** 1155/**
1114 * generic_file_aio_read - generic filesystem read routine 1156 * generic_file_aio_read - generic filesystem read routine
1115 * @iocb: kernel I/O control block 1157 * @iocb: kernel I/O control block
@@ -1131,24 +1173,9 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1131 loff_t *ppos = &iocb->ki_pos; 1173 loff_t *ppos = &iocb->ki_pos;
1132 1174
1133 count = 0; 1175 count = 0;
1134 for (seg = 0; seg < nr_segs; seg++) { 1176 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1135 const struct iovec *iv = &iov[seg]; 1177 if (retval)
1136 1178 return retval;
1137 /*
1138 * If any segment has a negative length, or the cumulative
1139 * length ever wraps negative then return -EINVAL.
1140 */
1141 count += iv->iov_len;
1142 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
1143 return -EINVAL;
1144 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
1145 continue;
1146 if (seg == 0)
1147 return -EFAULT;
1148 nr_segs = seg;
1149 count -= iv->iov_len; /* This segment is no good */
1150 break;
1151 }
1152 1179
1153 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1180 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1154 if (filp->f_flags & O_DIRECT) { 1181 if (filp->f_flags & O_DIRECT) {
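[Note: the two open-coded iovec validation loops removed in this file are replaced by the new generic_segment_checks() helper. A minimal caller-side sketch (hypothetical wrapper, not part of this patch) of how a read path would now validate and trim its iovec before doing any I/O; VERIFY_WRITE is used because a read stores into the user buffers:

	/* Sketch only: validate an iovec the way the hunk above does for
	 * generic_file_aio_read(). */
	static ssize_t myfs_aio_read(struct kiocb *iocb, const struct iovec *iov,
				     unsigned long nr_segs, loff_t pos)
	{
		size_t count = 0;
		ssize_t retval;

		/* Sums the segment lengths into count, trims nr_segs at the
		 * first inaccessible segment, or fails with -EINVAL/-EFAULT. */
		retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
		if (retval)
			return retval;

		/* the actual read of at most count bytes is elided in this sketch */
		return 0;
	}
]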
@@ -1758,7 +1785,7 @@ struct page *read_cache_page_async(struct address_space *mapping,
1758retry: 1785retry:
1759 page = __read_cache_page(mapping, index, filler, data); 1786 page = __read_cache_page(mapping, index, filler, data);
1760 if (IS_ERR(page)) 1787 if (IS_ERR(page))
1761 goto out; 1788 return page;
1762 mark_page_accessed(page); 1789 mark_page_accessed(page);
1763 if (PageUptodate(page)) 1790 if (PageUptodate(page))
1764 goto out; 1791 goto out;
@@ -1776,9 +1803,9 @@ retry:
1776 err = filler(data, page); 1803 err = filler(data, page);
1777 if (err < 0) { 1804 if (err < 0) {
1778 page_cache_release(page); 1805 page_cache_release(page);
1779 page = ERR_PTR(err); 1806 return ERR_PTR(err);
1780 } 1807 }
1781 out: 1808out:
1782 mark_page_accessed(page); 1809 mark_page_accessed(page);
1783 return page; 1810 return page;
1784} 1811}
@@ -2218,30 +2245,14 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2218 size_t ocount; /* original count */ 2245 size_t ocount; /* original count */
2219 size_t count; /* after file limit checks */ 2246 size_t count; /* after file limit checks */
2220 struct inode *inode = mapping->host; 2247 struct inode *inode = mapping->host;
2221 unsigned long seg;
2222 loff_t pos; 2248 loff_t pos;
2223 ssize_t written; 2249 ssize_t written;
2224 ssize_t err; 2250 ssize_t err;
2225 2251
2226 ocount = 0; 2252 ocount = 0;
2227 for (seg = 0; seg < nr_segs; seg++) { 2253 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
2228 const struct iovec *iv = &iov[seg]; 2254 if (err)
2229 2255 return err;
2230 /*
2231 * If any segment has a negative length, or the cumulative
2232 * length ever wraps negative then return -EINVAL.
2233 */
2234 ocount += iv->iov_len;
2235 if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
2236 return -EINVAL;
2237 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
2238 continue;
2239 if (seg == 0)
2240 return -EFAULT;
2241 nr_segs = seg;
2242 ocount -= iv->iov_len; /* This segment is no good */
2243 break;
2244 }
2245 2256
2246 count = ocount; 2257 count = ocount;
2247 pos = *ppos; 2258 pos = *ppos;
@@ -2301,10 +2312,10 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2301 * semantics. 2312 * semantics.
2302 */ 2313 */
2303 endbyte = pos + written_buffered - written - 1; 2314 endbyte = pos + written_buffered - written - 1;
2304 err = do_sync_file_range(file, pos, endbyte, 2315 err = do_sync_mapping_range(file->f_mapping, pos, endbyte,
2305 SYNC_FILE_RANGE_WAIT_BEFORE| 2316 SYNC_FILE_RANGE_WAIT_BEFORE|
2306 SYNC_FILE_RANGE_WRITE| 2317 SYNC_FILE_RANGE_WRITE|
2307 SYNC_FILE_RANGE_WAIT_AFTER); 2318 SYNC_FILE_RANGE_WAIT_AFTER);
2308 if (err == 0) { 2319 if (err == 0) {
2309 written = written_buffered; 2320 written = written_buffered;
2310 invalidate_mapping_pages(mapping, 2321 invalidate_mapping_pages(mapping,
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index cbb335813ec0..fa360e566d88 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/uio.h> 14#include <linux/uio.h>
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/sched.h>
16#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
17#include "filemap.h" 18#include "filemap.h"
18 19
@@ -434,7 +435,6 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
434 unsigned blocksize; 435 unsigned blocksize;
435 unsigned length; 436 unsigned length;
436 struct page *page; 437 struct page *page;
437 void *kaddr;
438 438
439 BUG_ON(!mapping->a_ops->get_xip_page); 439 BUG_ON(!mapping->a_ops->get_xip_page);
440 440
@@ -458,11 +458,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
458 else 458 else
459 return PTR_ERR(page); 459 return PTR_ERR(page);
460 } 460 }
461 kaddr = kmap_atomic(page, KM_USER0); 461 zero_user_page(page, offset, length, KM_USER0);
462 memset(kaddr + offset, 0, length);
463 kunmap_atomic(kaddr, KM_USER0);
464
465 flush_dcache_page(page);
466 return 0; 462 return 0;
467} 463}
468EXPORT_SYMBOL_GPL(xip_truncate_page); 464EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 36db012b38dd..eb7180db3033 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -140,6 +140,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
140 return page; 140 return page;
141 141
142fail: 142fail:
143 if (vma->vm_flags & VM_MAYSHARE)
144 resv_huge_pages++;
143 spin_unlock(&hugetlb_lock); 145 spin_unlock(&hugetlb_lock);
144 return NULL; 146 return NULL;
145} 147}
@@ -172,6 +174,17 @@ static int __init hugetlb_setup(char *s)
172} 174}
173__setup("hugepages=", hugetlb_setup); 175__setup("hugepages=", hugetlb_setup);
174 176
177static unsigned int cpuset_mems_nr(unsigned int *array)
178{
179 int node;
180 unsigned int nr = 0;
181
182 for_each_node_mask(node, cpuset_current_mems_allowed)
183 nr += array[node];
184
185 return nr;
186}
187
175#ifdef CONFIG_SYSCTL 188#ifdef CONFIG_SYSCTL
176static void update_and_free_page(struct page *page) 189static void update_and_free_page(struct page *page)
177{ 190{
@@ -817,6 +830,26 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
817 chg = region_chg(&inode->i_mapping->private_list, from, to); 830 chg = region_chg(&inode->i_mapping->private_list, from, to);
818 if (chg < 0) 831 if (chg < 0)
819 return chg; 832 return chg;
833 /*
834 * When cpuset is configured, it breaks the strict hugetlb page
835 * reservation as the accounting is done on a global variable. Such
836 * reservation is completely rubbish in the presence of cpuset because
837 * the reservation is not checked against page availability for the
838 * current cpuset. Application can still potentially OOM'ed by kernel
839 * with lack of free htlb page in cpuset that the task is in.
840 * Attempt to enforce strict accounting with cpuset is almost
841 * impossible (or too ugly) because cpuset is too fluid that
842 * task or memory node can be dynamically moved between cpusets.
843 *
844 * The change of semantics for shared hugetlb mapping with cpuset is
845 * undesirable. However, in order to preserve some of the semantics,
846 * we fall back to check against current free page availability as
847 * a best attempt and hopefully to minimize the impact of changing
848 * semantics that cpuset has.
849 */
850 if (chg > cpuset_mems_nr(free_huge_pages_node))
851 return -ENOMEM;
852
820 ret = hugetlb_acct_memory(chg); 853 ret = hugetlb_acct_memory(chg);
821 if (ret < 0) 854 if (ret < 0)
822 return ret; 855 return ret;
diff --git a/mm/madvise.c b/mm/madvise.c
index e75096b5a6d3..60542d006ec1 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -10,6 +10,7 @@
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
13#include <linux/sched.h>
13 14
14/* 15/*
15 * Any behaviour which results in changes to the vma->vm_flags needs to 16 * Any behaviour which results in changes to the vma->vm_flags needs to
diff --git a/mm/memory.c b/mm/memory.c
index 1d647ab0ee72..cb94488ab96d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -481,7 +481,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
481 page = vm_normal_page(vma, addr, pte); 481 page = vm_normal_page(vma, addr, pte);
482 if (page) { 482 if (page) {
483 get_page(page); 483 get_page(page);
484 page_dup_rmap(page); 484 page_dup_rmap(page, vma, addr);
485 rss[!!PageAnon(page)]++; 485 rss[!!PageAnon(page)]++;
486 } 486 }
487 487
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 84279127fcd3..df9d554bea30 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -65,7 +65,7 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
65 int zone_type; 65 int zone_type;
66 66
67 zone_type = zone - pgdat->node_zones; 67 zone_type = zone - pgdat->node_zones;
68 if (!populated_zone(zone)) { 68 if (!zone->wait_table) {
69 int ret = 0; 69 int ret = 0;
70 ret = init_currently_empty_zone(zone, phys_start_pfn, 70 ret = init_currently_empty_zone(zone, phys_start_pfn,
71 nr_pages, MEMMAP_HOTPLUG); 71 nr_pages, MEMMAP_HOTPLUG);
diff --git a/mm/mlock.c b/mm/mlock.c
index 3446b7ef731e..4d3fea267e0d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -10,7 +10,18 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/syscalls.h> 12#include <linux/syscalls.h>
13#include <linux/sched.h>
14#include <linux/module.h>
13 15
16int can_do_mlock(void)
17{
18 if (capable(CAP_IPC_LOCK))
19 return 1;
20 if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0)
21 return 1;
22 return 0;
23}
24EXPORT_SYMBOL(can_do_mlock);
14 25
15static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 26static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
16 unsigned long start, unsigned long end, unsigned int newflags) 27 unsigned long start, unsigned long end, unsigned int newflags)
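[Note: can_do_mlock() is now a real exported helper, so the mlock(2) policy check (CAP_IPC_LOCK or a non-zero RLIMIT_MEMLOCK soft limit) can be reused outside mm/mlock.c. A hedged sketch of a hypothetical module-side check, assuming the declaration is visible via <linux/mm.h>:

	#include <linux/mm.h>

	/* Sketch only: refuse to pin user memory unless mlock(2) itself
	 * would be permitted for the current task. */
	static int myfs_may_pin_pages(void)
	{
		if (!can_do_mlock())
			return -EPERM;
		return 0;
	}
]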
diff --git a/mm/mmap.c b/mm/mmap.c
index 52646d61ff69..68b9ad2ef1d6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1366,7 +1366,6 @@ unsigned long
1366get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 1366get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1367 unsigned long pgoff, unsigned long flags) 1367 unsigned long pgoff, unsigned long flags)
1368{ 1368{
1369 unsigned long ret;
1370 unsigned long (*get_area)(struct file *, unsigned long, 1369 unsigned long (*get_area)(struct file *, unsigned long,
1371 unsigned long, unsigned long, unsigned long); 1370 unsigned long, unsigned long, unsigned long);
1372 1371
@@ -1721,7 +1720,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1721 1720
1722/* 1721/*
1723 * Split a vma into two pieces at address 'addr', a new vma is allocated 1722 * Split a vma into two pieces at address 'addr', a new vma is allocated
1724 * either for the first part or the the tail. 1723 * either for the first part or the tail.
1725 */ 1724 */
1726int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 1725int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1727 unsigned long addr, int new_below) 1726 unsigned long addr, int new_below)
diff --git a/mm/msync.c b/mm/msync.c
index 358d73cf7b78..144a7570535d 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -12,6 +12,7 @@
12#include <linux/mman.h> 12#include <linux/mman.h>
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/syscalls.h> 14#include <linux/syscalls.h>
15#include <linux/sched.h>
15 16
16/* 17/*
17 * MS_SYNC syncs the entire file - including mappings. 18 * MS_SYNC syncs the entire file - including mappings.
diff --git a/mm/nommu.c b/mm/nommu.c
index 1f60194d9b9b..2b16b00a5b11 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -262,6 +262,14 @@ void vunmap(void *addr)
262} 262}
263 263
264/* 264/*
265 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
266 * have one.
267 */
268void __attribute__((weak)) vmalloc_sync_all(void)
269{
270}
271
272/*
265 * sys_brk() for the most part doesn't need the global kernel 273 * sys_brk() for the most part doesn't need the global kernel
266 * lock, except when an application is doing something nasty 274 * lock, except when an application is doing something nasty
267 * like trying to un-brk an area that has already been mapped 275 * like trying to un-brk an area that has already been mapped
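[Note: the __attribute__((weak)) stub above is only linked in when nothing stronger exists; any architecture that needs real work simply provides a normal definition and the weak one is discarded. Illustrative sketch only, with a placeholder body:

	/* Arch-side sketch: a non-weak vmalloc_sync_all() overrides the
	 * empty weak stub added above at link time. */
	void vmalloc_sync_all(void)
	{
		/* arch-specific: propagate kernel mappings into every page directory */
	}
]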
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 029dfad5a235..eec1481ba44f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -588,31 +588,27 @@ void __init page_writeback_init(void)
588} 588}
589 589
590/** 590/**
591 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. 591 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
592 * @mapping: address space structure to write 592 * @mapping: address space structure to write
593 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 593 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
594 * @writepage: function called for each page
595 * @data: data passed to writepage function
594 * 596 *
595 * This is a library function, which implements the writepages() 597 * If a page is already under I/O, write_cache_pages() skips it, even
596 * address_space_operation.
597 *
598 * If a page is already under I/O, generic_writepages() skips it, even
599 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 598 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
600 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 599 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
601 * and msync() need to guarantee that all the data which was dirty at the time 600 * and msync() need to guarantee that all the data which was dirty at the time
602 * the call was made get new I/O started against them. If wbc->sync_mode is 601 * the call was made get new I/O started against them. If wbc->sync_mode is
603 * WB_SYNC_ALL then we were called for data integrity and we must wait for 602 * WB_SYNC_ALL then we were called for data integrity and we must wait for
604 * existing IO to complete. 603 * existing IO to complete.
605 *
606 * Derived from mpage_writepages() - if you fix this you should check that
607 * also!
608 */ 604 */
609int generic_writepages(struct address_space *mapping, 605int write_cache_pages(struct address_space *mapping,
610 struct writeback_control *wbc) 606 struct writeback_control *wbc, writepage_t writepage,
607 void *data)
611{ 608{
612 struct backing_dev_info *bdi = mapping->backing_dev_info; 609 struct backing_dev_info *bdi = mapping->backing_dev_info;
613 int ret = 0; 610 int ret = 0;
614 int done = 0; 611 int done = 0;
615 int (*writepage)(struct page *page, struct writeback_control *wbc);
616 struct pagevec pvec; 612 struct pagevec pvec;
617 int nr_pages; 613 int nr_pages;
618 pgoff_t index; 614 pgoff_t index;
@@ -625,12 +621,6 @@ int generic_writepages(struct address_space *mapping,
625 return 0; 621 return 0;
626 } 622 }
627 623
628 writepage = mapping->a_ops->writepage;
629
630 /* deal with chardevs and other special file */
631 if (!writepage)
632 return 0;
633
634 pagevec_init(&pvec, 0); 624 pagevec_init(&pvec, 0);
635 if (wbc->range_cyclic) { 625 if (wbc->range_cyclic) {
636 index = mapping->writeback_index; /* Start from prev offset */ 626 index = mapping->writeback_index; /* Start from prev offset */
@@ -682,13 +672,7 @@ retry:
682 continue; 672 continue;
683 } 673 }
684 674
685 ret = (*writepage)(page, wbc); 675 ret = (*writepage)(page, wbc, data);
686 if (ret) {
687 if (ret == -ENOSPC)
688 set_bit(AS_ENOSPC, &mapping->flags);
689 else
690 set_bit(AS_EIO, &mapping->flags);
691 }
692 676
693 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) 677 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
694 unlock_page(page); 678 unlock_page(page);
@@ -715,6 +699,38 @@ retry:
715 mapping->writeback_index = index; 699 mapping->writeback_index = index;
716 return ret; 700 return ret;
717} 701}
702EXPORT_SYMBOL(write_cache_pages);
703
704/*
705 * Function used by generic_writepages to call the real writepage
706 * function and set the mapping flags on error
707 */
708static int __writepage(struct page *page, struct writeback_control *wbc,
709 void *data)
710{
711 struct address_space *mapping = data;
712 int ret = mapping->a_ops->writepage(page, wbc);
713 mapping_set_error(mapping, ret);
714 return ret;
715}
716
717/**
718 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
719 * @mapping: address space structure to write
720 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
721 *
722 * This is a library function, which implements the writepages()
723 * address_space_operation.
724 */
725int generic_writepages(struct address_space *mapping,
726 struct writeback_control *wbc)
727{
728 /* deal with chardevs and other special file */
729 if (!mapping->a_ops->writepage)
730 return 0;
731
732 return write_cache_pages(mapping, wbc, __writepage, mapping);
733}
718 734
719EXPORT_SYMBOL(generic_writepages); 735EXPORT_SYMBOL(generic_writepages);
720 736
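[Note: __writepage() above is the template for users of the new hook: a filesystem can pass its own writepage_t callback plus private state through the data pointer. A hedged sketch of a hypothetical ->writepages() built on write_cache_pages(); myfs_writepage() is an assumed helper with ->writepage semantics:

	struct myfs_wb_state {
		int pages_written;	/* private per-call bookkeeping */
	};

	static int myfs_writepage_cb(struct page *page,
				     struct writeback_control *wbc, void *data)
	{
		struct myfs_wb_state *state = data;

		state->pages_written++;
		return myfs_writepage(page, wbc);	/* assumed helper */
	}

	static int myfs_writepages(struct address_space *mapping,
				   struct writeback_control *wbc)
	{
		struct myfs_wb_state state = { 0 };

		return write_cache_pages(mapping, wbc, myfs_writepage_cb, &state);
	}
]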
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 59164313167f..bd8e33582d25 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -103,7 +103,7 @@ int min_free_kbytes = 1024;
103 103
104unsigned long __meminitdata nr_kernel_pages; 104unsigned long __meminitdata nr_kernel_pages;
105unsigned long __meminitdata nr_all_pages; 105unsigned long __meminitdata nr_all_pages;
106static unsigned long __initdata dma_reserve; 106static unsigned long __meminitdata dma_reserve;
107 107
108#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 108#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
109 /* 109 /*
@@ -126,16 +126,21 @@ static unsigned long __initdata dma_reserve;
126 #endif 126 #endif
127 #endif 127 #endif
128 128
129 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; 129 struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
130 int __initdata nr_nodemap_entries; 130 int __meminitdata nr_nodemap_entries;
131 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 131 unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
132 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 132 unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
133#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 133#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
134 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; 134 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
135 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; 135 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
136#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 136#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
137#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 137#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
138 138
139#if MAX_NUMNODES > 1
140int nr_node_ids __read_mostly = MAX_NUMNODES;
141EXPORT_SYMBOL(nr_node_ids);
142#endif
143
139#ifdef CONFIG_DEBUG_VM 144#ifdef CONFIG_DEBUG_VM
140static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 145static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
141{ 146{
@@ -669,65 +674,28 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
669 return i; 674 return i;
670} 675}
671 676
672#if MAX_NUMNODES > 1
673int nr_node_ids __read_mostly = MAX_NUMNODES;
674EXPORT_SYMBOL(nr_node_ids);
675
676/*
677 * Figure out the number of possible node ids.
678 */
679static void __init setup_nr_node_ids(void)
680{
681 unsigned int node;
682 unsigned int highest = 0;
683
684 for_each_node_mask(node, node_possible_map)
685 highest = node;
686 nr_node_ids = highest + 1;
687}
688#else
689static void __init setup_nr_node_ids(void) {}
690#endif
691
692#ifdef CONFIG_NUMA 677#ifdef CONFIG_NUMA
693/* 678/*
694 * Called from the slab reaper to drain pagesets on a particular node that 679 * Called from the vmstat counter updater to drain pagesets of this
695 * belongs to the currently executing processor. 680 * currently executing processor on remote nodes after they have
681 * expired.
682 *
696 * Note that this function must be called with the thread pinned to 683 * Note that this function must be called with the thread pinned to
697 * a single processor. 684 * a single processor.
698 */ 685 */
699void drain_node_pages(int nodeid) 686void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
700{ 687{
701 int i;
702 enum zone_type z;
703 unsigned long flags; 688 unsigned long flags;
689 int to_drain;
704 690
705 for (z = 0; z < MAX_NR_ZONES; z++) { 691 local_irq_save(flags);
706 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 692 if (pcp->count >= pcp->batch)
707 struct per_cpu_pageset *pset; 693 to_drain = pcp->batch;
708 694 else
709 if (!populated_zone(zone)) 695 to_drain = pcp->count;
710 continue; 696 free_pages_bulk(zone, to_drain, &pcp->list, 0);
711 697 pcp->count -= to_drain;
712 pset = zone_pcp(zone, smp_processor_id()); 698 local_irq_restore(flags);
713 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
714 struct per_cpu_pages *pcp;
715
716 pcp = &pset->pcp[i];
717 if (pcp->count) {
718 int to_drain;
719
720 local_irq_save(flags);
721 if (pcp->count >= pcp->batch)
722 to_drain = pcp->batch;
723 else
724 to_drain = pcp->count;
725 free_pages_bulk(zone, to_drain, &pcp->list, 0);
726 pcp->count -= to_drain;
727 local_irq_restore(flags);
728 }
729 }
730 }
731} 699}
732#endif 700#endif
733 701
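[Note: drain_node_pages() used to walk every zone and pageset itself; the per-zone work now lives in drain_zone_pages() and the loop moves to the caller (the vmstat counter updater, not shown in this hunk). A hedged sketch of what such a caller looks like, reusing the structures from the removed code above; like the old code, it must run pinned to one CPU:

	/* Sketch only (assumed caller): drain this CPU's pagesets for one
	 * remote zone, one per_cpu_pages list at a time. */
	static void drain_remote_zone(struct zone *zone)
	{
		struct per_cpu_pageset *pset = zone_pcp(zone, smp_processor_id());
		int i;

		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
			struct per_cpu_pages *pcp = &pset->pcp[i];

			if (pcp->count)
				drain_zone_pages(zone, pcp);
		}
	}
]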
@@ -2148,11 +2116,14 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
2148 2116
2149 switch (action) { 2117 switch (action) {
2150 case CPU_UP_PREPARE: 2118 case CPU_UP_PREPARE:
2119 case CPU_UP_PREPARE_FROZEN:
2151 if (process_zones(cpu)) 2120 if (process_zones(cpu))
2152 ret = NOTIFY_BAD; 2121 ret = NOTIFY_BAD;
2153 break; 2122 break;
2154 case CPU_UP_CANCELED: 2123 case CPU_UP_CANCELED:
2124 case CPU_UP_CANCELED_FROZEN:
2155 case CPU_DEAD: 2125 case CPU_DEAD:
2126 case CPU_DEAD_FROZEN:
2156 free_zone_pagesets(cpu); 2127 free_zone_pagesets(cpu);
2157 break; 2128 break;
2158 default: 2129 default:
@@ -2179,7 +2150,7 @@ void __init setup_per_cpu_pageset(void)
2179 2150
2180#endif 2151#endif
2181 2152
2182static __meminit 2153static noinline __init_refok
2183int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 2154int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2184{ 2155{
2185 int i; 2156 int i;
@@ -2267,7 +2238,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2267 * Basic iterator support. Return the first range of PFNs for a node 2238 * Basic iterator support. Return the first range of PFNs for a node
2268 * Note: nid == MAX_NUMNODES returns first region regardless of node 2239 * Note: nid == MAX_NUMNODES returns first region regardless of node
2269 */ 2240 */
2270static int __init first_active_region_index_in_nid(int nid) 2241static int __meminit first_active_region_index_in_nid(int nid)
2271{ 2242{
2272 int i; 2243 int i;
2273 2244
@@ -2282,7 +2253,7 @@ static int __init first_active_region_index_in_nid(int nid)
2282 * Basic iterator support. Return the next active range of PFNs for a node 2253 * Basic iterator support. Return the next active range of PFNs for a node
2283 * Note: nid == MAX_NUMNODES returns next region regardles of node 2254 * Note: nid == MAX_NUMNODES returns next region regardles of node
2284 */ 2255 */
2285static int __init next_active_region_index_in_nid(int index, int nid) 2256static int __meminit next_active_region_index_in_nid(int index, int nid)
2286{ 2257{
2287 for (index = index + 1; index < nr_nodemap_entries; index++) 2258 for (index = index + 1; index < nr_nodemap_entries; index++)
2288 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) 2259 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
@@ -2298,7 +2269,7 @@ static int __init next_active_region_index_in_nid(int index, int nid)
2298 * was used and there are no special requirements, this is a convenient 2269 * was used and there are no special requirements, this is a convenient
2299 * alternative 2270 * alternative
2300 */ 2271 */
2301int __init early_pfn_to_nid(unsigned long pfn) 2272int __meminit early_pfn_to_nid(unsigned long pfn)
2302{ 2273{
2303 int i; 2274 int i;
2304 2275
@@ -2435,7 +2406,7 @@ static void __init account_node_boundary(unsigned int nid,
2435 * with no available memory, a warning is printed and the start and end 2406 * with no available memory, a warning is printed and the start and end
2436 * PFNs will be 0. 2407 * PFNs will be 0.
2437 */ 2408 */
2438void __init get_pfn_range_for_nid(unsigned int nid, 2409void __meminit get_pfn_range_for_nid(unsigned int nid,
2439 unsigned long *start_pfn, unsigned long *end_pfn) 2410 unsigned long *start_pfn, unsigned long *end_pfn)
2440{ 2411{
2441 int i; 2412 int i;
@@ -2460,7 +2431,7 @@ void __init get_pfn_range_for_nid(unsigned int nid,
2460 * Return the number of pages a zone spans in a node, including holes 2431 * Return the number of pages a zone spans in a node, including holes
2461 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 2432 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
2462 */ 2433 */
2463unsigned long __init zone_spanned_pages_in_node(int nid, 2434unsigned long __meminit zone_spanned_pages_in_node(int nid,
2464 unsigned long zone_type, 2435 unsigned long zone_type,
2465 unsigned long *ignored) 2436 unsigned long *ignored)
2466{ 2437{
@@ -2488,7 +2459,7 @@ unsigned long __init zone_spanned_pages_in_node(int nid,
2488 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 2459 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
2489 * then all holes in the requested range will be accounted for. 2460 * then all holes in the requested range will be accounted for.
2490 */ 2461 */
2491unsigned long __init __absent_pages_in_range(int nid, 2462unsigned long __meminit __absent_pages_in_range(int nid,
2492 unsigned long range_start_pfn, 2463 unsigned long range_start_pfn,
2493 unsigned long range_end_pfn) 2464 unsigned long range_end_pfn)
2494{ 2465{
@@ -2548,7 +2519,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
2548} 2519}
2549 2520
2550/* Return the number of page frames in holes in a zone on a node */ 2521/* Return the number of page frames in holes in a zone on a node */
2551unsigned long __init zone_absent_pages_in_node(int nid, 2522unsigned long __meminit zone_absent_pages_in_node(int nid,
2552 unsigned long zone_type, 2523 unsigned long zone_type,
2553 unsigned long *ignored) 2524 unsigned long *ignored)
2554{ 2525{
@@ -2584,7 +2555,7 @@ static inline unsigned long zone_absent_pages_in_node(int nid,
2584 2555
2585#endif 2556#endif
2586 2557
2587static void __init calculate_node_totalpages(struct pglist_data *pgdat, 2558static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
2588 unsigned long *zones_size, unsigned long *zholes_size) 2559 unsigned long *zones_size, unsigned long *zholes_size)
2589{ 2560{
2590 unsigned long realtotalpages, totalpages = 0; 2561 unsigned long realtotalpages, totalpages = 0;
@@ -2692,7 +2663,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2692 } 2663 }
2693} 2664}
2694 2665
2695static void __init alloc_node_mem_map(struct pglist_data *pgdat) 2666static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
2696{ 2667{
2697 /* Skip empty nodes */ 2668 /* Skip empty nodes */
2698 if (!pgdat->node_spanned_pages) 2669 if (!pgdat->node_spanned_pages)
@@ -2718,7 +2689,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2718 map = alloc_bootmem_node(pgdat, size); 2689 map = alloc_bootmem_node(pgdat, size);
2719 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 2690 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
2720 } 2691 }
2721#ifdef CONFIG_FLATMEM 2692#ifndef CONFIG_NEED_MULTIPLE_NODES
2722 /* 2693 /*
2723 * With no DISCONTIG, the global mem_map is just set as node 0's 2694 * With no DISCONTIG, the global mem_map is just set as node 0's
2724 */ 2695 */
@@ -2747,6 +2718,26 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2747} 2718}
2748 2719
2749#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2720#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2721
2722#if MAX_NUMNODES > 1
2723/*
2724 * Figure out the number of possible node ids.
2725 */
2726static void __init setup_nr_node_ids(void)
2727{
2728 unsigned int node;
2729 unsigned int highest = 0;
2730
2731 for_each_node_mask(node, node_possible_map)
2732 highest = node;
2733 nr_node_ids = highest + 1;
2734}
2735#else
2736static inline void setup_nr_node_ids(void)
2737{
2738}
2739#endif
2740
2750/** 2741/**
2751 * add_active_range - Register a range of PFNs backed by physical memory 2742 * add_active_range - Register a range of PFNs backed by physical memory
2752 * @nid: The node ID the range resides on 2743 * @nid: The node ID the range resides on
@@ -3012,7 +3003,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
3012{ 3003{
3013 int cpu = (unsigned long)hcpu; 3004 int cpu = (unsigned long)hcpu;
3014 3005
3015 if (action == CPU_DEAD) { 3006 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3016 local_irq_disable(); 3007 local_irq_disable();
3017 __drain_pages(cpu); 3008 __drain_pages(cpu);
3018 vm_events_fold_cpu(cpu); 3009 vm_events_fold_cpu(cpu);
diff --git a/mm/rmap.c b/mm/rmap.c
index 75a32be64a21..850165d32b7a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -162,12 +162,10 @@ void anon_vma_unlink(struct vm_area_struct *vma)
162static void anon_vma_ctor(void *data, struct kmem_cache *cachep, 162static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
163 unsigned long flags) 163 unsigned long flags)
164{ 164{
165 if (flags & SLAB_CTOR_CONSTRUCTOR) { 165 struct anon_vma *anon_vma = data;
166 struct anon_vma *anon_vma = data;
167 166
168 spin_lock_init(&anon_vma->lock); 167 spin_lock_init(&anon_vma->lock);
169 INIT_LIST_HEAD(&anon_vma->head); 168 INIT_LIST_HEAD(&anon_vma->head);
170 }
171} 169}
172 170
173void __init anon_vma_init(void) 171void __init anon_vma_init(void)
@@ -505,6 +503,7 @@ int page_mkclean(struct page *page)
505 503
506 return ret; 504 return ret;
507} 505}
506EXPORT_SYMBOL_GPL(page_mkclean);
508 507
509/** 508/**
510 * page_set_anon_rmap - setup new anonymous rmap 509 * page_set_anon_rmap - setup new anonymous rmap
@@ -531,19 +530,51 @@ static void __page_set_anon_rmap(struct page *page,
531} 530}
532 531
533/** 532/**
533 * page_set_anon_rmap - sanity check anonymous rmap addition
534 * @page: the page to add the mapping to
535 * @vma: the vm area in which the mapping is added
536 * @address: the user virtual address mapped
537 */
538static void __page_check_anon_rmap(struct page *page,
539 struct vm_area_struct *vma, unsigned long address)
540{
541#ifdef CONFIG_DEBUG_VM
542 /*
543 * The page's anon-rmap details (mapping and index) are guaranteed to
544 * be set up correctly at this point.
545 *
546 * We have exclusion against page_add_anon_rmap because the caller
547 * always holds the page locked, except if called from page_dup_rmap,
548 * in which case the page is already known to be setup.
549 *
550 * We have exclusion against page_add_new_anon_rmap because those pages
551 * are initially only visible via the pagetables, and the pte is locked
552 * over the call to page_add_new_anon_rmap.
553 */
554 struct anon_vma *anon_vma = vma->anon_vma;
555 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
556 BUG_ON(page->mapping != (struct address_space *)anon_vma);
557 BUG_ON(page->index != linear_page_index(vma, address));
558#endif
559}
560
561/**
534 * page_add_anon_rmap - add pte mapping to an anonymous page 562 * page_add_anon_rmap - add pte mapping to an anonymous page
535 * @page: the page to add the mapping to 563 * @page: the page to add the mapping to
536 * @vma: the vm area in which the mapping is added 564 * @vma: the vm area in which the mapping is added
537 * @address: the user virtual address mapped 565 * @address: the user virtual address mapped
538 * 566 *
539 * The caller needs to hold the pte lock. 567 * The caller needs to hold the pte lock and the page must be locked.
540 */ 568 */
541void page_add_anon_rmap(struct page *page, 569void page_add_anon_rmap(struct page *page,
542 struct vm_area_struct *vma, unsigned long address) 570 struct vm_area_struct *vma, unsigned long address)
543{ 571{
572 VM_BUG_ON(!PageLocked(page));
573 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
544 if (atomic_inc_and_test(&page->_mapcount)) 574 if (atomic_inc_and_test(&page->_mapcount))
545 __page_set_anon_rmap(page, vma, address); 575 __page_set_anon_rmap(page, vma, address);
546 /* else checking page index and mapping is racy */ 576 else
577 __page_check_anon_rmap(page, vma, address);
547} 578}
548 579
549/* 580/*
@@ -554,10 +585,12 @@ void page_add_anon_rmap(struct page *page,
554 * 585 *
555 * Same as page_add_anon_rmap but must only be called on *new* pages. 586 * Same as page_add_anon_rmap but must only be called on *new* pages.
556 * This means the inc-and-test can be bypassed. 587 * This means the inc-and-test can be bypassed.
588 * Page does not have to be locked.
557 */ 589 */
558void page_add_new_anon_rmap(struct page *page, 590void page_add_new_anon_rmap(struct page *page,
559 struct vm_area_struct *vma, unsigned long address) 591 struct vm_area_struct *vma, unsigned long address)
560{ 592{
593 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
561 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ 594 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
562 __page_set_anon_rmap(page, vma, address); 595 __page_set_anon_rmap(page, vma, address);
563} 596}
@@ -574,6 +607,26 @@ void page_add_file_rmap(struct page *page)
574 __inc_zone_page_state(page, NR_FILE_MAPPED); 607 __inc_zone_page_state(page, NR_FILE_MAPPED);
575} 608}
576 609
610#ifdef CONFIG_DEBUG_VM
611/**
612 * page_dup_rmap - duplicate pte mapping to a page
613 * @page: the page to add the mapping to
614 *
615 * For copy_page_range only: minimal extract from page_add_file_rmap /
616 * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
617 * quicker.
618 *
619 * The caller needs to hold the pte lock.
620 */
621void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
622{
623 BUG_ON(page_mapcount(page) == 0);
624 if (PageAnon(page))
625 __page_check_anon_rmap(page, vma, address);
626 atomic_inc(&page->_mapcount);
627}
628#endif
629
577/** 630/**
578 * page_remove_rmap - take down pte mapping from a page 631 * page_remove_rmap - take down pte mapping from a page
579 * @page: page to remove mapping from 632 * @page: page to remove mapping from
diff --git a/mm/shmem.c b/mm/shmem.c
index f01e8deed645..e537317bec4d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2358,13 +2358,11 @@ static void init_once(void *foo, struct kmem_cache *cachep,
2358{ 2358{
2359 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2359 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2360 2360
2361 if (flags & SLAB_CTOR_CONSTRUCTOR) { 2361 inode_init_once(&p->vfs_inode);
2362 inode_init_once(&p->vfs_inode);
2363#ifdef CONFIG_TMPFS_POSIX_ACL 2362#ifdef CONFIG_TMPFS_POSIX_ACL
2364 p->i_acl = NULL; 2363 p->i_acl = NULL;
2365 p->i_default_acl = NULL; 2364 p->i_default_acl = NULL;
2366#endif 2365#endif
2367 }
2368} 2366}
2369 2367
2370static int init_inodecache(void) 2368static int init_inodecache(void)
diff --git a/mm/slab.c b/mm/slab.c
index 5920a412b377..2e71a328aa09 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -148,10 +148,11 @@
148 * Usually, the kmalloc caches are cache_line_size() aligned, except when 148 * Usually, the kmalloc caches are cache_line_size() aligned, except when
149 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. 149 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
150 * Some archs want to perform DMA into kmalloc caches and need a guaranteed 150 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
151 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that. 151 * alignment larger than the alignment of a 64-bit integer.
152 * Note that this flag disables some debug features. 152 * ARCH_KMALLOC_MINALIGN allows that.
153 * Note that increasing this value may disable some debug features.
153 */ 154 */
154#define ARCH_KMALLOC_MINALIGN 0 155#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
155#endif 156#endif
156 157
157#ifndef ARCH_SLAB_MINALIGN 158#ifndef ARCH_SLAB_MINALIGN
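[Note: with the default raised to __alignof__(unsigned long long), the red-zone words become 64-bit (see the dbg_redzone changes below). An architecture that DMAs into kmalloc() buffers can still raise the guarantee further; a hedged sketch of the usual override, typically placed in that architecture's asm/cache.h:

	/* Guarantee cacheline alignment for kmalloc() objects so DMA into
	 * such buffers never shares a line with unrelated data.  Raising
	 * this above the default may disable some slab debug features. */
	#define ARCH_KMALLOC_MINALIGN	L1_CACHE_BYTES
]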
@@ -408,9 +409,6 @@ struct kmem_cache {
408 /* constructor func */ 409 /* constructor func */
409 void (*ctor) (void *, struct kmem_cache *, unsigned long); 410 void (*ctor) (void *, struct kmem_cache *, unsigned long);
410 411
411 /* de-constructor func */
412 void (*dtor) (void *, struct kmem_cache *, unsigned long);
413
414/* 5) cache creation/removal */ 412/* 5) cache creation/removal */
415 const char *name; 413 const char *name;
416 struct list_head next; 414 struct list_head next;
@@ -536,19 +534,22 @@ static int obj_size(struct kmem_cache *cachep)
536 return cachep->obj_size; 534 return cachep->obj_size;
537} 535}
538 536
539static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 537static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
540{ 538{
541 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 539 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
542 return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD); 540 return (unsigned long long*) (objp + obj_offset(cachep) -
541 sizeof(unsigned long long));
543} 542}
544 543
545static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp) 544static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
546{ 545{
547 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 546 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
548 if (cachep->flags & SLAB_STORE_USER) 547 if (cachep->flags & SLAB_STORE_USER)
549 return (unsigned long *)(objp + cachep->buffer_size - 548 return (unsigned long long *)(objp + cachep->buffer_size -
550 2 * BYTES_PER_WORD); 549 sizeof(unsigned long long) -
551 return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD); 550 BYTES_PER_WORD);
551 return (unsigned long long *) (objp + cachep->buffer_size -
552 sizeof(unsigned long long));
552} 553}
553 554
554static void **dbg_userword(struct kmem_cache *cachep, void *objp) 555static void **dbg_userword(struct kmem_cache *cachep, void *objp)
@@ -561,28 +562,13 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
561 562
562#define obj_offset(x) 0 563#define obj_offset(x) 0
563#define obj_size(cachep) (cachep->buffer_size) 564#define obj_size(cachep) (cachep->buffer_size)
564#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 565#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
565#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 566#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
566#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 567#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
567 568
568#endif 569#endif
569 570
570/* 571/*
571 * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
572 * order.
573 */
574#if defined(CONFIG_LARGE_ALLOCS)
575#define MAX_OBJ_ORDER 13 /* up to 32Mb */
576#define MAX_GFP_ORDER 13 /* up to 32Mb */
577#elif defined(CONFIG_MMU)
578#define MAX_OBJ_ORDER 5 /* 32 pages */
579#define MAX_GFP_ORDER 5 /* 32 pages */
580#else
581#define MAX_OBJ_ORDER 8 /* up to 1Mb */
582#define MAX_GFP_ORDER 8 /* up to 1Mb */
583#endif
584
585/*
586 * Do not go above this order unless 0 objects fit into the slab. 572 * Do not go above this order unless 0 objects fit into the slab.
587 */ 573 */
588#define BREAK_GFP_ORDER_HI 1 574#define BREAK_GFP_ORDER_HI 1
@@ -788,6 +774,7 @@ static inline struct kmem_cache *__find_general_cachep(size_t size,
788 */ 774 */
789 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 775 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
790#endif 776#endif
777 WARN_ON_ONCE(size == 0);
791 while (size > csizep->cs_size) 778 while (size > csizep->cs_size)
792 csizep++; 779 csizep++;
793 780
@@ -924,12 +911,6 @@ static void next_reap_node(void)
924{ 911{
925 int node = __get_cpu_var(reap_node); 912 int node = __get_cpu_var(reap_node);
926 913
927 /*
928 * Also drain per cpu pages on remote zones
929 */
930 if (node != numa_node_id())
931 drain_node_pages(node);
932
933 node = next_node(node, node_online_map); 914 node = next_node(node, node_online_map);
934 if (unlikely(node >= MAX_NUMNODES)) 915 if (unlikely(node >= MAX_NUMNODES))
935 node = first_node(node_online_map); 916 node = first_node(node_online_map);
@@ -1182,8 +1163,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1182 int memsize = sizeof(struct kmem_list3); 1163 int memsize = sizeof(struct kmem_list3);
1183 1164
1184 switch (action) { 1165 switch (action) {
1185 case CPU_UP_PREPARE: 1166 case CPU_LOCK_ACQUIRE:
1186 mutex_lock(&cache_chain_mutex); 1167 mutex_lock(&cache_chain_mutex);
1168 break;
1169 case CPU_UP_PREPARE:
1170 case CPU_UP_PREPARE_FROZEN:
1187 /* 1171 /*
1188 * We need to do this right in the beginning since 1172 * We need to do this right in the beginning since
1189 * alloc_arraycache's are going to use this list. 1173 * alloc_arraycache's are going to use this list.
@@ -1270,17 +1254,28 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1270 } 1254 }
1271 break; 1255 break;
1272 case CPU_ONLINE: 1256 case CPU_ONLINE:
1273 mutex_unlock(&cache_chain_mutex); 1257 case CPU_ONLINE_FROZEN:
1274 start_cpu_timer(cpu); 1258 start_cpu_timer(cpu);
1275 break; 1259 break;
1276#ifdef CONFIG_HOTPLUG_CPU 1260#ifdef CONFIG_HOTPLUG_CPU
1277 case CPU_DOWN_PREPARE: 1261 case CPU_DOWN_PREPARE:
1278 mutex_lock(&cache_chain_mutex); 1262 case CPU_DOWN_PREPARE_FROZEN:
1279 break; 1263 /*
1280 case CPU_DOWN_FAILED: 1264 * Shutdown cache reaper. Note that the cache_chain_mutex is
1281 mutex_unlock(&cache_chain_mutex); 1265 * held so that if cache_reap() is invoked it cannot do
1282 break; 1266 * anything expensive but will only modify reap_work
1267 * and reschedule the timer.
1268 */
1269 cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
1270 /* Now the cache_reaper is guaranteed to be not running. */
1271 per_cpu(reap_work, cpu).work.func = NULL;
1272 break;
1273 case CPU_DOWN_FAILED:
1274 case CPU_DOWN_FAILED_FROZEN:
1275 start_cpu_timer(cpu);
1276 break;
1283 case CPU_DEAD: 1277 case CPU_DEAD:
1278 case CPU_DEAD_FROZEN:
1284 /* 1279 /*
1285 * Even if all the cpus of a node are down, we don't free the 1280 * Even if all the cpus of a node are down, we don't free the
1286 * kmem_list3 of any cache. This to avoid a race between 1281 * kmem_list3 of any cache. This to avoid a race between
@@ -1292,6 +1287,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1292 /* fall thru */ 1287 /* fall thru */
1293#endif 1288#endif
1294 case CPU_UP_CANCELED: 1289 case CPU_UP_CANCELED:
1290 case CPU_UP_CANCELED_FROZEN:
1295 list_for_each_entry(cachep, &cache_chain, next) { 1291 list_for_each_entry(cachep, &cache_chain, next) {
1296 struct array_cache *nc; 1292 struct array_cache *nc;
1297 struct array_cache *shared; 1293 struct array_cache *shared;
@@ -1350,6 +1346,8 @@ free_array_cache:
1350 continue; 1346 continue;
1351 drain_freelist(cachep, l3, l3->free_objects); 1347 drain_freelist(cachep, l3, l3->free_objects);
1352 } 1348 }
1349 break;
1350 case CPU_LOCK_RELEASE:
1353 mutex_unlock(&cache_chain_mutex); 1351 mutex_unlock(&cache_chain_mutex);
1354 break; 1352 break;
1355 } 1353 }
@@ -1776,7 +1774,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1776 char *realobj; 1774 char *realobj;
1777 1775
1778 if (cachep->flags & SLAB_RED_ZONE) { 1776 if (cachep->flags & SLAB_RED_ZONE) {
1779 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1777 printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
1780 *dbg_redzone1(cachep, objp), 1778 *dbg_redzone1(cachep, objp),
1781 *dbg_redzone2(cachep, objp)); 1779 *dbg_redzone2(cachep, objp));
1782 } 1780 }
@@ -1896,20 +1894,11 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1896 slab_error(cachep, "end of a freed object " 1894 slab_error(cachep, "end of a freed object "
1897 "was overwritten"); 1895 "was overwritten");
1898 } 1896 }
1899 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1900 (cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
1901 } 1897 }
1902} 1898}
1903#else 1899#else
1904static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) 1900static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1905{ 1901{
1906 if (cachep->dtor) {
1907 int i;
1908 for (i = 0; i < cachep->num; i++) {
1909 void *objp = index_to_obj(cachep, slabp, i);
1910 (cachep->dtor) (objp, cachep, 0);
1911 }
1912 }
1913} 1902}
1914#endif 1903#endif
1915 1904
@@ -1998,7 +1987,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
1998 size_t left_over = 0; 1987 size_t left_over = 0;
1999 int gfporder; 1988 int gfporder;
2000 1989
2001 for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { 1990 for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
2002 unsigned int num; 1991 unsigned int num;
2003 size_t remainder; 1992 size_t remainder;
2004 1993
@@ -2048,7 +2037,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2048 return left_over; 2037 return left_over;
2049} 2038}
2050 2039
2051static int setup_cpu_cache(struct kmem_cache *cachep) 2040static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2052{ 2041{
2053 if (g_cpucache_up == FULL) 2042 if (g_cpucache_up == FULL)
2054 return enable_cpucache(cachep); 2043 return enable_cpucache(cachep);
@@ -2109,7 +2098,7 @@ static int setup_cpu_cache(struct kmem_cache *cachep)
2109 * @align: The required alignment for the objects. 2098 * @align: The required alignment for the objects.
2110 * @flags: SLAB flags 2099 * @flags: SLAB flags
2111 * @ctor: A constructor for the objects. 2100 * @ctor: A constructor for the objects.
2112 * @dtor: A destructor for the objects. 2101 * @dtor: A destructor for the objects (not implemented anymore).
2113 * 2102 *
2114 * Returns a ptr to the cache on success, NULL on failure. 2103 * Returns a ptr to the cache on success, NULL on failure.
2115 * Cannot be called within a int, but can be interrupted. 2104 * Cannot be called within a int, but can be interrupted.
@@ -2144,7 +2133,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2144 * Sanity checks... these are all serious usage bugs. 2133 * Sanity checks... these are all serious usage bugs.
2145 */ 2134 */
2146 if (!name || in_interrupt() || (size < BYTES_PER_WORD) || 2135 if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2147 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { 2136 size > KMALLOC_MAX_SIZE || dtor) {
2148 printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, 2137 printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
2149 name); 2138 name);
2150 BUG(); 2139 BUG();
@@ -2198,9 +2187,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2198 if (flags & SLAB_DESTROY_BY_RCU) 2187 if (flags & SLAB_DESTROY_BY_RCU)
2199 BUG_ON(flags & SLAB_POISON); 2188 BUG_ON(flags & SLAB_POISON);
2200#endif 2189#endif
2201 if (flags & SLAB_DESTROY_BY_RCU)
2202 BUG_ON(dtor);
2203
2204 /* 2190 /*
2205 * Always checks flags, a caller might be expecting debug support which 2191 * Always checks flags, a caller might be expecting debug support which
2206 * isn't available. 2192 * isn't available.
@@ -2239,7 +2225,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2239 * is greater than BYTES_PER_WORD. 2225 * is greater than BYTES_PER_WORD.
2240 */ 2226 */
2241 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) 2227 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
2242 ralign = BYTES_PER_WORD; 2228 ralign = __alignof__(unsigned long long);
2243 2229
2244 /* 2) arch mandated alignment */ 2230 /* 2) arch mandated alignment */
2245 if (ralign < ARCH_SLAB_MINALIGN) { 2231 if (ralign < ARCH_SLAB_MINALIGN) {
@@ -2250,7 +2236,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2250 ralign = align; 2236 ralign = align;
2251 } 2237 }
2252 /* disable debug if necessary */ 2238 /* disable debug if necessary */
2253 if (ralign > BYTES_PER_WORD) 2239 if (ralign > __alignof__(unsigned long long))
2254 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2240 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2255 /* 2241 /*
2256 * 4) Store it. 2242 * 4) Store it.
@@ -2271,8 +2257,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2271 */ 2257 */
2272 if (flags & SLAB_RED_ZONE) { 2258 if (flags & SLAB_RED_ZONE) {
2273 /* add space for red zone words */ 2259 /* add space for red zone words */
2274 cachep->obj_offset += BYTES_PER_WORD; 2260 cachep->obj_offset += sizeof(unsigned long long);
2275 size += 2 * BYTES_PER_WORD; 2261 size += 2 * sizeof(unsigned long long);
2276 } 2262 }
2277 if (flags & SLAB_STORE_USER) { 2263 if (flags & SLAB_STORE_USER) {
2278 /* user store requires one word storage behind the end of 2264 /* user store requires one word storage behind the end of
@@ -2355,7 +2341,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2355 BUG_ON(!cachep->slabp_cache); 2341 BUG_ON(!cachep->slabp_cache);
2356 } 2342 }
2357 cachep->ctor = ctor; 2343 cachep->ctor = ctor;
2358 cachep->dtor = dtor;
2359 cachep->name = name; 2344 cachep->name = name;
2360 2345
2361 if (setup_cpu_cache(cachep)) { 2346 if (setup_cpu_cache(cachep)) {
@@ -2610,7 +2595,7 @@ static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2610} 2595}
2611 2596
2612static void cache_init_objs(struct kmem_cache *cachep, 2597static void cache_init_objs(struct kmem_cache *cachep,
2613 struct slab *slabp, unsigned long ctor_flags) 2598 struct slab *slabp)
2614{ 2599{
2615 int i; 2600 int i;
2616 2601
@@ -2634,7 +2619,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2634 */ 2619 */
2635 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2620 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2636 cachep->ctor(objp + obj_offset(cachep), cachep, 2621 cachep->ctor(objp + obj_offset(cachep), cachep,
2637 ctor_flags); 2622 0);
2638 2623
2639 if (cachep->flags & SLAB_RED_ZONE) { 2624 if (cachep->flags & SLAB_RED_ZONE) {
2640 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2625 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2650,7 +2635,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2650 cachep->buffer_size / PAGE_SIZE, 0); 2635 cachep->buffer_size / PAGE_SIZE, 0);
2651#else 2636#else
2652 if (cachep->ctor) 2637 if (cachep->ctor)
2653 cachep->ctor(objp, cachep, ctor_flags); 2638 cachep->ctor(objp, cachep, 0);
2654#endif 2639#endif
2655 slab_bufctl(slabp)[i] = i + 1; 2640 slab_bufctl(slabp)[i] = i + 1;
2656 } 2641 }
@@ -2739,7 +2724,6 @@ static int cache_grow(struct kmem_cache *cachep,
2739 struct slab *slabp; 2724 struct slab *slabp;
2740 size_t offset; 2725 size_t offset;
2741 gfp_t local_flags; 2726 gfp_t local_flags;
2742 unsigned long ctor_flags;
2743 struct kmem_list3 *l3; 2727 struct kmem_list3 *l3;
2744 2728
2745 /* 2729 /*
@@ -2748,7 +2732,6 @@ static int cache_grow(struct kmem_cache *cachep,
2748 */ 2732 */
2749 BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); 2733 BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK));
2750 2734
2751 ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2752 local_flags = (flags & GFP_LEVEL_MASK); 2735 local_flags = (flags & GFP_LEVEL_MASK);
2753 /* Take the l3 list lock to change the colour_next on this node */ 2736 /* Take the l3 list lock to change the colour_next on this node */
2754 check_irq_off(); 2737 check_irq_off();
@@ -2793,7 +2776,7 @@ static int cache_grow(struct kmem_cache *cachep,
2793 slabp->nodeid = nodeid; 2776 slabp->nodeid = nodeid;
2794 slab_map_pages(cachep, slabp, objp); 2777 slab_map_pages(cachep, slabp, objp);
2795 2778
2796 cache_init_objs(cachep, slabp, ctor_flags); 2779 cache_init_objs(cachep, slabp);
2797 2780
2798 if (local_flags & __GFP_WAIT) 2781 if (local_flags & __GFP_WAIT)
2799 local_irq_disable(); 2782 local_irq_disable();
@@ -2820,7 +2803,6 @@ failed:
2820 * Perform extra freeing checks: 2803 * Perform extra freeing checks:
2821 * - detect bad pointers. 2804 * - detect bad pointers.
2822 * - POISON/RED_ZONE checking 2805 * - POISON/RED_ZONE checking
2823 * - destructor calls, for caches with POISON+dtor
2824 */ 2806 */
2825static void kfree_debugcheck(const void *objp) 2807static void kfree_debugcheck(const void *objp)
2826{ 2808{
@@ -2833,7 +2815,7 @@ static void kfree_debugcheck(const void *objp)
2833 2815
2834static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) 2816static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2835{ 2817{
2836 unsigned long redzone1, redzone2; 2818 unsigned long long redzone1, redzone2;
2837 2819
2838 redzone1 = *dbg_redzone1(cache, obj); 2820 redzone1 = *dbg_redzone1(cache, obj);
2839 redzone2 = *dbg_redzone2(cache, obj); 2821 redzone2 = *dbg_redzone2(cache, obj);
@@ -2849,7 +2831,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2849 else 2831 else
2850 slab_error(cache, "memory outside object was overwritten"); 2832 slab_error(cache, "memory outside object was overwritten");
2851 2833
2852 printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n", 2834 printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
2853 obj, redzone1, redzone2); 2835 obj, redzone1, redzone2);
2854} 2836}
2855 2837
@@ -2879,12 +2861,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2879 BUG_ON(objnr >= cachep->num); 2861 BUG_ON(objnr >= cachep->num);
2880 BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); 2862 BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2881 2863
2882 if (cachep->flags & SLAB_POISON && cachep->dtor) {
2883 /* we want to cache poison the object,
2884 * call the destruction callback
2885 */
2886 cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2887 }
2888#ifdef CONFIG_DEBUG_SLAB_LEAK 2864#ifdef CONFIG_DEBUG_SLAB_LEAK
2889 slab_bufctl(slabp)[objnr] = BUFCTL_FREE; 2865 slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2890#endif 2866#endif
@@ -3065,7 +3041,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3065 slab_error(cachep, "double free, or memory outside" 3041 slab_error(cachep, "double free, or memory outside"
3066 " object was overwritten"); 3042 " object was overwritten");
3067 printk(KERN_ERR 3043 printk(KERN_ERR
3068 "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", 3044 "%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
3069 objp, *dbg_redzone1(cachep, objp), 3045 objp, *dbg_redzone1(cachep, objp),
3070 *dbg_redzone2(cachep, objp)); 3046 *dbg_redzone2(cachep, objp));
3071 } 3047 }
@@ -3084,7 +3060,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3084#endif 3060#endif
3085 objp += obj_offset(cachep); 3061 objp += obj_offset(cachep);
3086 if (cachep->ctor && cachep->flags & SLAB_POISON) 3062 if (cachep->ctor && cachep->flags & SLAB_POISON)
3087 cachep->ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR); 3063 cachep->ctor(objp, cachep, 0);
3088#if ARCH_SLAB_MINALIGN 3064#if ARCH_SLAB_MINALIGN
3089 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3065 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3090 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3066 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
@@ -3738,7 +3714,6 @@ EXPORT_SYMBOL(__kmalloc);
3738 3714
3739/** 3715/**
3740 * krealloc - reallocate memory. The contents will remain unchanged. 3716 * krealloc - reallocate memory. The contents will remain unchanged.
3741 *
3742 * @p: object to reallocate memory for. 3717 * @p: object to reallocate memory for.
3743 * @new_size: how many bytes of memory are required. 3718 * @new_size: how many bytes of memory are required.
3744 * @flags: the type of memory to allocate. 3719 * @flags: the type of memory to allocate.
@@ -4136,7 +4111,6 @@ next:
4136 check_irq_on(); 4111 check_irq_on();
4137 mutex_unlock(&cache_chain_mutex); 4112 mutex_unlock(&cache_chain_mutex);
4138 next_reap_node(); 4113 next_reap_node();
4139 refresh_cpu_vm_stats(smp_processor_id());
4140out: 4114out:
4141 /* Set up the next iteration */ 4115 /* Set up the next iteration */
4142 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); 4116 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
@@ -4428,16 +4402,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4428static void show_symbol(struct seq_file *m, unsigned long address) 4402static void show_symbol(struct seq_file *m, unsigned long address)
4429{ 4403{
4430#ifdef CONFIG_KALLSYMS 4404#ifdef CONFIG_KALLSYMS
4431 char *modname;
4432 const char *name;
4433 unsigned long offset, size; 4405 unsigned long offset, size;
4434 char namebuf[KSYM_NAME_LEN+1]; 4406 char modname[MODULE_NAME_LEN + 1], name[KSYM_NAME_LEN + 1];
4435
4436 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
4437 4407
4438 if (name) { 4408 if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
4439 seq_printf(m, "%s+%#lx/%#lx", name, offset, size); 4409 seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4440 if (modname) 4410 if (modname[0])
4441 seq_printf(m, " [%s]", modname); 4411 seq_printf(m, " [%s]", modname);
4442 return; 4412 return;
4443 } 4413 }
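The red zone change above widens SLAB's guard words from BYTES_PER_WORD to
sizeof(unsigned long long), which is why verify_redzone_free() and the printk
formats switch to unsigned long long / %llx in the same hunks. A
back-of-the-envelope reading, assuming a 32-bit build where BYTES_PER_WORD is
4 (on 64-bit both quantities are already 8, so nothing changes there):

    before: obj_offset += 4;  size += 2 * 4;   /*  8 red zone bytes per object */
    after:  obj_offset += 8;  size += 2 * 8;   /* 16 red zone bytes per object */

So with SLAB_RED_ZONE every object now carries 16 bytes of red zone markers
regardless of the native word size.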
diff --git a/mm/slob.c b/mm/slob.c
index c6933bc19bcd..71976c5d40d3 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -35,6 +35,7 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/module.h> 36#include <linux/module.h>
37#include <linux/timer.h> 37#include <linux/timer.h>
38#include <linux/rcupdate.h>
38 39
39struct slob_block { 40struct slob_block {
40 int units; 41 int units;
@@ -53,6 +54,16 @@ struct bigblock {
53}; 54};
54typedef struct bigblock bigblock_t; 55typedef struct bigblock bigblock_t;
55 56
57/*
58 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
59 * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free
60 * the block using call_rcu.
61 */
62struct slob_rcu {
63 struct rcu_head head;
64 int size;
65};
66
56static slob_t arena = { .next = &arena, .units = 1 }; 67static slob_t arena = { .next = &arena, .units = 1 };
57static slob_t *slobfree = &arena; 68static slob_t *slobfree = &arena;
58static bigblock_t *bigblocks; 69static bigblock_t *bigblocks;
@@ -266,9 +277,9 @@ size_t ksize(const void *block)
266 277
267struct kmem_cache { 278struct kmem_cache {
268 unsigned int size, align; 279 unsigned int size, align;
280 unsigned long flags;
269 const char *name; 281 const char *name;
270 void (*ctor)(void *, struct kmem_cache *, unsigned long); 282 void (*ctor)(void *, struct kmem_cache *, unsigned long);
271 void (*dtor)(void *, struct kmem_cache *, unsigned long);
272}; 283};
273 284
274struct kmem_cache *kmem_cache_create(const char *name, size_t size, 285struct kmem_cache *kmem_cache_create(const char *name, size_t size,
@@ -283,8 +294,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
283 if (c) { 294 if (c) {
284 c->name = name; 295 c->name = name;
285 c->size = size; 296 c->size = size;
297 if (flags & SLAB_DESTROY_BY_RCU) {
298 /* leave room for rcu footer at the end of object */
299 c->size += sizeof(struct slob_rcu);
300 }
301 c->flags = flags;
286 c->ctor = ctor; 302 c->ctor = ctor;
287 c->dtor = dtor;
288 /* ignore alignment unless it's forced */ 303 /* ignore alignment unless it's forced */
289 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; 304 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
290 if (c->align < align) 305 if (c->align < align)
@@ -312,7 +327,7 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
312 b = (void *)__get_free_pages(flags, get_order(c->size)); 327 b = (void *)__get_free_pages(flags, get_order(c->size));
313 328
314 if (c->ctor) 329 if (c->ctor)
315 c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR); 330 c->ctor(b, c, 0);
316 331
317 return b; 332 return b;
318} 333}
@@ -328,15 +343,33 @@ void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
328} 343}
329EXPORT_SYMBOL(kmem_cache_zalloc); 344EXPORT_SYMBOL(kmem_cache_zalloc);
330 345
331void kmem_cache_free(struct kmem_cache *c, void *b) 346static void __kmem_cache_free(void *b, int size)
332{ 347{
333 if (c->dtor) 348 if (size < PAGE_SIZE)
334 c->dtor(b, c, 0); 349 slob_free(b, size);
335
336 if (c->size < PAGE_SIZE)
337 slob_free(b, c->size);
338 else 350 else
339 free_pages((unsigned long)b, get_order(c->size)); 351 free_pages((unsigned long)b, get_order(size));
352}
353
354static void kmem_rcu_free(struct rcu_head *head)
355{
356 struct slob_rcu *slob_rcu = (struct slob_rcu *)head;
357 void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu));
358
359 __kmem_cache_free(b, slob_rcu->size);
360}
361
362void kmem_cache_free(struct kmem_cache *c, void *b)
363{
364 if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) {
365 struct slob_rcu *slob_rcu;
366 slob_rcu = b + (c->size - sizeof(struct slob_rcu));
367 INIT_RCU_HEAD(&slob_rcu->head);
368 slob_rcu->size = c->size;
369 call_rcu(&slob_rcu->head, kmem_rcu_free);
370 } else {
371 __kmem_cache_free(b, c->size);
372 }
340} 373}
341EXPORT_SYMBOL(kmem_cache_free); 374EXPORT_SYMBOL(kmem_cache_free);
342 375
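The SLAB_DESTROY_BY_RCU support added above reserves room for a struct
slob_rcu footer inside every object of such a cache (kmem_cache_create grows
c->size by sizeof(struct slob_rcu)) and uses the size stored in that footer to
find the start of the block again from the RCU callback. A sketch of the
layout implied by the arithmetic in kmem_cache_free() and kmem_rcu_free();
the widths are illustrative only:

    |<--------------------- c->size --------------------->|
    | object payload ...              | struct slob_rcu    |
    ^                                 ^
    b                                 b + c->size - sizeof(struct slob_rcu)

    kmem_cache_free() stores slob_rcu->size = c->size, so the callback can
    recompute b as (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu))
    and hand the whole block, footer included, to __kmem_cache_free().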
diff --git a/mm/slub.c b/mm/slub.c
index 5db3da5a60bf..51663a3c3c24 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -66,11 +66,11 @@
66 * SLUB assigns one slab for allocation to each processor. 66 * SLUB assigns one slab for allocation to each processor.
67 * Allocations only occur from these slabs called cpu slabs. 67 * Allocations only occur from these slabs called cpu slabs.
68 * 68 *
69 * Slabs with free elements are kept on a partial list. 69 * Slabs with free elements are kept on a partial list and during regular
70 * There is no list for full slabs. If an object in a full slab is 70 * operations no list for full slabs is used. If an object in a full slab is
71 * freed then the slab will show up again on the partial lists. 71 * freed then the slab will show up again on the partial lists.
72 * Otherwise there is no need to track full slabs unless we have to 72 * We track full slabs for debugging purposes though because otherwise we
73 * track full slabs for debugging purposes. 73 * cannot scan all objects.
74 * 74 *
75 * Slabs are freed when they become empty. Teardown and setup is 75 * Slabs are freed when they become empty. Teardown and setup is
76 * minimal so we rely on the page allocators per cpu caches for 76 * minimal so we rely on the page allocators per cpu caches for
@@ -78,22 +78,72 @@
78 * 78 *
79 * Overloading of page flags that are otherwise used for LRU management. 79 * Overloading of page flags that are otherwise used for LRU management.
80 * 80 *
81 * PageActive The slab is used as a cpu cache. Allocations 81 * PageActive The slab is frozen and exempt from list processing.
82 * may be performed from the slab. The slab is not 82 * This means that the slab is dedicated to a purpose
83 * on any slab list and cannot be moved onto one. 83 * such as satisfying allocations for a specific
84 * processor. Objects may be freed in the slab while
85 * it is frozen but slab_free will then skip the usual
86 * list operations. It is up to the processor holding
87 * the slab to integrate the slab into the slab lists
88 * when the slab is no longer needed.
89 *
90 * One use of this flag is to mark slabs that are
91 * used for allocations. Then such a slab becomes a cpu
92 * slab. The cpu slab may be equipped with an additional
93 * lockless_freelist that allows lockless access to
94 * free objects in addition to the regular freelist
95 * that requires the slab lock.
84 * 96 *
85 * PageError Slab requires special handling due to debug 97 * PageError Slab requires special handling due to debug
86 * options set. This moves slab handling out of 98 * options set. This moves slab handling out of
87 * the fast path. 99 * the fast path and disables lockless freelists.
88 */ 100 */
89 101
102#define FROZEN (1 << PG_active)
103
104#ifdef CONFIG_SLUB_DEBUG
105#define SLABDEBUG (1 << PG_error)
106#else
107#define SLABDEBUG 0
108#endif
109
110static inline int SlabFrozen(struct page *page)
111{
112 return page->flags & FROZEN;
113}
114
115static inline void SetSlabFrozen(struct page *page)
116{
117 page->flags |= FROZEN;
118}
119
120static inline void ClearSlabFrozen(struct page *page)
121{
122 page->flags &= ~FROZEN;
123}
124
125static inline int SlabDebug(struct page *page)
126{
127 return page->flags & SLABDEBUG;
128}
129
130static inline void SetSlabDebug(struct page *page)
131{
132 page->flags |= SLABDEBUG;
133}
134
135static inline void ClearSlabDebug(struct page *page)
136{
137 page->flags &= ~SLABDEBUG;
138}
139
90/* 140/*
91 * Issues still to be resolved: 141 * Issues still to be resolved:
92 * 142 *
93 * - The per cpu array is updated for each new slab and and is a remote 143 * - The per cpu array is updated for each new slab and and is a remote
94 * cacheline for most nodes. This could become a bouncing cacheline given 144 * cacheline for most nodes. This could become a bouncing cacheline given
95 * enough frequent updates. There are 16 pointers in a cacheline.so at 145 * enough frequent updates. There are 16 pointers in a cacheline, so at
96 * max 16 cpus could compete. Likely okay. 146 * max 16 cpus could compete for the cacheline which may be okay.
97 * 147 *
98 * - Support PAGE_ALLOC_DEBUG. Should be easy to do. 148 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
99 * 149 *
@@ -137,6 +187,7 @@
137 187
138#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ 188#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
139 SLAB_POISON | SLAB_STORE_USER) 189 SLAB_POISON | SLAB_STORE_USER)
190
140/* 191/*
141 * Set of flags that will prevent slab merging 192 * Set of flags that will prevent slab merging
142 */ 193 */
@@ -157,6 +208,11 @@
157/* Internal SLUB flags */ 208/* Internal SLUB flags */
158#define __OBJECT_POISON 0x80000000 /* Poison object */ 209#define __OBJECT_POISON 0x80000000 /* Poison object */
159 210
211/* Not all arches define cache_line_size */
212#ifndef cache_line_size
213#define cache_line_size() L1_CACHE_BYTES
214#endif
215
160static int kmem_size = sizeof(struct kmem_cache); 216static int kmem_size = sizeof(struct kmem_cache);
161 217
162#ifdef CONFIG_SMP 218#ifdef CONFIG_SMP
@@ -166,7 +222,7 @@ static struct notifier_block slab_notifier;
166static enum { 222static enum {
167 DOWN, /* No slab functionality available */ 223 DOWN, /* No slab functionality available */
168 PARTIAL, /* kmem_cache_open() works but kmalloc does not */ 224 PARTIAL, /* kmem_cache_open() works but kmalloc does not */
169 UP, /* Everything works */ 225 UP, /* Everything works but does not show up in sysfs */
170 SYSFS /* Sysfs up */ 226 SYSFS /* Sysfs up */
171} slab_state = DOWN; 227} slab_state = DOWN;
172 228
@@ -174,7 +230,19 @@ static enum {
174static DECLARE_RWSEM(slub_lock); 230static DECLARE_RWSEM(slub_lock);
175LIST_HEAD(slab_caches); 231LIST_HEAD(slab_caches);
176 232
177#ifdef CONFIG_SYSFS 233/*
234 * Tracking user of a slab.
235 */
236struct track {
237 void *addr; /* Called from address */
238 int cpu; /* Was running on cpu */
239 int pid; /* Pid context */
240 unsigned long when; /* When did the operation occur */
241};
242
243enum track_item { TRACK_ALLOC, TRACK_FREE };
244
245#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
178static int sysfs_slab_add(struct kmem_cache *); 246static int sysfs_slab_add(struct kmem_cache *);
179static int sysfs_slab_alias(struct kmem_cache *, const char *); 247static int sysfs_slab_alias(struct kmem_cache *, const char *);
180static void sysfs_slab_remove(struct kmem_cache *); 248static void sysfs_slab_remove(struct kmem_cache *);
@@ -202,6 +270,63 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
202#endif 270#endif
203} 271}
204 272
273static inline int check_valid_pointer(struct kmem_cache *s,
274 struct page *page, const void *object)
275{
276 void *base;
277
278 if (!object)
279 return 1;
280
281 base = page_address(page);
282 if (object < base || object >= base + s->objects * s->size ||
283 (object - base) % s->size) {
284 return 0;
285 }
286
287 return 1;
288}
289
290/*
291 * Slow version of get and set free pointer.
292 *
293 * This version requires touching the cache lines of kmem_cache which
294 * we avoid to do in the fast alloc free paths. There we obtain the offset
295 * from the page struct.
296 */
297static inline void *get_freepointer(struct kmem_cache *s, void *object)
298{
299 return *(void **)(object + s->offset);
300}
301
302static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
303{
304 *(void **)(object + s->offset) = fp;
305}
306
307/* Loop over all objects in a slab */
308#define for_each_object(__p, __s, __addr) \
309 for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
310 __p += (__s)->size)
311
312/* Scan freelist */
313#define for_each_free_object(__p, __s, __free) \
314 for (__p = (__free); __p; __p = get_freepointer((__s), __p))
315
316/* Determine object index from a given position */
317static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
318{
319 return (p - addr) / s->size;
320}
321
322#ifdef CONFIG_SLUB_DEBUG
323/*
324 * Debug settings:
325 */
326static int slub_debug;
327
328static char *slub_debug_slabs;
329
205/* 330/*
206 * Object debugging 331 * Object debugging
207 */ 332 */
@@ -237,35 +362,6 @@ static void print_section(char *text, u8 *addr, unsigned int length)
237 } 362 }
238} 363}
239 364
240/*
241 * Slow version of get and set free pointer.
242 *
243 * This requires touching the cache lines of kmem_cache.
244 * The offset can also be obtained from the page. In that
245 * case it is in the cacheline that we already need to touch.
246 */
247static void *get_freepointer(struct kmem_cache *s, void *object)
248{
249 return *(void **)(object + s->offset);
250}
251
252static void set_freepointer(struct kmem_cache *s, void *object, void *fp)
253{
254 *(void **)(object + s->offset) = fp;
255}
256
257/*
258 * Tracking user of a slab.
259 */
260struct track {
261 void *addr; /* Called from address */
262 int cpu; /* Was running on cpu */
263 int pid; /* Pid context */
264 unsigned long when; /* When did the operation occur */
265};
266
267enum track_item { TRACK_ALLOC, TRACK_FREE };
268
269static struct track *get_track(struct kmem_cache *s, void *object, 365static struct track *get_track(struct kmem_cache *s, void *object,
270 enum track_item alloc) 366 enum track_item alloc)
271{ 367{
@@ -400,24 +496,6 @@ static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
400 return 1; 496 return 1;
401} 497}
402 498
403
404static int check_valid_pointer(struct kmem_cache *s, struct page *page,
405 void *object)
406{
407 void *base;
408
409 if (!object)
410 return 1;
411
412 base = page_address(page);
413 if (object < base || object >= base + s->objects * s->size ||
414 (object - base) % s->size) {
415 return 0;
416 }
417
418 return 1;
419}
420
421/* 499/*
422 * Object layout: 500 * Object layout:
423 * 501 *
@@ -425,26 +503,34 @@ static int check_valid_pointer(struct kmem_cache *s, struct page *page,
425 * Bytes of the object to be managed. 503 * Bytes of the object to be managed.
426 * If the freepointer may overlay the object then the free 504 * If the freepointer may overlay the object then the free
427 * pointer is the first word of the object. 505 * pointer is the first word of the object.
506 *
428 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 507 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
429 * 0xa5 (POISON_END) 508 * 0xa5 (POISON_END)
430 * 509 *
431 * object + s->objsize 510 * object + s->objsize
432 * Padding to reach word boundary. This is also used for Redzoning. 511 * Padding to reach word boundary. This is also used for Redzoning.
433 * Padding is extended to word size if Redzoning is enabled 512 * Padding is extended by another word if Redzoning is enabled and
434 * and objsize == inuse. 513 * objsize == inuse.
514 *
435 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 515 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
436 * 0xcc (RED_ACTIVE) for objects in use. 516 * 0xcc (RED_ACTIVE) for objects in use.
437 * 517 *
438 * object + s->inuse 518 * object + s->inuse
519 * Meta data starts here.
520 *
439 * A. Free pointer (if we cannot overwrite object on free) 521 * A. Free pointer (if we cannot overwrite object on free)
440 * B. Tracking data for SLAB_STORE_USER 522 * B. Tracking data for SLAB_STORE_USER
441 * C. Padding to reach required alignment boundary 523 * C. Padding to reach required alignment boundary or at minimum
442 * Padding is done using 0x5a (POISON_INUSE) 524 * one word if debugging is on to be able to detect writes
525 * before the word boundary.
526 *
527 * Padding is done using 0x5a (POISON_INUSE)
443 * 528 *
444 * object + s->size 529 * object + s->size
530 * Nothing is used beyond s->size.
445 * 531 *
446 * If slabcaches are merged then the objsize and inuse boundaries are to 532 * If slabcaches are merged then the objsize and inuse boundaries are mostly
447 * be ignored. And therefore no slab options that rely on these boundaries 533 * ignored. And therefore no slab options that rely on these boundaries
448 * may be used with merged slabcaches. 534 * may be used with merged slabcaches.
449 */ 535 */
450 536
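Condensed, the layout described above is (word size and debug options are
whatever the cache was created with; the offsets below are symbolic, not
computed from calculate_sizes()):

    object + 0         payload; poisoned with 0x6b ... 0xa5 while free
    object + objsize   padding to the next word, reused as the red zone
                       (0xbb when inactive, 0xcc while allocated)
    object + inuse     metadata: the relocated free pointer (when it must not
                       overlay the payload) and the SLAB_STORE_USER tracking
    ...                0x5a (POISON_INUSE) padding up to the alignment
    object + size      start of the next object; never touched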
@@ -570,8 +656,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
570 /* 656 /*
571 * No choice but to zap it and thus loose the remainder 657 * No choice but to zap it and thus loose the remainder
572 * of the free objects in this slab. May cause 658 * of the free objects in this slab. May cause
573 * another error because the object count maybe 659 * another error because the object count is now wrong.
574 * wrong now.
575 */ 660 */
576 set_freepointer(s, p, NULL); 661 set_freepointer(s, p, NULL);
577 return 0; 662 return 0;
@@ -611,9 +696,8 @@ static int check_slab(struct kmem_cache *s, struct page *page)
611} 696}
612 697
613/* 698/*
614 * Determine if a certain object on a page is on the freelist and 699 * Determine if a certain object on a page is on the freelist. Must hold the
615 * therefore free. Must hold the slab lock for cpu slabs to 700 * slab lock to guarantee that the chains are in a consistent state.
616 * guarantee that the chains are consistent.
617 */ 701 */
618static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 702static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
619{ 703{
@@ -658,8 +742,24 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
658 return search == NULL; 742 return search == NULL;
659} 743}
660 744
745static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc)
746{
747 if (s->flags & SLAB_TRACE) {
748 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
749 s->name,
750 alloc ? "alloc" : "free",
751 object, page->inuse,
752 page->freelist);
753
754 if (!alloc)
755 print_section("Object", (void *)object, s->objsize);
756
757 dump_stack();
758 }
759}
760
661/* 761/*
662 * Tracking of fully allocated slabs for debugging 762 * Tracking of fully allocated slabs for debugging purposes.
663 */ 763 */
664static void add_full(struct kmem_cache_node *n, struct page *page) 764static void add_full(struct kmem_cache_node *n, struct page *page)
665{ 765{
@@ -682,8 +782,18 @@ static void remove_full(struct kmem_cache *s, struct page *page)
682 spin_unlock(&n->list_lock); 782 spin_unlock(&n->list_lock);
683} 783}
684 784
685static int alloc_object_checks(struct kmem_cache *s, struct page *page, 785static void setup_object_debug(struct kmem_cache *s, struct page *page,
686 void *object) 786 void *object)
787{
788 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
789 return;
790
791 init_object(s, object, 0);
792 init_tracking(s, object);
793}
794
795static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
796 void *object, void *addr)
687{ 797{
688 if (!check_slab(s, page)) 798 if (!check_slab(s, page))
689 goto bad; 799 goto bad;
@@ -698,19 +808,22 @@ static int alloc_object_checks(struct kmem_cache *s, struct page *page,
698 goto bad; 808 goto bad;
699 } 809 }
700 810
701 if (!object) 811 if (object && !check_object(s, page, object, 0))
702 return 1;
703
704 if (!check_object(s, page, object, 0))
705 goto bad; 812 goto bad;
706 813
814 /* Success: perform special debug activities for allocs */
815 if (s->flags & SLAB_STORE_USER)
816 set_track(s, object, TRACK_ALLOC, addr);
817 trace(s, page, object, 1);
818 init_object(s, object, 1);
707 return 1; 819 return 1;
820
708bad: 821bad:
709 if (PageSlab(page)) { 822 if (PageSlab(page)) {
710 /* 823 /*
711 * If this is a slab page then lets do the best we can 824 * If this is a slab page then lets do the best we can
712 * to avoid issues in the future. Marking all objects 825 * to avoid issues in the future. Marking all objects
713 * as used avoids touching the remainder. 826 * as used avoids touching the remaining objects.
714 */ 827 */
715 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", 828 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
716 s->name, page); 829 s->name, page);
@@ -722,8 +835,8 @@ bad:
722 return 0; 835 return 0;
723} 836}
724 837
725static int free_object_checks(struct kmem_cache *s, struct page *page, 838static int free_debug_processing(struct kmem_cache *s, struct page *page,
726 void *object) 839 void *object, void *addr)
727{ 840{
728 if (!check_slab(s, page)) 841 if (!check_slab(s, page))
729 goto fail; 842 goto fail;
@@ -757,13 +870,107 @@ static int free_object_checks(struct kmem_cache *s, struct page *page,
757 "to slab %s", object, page->slab->name); 870 "to slab %s", object, page->slab->name);
758 goto fail; 871 goto fail;
759 } 872 }
873
874 /* Special debug activities for freeing objects */
875 if (!SlabFrozen(page) && !page->freelist)
876 remove_full(s, page);
877 if (s->flags & SLAB_STORE_USER)
878 set_track(s, object, TRACK_FREE, addr);
879 trace(s, page, object, 0);
880 init_object(s, object, 0);
760 return 1; 881 return 1;
882
761fail: 883fail:
762 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", 884 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n",
763 s->name, page, object); 885 s->name, page, object);
764 return 0; 886 return 0;
765} 887}
766 888
889static int __init setup_slub_debug(char *str)
890{
891 if (!str || *str != '=')
892 slub_debug = DEBUG_DEFAULT_FLAGS;
893 else {
894 str++;
895 if (*str == 0 || *str == ',')
896 slub_debug = DEBUG_DEFAULT_FLAGS;
897 else
898 for( ;*str && *str != ','; str++)
899 switch (*str) {
900 case 'f' : case 'F' :
901 slub_debug |= SLAB_DEBUG_FREE;
902 break;
903 case 'z' : case 'Z' :
904 slub_debug |= SLAB_RED_ZONE;
905 break;
906 case 'p' : case 'P' :
907 slub_debug |= SLAB_POISON;
908 break;
909 case 'u' : case 'U' :
910 slub_debug |= SLAB_STORE_USER;
911 break;
912 case 't' : case 'T' :
913 slub_debug |= SLAB_TRACE;
914 break;
915 default:
916 printk(KERN_ERR "slub_debug option '%c' "
917 "unknown. skipped\n",*str);
918 }
919 }
920
921 if (*str == ',')
922 slub_debug_slabs = str + 1;
923 return 1;
924}
925
926__setup("slub_debug", setup_slub_debug);
927
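For reference, the parser above accepts boot parameters of the following
shapes (the cache names are only examples; any prefix of a slab name works
because the match below uses strncmp):

    slub_debug                  enable DEBUG_DEFAULT_FLAGS for every slab
    slub_debug=FZ               SLAB_DEBUG_FREE plus SLAB_RED_ZONE, everywhere
    slub_debug=ZPU,kmalloc-64   SLAB_RED_ZONE, SLAB_POISON and SLAB_STORE_USER,
                                only for caches whose name starts with "kmalloc-64"
    slub_debug=,dentry          default debug flags, restricted to "dentry"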
928static void kmem_cache_open_debug_check(struct kmem_cache *s)
929{
930 /*
931 * The page->offset field is only 16 bit wide. This is an offset
932 * in units of words from the beginning of an object. If the slab
933 * size is bigger than that, we cannot move the free pointer behind the
934 * object anymore.
935 *
936 * On 32 bit platforms the limit is 256k. On 64bit platforms
937 * the limit is 512k.
938 *
939 * Debugging or ctor may create a need to move the free
940 * pointer. Fail if this happens.
941 */
942 if (s->objsize >= 65535 * sizeof(void *)) {
943 BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON |
944 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
945 BUG_ON(s->ctor);
946 }
947 else
948 /*
949 * Enable debugging if selected on the kernel commandline.
950 */
951 if (slub_debug && (!slub_debug_slabs ||
952 strncmp(slub_debug_slabs, s->name,
953 strlen(slub_debug_slabs)) == 0))
954 s->flags |= slub_debug;
955}
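The 65535 * sizeof(void *) bound above is just the 16-bit page->offset limit
from the comment expressed in bytes (the offset is stored in words):

    32-bit: 65535 words * 4 bytes = 262140 bytes, roughly 256k
    64-bit: 65535 words * 8 bytes = 524280 bytes, roughly 512k

Objects at or beyond that size cannot have the free pointer moved behind
them, so debug options, a constructor or SLAB_DESTROY_BY_RCU would be
unusable for such a cache, hence the BUG_ON()s.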
956#else
957static inline void setup_object_debug(struct kmem_cache *s,
958 struct page *page, void *object) {}
959
960static inline int alloc_debug_processing(struct kmem_cache *s,
961 struct page *page, void *object, void *addr) { return 0; }
962
963static inline int free_debug_processing(struct kmem_cache *s,
964 struct page *page, void *object, void *addr) { return 0; }
965
966static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
967 { return 1; }
968static inline int check_object(struct kmem_cache *s, struct page *page,
969 void *object, int active) { return 1; }
970static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
971static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {}
972#define slub_debug 0
973#endif
767/* 974/*
768 * Slab allocation and freeing 975 * Slab allocation and freeing
769 */ 976 */
@@ -797,13 +1004,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
797static void setup_object(struct kmem_cache *s, struct page *page, 1004static void setup_object(struct kmem_cache *s, struct page *page,
798 void *object) 1005 void *object)
799{ 1006{
800 if (PageError(page)) { 1007 setup_object_debug(s, page, object);
801 init_object(s, object, 0);
802 init_tracking(s, object);
803 }
804
805 if (unlikely(s->ctor)) 1008 if (unlikely(s->ctor))
806 s->ctor(object, s, SLAB_CTOR_CONSTRUCTOR); 1009 s->ctor(object, s, 0);
807} 1010}
808 1011
809static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1012static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -832,7 +1035,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
832 page->flags |= 1 << PG_slab; 1035 page->flags |= 1 << PG_slab;
833 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1036 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
834 SLAB_STORE_USER | SLAB_TRACE)) 1037 SLAB_STORE_USER | SLAB_TRACE))
835 page->flags |= 1 << PG_error; 1038 SetSlabDebug(page);
836 1039
837 start = page_address(page); 1040 start = page_address(page);
838 end = start + s->objects * s->size; 1041 end = start + s->objects * s->size;
@@ -841,7 +1044,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
841 memset(start, POISON_INUSE, PAGE_SIZE << s->order); 1044 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
842 1045
843 last = start; 1046 last = start;
844 for (p = start + s->size; p < end; p += s->size) { 1047 for_each_object(p, s, start) {
845 setup_object(s, page, last); 1048 setup_object(s, page, last);
846 set_freepointer(s, last, p); 1049 set_freepointer(s, last, p);
847 last = p; 1050 last = p;
@@ -850,6 +1053,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
850 set_freepointer(s, last, NULL); 1053 set_freepointer(s, last, NULL);
851 1054
852 page->freelist = start; 1055 page->freelist = start;
1056 page->lockless_freelist = NULL;
853 page->inuse = 0; 1057 page->inuse = 0;
854out: 1058out:
855 if (flags & __GFP_WAIT) 1059 if (flags & __GFP_WAIT)
@@ -861,17 +1065,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
861{ 1065{
862 int pages = 1 << s->order; 1066 int pages = 1 << s->order;
863 1067
864 if (unlikely(PageError(page) || s->dtor)) { 1068 if (unlikely(SlabDebug(page))) {
865 void *start = page_address(page);
866 void *end = start + (pages << PAGE_SHIFT);
867 void *p; 1069 void *p;
868 1070
869 slab_pad_check(s, page); 1071 slab_pad_check(s, page);
870 for (p = start; p <= end - s->size; p += s->size) { 1072 for_each_object(p, s, page_address(page))
871 if (s->dtor)
872 s->dtor(p, s, 0);
873 check_object(s, page, p, 0); 1073 check_object(s, page, p, 0);
874 }
875 } 1074 }
876 1075
877 mod_zone_page_state(page_zone(page), 1076 mod_zone_page_state(page_zone(page),
@@ -910,7 +1109,8 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
910 1109
911 atomic_long_dec(&n->nr_slabs); 1110 atomic_long_dec(&n->nr_slabs);
912 reset_page_mapcount(page); 1111 reset_page_mapcount(page);
913 page->flags &= ~(1 << PG_slab | 1 << PG_error); 1112 ClearSlabDebug(page);
1113 __ClearPageSlab(page);
914 free_slab(s, page); 1114 free_slab(s, page);
915} 1115}
916 1116
@@ -966,22 +1166,23 @@ static void remove_partial(struct kmem_cache *s,
966} 1166}
967 1167
968/* 1168/*
969 * Lock page and remove it from the partial list 1169 * Lock slab and remove from the partial list.
970 * 1170 *
971 * Must hold list_lock 1171 * Must hold list_lock.
972 */ 1172 */
973static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) 1173static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page)
974{ 1174{
975 if (slab_trylock(page)) { 1175 if (slab_trylock(page)) {
976 list_del(&page->lru); 1176 list_del(&page->lru);
977 n->nr_partial--; 1177 n->nr_partial--;
1178 SetSlabFrozen(page);
978 return 1; 1179 return 1;
979 } 1180 }
980 return 0; 1181 return 0;
981} 1182}
982 1183
983/* 1184/*
984 * Try to get a partial slab from a specific node 1185 * Try to allocate a partial slab from a specific node.
985 */ 1186 */
986static struct page *get_partial_node(struct kmem_cache_node *n) 1187static struct page *get_partial_node(struct kmem_cache_node *n)
987{ 1188{
@@ -990,14 +1191,15 @@ static struct page *get_partial_node(struct kmem_cache_node *n)
990 /* 1191 /*
991 * Racy check. If we mistakenly see no partial slabs then we 1192 * Racy check. If we mistakenly see no partial slabs then we
992 * just allocate an empty slab. If we mistakenly try to get a 1193 * just allocate an empty slab. If we mistakenly try to get a
993 * partial slab then get_partials() will return NULL. 1194 * partial slab and there is none available then get_partials()
1195 * will return NULL.
994 */ 1196 */
995 if (!n || !n->nr_partial) 1197 if (!n || !n->nr_partial)
996 return NULL; 1198 return NULL;
997 1199
998 spin_lock(&n->list_lock); 1200 spin_lock(&n->list_lock);
999 list_for_each_entry(page, &n->partial, lru) 1201 list_for_each_entry(page, &n->partial, lru)
1000 if (lock_and_del_slab(n, page)) 1202 if (lock_and_freeze_slab(n, page))
1001 goto out; 1203 goto out;
1002 page = NULL; 1204 page = NULL;
1003out: 1205out:
@@ -1006,8 +1208,7 @@ out:
1006} 1208}
1007 1209
1008/* 1210/*
1009 * Get a page from somewhere. Search in increasing NUMA 1211 * Get a page from somewhere. Search in increasing NUMA distances.
1010 * distances.
1011 */ 1212 */
1012static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1213static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1013{ 1214{
@@ -1017,24 +1218,22 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1017 struct page *page; 1218 struct page *page;
1018 1219
1019 /* 1220 /*
1020 * The defrag ratio allows to configure the tradeoffs between 1221 * The defrag ratio allows a configuration of the tradeoffs between
1021 * inter node defragmentation and node local allocations. 1222 * inter node defragmentation and node local allocations. A lower
1022 * A lower defrag_ratio increases the tendency to do local 1223 * defrag_ratio increases the tendency to do local allocations
1023 * allocations instead of scanning throught the partial 1224 * instead of attempting to obtain partial slabs from other nodes.
1024 * lists on other nodes.
1025 * 1225 *
1026 * If defrag_ratio is set to 0 then kmalloc() always 1226 * If the defrag_ratio is set to 0 then kmalloc() always
1027 * returns node local objects. If its higher then kmalloc() 1227 * returns node local objects. If the ratio is higher then kmalloc()
1028 * may return off node objects in order to avoid fragmentation. 1228 * may return off node objects because partial slabs are obtained
1029 * 1229 * from other nodes and filled up.
1030 * A higher ratio means slabs may be taken from other nodes
1031 * thus reducing the number of partial slabs on those nodes.
1032 * 1230 *
1033 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes 1231 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes
1034 * defrag_ratio = 1000) then every (well almost) allocation 1232 * defrag_ratio = 1000) then every (well almost) allocation will
1035 * will first attempt to defrag slab caches on other nodes. This 1233 * first attempt to defrag slab caches on other nodes. This means
1036 * means scanning over all nodes to look for partial slabs which 1234 * scanning over all nodes to look for partial slabs which may be
1037 * may be a bit expensive to do on every slab allocation. 1235 * expensive if we do it every time we are trying to find a slab
1236 * with available objects.
1038 */ 1237 */
1039 if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) 1238 if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio)
1040 return NULL; 1239 return NULL;
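Read together with the comment, the sysfs defrag_ratio value is stored
multiplied by ten, and get_cycles() % 1024 acts as a cheap pseudo-random draw
in 0..1023 (treating it as roughly uniform is an approximation):

    sysfs 100 -> defrag_ratio 1000: the draw exceeds 1000 only ~2% of the
                 time, so almost every allocation scans remote partial lists
    sysfs  10 -> defrag_ratio  100: remote scanning happens only when the
                 draw is <= 100, i.e. roughly 10% of the time
    sysfs   0 -> defrag_ratio    0: never scan, always fall back to a new
                 node-local slab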
@@ -1079,26 +1278,28 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1079 * 1278 *
1080 * On exit the slab lock will have been dropped. 1279 * On exit the slab lock will have been dropped.
1081 */ 1280 */
1082static void putback_slab(struct kmem_cache *s, struct page *page) 1281static void unfreeze_slab(struct kmem_cache *s, struct page *page)
1083{ 1282{
1084 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1283 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1085 1284
1285 ClearSlabFrozen(page);
1086 if (page->inuse) { 1286 if (page->inuse) {
1087 1287
1088 if (page->freelist) 1288 if (page->freelist)
1089 add_partial(n, page); 1289 add_partial(n, page);
1090 else if (PageError(page) && (s->flags & SLAB_STORE_USER)) 1290 else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
1091 add_full(n, page); 1291 add_full(n, page);
1092 slab_unlock(page); 1292 slab_unlock(page);
1093 1293
1094 } else { 1294 } else {
1095 if (n->nr_partial < MIN_PARTIAL) { 1295 if (n->nr_partial < MIN_PARTIAL) {
1096 /* 1296 /*
1097 * Adding an empty page to the partial slabs in order 1297 * Adding an empty slab to the partial slabs in order
1098 * to avoid page allocator overhead. This page needs to 1298 * to avoid page allocator overhead. This slab needs
1099 * come after all the others that are not fully empty 1299 * to come after the other slabs with objects in
1100 * in order to make sure that we do maximum 1300 * order to fill them up. That way the size of the
1101 * defragmentation. 1301 * partial list stays small. kmem_cache_shrink can
1302 * reclaim empty slabs from the partial list.
1102 */ 1303 */
1103 add_partial_tail(n, page); 1304 add_partial_tail(n, page);
1104 slab_unlock(page); 1305 slab_unlock(page);
@@ -1114,10 +1315,25 @@ static void putback_slab(struct kmem_cache *s, struct page *page)
1114 */ 1315 */
1115static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) 1316static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
1116{ 1317{
1117 s->cpu_slab[cpu] = NULL; 1318 /*
1118 ClearPageActive(page); 1319 * Merge cpu freelist into freelist. Typically we get here
1320 * because both freelists are empty. So this is unlikely
1321 * to occur.
1322 */
1323 while (unlikely(page->lockless_freelist)) {
1324 void **object;
1325
1326 /* Retrieve object from cpu_freelist */
1327 object = page->lockless_freelist;
1328 page->lockless_freelist = page->lockless_freelist[page->offset];
1119 1329
1120 putback_slab(s, page); 1330 /* And put onto the regular freelist */
1331 object[page->offset] = page->freelist;
1332 page->freelist = object;
1333 page->inuse--;
1334 }
1335 s->cpu_slab[cpu] = NULL;
1336 unfreeze_slab(s, page);
1121} 1337}
1122 1338
1123static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) 1339static void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
@@ -1160,47 +1376,46 @@ static void flush_all(struct kmem_cache *s)
1160} 1376}
1161 1377
1162/* 1378/*
1163 * slab_alloc is optimized to only modify two cachelines on the fast path 1379 * Slow path. The lockless freelist is empty or we need to perform
1164 * (aside from the stack): 1380 * debugging duties.
1381 *
1382 * Interrupts are disabled.
1165 * 1383 *
1166 * 1. The page struct 1384 * Processing is still very fast if new objects have been freed to the
1167 * 2. The first cacheline of the object to be allocated. 1385 * regular freelist. In that case we simply take over the regular freelist
1386 * as the lockless freelist and zap the regular freelist.
1168 * 1387 *
1169 * The only cache lines that are read (apart from code) is the 1388 * If that is not working then we fall back to the partial lists. We take the
1170 * per cpu array in the kmem_cache struct. 1389 * first element of the freelist as the object to allocate now and move the
1390 * rest of the freelist to the lockless freelist.
1171 * 1391 *
1172 * Fastpath is not possible if we need to get a new slab or have 1392 * And if we were unable to get a new slab from the partial slab lists then
1173 * debugging enabled (which means all slabs are marked with PageError) 1393 * we need to allocate a new slab. This is slowest path since we may sleep.
1174 */ 1394 */
1175static void *slab_alloc(struct kmem_cache *s, 1395static void *__slab_alloc(struct kmem_cache *s,
1176 gfp_t gfpflags, int node, void *addr) 1396 gfp_t gfpflags, int node, void *addr, struct page *page)
1177{ 1397{
1178 struct page *page;
1179 void **object; 1398 void **object;
1180 unsigned long flags; 1399 int cpu = smp_processor_id();
1181 int cpu;
1182 1400
1183 local_irq_save(flags);
1184 cpu = smp_processor_id();
1185 page = s->cpu_slab[cpu];
1186 if (!page) 1401 if (!page)
1187 goto new_slab; 1402 goto new_slab;
1188 1403
1189 slab_lock(page); 1404 slab_lock(page);
1190 if (unlikely(node != -1 && page_to_nid(page) != node)) 1405 if (unlikely(node != -1 && page_to_nid(page) != node))
1191 goto another_slab; 1406 goto another_slab;
1192redo: 1407load_freelist:
1193 object = page->freelist; 1408 object = page->freelist;
1194 if (unlikely(!object)) 1409 if (unlikely(!object))
1195 goto another_slab; 1410 goto another_slab;
1196 if (unlikely(PageError(page))) 1411 if (unlikely(SlabDebug(page)))
1197 goto debug; 1412 goto debug;
1198 1413
1199have_object: 1414 object = page->freelist;
1200 page->inuse++; 1415 page->lockless_freelist = object[page->offset];
1201 page->freelist = object[page->offset]; 1416 page->inuse = s->objects;
1417 page->freelist = NULL;
1202 slab_unlock(page); 1418 slab_unlock(page);
1203 local_irq_restore(flags);
1204 return object; 1419 return object;
1205 1420
1206another_slab: 1421another_slab:
@@ -1208,11 +1423,9 @@ another_slab:
1208 1423
1209new_slab: 1424new_slab:
1210 page = get_partial(s, gfpflags, node); 1425 page = get_partial(s, gfpflags, node);
1211 if (likely(page)) { 1426 if (page) {
1212have_slab:
1213 s->cpu_slab[cpu] = page; 1427 s->cpu_slab[cpu] = page;
1214 SetPageActive(page); 1428 goto load_freelist;
1215 goto redo;
1216 } 1429 }
1217 1430
1218 page = new_slab(s, gfpflags, node); 1431 page = new_slab(s, gfpflags, node);
@@ -1220,9 +1433,11 @@ have_slab:
1220 cpu = smp_processor_id(); 1433 cpu = smp_processor_id();
1221 if (s->cpu_slab[cpu]) { 1434 if (s->cpu_slab[cpu]) {
1222 /* 1435 /*
1223 * Someone else populated the cpu_slab while we enabled 1436 * Someone else populated the cpu_slab while we
1224 * interrupts, or we have got scheduled on another cpu. 1437 * enabled interrupts, or we have gotten scheduled
1225 * The page may not be on the requested node. 1438 * on another cpu. The page may not be on the
1439 * requested node even if __GFP_THISNODE was
1440 * specified. So we need to recheck.
1226 */ 1441 */
1227 if (node == -1 || 1442 if (node == -1 ||
1228 page_to_nid(s->cpu_slab[cpu]) == node) { 1443 page_to_nid(s->cpu_slab[cpu]) == node) {
@@ -1233,29 +1448,58 @@ have_slab:
1233 discard_slab(s, page); 1448 discard_slab(s, page);
1234 page = s->cpu_slab[cpu]; 1449 page = s->cpu_slab[cpu];
1235 slab_lock(page); 1450 slab_lock(page);
1236 goto redo; 1451 goto load_freelist;
1237 } 1452 }
1238 /* Dump the current slab */ 1453 /* New slab does not fit our expectations */
1239 flush_slab(s, s->cpu_slab[cpu], cpu); 1454 flush_slab(s, s->cpu_slab[cpu], cpu);
1240 } 1455 }
1241 slab_lock(page); 1456 slab_lock(page);
1242 goto have_slab; 1457 SetSlabFrozen(page);
1458 s->cpu_slab[cpu] = page;
1459 goto load_freelist;
1243 } 1460 }
1244 local_irq_restore(flags);
1245 return NULL; 1461 return NULL;
1246debug: 1462debug:
1247 if (!alloc_object_checks(s, page, object)) 1463 object = page->freelist;
1464 if (!alloc_debug_processing(s, page, object, addr))
1248 goto another_slab; 1465 goto another_slab;
1249 if (s->flags & SLAB_STORE_USER) 1466
1250 set_track(s, object, TRACK_ALLOC, addr); 1467 page->inuse++;
1251 if (s->flags & SLAB_TRACE) { 1468 page->freelist = object[page->offset];
1252 printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n", 1469 slab_unlock(page);
1253 s->name, object, page->inuse, 1470 return object;
1254 page->freelist); 1471}
1255 dump_stack(); 1472
1473/*
1474 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
1475 * have the fastpath folded into their functions. So no function call
1476 * overhead for requests that can be satisfied on the fastpath.
1477 *
1478 * The fastpath works by first checking if the lockless freelist can be used.
1479 * If not then __slab_alloc is called for slow processing.
1480 *
1481 * Otherwise we can simply pick the next object from the lockless free list.
1482 */
1483static void __always_inline *slab_alloc(struct kmem_cache *s,
1484 gfp_t gfpflags, int node, void *addr)
1485{
1486 struct page *page;
1487 void **object;
1488 unsigned long flags;
1489
1490 local_irq_save(flags);
1491 page = s->cpu_slab[smp_processor_id()];
1492 if (unlikely(!page || !page->lockless_freelist ||
1493 (node != -1 && page_to_nid(page) != node)))
1494
1495 object = __slab_alloc(s, gfpflags, node, addr, page);
1496
1497 else {
1498 object = page->lockless_freelist;
1499 page->lockless_freelist = object[page->offset];
1256 } 1500 }
1257 init_object(s, object, 1); 1501 local_irq_restore(flags);
1258 goto have_object; 1502 return object;
1259} 1503}
1260 1504
1261void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 1505void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
@@ -1273,33 +1517,29 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
1273#endif 1517#endif
1274 1518
1275/* 1519/*
1276 * The fastpath only writes the cacheline of the page struct and the first 1520 * Slow path handling. This may still be called frequently since objects
1277 * cacheline of the object. 1521 * have a longer lifetime than the cpu slabs in most processing loads.
1278 * 1522 *
1279 * No special cachelines need to be read 1523 * So we still attempt to reduce cache line usage. Just take the slab
1524 * lock and free the item. If there is no additional partial page
1525 * handling required then we can return immediately.
1280 */ 1526 */
1281static void slab_free(struct kmem_cache *s, struct page *page, 1527static void __slab_free(struct kmem_cache *s, struct page *page,
1282 void *x, void *addr) 1528 void *x, void *addr)
1283{ 1529{
1284 void *prior; 1530 void *prior;
1285 void **object = (void *)x; 1531 void **object = (void *)x;
1286 unsigned long flags;
1287 1532
1288 local_irq_save(flags);
1289 slab_lock(page); 1533 slab_lock(page);
1290 1534
1291 if (unlikely(PageError(page))) 1535 if (unlikely(SlabDebug(page)))
1292 goto debug; 1536 goto debug;
1293checks_ok: 1537checks_ok:
1294 prior = object[page->offset] = page->freelist; 1538 prior = object[page->offset] = page->freelist;
1295 page->freelist = object; 1539 page->freelist = object;
1296 page->inuse--; 1540 page->inuse--;
1297 1541
1298 if (unlikely(PageActive(page))) 1542 if (unlikely(SlabFrozen(page)))
1299 /*
1300 * Cpu slabs are never on partial lists and are
1301 * never freed.
1302 */
1303 goto out_unlock; 1543 goto out_unlock;
1304 1544
1305 if (unlikely(!page->inuse)) 1545 if (unlikely(!page->inuse))
@@ -1315,39 +1555,53 @@ checks_ok:
1315 1555
1316out_unlock: 1556out_unlock:
1317 slab_unlock(page); 1557 slab_unlock(page);
1318 local_irq_restore(flags);
1319 return; 1558 return;
1320 1559
1321slab_empty: 1560slab_empty:
1322 if (prior) 1561 if (prior)
1323 /* 1562 /*
1324 * Slab on the partial list. 1563 * Slab still on the partial list.
1325 */ 1564 */
1326 remove_partial(s, page); 1565 remove_partial(s, page);
1327 1566
1328 slab_unlock(page); 1567 slab_unlock(page);
1329 discard_slab(s, page); 1568 discard_slab(s, page);
1330 local_irq_restore(flags);
1331 return; 1569 return;
1332 1570
1333debug: 1571debug:
1334 if (!free_object_checks(s, page, x)) 1572 if (!free_debug_processing(s, page, x, addr))
1335 goto out_unlock; 1573 goto out_unlock;
1336 if (!PageActive(page) && !page->freelist)
1337 remove_full(s, page);
1338 if (s->flags & SLAB_STORE_USER)
1339 set_track(s, x, TRACK_FREE, addr);
1340 if (s->flags & SLAB_TRACE) {
1341 printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n",
1342 s->name, object, page->inuse,
1343 page->freelist);
1344 print_section("Object", (void *)object, s->objsize);
1345 dump_stack();
1346 }
1347 init_object(s, object, 0);
1348 goto checks_ok; 1574 goto checks_ok;
1349} 1575}
1350 1576
1577/*
1578 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
1579 * can perform fastpath freeing without additional function calls.
1580 *
1581 * The fastpath is only possible if we are freeing to the current cpu slab
1582 * of this processor. This is typically the case if we have just allocated
1583 * the item before.
1584 *
1585 * If fastpath is not possible then fall back to __slab_free where we deal
1586 * with all sorts of special processing.
1587 */
1588static void __always_inline slab_free(struct kmem_cache *s,
1589 struct page *page, void *x, void *addr)
1590{
1591 void **object = (void *)x;
1592 unsigned long flags;
1593
1594 local_irq_save(flags);
1595 if (likely(page == s->cpu_slab[smp_processor_id()] &&
1596 !SlabDebug(page))) {
1597 object[page->offset] = page->lockless_freelist;
1598 page->lockless_freelist = object;
1599 } else
1600 __slab_free(s, page, x, addr);
1601
1602 local_irq_restore(flags);
1603}
1604
1351void kmem_cache_free(struct kmem_cache *s, void *x) 1605void kmem_cache_free(struct kmem_cache *s, void *x)
1352{ 1606{
1353 struct page *page; 1607 struct page *page;
@@ -1370,22 +1624,16 @@ static struct page *get_object_page(const void *x)
1370} 1624}
1371 1625
1372/* 1626/*
1373 * kmem_cache_open produces objects aligned at "size" and the first object 1627 * Object placement in a slab is made very easy because we always start at
1374 * is placed at offset 0 in the slab (We have no metainformation on the 1628 * offset 0. If we tune the size of the object to the alignment then we can
1375 * slab, all slabs are in essence "off slab"). 1629 * get the required alignment by putting one properly sized object after
1376 * 1630 * another.
1377 * In order to get the desired alignment one just needs to align the
1378 * size.
1379 * 1631 *
1380 * Notice that the allocation order determines the sizes of the per cpu 1632 * Notice that the allocation order determines the sizes of the per cpu
1381 * caches. Each processor has always one slab available for allocations. 1633 * caches. Each processor has always one slab available for allocations.
1382 * Increasing the allocation order reduces the number of times that slabs 1634 * Increasing the allocation order reduces the number of times that slabs
1383 * must be moved on and off the partial lists and therefore may influence 1635 * must be moved on and off the partial lists and is therefore a factor in
1384 * locking overhead. 1636 * locking overhead.
1385 *
1386 * The offset is used to relocate the free list link in each object. It is
1387 * therefore possible to move the free list link behind the object. This
1388 * is necessary for RCU to work properly and also useful for debugging.
1389 */ 1637 */
1390 1638
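A concrete reading of the paragraph above, with purely illustrative numbers
(the real rounding happens in calculate_sizes(), which is not shown in full
in this hunk):

    objsize = 40, required alignment = 64  ->  size padded to 64
    object offsets: 0, 64, 128, ...        ->  every object 64-byte aligned

so the required alignment falls out of the object size alone and each slab
can start at offset 0 with no per-slab metadata.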
1391/* 1639/*
@@ -1396,76 +1644,110 @@ static struct page *get_object_page(const void *x)
1396 */ 1644 */
1397static int slub_min_order; 1645static int slub_min_order;
1398static int slub_max_order = DEFAULT_MAX_ORDER; 1646static int slub_max_order = DEFAULT_MAX_ORDER;
1399
1400/*
1401 * Minimum number of objects per slab. This is necessary in order to
1402 * reduce locking overhead. Similar to the queue size in SLAB.
1403 */
1404static int slub_min_objects = DEFAULT_MIN_OBJECTS; 1647static int slub_min_objects = DEFAULT_MIN_OBJECTS;
1405 1648
1406/* 1649/*
1407 * Merge control. If this is set then no merging of slab caches will occur. 1650 * Merge control. If this is set then no merging of slab caches will occur.
1651 * (Could be removed. This was introduced to pacify the merge skeptics.)
1408 */ 1652 */
1409static int slub_nomerge; 1653static int slub_nomerge;
1410 1654
1411/* 1655/*
1412 * Debug settings:
1413 */
1414static int slub_debug;
1415
1416static char *slub_debug_slabs;
1417
1418/*
1419 * Calculate the order of allocation given an slab object size. 1656 * Calculate the order of allocation given an slab object size.
1420 * 1657 *
1421 * The order of allocation has significant impact on other elements 1658 * The order of allocation has significant impact on performance and other
1422 * of the system. Generally order 0 allocations should be preferred 1659 * system components. Generally order 0 allocations should be preferred since
1423 * since they do not cause fragmentation in the page allocator. Larger 1660 * order 0 does not cause fragmentation in the page allocator. Larger objects
1424 * objects may have problems with order 0 because there may be too much 1661 * can be problematic to put into order 0 slabs because there may be too much
1425 * space left unused in a slab. We go to a higher order if more than 1/8th 1662 * unused space left. We go to a higher order if more than 1/8th of the slab
1426 * of the slab would be wasted. 1663 * would be wasted.
1427 * 1664 *
1428 * In order to reach satisfactory performance we must ensure that 1665 * In order to reach satisfactory performance we must ensure that a minimum
1429 * a minimum number of objects is in one slab. Otherwise we may 1666 * number of objects is in one slab. Otherwise we may generate too much
1430 * generate too much activity on the partial lists. This is less a 1667 * activity on the partial lists which requires taking the list_lock. This is
1431 * concern for large slabs though. slub_max_order specifies the order 1668 * less a concern for large slabs though which are rarely used.
1432 * where we begin to stop considering the number of objects in a slab.
1433 * 1669 *
1434 * Higher order allocations also allow the placement of more objects 1670 * slub_max_order specifies the order where we begin to stop considering the
1435 * in a slab and thereby reduce object handling overhead. If the user 1671 * number of objects in a slab as critical. If we reach slub_max_order then
1436 * has requested a higher mininum order then we start with that one 1672 * we try to keep the page order as low as possible. So we accept more waste
1437 * instead of zero. 1673 * of space in favor of a small page order.
1674 *
1675 * Higher order allocations also allow the placement of more objects in a
1676 * slab and thereby reduce object handling overhead. If the user has
1677 * requested a higher minimum order then we start with that one instead of
1678 * the smallest order which will fit the object.
1438 */ 1679 */
1439static int calculate_order(int size) 1680static inline int slab_order(int size, int min_objects,
1681 int max_order, int fract_leftover)
1440{ 1682{
1441 int order; 1683 int order;
1442 int rem; 1684 int rem;
1443 1685
1444 for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT); 1686 for (order = max(slub_min_order,
1445 order < MAX_ORDER; order++) { 1687 fls(min_objects * size - 1) - PAGE_SHIFT);
1446 unsigned long slab_size = PAGE_SIZE << order; 1688 order <= max_order; order++) {
1447 1689
1448 if (slub_max_order > order && 1690 unsigned long slab_size = PAGE_SIZE << order;
1449 slab_size < slub_min_objects * size)
1450 continue;
1451 1691
1452 if (slab_size < size) 1692 if (slab_size < min_objects * size)
1453 continue; 1693 continue;
1454 1694
1455 rem = slab_size % size; 1695 rem = slab_size % size;
1456 1696
1457 if (rem <= (PAGE_SIZE << order) / 8) 1697 if (rem <= slab_size / fract_leftover)
1458 break; 1698 break;
1459 1699
1460 } 1700 }
1461 if (order >= MAX_ORDER) 1701
1462 return -E2BIG;
1463 return order; 1702 return order;
1464} 1703}
1465 1704
1705static inline int calculate_order(int size)
1706{
1707 int order;
1708 int min_objects;
1709 int fraction;
1710
1711 /*
1712 * Attempt to find the best configuration for a slab. This
1713 * works by first attempting to generate a layout with
1714 * the best configuration and backing off gradually.
1715 *
1716 * First we reduce the acceptable waste in a slab. Then
1717 * we reduce the minimum objects required in a slab.
1718 */
1719 min_objects = slub_min_objects;
1720 while (min_objects > 1) {
1721 fraction = 8;
1722 while (fraction >= 4) {
1723 order = slab_order(size, min_objects,
1724 slub_max_order, fraction);
1725 if (order <= slub_max_order)
1726 return order;
1727 fraction /= 2;
1728 }
1729 min_objects /= 2;
1730 }
1731
1732 /*
1733 * We were unable to place multiple objects in a slab. Now
1734 * let's see if we can place a single object there.
1735 */
1736 order = slab_order(size, 1, slub_max_order, 1);
1737 if (order <= slub_max_order)
1738 return order;
1739
1740 /*
1741 * This slab cannot be placed within slub_max_order; fall back to MAX_ORDER.
1742 */
1743 order = slab_order(size, 1, MAX_ORDER, 1);
1744 if (order <= MAX_ORDER)
1745 return order;
1746 return -ENOSYS;
1747}
1748
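
The heuristic above is easier to follow with concrete numbers. The following user-space sketch mirrors slab_order() under the assumption of 4 KiB pages and a slub_min_order of 0; PAGE_SHIFT, the fls_long() helper and the values in main() are illustrative stand-ins, not the kernel's definitions. For a hypothetical 1100-byte object, at least 4 objects per slab and at most 1/8th waste, it settles on order 1: seven objects and 492 unused bytes per 8 KiB slab.

#include <stdio.h>

#define PAGE_SHIFT	12			/* assume 4 KiB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

static int fls_long(unsigned long x)		/* stand-in for the kernel's fls() */
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static int slab_order(int size, int min_objects, int max_order, int fract_leftover)
{
	int order = fls_long((unsigned long)min_objects * size - 1) - PAGE_SHIFT;

	if (order < 0)				/* slub_min_order assumed to be 0 */
		order = 0;

	for (; order <= max_order; order++) {
		unsigned long slab_size = PAGE_SIZE << order;

		if (slab_size < (unsigned long)min_objects * size)
			continue;
		if (slab_size % size <= slab_size / fract_leftover)
			break;
	}
	return order;
}

int main(void)
{
	/* 1100-byte objects, at least 4 per slab, at most 1/8th of the slab wasted */
	int order = slab_order(1100, 4, 3, 8);
	unsigned long slab_size = PAGE_SIZE << order;

	printf("order %d: %lu objects, %lu bytes left over\n",
	       order, slab_size / 1100, slab_size % 1100);
	/* prints: order 1: 7 objects, 492 bytes left over */
	return 0;
}
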
1466/* 1749/*
1467 * Function to figure out which alignment to use from the 1750 * Figure out what the alignment of the objects will be.
1468 * various ways of specifying it.
1469 */ 1751 */
1470static unsigned long calculate_alignment(unsigned long flags, 1752static unsigned long calculate_alignment(unsigned long flags,
1471 unsigned long align, unsigned long size) 1753 unsigned long align, unsigned long size)
@@ -1480,8 +1762,8 @@ static unsigned long calculate_alignment(unsigned long flags,
1480 * then use it. 1762 * then use it.
1481 */ 1763 */
1482 if ((flags & SLAB_HWCACHE_ALIGN) && 1764 if ((flags & SLAB_HWCACHE_ALIGN) &&
1483 size > L1_CACHE_BYTES / 2) 1765 size > cache_line_size() / 2)
1484 return max_t(unsigned long, align, L1_CACHE_BYTES); 1766 return max_t(unsigned long, align, cache_line_size());
1485 1767
1486 if (align < ARCH_SLAB_MINALIGN) 1768 if (align < ARCH_SLAB_MINALIGN)
1487 return ARCH_SLAB_MINALIGN; 1769 return ARCH_SLAB_MINALIGN;
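
As a worked example, assume a hypothetical CPU whose cache_line_size() is 64 bytes: a 100-byte object created with SLAB_HWCACHE_ALIGN is larger than half a cache line (100 > 32), so calculate_alignment() returns at least 64 and objects start on cache-line boundaries, while a 24-byte object with the same flag falls through to the caller-supplied or architecture minimum alignment, since cache-aligning objects that small would waste most of every line.
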
@@ -1525,7 +1807,7 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag
1525 page->freelist = get_freepointer(kmalloc_caches, n); 1807 page->freelist = get_freepointer(kmalloc_caches, n);
1526 page->inuse++; 1808 page->inuse++;
1527 kmalloc_caches->node[node] = n; 1809 kmalloc_caches->node[node] = n;
1528 init_object(kmalloc_caches, n, 1); 1810 setup_object_debug(kmalloc_caches, page, n);
1529 init_kmem_cache_node(n); 1811 init_kmem_cache_node(n);
1530 atomic_long_inc(&n->nr_slabs); 1812 atomic_long_inc(&n->nr_slabs);
1531 add_partial(n, page); 1813 add_partial(n, page);
@@ -1607,7 +1889,7 @@ static int calculate_sizes(struct kmem_cache *s)
1607 * then we should never poison the object itself. 1889 * then we should never poison the object itself.
1608 */ 1890 */
1609 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 1891 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
1610 !s->ctor && !s->dtor) 1892 !s->ctor)
1611 s->flags |= __OBJECT_POISON; 1893 s->flags |= __OBJECT_POISON;
1612 else 1894 else
1613 s->flags &= ~__OBJECT_POISON; 1895 s->flags &= ~__OBJECT_POISON;
@@ -1619,24 +1901,24 @@ static int calculate_sizes(struct kmem_cache *s)
1619 */ 1901 */
1620 size = ALIGN(size, sizeof(void *)); 1902 size = ALIGN(size, sizeof(void *));
1621 1903
1904#ifdef CONFIG_SLUB_DEBUG
1622 /* 1905 /*
1623 * If we are redzoning then check if there is some space between the 1906 * If we are Redzoning then check if there is some space between the
1624 * end of the object and the free pointer. If not then add an 1907 * end of the object and the free pointer. If not then add an
1625 * additional word, so that we can establish a redzone between 1908 * additional word to have some bytes to store Redzone information.
1626 * the object and the freepointer to be able to check for overwrites.
1627 */ 1909 */
1628 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 1910 if ((flags & SLAB_RED_ZONE) && size == s->objsize)
1629 size += sizeof(void *); 1911 size += sizeof(void *);
1912#endif
1630 1913
1631 /* 1914 /*
1632 * With that we have determined how much of the slab is in actual 1915 * With that we have determined the number of bytes in actual use
1633 * use by the object. This is the potential offset to the free 1916 * by the object. This is the potential offset to the free pointer.
1634 * pointer.
1635 */ 1917 */
1636 s->inuse = size; 1918 s->inuse = size;
1637 1919
1638 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 1920 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
1639 s->ctor || s->dtor)) { 1921 s->ctor)) {
1640 /* 1922 /*
1641 * Relocate free pointer after the object if it is not 1923 * Relocate free pointer after the object if it is not
1642 * permitted to overwrite the first word of the object on 1924 * permitted to overwrite the first word of the object on
@@ -1649,6 +1931,7 @@ static int calculate_sizes(struct kmem_cache *s)
1649 size += sizeof(void *); 1931 size += sizeof(void *);
1650 } 1932 }
1651 1933
1934#ifdef CONFIG_SLUB_DEBUG
1652 if (flags & SLAB_STORE_USER) 1935 if (flags & SLAB_STORE_USER)
1653 /* 1936 /*
1654 * Need to store information about allocs and frees after 1937 * Need to store information about allocs and frees after
@@ -1656,7 +1939,7 @@ static int calculate_sizes(struct kmem_cache *s)
1656 */ 1939 */
1657 size += 2 * sizeof(struct track); 1940 size += 2 * sizeof(struct track);
1658 1941
1659 if (flags & DEBUG_DEFAULT_FLAGS) 1942 if (flags & SLAB_RED_ZONE)
1660 /* 1943 /*
1661 * Add some empty padding so that we can catch 1944 * Add some empty padding so that we can catch
1662 * overwrites from earlier objects rather than let 1945 * overwrites from earlier objects rather than let
@@ -1665,10 +1948,12 @@ static int calculate_sizes(struct kmem_cache *s)
1665 * of the object. 1948 * of the object.
1666 */ 1949 */
1667 size += sizeof(void *); 1950 size += sizeof(void *);
1951#endif
1952
1668 /* 1953 /*
1669 * Determine the alignment based on various parameters that the 1954 * Determine the alignment based on various parameters that the
1670 * user specified (this is unnecessarily complex due to the attempt 1955 * user specified and the dynamic determination of cache line size
1671 * to be compatible with SLAB. Should be cleaned up some day). 1956 * on bootup.
1672 */ 1957 */
1673 align = calculate_alignment(flags, align, s->objsize); 1958 align = calculate_alignment(flags, align, s->objsize);
1674 1959
@@ -1700,62 +1985,18 @@ static int calculate_sizes(struct kmem_cache *s)
1700 1985
1701} 1986}
1702 1987
1703static int __init finish_bootstrap(void)
1704{
1705 struct list_head *h;
1706 int err;
1707
1708 slab_state = SYSFS;
1709
1710 list_for_each(h, &slab_caches) {
1711 struct kmem_cache *s =
1712 container_of(h, struct kmem_cache, list);
1713
1714 err = sysfs_slab_add(s);
1715 BUG_ON(err);
1716 }
1717 return 0;
1718}
1719
1720static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 1988static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
1721 const char *name, size_t size, 1989 const char *name, size_t size,
1722 size_t align, unsigned long flags, 1990 size_t align, unsigned long flags,
1723 void (*ctor)(void *, struct kmem_cache *, unsigned long), 1991 void (*ctor)(void *, struct kmem_cache *, unsigned long))
1724 void (*dtor)(void *, struct kmem_cache *, unsigned long))
1725{ 1992{
1726 memset(s, 0, kmem_size); 1993 memset(s, 0, kmem_size);
1727 s->name = name; 1994 s->name = name;
1728 s->ctor = ctor; 1995 s->ctor = ctor;
1729 s->dtor = dtor;
1730 s->objsize = size; 1996 s->objsize = size;
1731 s->flags = flags; 1997 s->flags = flags;
1732 s->align = align; 1998 s->align = align;
1733 1999 kmem_cache_open_debug_check(s);
1734 /*
1735 * The page->offset field is only 16 bit wide. This is an offset
1736 * in units of words from the beginning of an object. If the slab
1737 * size is bigger then we cannot move the free pointer behind the
1738 * object anymore.
1739 *
1740 * On 32 bit platforms the limit is 256k. On 64bit platforms
1741 * the limit is 512k.
1742 *
1743 * Debugging or ctor/dtors may create a need to move the free
1744 * pointer. Fail if this happens.
1745 */
1746 if (s->size >= 65535 * sizeof(void *)) {
1747 BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON |
1748 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
1749 BUG_ON(ctor || dtor);
1750 }
1751 else
1752 /*
1753 * Enable debugging if selected on the kernel commandline.
1754 */
1755 if (slub_debug && (!slub_debug_slabs ||
1756 strncmp(slub_debug_slabs, name,
1757 strlen(slub_debug_slabs)) == 0))
1758 s->flags |= slub_debug;
1759 2000
1760 if (!calculate_sizes(s)) 2001 if (!calculate_sizes(s))
1761 goto error; 2002 goto error;
@@ -1783,7 +2024,6 @@ EXPORT_SYMBOL(kmem_cache_open);
1783int kmem_ptr_validate(struct kmem_cache *s, const void *object) 2024int kmem_ptr_validate(struct kmem_cache *s, const void *object)
1784{ 2025{
1785 struct page * page; 2026 struct page * page;
1786 void *addr;
1787 2027
1788 page = get_object_page(object); 2028 page = get_object_page(object);
1789 2029
@@ -1791,13 +2031,7 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object)
1791 /* No slab or wrong slab */ 2031 /* No slab or wrong slab */
1792 return 0; 2032 return 0;
1793 2033
1794 addr = page_address(page); 2034 if (!check_valid_pointer(s, page, object))
1795 if (object < addr || object >= addr + s->objects * s->size)
1796 /* Out of bounds */
1797 return 0;
1798
1799 if ((object - addr) % s->size)
1800 /* Improperly aligned */
1801 return 0; 2035 return 0;
1802 2036
1803 /* 2037 /*
@@ -1826,7 +2060,8 @@ const char *kmem_cache_name(struct kmem_cache *s)
1826EXPORT_SYMBOL(kmem_cache_name); 2060EXPORT_SYMBOL(kmem_cache_name);
1827 2061
1828/* 2062/*
1829 * Attempt to free all slabs on a node 2063 * Attempt to free all slabs on a node. Return the number of slabs we
2064 * were unable to free.
1830 */ 2065 */
1831static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, 2066static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
1832 struct list_head *list) 2067 struct list_head *list)
@@ -1847,7 +2082,7 @@ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
1847} 2082}
1848 2083
1849/* 2084/*
1850 * Release all resources used by slab cache 2085 * Release all resources used by a slab cache.
1851 */ 2086 */
1852static int kmem_cache_close(struct kmem_cache *s) 2087static int kmem_cache_close(struct kmem_cache *s)
1853{ 2088{
@@ -1932,45 +2167,6 @@ static int __init setup_slub_nomerge(char *str)
1932 2167
1933__setup("slub_nomerge", setup_slub_nomerge); 2168__setup("slub_nomerge", setup_slub_nomerge);
1934 2169
1935static int __init setup_slub_debug(char *str)
1936{
1937 if (!str || *str != '=')
1938 slub_debug = DEBUG_DEFAULT_FLAGS;
1939 else {
1940 str++;
1941 if (*str == 0 || *str == ',')
1942 slub_debug = DEBUG_DEFAULT_FLAGS;
1943 else
1944 for( ;*str && *str != ','; str++)
1945 switch (*str) {
1946 case 'f' : case 'F' :
1947 slub_debug |= SLAB_DEBUG_FREE;
1948 break;
1949 case 'z' : case 'Z' :
1950 slub_debug |= SLAB_RED_ZONE;
1951 break;
1952 case 'p' : case 'P' :
1953 slub_debug |= SLAB_POISON;
1954 break;
1955 case 'u' : case 'U' :
1956 slub_debug |= SLAB_STORE_USER;
1957 break;
1958 case 't' : case 'T' :
1959 slub_debug |= SLAB_TRACE;
1960 break;
1961 default:
1962 printk(KERN_ERR "slub_debug option '%c' "
1963 "unknown. skipped\n",*str);
1964 }
1965 }
1966
1967 if (*str == ',')
1968 slub_debug_slabs = str + 1;
1969 return 1;
1970}
1971
1972__setup("slub_debug", setup_slub_debug);
1973
1974static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 2170static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
1975 const char *name, int size, gfp_t gfp_flags) 2171 const char *name, int size, gfp_t gfp_flags)
1976{ 2172{
@@ -1981,7 +2177,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
1981 2177
1982 down_write(&slub_lock); 2178 down_write(&slub_lock);
1983 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2179 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
1984 flags, NULL, NULL)) 2180 flags, NULL))
1985 goto panic; 2181 goto panic;
1986 2182
1987 list_add(&s->list, &slab_caches); 2183 list_add(&s->list, &slab_caches);
@@ -2108,13 +2304,14 @@ void kfree(const void *x)
2108EXPORT_SYMBOL(kfree); 2304EXPORT_SYMBOL(kfree);
2109 2305
2110/* 2306/*
2111 * kmem_cache_shrink removes empty slabs from the partial lists 2307 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
2112 * and then sorts the partially allocated slabs by the number 2308 * the remaining slabs by the number of items in use. The slabs with the
2113 * of items in use. The slabs with the most items in use 2309 * most items in use come first. New allocations will then fill those up
2114 * come first. New allocations will remove these from the 2310 * and thus they can be removed from the partial lists.
2115 * partial list because they are full. The slabs with the 2311 *
2116 * least items are placed last. If it happens that the objects 2312 * The slabs with the least items are placed last. This results in them
2117 * are freed then the page can be returned to the page allocator. 2313 * being allocated from last, increasing the chance that the last objects
2314 * are freed in them.
2118 */ 2315 */
2119int kmem_cache_shrink(struct kmem_cache *s) 2316int kmem_cache_shrink(struct kmem_cache *s)
2120{ 2317{
@@ -2143,12 +2340,10 @@ int kmem_cache_shrink(struct kmem_cache *s)
2143 spin_lock_irqsave(&n->list_lock, flags); 2340 spin_lock_irqsave(&n->list_lock, flags);
2144 2341
2145 /* 2342 /*
2146 * Build lists indexed by the items in use in 2343 * Build lists indexed by the items in use in each slab.
2147 * each slab or free slabs if empty.
2148 * 2344 *
2149 * Note that concurrent frees may occur while 2345 * Note that concurrent frees may occur while we hold the
2150 * we hold the list_lock. page->inuse here is 2346 * list_lock. page->inuse here is the upper limit.
2151 * the upper limit.
2152 */ 2347 */
2153 list_for_each_entry_safe(page, t, &n->partial, lru) { 2348 list_for_each_entry_safe(page, t, &n->partial, lru) {
2154 if (!page->inuse && slab_trylock(page)) { 2349 if (!page->inuse && slab_trylock(page)) {
@@ -2172,8 +2367,8 @@ int kmem_cache_shrink(struct kmem_cache *s)
2172 goto out; 2367 goto out;
2173 2368
2174 /* 2369 /*
2175 * Rebuild the partial list with the slabs filled up 2370 * Rebuild the partial list with the slabs filled up most
2176 * most first and the least used slabs at the end. 2371 * first and the least used slabs at the end.
2177 */ 2372 */
2178 for (i = s->objects - 1; i >= 0; i--) 2373 for (i = s->objects - 1; i >= 0; i--)
2179 list_splice(slabs_by_inuse + i, n->partial.prev); 2374 list_splice(slabs_by_inuse + i, n->partial.prev);
@@ -2189,7 +2384,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2189 2384
2190/** 2385/**
2191 * krealloc - reallocate memory. The contents will remain unchanged. 2386 * krealloc - reallocate memory. The contents will remain unchanged.
2192 *
2193 * @p: object to reallocate memory for. 2387 * @p: object to reallocate memory for.
2194 * @new_size: how many bytes of memory are required. 2388 * @new_size: how many bytes of memory are required.
2195 * @flags: the type of memory to allocate. 2389 * @flags: the type of memory to allocate.
@@ -2201,9 +2395,8 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2201 */ 2395 */
2202void *krealloc(const void *p, size_t new_size, gfp_t flags) 2396void *krealloc(const void *p, size_t new_size, gfp_t flags)
2203{ 2397{
2204 struct kmem_cache *new_cache;
2205 void *ret; 2398 void *ret;
2206 struct page *page; 2399 size_t ks;
2207 2400
2208 if (unlikely(!p)) 2401 if (unlikely(!p))
2209 return kmalloc(new_size, flags); 2402 return kmalloc(new_size, flags);
@@ -2213,19 +2406,13 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
2213 return NULL; 2406 return NULL;
2214 } 2407 }
2215 2408
2216 page = virt_to_head_page(p); 2409 ks = ksize(p);
2217 2410 if (ks >= new_size)
2218 new_cache = get_slab(new_size, flags);
2219
2220 /*
2221 * If new size fits in the current cache, bail out.
2222 */
2223 if (likely(page->slab == new_cache))
2224 return (void *)p; 2411 return (void *)p;
2225 2412
2226 ret = kmalloc(new_size, flags); 2413 ret = kmalloc(new_size, flags);
2227 if (ret) { 2414 if (ret) {
2228 memcpy(ret, p, min(new_size, ksize(p))); 2415 memcpy(ret, p, min(new_size, ks));
2229 kfree(p); 2416 kfree(p);
2230 } 2417 }
2231 return ret; 2418 return ret;
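
A hypothetical caller illustrates the contract that follows from the ksize() check above; grow_array() and its parameters are invented for this sketch and are not kernel APIs. Growing within the slack of the current kmalloc cache hands back the same pointer without copying, and on failure the original buffer is left untouched and still owned by the caller.

#include <linux/slab.h>

/* Grow @arr to hold @new_n integers; on failure the old buffer survives. */
static int *grow_array(int *arr, size_t new_n, gfp_t gfp)
{
	int *tmp = krealloc(arr, new_n * sizeof(*tmp), gfp);

	if (!tmp)
		return NULL;	/* caller still owns @arr and must kfree() it */
	return tmp;		/* may equal @arr if ksize(@arr) already covered new_n */
}
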
@@ -2243,11 +2430,12 @@ void __init kmem_cache_init(void)
2243#ifdef CONFIG_NUMA 2430#ifdef CONFIG_NUMA
2244 /* 2431 /*
2245 * Must first have the slab cache available for the allocations of the 2432 * Must first have the slab cache available for the allocations of the
2246 * struct kmalloc_cache_node's. There is special bootstrap code in 2433 * struct kmem_cache_node's. There is special bootstrap code in
2247 * kmem_cache_open for slab_state == DOWN. 2434 * kmem_cache_open for slab_state == DOWN.
2248 */ 2435 */
2249 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 2436 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
2250 sizeof(struct kmem_cache_node), GFP_KERNEL); 2437 sizeof(struct kmem_cache_node), GFP_KERNEL);
2438 kmalloc_caches[0].refcount = -1;
2251#endif 2439#endif
2252 2440
2253 /* Able to allocate the per node structures */ 2441 /* Able to allocate the per node structures */
@@ -2274,13 +2462,12 @@ void __init kmem_cache_init(void)
2274 register_cpu_notifier(&slab_notifier); 2462 register_cpu_notifier(&slab_notifier);
2275#endif 2463#endif
2276 2464
2277 if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */ 2465 kmem_size = offsetof(struct kmem_cache, cpu_slab) +
2278 kmem_size = offsetof(struct kmem_cache, cpu_slab) 2466 nr_cpu_ids * sizeof(struct page *);
2279 + nr_cpu_ids * sizeof(struct page *);
2280 2467
2281 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2468 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2282 " Processors=%d, Nodes=%d\n", 2469 " Processors=%d, Nodes=%d\n",
2283 KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES, 2470 KMALLOC_SHIFT_HIGH, cache_line_size(),
2284 slub_min_order, slub_max_order, slub_min_objects, 2471 slub_min_order, slub_max_order, slub_min_objects,
2285 nr_cpu_ids, nr_node_ids); 2472 nr_cpu_ids, nr_node_ids);
2286} 2473}
@@ -2293,7 +2480,13 @@ static int slab_unmergeable(struct kmem_cache *s)
2293 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 2480 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
2294 return 1; 2481 return 1;
2295 2482
2296 if (s->ctor || s->dtor) 2483 if (s->ctor)
2484 return 1;
2485
2486 /*
2487 * We may have set a slab to be unmergeable during bootstrap.
2488 */
2489 if (s->refcount < 0)
2297 return 1; 2490 return 1;
2298 2491
2299 return 0; 2492 return 0;
@@ -2301,15 +2494,14 @@ static int slab_unmergeable(struct kmem_cache *s)
2301 2494
2302static struct kmem_cache *find_mergeable(size_t size, 2495static struct kmem_cache *find_mergeable(size_t size,
2303 size_t align, unsigned long flags, 2496 size_t align, unsigned long flags,
2304 void (*ctor)(void *, struct kmem_cache *, unsigned long), 2497 void (*ctor)(void *, struct kmem_cache *, unsigned long))
2305 void (*dtor)(void *, struct kmem_cache *, unsigned long))
2306{ 2498{
2307 struct list_head *h; 2499 struct list_head *h;
2308 2500
2309 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 2501 if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
2310 return NULL; 2502 return NULL;
2311 2503
2312 if (ctor || dtor) 2504 if (ctor)
2313 return NULL; 2505 return NULL;
2314 2506
2315 size = ALIGN(size, sizeof(void *)); 2507 size = ALIGN(size, sizeof(void *));
@@ -2351,8 +2543,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
2351{ 2543{
2352 struct kmem_cache *s; 2544 struct kmem_cache *s;
2353 2545
2546 BUG_ON(dtor);
2354 down_write(&slub_lock); 2547 down_write(&slub_lock);
2355 s = find_mergeable(size, align, flags, dtor, ctor); 2548 s = find_mergeable(size, align, flags, ctor);
2356 if (s) { 2549 if (s) {
2357 s->refcount++; 2550 s->refcount++;
2358 /* 2551 /*
@@ -2366,7 +2559,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
2366 } else { 2559 } else {
2367 s = kmalloc(kmem_size, GFP_KERNEL); 2560 s = kmalloc(kmem_size, GFP_KERNEL);
2368 if (s && kmem_cache_open(s, GFP_KERNEL, name, 2561 if (s && kmem_cache_open(s, GFP_KERNEL, name,
2369 size, align, flags, ctor, dtor)) { 2562 size, align, flags, ctor)) {
2370 if (sysfs_slab_add(s)) { 2563 if (sysfs_slab_add(s)) {
2371 kfree(s); 2564 kfree(s);
2372 goto err; 2565 goto err;
@@ -2415,8 +2608,21 @@ static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu)
2415} 2608}
2416 2609
2417/* 2610/*
2418 * Use the cpu notifier to ensure that the slabs are flushed 2611 * Version of __flush_cpu_slab for the case that interrupts
2419 * when necessary. 2612 * are enabled.
2613 */
2614static void cpu_slab_flush(struct kmem_cache *s, int cpu)
2615{
2616 unsigned long flags;
2617
2618 local_irq_save(flags);
2619 __flush_cpu_slab(s, cpu);
2620 local_irq_restore(flags);
2621}
2622
2623/*
2624 * Use the cpu notifier to ensure that the cpu slabs are flushed when
2625 * necessary.
2420 */ 2626 */
2421static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 2627static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2422 unsigned long action, void *hcpu) 2628 unsigned long action, void *hcpu)
@@ -2425,8 +2631,10 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2425 2631
2426 switch (action) { 2632 switch (action) {
2427 case CPU_UP_CANCELED: 2633 case CPU_UP_CANCELED:
2634 case CPU_UP_CANCELED_FROZEN:
2428 case CPU_DEAD: 2635 case CPU_DEAD:
2429 for_all_slabs(__flush_cpu_slab, cpu); 2636 case CPU_DEAD_FROZEN:
2637 for_all_slabs(cpu_slab_flush, cpu);
2430 break; 2638 break;
2431 default: 2639 default:
2432 break; 2640 break;
@@ -2439,153 +2647,6 @@ static struct notifier_block __cpuinitdata slab_notifier =
2439 2647
2440#endif 2648#endif
2441 2649
2442#ifdef CONFIG_NUMA
2443
2444/*****************************************************************
2445 * Generic reaper used to support the page allocator
2446 * (the cpu slabs are reaped by a per slab workqueue).
2447 *
2448 * Maybe move this to the page allocator?
2449 ****************************************************************/
2450
2451static DEFINE_PER_CPU(unsigned long, reap_node);
2452
2453static void init_reap_node(int cpu)
2454{
2455 int node;
2456
2457 node = next_node(cpu_to_node(cpu), node_online_map);
2458 if (node == MAX_NUMNODES)
2459 node = first_node(node_online_map);
2460
2461 __get_cpu_var(reap_node) = node;
2462}
2463
2464static void next_reap_node(void)
2465{
2466 int node = __get_cpu_var(reap_node);
2467
2468 /*
2469 * Also drain per cpu pages on remote zones
2470 */
2471 if (node != numa_node_id())
2472 drain_node_pages(node);
2473
2474 node = next_node(node, node_online_map);
2475 if (unlikely(node >= MAX_NUMNODES))
2476 node = first_node(node_online_map);
2477 __get_cpu_var(reap_node) = node;
2478}
2479#else
2480#define init_reap_node(cpu) do { } while (0)
2481#define next_reap_node(void) do { } while (0)
2482#endif
2483
2484#define REAPTIMEOUT_CPUC (2*HZ)
2485
2486#ifdef CONFIG_SMP
2487static DEFINE_PER_CPU(struct delayed_work, reap_work);
2488
2489static void cache_reap(struct work_struct *unused)
2490{
2491 next_reap_node();
2492 refresh_cpu_vm_stats(smp_processor_id());
2493 schedule_delayed_work(&__get_cpu_var(reap_work),
2494 REAPTIMEOUT_CPUC);
2495}
2496
2497static void __devinit start_cpu_timer(int cpu)
2498{
2499 struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
2500
2501 /*
2502 * When this gets called from do_initcalls via cpucache_init(),
2503 * init_workqueues() has already run, so keventd will be setup
2504 * at that time.
2505 */
2506 if (keventd_up() && reap_work->work.func == NULL) {
2507 init_reap_node(cpu);
2508 INIT_DELAYED_WORK(reap_work, cache_reap);
2509 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
2510 }
2511}
2512
2513static int __init cpucache_init(void)
2514{
2515 int cpu;
2516
2517 /*
2518 * Register the timers that drain pcp pages and update vm statistics
2519 */
2520 for_each_online_cpu(cpu)
2521 start_cpu_timer(cpu);
2522 return 0;
2523}
2524__initcall(cpucache_init);
2525#endif
2526
2527#ifdef SLUB_RESILIENCY_TEST
2528static unsigned long validate_slab_cache(struct kmem_cache *s);
2529
2530static void resiliency_test(void)
2531{
2532 u8 *p;
2533
2534 printk(KERN_ERR "SLUB resiliency testing\n");
2535 printk(KERN_ERR "-----------------------\n");
2536 printk(KERN_ERR "A. Corruption after allocation\n");
2537
2538 p = kzalloc(16, GFP_KERNEL);
2539 p[16] = 0x12;
2540 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
2541 " 0x12->0x%p\n\n", p + 16);
2542
2543 validate_slab_cache(kmalloc_caches + 4);
2544
2545 /* Hmmm... The next two are dangerous */
2546 p = kzalloc(32, GFP_KERNEL);
2547 p[32 + sizeof(void *)] = 0x34;
2548 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
2549 " 0x34 -> -0x%p\n", p);
2550 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2551
2552 validate_slab_cache(kmalloc_caches + 5);
2553 p = kzalloc(64, GFP_KERNEL);
2554 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
2555 *p = 0x56;
2556 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
2557 p);
2558 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2559 validate_slab_cache(kmalloc_caches + 6);
2560
2561 printk(KERN_ERR "\nB. Corruption after free\n");
2562 p = kzalloc(128, GFP_KERNEL);
2563 kfree(p);
2564 *p = 0x78;
2565 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
2566 validate_slab_cache(kmalloc_caches + 7);
2567
2568 p = kzalloc(256, GFP_KERNEL);
2569 kfree(p);
2570 p[50] = 0x9a;
2571 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
2572 validate_slab_cache(kmalloc_caches + 8);
2573
2574 p = kzalloc(512, GFP_KERNEL);
2575 kfree(p);
2576 p[512] = 0xab;
2577 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
2578 validate_slab_cache(kmalloc_caches + 9);
2579}
2580#else
2581static void resiliency_test(void) {};
2582#endif
2583
2584/*
2585 * These are not as efficient as kmalloc for the non debug case.
2586 * We do not have the page struct available so we have to touch one
2587 * cacheline in struct kmem_cache to check slab flags.
2588 */
2589void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) 2650void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2590{ 2651{
2591 struct kmem_cache *s = get_slab(size, gfpflags); 2652 struct kmem_cache *s = get_slab(size, gfpflags);
@@ -2607,13 +2668,12 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
2607 return slab_alloc(s, gfpflags, node, caller); 2668 return slab_alloc(s, gfpflags, node, caller);
2608} 2669}
2609 2670
2610#ifdef CONFIG_SYSFS 2671#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
2611
2612static int validate_slab(struct kmem_cache *s, struct page *page) 2672static int validate_slab(struct kmem_cache *s, struct page *page)
2613{ 2673{
2614 void *p; 2674 void *p;
2615 void *addr = page_address(page); 2675 void *addr = page_address(page);
2616 unsigned long map[BITS_TO_LONGS(s->objects)]; 2676 DECLARE_BITMAP(map, s->objects);
2617 2677
2618 if (!check_slab(s, page) || 2678 if (!check_slab(s, page) ||
2619 !on_freelist(s, page, NULL)) 2679 !on_freelist(s, page, NULL))
@@ -2622,14 +2682,14 @@ static int validate_slab(struct kmem_cache *s, struct page *page)
2622 /* Now we know that a valid freelist exists */ 2682 /* Now we know that a valid freelist exists */
2623 bitmap_zero(map, s->objects); 2683 bitmap_zero(map, s->objects);
2624 2684
2625 for(p = page->freelist; p; p = get_freepointer(s, p)) { 2685 for_each_free_object(p, s, page->freelist) {
2626 set_bit((p - addr) / s->size, map); 2686 set_bit(slab_index(p, s, addr), map);
2627 if (!check_object(s, page, p, 0)) 2687 if (!check_object(s, page, p, 0))
2628 return 0; 2688 return 0;
2629 } 2689 }
2630 2690
2631 for(p = addr; p < addr + s->objects * s->size; p += s->size) 2691 for_each_object(p, s, addr)
2632 if (!test_bit((p - addr) / s->size, map)) 2692 if (!test_bit(slab_index(p, s, addr), map))
2633 if (!check_object(s, page, p, 1)) 2693 if (!check_object(s, page, p, 1))
2634 return 0; 2694 return 0;
2635 return 1; 2695 return 1;
@@ -2645,12 +2705,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page)
2645 s->name, page); 2705 s->name, page);
2646 2706
2647 if (s->flags & DEBUG_DEFAULT_FLAGS) { 2707 if (s->flags & DEBUG_DEFAULT_FLAGS) {
2648 if (!PageError(page)) 2708 if (!SlabDebug(page))
2649 printk(KERN_ERR "SLUB %s: PageError not set " 2709 printk(KERN_ERR "SLUB %s: SlabDebug not set "
2650 "on slab 0x%p\n", s->name, page); 2710 "on slab 0x%p\n", s->name, page);
2651 } else { 2711 } else {
2652 if (PageError(page)) 2712 if (SlabDebug(page))
2653 printk(KERN_ERR "SLUB %s: PageError set on " 2713 printk(KERN_ERR "SLUB %s: SlabDebug set on "
2654 "slab 0x%p\n", s->name, page); 2714 "slab 0x%p\n", s->name, page);
2655 } 2715 }
2656} 2716}
@@ -2702,14 +2762,76 @@ static unsigned long validate_slab_cache(struct kmem_cache *s)
2702 return count; 2762 return count;
2703} 2763}
2704 2764
2765#ifdef SLUB_RESILIENCY_TEST
2766static void resiliency_test(void)
2767{
2768 u8 *p;
2769
2770 printk(KERN_ERR "SLUB resiliency testing\n");
2771 printk(KERN_ERR "-----------------------\n");
2772 printk(KERN_ERR "A. Corruption after allocation\n");
2773
2774 p = kzalloc(16, GFP_KERNEL);
2775 p[16] = 0x12;
2776 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
2777 " 0x12->0x%p\n\n", p + 16);
2778
2779 validate_slab_cache(kmalloc_caches + 4);
2780
2781 /* Hmmm... The next two are dangerous */
2782 p = kzalloc(32, GFP_KERNEL);
2783 p[32 + sizeof(void *)] = 0x34;
2784 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
2785 " 0x34 -> -0x%p\n", p);
2786 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2787
2788 validate_slab_cache(kmalloc_caches + 5);
2789 p = kzalloc(64, GFP_KERNEL);
2790 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
2791 *p = 0x56;
2792 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
2793 p);
2794 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2795 validate_slab_cache(kmalloc_caches + 6);
2796
2797 printk(KERN_ERR "\nB. Corruption after free\n");
2798 p = kzalloc(128, GFP_KERNEL);
2799 kfree(p);
2800 *p = 0x78;
2801 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
2802 validate_slab_cache(kmalloc_caches + 7);
2803
2804 p = kzalloc(256, GFP_KERNEL);
2805 kfree(p);
2806 p[50] = 0x9a;
2807 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
2808 validate_slab_cache(kmalloc_caches + 8);
2809
2810 p = kzalloc(512, GFP_KERNEL);
2811 kfree(p);
2812 p[512] = 0xab;
2813 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
2814 validate_slab_cache(kmalloc_caches + 9);
2815}
2816#else
2817static void resiliency_test(void) {};
2818#endif
2819
2705/* 2820/*
2706 * Generate lists of locations where slabcache objects are allocated 2821 * Generate lists of code addresses where slabcache objects are allocated
2707 * and freed. 2822 * and freed.
2708 */ 2823 */
2709 2824
2710struct location { 2825struct location {
2711 unsigned long count; 2826 unsigned long count;
2712 void *addr; 2827 void *addr;
2828 long long sum_time;
2829 long min_time;
2830 long max_time;
2831 long min_pid;
2832 long max_pid;
2833 cpumask_t cpus;
2834 nodemask_t nodes;
2713}; 2835};
2714 2836
2715struct loc_track { 2837struct loc_track {
@@ -2750,11 +2872,12 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max)
2750} 2872}
2751 2873
2752static int add_location(struct loc_track *t, struct kmem_cache *s, 2874static int add_location(struct loc_track *t, struct kmem_cache *s,
2753 void *addr) 2875 const struct track *track)
2754{ 2876{
2755 long start, end, pos; 2877 long start, end, pos;
2756 struct location *l; 2878 struct location *l;
2757 void *caddr; 2879 void *caddr;
2880 unsigned long age = jiffies - track->when;
2758 2881
2759 start = -1; 2882 start = -1;
2760 end = t->count; 2883 end = t->count;
@@ -2770,19 +2893,36 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
2770 break; 2893 break;
2771 2894
2772 caddr = t->loc[pos].addr; 2895 caddr = t->loc[pos].addr;
2773 if (addr == caddr) { 2896 if (track->addr == caddr) {
2774 t->loc[pos].count++; 2897
2898 l = &t->loc[pos];
2899 l->count++;
2900 if (track->when) {
2901 l->sum_time += age;
2902 if (age < l->min_time)
2903 l->min_time = age;
2904 if (age > l->max_time)
2905 l->max_time = age;
2906
2907 if (track->pid < l->min_pid)
2908 l->min_pid = track->pid;
2909 if (track->pid > l->max_pid)
2910 l->max_pid = track->pid;
2911
2912 cpu_set(track->cpu, l->cpus);
2913 }
2914 node_set(page_to_nid(virt_to_page(track)), l->nodes);
2775 return 1; 2915 return 1;
2776 } 2916 }
2777 2917
2778 if (addr < caddr) 2918 if (track->addr < caddr)
2779 end = pos; 2919 end = pos;
2780 else 2920 else
2781 start = pos; 2921 start = pos;
2782 } 2922 }
2783 2923
2784 /* 2924 /*
2785 * Not found. Insert new tracking element 2925 * Not found. Insert new tracking element.
2786 */ 2926 */
2787 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) 2927 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max))
2788 return 0; 2928 return 0;
@@ -2793,7 +2933,16 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
2793 (t->count - pos) * sizeof(struct location)); 2933 (t->count - pos) * sizeof(struct location));
2794 t->count++; 2934 t->count++;
2795 l->count = 1; 2935 l->count = 1;
2796 l->addr = addr; 2936 l->addr = track->addr;
2937 l->sum_time = age;
2938 l->min_time = age;
2939 l->max_time = age;
2940 l->min_pid = track->pid;
2941 l->max_pid = track->pid;
2942 cpus_clear(l->cpus);
2943 cpu_set(track->cpu, l->cpus);
2944 nodes_clear(l->nodes);
2945 node_set(page_to_nid(virt_to_page(track)), l->nodes);
2797 return 1; 2946 return 1;
2798} 2947}
2799 2948
@@ -2801,19 +2950,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
2801 struct page *page, enum track_item alloc) 2950 struct page *page, enum track_item alloc)
2802{ 2951{
2803 void *addr = page_address(page); 2952 void *addr = page_address(page);
2804 unsigned long map[BITS_TO_LONGS(s->objects)]; 2953 DECLARE_BITMAP(map, s->objects);
2805 void *p; 2954 void *p;
2806 2955
2807 bitmap_zero(map, s->objects); 2956 bitmap_zero(map, s->objects);
2808 for (p = page->freelist; p; p = get_freepointer(s, p)) 2957 for_each_free_object(p, s, page->freelist)
2809 set_bit((p - addr) / s->size, map); 2958 set_bit(slab_index(p, s, addr), map);
2810
2811 for (p = addr; p < addr + s->objects * s->size; p += s->size)
2812 if (!test_bit((p - addr) / s->size, map)) {
2813 void *addr = get_track(s, p, alloc)->addr;
2814 2959
2815 add_location(t, s, addr); 2960 for_each_object(p, s, addr)
2816 } 2961 if (!test_bit(slab_index(p, s, addr), map))
2962 add_location(t, s, get_track(s, p, alloc));
2817} 2963}
2818 2964
2819static int list_locations(struct kmem_cache *s, char *buf, 2965static int list_locations(struct kmem_cache *s, char *buf,
@@ -2847,15 +2993,47 @@ static int list_locations(struct kmem_cache *s, char *buf,
2847 } 2993 }
2848 2994
2849 for (i = 0; i < t.count; i++) { 2995 for (i = 0; i < t.count; i++) {
2850 void *addr = t.loc[i].addr; 2996 struct location *l = &t.loc[i];
2851 2997
2852 if (n > PAGE_SIZE - 100) 2998 if (n > PAGE_SIZE - 100)
2853 break; 2999 break;
2854 n += sprintf(buf + n, "%7ld ", t.loc[i].count); 3000 n += sprintf(buf + n, "%7ld ", l->count);
2855 if (addr) 3001
2856 n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr); 3002 if (l->addr)
3003 n += sprint_symbol(buf + n, (unsigned long)l->addr);
2857 else 3004 else
2858 n += sprintf(buf + n, "<not-available>"); 3005 n += sprintf(buf + n, "<not-available>");
3006
3007 if (l->sum_time != l->min_time) {
3008 unsigned long remainder;
3009
3010 n += sprintf(buf + n, " age=%ld/%ld/%ld",
3011 l->min_time,
3012 div_long_long_rem(l->sum_time, l->count, &remainder),
3013 l->max_time);
3014 } else
3015 n += sprintf(buf + n, " age=%ld",
3016 l->min_time);
3017
3018 if (l->min_pid != l->max_pid)
3019 n += sprintf(buf + n, " pid=%ld-%ld",
3020 l->min_pid, l->max_pid);
3021 else
3022 n += sprintf(buf + n, " pid=%ld",
3023 l->min_pid);
3024
3025 if (num_online_cpus() > 1 && !cpus_empty(l->cpus)) {
3026 n += sprintf(buf + n, " cpus=");
3027 n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50,
3028 l->cpus);
3029 }
3030
3031 if (num_online_nodes() > 1 && !nodes_empty(l->nodes)) {
3032 n += sprintf(buf + n, " nodes=");
3033 n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50,
3034 l->nodes);
3035 }
3036
2859 n += sprintf(buf + n, "\n"); 3037 n += sprintf(buf + n, "\n");
2860 } 3038 }
2861 3039
@@ -3035,17 +3213,6 @@ static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3035} 3213}
3036SLAB_ATTR_RO(ctor); 3214SLAB_ATTR_RO(ctor);
3037 3215
3038static ssize_t dtor_show(struct kmem_cache *s, char *buf)
3039{
3040 if (s->dtor) {
3041 int n = sprint_symbol(buf, (unsigned long)s->dtor);
3042
3043 return n + sprintf(buf + n, "\n");
3044 }
3045 return 0;
3046}
3047SLAB_ATTR_RO(dtor);
3048
3049static ssize_t aliases_show(struct kmem_cache *s, char *buf) 3216static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3050{ 3217{
3051 return sprintf(buf, "%d\n", s->refcount - 1); 3218 return sprintf(buf, "%d\n", s->refcount - 1);
@@ -3277,7 +3444,6 @@ static struct attribute * slab_attrs[] = {
3277 &partial_attr.attr, 3444 &partial_attr.attr,
3278 &cpu_slabs_attr.attr, 3445 &cpu_slabs_attr.attr,
3279 &ctor_attr.attr, 3446 &ctor_attr.attr,
3280 &dtor_attr.attr,
3281 &aliases_attr.attr, 3447 &aliases_attr.attr,
3282 &align_attr.attr, 3448 &align_attr.attr,
3283 &sanity_checks_attr.attr, 3449 &sanity_checks_attr.attr,
@@ -3491,6 +3657,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
3491 3657
3492static int __init slab_sysfs_init(void) 3658static int __init slab_sysfs_init(void)
3493{ 3659{
3660 struct list_head *h;
3494 int err; 3661 int err;
3495 3662
3496 err = subsystem_register(&slab_subsys); 3663 err = subsystem_register(&slab_subsys);
@@ -3499,7 +3666,15 @@ static int __init slab_sysfs_init(void)
3499 return -ENOSYS; 3666 return -ENOSYS;
3500 } 3667 }
3501 3668
3502 finish_bootstrap(); 3669 slab_state = SYSFS;
3670
3671 list_for_each(h, &slab_caches) {
3672 struct kmem_cache *s =
3673 container_of(h, struct kmem_cache, list);
3674
3675 err = sysfs_slab_add(s);
3676 BUG_ON(err);
3677 }
3503 3678
3504 while (alias_list) { 3679 while (alias_list) {
3505 struct saved_alias *al = alias_list; 3680 struct saved_alias *al = alias_list;
@@ -3515,6 +3690,4 @@ static int __init slab_sysfs_init(void)
3515} 3690}
3516 3691
3517__initcall(slab_sysfs_init); 3692__initcall(slab_sysfs_init);
3518#else
3519__initcall(finish_bootstrap);
3520#endif 3693#endif
diff --git a/mm/sparse.c b/mm/sparse.c
index 893e5621c247..545e4d3afcdf 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -44,7 +44,7 @@ EXPORT_SYMBOL(page_to_nid);
44#endif 44#endif
45 45
46#ifdef CONFIG_SPARSEMEM_EXTREME 46#ifdef CONFIG_SPARSEMEM_EXTREME
47static struct mem_section *sparse_index_alloc(int nid) 47static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
48{ 48{
49 struct mem_section *section = NULL; 49 struct mem_section *section = NULL;
50 unsigned long array_size = SECTIONS_PER_ROOT * 50 unsigned long array_size = SECTIONS_PER_ROOT *
@@ -61,7 +61,7 @@ static struct mem_section *sparse_index_alloc(int nid)
61 return section; 61 return section;
62} 62}
63 63
64static int sparse_index_init(unsigned long section_nr, int nid) 64static int __meminit sparse_index_init(unsigned long section_nr, int nid)
65{ 65{
66 static DEFINE_SPINLOCK(index_init_lock); 66 static DEFINE_SPINLOCK(index_init_lock);
67 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 67 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
@@ -138,7 +138,7 @@ static inline int sparse_early_nid(struct mem_section *section)
138} 138}
139 139
140/* Record a memory area against a node. */ 140/* Record a memory area against a node. */
141void memory_present(int nid, unsigned long start, unsigned long end) 141void __init memory_present(int nid, unsigned long start, unsigned long end)
142{ 142{
143 unsigned long pfn; 143 unsigned long pfn;
144 144
@@ -197,7 +197,7 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
197 return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); 197 return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
198} 198}
199 199
200static int sparse_init_one_section(struct mem_section *ms, 200static int __meminit sparse_init_one_section(struct mem_section *ms,
201 unsigned long pnum, struct page *mem_map) 201 unsigned long pnum, struct page *mem_map)
202{ 202{
203 if (!valid_section(ms)) 203 if (!valid_section(ms))
@@ -209,7 +209,13 @@ static int sparse_init_one_section(struct mem_section *ms,
209 return 1; 209 return 1;
210} 210}
211 211
212static struct page *sparse_early_mem_map_alloc(unsigned long pnum) 212__attribute__((weak))
213void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
214{
215 return NULL;
216}
217
218static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
213{ 219{
214 struct page *map; 220 struct page *map;
215 struct mem_section *ms = __nr_to_section(pnum); 221 struct mem_section *ms = __nr_to_section(pnum);
@@ -219,6 +225,11 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
219 if (map) 225 if (map)
220 return map; 226 return map;
221 227
228 map = alloc_bootmem_high_node(NODE_DATA(nid),
229 sizeof(struct page) * PAGES_PER_SECTION);
230 if (map)
231 return map;
232
222 map = alloc_bootmem_node(NODE_DATA(nid), 233 map = alloc_bootmem_node(NODE_DATA(nid),
223 sizeof(struct page) * PAGES_PER_SECTION); 234 sizeof(struct page) * PAGES_PER_SECTION);
224 if (map) 235 if (map)
@@ -288,6 +299,7 @@ void __init sparse_init(void)
288 } 299 }
289} 300}
290 301
302#ifdef CONFIG_MEMORY_HOTPLUG
291/* 303/*
292 * returns the number of sections whose mem_maps were properly 304 * returns the number of sections whose mem_maps were properly
293 * set. If this is <=0, then that means that the passed-in 305 * set. If this is <=0, then that means that the passed-in
@@ -327,3 +339,4 @@ out:
327 __kfree_section_memmap(memmap, nr_pages); 339 __kfree_section_memmap(memmap, nr_pages);
328 return ret; 340 return ret;
329} 341}
342#endif
diff --git a/mm/swap.c b/mm/swap.c
index 218c52a24a21..d3cb966fe992 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -488,7 +488,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
488 long *committed; 488 long *committed;
489 489
490 committed = &per_cpu(committed_space, (long)hcpu); 490 committed = &per_cpu(committed_space, (long)hcpu);
491 if (action == CPU_DEAD) { 491 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
492 atomic_add(*committed, &vm_committed_space); 492 atomic_add(*committed, &vm_committed_space);
493 *committed = 0; 493 *committed = 0;
494 __lru_add_drain((long)hcpu); 494 __lru_add_drain((long)hcpu);
diff --git a/mm/thrash.c b/mm/thrash.c
index 9ef9071f99bc..c4c5205a9c35 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -48,9 +48,8 @@ void grab_swap_token(void)
48 if (current_interval < current->mm->last_interval) 48 if (current_interval < current->mm->last_interval)
49 current->mm->token_priority++; 49 current->mm->token_priority++;
50 else { 50 else {
51 current->mm->token_priority--; 51 if (likely(current->mm->token_priority > 0))
52 if (unlikely(current->mm->token_priority < 0)) 52 current->mm->token_priority--;
53 current->mm->token_priority = 0;
54 } 53 }
55 /* Check if we deserve the token */ 54 /* Check if we deserve the token */
56 if (current->mm->token_priority > 55 if (current->mm->token_priority >
diff --git a/mm/truncate.c b/mm/truncate.c
index 0f4b6d18ab0e..4fbe1a2da5fb 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -12,6 +12,7 @@
12#include <linux/swap.h> 12#include <linux/swap.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/highmem.h>
15#include <linux/pagevec.h> 16#include <linux/pagevec.h>
16#include <linux/task_io_accounting_ops.h> 17#include <linux/task_io_accounting_ops.h>
17#include <linux/buffer_head.h> /* grr. try_to_release_page, 18#include <linux/buffer_head.h> /* grr. try_to_release_page,
@@ -46,7 +47,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
46 47
47static inline void truncate_partial_page(struct page *page, unsigned partial) 48static inline void truncate_partial_page(struct page *page, unsigned partial)
48{ 49{
49 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); 50 zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0);
50 if (PagePrivate(page)) 51 if (PagePrivate(page))
51 do_invalidatepage(page, partial); 52 do_invalidatepage(page, partial);
52} 53}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index cb5aabda7046..d3a9c5368257 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -311,7 +311,7 @@ struct vm_struct *remove_vm_area(void *addr)
311 return v; 311 return v;
312} 312}
313 313
314void __vunmap(void *addr, int deallocate_pages) 314static void __vunmap(void *addr, int deallocate_pages)
315{ 315{
316 struct vm_struct *area; 316 struct vm_struct *area;
317 317
@@ -755,3 +755,10 @@ out_einval_locked:
755} 755}
756EXPORT_SYMBOL(remap_vmalloc_range); 756EXPORT_SYMBOL(remap_vmalloc_range);
757 757
758/*
759 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
760 * have one.
761 */
762void __attribute__((weak)) vmalloc_sync_all(void)
763{
764}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 56651a10c366..1be5a6376ef0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -284,12 +284,8 @@ static void handle_write_error(struct address_space *mapping,
284 struct page *page, int error) 284 struct page *page, int error)
285{ 285{
286 lock_page(page); 286 lock_page(page);
287 if (page_mapping(page) == mapping) { 287 if (page_mapping(page) == mapping)
288 if (error == -ENOSPC) 288 mapping_set_error(mapping, error);
289 set_bit(AS_ENOSPC, &mapping->flags);
290 else
291 set_bit(AS_EIO, &mapping->flags);
292 }
293 unlock_page(page); 289 unlock_page(page);
294} 290}
295 291
@@ -1532,7 +1528,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
1532 pg_data_t *pgdat; 1528 pg_data_t *pgdat;
1533 cpumask_t mask; 1529 cpumask_t mask;
1534 1530
1535 if (action == CPU_ONLINE) { 1531 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
1536 for_each_online_pgdat(pgdat) { 1532 for_each_online_pgdat(pgdat) {
1537 mask = node_to_cpumask(pgdat->node_id); 1533 mask = node_to_cpumask(pgdat->node_id);
1538 if (any_online_cpu(mask) != NR_CPUS) 1534 if (any_online_cpu(mask) != NR_CPUS)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6c488d6ac425..38254297a494 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/sched.h>
15 16
16#ifdef CONFIG_VM_EVENT_COUNTERS 17#ifdef CONFIG_VM_EVENT_COUNTERS
17DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 18DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -281,6 +282,17 @@ EXPORT_SYMBOL(dec_zone_page_state);
281 282
282/* 283/*
283 * Update the zone counters for one cpu. 284 * Update the zone counters for one cpu.
285 *
286 * Note that refresh_cpu_vm_stats strives to only access
287 * node local memory. The per cpu pagesets on remote zones are placed
288 * in the memory local to the processor using that pageset. So the
289 * loop over all zones will access a series of cachelines local to
290 * the processor.
291 *
292 * The call to zone_page_state_add updates the cachelines with the
293 * statistics in the remote zone struct as well as the global cachelines
294 * with the global counters. These could cause remote node cache line
295 * bouncing and should therefore only be done when necessary.
284 */ 296 */
285void refresh_cpu_vm_stats(int cpu) 297void refresh_cpu_vm_stats(int cpu)
286{ 298{
@@ -289,21 +301,54 @@ void refresh_cpu_vm_stats(int cpu)
289 unsigned long flags; 301 unsigned long flags;
290 302
291 for_each_zone(zone) { 303 for_each_zone(zone) {
292 struct per_cpu_pageset *pcp; 304 struct per_cpu_pageset *p;
293 305
294 if (!populated_zone(zone)) 306 if (!populated_zone(zone))
295 continue; 307 continue;
296 308
297 pcp = zone_pcp(zone, cpu); 309 p = zone_pcp(zone, cpu);
298 310
299 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 311 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
300 if (pcp->vm_stat_diff[i]) { 312 if (p->vm_stat_diff[i]) {
301 local_irq_save(flags); 313 local_irq_save(flags);
302 zone_page_state_add(pcp->vm_stat_diff[i], 314 zone_page_state_add(p->vm_stat_diff[i],
303 zone, i); 315 zone, i);
304 pcp->vm_stat_diff[i] = 0; 316 p->vm_stat_diff[i] = 0;
317#ifdef CONFIG_NUMA
318 /* 3 seconds idle till flush */
319 p->expire = 3;
320#endif
305 local_irq_restore(flags); 321 local_irq_restore(flags);
306 } 322 }
323#ifdef CONFIG_NUMA
324 /*
325 * Deal with draining the remote pageset of this
326 * processor.
327 *
328 * Check if there are pages remaining in this pageset;
329 * if not then there is nothing to expire.
330 */
331 if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
332 continue;
333
334 /*
335 * We never drain zones local to this processor.
336 */
337 if (zone_to_nid(zone) == numa_node_id()) {
338 p->expire = 0;
339 continue;
340 }
341
342 p->expire--;
343 if (p->expire)
344 continue;
345
346 if (p->pcp[0].count)
347 drain_zone_pages(zone, p->pcp + 0);
348
349 if (p->pcp[1].count)
350 drain_zone_pages(zone, p->pcp + 1);
351#endif
307 } 352 }
308} 353}
309 354
@@ -640,6 +685,24 @@ const struct seq_operations vmstat_op = {
640#endif /* CONFIG_PROC_FS */ 685#endif /* CONFIG_PROC_FS */
641 686
642#ifdef CONFIG_SMP 687#ifdef CONFIG_SMP
688static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
689int sysctl_stat_interval __read_mostly = HZ;
690
691static void vmstat_update(struct work_struct *w)
692{
693 refresh_cpu_vm_stats(smp_processor_id());
694 schedule_delayed_work(&__get_cpu_var(vmstat_work),
695 sysctl_stat_interval);
696}
697
698static void __devinit start_cpu_timer(int cpu)
699{
700 struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);
701
702 INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update);
703 schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu);
704}
705
643/* 706/*
644 * Use the cpu notifier to ensure that the thresholds are recalculated 707 * Use the cpu notifier to ensure that the thresholds are recalculated
645 * when necessary. 708 * when necessary.
@@ -648,10 +711,24 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
648 unsigned long action, 711 unsigned long action,
649 void *hcpu) 712 void *hcpu)
650{ 713{
714 long cpu = (long)hcpu;
715
651 switch (action) { 716 switch (action) {
652 case CPU_UP_PREPARE: 717 case CPU_ONLINE:
653 case CPU_UP_CANCELED: 718 case CPU_ONLINE_FROZEN:
719 start_cpu_timer(cpu);
720 break;
721 case CPU_DOWN_PREPARE:
722 case CPU_DOWN_PREPARE_FROZEN:
723 cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
724 per_cpu(vmstat_work, cpu).work.func = NULL;
725 break;
726 case CPU_DOWN_FAILED:
727 case CPU_DOWN_FAILED_FROZEN:
728 start_cpu_timer(cpu);
729 break;
654 case CPU_DEAD: 730 case CPU_DEAD:
731 case CPU_DEAD_FROZEN:
655 refresh_zone_stat_thresholds(); 732 refresh_zone_stat_thresholds();
656 break; 733 break;
657 default: 734 default:
@@ -665,8 +742,13 @@ static struct notifier_block __cpuinitdata vmstat_notifier =
665 742
666int __init setup_vmstat(void) 743int __init setup_vmstat(void)
667{ 744{
745 int cpu;
746
668 refresh_zone_stat_thresholds(); 747 refresh_zone_stat_thresholds();
669 register_cpu_notifier(&vmstat_notifier); 748 register_cpu_notifier(&vmstat_notifier);
749
750 for_each_online_cpu(cpu)
751 start_cpu_timer(cpu);
670 return 0; 752 return 0;
671} 753}
672module_init(setup_vmstat) 754module_init(setup_vmstat)