Diffstat (limited to 'mm')
 -rw-r--r--  mm/Kconfig            |    2
 -rw-r--r--  mm/filemap.c          |  101
 -rw-r--r--  mm/filemap_xip.c      |    8
 -rw-r--r--  mm/hugetlb.c          |   33
 -rw-r--r--  mm/madvise.c          |    1
 -rw-r--r--  mm/memory.c           |    2
 -rw-r--r--  mm/memory_hotplug.c   |    2
 -rw-r--r--  mm/mlock.c            |   11
 -rw-r--r--  mm/mmap.c             |    3
 -rw-r--r--  mm/msync.c            |    1
 -rw-r--r--  mm/nommu.c            |    8
 -rw-r--r--  mm/page-writeback.c   |   64
 -rw-r--r--  mm/page_alloc.c       |  127
 -rw-r--r--  mm/rmap.c             |   67
 -rw-r--r--  mm/shmem.c            |    8
 -rw-r--r--  mm/slab.c             |  152
 -rw-r--r--  mm/slob.c             |   53
 -rw-r--r--  mm/slub.c             | 1361
 -rw-r--r--  mm/sparse.c           |   23
 -rw-r--r--  mm/swap.c             |    2
 -rw-r--r--  mm/thrash.c           |    5
 -rw-r--r--  mm/truncate.c         |    3
 -rw-r--r--  mm/vmalloc.c          |    9
 -rw-r--r--  mm/vmscan.c           |   10
 -rw-r--r--  mm/vmstat.c           |   96
 25 files changed, 1272 insertions(+), 880 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 1ac718f636ec..8ac412b45f18 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -166,5 +166,5 @@ config ZONE_DMA_FLAG | |||
166 | config NR_QUICK | 166 | config NR_QUICK |
167 | int | 167 | int |
168 | depends on QUICKLIST | 168 | depends on QUICKLIST |
169 | default "2" if (SUPERH && !SUPERH64) | ||
169 | default "1" | 170 | default "1" |
170 | |||
diff --git a/mm/filemap.c b/mm/filemap.c
index 5631d6b2a62d..edb1b0b5cc8d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -670,7 +670,8 @@ repeat: | |||
670 | page = find_lock_page(mapping, index); | 670 | page = find_lock_page(mapping, index); |
671 | if (!page) { | 671 | if (!page) { |
672 | if (!cached_page) { | 672 | if (!cached_page) { |
673 | cached_page = alloc_page(gfp_mask); | 673 | cached_page = |
674 | __page_cache_alloc(gfp_mask); | ||
674 | if (!cached_page) | 675 | if (!cached_page) |
675 | return NULL; | 676 | return NULL; |
676 | } | 677 | } |
@@ -750,6 +751,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
750 | read_unlock_irq(&mapping->tree_lock); | 751 | read_unlock_irq(&mapping->tree_lock); |
751 | return i; | 752 | return i; |
752 | } | 753 | } |
754 | EXPORT_SYMBOL(find_get_pages_contig); | ||
753 | 755 | ||
754 | /** | 756 | /** |
755 | * find_get_pages_tag - find and return pages that match @tag | 757 | * find_get_pages_tag - find and return pages that match @tag |
@@ -778,6 +780,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | |||
778 | read_unlock_irq(&mapping->tree_lock); | 780 | read_unlock_irq(&mapping->tree_lock); |
779 | return ret; | 781 | return ret; |
780 | } | 782 | } |
783 | EXPORT_SYMBOL(find_get_pages_tag); | ||
781 | 784 | ||
782 | /** | 785 | /** |
783 | * grab_cache_page_nowait - returns locked page at given index in given cache | 786 | * grab_cache_page_nowait - returns locked page at given index in given cache |
@@ -1110,6 +1113,45 @@ success: | |||
1110 | return size; | 1113 | return size; |
1111 | } | 1114 | } |
1112 | 1115 | ||
1116 | /* | ||
1117 | * Performs necessary checks before doing a write | ||
1118 | * @iov: io vector request | ||
1119 | * @nr_segs: number of segments in the iovec | ||
1120 | * @count: number of bytes to write | ||
1121 | * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE | ||
1122 | * | ||
1123 | * Adjust number of segments and amount of bytes to write (nr_segs should be | ||
1124 | * properly initialized first). Returns appropriate error code that caller | ||
1125 | * should return or zero in case that write should be allowed. | ||
1126 | */ | ||
1127 | int generic_segment_checks(const struct iovec *iov, | ||
1128 | unsigned long *nr_segs, size_t *count, int access_flags) | ||
1129 | { | ||
1130 | unsigned long seg; | ||
1131 | size_t cnt = 0; | ||
1132 | for (seg = 0; seg < *nr_segs; seg++) { | ||
1133 | const struct iovec *iv = &iov[seg]; | ||
1134 | |||
1135 | /* | ||
1136 | * If any segment has a negative length, or the cumulative | ||
1137 | * length ever wraps negative then return -EINVAL. | ||
1138 | */ | ||
1139 | cnt += iv->iov_len; | ||
1140 | if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) | ||
1141 | return -EINVAL; | ||
1142 | if (access_ok(access_flags, iv->iov_base, iv->iov_len)) | ||
1143 | continue; | ||
1144 | if (seg == 0) | ||
1145 | return -EFAULT; | ||
1146 | *nr_segs = seg; | ||
1147 | cnt -= iv->iov_len; /* This segment is no good */ | ||
1148 | break; | ||
1149 | } | ||
1150 | *count = cnt; | ||
1151 | return 0; | ||
1152 | } | ||
1153 | EXPORT_SYMBOL(generic_segment_checks); | ||
1154 | |||
1113 | /** | 1155 | /** |
1114 | * generic_file_aio_read - generic filesystem read routine | 1156 | * generic_file_aio_read - generic filesystem read routine |
1115 | * @iocb: kernel I/O control block | 1157 | * @iocb: kernel I/O control block |
@@ -1131,24 +1173,9 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1131 | loff_t *ppos = &iocb->ki_pos; | 1173 | loff_t *ppos = &iocb->ki_pos; |
1132 | 1174 | ||
1133 | count = 0; | 1175 | count = 0; |
1134 | for (seg = 0; seg < nr_segs; seg++) { | 1176 | retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); |
1135 | const struct iovec *iv = &iov[seg]; | 1177 | if (retval) |
1136 | 1178 | return retval; | |
1137 | /* | ||
1138 | * If any segment has a negative length, or the cumulative | ||
1139 | * length ever wraps negative then return -EINVAL. | ||
1140 | */ | ||
1141 | count += iv->iov_len; | ||
1142 | if (unlikely((ssize_t)(count|iv->iov_len) < 0)) | ||
1143 | return -EINVAL; | ||
1144 | if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) | ||
1145 | continue; | ||
1146 | if (seg == 0) | ||
1147 | return -EFAULT; | ||
1148 | nr_segs = seg; | ||
1149 | count -= iv->iov_len; /* This segment is no good */ | ||
1150 | break; | ||
1151 | } | ||
1152 | 1179 | ||
1153 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 1180 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ |
1154 | if (filp->f_flags & O_DIRECT) { | 1181 | if (filp->f_flags & O_DIRECT) { |
@@ -1758,7 +1785,7 @@ struct page *read_cache_page_async(struct address_space *mapping, | |||
1758 | retry: | 1785 | retry: |
1759 | page = __read_cache_page(mapping, index, filler, data); | 1786 | page = __read_cache_page(mapping, index, filler, data); |
1760 | if (IS_ERR(page)) | 1787 | if (IS_ERR(page)) |
1761 | goto out; | 1788 | return page; |
1762 | mark_page_accessed(page); | 1789 | mark_page_accessed(page); |
1763 | if (PageUptodate(page)) | 1790 | if (PageUptodate(page)) |
1764 | goto out; | 1791 | goto out; |
@@ -1776,9 +1803,9 @@ retry: | |||
1776 | err = filler(data, page); | 1803 | err = filler(data, page); |
1777 | if (err < 0) { | 1804 | if (err < 0) { |
1778 | page_cache_release(page); | 1805 | page_cache_release(page); |
1779 | page = ERR_PTR(err); | 1806 | return ERR_PTR(err); |
1780 | } | 1807 | } |
1781 | out: | 1808 | out: |
1782 | mark_page_accessed(page); | 1809 | mark_page_accessed(page); |
1783 | return page; | 1810 | return page; |
1784 | } | 1811 | } |
@@ -2218,30 +2245,14 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2218 | size_t ocount; /* original count */ | 2245 | size_t ocount; /* original count */ |
2219 | size_t count; /* after file limit checks */ | 2246 | size_t count; /* after file limit checks */ |
2220 | struct inode *inode = mapping->host; | 2247 | struct inode *inode = mapping->host; |
2221 | unsigned long seg; | ||
2222 | loff_t pos; | 2248 | loff_t pos; |
2223 | ssize_t written; | 2249 | ssize_t written; |
2224 | ssize_t err; | 2250 | ssize_t err; |
2225 | 2251 | ||
2226 | ocount = 0; | 2252 | ocount = 0; |
2227 | for (seg = 0; seg < nr_segs; seg++) { | 2253 | err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); |
2228 | const struct iovec *iv = &iov[seg]; | 2254 | if (err) |
2229 | 2255 | return err; | |
2230 | /* | ||
2231 | * If any segment has a negative length, or the cumulative | ||
2232 | * length ever wraps negative then return -EINVAL. | ||
2233 | */ | ||
2234 | ocount += iv->iov_len; | ||
2235 | if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) | ||
2236 | return -EINVAL; | ||
2237 | if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) | ||
2238 | continue; | ||
2239 | if (seg == 0) | ||
2240 | return -EFAULT; | ||
2241 | nr_segs = seg; | ||
2242 | ocount -= iv->iov_len; /* This segment is no good */ | ||
2243 | break; | ||
2244 | } | ||
2245 | 2256 | ||
2246 | count = ocount; | 2257 | count = ocount; |
2247 | pos = *ppos; | 2258 | pos = *ppos; |
@@ -2301,10 +2312,10 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2301 | * semantics. | 2312 | * semantics. |
2302 | */ | 2313 | */ |
2303 | endbyte = pos + written_buffered - written - 1; | 2314 | endbyte = pos + written_buffered - written - 1; |
2304 | err = do_sync_file_range(file, pos, endbyte, | 2315 | err = do_sync_mapping_range(file->f_mapping, pos, endbyte, |
2305 | SYNC_FILE_RANGE_WAIT_BEFORE| | 2316 | SYNC_FILE_RANGE_WAIT_BEFORE| |
2306 | SYNC_FILE_RANGE_WRITE| | 2317 | SYNC_FILE_RANGE_WRITE| |
2307 | SYNC_FILE_RANGE_WAIT_AFTER); | 2318 | SYNC_FILE_RANGE_WAIT_AFTER); |
2308 | if (err == 0) { | 2319 | if (err == 0) { |
2309 | written = written_buffered; | 2320 | written = written_buffered; |
2310 | invalidate_mapping_pages(mapping, | 2321 | invalidate_mapping_pages(mapping, |
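The new generic_segment_checks() helper added above replaces two open-coded copies of the same iovec validation (one in the read path, one in the write path). The core trick is that OR-ing the running byte total with the current segment length and casting to ssize_t goes negative if either value has its sign bit set, so one comparison catches both an absurdly large segment and cumulative wraparound. A minimal user-space sketch of that check, with a stubbed-out access_ok() and made-up sample iovecs (the stub and values are illustrative, not kernel code), looks roughly like this:

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/uio.h>		/* struct iovec */

	/* Stand-in for the kernel's access_ok(); always succeeds here. */
	static int access_ok_stub(const void *base, size_t len)
	{
		(void)base; (void)len;
		return 1;
	}

	/* Same shape as generic_segment_checks(): trims nr_segs, returns 0 or -errno. */
	static int segment_checks(const struct iovec *iov, unsigned long *nr_segs, size_t *count)
	{
		size_t cnt = 0;
		for (unsigned long seg = 0; seg < *nr_segs; seg++) {
			const struct iovec *iv = &iov[seg];
			cnt += iv->iov_len;
			/* A negative length or a cumulative wrap both set the sign bit. */
			if ((ssize_t)(cnt | iv->iov_len) < 0)
				return -22;		/* -EINVAL */
			if (access_ok_stub(iv->iov_base, iv->iov_len))
				continue;
			if (seg == 0)
				return -14;		/* -EFAULT */
			*nr_segs = seg;			/* stop before the bad segment */
			cnt -= iv->iov_len;
			break;
		}
		*count = cnt;
		return 0;
	}

	int main(void)
	{
		char a[8], b[8];
		struct iovec iov[2] = { { a, sizeof(a) }, { b, (size_t)-1 } };
		unsigned long nr = 2;
		size_t count;
		printf("ret=%d\n", segment_checks(iov, &nr, &count));	/* -22: second segment wraps */
		return 0;
	}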
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index cbb335813ec0..fa360e566d88 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/uio.h> | 14 | #include <linux/uio.h> |
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/sched.h> | ||
16 | #include <asm/tlbflush.h> | 17 | #include <asm/tlbflush.h> |
17 | #include "filemap.h" | 18 | #include "filemap.h" |
18 | 19 | ||
@@ -434,7 +435,6 @@ xip_truncate_page(struct address_space *mapping, loff_t from) | |||
434 | unsigned blocksize; | 435 | unsigned blocksize; |
435 | unsigned length; | 436 | unsigned length; |
436 | struct page *page; | 437 | struct page *page; |
437 | void *kaddr; | ||
438 | 438 | ||
439 | BUG_ON(!mapping->a_ops->get_xip_page); | 439 | BUG_ON(!mapping->a_ops->get_xip_page); |
440 | 440 | ||
@@ -458,11 +458,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from) | |||
458 | else | 458 | else |
459 | return PTR_ERR(page); | 459 | return PTR_ERR(page); |
460 | } | 460 | } |
461 | kaddr = kmap_atomic(page, KM_USER0); | 461 | zero_user_page(page, offset, length, KM_USER0); |
462 | memset(kaddr + offset, 0, length); | ||
463 | kunmap_atomic(kaddr, KM_USER0); | ||
464 | |||
465 | flush_dcache_page(page); | ||
466 | return 0; | 462 | return 0; |
467 | } | 463 | } |
468 | EXPORT_SYMBOL_GPL(xip_truncate_page); | 464 | EXPORT_SYMBOL_GPL(xip_truncate_page); |
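The xip_truncate_page() hunk above collapses an open-coded kmap/memset/kunmap/flush sequence into a single zero_user_page() call. A helper with that behaviour, written in terms of the same primitives the removed lines used, would look roughly like the kernel-context sketch below; this is an illustration of the pattern being factored out, based on the deleted code, not the actual helper in the tree:

	#include <linux/highmem.h>	/* kmap_atomic, kunmap_atomic, flush_dcache_page */
	#include <linux/string.h>	/* memset */

	/* Zero 'length' bytes of 'page' starting at 'offset', via an atomic kmap. */
	static void zero_user_page_sketch(struct page *page, unsigned int offset,
					  unsigned int length, enum km_type kmap_slot)
	{
		void *kaddr = kmap_atomic(page, kmap_slot);

		memset(kaddr + offset, 0, length);
		kunmap_atomic(kaddr, kmap_slot);
		flush_dcache_page(page);	/* keep the D-cache coherent for user mappings */
	}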
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 36db012b38dd..eb7180db3033 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -140,6 +140,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
140 | return page; | 140 | return page; |
141 | 141 | ||
142 | fail: | 142 | fail: |
143 | if (vma->vm_flags & VM_MAYSHARE) | ||
144 | resv_huge_pages++; | ||
143 | spin_unlock(&hugetlb_lock); | 145 | spin_unlock(&hugetlb_lock); |
144 | return NULL; | 146 | return NULL; |
145 | } | 147 | } |
@@ -172,6 +174,17 @@ static int __init hugetlb_setup(char *s) | |||
172 | } | 174 | } |
173 | __setup("hugepages=", hugetlb_setup); | 175 | __setup("hugepages=", hugetlb_setup); |
174 | 176 | ||
177 | static unsigned int cpuset_mems_nr(unsigned int *array) | ||
178 | { | ||
179 | int node; | ||
180 | unsigned int nr = 0; | ||
181 | |||
182 | for_each_node_mask(node, cpuset_current_mems_allowed) | ||
183 | nr += array[node]; | ||
184 | |||
185 | return nr; | ||
186 | } | ||
187 | |||
175 | #ifdef CONFIG_SYSCTL | 188 | #ifdef CONFIG_SYSCTL |
176 | static void update_and_free_page(struct page *page) | 189 | static void update_and_free_page(struct page *page) |
177 | { | 190 | { |
@@ -817,6 +830,26 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to) | |||
817 | chg = region_chg(&inode->i_mapping->private_list, from, to); | 830 | chg = region_chg(&inode->i_mapping->private_list, from, to); |
818 | if (chg < 0) | 831 | if (chg < 0) |
819 | return chg; | 832 | return chg; |
833 | /* | ||
834 | * When cpuset is configured, it breaks the strict hugetlb page | ||
835 | * reservation as the accounting is done on a global variable. Such | ||
836 | * reservation is completely rubbish in the presence of cpuset because | ||
837 | * the reservation is not checked against page availability for the | ||
838 | * current cpuset. Application can still potentially OOM'ed by kernel | ||
839 | * with lack of free htlb page in cpuset that the task is in. | ||
840 | * Attempt to enforce strict accounting with cpuset is almost | ||
841 | * impossible (or too ugly) because cpuset is too fluid that | ||
842 | * task or memory node can be dynamically moved between cpusets. | ||
843 | * | ||
844 | * The change of semantics for shared hugetlb mapping with cpuset is | ||
845 | * undesirable. However, in order to preserve some of the semantics, | ||
846 | * we fall back to check against current free page availability as | ||
847 | * a best attempt and hopefully to minimize the impact of changing | ||
848 | * semantics that cpuset has. | ||
849 | */ | ||
850 | if (chg > cpuset_mems_nr(free_huge_pages_node)) | ||
851 | return -ENOMEM; | ||
852 | |||
820 | ret = hugetlb_acct_memory(chg); | 853 | ret = hugetlb_acct_memory(chg); |
821 | if (ret < 0) | 854 | if (ret < 0) |
822 | return ret; | 855 | return ret; |
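The hugetlb.c hunks do two things: return a reserved page to the pool on the allocation failure path for VM_MAYSHARE mappings, and bound a new reservation by the huge pages actually free on the nodes the current cpuset allows. The second check reduces to summing a per-node counter over a node mask and comparing the result with the requested charge. A stripped-down, stand-alone illustration of that comparison (array size, mask representation, and sample values are made up) is:

	#include <stdio.h>

	#define MAX_NUMNODES 4

	/* Sum a per-node counter over the nodes the task is allowed to use. */
	static unsigned int nodes_total(const unsigned int per_node[], unsigned long allowed_mask)
	{
		unsigned int nr = 0;

		for (int node = 0; node < MAX_NUMNODES; node++)
			if (allowed_mask & (1UL << node))
				nr += per_node[node];
		return nr;
	}

	int main(void)
	{
		unsigned int free_huge_pages_node[MAX_NUMNODES] = { 10, 0, 4, 2 };
		long chg = 16;			/* pages the mapping wants to reserve */
		unsigned long allowed = 0x5;	/* cpuset allows nodes 0 and 2 only */

		if (chg > nodes_total(free_huge_pages_node, allowed))
			puts("reservation refused: not enough free huge pages in this cpuset");
		return 0;
	}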
diff --git a/mm/madvise.c b/mm/madvise.c
index e75096b5a6d3..60542d006ec1 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
12 | #include <linux/hugetlb.h> | 12 | #include <linux/hugetlb.h> |
13 | #include <linux/sched.h> | ||
13 | 14 | ||
14 | /* | 15 | /* |
15 | * Any behaviour which results in changes to the vma->vm_flags needs to | 16 | * Any behaviour which results in changes to the vma->vm_flags needs to |
diff --git a/mm/memory.c b/mm/memory.c
index 1d647ab0ee72..cb94488ab96d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -481,7 +481,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
481 | page = vm_normal_page(vma, addr, pte); | 481 | page = vm_normal_page(vma, addr, pte); |
482 | if (page) { | 482 | if (page) { |
483 | get_page(page); | 483 | get_page(page); |
484 | page_dup_rmap(page); | 484 | page_dup_rmap(page, vma, addr); |
485 | rss[!!PageAnon(page)]++; | 485 | rss[!!PageAnon(page)]++; |
486 | } | 486 | } |
487 | 487 | ||
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 84279127fcd3..df9d554bea30 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -65,7 +65,7 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
65 | int zone_type; | 65 | int zone_type; |
66 | 66 | ||
67 | zone_type = zone - pgdat->node_zones; | 67 | zone_type = zone - pgdat->node_zones; |
68 | if (!populated_zone(zone)) { | 68 | if (!zone->wait_table) { |
69 | int ret = 0; | 69 | int ret = 0; |
70 | ret = init_currently_empty_zone(zone, phys_start_pfn, | 70 | ret = init_currently_empty_zone(zone, phys_start_pfn, |
71 | nr_pages, MEMMAP_HOTPLUG); | 71 | nr_pages, MEMMAP_HOTPLUG); |
diff --git a/mm/mlock.c b/mm/mlock.c
index 3446b7ef731e..4d3fea267e0d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -10,7 +10,18 @@ | |||
10 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
12 | #include <linux/syscalls.h> | 12 | #include <linux/syscalls.h> |
13 | #include <linux/sched.h> | ||
14 | #include <linux/module.h> | ||
13 | 15 | ||
16 | int can_do_mlock(void) | ||
17 | { | ||
18 | if (capable(CAP_IPC_LOCK)) | ||
19 | return 1; | ||
20 | if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) | ||
21 | return 1; | ||
22 | return 0; | ||
23 | } | ||
24 | EXPORT_SYMBOL(can_do_mlock); | ||
14 | 25 | ||
15 | static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | 26 | static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, |
16 | unsigned long start, unsigned long end, unsigned int newflags) | 27 | unsigned long start, unsigned long end, unsigned int newflags) |
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1366,7 +1366,6 @@ unsigned long | |||
1366 | get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | 1366 | get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, |
1367 | unsigned long pgoff, unsigned long flags) | 1367 | unsigned long pgoff, unsigned long flags) |
1368 | { | 1368 | { |
1369 | unsigned long ret; | ||
1370 | unsigned long (*get_area)(struct file *, unsigned long, | 1369 | unsigned long (*get_area)(struct file *, unsigned long, |
1371 | unsigned long, unsigned long, unsigned long); | 1370 | unsigned long, unsigned long, unsigned long); |
1372 | 1371 | ||
@@ -1721,7 +1720,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1721 | 1720 | ||
1722 | /* | 1721 | /* |
1723 | * Split a vma into two pieces at address 'addr', a new vma is allocated | 1722 | * Split a vma into two pieces at address 'addr', a new vma is allocated |
1724 | * either for the first part or the the tail. | 1723 | * either for the first part or the tail. |
1725 | */ | 1724 | */ |
1726 | int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | 1725 | int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, |
1727 | unsigned long addr, int new_below) | 1726 | unsigned long addr, int new_below) |
diff --git a/mm/msync.c b/mm/msync.c
index 358d73cf7b78..144a7570535d 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/mman.h> | 12 | #include <linux/mman.h> |
13 | #include <linux/file.h> | 13 | #include <linux/file.h> |
14 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
15 | #include <linux/sched.h> | ||
15 | 16 | ||
16 | /* | 17 | /* |
17 | * MS_SYNC syncs the entire file - including mappings. | 18 | * MS_SYNC syncs the entire file - including mappings. |
diff --git a/mm/nommu.c b/mm/nommu.c
index 1f60194d9b9b..2b16b00a5b11 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -262,6 +262,14 @@ void vunmap(void *addr) | |||
262 | } | 262 | } |
263 | 263 | ||
264 | /* | 264 | /* |
265 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to | ||
266 | * have one. | ||
267 | */ | ||
268 | void __attribute__((weak)) vmalloc_sync_all(void) | ||
269 | { | ||
270 | } | ||
271 | |||
272 | /* | ||
265 | * sys_brk() for the most part doesn't need the global kernel | 273 | * sys_brk() for the most part doesn't need the global kernel |
266 | * lock, except when an application is doing something nasty | 274 | * lock, except when an application is doing something nasty |
267 | * like trying to un-brk an area that has already been mapped | 275 | * like trying to un-brk an area that has already been mapped |
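The nommu.c hunk adds a weak definition of vmalloc_sync_all() so that architectures which never provide their own still link. The mechanism behind it is the GCC/ELF weak-symbol rule: a weak definition is used only when no strong definition of the same name appears anywhere in the link. A small, self-contained illustration of that rule, with arbitrary names (the commented-out second file is the hypothetical override):

	/* default.c: weak fallback, used only when nothing stronger is linked in. */
	#include <stdio.h>

	void __attribute__((weak)) sync_mappings(void)
	{
		/* no-op fallback */
	}

	int main(void)
	{
		sync_mappings();
		puts("done");
		return 0;
	}

	/* arch.c (optional): linking this strong definition in replaces the weak stub.
	 *
	 * void sync_mappings(void)
	 * {
	 *	puts("arch-specific sync");
	 * }
	 */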
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 029dfad5a235..eec1481ba44f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -588,31 +588,27 @@ void __init page_writeback_init(void) | |||
588 | } | 588 | } |
589 | 589 | ||
590 | /** | 590 | /** |
591 | * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. | 591 | * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. |
592 | * @mapping: address space structure to write | 592 | * @mapping: address space structure to write |
593 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write | 593 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write |
594 | * @writepage: function called for each page | ||
595 | * @data: data passed to writepage function | ||
594 | * | 596 | * |
595 | * This is a library function, which implements the writepages() | 597 | * If a page is already under I/O, write_cache_pages() skips it, even |
596 | * address_space_operation. | ||
597 | * | ||
598 | * If a page is already under I/O, generic_writepages() skips it, even | ||
599 | * if it's dirty. This is desirable behaviour for memory-cleaning writeback, | 598 | * if it's dirty. This is desirable behaviour for memory-cleaning writeback, |
600 | * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() | 599 | * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() |
601 | * and msync() need to guarantee that all the data which was dirty at the time | 600 | * and msync() need to guarantee that all the data which was dirty at the time |
602 | * the call was made get new I/O started against them. If wbc->sync_mode is | 601 | * the call was made get new I/O started against them. If wbc->sync_mode is |
603 | * WB_SYNC_ALL then we were called for data integrity and we must wait for | 602 | * WB_SYNC_ALL then we were called for data integrity and we must wait for |
604 | * existing IO to complete. | 603 | * existing IO to complete. |
605 | * | ||
606 | * Derived from mpage_writepages() - if you fix this you should check that | ||
607 | * also! | ||
608 | */ | 604 | */ |
609 | int generic_writepages(struct address_space *mapping, | 605 | int write_cache_pages(struct address_space *mapping, |
610 | struct writeback_control *wbc) | 606 | struct writeback_control *wbc, writepage_t writepage, |
607 | void *data) | ||
611 | { | 608 | { |
612 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 609 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
613 | int ret = 0; | 610 | int ret = 0; |
614 | int done = 0; | 611 | int done = 0; |
615 | int (*writepage)(struct page *page, struct writeback_control *wbc); | ||
616 | struct pagevec pvec; | 612 | struct pagevec pvec; |
617 | int nr_pages; | 613 | int nr_pages; |
618 | pgoff_t index; | 614 | pgoff_t index; |
@@ -625,12 +621,6 @@ int generic_writepages(struct address_space *mapping, | |||
625 | return 0; | 621 | return 0; |
626 | } | 622 | } |
627 | 623 | ||
628 | writepage = mapping->a_ops->writepage; | ||
629 | |||
630 | /* deal with chardevs and other special file */ | ||
631 | if (!writepage) | ||
632 | return 0; | ||
633 | |||
634 | pagevec_init(&pvec, 0); | 624 | pagevec_init(&pvec, 0); |
635 | if (wbc->range_cyclic) { | 625 | if (wbc->range_cyclic) { |
636 | index = mapping->writeback_index; /* Start from prev offset */ | 626 | index = mapping->writeback_index; /* Start from prev offset */ |
@@ -682,13 +672,7 @@ retry: | |||
682 | continue; | 672 | continue; |
683 | } | 673 | } |
684 | 674 | ||
685 | ret = (*writepage)(page, wbc); | 675 | ret = (*writepage)(page, wbc, data); |
686 | if (ret) { | ||
687 | if (ret == -ENOSPC) | ||
688 | set_bit(AS_ENOSPC, &mapping->flags); | ||
689 | else | ||
690 | set_bit(AS_EIO, &mapping->flags); | ||
691 | } | ||
692 | 676 | ||
693 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) | 677 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) |
694 | unlock_page(page); | 678 | unlock_page(page); |
@@ -715,6 +699,38 @@ retry: | |||
715 | mapping->writeback_index = index; | 699 | mapping->writeback_index = index; |
716 | return ret; | 700 | return ret; |
717 | } | 701 | } |
702 | EXPORT_SYMBOL(write_cache_pages); | ||
703 | |||
704 | /* | ||
705 | * Function used by generic_writepages to call the real writepage | ||
706 | * function and set the mapping flags on error | ||
707 | */ | ||
708 | static int __writepage(struct page *page, struct writeback_control *wbc, | ||
709 | void *data) | ||
710 | { | ||
711 | struct address_space *mapping = data; | ||
712 | int ret = mapping->a_ops->writepage(page, wbc); | ||
713 | mapping_set_error(mapping, ret); | ||
714 | return ret; | ||
715 | } | ||
716 | |||
717 | /** | ||
718 | * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. | ||
719 | * @mapping: address space structure to write | ||
720 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write | ||
721 | * | ||
722 | * This is a library function, which implements the writepages() | ||
723 | * address_space_operation. | ||
724 | */ | ||
725 | int generic_writepages(struct address_space *mapping, | ||
726 | struct writeback_control *wbc) | ||
727 | { | ||
728 | /* deal with chardevs and other special file */ | ||
729 | if (!mapping->a_ops->writepage) | ||
730 | return 0; | ||
731 | |||
732 | return write_cache_pages(mapping, wbc, __writepage, mapping); | ||
733 | } | ||
718 | 734 | ||
719 | EXPORT_SYMBOL(generic_writepages); | 735 | EXPORT_SYMBOL(generic_writepages); |
720 | 736 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 59164313167f..bd8e33582d25 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -103,7 +103,7 @@ int min_free_kbytes = 1024; | |||
103 | 103 | ||
104 | unsigned long __meminitdata nr_kernel_pages; | 104 | unsigned long __meminitdata nr_kernel_pages; |
105 | unsigned long __meminitdata nr_all_pages; | 105 | unsigned long __meminitdata nr_all_pages; |
106 | static unsigned long __initdata dma_reserve; | 106 | static unsigned long __meminitdata dma_reserve; |
107 | 107 | ||
108 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 108 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
109 | /* | 109 | /* |
@@ -126,16 +126,21 @@ static unsigned long __initdata dma_reserve; | |||
126 | #endif | 126 | #endif |
127 | #endif | 127 | #endif |
128 | 128 | ||
129 | struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; | 129 | struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; |
130 | int __initdata nr_nodemap_entries; | 130 | int __meminitdata nr_nodemap_entries; |
131 | unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 131 | unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
132 | unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 132 | unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
133 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | 133 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE |
134 | unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; | 134 | unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; |
135 | unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; | 135 | unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; |
136 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 136 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
137 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | 137 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ |
138 | 138 | ||
139 | #if MAX_NUMNODES > 1 | ||
140 | int nr_node_ids __read_mostly = MAX_NUMNODES; | ||
141 | EXPORT_SYMBOL(nr_node_ids); | ||
142 | #endif | ||
143 | |||
139 | #ifdef CONFIG_DEBUG_VM | 144 | #ifdef CONFIG_DEBUG_VM |
140 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 145 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
141 | { | 146 | { |
@@ -669,65 +674,28 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
669 | return i; | 674 | return i; |
670 | } | 675 | } |
671 | 676 | ||
672 | #if MAX_NUMNODES > 1 | ||
673 | int nr_node_ids __read_mostly = MAX_NUMNODES; | ||
674 | EXPORT_SYMBOL(nr_node_ids); | ||
675 | |||
676 | /* | ||
677 | * Figure out the number of possible node ids. | ||
678 | */ | ||
679 | static void __init setup_nr_node_ids(void) | ||
680 | { | ||
681 | unsigned int node; | ||
682 | unsigned int highest = 0; | ||
683 | |||
684 | for_each_node_mask(node, node_possible_map) | ||
685 | highest = node; | ||
686 | nr_node_ids = highest + 1; | ||
687 | } | ||
688 | #else | ||
689 | static void __init setup_nr_node_ids(void) {} | ||
690 | #endif | ||
691 | |||
692 | #ifdef CONFIG_NUMA | 677 | #ifdef CONFIG_NUMA |
693 | /* | 678 | /* |
694 | * Called from the slab reaper to drain pagesets on a particular node that | 679 | * Called from the vmstat counter updater to drain pagesets of this |
695 | * belongs to the currently executing processor. | 680 | * currently executing processor on remote nodes after they have |
681 | * expired. | ||
682 | * | ||
696 | * Note that this function must be called with the thread pinned to | 683 | * Note that this function must be called with the thread pinned to |
697 | * a single processor. | 684 | * a single processor. |
698 | */ | 685 | */ |
699 | void drain_node_pages(int nodeid) | 686 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) |
700 | { | 687 | { |
701 | int i; | ||
702 | enum zone_type z; | ||
703 | unsigned long flags; | 688 | unsigned long flags; |
689 | int to_drain; | ||
704 | 690 | ||
705 | for (z = 0; z < MAX_NR_ZONES; z++) { | 691 | local_irq_save(flags); |
706 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; | 692 | if (pcp->count >= pcp->batch) |
707 | struct per_cpu_pageset *pset; | 693 | to_drain = pcp->batch; |
708 | 694 | else | |
709 | if (!populated_zone(zone)) | 695 | to_drain = pcp->count; |
710 | continue; | 696 | free_pages_bulk(zone, to_drain, &pcp->list, 0); |
711 | 697 | pcp->count -= to_drain; | |
712 | pset = zone_pcp(zone, smp_processor_id()); | 698 | local_irq_restore(flags); |
713 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | ||
714 | struct per_cpu_pages *pcp; | ||
715 | |||
716 | pcp = &pset->pcp[i]; | ||
717 | if (pcp->count) { | ||
718 | int to_drain; | ||
719 | |||
720 | local_irq_save(flags); | ||
721 | if (pcp->count >= pcp->batch) | ||
722 | to_drain = pcp->batch; | ||
723 | else | ||
724 | to_drain = pcp->count; | ||
725 | free_pages_bulk(zone, to_drain, &pcp->list, 0); | ||
726 | pcp->count -= to_drain; | ||
727 | local_irq_restore(flags); | ||
728 | } | ||
729 | } | ||
730 | } | ||
731 | } | 699 | } |
732 | #endif | 700 | #endif |
733 | 701 | ||
@@ -2148,11 +2116,14 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | |||
2148 | 2116 | ||
2149 | switch (action) { | 2117 | switch (action) { |
2150 | case CPU_UP_PREPARE: | 2118 | case CPU_UP_PREPARE: |
2119 | case CPU_UP_PREPARE_FROZEN: | ||
2151 | if (process_zones(cpu)) | 2120 | if (process_zones(cpu)) |
2152 | ret = NOTIFY_BAD; | 2121 | ret = NOTIFY_BAD; |
2153 | break; | 2122 | break; |
2154 | case CPU_UP_CANCELED: | 2123 | case CPU_UP_CANCELED: |
2124 | case CPU_UP_CANCELED_FROZEN: | ||
2155 | case CPU_DEAD: | 2125 | case CPU_DEAD: |
2126 | case CPU_DEAD_FROZEN: | ||
2156 | free_zone_pagesets(cpu); | 2127 | free_zone_pagesets(cpu); |
2157 | break; | 2128 | break; |
2158 | default: | 2129 | default: |
@@ -2179,7 +2150,7 @@ void __init setup_per_cpu_pageset(void) | |||
2179 | 2150 | ||
2180 | #endif | 2151 | #endif |
2181 | 2152 | ||
2182 | static __meminit | 2153 | static noinline __init_refok |
2183 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 2154 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
2184 | { | 2155 | { |
2185 | int i; | 2156 | int i; |
@@ -2267,7 +2238,7 @@ __meminit int init_currently_empty_zone(struct zone *zone, | |||
2267 | * Basic iterator support. Return the first range of PFNs for a node | 2238 | * Basic iterator support. Return the first range of PFNs for a node |
2268 | * Note: nid == MAX_NUMNODES returns first region regardless of node | 2239 | * Note: nid == MAX_NUMNODES returns first region regardless of node |
2269 | */ | 2240 | */ |
2270 | static int __init first_active_region_index_in_nid(int nid) | 2241 | static int __meminit first_active_region_index_in_nid(int nid) |
2271 | { | 2242 | { |
2272 | int i; | 2243 | int i; |
2273 | 2244 | ||
@@ -2282,7 +2253,7 @@ static int __init first_active_region_index_in_nid(int nid) | |||
2282 | * Basic iterator support. Return the next active range of PFNs for a node | 2253 | * Basic iterator support. Return the next active range of PFNs for a node |
2283 | * Note: nid == MAX_NUMNODES returns next region regardles of node | 2254 | * Note: nid == MAX_NUMNODES returns next region regardles of node |
2284 | */ | 2255 | */ |
2285 | static int __init next_active_region_index_in_nid(int index, int nid) | 2256 | static int __meminit next_active_region_index_in_nid(int index, int nid) |
2286 | { | 2257 | { |
2287 | for (index = index + 1; index < nr_nodemap_entries; index++) | 2258 | for (index = index + 1; index < nr_nodemap_entries; index++) |
2288 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) | 2259 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) |
@@ -2298,7 +2269,7 @@ static int __init next_active_region_index_in_nid(int index, int nid) | |||
2298 | * was used and there are no special requirements, this is a convenient | 2269 | * was used and there are no special requirements, this is a convenient |
2299 | * alternative | 2270 | * alternative |
2300 | */ | 2271 | */ |
2301 | int __init early_pfn_to_nid(unsigned long pfn) | 2272 | int __meminit early_pfn_to_nid(unsigned long pfn) |
2302 | { | 2273 | { |
2303 | int i; | 2274 | int i; |
2304 | 2275 | ||
@@ -2435,7 +2406,7 @@ static void __init account_node_boundary(unsigned int nid, | |||
2435 | * with no available memory, a warning is printed and the start and end | 2406 | * with no available memory, a warning is printed and the start and end |
2436 | * PFNs will be 0. | 2407 | * PFNs will be 0. |
2437 | */ | 2408 | */ |
2438 | void __init get_pfn_range_for_nid(unsigned int nid, | 2409 | void __meminit get_pfn_range_for_nid(unsigned int nid, |
2439 | unsigned long *start_pfn, unsigned long *end_pfn) | 2410 | unsigned long *start_pfn, unsigned long *end_pfn) |
2440 | { | 2411 | { |
2441 | int i; | 2412 | int i; |
@@ -2460,7 +2431,7 @@ void __init get_pfn_range_for_nid(unsigned int nid, | |||
2460 | * Return the number of pages a zone spans in a node, including holes | 2431 | * Return the number of pages a zone spans in a node, including holes |
2461 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | 2432 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() |
2462 | */ | 2433 | */ |
2463 | unsigned long __init zone_spanned_pages_in_node(int nid, | 2434 | unsigned long __meminit zone_spanned_pages_in_node(int nid, |
2464 | unsigned long zone_type, | 2435 | unsigned long zone_type, |
2465 | unsigned long *ignored) | 2436 | unsigned long *ignored) |
2466 | { | 2437 | { |
@@ -2488,7 +2459,7 @@ unsigned long __init zone_spanned_pages_in_node(int nid, | |||
2488 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 2459 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
2489 | * then all holes in the requested range will be accounted for. | 2460 | * then all holes in the requested range will be accounted for. |
2490 | */ | 2461 | */ |
2491 | unsigned long __init __absent_pages_in_range(int nid, | 2462 | unsigned long __meminit __absent_pages_in_range(int nid, |
2492 | unsigned long range_start_pfn, | 2463 | unsigned long range_start_pfn, |
2493 | unsigned long range_end_pfn) | 2464 | unsigned long range_end_pfn) |
2494 | { | 2465 | { |
@@ -2548,7 +2519,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, | |||
2548 | } | 2519 | } |
2549 | 2520 | ||
2550 | /* Return the number of page frames in holes in a zone on a node */ | 2521 | /* Return the number of page frames in holes in a zone on a node */ |
2551 | unsigned long __init zone_absent_pages_in_node(int nid, | 2522 | unsigned long __meminit zone_absent_pages_in_node(int nid, |
2552 | unsigned long zone_type, | 2523 | unsigned long zone_type, |
2553 | unsigned long *ignored) | 2524 | unsigned long *ignored) |
2554 | { | 2525 | { |
@@ -2584,7 +2555,7 @@ static inline unsigned long zone_absent_pages_in_node(int nid, | |||
2584 | 2555 | ||
2585 | #endif | 2556 | #endif |
2586 | 2557 | ||
2587 | static void __init calculate_node_totalpages(struct pglist_data *pgdat, | 2558 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, |
2588 | unsigned long *zones_size, unsigned long *zholes_size) | 2559 | unsigned long *zones_size, unsigned long *zholes_size) |
2589 | { | 2560 | { |
2590 | unsigned long realtotalpages, totalpages = 0; | 2561 | unsigned long realtotalpages, totalpages = 0; |
@@ -2692,7 +2663,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2692 | } | 2663 | } |
2693 | } | 2664 | } |
2694 | 2665 | ||
2695 | static void __init alloc_node_mem_map(struct pglist_data *pgdat) | 2666 | static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) |
2696 | { | 2667 | { |
2697 | /* Skip empty nodes */ | 2668 | /* Skip empty nodes */ |
2698 | if (!pgdat->node_spanned_pages) | 2669 | if (!pgdat->node_spanned_pages) |
@@ -2718,7 +2689,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) | |||
2718 | map = alloc_bootmem_node(pgdat, size); | 2689 | map = alloc_bootmem_node(pgdat, size); |
2719 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 2690 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); |
2720 | } | 2691 | } |
2721 | #ifdef CONFIG_FLATMEM | 2692 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
2722 | /* | 2693 | /* |
2723 | * With no DISCONTIG, the global mem_map is just set as node 0's | 2694 | * With no DISCONTIG, the global mem_map is just set as node 0's |
2724 | */ | 2695 | */ |
@@ -2747,6 +2718,26 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, | |||
2747 | } | 2718 | } |
2748 | 2719 | ||
2749 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 2720 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
2721 | |||
2722 | #if MAX_NUMNODES > 1 | ||
2723 | /* | ||
2724 | * Figure out the number of possible node ids. | ||
2725 | */ | ||
2726 | static void __init setup_nr_node_ids(void) | ||
2727 | { | ||
2728 | unsigned int node; | ||
2729 | unsigned int highest = 0; | ||
2730 | |||
2731 | for_each_node_mask(node, node_possible_map) | ||
2732 | highest = node; | ||
2733 | nr_node_ids = highest + 1; | ||
2734 | } | ||
2735 | #else | ||
2736 | static inline void setup_nr_node_ids(void) | ||
2737 | { | ||
2738 | } | ||
2739 | #endif | ||
2740 | |||
2750 | /** | 2741 | /** |
2751 | * add_active_range - Register a range of PFNs backed by physical memory | 2742 | * add_active_range - Register a range of PFNs backed by physical memory |
2752 | * @nid: The node ID the range resides on | 2743 | * @nid: The node ID the range resides on |
@@ -3012,7 +3003,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, | |||
3012 | { | 3003 | { |
3013 | int cpu = (unsigned long)hcpu; | 3004 | int cpu = (unsigned long)hcpu; |
3014 | 3005 | ||
3015 | if (action == CPU_DEAD) { | 3006 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
3016 | local_irq_disable(); | 3007 | local_irq_disable(); |
3017 | __drain_pages(cpu); | 3008 | __drain_pages(cpu); |
3018 | vm_events_fold_cpu(cpu); | 3009 | vm_events_fold_cpu(cpu); |
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -162,12 +162,10 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
162 | static void anon_vma_ctor(void *data, struct kmem_cache *cachep, | 162 | static void anon_vma_ctor(void *data, struct kmem_cache *cachep, |
163 | unsigned long flags) | 163 | unsigned long flags) |
164 | { | 164 | { |
165 | if (flags & SLAB_CTOR_CONSTRUCTOR) { | 165 | struct anon_vma *anon_vma = data; |
166 | struct anon_vma *anon_vma = data; | ||
167 | 166 | ||
168 | spin_lock_init(&anon_vma->lock); | 167 | spin_lock_init(&anon_vma->lock); |
169 | INIT_LIST_HEAD(&anon_vma->head); | 168 | INIT_LIST_HEAD(&anon_vma->head); |
170 | } | ||
171 | } | 169 | } |
172 | 170 | ||
173 | void __init anon_vma_init(void) | 171 | void __init anon_vma_init(void) |
@@ -505,6 +503,7 @@ int page_mkclean(struct page *page) | |||
505 | 503 | ||
506 | return ret; | 504 | return ret; |
507 | } | 505 | } |
506 | EXPORT_SYMBOL_GPL(page_mkclean); | ||
508 | 507 | ||
509 | /** | 508 | /** |
510 | * page_set_anon_rmap - setup new anonymous rmap | 509 | * page_set_anon_rmap - setup new anonymous rmap |
@@ -531,19 +530,51 @@ static void __page_set_anon_rmap(struct page *page, | |||
531 | } | 530 | } |
532 | 531 | ||
533 | /** | 532 | /** |
533 | * page_set_anon_rmap - sanity check anonymous rmap addition | ||
534 | * @page: the page to add the mapping to | ||
535 | * @vma: the vm area in which the mapping is added | ||
536 | * @address: the user virtual address mapped | ||
537 | */ | ||
538 | static void __page_check_anon_rmap(struct page *page, | ||
539 | struct vm_area_struct *vma, unsigned long address) | ||
540 | { | ||
541 | #ifdef CONFIG_DEBUG_VM | ||
542 | /* | ||
543 | * The page's anon-rmap details (mapping and index) are guaranteed to | ||
544 | * be set up correctly at this point. | ||
545 | * | ||
546 | * We have exclusion against page_add_anon_rmap because the caller | ||
547 | * always holds the page locked, except if called from page_dup_rmap, | ||
548 | * in which case the page is already known to be setup. | ||
549 | * | ||
550 | * We have exclusion against page_add_new_anon_rmap because those pages | ||
551 | * are initially only visible via the pagetables, and the pte is locked | ||
552 | * over the call to page_add_new_anon_rmap. | ||
553 | */ | ||
554 | struct anon_vma *anon_vma = vma->anon_vma; | ||
555 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
556 | BUG_ON(page->mapping != (struct address_space *)anon_vma); | ||
557 | BUG_ON(page->index != linear_page_index(vma, address)); | ||
558 | #endif | ||
559 | } | ||
560 | |||
561 | /** | ||
534 | * page_add_anon_rmap - add pte mapping to an anonymous page | 562 | * page_add_anon_rmap - add pte mapping to an anonymous page |
535 | * @page: the page to add the mapping to | 563 | * @page: the page to add the mapping to |
536 | * @vma: the vm area in which the mapping is added | 564 | * @vma: the vm area in which the mapping is added |
537 | * @address: the user virtual address mapped | 565 | * @address: the user virtual address mapped |
538 | * | 566 | * |
539 | * The caller needs to hold the pte lock. | 567 | * The caller needs to hold the pte lock and the page must be locked. |
540 | */ | 568 | */ |
541 | void page_add_anon_rmap(struct page *page, | 569 | void page_add_anon_rmap(struct page *page, |
542 | struct vm_area_struct *vma, unsigned long address) | 570 | struct vm_area_struct *vma, unsigned long address) |
543 | { | 571 | { |
572 | VM_BUG_ON(!PageLocked(page)); | ||
573 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | ||
544 | if (atomic_inc_and_test(&page->_mapcount)) | 574 | if (atomic_inc_and_test(&page->_mapcount)) |
545 | __page_set_anon_rmap(page, vma, address); | 575 | __page_set_anon_rmap(page, vma, address); |
546 | /* else checking page index and mapping is racy */ | 576 | else |
577 | __page_check_anon_rmap(page, vma, address); | ||
547 | } | 578 | } |
548 | 579 | ||
549 | /* | 580 | /* |
@@ -554,10 +585,12 @@ void page_add_anon_rmap(struct page *page, | |||
554 | * | 585 | * |
555 | * Same as page_add_anon_rmap but must only be called on *new* pages. | 586 | * Same as page_add_anon_rmap but must only be called on *new* pages. |
556 | * This means the inc-and-test can be bypassed. | 587 | * This means the inc-and-test can be bypassed. |
588 | * Page does not have to be locked. | ||
557 | */ | 589 | */ |
558 | void page_add_new_anon_rmap(struct page *page, | 590 | void page_add_new_anon_rmap(struct page *page, |
559 | struct vm_area_struct *vma, unsigned long address) | 591 | struct vm_area_struct *vma, unsigned long address) |
560 | { | 592 | { |
593 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | ||
561 | atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ | 594 | atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ |
562 | __page_set_anon_rmap(page, vma, address); | 595 | __page_set_anon_rmap(page, vma, address); |
563 | } | 596 | } |
@@ -574,6 +607,26 @@ void page_add_file_rmap(struct page *page) | |||
574 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 607 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
575 | } | 608 | } |
576 | 609 | ||
610 | #ifdef CONFIG_DEBUG_VM | ||
611 | /** | ||
612 | * page_dup_rmap - duplicate pte mapping to a page | ||
613 | * @page: the page to add the mapping to | ||
614 | * | ||
615 | * For copy_page_range only: minimal extract from page_add_file_rmap / | ||
616 | * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's | ||
617 | * quicker. | ||
618 | * | ||
619 | * The caller needs to hold the pte lock. | ||
620 | */ | ||
621 | void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) | ||
622 | { | ||
623 | BUG_ON(page_mapcount(page) == 0); | ||
624 | if (PageAnon(page)) | ||
625 | __page_check_anon_rmap(page, vma, address); | ||
626 | atomic_inc(&page->_mapcount); | ||
627 | } | ||
628 | #endif | ||
629 | |||
577 | /** | 630 | /** |
578 | * page_remove_rmap - take down pte mapping from a page | 631 | * page_remove_rmap - take down pte mapping from a page |
579 | * @page: page to remove mapping from | 632 | * @page: page to remove mapping from |
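The rmap.c hunks turn the old "checking page index and mapping is racy" comment into an explicit __page_check_anon_rmap() whose body exists only under CONFIG_DEBUG_VM, and page_dup_rmap() grows vma/address arguments so it can run the same check. The pattern worth noting is "sanity checks that compile to nothing in production builds". A self-contained sketch of that pattern, with a hypothetical DEBUG_CHECKS switch standing in for CONFIG_DEBUG_VM:

	#include <assert.h>
	#include <stdio.h>

	/* Build with -DDEBUG_CHECKS to enable the extra verification. */
	#ifdef DEBUG_CHECKS
	static void check_mapping(const void *mapping, const void *expected)
	{
		assert(mapping == expected && "anon page points at the wrong anon_vma");
	}
	#else
	static inline void check_mapping(const void *mapping, const void *expected)
	{
		(void)mapping; (void)expected;	/* compiled out in production */
	}
	#endif

	int main(void)
	{
		int anon_vma;
		const void *mapping = &anon_vma;

		check_mapping(mapping, &anon_vma);
		puts("mapping consistent");
		return 0;
	}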
diff --git a/mm/shmem.c b/mm/shmem.c
index f01e8deed645..e537317bec4d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2358,13 +2358,11 @@ static void init_once(void *foo, struct kmem_cache *cachep, | |||
2358 | { | 2358 | { |
2359 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2359 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; |
2360 | 2360 | ||
2361 | if (flags & SLAB_CTOR_CONSTRUCTOR) { | 2361 | inode_init_once(&p->vfs_inode); |
2362 | inode_init_once(&p->vfs_inode); | ||
2363 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2362 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2364 | p->i_acl = NULL; | 2363 | p->i_acl = NULL; |
2365 | p->i_default_acl = NULL; | 2364 | p->i_default_acl = NULL; |
2366 | #endif | 2365 | #endif |
2367 | } | ||
2368 | } | 2366 | } |
2369 | 2367 | ||
2370 | static int init_inodecache(void) | 2368 | static int init_inodecache(void) |
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -148,10 +148,11 @@ | |||
148 | * Usually, the kmalloc caches are cache_line_size() aligned, except when | 148 | * Usually, the kmalloc caches are cache_line_size() aligned, except when |
149 | * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. | 149 | * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. |
150 | * Some archs want to perform DMA into kmalloc caches and need a guaranteed | 150 | * Some archs want to perform DMA into kmalloc caches and need a guaranteed |
151 | * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that. | 151 | * alignment larger than the alignment of a 64-bit integer. |
152 | * Note that this flag disables some debug features. | 152 | * ARCH_KMALLOC_MINALIGN allows that. |
153 | * Note that increasing this value may disable some debug features. | ||
153 | */ | 154 | */ |
154 | #define ARCH_KMALLOC_MINALIGN 0 | 155 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) |
155 | #endif | 156 | #endif |
156 | 157 | ||
157 | #ifndef ARCH_SLAB_MINALIGN | 158 | #ifndef ARCH_SLAB_MINALIGN |
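The first slab.c hunk raises the default ARCH_KMALLOC_MINALIGN from 0 to __alignof__(unsigned long long), so kmalloc() objects are always aligned well enough for 64-bit accesses even on 32-bit targets. __alignof__ is evaluated at compile time and can differ from sizeof on some ABIs (classic i386, for example, aligns an 8-byte long long to 4 bytes), which a few lines of user-space C make visible:

	#include <stdio.h>

	int main(void)
	{
		printf("sizeof(unsigned long long)      = %zu\n",
		       sizeof(unsigned long long));
		printf("__alignof__(unsigned long long) = %zu\n",
		       (size_t)__alignof__(unsigned long long));
		/* An allocator honouring the second value as its minimum alignment can
		 * hand the returned memory straight to code doing 64-bit loads/stores. */
		return 0;
	}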
@@ -408,9 +409,6 @@ struct kmem_cache { | |||
408 | /* constructor func */ | 409 | /* constructor func */ |
409 | void (*ctor) (void *, struct kmem_cache *, unsigned long); | 410 | void (*ctor) (void *, struct kmem_cache *, unsigned long); |
410 | 411 | ||
411 | /* de-constructor func */ | ||
412 | void (*dtor) (void *, struct kmem_cache *, unsigned long); | ||
413 | |||
414 | /* 5) cache creation/removal */ | 412 | /* 5) cache creation/removal */ |
415 | const char *name; | 413 | const char *name; |
416 | struct list_head next; | 414 | struct list_head next; |
@@ -536,19 +534,22 @@ static int obj_size(struct kmem_cache *cachep) | |||
536 | return cachep->obj_size; | 534 | return cachep->obj_size; |
537 | } | 535 | } |
538 | 536 | ||
539 | static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp) | 537 | static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) |
540 | { | 538 | { |
541 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 539 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
542 | return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD); | 540 | return (unsigned long long*) (objp + obj_offset(cachep) - |
541 | sizeof(unsigned long long)); | ||
543 | } | 542 | } |
544 | 543 | ||
545 | static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp) | 544 | static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) |
546 | { | 545 | { |
547 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 546 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
548 | if (cachep->flags & SLAB_STORE_USER) | 547 | if (cachep->flags & SLAB_STORE_USER) |
549 | return (unsigned long *)(objp + cachep->buffer_size - | 548 | return (unsigned long long *)(objp + cachep->buffer_size - |
550 | 2 * BYTES_PER_WORD); | 549 | sizeof(unsigned long long) - |
551 | return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD); | 550 | BYTES_PER_WORD); |
551 | return (unsigned long long *) (objp + cachep->buffer_size - | ||
552 | sizeof(unsigned long long)); | ||
552 | } | 553 | } |
553 | 554 | ||
554 | static void **dbg_userword(struct kmem_cache *cachep, void *objp) | 555 | static void **dbg_userword(struct kmem_cache *cachep, void *objp) |
@@ -561,28 +562,13 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
561 | 562 | ||
562 | #define obj_offset(x) 0 | 563 | #define obj_offset(x) 0 |
563 | #define obj_size(cachep) (cachep->buffer_size) | 564 | #define obj_size(cachep) (cachep->buffer_size) |
564 | #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) | 565 | #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) |
565 | #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) | 566 | #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) |
566 | #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) | 567 | #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) |
567 | 568 | ||
568 | #endif | 569 | #endif |
569 | 570 | ||
570 | /* | 571 | /* |
571 | * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp | ||
572 | * order. | ||
573 | */ | ||
574 | #if defined(CONFIG_LARGE_ALLOCS) | ||
575 | #define MAX_OBJ_ORDER 13 /* up to 32Mb */ | ||
576 | #define MAX_GFP_ORDER 13 /* up to 32Mb */ | ||
577 | #elif defined(CONFIG_MMU) | ||
578 | #define MAX_OBJ_ORDER 5 /* 32 pages */ | ||
579 | #define MAX_GFP_ORDER 5 /* 32 pages */ | ||
580 | #else | ||
581 | #define MAX_OBJ_ORDER 8 /* up to 1Mb */ | ||
582 | #define MAX_GFP_ORDER 8 /* up to 1Mb */ | ||
583 | #endif | ||
584 | |||
585 | /* | ||
586 | * Do not go above this order unless 0 objects fit into the slab. | 572 | * Do not go above this order unless 0 objects fit into the slab. |
587 | */ | 573 | */ |
588 | #define BREAK_GFP_ORDER_HI 1 | 574 | #define BREAK_GFP_ORDER_HI 1 |
@@ -788,6 +774,7 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, | |||
788 | */ | 774 | */ |
789 | BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); | 775 | BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); |
790 | #endif | 776 | #endif |
777 | WARN_ON_ONCE(size == 0); | ||
791 | while (size > csizep->cs_size) | 778 | while (size > csizep->cs_size) |
792 | csizep++; | 779 | csizep++; |
793 | 780 | ||
@@ -924,12 +911,6 @@ static void next_reap_node(void) | |||
924 | { | 911 | { |
925 | int node = __get_cpu_var(reap_node); | 912 | int node = __get_cpu_var(reap_node); |
926 | 913 | ||
927 | /* | ||
928 | * Also drain per cpu pages on remote zones | ||
929 | */ | ||
930 | if (node != numa_node_id()) | ||
931 | drain_node_pages(node); | ||
932 | |||
933 | node = next_node(node, node_online_map); | 914 | node = next_node(node, node_online_map); |
934 | if (unlikely(node >= MAX_NUMNODES)) | 915 | if (unlikely(node >= MAX_NUMNODES)) |
935 | node = first_node(node_online_map); | 916 | node = first_node(node_online_map); |
@@ -1182,8 +1163,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1182 | int memsize = sizeof(struct kmem_list3); | 1163 | int memsize = sizeof(struct kmem_list3); |
1183 | 1164 | ||
1184 | switch (action) { | 1165 | switch (action) { |
1185 | case CPU_UP_PREPARE: | 1166 | case CPU_LOCK_ACQUIRE: |
1186 | mutex_lock(&cache_chain_mutex); | 1167 | mutex_lock(&cache_chain_mutex); |
1168 | break; | ||
1169 | case CPU_UP_PREPARE: | ||
1170 | case CPU_UP_PREPARE_FROZEN: | ||
1187 | /* | 1171 | /* |
1188 | * We need to do this right in the beginning since | 1172 | * We need to do this right in the beginning since |
1189 | * alloc_arraycache's are going to use this list. | 1173 | * alloc_arraycache's are going to use this list. |
@@ -1270,17 +1254,28 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1270 | } | 1254 | } |
1271 | break; | 1255 | break; |
1272 | case CPU_ONLINE: | 1256 | case CPU_ONLINE: |
1273 | mutex_unlock(&cache_chain_mutex); | 1257 | case CPU_ONLINE_FROZEN: |
1274 | start_cpu_timer(cpu); | 1258 | start_cpu_timer(cpu); |
1275 | break; | 1259 | break; |
1276 | #ifdef CONFIG_HOTPLUG_CPU | 1260 | #ifdef CONFIG_HOTPLUG_CPU |
1277 | case CPU_DOWN_PREPARE: | 1261 | case CPU_DOWN_PREPARE: |
1278 | mutex_lock(&cache_chain_mutex); | 1262 | case CPU_DOWN_PREPARE_FROZEN: |
1279 | break; | 1263 | /* |
1280 | case CPU_DOWN_FAILED: | 1264 | * Shutdown cache reaper. Note that the cache_chain_mutex is |
1281 | mutex_unlock(&cache_chain_mutex); | 1265 | * held so that if cache_reap() is invoked it cannot do |
1282 | break; | 1266 | * anything expensive but will only modify reap_work |
1267 | * and reschedule the timer. | ||
1268 | */ | ||
1269 | cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); | ||
1270 | /* Now the cache_reaper is guaranteed to be not running. */ | ||
1271 | per_cpu(reap_work, cpu).work.func = NULL; | ||
1272 | break; | ||
1273 | case CPU_DOWN_FAILED: | ||
1274 | case CPU_DOWN_FAILED_FROZEN: | ||
1275 | start_cpu_timer(cpu); | ||
1276 | break; | ||
1283 | case CPU_DEAD: | 1277 | case CPU_DEAD: |
1278 | case CPU_DEAD_FROZEN: | ||
1284 | /* | 1279 | /* |
1285 | * Even if all the cpus of a node are down, we don't free the | 1280 | * Even if all the cpus of a node are down, we don't free the |
1286 | * kmem_list3 of any cache. This to avoid a race between | 1281 | * kmem_list3 of any cache. This to avoid a race between |
@@ -1292,6 +1287,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1292 | /* fall thru */ | 1287 | /* fall thru */ |
1293 | #endif | 1288 | #endif |
1294 | case CPU_UP_CANCELED: | 1289 | case CPU_UP_CANCELED: |
1290 | case CPU_UP_CANCELED_FROZEN: | ||
1295 | list_for_each_entry(cachep, &cache_chain, next) { | 1291 | list_for_each_entry(cachep, &cache_chain, next) { |
1296 | struct array_cache *nc; | 1292 | struct array_cache *nc; |
1297 | struct array_cache *shared; | 1293 | struct array_cache *shared; |
@@ -1350,6 +1346,8 @@ free_array_cache: | |||
1350 | continue; | 1346 | continue; |
1351 | drain_freelist(cachep, l3, l3->free_objects); | 1347 | drain_freelist(cachep, l3, l3->free_objects); |
1352 | } | 1348 | } |
1349 | break; | ||
1350 | case CPU_LOCK_RELEASE: | ||
1353 | mutex_unlock(&cache_chain_mutex); | 1351 | mutex_unlock(&cache_chain_mutex); |
1354 | break; | 1352 | break; |
1355 | } | 1353 | } |
@@ -1776,7 +1774,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) | |||
1776 | char *realobj; | 1774 | char *realobj; |
1777 | 1775 | ||
1778 | if (cachep->flags & SLAB_RED_ZONE) { | 1776 | if (cachep->flags & SLAB_RED_ZONE) { |
1779 | printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", | 1777 | printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", |
1780 | *dbg_redzone1(cachep, objp), | 1778 | *dbg_redzone1(cachep, objp), |
1781 | *dbg_redzone2(cachep, objp)); | 1779 | *dbg_redzone2(cachep, objp)); |
1782 | } | 1780 | } |
@@ -1896,20 +1894,11 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) | |||
1896 | slab_error(cachep, "end of a freed object " | 1894 | slab_error(cachep, "end of a freed object " |
1897 | "was overwritten"); | 1895 | "was overwritten"); |
1898 | } | 1896 | } |
1899 | if (cachep->dtor && !(cachep->flags & SLAB_POISON)) | ||
1900 | (cachep->dtor) (objp + obj_offset(cachep), cachep, 0); | ||
1901 | } | 1897 | } |
1902 | } | 1898 | } |
1903 | #else | 1899 | #else |
1904 | static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) | 1900 | static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) |
1905 | { | 1901 | { |
1906 | if (cachep->dtor) { | ||
1907 | int i; | ||
1908 | for (i = 0; i < cachep->num; i++) { | ||
1909 | void *objp = index_to_obj(cachep, slabp, i); | ||
1910 | (cachep->dtor) (objp, cachep, 0); | ||
1911 | } | ||
1912 | } | ||
1913 | } | 1902 | } |
1914 | #endif | 1903 | #endif |
1915 | 1904 | ||
@@ -1998,7 +1987,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
1998 | size_t left_over = 0; | 1987 | size_t left_over = 0; |
1999 | int gfporder; | 1988 | int gfporder; |
2000 | 1989 | ||
2001 | for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { | 1990 | for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { |
2002 | unsigned int num; | 1991 | unsigned int num; |
2003 | size_t remainder; | 1992 | size_t remainder; |
2004 | 1993 | ||
@@ -2048,7 +2037,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
2048 | return left_over; | 2037 | return left_over; |
2049 | } | 2038 | } |
2050 | 2039 | ||
2051 | static int setup_cpu_cache(struct kmem_cache *cachep) | 2040 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) |
2052 | { | 2041 | { |
2053 | if (g_cpucache_up == FULL) | 2042 | if (g_cpucache_up == FULL) |
2054 | return enable_cpucache(cachep); | 2043 | return enable_cpucache(cachep); |
@@ -2109,7 +2098,7 @@ static int setup_cpu_cache(struct kmem_cache *cachep) | |||
2109 | * @align: The required alignment for the objects. | 2098 | * @align: The required alignment for the objects. |
2110 | * @flags: SLAB flags | 2099 | * @flags: SLAB flags |
2111 | * @ctor: A constructor for the objects. | 2100 | * @ctor: A constructor for the objects. |
2112 | * @dtor: A destructor for the objects. | 2101 | * @dtor: A destructor for the objects (not implemented anymore). |
2113 | * | 2102 | * |
2114 | * Returns a ptr to the cache on success, NULL on failure. | 2103 | * Returns a ptr to the cache on success, NULL on failure. |
2115 | * Cannot be called within a int, but can be interrupted. | 2104 | * Cannot be called within a int, but can be interrupted. |
@@ -2144,7 +2133,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2144 | * Sanity checks... these are all serious usage bugs. | 2133 | * Sanity checks... these are all serious usage bugs. |
2145 | */ | 2134 | */ |
2146 | if (!name || in_interrupt() || (size < BYTES_PER_WORD) || | 2135 | if (!name || in_interrupt() || (size < BYTES_PER_WORD) || |
2147 | (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { | 2136 | size > KMALLOC_MAX_SIZE || dtor) { |
2148 | printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, | 2137 | printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, |
2149 | name); | 2138 | name); |
2150 | BUG(); | 2139 | BUG(); |
@@ -2198,9 +2187,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2198 | if (flags & SLAB_DESTROY_BY_RCU) | 2187 | if (flags & SLAB_DESTROY_BY_RCU) |
2199 | BUG_ON(flags & SLAB_POISON); | 2188 | BUG_ON(flags & SLAB_POISON); |
2200 | #endif | 2189 | #endif |
2201 | if (flags & SLAB_DESTROY_BY_RCU) | ||
2202 | BUG_ON(dtor); | ||
2203 | |||
2204 | /* | 2190 | /* |
2205 | * Always checks flags, a caller might be expecting debug support which | 2191 | * Always checks flags, a caller might be expecting debug support which |
2206 | * isn't available. | 2192 | * isn't available. |
@@ -2239,7 +2225,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2239 | * is greater than BYTES_PER_WORD. | 2225 | * is greater than BYTES_PER_WORD. |
2240 | */ | 2226 | */ |
2241 | if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) | 2227 | if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) |
2242 | ralign = BYTES_PER_WORD; | 2228 | ralign = __alignof__(unsigned long long); |
2243 | 2229 | ||
2244 | /* 2) arch mandated alignment */ | 2230 | /* 2) arch mandated alignment */ |
2245 | if (ralign < ARCH_SLAB_MINALIGN) { | 2231 | if (ralign < ARCH_SLAB_MINALIGN) { |
@@ -2250,7 +2236,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2250 | ralign = align; | 2236 | ralign = align; |
2251 | } | 2237 | } |
2252 | /* disable debug if necessary */ | 2238 | /* disable debug if necessary */ |
2253 | if (ralign > BYTES_PER_WORD) | 2239 | if (ralign > __alignof__(unsigned long long)) |
2254 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | 2240 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
2255 | /* | 2241 | /* |
2256 | * 4) Store it. | 2242 | * 4) Store it. |
@@ -2271,8 +2257,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2271 | */ | 2257 | */ |
2272 | if (flags & SLAB_RED_ZONE) { | 2258 | if (flags & SLAB_RED_ZONE) { |
2273 | /* add space for red zone words */ | 2259 | /* add space for red zone words */ |
2274 | cachep->obj_offset += BYTES_PER_WORD; | 2260 | cachep->obj_offset += sizeof(unsigned long long); |
2275 | size += 2 * BYTES_PER_WORD; | 2261 | size += 2 * sizeof(unsigned long long); |
2276 | } | 2262 | } |
2277 | if (flags & SLAB_STORE_USER) { | 2263 | if (flags & SLAB_STORE_USER) { |
2278 | /* user store requires one word storage behind the end of | 2264 | /* user store requires one word storage behind the end of |
@@ -2355,7 +2341,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2355 | BUG_ON(!cachep->slabp_cache); | 2341 | BUG_ON(!cachep->slabp_cache); |
2356 | } | 2342 | } |
2357 | cachep->ctor = ctor; | 2343 | cachep->ctor = ctor; |
2358 | cachep->dtor = dtor; | ||
2359 | cachep->name = name; | 2344 | cachep->name = name; |
2360 | 2345 | ||
2361 | if (setup_cpu_cache(cachep)) { | 2346 | if (setup_cpu_cache(cachep)) { |
@@ -2610,7 +2595,7 @@ static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) | |||
2610 | } | 2595 | } |
2611 | 2596 | ||
2612 | static void cache_init_objs(struct kmem_cache *cachep, | 2597 | static void cache_init_objs(struct kmem_cache *cachep, |
2613 | struct slab *slabp, unsigned long ctor_flags) | 2598 | struct slab *slabp) |
2614 | { | 2599 | { |
2615 | int i; | 2600 | int i; |
2616 | 2601 | ||
@@ -2634,7 +2619,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2634 | */ | 2619 | */ |
2635 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2620 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
2636 | cachep->ctor(objp + obj_offset(cachep), cachep, | 2621 | cachep->ctor(objp + obj_offset(cachep), cachep, |
2637 | ctor_flags); | 2622 | 0); |
2638 | 2623 | ||
2639 | if (cachep->flags & SLAB_RED_ZONE) { | 2624 | if (cachep->flags & SLAB_RED_ZONE) { |
2640 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 2625 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
@@ -2650,7 +2635,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2650 | cachep->buffer_size / PAGE_SIZE, 0); | 2635 | cachep->buffer_size / PAGE_SIZE, 0); |
2651 | #else | 2636 | #else |
2652 | if (cachep->ctor) | 2637 | if (cachep->ctor) |
2653 | cachep->ctor(objp, cachep, ctor_flags); | 2638 | cachep->ctor(objp, cachep, 0); |
2654 | #endif | 2639 | #endif |
2655 | slab_bufctl(slabp)[i] = i + 1; | 2640 | slab_bufctl(slabp)[i] = i + 1; |
2656 | } | 2641 | } |
@@ -2739,7 +2724,6 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2739 | struct slab *slabp; | 2724 | struct slab *slabp; |
2740 | size_t offset; | 2725 | size_t offset; |
2741 | gfp_t local_flags; | 2726 | gfp_t local_flags; |
2742 | unsigned long ctor_flags; | ||
2743 | struct kmem_list3 *l3; | 2727 | struct kmem_list3 *l3; |
2744 | 2728 | ||
2745 | /* | 2729 | /* |
@@ -2748,7 +2732,6 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2748 | */ | 2732 | */ |
2749 | BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); | 2733 | BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); |
2750 | 2734 | ||
2751 | ctor_flags = SLAB_CTOR_CONSTRUCTOR; | ||
2752 | local_flags = (flags & GFP_LEVEL_MASK); | 2735 | local_flags = (flags & GFP_LEVEL_MASK); |
2753 | /* Take the l3 list lock to change the colour_next on this node */ | 2736 | /* Take the l3 list lock to change the colour_next on this node */ |
2754 | check_irq_off(); | 2737 | check_irq_off(); |
@@ -2793,7 +2776,7 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2793 | slabp->nodeid = nodeid; | 2776 | slabp->nodeid = nodeid; |
2794 | slab_map_pages(cachep, slabp, objp); | 2777 | slab_map_pages(cachep, slabp, objp); |
2795 | 2778 | ||
2796 | cache_init_objs(cachep, slabp, ctor_flags); | 2779 | cache_init_objs(cachep, slabp); |
2797 | 2780 | ||
2798 | if (local_flags & __GFP_WAIT) | 2781 | if (local_flags & __GFP_WAIT) |
2799 | local_irq_disable(); | 2782 | local_irq_disable(); |
@@ -2820,7 +2803,6 @@ failed: | |||
2820 | * Perform extra freeing checks: | 2803 | * Perform extra freeing checks: |
2821 | * - detect bad pointers. | 2804 | * - detect bad pointers. |
2822 | * - POISON/RED_ZONE checking | 2805 | * - POISON/RED_ZONE checking |
2823 | * - destructor calls, for caches with POISON+dtor | ||
2824 | */ | 2806 | */ |
2825 | static void kfree_debugcheck(const void *objp) | 2807 | static void kfree_debugcheck(const void *objp) |
2826 | { | 2808 | { |
@@ -2833,7 +2815,7 @@ static void kfree_debugcheck(const void *objp) | |||
2833 | 2815 | ||
2834 | static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) | 2816 | static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) |
2835 | { | 2817 | { |
2836 | unsigned long redzone1, redzone2; | 2818 | unsigned long long redzone1, redzone2; |
2837 | 2819 | ||
2838 | redzone1 = *dbg_redzone1(cache, obj); | 2820 | redzone1 = *dbg_redzone1(cache, obj); |
2839 | redzone2 = *dbg_redzone2(cache, obj); | 2821 | redzone2 = *dbg_redzone2(cache, obj); |
@@ -2849,7 +2831,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) | |||
2849 | else | 2831 | else |
2850 | slab_error(cache, "memory outside object was overwritten"); | 2832 | slab_error(cache, "memory outside object was overwritten"); |
2851 | 2833 | ||
2852 | printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n", | 2834 | printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", |
2853 | obj, redzone1, redzone2); | 2835 | obj, redzone1, redzone2); |
2854 | } | 2836 | } |
2855 | 2837 | ||
@@ -2879,12 +2861,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2879 | BUG_ON(objnr >= cachep->num); | 2861 | BUG_ON(objnr >= cachep->num); |
2880 | BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); | 2862 | BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); |
2881 | 2863 | ||
2882 | if (cachep->flags & SLAB_POISON && cachep->dtor) { | ||
2883 | /* we want to cache poison the object, | ||
2884 | * call the destruction callback | ||
2885 | */ | ||
2886 | cachep->dtor(objp + obj_offset(cachep), cachep, 0); | ||
2887 | } | ||
2888 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 2864 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
2889 | slab_bufctl(slabp)[objnr] = BUFCTL_FREE; | 2865 | slab_bufctl(slabp)[objnr] = BUFCTL_FREE; |
2890 | #endif | 2866 | #endif |
@@ -3065,7 +3041,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3065 | slab_error(cachep, "double free, or memory outside" | 3041 | slab_error(cachep, "double free, or memory outside" |
3066 | " object was overwritten"); | 3042 | " object was overwritten"); |
3067 | printk(KERN_ERR | 3043 | printk(KERN_ERR |
3068 | "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", | 3044 | "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", |
3069 | objp, *dbg_redzone1(cachep, objp), | 3045 | objp, *dbg_redzone1(cachep, objp), |
3070 | *dbg_redzone2(cachep, objp)); | 3046 | *dbg_redzone2(cachep, objp)); |
3071 | } | 3047 | } |
@@ -3084,7 +3060,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3084 | #endif | 3060 | #endif |
3085 | objp += obj_offset(cachep); | 3061 | objp += obj_offset(cachep); |
3086 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 3062 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
3087 | cachep->ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR); | 3063 | cachep->ctor(objp, cachep, 0); |
3088 | #if ARCH_SLAB_MINALIGN | 3064 | #if ARCH_SLAB_MINALIGN |
3089 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | 3065 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { |
3090 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | 3066 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", |
@@ -3738,7 +3714,6 @@ EXPORT_SYMBOL(__kmalloc); | |||
3738 | 3714 | ||
3739 | /** | 3715 | /** |
3740 | * krealloc - reallocate memory. The contents will remain unchanged. | 3716 | * krealloc - reallocate memory. The contents will remain unchanged. |
3741 | * | ||
3742 | * @p: object to reallocate memory for. | 3717 | * @p: object to reallocate memory for. |
3743 | * @new_size: how many bytes of memory are required. | 3718 | * @new_size: how many bytes of memory are required. |
3744 | * @flags: the type of memory to allocate. | 3719 | * @flags: the type of memory to allocate. |
@@ -4136,7 +4111,6 @@ next: | |||
4136 | check_irq_on(); | 4111 | check_irq_on(); |
4137 | mutex_unlock(&cache_chain_mutex); | 4112 | mutex_unlock(&cache_chain_mutex); |
4138 | next_reap_node(); | 4113 | next_reap_node(); |
4139 | refresh_cpu_vm_stats(smp_processor_id()); | ||
4140 | out: | 4114 | out: |
4141 | /* Set up the next iteration */ | 4115 | /* Set up the next iteration */ |
4142 | schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); | 4116 | schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); |
@@ -4428,16 +4402,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) | |||
4428 | static void show_symbol(struct seq_file *m, unsigned long address) | 4402 | static void show_symbol(struct seq_file *m, unsigned long address) |
4429 | { | 4403 | { |
4430 | #ifdef CONFIG_KALLSYMS | 4404 | #ifdef CONFIG_KALLSYMS |
4431 | char *modname; | ||
4432 | const char *name; | ||
4433 | unsigned long offset, size; | 4405 | unsigned long offset, size; |
4434 | char namebuf[KSYM_NAME_LEN+1]; | 4406 | char modname[MODULE_NAME_LEN + 1], name[KSYM_NAME_LEN + 1]; |
4435 | |||
4436 | name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); | ||
4437 | 4407 | ||
4438 | if (name) { | 4408 | if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { |
4439 | seq_printf(m, "%s+%#lx/%#lx", name, offset, size); | 4409 | seq_printf(m, "%s+%#lx/%#lx", name, offset, size); |
4440 | if (modname) | 4410 | if (modname[0]) |
4441 | seq_printf(m, " [%s]", modname); | 4411 | seq_printf(m, " [%s]", modname); |
4442 | return; | 4412 | return; |
4443 | } | 4413 | } |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/module.h> | 36 | #include <linux/module.h> |
37 | #include <linux/timer.h> | 37 | #include <linux/timer.h> |
38 | #include <linux/rcupdate.h> | ||
38 | 39 | ||
39 | struct slob_block { | 40 | struct slob_block { |
40 | int units; | 41 | int units; |
@@ -53,6 +54,16 @@ struct bigblock { | |||
53 | }; | 54 | }; |
54 | typedef struct bigblock bigblock_t; | 55 | typedef struct bigblock bigblock_t; |
55 | 56 | ||
57 | /* | ||
58 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which | ||
59 | * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free | ||
60 | * the block using call_rcu. | ||
61 | */ | ||
62 | struct slob_rcu { | ||
63 | struct rcu_head head; | ||
64 | int size; | ||
65 | }; | ||
66 | |||
56 | static slob_t arena = { .next = &arena, .units = 1 }; | 67 | static slob_t arena = { .next = &arena, .units = 1 }; |
57 | static slob_t *slobfree = &arena; | 68 | static slob_t *slobfree = &arena; |
58 | static bigblock_t *bigblocks; | 69 | static bigblock_t *bigblocks; |
@@ -266,9 +277,9 @@ size_t ksize(const void *block) | |||
266 | 277 | ||
267 | struct kmem_cache { | 278 | struct kmem_cache { |
268 | unsigned int size, align; | 279 | unsigned int size, align; |
280 | unsigned long flags; | ||
269 | const char *name; | 281 | const char *name; |
270 | void (*ctor)(void *, struct kmem_cache *, unsigned long); | 282 | void (*ctor)(void *, struct kmem_cache *, unsigned long); |
271 | void (*dtor)(void *, struct kmem_cache *, unsigned long); | ||
272 | }; | 283 | }; |
273 | 284 | ||
274 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 285 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
@@ -283,8 +294,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
283 | if (c) { | 294 | if (c) { |
284 | c->name = name; | 295 | c->name = name; |
285 | c->size = size; | 296 | c->size = size; |
297 | if (flags & SLAB_DESTROY_BY_RCU) { | ||
298 | /* leave room for rcu footer at the end of object */ | ||
299 | c->size += sizeof(struct slob_rcu); | ||
300 | } | ||
301 | c->flags = flags; | ||
286 | c->ctor = ctor; | 302 | c->ctor = ctor; |
287 | c->dtor = dtor; | ||
288 | /* ignore alignment unless it's forced */ | 303 | /* ignore alignment unless it's forced */ |
289 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; | 304 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; |
290 | if (c->align < align) | 305 | if (c->align < align) |
@@ -312,7 +327,7 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) | |||
312 | b = (void *)__get_free_pages(flags, get_order(c->size)); | 327 | b = (void *)__get_free_pages(flags, get_order(c->size)); |
313 | 328 | ||
314 | if (c->ctor) | 329 | if (c->ctor) |
315 | c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR); | 330 | c->ctor(b, c, 0); |
316 | 331 | ||
317 | return b; | 332 | return b; |
318 | } | 333 | } |
@@ -328,15 +343,33 @@ void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) | |||
328 | } | 343 | } |
329 | EXPORT_SYMBOL(kmem_cache_zalloc); | 344 | EXPORT_SYMBOL(kmem_cache_zalloc); |
330 | 345 | ||
331 | void kmem_cache_free(struct kmem_cache *c, void *b) | 346 | static void __kmem_cache_free(void *b, int size) |
332 | { | 347 | { |
333 | if (c->dtor) | 348 | if (size < PAGE_SIZE) |
334 | c->dtor(b, c, 0); | 349 | slob_free(b, size); |
335 | |||
336 | if (c->size < PAGE_SIZE) | ||
337 | slob_free(b, c->size); | ||
338 | else | 350 | else |
339 | free_pages((unsigned long)b, get_order(c->size)); | 351 | free_pages((unsigned long)b, get_order(size)); |
352 | } | ||
353 | |||
354 | static void kmem_rcu_free(struct rcu_head *head) | ||
355 | { | ||
356 | struct slob_rcu *slob_rcu = (struct slob_rcu *)head; | ||
357 | void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu)); | ||
358 | |||
359 | __kmem_cache_free(b, slob_rcu->size); | ||
360 | } | ||
361 | |||
362 | void kmem_cache_free(struct kmem_cache *c, void *b) | ||
363 | { | ||
364 | if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { | ||
365 | struct slob_rcu *slob_rcu; | ||
366 | slob_rcu = b + (c->size - sizeof(struct slob_rcu)); | ||
367 | INIT_RCU_HEAD(&slob_rcu->head); | ||
368 | slob_rcu->size = c->size; | ||
369 | call_rcu(&slob_rcu->head, kmem_rcu_free); | ||
370 | } else { | ||
371 | __kmem_cache_free(b, c->size); | ||
372 | } | ||
340 | } | 373 | } |
341 | EXPORT_SYMBOL(kmem_cache_free); | 374 | EXPORT_SYMBOL(kmem_cache_free); |
342 | 375 | ||
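
For reference, the slob side of SLAB_DESTROY_BY_RCU above works by reserving room for a struct slob_rcu footer at the tail of each object and, in kmem_rcu_free(), stepping back from that footer to the start of the block. A small userspace sketch of the same footer arithmetic; the names (rcu_footer, alloc_with_footer, deferred_free) are invented, and the deferred free is a direct call rather than call_rcu():

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Footer kept in the last bytes of every allocation, mirroring struct slob_rcu. */
    struct rcu_footer {
        void (*free_fn)(void *base, size_t size);   /* stand-in for the rcu_head callback */
        size_t size;                                /* total block size including footer  */
    };

    static void plain_free(void *base, size_t size)
    {
        (void)size;
        free(base);
    }

    static void *alloc_with_footer(size_t payload)
    {
        size_t total = payload + sizeof(struct rcu_footer);
        char *base = malloc(total);
        struct rcu_footer *f;

        if (!base)
            return NULL;
        f = (struct rcu_footer *)(base + payload);   /* footer sits right after the payload */
        f->free_fn = plain_free;
        f->size = total;
        return base;
    }

    static void deferred_free(struct rcu_footer *f)
    {
        /* Same arithmetic as kmem_rcu_free(): the block starts
         * (size - sizeof(footer)) bytes before the footer. */
        void *base = (char *)f - (f->size - sizeof(struct rcu_footer));

        f->free_fn(base, f->size);
    }

    int main(void)
    {
        size_t payload = 64;
        char *obj = alloc_with_footer(payload);

        if (!obj)
            return 1;
        memset(obj, 0, payload);                      /* only the payload belongs to the caller */
        deferred_free((struct rcu_footer *)(obj + payload));   /* call_rcu() in the real code */
        return 0;
    }
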
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -66,11 +66,11 @@ | |||
66 | * SLUB assigns one slab for allocation to each processor. | 66 | * SLUB assigns one slab for allocation to each processor. |
67 | * Allocations only occur from these slabs called cpu slabs. | 67 | * Allocations only occur from these slabs called cpu slabs. |
68 | * | 68 | * |
69 | * Slabs with free elements are kept on a partial list. | 69 | * Slabs with free elements are kept on a partial list and during regular |
70 | * There is no list for full slabs. If an object in a full slab is | 70 | * operations no list for full slabs is used. If an object in a full slab is |
71 | * freed then the slab will show up again on the partial lists. | 71 | * freed then the slab will show up again on the partial lists. |
72 | * Otherwise there is no need to track full slabs unless we have to | 72 | * We track full slabs for debugging purposes though because otherwise we |
73 | * track full slabs for debugging purposes. | 73 | * cannot scan all objects. |
74 | * | 74 | * |
75 | * Slabs are freed when they become empty. Teardown and setup is | 75 | * Slabs are freed when they become empty. Teardown and setup is |
76 | * minimal so we rely on the page allocators per cpu caches for | 76 | * minimal so we rely on the page allocators per cpu caches for |
@@ -78,22 +78,72 @@ | |||
78 | * | 78 | * |
79 | * Overloading of page flags that are otherwise used for LRU management. | 79 | * Overloading of page flags that are otherwise used for LRU management. |
80 | * | 80 | * |
81 | * PageActive The slab is used as a cpu cache. Allocations | 81 | * PageActive The slab is frozen and exempt from list processing. |
82 | * may be performed from the slab. The slab is not | 82 | * This means that the slab is dedicated to a purpose |
83 | * on any slab list and cannot be moved onto one. | 83 | * such as satisfying allocations for a specific |
84 | * processor. Objects may be freed in the slab while | ||
85 | * it is frozen but slab_free will then skip the usual | ||
86 | * list operations. It is up to the processor holding | ||
87 | * the slab to integrate the slab into the slab lists | ||
88 | * when the slab is no longer needed. | ||
89 | * | ||
90 | * One use of this flag is to mark slabs that are | ||
91 | * used for allocations. Then such a slab becomes a cpu | ||
92 | * slab. The cpu slab may be equipped with an additional | ||
93 | * lockless_freelist that allows lockless access to | ||
94 | * free objects in addition to the regular freelist | ||
95 | * that requires the slab lock. | ||
84 | * | 96 | * |
85 | * PageError Slab requires special handling due to debug | 97 | * PageError Slab requires special handling due to debug |
86 | * options set. This moves slab handling out of | 98 | * options set. This moves slab handling out of |
87 | * the fast path. | 99 | * the fast path and disables lockless freelists. |
88 | */ | 100 | */ |
89 | 101 | ||
102 | #define FROZEN (1 << PG_active) | ||
103 | |||
104 | #ifdef CONFIG_SLUB_DEBUG | ||
105 | #define SLABDEBUG (1 << PG_error) | ||
106 | #else | ||
107 | #define SLABDEBUG 0 | ||
108 | #endif | ||
109 | |||
110 | static inline int SlabFrozen(struct page *page) | ||
111 | { | ||
112 | return page->flags & FROZEN; | ||
113 | } | ||
114 | |||
115 | static inline void SetSlabFrozen(struct page *page) | ||
116 | { | ||
117 | page->flags |= FROZEN; | ||
118 | } | ||
119 | |||
120 | static inline void ClearSlabFrozen(struct page *page) | ||
121 | { | ||
122 | page->flags &= ~FROZEN; | ||
123 | } | ||
124 | |||
125 | static inline int SlabDebug(struct page *page) | ||
126 | { | ||
127 | return page->flags & SLABDEBUG; | ||
128 | } | ||
129 | |||
130 | static inline void SetSlabDebug(struct page *page) | ||
131 | { | ||
132 | page->flags |= SLABDEBUG; | ||
133 | } | ||
134 | |||
135 | static inline void ClearSlabDebug(struct page *page) | ||
136 | { | ||
137 | page->flags &= ~SLABDEBUG; | ||
138 | } | ||
139 | |||
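
The SlabFrozen()/SlabDebug() helpers above are thin wrappers that test, set and clear single bits (PG_active, PG_error) in page->flags. The same accessor pattern on a plain flags word, as a standalone sketch with an invented fake_page type:

    #include <stdio.h>

    #define FROZEN    (1UL << 0)    /* stands in for (1 << PG_active) */
    #define SLABDEBUG (1UL << 1)    /* stands in for (1 << PG_error)  */

    struct fake_page {
        unsigned long flags;
    };

    static int  SlabFrozen(struct fake_page *p)      { return (p->flags & FROZEN) != 0; }
    static void SetSlabFrozen(struct fake_page *p)   { p->flags |= FROZEN; }
    static void ClearSlabFrozen(struct fake_page *p) { p->flags &= ~FROZEN; }

    int main(void)
    {
        struct fake_page page = { .flags = SLABDEBUG };   /* debugging bit already set */

        SetSlabFrozen(&page);
        printf("frozen=%d flags=0x%lx\n", SlabFrozen(&page), page.flags);   /* 1, 0x3 */
        ClearSlabFrozen(&page);
        printf("frozen=%d flags=0x%lx\n", SlabFrozen(&page), page.flags);   /* 0, 0x2 */
        return 0;
    }
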
90 | /* | 140 | /* |
91 | * Issues still to be resolved: | 141 | * Issues still to be resolved: |
92 | * | 142 | * |
93 | * - The per cpu array is updated for each new slab and and is a remote | 143 | * - The per cpu array is updated for each new slab and and is a remote |
94 | * cacheline for most nodes. This could become a bouncing cacheline given | 144 | * cacheline for most nodes. This could become a bouncing cacheline given |
95 | * enough frequent updates. There are 16 pointers in a cacheline.so at | 145 | * enough frequent updates. There are 16 pointers in a cacheline, so at |
96 | * max 16 cpus could compete. Likely okay. | 146 | * max 16 cpus could compete for the cacheline which may be okay. |
97 | * | 147 | * |
98 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. | 148 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. |
99 | * | 149 | * |
@@ -137,6 +187,7 @@ | |||
137 | 187 | ||
138 | #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ | 188 | #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ |
139 | SLAB_POISON | SLAB_STORE_USER) | 189 | SLAB_POISON | SLAB_STORE_USER) |
190 | |||
140 | /* | 191 | /* |
141 | * Set of flags that will prevent slab merging | 192 | * Set of flags that will prevent slab merging |
142 | */ | 193 | */ |
@@ -157,6 +208,11 @@ | |||
157 | /* Internal SLUB flags */ | 208 | /* Internal SLUB flags */ |
158 | #define __OBJECT_POISON 0x80000000 /* Poison object */ | 209 | #define __OBJECT_POISON 0x80000000 /* Poison object */ |
159 | 210 | ||
211 | /* Not all arches define cache_line_size */ | ||
212 | #ifndef cache_line_size | ||
213 | #define cache_line_size() L1_CACHE_BYTES | ||
214 | #endif | ||
215 | |||
160 | static int kmem_size = sizeof(struct kmem_cache); | 216 | static int kmem_size = sizeof(struct kmem_cache); |
161 | 217 | ||
162 | #ifdef CONFIG_SMP | 218 | #ifdef CONFIG_SMP |
@@ -166,7 +222,7 @@ static struct notifier_block slab_notifier; | |||
166 | static enum { | 222 | static enum { |
167 | DOWN, /* No slab functionality available */ | 223 | DOWN, /* No slab functionality available */ |
168 | PARTIAL, /* kmem_cache_open() works but kmalloc does not */ | 224 | PARTIAL, /* kmem_cache_open() works but kmalloc does not */ |
169 | UP, /* Everything works */ | 225 | UP, /* Everything works but does not show up in sysfs */ |
170 | SYSFS /* Sysfs up */ | 226 | SYSFS /* Sysfs up */ |
171 | } slab_state = DOWN; | 227 | } slab_state = DOWN; |
172 | 228 | ||
@@ -174,7 +230,19 @@ static enum { | |||
174 | static DECLARE_RWSEM(slub_lock); | 230 | static DECLARE_RWSEM(slub_lock); |
175 | LIST_HEAD(slab_caches); | 231 | LIST_HEAD(slab_caches); |
176 | 232 | ||
177 | #ifdef CONFIG_SYSFS | 233 | /* |
234 | * Tracking user of a slab. | ||
235 | */ | ||
236 | struct track { | ||
237 | void *addr; /* Called from address */ | ||
238 | int cpu; /* Was running on cpu */ | ||
239 | int pid; /* Pid context */ | ||
240 | unsigned long when; /* When did the operation occur */ | ||
241 | }; | ||
242 | |||
243 | enum track_item { TRACK_ALLOC, TRACK_FREE }; | ||
244 | |||
245 | #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) | ||
178 | static int sysfs_slab_add(struct kmem_cache *); | 246 | static int sysfs_slab_add(struct kmem_cache *); |
179 | static int sysfs_slab_alias(struct kmem_cache *, const char *); | 247 | static int sysfs_slab_alias(struct kmem_cache *, const char *); |
180 | static void sysfs_slab_remove(struct kmem_cache *); | 248 | static void sysfs_slab_remove(struct kmem_cache *); |
@@ -202,6 +270,63 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
202 | #endif | 270 | #endif |
203 | } | 271 | } |
204 | 272 | ||
273 | static inline int check_valid_pointer(struct kmem_cache *s, | ||
274 | struct page *page, const void *object) | ||
275 | { | ||
276 | void *base; | ||
277 | |||
278 | if (!object) | ||
279 | return 1; | ||
280 | |||
281 | base = page_address(page); | ||
282 | if (object < base || object >= base + s->objects * s->size || | ||
283 | (object - base) % s->size) { | ||
284 | return 0; | ||
285 | } | ||
286 | |||
287 | return 1; | ||
288 | } | ||
289 | |||
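
check_valid_pointer() above only accepts an object pointer that lies inside the slab and sits on an exact multiple of s->size from the slab base. The same bounds-and-stride test in standalone form (function and variable names invented):

    #include <stdio.h>
    #include <stddef.h>

    /* Return 1 if obj is NULL or points at the start of one of nr_objects
     * fixed-size slots laid out back to back from base. */
    static int valid_object_pointer(const void *base, size_t obj_size,
                                    unsigned int nr_objects, const void *obj)
    {
        const char *b = base, *o = obj;
        size_t off;

        if (!obj)
            return 1;
        if (o < b)
            return 0;
        off = (size_t)(o - b);
        if (off >= (size_t)nr_objects * obj_size)
            return 0;
        return off % obj_size == 0;
    }

    int main(void)
    {
        char slab[8 * 32];     /* 8 objects, 32 bytes each */

        printf("%d\n", valid_object_pointer(slab, 32, 8, slab + 3 * 32)); /* 1: object #3    */
        printf("%d\n", valid_object_pointer(slab, 32, 8, slab + 40));     /* 0: mid-object   */
        printf("%d\n", valid_object_pointer(slab, 32, 8, slab + 8 * 32)); /* 0: past the end */
        return 0;
    }
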
290 | /* | ||
291 | * Slow version of get and set free pointer. | ||
292 | * | ||
293 | * This version requires touching the cache lines of kmem_cache which | ||
294 | * we avoid doing in the fast alloc free paths. There we obtain the offset | ||
295 | * from the page struct. | ||
296 | */ | ||
297 | static inline void *get_freepointer(struct kmem_cache *s, void *object) | ||
298 | { | ||
299 | return *(void **)(object + s->offset); | ||
300 | } | ||
301 | |||
302 | static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) | ||
303 | { | ||
304 | *(void **)(object + s->offset) = fp; | ||
305 | } | ||
306 | |||
307 | /* Loop over all objects in a slab */ | ||
308 | #define for_each_object(__p, __s, __addr) \ | ||
309 | for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ | ||
310 | __p += (__s)->size) | ||
311 | |||
312 | /* Scan freelist */ | ||
313 | #define for_each_free_object(__p, __s, __free) \ | ||
314 | for (__p = (__free); __p; __p = get_freepointer((__s), __p)) | ||
315 | |||
316 | /* Determine object index from a given position */ | ||
317 | static inline int slab_index(void *p, struct kmem_cache *s, void *addr) | ||
318 | { | ||
319 | return (p - addr) / s->size; | ||
320 | } | ||
321 | |||
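
get_freepointer()/set_freepointer() above chain free objects together by storing the next pointer inside the object itself, s->offset bytes in, and for_each_free_object() walks that chain. A userspace sketch of building and walking such a freelist over a flat buffer; the constants and helper names are illustrative and the offset is fixed at 0:

    #include <stdio.h>
    #include <string.h>

    #define OBJ_SIZE  64
    #define NR_OBJS   4
    #define FP_OFFSET 0    /* next pointer stored at the start of each free object */

    static void *get_fp(void *object)
    {
        void *fp;

        memcpy(&fp, (char *)object + FP_OFFSET, sizeof(fp));
        return fp;
    }

    static void set_fp(void *object, void *fp)
    {
        memcpy((char *)object + FP_OFFSET, &fp, sizeof(fp));
    }

    int main(void)
    {
        static char slab[NR_OBJS * OBJ_SIZE];
        char *p, *last = slab;
        void *freelist = slab;

        /* Thread every object onto one list, as new_slab() does. */
        for (p = slab + OBJ_SIZE; p < slab + NR_OBJS * OBJ_SIZE; p += OBJ_SIZE) {
            set_fp(last, p);
            last = p;
        }
        set_fp(last, NULL);

        /* Walk it, as for_each_free_object() does. */
        for (p = freelist; p; p = get_fp(p))
            printf("free object at index %ld\n", (long)((p - slab) / OBJ_SIZE));
        return 0;
    }

The kernel version simply dereferences *(void **)(object + s->offset); memcpy() is used here only to sidestep alignment and aliasing concerns in portable C.
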
322 | #ifdef CONFIG_SLUB_DEBUG | ||
323 | /* | ||
324 | * Debug settings: | ||
325 | */ | ||
326 | static int slub_debug; | ||
327 | |||
328 | static char *slub_debug_slabs; | ||
329 | |||
205 | /* | 330 | /* |
206 | * Object debugging | 331 | * Object debugging |
207 | */ | 332 | */ |
@@ -237,35 +362,6 @@ static void print_section(char *text, u8 *addr, unsigned int length) | |||
237 | } | 362 | } |
238 | } | 363 | } |
239 | 364 | ||
240 | /* | ||
241 | * Slow version of get and set free pointer. | ||
242 | * | ||
243 | * This requires touching the cache lines of kmem_cache. | ||
244 | * The offset can also be obtained from the page. In that | ||
245 | * case it is in the cacheline that we already need to touch. | ||
246 | */ | ||
247 | static void *get_freepointer(struct kmem_cache *s, void *object) | ||
248 | { | ||
249 | return *(void **)(object + s->offset); | ||
250 | } | ||
251 | |||
252 | static void set_freepointer(struct kmem_cache *s, void *object, void *fp) | ||
253 | { | ||
254 | *(void **)(object + s->offset) = fp; | ||
255 | } | ||
256 | |||
257 | /* | ||
258 | * Tracking user of a slab. | ||
259 | */ | ||
260 | struct track { | ||
261 | void *addr; /* Called from address */ | ||
262 | int cpu; /* Was running on cpu */ | ||
263 | int pid; /* Pid context */ | ||
264 | unsigned long when; /* When did the operation occur */ | ||
265 | }; | ||
266 | |||
267 | enum track_item { TRACK_ALLOC, TRACK_FREE }; | ||
268 | |||
269 | static struct track *get_track(struct kmem_cache *s, void *object, | 365 | static struct track *get_track(struct kmem_cache *s, void *object, |
270 | enum track_item alloc) | 366 | enum track_item alloc) |
271 | { | 367 | { |
@@ -400,24 +496,6 @@ static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) | |||
400 | return 1; | 496 | return 1; |
401 | } | 497 | } |
402 | 498 | ||
403 | |||
404 | static int check_valid_pointer(struct kmem_cache *s, struct page *page, | ||
405 | void *object) | ||
406 | { | ||
407 | void *base; | ||
408 | |||
409 | if (!object) | ||
410 | return 1; | ||
411 | |||
412 | base = page_address(page); | ||
413 | if (object < base || object >= base + s->objects * s->size || | ||
414 | (object - base) % s->size) { | ||
415 | return 0; | ||
416 | } | ||
417 | |||
418 | return 1; | ||
419 | } | ||
420 | |||
421 | /* | 499 | /* |
422 | * Object layout: | 500 | * Object layout: |
423 | * | 501 | * |
@@ -425,26 +503,34 @@ static int check_valid_pointer(struct kmem_cache *s, struct page *page, | |||
425 | * Bytes of the object to be managed. | 503 | * Bytes of the object to be managed. |
426 | * If the freepointer may overlay the object then the free | 504 | * If the freepointer may overlay the object then the free |
427 | * pointer is the first word of the object. | 505 | * pointer is the first word of the object. |
506 | * | ||
428 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is | 507 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is |
429 | * 0xa5 (POISON_END) | 508 | * 0xa5 (POISON_END) |
430 | * | 509 | * |
431 | * object + s->objsize | 510 | * object + s->objsize |
432 | * Padding to reach word boundary. This is also used for Redzoning. | 511 | * Padding to reach word boundary. This is also used for Redzoning. |
433 | * Padding is extended to word size if Redzoning is enabled | 512 | * Padding is extended by another word if Redzoning is enabled and |
434 | * and objsize == inuse. | 513 | * objsize == inuse. |
514 | * | ||
435 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with | 515 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with |
436 | * 0xcc (RED_ACTIVE) for objects in use. | 516 | * 0xcc (RED_ACTIVE) for objects in use. |
437 | * | 517 | * |
438 | * object + s->inuse | 518 | * object + s->inuse |
519 | * Meta data starts here. | ||
520 | * | ||
439 | * A. Free pointer (if we cannot overwrite object on free) | 521 | * A. Free pointer (if we cannot overwrite object on free) |
440 | * B. Tracking data for SLAB_STORE_USER | 522 | * B. Tracking data for SLAB_STORE_USER |
441 | * C. Padding to reach required alignment boundary | 523 | * C. Padding to reach required alignment boundary or at minimum |
442 | * Padding is done using 0x5a (POISON_INUSE) | 524 | * one word if debugging is on to be able to detect writes |
525 | * before the word boundary. | ||
526 | * | ||
527 | * Padding is done using 0x5a (POISON_INUSE) | ||
443 | * | 528 | * |
444 | * object + s->size | 529 | * object + s->size |
530 | * Nothing is used beyond s->size. | ||
445 | * | 531 | * |
446 | * If slabcaches are merged then the objsize and inuse boundaries are to | 532 | * If slabcaches are merged then the objsize and inuse boundaries are mostly |
447 | * be ignored. And therefore no slab options that rely on these boundaries | 533 | * ignored. And therefore no slab options that rely on these boundaries |
448 | * may be used with merged slabcaches. | 534 | * may be used with merged slabcaches. |
449 | */ | 535 | */ |
450 | 536 | ||
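
As a rough worked example of the boundaries described in the layout comment above, here is how objsize, inuse, the free pointer offset and the final size could fall out for a 52-byte object with red zoning, user tracking and an off-object free pointer on a 64-bit machine. This is a simplified model for illustration only, not the kernel's exact size calculation; struct track mirrors the fields listed earlier in this diff:

    #include <stdio.h>

    /* Mirrors the per-object tracking record stored when SLAB_STORE_USER is set. */
    struct track {
        void *addr;
        int cpu;
        int pid;
        unsigned long when;
    };

    /* Round v up to the next multiple of a (a must be a power of two). */
    static unsigned long align_up(unsigned long v, unsigned long a)
    {
        return (v + a - 1) & ~(a - 1);
    }

    int main(void)
    {
        unsigned long word = sizeof(void *);            /* 8 on a 64-bit machine           */
        unsigned long objsize = 52;                     /* payload requested by the caller */

        unsigned long inuse  = align_up(objsize, word); /* payload padded to a word; poison
                                                           and red zone patterns end here  */
        unsigned long offset = inuse;                   /* free pointer kept past the
                                                           payload so poisoning survives   */
        unsigned long size   = offset + word            /* the free pointer itself         */
                             + 2 * sizeof(struct track) /* TRACK_ALLOC and TRACK_FREE      */
                             + word;                    /* at least one word of padding    */

        size = align_up(size, word);
        printf("objsize=%lu inuse=%lu offset=%lu size=%lu\n", objsize, inuse, offset, size);
        return 0;
    }
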
@@ -570,8 +656,7 @@ static int check_object(struct kmem_cache *s, struct page *page, | |||
570 | /* | 656 | /* |
571 | * No choice but to zap it and thus loose the remainder | 657 | * No choice but to zap it and thus loose the remainder |
572 | * of the free objects in this slab. May cause | 658 | * of the free objects in this slab. May cause |
573 | * another error because the object count maybe | 659 | * another error because the object count is now wrong. |
574 | * wrong now. | ||
575 | */ | 660 | */ |
576 | set_freepointer(s, p, NULL); | 661 | set_freepointer(s, p, NULL); |
577 | return 0; | 662 | return 0; |
@@ -611,9 +696,8 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
611 | } | 696 | } |
612 | 697 | ||
613 | /* | 698 | /* |
614 | * Determine if a certain object on a page is on the freelist and | 699 | * Determine if a certain object on a page is on the freelist. Must hold the |
615 | * therefore free. Must hold the slab lock for cpu slabs to | 700 | * slab lock to guarantee that the chains are in a consistent state. |
616 | * guarantee that the chains are consistent. | ||
617 | */ | 701 | */ |
618 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | 702 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) |
619 | { | 703 | { |
@@ -658,8 +742,24 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | |||
658 | return search == NULL; | 742 | return search == NULL; |
659 | } | 743 | } |
660 | 744 | ||
745 | static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) | ||
746 | { | ||
747 | if (s->flags & SLAB_TRACE) { | ||
748 | printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", | ||
749 | s->name, | ||
750 | alloc ? "alloc" : "free", | ||
751 | object, page->inuse, | ||
752 | page->freelist); | ||
753 | |||
754 | if (!alloc) | ||
755 | print_section("Object", (void *)object, s->objsize); | ||
756 | |||
757 | dump_stack(); | ||
758 | } | ||
759 | } | ||
760 | |||
661 | /* | 761 | /* |
662 | * Tracking of fully allocated slabs for debugging | 762 | * Tracking of fully allocated slabs for debugging purposes. |
663 | */ | 763 | */ |
664 | static void add_full(struct kmem_cache_node *n, struct page *page) | 764 | static void add_full(struct kmem_cache_node *n, struct page *page) |
665 | { | 765 | { |
@@ -682,8 +782,18 @@ static void remove_full(struct kmem_cache *s, struct page *page) | |||
682 | spin_unlock(&n->list_lock); | 782 | spin_unlock(&n->list_lock); |
683 | } | 783 | } |
684 | 784 | ||
685 | static int alloc_object_checks(struct kmem_cache *s, struct page *page, | 785 | static void setup_object_debug(struct kmem_cache *s, struct page *page, |
686 | void *object) | 786 | void *object) |
787 | { | ||
788 | if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) | ||
789 | return; | ||
790 | |||
791 | init_object(s, object, 0); | ||
792 | init_tracking(s, object); | ||
793 | } | ||
794 | |||
795 | static int alloc_debug_processing(struct kmem_cache *s, struct page *page, | ||
796 | void *object, void *addr) | ||
687 | { | 797 | { |
688 | if (!check_slab(s, page)) | 798 | if (!check_slab(s, page)) |
689 | goto bad; | 799 | goto bad; |
@@ -698,19 +808,22 @@ static int alloc_object_checks(struct kmem_cache *s, struct page *page, | |||
698 | goto bad; | 808 | goto bad; |
699 | } | 809 | } |
700 | 810 | ||
701 | if (!object) | 811 | if (object && !check_object(s, page, object, 0)) |
702 | return 1; | ||
703 | |||
704 | if (!check_object(s, page, object, 0)) | ||
705 | goto bad; | 812 | goto bad; |
706 | 813 | ||
814 | /* Success perform special debug activities for allocs */ | ||
815 | if (s->flags & SLAB_STORE_USER) | ||
816 | set_track(s, object, TRACK_ALLOC, addr); | ||
817 | trace(s, page, object, 1); | ||
818 | init_object(s, object, 1); | ||
707 | return 1; | 819 | return 1; |
820 | |||
708 | bad: | 821 | bad: |
709 | if (PageSlab(page)) { | 822 | if (PageSlab(page)) { |
710 | /* | 823 | /* |
711 | * If this is a slab page then lets do the best we can | 824 | * If this is a slab page then lets do the best we can |
712 | * to avoid issues in the future. Marking all objects | 825 | * to avoid issues in the future. Marking all objects |
713 | * as used avoids touching the remainder. | 826 | * as used avoids touching the remaining objects. |
714 | */ | 827 | */ |
715 | printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", | 828 | printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", |
716 | s->name, page); | 829 | s->name, page); |
@@ -722,8 +835,8 @@ bad: | |||
722 | return 0; | 835 | return 0; |
723 | } | 836 | } |
724 | 837 | ||
725 | static int free_object_checks(struct kmem_cache *s, struct page *page, | 838 | static int free_debug_processing(struct kmem_cache *s, struct page *page, |
726 | void *object) | 839 | void *object, void *addr) |
727 | { | 840 | { |
728 | if (!check_slab(s, page)) | 841 | if (!check_slab(s, page)) |
729 | goto fail; | 842 | goto fail; |
@@ -757,13 +870,107 @@ static int free_object_checks(struct kmem_cache *s, struct page *page, | |||
757 | "to slab %s", object, page->slab->name); | 870 | "to slab %s", object, page->slab->name); |
758 | goto fail; | 871 | goto fail; |
759 | } | 872 | } |
873 | |||
874 | /* Special debug activities for freeing objects */ | ||
875 | if (!SlabFrozen(page) && !page->freelist) | ||
876 | remove_full(s, page); | ||
877 | if (s->flags & SLAB_STORE_USER) | ||
878 | set_track(s, object, TRACK_FREE, addr); | ||
879 | trace(s, page, object, 0); | ||
880 | init_object(s, object, 0); | ||
760 | return 1; | 881 | return 1; |
882 | |||
761 | fail: | 883 | fail: |
762 | printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", | 884 | printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", |
763 | s->name, page, object); | 885 | s->name, page, object); |
764 | return 0; | 886 | return 0; |
765 | } | 887 | } |
766 | 888 | ||
889 | static int __init setup_slub_debug(char *str) | ||
890 | { | ||
891 | if (!str || *str != '=') | ||
892 | slub_debug = DEBUG_DEFAULT_FLAGS; | ||
893 | else { | ||
894 | str++; | ||
895 | if (*str == 0 || *str == ',') | ||
896 | slub_debug = DEBUG_DEFAULT_FLAGS; | ||
897 | else | ||
898 | for( ;*str && *str != ','; str++) | ||
899 | switch (*str) { | ||
900 | case 'f' : case 'F' : | ||
901 | slub_debug |= SLAB_DEBUG_FREE; | ||
902 | break; | ||
903 | case 'z' : case 'Z' : | ||
904 | slub_debug |= SLAB_RED_ZONE; | ||
905 | break; | ||
906 | case 'p' : case 'P' : | ||
907 | slub_debug |= SLAB_POISON; | ||
908 | break; | ||
909 | case 'u' : case 'U' : | ||
910 | slub_debug |= SLAB_STORE_USER; | ||
911 | break; | ||
912 | case 't' : case 'T' : | ||
913 | slub_debug |= SLAB_TRACE; | ||
914 | break; | ||
915 | default: | ||
916 | printk(KERN_ERR "slub_debug option '%c' " | ||
917 | "unknown. skipped\n",*str); | ||
918 | } | ||
919 | } | ||
920 | |||
921 | if (*str == ',') | ||
922 | slub_debug_slabs = str + 1; | ||
923 | return 1; | ||
924 | } | ||
925 | |||
926 | __setup("slub_debug", setup_slub_debug); | ||
927 | |||
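
setup_slub_debug() above turns a boot option such as slub_debug=FZPU,dentry into a set of debug flags plus an optional slab name filter in slub_debug_slabs. A compact userspace sketch of the same parse; the flag values and names below are placeholders rather than the kernel's SLAB_* constants:

    #include <stdio.h>
    #include <string.h>

    #define DBG_FREE    0x01   /* F: sanity checks on free */
    #define DBG_REDZONE 0x02   /* Z: red zoning            */
    #define DBG_POISON  0x04   /* P: poisoning             */
    #define DBG_USER    0x08   /* U: user tracking         */
    #define DBG_TRACE   0x10   /* T: tracing               */
    #define DBG_ALL     (DBG_FREE | DBG_REDZONE | DBG_POISON | DBG_USER)

    static unsigned int debug_flags;
    static const char *debug_slabs;    /* optional slab name filter after the comma */

    static void parse_slub_debug(const char *str)
    {
        if (!str || *str != '=') {     /* bare "slub_debug" enables the default set */
            debug_flags = DBG_ALL;
            return;
        }
        str++;
        if (*str == '\0' || *str == ',')
            debug_flags = DBG_ALL;
        else
            for (; *str && *str != ','; str++)
                switch (*str) {
                case 'f': case 'F': debug_flags |= DBG_FREE; break;
                case 'z': case 'Z': debug_flags |= DBG_REDZONE; break;
                case 'p': case 'P': debug_flags |= DBG_POISON; break;
                case 'u': case 'U': debug_flags |= DBG_USER; break;
                case 't': case 'T': debug_flags |= DBG_TRACE; break;
                default: fprintf(stderr, "unknown option '%c', skipped\n", *str);
                }
        if (*str == ',')
            debug_slabs = str + 1;
    }

    int main(void)
    {
        parse_slub_debug("=ZPU,kmalloc-64");
        printf("flags=0x%x slabs=%s\n", debug_flags, debug_slabs ? debug_slabs : "(all)");
        return 0;
    }
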
928 | static void kmem_cache_open_debug_check(struct kmem_cache *s) | ||
929 | { | ||
930 | /* | ||
931 | * The page->offset field is only 16 bit wide. This is an offset | ||
932 | * in units of words from the beginning of an object. If the slab | ||
933 | * size is bigger then we cannot move the free pointer behind the | ||
934 | * object anymore. | ||
935 | * | ||
936 | * On 32 bit platforms the limit is 256k. On 64bit platforms | ||
937 | * the limit is 512k. | ||
938 | * | ||
939 | * Debugging or ctor may create a need to move the free | ||
940 | * pointer. Fail if this happens. | ||
941 | */ | ||
942 | if (s->objsize >= 65535 * sizeof(void *)) { | ||
943 | BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON | | ||
944 | SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); | ||
945 | BUG_ON(s->ctor); | ||
946 | } | ||
947 | else | ||
948 | /* | ||
949 | * Enable debugging if selected on the kernel commandline. | ||
950 | */ | ||
951 | if (slub_debug && (!slub_debug_slabs || | ||
952 | strncmp(slub_debug_slabs, s->name, | ||
953 | strlen(slub_debug_slabs)) == 0)) | ||
954 | s->flags |= slub_debug; | ||
955 | } | ||
956 | #else | ||
957 | static inline void setup_object_debug(struct kmem_cache *s, | ||
958 | struct page *page, void *object) {} | ||
959 | |||
960 | static inline int alloc_debug_processing(struct kmem_cache *s, | ||
961 | struct page *page, void *object, void *addr) { return 0; } | ||
962 | |||
963 | static inline int free_debug_processing(struct kmem_cache *s, | ||
964 | struct page *page, void *object, void *addr) { return 0; } | ||
965 | |||
966 | static inline int slab_pad_check(struct kmem_cache *s, struct page *page) | ||
967 | { return 1; } | ||
968 | static inline int check_object(struct kmem_cache *s, struct page *page, | ||
969 | void *object, int active) { return 1; } | ||
970 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | ||
971 | static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {} | ||
972 | #define slub_debug 0 | ||
973 | #endif | ||
767 | /* | 974 | /* |
768 | * Slab allocation and freeing | 975 | * Slab allocation and freeing |
769 | */ | 976 | */ |
@@ -797,13 +1004,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
797 | static void setup_object(struct kmem_cache *s, struct page *page, | 1004 | static void setup_object(struct kmem_cache *s, struct page *page, |
798 | void *object) | 1005 | void *object) |
799 | { | 1006 | { |
800 | if (PageError(page)) { | 1007 | setup_object_debug(s, page, object); |
801 | init_object(s, object, 0); | ||
802 | init_tracking(s, object); | ||
803 | } | ||
804 | |||
805 | if (unlikely(s->ctor)) | 1008 | if (unlikely(s->ctor)) |
806 | s->ctor(object, s, SLAB_CTOR_CONSTRUCTOR); | 1009 | s->ctor(object, s, 0); |
807 | } | 1010 | } |
808 | 1011 | ||
809 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | 1012 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) |
@@ -832,7 +1035,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
832 | page->flags |= 1 << PG_slab; | 1035 | page->flags |= 1 << PG_slab; |
833 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | | 1036 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | |
834 | SLAB_STORE_USER | SLAB_TRACE)) | 1037 | SLAB_STORE_USER | SLAB_TRACE)) |
835 | page->flags |= 1 << PG_error; | 1038 | SetSlabDebug(page); |
836 | 1039 | ||
837 | start = page_address(page); | 1040 | start = page_address(page); |
838 | end = start + s->objects * s->size; | 1041 | end = start + s->objects * s->size; |
@@ -841,7 +1044,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
841 | memset(start, POISON_INUSE, PAGE_SIZE << s->order); | 1044 | memset(start, POISON_INUSE, PAGE_SIZE << s->order); |
842 | 1045 | ||
843 | last = start; | 1046 | last = start; |
844 | for (p = start + s->size; p < end; p += s->size) { | 1047 | for_each_object(p, s, start) { |
845 | setup_object(s, page, last); | 1048 | setup_object(s, page, last); |
846 | set_freepointer(s, last, p); | 1049 | set_freepointer(s, last, p); |
847 | last = p; | 1050 | last = p; |
@@ -850,6 +1053,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
850 | set_freepointer(s, last, NULL); | 1053 | set_freepointer(s, last, NULL); |
851 | 1054 | ||
852 | page->freelist = start; | 1055 | page->freelist = start; |
1056 | page->lockless_freelist = NULL; | ||
853 | page->inuse = 0; | 1057 | page->inuse = 0; |
854 | out: | 1058 | out: |
855 | if (flags & __GFP_WAIT) | 1059 | if (flags & __GFP_WAIT) |
@@ -861,17 +1065,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
861 | { | 1065 | { |
862 | int pages = 1 << s->order; | 1066 | int pages = 1 << s->order; |
863 | 1067 | ||
864 | if (unlikely(PageError(page) || s->dtor)) { | 1068 | if (unlikely(SlabDebug(page))) { |
865 | void *start = page_address(page); | ||
866 | void *end = start + (pages << PAGE_SHIFT); | ||
867 | void *p; | 1069 | void *p; |
868 | 1070 | ||
869 | slab_pad_check(s, page); | 1071 | slab_pad_check(s, page); |
870 | for (p = start; p <= end - s->size; p += s->size) { | 1072 | for_each_object(p, s, page_address(page)) |
871 | if (s->dtor) | ||
872 | s->dtor(p, s, 0); | ||
873 | check_object(s, page, p, 0); | 1073 | check_object(s, page, p, 0); |
874 | } | ||
875 | } | 1074 | } |
876 | 1075 | ||
877 | mod_zone_page_state(page_zone(page), | 1076 | mod_zone_page_state(page_zone(page), |
@@ -910,7 +1109,8 @@ static void discard_slab(struct kmem_cache *s, struct page *page) | |||
910 | 1109 | ||
911 | atomic_long_dec(&n->nr_slabs); | 1110 | atomic_long_dec(&n->nr_slabs); |
912 | reset_page_mapcount(page); | 1111 | reset_page_mapcount(page); |
913 | page->flags &= ~(1 << PG_slab | 1 << PG_error); | 1112 | ClearSlabDebug(page); |
1113 | __ClearPageSlab(page); | ||
914 | free_slab(s, page); | 1114 | free_slab(s, page); |
915 | } | 1115 | } |
916 | 1116 | ||
@@ -966,22 +1166,23 @@ static void remove_partial(struct kmem_cache *s, | |||
966 | } | 1166 | } |
967 | 1167 | ||
968 | /* | 1168 | /* |
969 | * Lock page and remove it from the partial list | 1169 | * Lock slab and remove from the partial list. |
970 | * | 1170 | * |
971 | * Must hold list_lock | 1171 | * Must hold list_lock. |
972 | */ | 1172 | */ |
973 | static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) | 1173 | static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) |
974 | { | 1174 | { |
975 | if (slab_trylock(page)) { | 1175 | if (slab_trylock(page)) { |
976 | list_del(&page->lru); | 1176 | list_del(&page->lru); |
977 | n->nr_partial--; | 1177 | n->nr_partial--; |
1178 | SetSlabFrozen(page); | ||
978 | return 1; | 1179 | return 1; |
979 | } | 1180 | } |
980 | return 0; | 1181 | return 0; |
981 | } | 1182 | } |
982 | 1183 | ||
983 | /* | 1184 | /* |
984 | * Try to get a partial slab from a specific node | 1185 | * Try to allocate a partial slab from a specific node. |
985 | */ | 1186 | */ |
986 | static struct page *get_partial_node(struct kmem_cache_node *n) | 1187 | static struct page *get_partial_node(struct kmem_cache_node *n) |
987 | { | 1188 | { |
@@ -990,14 +1191,15 @@ static struct page *get_partial_node(struct kmem_cache_node *n) | |||
990 | /* | 1191 | /* |
991 | * Racy check. If we mistakenly see no partial slabs then we | 1192 | * Racy check. If we mistakenly see no partial slabs then we |
992 | * just allocate an empty slab. If we mistakenly try to get a | 1193 | * just allocate an empty slab. If we mistakenly try to get a |
993 | * partial slab then get_partials() will return NULL. | 1194 | * partial slab and there is none available then get_partials() |
1195 | * will return NULL. | ||
994 | */ | 1196 | */ |
995 | if (!n || !n->nr_partial) | 1197 | if (!n || !n->nr_partial) |
996 | return NULL; | 1198 | return NULL; |
997 | 1199 | ||
998 | spin_lock(&n->list_lock); | 1200 | spin_lock(&n->list_lock); |
999 | list_for_each_entry(page, &n->partial, lru) | 1201 | list_for_each_entry(page, &n->partial, lru) |
1000 | if (lock_and_del_slab(n, page)) | 1202 | if (lock_and_freeze_slab(n, page)) |
1001 | goto out; | 1203 | goto out; |
1002 | page = NULL; | 1204 | page = NULL; |
1003 | out: | 1205 | out: |
@@ -1006,8 +1208,7 @@ out: | |||
1006 | } | 1208 | } |
1007 | 1209 | ||
1008 | /* | 1210 | /* |
1009 | * Get a page from somewhere. Search in increasing NUMA | 1211 | * Get a page from somewhere. Search in increasing NUMA distances. |
1010 | * distances. | ||
1011 | */ | 1212 | */ |
1012 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | 1213 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) |
1013 | { | 1214 | { |
@@ -1017,24 +1218,22 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1017 | struct page *page; | 1218 | struct page *page; |
1018 | 1219 | ||
1019 | /* | 1220 | /* |
1020 | * The defrag ratio allows to configure the tradeoffs between | 1221 | * The defrag ratio allows a configuration of the tradeoffs between |
1021 | * inter node defragmentation and node local allocations. | 1222 | * inter node defragmentation and node local allocations. A lower |
1022 | * A lower defrag_ratio increases the tendency to do local | 1223 | * defrag_ratio increases the tendency to do local allocations |
1023 | * allocations instead of scanning throught the partial | 1224 | * instead of attempting to obtain partial slabs from other nodes. |
1024 | * lists on other nodes. | ||
1025 | * | 1225 | * |
1026 | * If defrag_ratio is set to 0 then kmalloc() always | 1226 | * If the defrag_ratio is set to 0 then kmalloc() always |
1027 | * returns node local objects. If its higher then kmalloc() | 1227 | * returns node local objects. If the ratio is higher then kmalloc() |
1028 | * may return off node objects in order to avoid fragmentation. | 1228 | * may return off node objects because partial slabs are obtained |
1029 | * | 1229 | * from other nodes and filled up. |
1030 | * A higher ratio means slabs may be taken from other nodes | ||
1031 | * thus reducing the number of partial slabs on those nodes. | ||
1032 | * | 1230 | * |
1033 | * If /sys/slab/xx/defrag_ratio is set to 100 (which makes | 1231 | * If /sys/slab/xx/defrag_ratio is set to 100 (which makes |
1034 | * defrag_ratio = 1000) then every (well almost) allocation | 1232 | * defrag_ratio = 1000) then every (well almost) allocation will |
1035 | * will first attempt to defrag slab caches on other nodes. This | 1233 | * first attempt to defrag slab caches on other nodes. This means |
1036 | * means scanning over all nodes to look for partial slabs which | 1234 | * scanning over all nodes to look for partial slabs which may be |
1037 | * may be a bit expensive to do on every slab allocation. | 1235 | * expensive if we do it every time we are trying to find a slab |
1236 | * with available objects. | ||
1038 | */ | 1237 | */ |
1039 | if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) | 1238 | if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) |
1040 | return NULL; | 1239 | return NULL; |
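
The check above uses the low ten bits of the cycle counter as a cheap random draw, so roughly defrag_ratio out of every 1024 allocations go looking for remote partial slabs and the rest stay node local. A standalone sketch of that gate with rand() standing in for get_cycles(); the function and variable names are invented:

    #include <stdio.h>
    #include <stdlib.h>

    /* Return 1 if this allocation should search remote nodes for partial slabs. */
    static int try_remote_defrag(unsigned int defrag_ratio)
    {
        if (!defrag_ratio || (unsigned int)rand() % 1024 > defrag_ratio)
            return 0;      /* stay node local */
        return 1;          /* scan other nodes */
    }

    int main(void)
    {
        unsigned int ratio = 100;   /* a sysfs defrag_ratio of 10 is stored as 100 internally */
        int remote = 0, trials = 100000;

        srand(1);
        for (int i = 0; i < trials; i++)
            remote += try_remote_defrag(ratio);

        /* Expect roughly ratio/1024 of the allocations (about 10%) to go remote. */
        printf("remote searches: %d of %d\n", remote, trials);
        return 0;
    }
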
@@ -1079,26 +1278,28 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | |||
1079 | * | 1278 | * |
1080 | * On exit the slab lock will have been dropped. | 1279 | * On exit the slab lock will have been dropped. |
1081 | */ | 1280 | */ |
1082 | static void putback_slab(struct kmem_cache *s, struct page *page) | 1281 | static void unfreeze_slab(struct kmem_cache *s, struct page *page) |
1083 | { | 1282 | { |
1084 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | 1283 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1085 | 1284 | ||
1285 | ClearSlabFrozen(page); | ||
1086 | if (page->inuse) { | 1286 | if (page->inuse) { |
1087 | 1287 | ||
1088 | if (page->freelist) | 1288 | if (page->freelist) |
1089 | add_partial(n, page); | 1289 | add_partial(n, page); |
1090 | else if (PageError(page) && (s->flags & SLAB_STORE_USER)) | 1290 | else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) |
1091 | add_full(n, page); | 1291 | add_full(n, page); |
1092 | slab_unlock(page); | 1292 | slab_unlock(page); |
1093 | 1293 | ||
1094 | } else { | 1294 | } else { |
1095 | if (n->nr_partial < MIN_PARTIAL) { | 1295 | if (n->nr_partial < MIN_PARTIAL) { |
1096 | /* | 1296 | /* |
1097 | * Adding an empty page to the partial slabs in order | 1297 | * Adding an empty slab to the partial slabs in order |
1098 | * to avoid page allocator overhead. This page needs to | 1298 | * to avoid page allocator overhead. This slab needs |
1099 | * come after all the others that are not fully empty | 1299 | * to come after the other slabs with objects in |
1100 | * in order to make sure that we do maximum | 1300 | * order to fill them up. That way the size of the |
1101 | * defragmentation. | 1301 | * partial list stays small. kmem_cache_shrink can |
1302 | * reclaim empty slabs from the partial list. | ||
1102 | */ | 1303 | */ |
1103 | add_partial_tail(n, page); | 1304 | add_partial_tail(n, page); |
1104 | slab_unlock(page); | 1305 | slab_unlock(page); |
@@ -1114,10 +1315,25 @@ static void putback_slab(struct kmem_cache *s, struct page *page) | |||
1114 | */ | 1315 | */ |
1115 | static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) | 1316 | static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) |
1116 | { | 1317 | { |
1117 | s->cpu_slab[cpu] = NULL; | 1318 | /* |
1118 | ClearPageActive(page); | 1319 | * Merge cpu freelist into freelist. Typically we get here |
1320 | * because both freelists are empty. So this is unlikely | ||
1321 | * to occur. | ||
1322 | */ | ||
1323 | while (unlikely(page->lockless_freelist)) { | ||
1324 | void **object; | ||
1325 | |||
1326 | /* Retrieve object from cpu_freelist */ | ||
1327 | object = page->lockless_freelist; | ||
1328 | page->lockless_freelist = page->lockless_freelist[page->offset]; | ||
1119 | 1329 | ||
1120 | putback_slab(s, page); | 1330 | /* And put onto the regular freelist */ |
1331 | object[page->offset] = page->freelist; | ||
1332 | page->freelist = object; | ||
1333 | page->inuse--; | ||
1334 | } | ||
1335 | s->cpu_slab[cpu] = NULL; | ||
1336 | unfreeze_slab(s, page); | ||
1121 | } | 1337 | } |
1122 | 1338 | ||
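
deactivate_slab() above drains the per-cpu lockless freelist by popping each object and pushing it onto the regular freelist, dropping inuse as objects become globally visible as free. The same splice in standalone form, with next pointers stored at offset 0 of each free object (all names and the object layout are illustrative):

    #include <stdio.h>
    #include <string.h>

    #define OBJ_SIZE 32
    #define NR_OBJS  6

    static void *get_fp(void *obj) { void *fp; memcpy(&fp, obj, sizeof(fp)); return fp; }
    static void set_fp(void *obj, void *fp) { memcpy(obj, &fp, sizeof(fp)); }

    int main(void)
    {
        static char slab[NR_OBJS * OBJ_SIZE];
        void *freelist = NULL;        /* regular freelist: objects 0 and 1          */
        void *lockless = NULL;        /* per-cpu lockless freelist: objects 2 and 3 */
        int inuse;                    /* objects not on the regular freelist        */
        int i;

        for (i = 1; i >= 0; i--) {
            set_fp(&slab[i * OBJ_SIZE], freelist);
            freelist = &slab[i * OBJ_SIZE];
        }
        for (i = 3; i >= 2; i--) {
            set_fp(&slab[i * OBJ_SIZE], lockless);
            lockless = &slab[i * OBJ_SIZE];
        }
        inuse = 4;    /* objects 2-5: lockless-freelist objects still count as in use */

        /* Merge: pop from the lockless list, push onto the regular list. */
        while (lockless) {
            void *object = lockless;

            lockless = get_fp(object);
            set_fp(object, freelist);
            freelist = object;
            inuse--;
        }

        printf("inuse after merge: %d\n", inuse);   /* 2: only objects 4 and 5 remain allocated */
        return 0;
    }
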
1123 | static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) | 1339 | static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) |
@@ -1160,47 +1376,46 @@ static void flush_all(struct kmem_cache *s) | |||
1160 | } | 1376 | } |
1161 | 1377 | ||
1162 | /* | 1378 | /* |
1163 | * slab_alloc is optimized to only modify two cachelines on the fast path | 1379 | * Slow path. The lockless freelist is empty or we need to perform |
1164 | * (aside from the stack): | 1380 | * debugging duties. |
1381 | * | ||
1382 | * Interrupts are disabled. | ||
1165 | * | 1383 | * |
1166 | * 1. The page struct | 1384 | * Processing is still very fast if new objects have been freed to the |
1167 | * 2. The first cacheline of the object to be allocated. | 1385 | * regular freelist. In that case we simply take over the regular freelist |
1386 | * as the lockless freelist and zap the regular freelist. | ||
1168 | * | 1387 | * |
1169 | * The only cache lines that are read (apart from code) is the | 1388 | * If that is not working then we fall back to the partial lists. We take the |
1170 | * per cpu array in the kmem_cache struct. | 1389 | * first element of the freelist as the object to allocate now and move the |
1390 | * rest of the freelist to the lockless freelist. | ||
1171 | * | 1391 | * |
1172 | * Fastpath is not possible if we need to get a new slab or have | 1392 | * And if we were unable to get a new slab from the partial slab lists then |
1173 | * debugging enabled (which means all slabs are marked with PageError) | 1393 | * we need to allocate a new slab. This is slowest path since we may sleep. |
1174 | */ | 1394 | */ |
1175 | static void *slab_alloc(struct kmem_cache *s, | 1395 | static void *__slab_alloc(struct kmem_cache *s, |
1176 | gfp_t gfpflags, int node, void *addr) | 1396 | gfp_t gfpflags, int node, void *addr, struct page *page) |
1177 | { | 1397 | { |
1178 | struct page *page; | ||
1179 | void **object; | 1398 | void **object; |
1180 | unsigned long flags; | 1399 | int cpu = smp_processor_id(); |
1181 | int cpu; | ||
1182 | 1400 | ||
1183 | local_irq_save(flags); | ||
1184 | cpu = smp_processor_id(); | ||
1185 | page = s->cpu_slab[cpu]; | ||
1186 | if (!page) | 1401 | if (!page) |
1187 | goto new_slab; | 1402 | goto new_slab; |
1188 | 1403 | ||
1189 | slab_lock(page); | 1404 | slab_lock(page); |
1190 | if (unlikely(node != -1 && page_to_nid(page) != node)) | 1405 | if (unlikely(node != -1 && page_to_nid(page) != node)) |
1191 | goto another_slab; | 1406 | goto another_slab; |
1192 | redo: | 1407 | load_freelist: |
1193 | object = page->freelist; | 1408 | object = page->freelist; |
1194 | if (unlikely(!object)) | 1409 | if (unlikely(!object)) |
1195 | goto another_slab; | 1410 | goto another_slab; |
1196 | if (unlikely(PageError(page))) | 1411 | if (unlikely(SlabDebug(page))) |
1197 | goto debug; | 1412 | goto debug; |
1198 | 1413 | ||
1199 | have_object: | 1414 | object = page->freelist; |
1200 | page->inuse++; | 1415 | page->lockless_freelist = object[page->offset]; |
1201 | page->freelist = object[page->offset]; | 1416 | page->inuse = s->objects; |
1417 | page->freelist = NULL; | ||
1202 | slab_unlock(page); | 1418 | slab_unlock(page); |
1203 | local_irq_restore(flags); | ||
1204 | return object; | 1419 | return object; |
1205 | 1420 | ||
1206 | another_slab: | 1421 | another_slab: |
@@ -1208,11 +1423,9 @@ another_slab: | |||
1208 | 1423 | ||
1209 | new_slab: | 1424 | new_slab: |
1210 | page = get_partial(s, gfpflags, node); | 1425 | page = get_partial(s, gfpflags, node); |
1211 | if (likely(page)) { | 1426 | if (page) { |
1212 | have_slab: | ||
1213 | s->cpu_slab[cpu] = page; | 1427 | s->cpu_slab[cpu] = page; |
1214 | SetPageActive(page); | 1428 | goto load_freelist; |
1215 | goto redo; | ||
1216 | } | 1429 | } |
1217 | 1430 | ||
1218 | page = new_slab(s, gfpflags, node); | 1431 | page = new_slab(s, gfpflags, node); |
@@ -1220,9 +1433,11 @@ have_slab: | |||
1220 | cpu = smp_processor_id(); | 1433 | cpu = smp_processor_id(); |
1221 | if (s->cpu_slab[cpu]) { | 1434 | if (s->cpu_slab[cpu]) { |
1222 | /* | 1435 | /* |
1223 | * Someone else populated the cpu_slab while we enabled | 1436 | * Someone else populated the cpu_slab while we |
1224 | * interrupts, or we have got scheduled on another cpu. | 1437 | * enabled interrupts, or we have gotten scheduled |
1225 | * The page may not be on the requested node. | 1438 | * on another cpu. The page may not be on the |
1439 | * requested node even if __GFP_THISNODE was | ||
1440 | * specified. So we need to recheck. | ||
1226 | */ | 1441 | */ |
1227 | if (node == -1 || | 1442 | if (node == -1 || |
1228 | page_to_nid(s->cpu_slab[cpu]) == node) { | 1443 | page_to_nid(s->cpu_slab[cpu]) == node) { |
@@ -1233,29 +1448,58 @@ have_slab: | |||
1233 | discard_slab(s, page); | 1448 | discard_slab(s, page); |
1234 | page = s->cpu_slab[cpu]; | 1449 | page = s->cpu_slab[cpu]; |
1235 | slab_lock(page); | 1450 | slab_lock(page); |
1236 | goto redo; | 1451 | goto load_freelist; |
1237 | } | 1452 | } |
1238 | /* Dump the current slab */ | 1453 | /* New slab does not fit our expectations */ |
1239 | flush_slab(s, s->cpu_slab[cpu], cpu); | 1454 | flush_slab(s, s->cpu_slab[cpu], cpu); |
1240 | } | 1455 | } |
1241 | slab_lock(page); | 1456 | slab_lock(page); |
1242 | goto have_slab; | 1457 | SetSlabFrozen(page); |
1458 | s->cpu_slab[cpu] = page; | ||
1459 | goto load_freelist; | ||
1243 | } | 1460 | } |
1244 | local_irq_restore(flags); | ||
1245 | return NULL; | 1461 | return NULL; |
1246 | debug: | 1462 | debug: |
1247 | if (!alloc_object_checks(s, page, object)) | 1463 | object = page->freelist; |
1464 | if (!alloc_debug_processing(s, page, object, addr)) | ||
1248 | goto another_slab; | 1465 | goto another_slab; |
1249 | if (s->flags & SLAB_STORE_USER) | 1466 | |
1250 | set_track(s, object, TRACK_ALLOC, addr); | 1467 | page->inuse++; |
1251 | if (s->flags & SLAB_TRACE) { | 1468 | page->freelist = object[page->offset]; |
1252 | printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n", | 1469 | slab_unlock(page); |
1253 | s->name, object, page->inuse, | 1470 | return object; |
1254 | page->freelist); | 1471 | } |
1255 | dump_stack(); | 1472 | |
1473 | /* | ||
1474 | * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) | ||
1475 | * have the fastpath folded into their functions. So no function call | ||
1476 | * overhead for requests that can be satisfied on the fastpath. | ||
1477 | * | ||
1478 | * The fastpath works by first checking if the lockless freelist can be used. | ||
1479 | * If not then __slab_alloc is called for slow processing. | ||
1480 | * | ||
1481 | * Otherwise we can simply pick the next object from the lockless free list. | ||
1482 | */ | ||
1483 | static void __always_inline *slab_alloc(struct kmem_cache *s, | ||
1484 | gfp_t gfpflags, int node, void *addr) | ||
1485 | { | ||
1486 | struct page *page; | ||
1487 | void **object; | ||
1488 | unsigned long flags; | ||
1489 | |||
1490 | local_irq_save(flags); | ||
1491 | page = s->cpu_slab[smp_processor_id()]; | ||
1492 | if (unlikely(!page || !page->lockless_freelist || | ||
1493 | (node != -1 && page_to_nid(page) != node))) | ||
1494 | |||
1495 | object = __slab_alloc(s, gfpflags, node, addr, page); | ||
1496 | |||
1497 | else { | ||
1498 | object = page->lockless_freelist; | ||
1499 | page->lockless_freelist = object[page->offset]; | ||
1256 | } | 1500 | } |
1257 | init_object(s, object, 1); | 1501 | local_irq_restore(flags); |
1258 | goto have_object; | 1502 | return object; |
1259 | } | 1503 | } |
1260 | 1504 | ||
1261 | void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | 1505 | void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) |
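
Taken together, the new __slab_alloc()/slab_alloc() pair implements a two-level allocator: the inlined fastpath only pops the per-cpu lockless freelist, and the slow path refills that list by taking over the slab's regular freelist (or a fresh slab) wholesale. A compact user-space sketch of that idea follows; the toy_ names are invented for illustration and all locking, interrupt and NUMA handling is omitted.

/* Sketch: fastpath pops the lockless list, slowpath takes over the rest. */
#include <stdio.h>

struct toy_slab {
	void *lockless_freelist;   /* consumed without the slab lock */
	void *regular_freelist;    /* refilled under the slab lock   */
	int inuse, objects;
};

static void *toy_slow_alloc(struct toy_slab *s)    /* ~__slab_alloc */
{
	void **object = s->regular_freelist;

	if (!object)
		return NULL;               /* would go get a new slab here */
	s->lockless_freelist = *object;    /* rest becomes the fast list   */
	s->regular_freelist = NULL;
	s->inuse = s->objects;             /* all objects now handed out   */
	return object;
}

static void *toy_alloc(struct toy_slab *s)         /* ~slab_alloc */
{
	void **object = s->lockless_freelist;

	if (!object)
		return toy_slow_alloc(s);
	s->lockless_freelist = *object;    /* fastpath: just advance */
	return object;
}

int main(void)
{
	void *slots[3] = { &slots[1], &slots[2], NULL };  /* chained objects */
	struct toy_slab slab = {
		.regular_freelist = &slots[0], .objects = 3,
	};

	for (int i = 0; i < 3; i++)
		printf("alloc %d -> %p\n", i, toy_alloc(&slab));
	return 0;
}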
@@ -1273,33 +1517,29 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); | |||
1273 | #endif | 1517 | #endif |
1274 | 1518 | ||
1275 | /* | 1519 | /* |
1276 | * The fastpath only writes the cacheline of the page struct and the first | 1520 | * Slow path handling. This may still be called frequently since objects |
1277 | * cacheline of the object. | 1521 | * have a longer lifetime than the cpu slabs in most processing loads. |
1278 | * | 1522 | * |
1279 | * No special cachelines need to be read | 1523 | * So we still attempt to reduce cache line usage. Just take the slab |
1524 | * lock and free the item. If there is no additional partial page | ||
1525 | * handling required then we can return immediately. | ||
1280 | */ | 1526 | */ |
1281 | static void slab_free(struct kmem_cache *s, struct page *page, | 1527 | static void __slab_free(struct kmem_cache *s, struct page *page, |
1282 | void *x, void *addr) | 1528 | void *x, void *addr) |
1283 | { | 1529 | { |
1284 | void *prior; | 1530 | void *prior; |
1285 | void **object = (void *)x; | 1531 | void **object = (void *)x; |
1286 | unsigned long flags; | ||
1287 | 1532 | ||
1288 | local_irq_save(flags); | ||
1289 | slab_lock(page); | 1533 | slab_lock(page); |
1290 | 1534 | ||
1291 | if (unlikely(PageError(page))) | 1535 | if (unlikely(SlabDebug(page))) |
1292 | goto debug; | 1536 | goto debug; |
1293 | checks_ok: | 1537 | checks_ok: |
1294 | prior = object[page->offset] = page->freelist; | 1538 | prior = object[page->offset] = page->freelist; |
1295 | page->freelist = object; | 1539 | page->freelist = object; |
1296 | page->inuse--; | 1540 | page->inuse--; |
1297 | 1541 | ||
1298 | if (unlikely(PageActive(page))) | 1542 | if (unlikely(SlabFrozen(page))) |
1299 | /* | ||
1300 | * Cpu slabs are never on partial lists and are | ||
1301 | * never freed. | ||
1302 | */ | ||
1303 | goto out_unlock; | 1543 | goto out_unlock; |
1304 | 1544 | ||
1305 | if (unlikely(!page->inuse)) | 1545 | if (unlikely(!page->inuse)) |
@@ -1315,39 +1555,53 @@ checks_ok: | |||
1315 | 1555 | ||
1316 | out_unlock: | 1556 | out_unlock: |
1317 | slab_unlock(page); | 1557 | slab_unlock(page); |
1318 | local_irq_restore(flags); | ||
1319 | return; | 1558 | return; |
1320 | 1559 | ||
1321 | slab_empty: | 1560 | slab_empty: |
1322 | if (prior) | 1561 | if (prior) |
1323 | /* | 1562 | /* |
1324 | * Slab on the partial list. | 1563 | * Slab still on the partial list. |
1325 | */ | 1564 | */ |
1326 | remove_partial(s, page); | 1565 | remove_partial(s, page); |
1327 | 1566 | ||
1328 | slab_unlock(page); | 1567 | slab_unlock(page); |
1329 | discard_slab(s, page); | 1568 | discard_slab(s, page); |
1330 | local_irq_restore(flags); | ||
1331 | return; | 1569 | return; |
1332 | 1570 | ||
1333 | debug: | 1571 | debug: |
1334 | if (!free_object_checks(s, page, x)) | 1572 | if (!free_debug_processing(s, page, x, addr)) |
1335 | goto out_unlock; | 1573 | goto out_unlock; |
1336 | if (!PageActive(page) && !page->freelist) | ||
1337 | remove_full(s, page); | ||
1338 | if (s->flags & SLAB_STORE_USER) | ||
1339 | set_track(s, x, TRACK_FREE, addr); | ||
1340 | if (s->flags & SLAB_TRACE) { | ||
1341 | printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n", | ||
1342 | s->name, object, page->inuse, | ||
1343 | page->freelist); | ||
1344 | print_section("Object", (void *)object, s->objsize); | ||
1345 | dump_stack(); | ||
1346 | } | ||
1347 | init_object(s, object, 0); | ||
1348 | goto checks_ok; | 1574 | goto checks_ok; |
1349 | } | 1575 | } |
1350 | 1576 | ||
1577 | /* | ||
1578 | * Fastpath with forced inlining to produce a kfree and kmem_cache_free that | ||
1579 | * can perform fastpath freeing without additional function calls. | ||
1580 | * | ||
1581 | * The fastpath is only possible if we are freeing to the current cpu slab | ||
1582 | * of this processor. This is typically the case if we have just allocated | ||
1583 | * the item before. | ||
1584 | * | ||
1585 | * If fastpath is not possible then fall back to __slab_free where we deal | ||
1586 | * with all sorts of special processing. | ||
1587 | */ | ||
1588 | static void __always_inline slab_free(struct kmem_cache *s, | ||
1589 | struct page *page, void *x, void *addr) | ||
1590 | { | ||
1591 | void **object = (void *)x; | ||
1592 | unsigned long flags; | ||
1593 | |||
1594 | local_irq_save(flags); | ||
1595 | if (likely(page == s->cpu_slab[smp_processor_id()] && | ||
1596 | !SlabDebug(page))) { | ||
1597 | object[page->offset] = page->lockless_freelist; | ||
1598 | page->lockless_freelist = object; | ||
1599 | } else | ||
1600 | __slab_free(s, page, x, addr); | ||
1601 | |||
1602 | local_irq_restore(flags); | ||
1603 | } | ||
1604 | |||
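
The freeing side mirrors the same split: slab_free() pushes onto the lockless freelist when the object belongs to the current cpu slab and debugging is off, while __slab_free() takes the slab lock and links the object into the regular freelist. A user-space sketch under the same simplifications as the allocation example above (invented toy_ names, no locking or partial-list handling):

/* Sketch: free to the lockless list if this is the cpu slab, else slow path. */
#include <stdio.h>

struct toy_slab {
	void *lockless_freelist;
	void *regular_freelist;
	int inuse;
};

static void toy_free(struct toy_slab *cpu_slab, struct toy_slab *slab,
		     void *x)
{
	void **object = x;

	if (slab == cpu_slab) {			/* fastpath: no slab lock */
		*object = slab->lockless_freelist;
		slab->lockless_freelist = object;
	} else {				/* ~__slab_free */
		*object = slab->regular_freelist;
		slab->regular_freelist = object;
		slab->inuse--;
	}
}

int main(void)
{
	void *obj_a, *obj_b;
	struct toy_slab cpu = { .inuse = 1 }, other = { .inuse = 1 };

	toy_free(&cpu, &cpu, &obj_a);	/* lands on the lockless freelist */
	toy_free(&cpu, &other, &obj_b);	/* lands on the regular freelist  */
	printf("other.inuse=%d\n", other.inuse);	/* prints 0 */
	return 0;
}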
1351 | void kmem_cache_free(struct kmem_cache *s, void *x) | 1605 | void kmem_cache_free(struct kmem_cache *s, void *x) |
1352 | { | 1606 | { |
1353 | struct page *page; | 1607 | struct page *page; |
@@ -1370,22 +1624,16 @@ static struct page *get_object_page(const void *x) | |||
1370 | } | 1624 | } |
1371 | 1625 | ||
1372 | /* | 1626 | /* |
1373 | * kmem_cache_open produces objects aligned at "size" and the first object | 1627 | * Object placement in a slab is made very easy because we always start at |
1374 | * is placed at offset 0 in the slab (We have no metainformation on the | 1628 | * offset 0. If we tune the size of the object to the alignment then we can |
1375 | * slab, all slabs are in essence "off slab"). | 1629 | * get the required alignment by putting one properly sized object after |
1376 | * | 1630 | * another. |
1377 | * In order to get the desired alignment one just needs to align the | ||
1378 | * size. | ||
1379 | * | 1631 | * |
1380 | * Notice that the allocation order determines the sizes of the per cpu | 1632 | * Notice that the allocation order determines the sizes of the per cpu |
1381 | * caches. Each processor has always one slab available for allocations. | 1633 | * caches. Each processor has always one slab available for allocations. |
1382 | * Increasing the allocation order reduces the number of times that slabs | 1634 | * Increasing the allocation order reduces the number of times that slabs |
1383 | * must be moved on and off the partial lists and therefore may influence | 1635 | * must be moved on and off the partial lists and is therefore a factor in |
1384 | * locking overhead. | 1636 | * locking overhead. |
1385 | * | ||
1386 | * The offset is used to relocate the free list link in each object. It is | ||
1387 | * therefore possible to move the free list link behind the object. This | ||
1388 | * is necessary for RCU to work properly and also useful for debugging. | ||
1389 | */ | 1637 | */ |
1390 | 1638 | ||
1391 | /* | 1639 | /* |
@@ -1396,76 +1644,110 @@ static struct page *get_object_page(const void *x) | |||
1396 | */ | 1644 | */ |
1397 | static int slub_min_order; | 1645 | static int slub_min_order; |
1398 | static int slub_max_order = DEFAULT_MAX_ORDER; | 1646 | static int slub_max_order = DEFAULT_MAX_ORDER; |
1399 | |||
1400 | /* | ||
1401 | * Minimum number of objects per slab. This is necessary in order to | ||
1402 | * reduce locking overhead. Similar to the queue size in SLAB. | ||
1403 | */ | ||
1404 | static int slub_min_objects = DEFAULT_MIN_OBJECTS; | 1647 | static int slub_min_objects = DEFAULT_MIN_OBJECTS; |
1405 | 1648 | ||
1406 | /* | 1649 | /* |
1407 | * Merge control. If this is set then no merging of slab caches will occur. | 1650 | * Merge control. If this is set then no merging of slab caches will occur. |
1651 | * (Could be removed. This was introduced to pacify the merge skeptics.) | ||
1408 | */ | 1652 | */ |
1409 | static int slub_nomerge; | 1653 | static int slub_nomerge; |
1410 | 1654 | ||
1411 | /* | 1655 | /* |
1412 | * Debug settings: | ||
1413 | */ | ||
1414 | static int slub_debug; | ||
1415 | |||
1416 | static char *slub_debug_slabs; | ||
1417 | |||
1418 | /* | ||
1419 | * Calculate the order of allocation given an slab object size. | 1656 | * Calculate the order of allocation given an slab object size. |
1420 | * | 1657 | * |
1421 | * The order of allocation has significant impact on other elements | 1658 | * The order of allocation has significant impact on performance and other |
1422 | * of the system. Generally order 0 allocations should be preferred | 1659 | * system components. Generally order 0 allocations should be preferred since |
1423 | * since they do not cause fragmentation in the page allocator. Larger | 1660 | * order 0 does not cause fragmentation in the page allocator. Larger objects |
1424 | * objects may have problems with order 0 because there may be too much | 1661 | * can be problematic to put into order 0 slabs because there may be too much |
1425 | * space left unused in a slab. We go to a higher order if more than 1/8th | 1662 | * unused space left. We go to a higher order if more than 1/8th of the slab |
1426 | * of the slab would be wasted. | 1663 | * would be wasted. |
1427 | * | 1664 | * |
1428 | * In order to reach satisfactory performance we must ensure that | 1665 | * In order to reach satisfactory performance we must ensure that a minimum |
1429 | * a minimum number of objects is in one slab. Otherwise we may | 1666 | * number of objects is in one slab. Otherwise we may generate too much |
1430 | * generate too much activity on the partial lists. This is less a | 1667 | * activity on the partial lists which requires taking the list_lock. This is |
1431 | * concern for large slabs though. slub_max_order specifies the order | 1668 | * less a concern for large slabs though which are rarely used. |
1432 | * where we begin to stop considering the number of objects in a slab. | ||
1433 | * | 1669 | * |
1434 | * Higher order allocations also allow the placement of more objects | 1670 | * slub_max_order specifies the order where we begin to stop considering the |
1435 | * in a slab and thereby reduce object handling overhead. If the user | 1671 | * number of objects in a slab as critical. If we reach slub_max_order then |
1436 | * has requested a higher mininum order then we start with that one | 1672 | * we try to keep the page order as low as possible. So we accept more waste |
1437 | * instead of zero. | 1673 | * of space in favor of a small page order. |
1674 | * | ||
1675 | * Higher order allocations also allow the placement of more objects in a | ||
1676 | * slab and thereby reduce object handling overhead. If the user has | ||
1677 | * requested a higher minimum order then we start with that one instead of | ||
1678 | * the smallest order which will fit the object. | ||
1438 | */ | 1679 | */ |
1439 | static int calculate_order(int size) | 1680 | static inline int slab_order(int size, int min_objects, |
1681 | int max_order, int fract_leftover) | ||
1440 | { | 1682 | { |
1441 | int order; | 1683 | int order; |
1442 | int rem; | 1684 | int rem; |
1443 | 1685 | ||
1444 | for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT); | 1686 | for (order = max(slub_min_order, |
1445 | order < MAX_ORDER; order++) { | 1687 | fls(min_objects * size - 1) - PAGE_SHIFT); |
1446 | unsigned long slab_size = PAGE_SIZE << order; | 1688 | order <= max_order; order++) { |
1447 | 1689 | ||
1448 | if (slub_max_order > order && | 1690 | unsigned long slab_size = PAGE_SIZE << order; |
1449 | slab_size < slub_min_objects * size) | ||
1450 | continue; | ||
1451 | 1691 | ||
1452 | if (slab_size < size) | 1692 | if (slab_size < min_objects * size) |
1453 | continue; | 1693 | continue; |
1454 | 1694 | ||
1455 | rem = slab_size % size; | 1695 | rem = slab_size % size; |
1456 | 1696 | ||
1457 | if (rem <= (PAGE_SIZE << order) / 8) | 1697 | if (rem <= slab_size / fract_leftover) |
1458 | break; | 1698 | break; |
1459 | 1699 | ||
1460 | } | 1700 | } |
1461 | if (order >= MAX_ORDER) | 1701 | |
1462 | return -E2BIG; | ||
1463 | return order; | 1702 | return order; |
1464 | } | 1703 | } |
1465 | 1704 | ||
1705 | static inline int calculate_order(int size) | ||
1706 | { | ||
1707 | int order; | ||
1708 | int min_objects; | ||
1709 | int fraction; | ||
1710 | |||
1711 | /* | ||
1712 | * Attempt to find best configuration for a slab. This | ||
1713 | * works by first attempting to generate a layout with | ||
1714 | * the best configuration and backing off gradually. | ||
1715 | * | ||
1716 | * First we reduce the acceptable waste in a slab. Then | ||
1717 | * we reduce the minimum objects required in a slab. | ||
1718 | */ | ||
1719 | min_objects = slub_min_objects; | ||
1720 | while (min_objects > 1) { | ||
1721 | fraction = 8; | ||
1722 | while (fraction >= 4) { | ||
1723 | order = slab_order(size, min_objects, | ||
1724 | slub_max_order, fraction); | ||
1725 | if (order <= slub_max_order) | ||
1726 | return order; | ||
1727 | fraction /= 2; | ||
1728 | } | ||
1729 | min_objects /= 2; | ||
1730 | } | ||
1731 | |||
1732 | /* | ||
1733 | * We were unable to place multiple objects in a slab. Now | ||
1734 | * lets see if we can place a single object there. | ||
1735 | */ | ||
1736 | order = slab_order(size, 1, slub_max_order, 1); | ||
1737 | if (order <= slub_max_order) | ||
1738 | return order; | ||
1739 | |||
1740 | /* | ||
1741 | * Doh this slab cannot be placed using slub_max_order. | ||
1742 | */ | ||
1743 | order = slab_order(size, 1, MAX_ORDER, 1); | ||
1744 | if (order <= MAX_ORDER) | ||
1745 | return order; | ||
1746 | return -ENOSYS; | ||
1747 | } | ||
1748 | |||
1466 | /* | 1749 | /* |
1467 | * Function to figure out which alignment to use from the | 1750 | * Figure out what the alignment of the objects will be. |
1468 | * various ways of specifying it. | ||
1469 | */ | 1751 | */ |
1470 | static unsigned long calculate_alignment(unsigned long flags, | 1752 | static unsigned long calculate_alignment(unsigned long flags, |
1471 | unsigned long align, unsigned long size) | 1753 | unsigned long align, unsigned long size) |
@@ -1480,8 +1762,8 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
1480 | * then use it. | 1762 | * then use it. |
1481 | */ | 1763 | */ |
1482 | if ((flags & SLAB_HWCACHE_ALIGN) && | 1764 | if ((flags & SLAB_HWCACHE_ALIGN) && |
1483 | size > L1_CACHE_BYTES / 2) | 1765 | size > cache_line_size() / 2) |
1484 | return max_t(unsigned long, align, L1_CACHE_BYTES); | 1766 | return max_t(unsigned long, align, cache_line_size()); |
1485 | 1767 | ||
1486 | if (align < ARCH_SLAB_MINALIGN) | 1768 | if (align < ARCH_SLAB_MINALIGN) |
1487 | return ARCH_SLAB_MINALIGN; | 1769 | return ARCH_SLAB_MINALIGN; |
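
The hardware-alignment rule shown above only rounds an object up to a full cache line when the object is already bigger than half a line, so that several small objects can still share one line. A tiny illustration, assuming a 64-byte line where the kernel would query cache_line_size() at boot:

/* Illustrative; toy_alignment is not a kernel function. */
#include <stdio.h>

#define CACHE_LINE 64UL

static unsigned long toy_alignment(int hwcache_align, unsigned long align,
				   unsigned long size)
{
	if (hwcache_align && size > CACHE_LINE / 2)
		return align > CACHE_LINE ? align : CACHE_LINE;
	return align;
}

int main(void)
{
	printf("40-byte object: align %lu\n", toy_alignment(1, 8, 40)); /* 64 */
	printf("16-byte object: align %lu\n", toy_alignment(1, 8, 16)); /*  8 */
	return 0;
}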
@@ -1525,7 +1807,7 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag | |||
1525 | page->freelist = get_freepointer(kmalloc_caches, n); | 1807 | page->freelist = get_freepointer(kmalloc_caches, n); |
1526 | page->inuse++; | 1808 | page->inuse++; |
1527 | kmalloc_caches->node[node] = n; | 1809 | kmalloc_caches->node[node] = n; |
1528 | init_object(kmalloc_caches, n, 1); | 1810 | setup_object_debug(kmalloc_caches, page, n); |
1529 | init_kmem_cache_node(n); | 1811 | init_kmem_cache_node(n); |
1530 | atomic_long_inc(&n->nr_slabs); | 1812 | atomic_long_inc(&n->nr_slabs); |
1531 | add_partial(n, page); | 1813 | add_partial(n, page); |
@@ -1607,7 +1889,7 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1607 | * then we should never poison the object itself. | 1889 | * then we should never poison the object itself. |
1608 | */ | 1890 | */ |
1609 | if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && | 1891 | if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && |
1610 | !s->ctor && !s->dtor) | 1892 | !s->ctor) |
1611 | s->flags |= __OBJECT_POISON; | 1893 | s->flags |= __OBJECT_POISON; |
1612 | else | 1894 | else |
1613 | s->flags &= ~__OBJECT_POISON; | 1895 | s->flags &= ~__OBJECT_POISON; |
@@ -1619,24 +1901,24 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1619 | */ | 1901 | */ |
1620 | size = ALIGN(size, sizeof(void *)); | 1902 | size = ALIGN(size, sizeof(void *)); |
1621 | 1903 | ||
1904 | #ifdef CONFIG_SLUB_DEBUG | ||
1622 | /* | 1905 | /* |
1623 | * If we are redzoning then check if there is some space between the | 1906 | * If we are Redzoning then check if there is some space between the |
1624 | * end of the object and the free pointer. If not then add an | 1907 | * end of the object and the free pointer. If not then add an |
1625 | * additional word, so that we can establish a redzone between | 1908 | * additional word to have some bytes to store Redzone information. |
1626 | * the object and the freepointer to be able to check for overwrites. | ||
1627 | */ | 1909 | */ |
1628 | if ((flags & SLAB_RED_ZONE) && size == s->objsize) | 1910 | if ((flags & SLAB_RED_ZONE) && size == s->objsize) |
1629 | size += sizeof(void *); | 1911 | size += sizeof(void *); |
1912 | #endif | ||
1630 | 1913 | ||
1631 | /* | 1914 | /* |
1632 | * With that we have determined how much of the slab is in actual | 1915 | * With that we have determined the number of bytes in actual use |
1633 | * use by the object. This is the potential offset to the free | 1916 | * by the object. This is the potential offset to the free pointer. |
1634 | * pointer. | ||
1635 | */ | 1917 | */ |
1636 | s->inuse = size; | 1918 | s->inuse = size; |
1637 | 1919 | ||
1638 | if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || | 1920 | if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || |
1639 | s->ctor || s->dtor)) { | 1921 | s->ctor)) { |
1640 | /* | 1922 | /* |
1641 | * Relocate free pointer after the object if it is not | 1923 | * Relocate free pointer after the object if it is not |
1642 | * permitted to overwrite the first word of the object on | 1924 | * permitted to overwrite the first word of the object on |
@@ -1649,6 +1931,7 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1649 | size += sizeof(void *); | 1931 | size += sizeof(void *); |
1650 | } | 1932 | } |
1651 | 1933 | ||
1934 | #ifdef CONFIG_SLUB_DEBUG | ||
1652 | if (flags & SLAB_STORE_USER) | 1935 | if (flags & SLAB_STORE_USER) |
1653 | /* | 1936 | /* |
1654 | * Need to store information about allocs and frees after | 1937 | * Need to store information about allocs and frees after |
@@ -1656,7 +1939,7 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1656 | */ | 1939 | */ |
1657 | size += 2 * sizeof(struct track); | 1940 | size += 2 * sizeof(struct track); |
1658 | 1941 | ||
1659 | if (flags & DEBUG_DEFAULT_FLAGS) | 1942 | if (flags & SLAB_RED_ZONE) |
1660 | /* | 1943 | /* |
1661 | * Add some empty padding so that we can catch | 1944 | * Add some empty padding so that we can catch |
1662 | * overwrites from earlier objects rather than let | 1945 | * overwrites from earlier objects rather than let |
@@ -1665,10 +1948,12 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1665 | * of the object. | 1948 | * of the object. |
1666 | */ | 1949 | */ |
1667 | size += sizeof(void *); | 1950 | size += sizeof(void *); |
1951 | #endif | ||
1952 | |||
1668 | /* | 1953 | /* |
1669 | * Determine the alignment based on various parameters that the | 1954 | * Determine the alignment based on various parameters that the |
1670 | * user specified (this is unecessarily complex due to the attempt | 1955 | * user specified and the dynamic determination of cache line size |
1671 | * to be compatible with SLAB. Should be cleaned up some day). | 1956 | * on bootup. |
1672 | */ | 1957 | */ |
1673 | align = calculate_alignment(flags, align, s->objsize); | 1958 | align = calculate_alignment(flags, align, s->objsize); |
1674 | 1959 | ||
@@ -1700,62 +1985,18 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1700 | 1985 | ||
1701 | } | 1986 | } |
1702 | 1987 | ||
1703 | static int __init finish_bootstrap(void) | ||
1704 | { | ||
1705 | struct list_head *h; | ||
1706 | int err; | ||
1707 | |||
1708 | slab_state = SYSFS; | ||
1709 | |||
1710 | list_for_each(h, &slab_caches) { | ||
1711 | struct kmem_cache *s = | ||
1712 | container_of(h, struct kmem_cache, list); | ||
1713 | |||
1714 | err = sysfs_slab_add(s); | ||
1715 | BUG_ON(err); | ||
1716 | } | ||
1717 | return 0; | ||
1718 | } | ||
1719 | |||
1720 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | 1988 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, |
1721 | const char *name, size_t size, | 1989 | const char *name, size_t size, |
1722 | size_t align, unsigned long flags, | 1990 | size_t align, unsigned long flags, |
1723 | void (*ctor)(void *, struct kmem_cache *, unsigned long), | 1991 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) |
1724 | void (*dtor)(void *, struct kmem_cache *, unsigned long)) | ||
1725 | { | 1992 | { |
1726 | memset(s, 0, kmem_size); | 1993 | memset(s, 0, kmem_size); |
1727 | s->name = name; | 1994 | s->name = name; |
1728 | s->ctor = ctor; | 1995 | s->ctor = ctor; |
1729 | s->dtor = dtor; | ||
1730 | s->objsize = size; | 1996 | s->objsize = size; |
1731 | s->flags = flags; | 1997 | s->flags = flags; |
1732 | s->align = align; | 1998 | s->align = align; |
1733 | 1999 | kmem_cache_open_debug_check(s); | |
1734 | /* | ||
1735 | * The page->offset field is only 16 bit wide. This is an offset | ||
1736 | * in units of words from the beginning of an object. If the slab | ||
1737 | * size is bigger then we cannot move the free pointer behind the | ||
1738 | * object anymore. | ||
1739 | * | ||
1740 | * On 32 bit platforms the limit is 256k. On 64bit platforms | ||
1741 | * the limit is 512k. | ||
1742 | * | ||
1743 | * Debugging or ctor/dtors may create a need to move the free | ||
1744 | * pointer. Fail if this happens. | ||
1745 | */ | ||
1746 | if (s->size >= 65535 * sizeof(void *)) { | ||
1747 | BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | | ||
1748 | SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); | ||
1749 | BUG_ON(ctor || dtor); | ||
1750 | } | ||
1751 | else | ||
1752 | /* | ||
1753 | * Enable debugging if selected on the kernel commandline. | ||
1754 | */ | ||
1755 | if (slub_debug && (!slub_debug_slabs || | ||
1756 | strncmp(slub_debug_slabs, name, | ||
1757 | strlen(slub_debug_slabs)) == 0)) | ||
1758 | s->flags |= slub_debug; | ||
1759 | 2000 | ||
1760 | if (!calculate_sizes(s)) | 2001 | if (!calculate_sizes(s)) |
1761 | goto error; | 2002 | goto error; |
@@ -1783,7 +2024,6 @@ EXPORT_SYMBOL(kmem_cache_open); | |||
1783 | int kmem_ptr_validate(struct kmem_cache *s, const void *object) | 2024 | int kmem_ptr_validate(struct kmem_cache *s, const void *object) |
1784 | { | 2025 | { |
1785 | struct page * page; | 2026 | struct page * page; |
1786 | void *addr; | ||
1787 | 2027 | ||
1788 | page = get_object_page(object); | 2028 | page = get_object_page(object); |
1789 | 2029 | ||
@@ -1791,13 +2031,7 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object) | |||
1791 | /* No slab or wrong slab */ | 2031 | /* No slab or wrong slab */ |
1792 | return 0; | 2032 | return 0; |
1793 | 2033 | ||
1794 | addr = page_address(page); | 2034 | if (!check_valid_pointer(s, page, object)) |
1795 | if (object < addr || object >= addr + s->objects * s->size) | ||
1796 | /* Out of bounds */ | ||
1797 | return 0; | ||
1798 | |||
1799 | if ((object - addr) % s->size) | ||
1800 | /* Improperly aligned */ | ||
1801 | return 0; | 2035 | return 0; |
1802 | 2036 | ||
1803 | /* | 2037 | /* |
@@ -1826,7 +2060,8 @@ const char *kmem_cache_name(struct kmem_cache *s) | |||
1826 | EXPORT_SYMBOL(kmem_cache_name); | 2060 | EXPORT_SYMBOL(kmem_cache_name); |
1827 | 2061 | ||
1828 | /* | 2062 | /* |
1829 | * Attempt to free all slabs on a node | 2063 | * Attempt to free all slabs on a node. Return the number of slabs we |
2064 | * were unable to free. | ||
1830 | */ | 2065 | */ |
1831 | static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, | 2066 | static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, |
1832 | struct list_head *list) | 2067 | struct list_head *list) |
@@ -1847,7 +2082,7 @@ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, | |||
1847 | } | 2082 | } |
1848 | 2083 | ||
1849 | /* | 2084 | /* |
1850 | * Release all resources used by slab cache | 2085 | * Release all resources used by a slab cache. |
1851 | */ | 2086 | */ |
1852 | static int kmem_cache_close(struct kmem_cache *s) | 2087 | static int kmem_cache_close(struct kmem_cache *s) |
1853 | { | 2088 | { |
@@ -1932,45 +2167,6 @@ static int __init setup_slub_nomerge(char *str) | |||
1932 | 2167 | ||
1933 | __setup("slub_nomerge", setup_slub_nomerge); | 2168 | __setup("slub_nomerge", setup_slub_nomerge); |
1934 | 2169 | ||
1935 | static int __init setup_slub_debug(char *str) | ||
1936 | { | ||
1937 | if (!str || *str != '=') | ||
1938 | slub_debug = DEBUG_DEFAULT_FLAGS; | ||
1939 | else { | ||
1940 | str++; | ||
1941 | if (*str == 0 || *str == ',') | ||
1942 | slub_debug = DEBUG_DEFAULT_FLAGS; | ||
1943 | else | ||
1944 | for( ;*str && *str != ','; str++) | ||
1945 | switch (*str) { | ||
1946 | case 'f' : case 'F' : | ||
1947 | slub_debug |= SLAB_DEBUG_FREE; | ||
1948 | break; | ||
1949 | case 'z' : case 'Z' : | ||
1950 | slub_debug |= SLAB_RED_ZONE; | ||
1951 | break; | ||
1952 | case 'p' : case 'P' : | ||
1953 | slub_debug |= SLAB_POISON; | ||
1954 | break; | ||
1955 | case 'u' : case 'U' : | ||
1956 | slub_debug |= SLAB_STORE_USER; | ||
1957 | break; | ||
1958 | case 't' : case 'T' : | ||
1959 | slub_debug |= SLAB_TRACE; | ||
1960 | break; | ||
1961 | default: | ||
1962 | printk(KERN_ERR "slub_debug option '%c' " | ||
1963 | "unknown. skipped\n",*str); | ||
1964 | } | ||
1965 | } | ||
1966 | |||
1967 | if (*str == ',') | ||
1968 | slub_debug_slabs = str + 1; | ||
1969 | return 1; | ||
1970 | } | ||
1971 | |||
1972 | __setup("slub_debug", setup_slub_debug); | ||
1973 | |||
1974 | static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, | 2170 | static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, |
1975 | const char *name, int size, gfp_t gfp_flags) | 2171 | const char *name, int size, gfp_t gfp_flags) |
1976 | { | 2172 | { |
@@ -1981,7 +2177,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, | |||
1981 | 2177 | ||
1982 | down_write(&slub_lock); | 2178 | down_write(&slub_lock); |
1983 | if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, | 2179 | if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, |
1984 | flags, NULL, NULL)) | 2180 | flags, NULL)) |
1985 | goto panic; | 2181 | goto panic; |
1986 | 2182 | ||
1987 | list_add(&s->list, &slab_caches); | 2183 | list_add(&s->list, &slab_caches); |
@@ -2108,13 +2304,14 @@ void kfree(const void *x) | |||
2108 | EXPORT_SYMBOL(kfree); | 2304 | EXPORT_SYMBOL(kfree); |
2109 | 2305 | ||
2110 | /* | 2306 | /* |
2111 | * kmem_cache_shrink removes empty slabs from the partial lists | 2307 | * kmem_cache_shrink removes empty slabs from the partial lists and sorts |
2112 | * and then sorts the partially allocated slabs by the number | 2308 | * the remaining slabs by the number of items in use. The slabs with the |
2113 | * of items in use. The slabs with the most items in use | 2309 | * most items in use come first. New allocations will then fill those up |
2114 | * come first. New allocations will remove these from the | 2310 | * and thus they can be removed from the partial lists. |
2115 | * partial list because they are full. The slabs with the | 2311 | * |
2116 | * least items are placed last. If it happens that the objects | 2312 | * The slabs with the least items are placed last. This results in them |
2117 | * are freed then the page can be returned to the page allocator. | 2313 | * being allocated from last increasing the chance that the last objects |
2314 | * are freed in them. | ||
2118 | */ | 2315 | */ |
2119 | int kmem_cache_shrink(struct kmem_cache *s) | 2316 | int kmem_cache_shrink(struct kmem_cache *s) |
2120 | { | 2317 | { |
@@ -2143,12 +2340,10 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
2143 | spin_lock_irqsave(&n->list_lock, flags); | 2340 | spin_lock_irqsave(&n->list_lock, flags); |
2144 | 2341 | ||
2145 | /* | 2342 | /* |
2146 | * Build lists indexed by the items in use in | 2343 | * Build lists indexed by the items in use in each slab. |
2147 | * each slab or free slabs if empty. | ||
2148 | * | 2344 | * |
2149 | * Note that concurrent frees may occur while | 2345 | * Note that concurrent frees may occur while we hold the |
2150 | * we hold the list_lock. page->inuse here is | 2346 | * list_lock. page->inuse here is the upper limit. |
2151 | * the upper limit. | ||
2152 | */ | 2347 | */ |
2153 | list_for_each_entry_safe(page, t, &n->partial, lru) { | 2348 | list_for_each_entry_safe(page, t, &n->partial, lru) { |
2154 | if (!page->inuse && slab_trylock(page)) { | 2349 | if (!page->inuse && slab_trylock(page)) { |
@@ -2172,8 +2367,8 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
2172 | goto out; | 2367 | goto out; |
2173 | 2368 | ||
2174 | /* | 2369 | /* |
2175 | * Rebuild the partial list with the slabs filled up | 2370 | * Rebuild the partial list with the slabs filled up most |
2176 | * most first and the least used slabs at the end. | 2371 | * first and the least used slabs at the end. |
2177 | */ | 2372 | */ |
2178 | for (i = s->objects - 1; i >= 0; i--) | 2373 | for (i = s->objects - 1; i >= 0; i--) |
2179 | list_splice(slabs_by_inuse + i, n->partial.prev); | 2374 | list_splice(slabs_by_inuse + i, n->partial.prev); |
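
The loop above is effectively a counting sort: each partial slab is dropped into a bucket indexed by its inuse count, empty slabs are freed, and the buckets are spliced back so the fullest slabs end up first. A user-space sketch of that pass, using arrays instead of the kernel's list_head machinery and an invented toy_slab type:

/* Illustrative counting sort of partial slabs by inuse count. */
#include <stdio.h>

#define OBJECTS 4        /* objects per slab in this example */
#define NSLABS  5

struct toy_slab { int inuse; };

int main(void)
{
	struct toy_slab slabs[NSLABS] = { {1}, {3}, {0}, {2}, {3} };
	struct toy_slab *bucket[OBJECTS][NSLABS];
	int bucket_len[OBJECTS] = { 0 };

	for (int i = 0; i < NSLABS; i++) {
		int inuse = slabs[i].inuse;

		if (inuse == 0)
			continue;               /* empty: would be discarded */
		bucket[inuse][bucket_len[inuse]++] = &slabs[i];
	}

	/* Rebuild: most-used buckets first, least-used last. */
	printf("new partial list order:");
	for (int inuse = OBJECTS - 1; inuse >= 1; inuse--)
		for (int j = 0; j < bucket_len[inuse]; j++)
			printf(" inuse=%d", bucket[inuse][j]->inuse);
	printf("\n");
	return 0;
}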
@@ -2189,7 +2384,6 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
2189 | 2384 | ||
2190 | /** | 2385 | /** |
2191 | * krealloc - reallocate memory. The contents will remain unchanged. | 2386 | * krealloc - reallocate memory. The contents will remain unchanged. |
2192 | * | ||
2193 | * @p: object to reallocate memory for. | 2387 | * @p: object to reallocate memory for. |
2194 | * @new_size: how many bytes of memory are required. | 2388 | * @new_size: how many bytes of memory are required. |
2195 | * @flags: the type of memory to allocate. | 2389 | * @flags: the type of memory to allocate. |
@@ -2201,9 +2395,8 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
2201 | */ | 2395 | */ |
2202 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | 2396 | void *krealloc(const void *p, size_t new_size, gfp_t flags) |
2203 | { | 2397 | { |
2204 | struct kmem_cache *new_cache; | ||
2205 | void *ret; | 2398 | void *ret; |
2206 | struct page *page; | 2399 | size_t ks; |
2207 | 2400 | ||
2208 | if (unlikely(!p)) | 2401 | if (unlikely(!p)) |
2209 | return kmalloc(new_size, flags); | 2402 | return kmalloc(new_size, flags); |
@@ -2213,19 +2406,13 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) | |||
2213 | return NULL; | 2406 | return NULL; |
2214 | } | 2407 | } |
2215 | 2408 | ||
2216 | page = virt_to_head_page(p); | 2409 | ks = ksize(p); |
2217 | 2410 | if (ks >= new_size) | |
2218 | new_cache = get_slab(new_size, flags); | ||
2219 | |||
2220 | /* | ||
2221 | * If new size fits in the current cache, bail out. | ||
2222 | */ | ||
2223 | if (likely(page->slab == new_cache)) | ||
2224 | return (void *)p; | 2411 | return (void *)p; |
2225 | 2412 | ||
2226 | ret = kmalloc(new_size, flags); | 2413 | ret = kmalloc(new_size, flags); |
2227 | if (ret) { | 2414 | if (ret) { |
2228 | memcpy(ret, p, min(new_size, ksize(p))); | 2415 | memcpy(ret, p, min(new_size, ks)); |
2229 | kfree(p); | 2416 | kfree(p); |
2230 | } | 2417 | } |
2231 | return ret; | 2418 | return ret; |
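
The reworked krealloc() above no longer compares kmalloc caches; it asks ksize() for the usable size of the existing allocation and only copies when the object genuinely has to grow. Here is a user-space sketch of the same behaviour, modelling ksize() with a size header and rounding sizes up to mimic kmalloc size classes; all toy_ names are invented.

/* Illustrative krealloc analogue built on a size-header allocator. */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct toy_hdr { size_t usable; };        /* stands in for slab metadata */

static void *toy_kmalloc(size_t size)
{
	size_t usable = (size + 31) & ~(size_t)31;  /* fake size classes */
	struct toy_hdr *h = malloc(sizeof(*h) + usable);

	if (!h)
		return NULL;
	h->usable = usable;
	return h + 1;
}

static size_t toy_ksize(const void *p)
{
	return ((const struct toy_hdr *)p - 1)->usable;
}

static void toy_kfree(void *p)
{
	if (p)
		free((struct toy_hdr *)p - 1);
}

static void *toy_krealloc(void *p, size_t new_size)
{
	void *ret;

	if (!p)
		return toy_kmalloc(new_size);
	if (toy_ksize(p) >= new_size)
		return p;                     /* still fits: no copy needed */
	ret = toy_kmalloc(new_size);
	if (ret) {
		memcpy(ret, p, toy_ksize(p));
		toy_kfree(p);
	}
	return ret;
}

int main(void)
{
	char *p = toy_kmalloc(20);            /* usable size rounds to 32 */

	printf("grow to 30: same block? %d\n", p == toy_krealloc(p, 30));
	return 0;
}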
@@ -2243,11 +2430,12 @@ void __init kmem_cache_init(void) | |||
2243 | #ifdef CONFIG_NUMA | 2430 | #ifdef CONFIG_NUMA |
2244 | /* | 2431 | /* |
2245 | * Must first have the slab cache available for the allocations of the | 2432 | * Must first have the slab cache available for the allocations of the |
2246 | * struct kmalloc_cache_node's. There is special bootstrap code in | 2433 | * struct kmem_cache_node's. There is special bootstrap code in |
2247 | * kmem_cache_open for slab_state == DOWN. | 2434 | * kmem_cache_open for slab_state == DOWN. |
2248 | */ | 2435 | */ |
2249 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", | 2436 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", |
2250 | sizeof(struct kmem_cache_node), GFP_KERNEL); | 2437 | sizeof(struct kmem_cache_node), GFP_KERNEL); |
2438 | kmalloc_caches[0].refcount = -1; | ||
2251 | #endif | 2439 | #endif |
2252 | 2440 | ||
2253 | /* Able to allocate the per node structures */ | 2441 | /* Able to allocate the per node structures */ |
@@ -2274,13 +2462,12 @@ void __init kmem_cache_init(void) | |||
2274 | register_cpu_notifier(&slab_notifier); | 2462 | register_cpu_notifier(&slab_notifier); |
2275 | #endif | 2463 | #endif |
2276 | 2464 | ||
2277 | if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */ | 2465 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + |
2278 | kmem_size = offsetof(struct kmem_cache, cpu_slab) | 2466 | nr_cpu_ids * sizeof(struct page *); |
2279 | + nr_cpu_ids * sizeof(struct page *); | ||
2280 | 2467 | ||
2281 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," | 2468 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," |
2282 | " Processors=%d, Nodes=%d\n", | 2469 | " Processors=%d, Nodes=%d\n", |
2283 | KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES, | 2470 | KMALLOC_SHIFT_HIGH, cache_line_size(), |
2284 | slub_min_order, slub_max_order, slub_min_objects, | 2471 | slub_min_order, slub_max_order, slub_min_objects, |
2285 | nr_cpu_ids, nr_node_ids); | 2472 | nr_cpu_ids, nr_node_ids); |
2286 | } | 2473 | } |
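
The kmem_size computation above trims struct kmem_cache so that only nr_cpu_ids entries of the trailing per-cpu slab array are actually allocated, instead of the full array sized for the maximum possible number of cpus. The same offsetof() trick in a small user-space example, with an invented toy_cache type standing in for struct kmem_cache:

/* Illustrative: size a struct up to the used portion of a trailing array. */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct toy_cache {
	const char *name;
	int objsize;
	void *cpu_slab[64];     /* declared for the maximum number of cpus */
};

int main(void)
{
	int nr_cpu_ids = 4;     /* pretend only 4 cpus are possible */
	size_t kmem_size = offsetof(struct toy_cache, cpu_slab) +
			   nr_cpu_ids * sizeof(void *);
	struct toy_cache *s = calloc(1, kmem_size);

	printf("full struct: %zu bytes, trimmed: %zu bytes\n",
	       sizeof(struct toy_cache), kmem_size);
	free(s);
	return 0;
}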
@@ -2293,7 +2480,13 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
2293 | if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) | 2480 | if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) |
2294 | return 1; | 2481 | return 1; |
2295 | 2482 | ||
2296 | if (s->ctor || s->dtor) | 2483 | if (s->ctor) |
2484 | return 1; | ||
2485 | |||
2486 | /* | ||
2487 | * We may have set a slab to be unmergeable during bootstrap. | ||
2488 | */ | ||
2489 | if (s->refcount < 0) | ||
2297 | return 1; | 2490 | return 1; |
2298 | 2491 | ||
2299 | return 0; | 2492 | return 0; |
@@ -2301,15 +2494,14 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
2301 | 2494 | ||
2302 | static struct kmem_cache *find_mergeable(size_t size, | 2495 | static struct kmem_cache *find_mergeable(size_t size, |
2303 | size_t align, unsigned long flags, | 2496 | size_t align, unsigned long flags, |
2304 | void (*ctor)(void *, struct kmem_cache *, unsigned long), | 2497 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) |
2305 | void (*dtor)(void *, struct kmem_cache *, unsigned long)) | ||
2306 | { | 2498 | { |
2307 | struct list_head *h; | 2499 | struct list_head *h; |
2308 | 2500 | ||
2309 | if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) | 2501 | if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) |
2310 | return NULL; | 2502 | return NULL; |
2311 | 2503 | ||
2312 | if (ctor || dtor) | 2504 | if (ctor) |
2313 | return NULL; | 2505 | return NULL; |
2314 | 2506 | ||
2315 | size = ALIGN(size, sizeof(void *)); | 2507 | size = ALIGN(size, sizeof(void *)); |
@@ -2351,8 +2543,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
2351 | { | 2543 | { |
2352 | struct kmem_cache *s; | 2544 | struct kmem_cache *s; |
2353 | 2545 | ||
2546 | BUG_ON(dtor); | ||
2354 | down_write(&slub_lock); | 2547 | down_write(&slub_lock); |
2355 | s = find_mergeable(size, align, flags, dtor, ctor); | 2548 | s = find_mergeable(size, align, flags, ctor); |
2356 | if (s) { | 2549 | if (s) { |
2357 | s->refcount++; | 2550 | s->refcount++; |
2358 | /* | 2551 | /* |
@@ -2366,7 +2559,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
2366 | } else { | 2559 | } else { |
2367 | s = kmalloc(kmem_size, GFP_KERNEL); | 2560 | s = kmalloc(kmem_size, GFP_KERNEL); |
2368 | if (s && kmem_cache_open(s, GFP_KERNEL, name, | 2561 | if (s && kmem_cache_open(s, GFP_KERNEL, name, |
2369 | size, align, flags, ctor, dtor)) { | 2562 | size, align, flags, ctor)) { |
2370 | if (sysfs_slab_add(s)) { | 2563 | if (sysfs_slab_add(s)) { |
2371 | kfree(s); | 2564 | kfree(s); |
2372 | goto err; | 2565 | goto err; |
@@ -2415,8 +2608,21 @@ static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu) | |||
2415 | } | 2608 | } |
2416 | 2609 | ||
2417 | /* | 2610 | /* |
2418 | * Use the cpu notifier to insure that the slab are flushed | 2611 | * Version of __flush_cpu_slab for the case that interrupts |
2419 | * when necessary. | 2612 | * are enabled. |
2613 | */ | ||
2614 | static void cpu_slab_flush(struct kmem_cache *s, int cpu) | ||
2615 | { | ||
2616 | unsigned long flags; | ||
2617 | |||
2618 | local_irq_save(flags); | ||
2619 | __flush_cpu_slab(s, cpu); | ||
2620 | local_irq_restore(flags); | ||
2621 | } | ||
2622 | |||
2623 | /* | ||
2624 | * Use the cpu notifier to ensure that the cpu slabs are flushed when | ||
2625 | * necessary. | ||
2420 | */ | 2626 | */ |
2421 | static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | 2627 | static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, |
2422 | unsigned long action, void *hcpu) | 2628 | unsigned long action, void *hcpu) |
@@ -2425,8 +2631,10 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
2425 | 2631 | ||
2426 | switch (action) { | 2632 | switch (action) { |
2427 | case CPU_UP_CANCELED: | 2633 | case CPU_UP_CANCELED: |
2634 | case CPU_UP_CANCELED_FROZEN: | ||
2428 | case CPU_DEAD: | 2635 | case CPU_DEAD: |
2429 | for_all_slabs(__flush_cpu_slab, cpu); | 2636 | case CPU_DEAD_FROZEN: |
2637 | for_all_slabs(cpu_slab_flush, cpu); | ||
2430 | break; | 2638 | break; |
2431 | default: | 2639 | default: |
2432 | break; | 2640 | break; |
@@ -2439,153 +2647,6 @@ static struct notifier_block __cpuinitdata slab_notifier = | |||
2439 | 2647 | ||
2440 | #endif | 2648 | #endif |
2441 | 2649 | ||
2442 | #ifdef CONFIG_NUMA | ||
2443 | |||
2444 | /***************************************************************** | ||
2445 | * Generic reaper used to support the page allocator | ||
2446 | * (the cpu slabs are reaped by a per slab workqueue). | ||
2447 | * | ||
2448 | * Maybe move this to the page allocator? | ||
2449 | ****************************************************************/ | ||
2450 | |||
2451 | static DEFINE_PER_CPU(unsigned long, reap_node); | ||
2452 | |||
2453 | static void init_reap_node(int cpu) | ||
2454 | { | ||
2455 | int node; | ||
2456 | |||
2457 | node = next_node(cpu_to_node(cpu), node_online_map); | ||
2458 | if (node == MAX_NUMNODES) | ||
2459 | node = first_node(node_online_map); | ||
2460 | |||
2461 | __get_cpu_var(reap_node) = node; | ||
2462 | } | ||
2463 | |||
2464 | static void next_reap_node(void) | ||
2465 | { | ||
2466 | int node = __get_cpu_var(reap_node); | ||
2467 | |||
2468 | /* | ||
2469 | * Also drain per cpu pages on remote zones | ||
2470 | */ | ||
2471 | if (node != numa_node_id()) | ||
2472 | drain_node_pages(node); | ||
2473 | |||
2474 | node = next_node(node, node_online_map); | ||
2475 | if (unlikely(node >= MAX_NUMNODES)) | ||
2476 | node = first_node(node_online_map); | ||
2477 | __get_cpu_var(reap_node) = node; | ||
2478 | } | ||
2479 | #else | ||
2480 | #define init_reap_node(cpu) do { } while (0) | ||
2481 | #define next_reap_node(void) do { } while (0) | ||
2482 | #endif | ||
2483 | |||
2484 | #define REAPTIMEOUT_CPUC (2*HZ) | ||
2485 | |||
2486 | #ifdef CONFIG_SMP | ||
2487 | static DEFINE_PER_CPU(struct delayed_work, reap_work); | ||
2488 | |||
2489 | static void cache_reap(struct work_struct *unused) | ||
2490 | { | ||
2491 | next_reap_node(); | ||
2492 | refresh_cpu_vm_stats(smp_processor_id()); | ||
2493 | schedule_delayed_work(&__get_cpu_var(reap_work), | ||
2494 | REAPTIMEOUT_CPUC); | ||
2495 | } | ||
2496 | |||
2497 | static void __devinit start_cpu_timer(int cpu) | ||
2498 | { | ||
2499 | struct delayed_work *reap_work = &per_cpu(reap_work, cpu); | ||
2500 | |||
2501 | /* | ||
2502 | * When this gets called from do_initcalls via cpucache_init(), | ||
2503 | * init_workqueues() has already run, so keventd will be setup | ||
2504 | * at that time. | ||
2505 | */ | ||
2506 | if (keventd_up() && reap_work->work.func == NULL) { | ||
2507 | init_reap_node(cpu); | ||
2508 | INIT_DELAYED_WORK(reap_work, cache_reap); | ||
2509 | schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); | ||
2510 | } | ||
2511 | } | ||
2512 | |||
2513 | static int __init cpucache_init(void) | ||
2514 | { | ||
2515 | int cpu; | ||
2516 | |||
2517 | /* | ||
2518 | * Register the timers that drain pcp pages and update vm statistics | ||
2519 | */ | ||
2520 | for_each_online_cpu(cpu) | ||
2521 | start_cpu_timer(cpu); | ||
2522 | return 0; | ||
2523 | } | ||
2524 | __initcall(cpucache_init); | ||
2525 | #endif | ||
2526 | |||
2527 | #ifdef SLUB_RESILIENCY_TEST | ||
2528 | static unsigned long validate_slab_cache(struct kmem_cache *s); | ||
2529 | |||
2530 | static void resiliency_test(void) | ||
2531 | { | ||
2532 | u8 *p; | ||
2533 | |||
2534 | printk(KERN_ERR "SLUB resiliency testing\n"); | ||
2535 | printk(KERN_ERR "-----------------------\n"); | ||
2536 | printk(KERN_ERR "A. Corruption after allocation\n"); | ||
2537 | |||
2538 | p = kzalloc(16, GFP_KERNEL); | ||
2539 | p[16] = 0x12; | ||
2540 | printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" | ||
2541 | " 0x12->0x%p\n\n", p + 16); | ||
2542 | |||
2543 | validate_slab_cache(kmalloc_caches + 4); | ||
2544 | |||
2545 | /* Hmmm... The next two are dangerous */ | ||
2546 | p = kzalloc(32, GFP_KERNEL); | ||
2547 | p[32 + sizeof(void *)] = 0x34; | ||
2548 | printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" | ||
2549 | " 0x34 -> -0x%p\n", p); | ||
2550 | printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); | ||
2551 | |||
2552 | validate_slab_cache(kmalloc_caches + 5); | ||
2553 | p = kzalloc(64, GFP_KERNEL); | ||
2554 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); | ||
2555 | *p = 0x56; | ||
2556 | printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", | ||
2557 | p); | ||
2558 | printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); | ||
2559 | validate_slab_cache(kmalloc_caches + 6); | ||
2560 | |||
2561 | printk(KERN_ERR "\nB. Corruption after free\n"); | ||
2562 | p = kzalloc(128, GFP_KERNEL); | ||
2563 | kfree(p); | ||
2564 | *p = 0x78; | ||
2565 | printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); | ||
2566 | validate_slab_cache(kmalloc_caches + 7); | ||
2567 | |||
2568 | p = kzalloc(256, GFP_KERNEL); | ||
2569 | kfree(p); | ||
2570 | p[50] = 0x9a; | ||
2571 | printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); | ||
2572 | validate_slab_cache(kmalloc_caches + 8); | ||
2573 | |||
2574 | p = kzalloc(512, GFP_KERNEL); | ||
2575 | kfree(p); | ||
2576 | p[512] = 0xab; | ||
2577 | printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); | ||
2578 | validate_slab_cache(kmalloc_caches + 9); | ||
2579 | } | ||
2580 | #else | ||
2581 | static void resiliency_test(void) {}; | ||
2582 | #endif | ||
2583 | |||
2584 | /* | ||
2585 | * These are not as efficient as kmalloc for the non debug case. | ||
2586 | * We do not have the page struct available so we have to touch one | ||
2587 | * cacheline in struct kmem_cache to check slab flags. | ||
2588 | */ | ||
2589 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) | 2650 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) |
2590 | { | 2651 | { |
2591 | struct kmem_cache *s = get_slab(size, gfpflags); | 2652 | struct kmem_cache *s = get_slab(size, gfpflags); |
@@ -2607,13 +2668,12 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | |||
2607 | return slab_alloc(s, gfpflags, node, caller); | 2668 | return slab_alloc(s, gfpflags, node, caller); |
2608 | } | 2669 | } |
2609 | 2670 | ||
2610 | #ifdef CONFIG_SYSFS | 2671 | #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) |
2611 | |||
2612 | static int validate_slab(struct kmem_cache *s, struct page *page) | 2672 | static int validate_slab(struct kmem_cache *s, struct page *page) |
2613 | { | 2673 | { |
2614 | void *p; | 2674 | void *p; |
2615 | void *addr = page_address(page); | 2675 | void *addr = page_address(page); |
2616 | unsigned long map[BITS_TO_LONGS(s->objects)]; | 2676 | DECLARE_BITMAP(map, s->objects); |
2617 | 2677 | ||
2618 | if (!check_slab(s, page) || | 2678 | if (!check_slab(s, page) || |
2619 | !on_freelist(s, page, NULL)) | 2679 | !on_freelist(s, page, NULL)) |
@@ -2622,14 +2682,14 @@ static int validate_slab(struct kmem_cache *s, struct page *page) | |||
2622 | /* Now we know that a valid freelist exists */ | 2682 | /* Now we know that a valid freelist exists */ |
2623 | bitmap_zero(map, s->objects); | 2683 | bitmap_zero(map, s->objects); |
2624 | 2684 | ||
2625 | for(p = page->freelist; p; p = get_freepointer(s, p)) { | 2685 | for_each_free_object(p, s, page->freelist) { |
2626 | set_bit((p - addr) / s->size, map); | 2686 | set_bit(slab_index(p, s, addr), map); |
2627 | if (!check_object(s, page, p, 0)) | 2687 | if (!check_object(s, page, p, 0)) |
2628 | return 0; | 2688 | return 0; |
2629 | } | 2689 | } |
2630 | 2690 | ||
2631 | for(p = addr; p < addr + s->objects * s->size; p += s->size) | 2691 | for_each_object(p, s, addr) |
2632 | if (!test_bit((p - addr) / s->size, map)) | 2692 | if (!test_bit(slab_index(p, s, addr), map)) |
2633 | if (!check_object(s, page, p, 1)) | 2693 | if (!check_object(s, page, p, 1)) |
2634 | return 0; | 2694 | return 0; |
2635 | return 1; | 2695 | return 1; |
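
validate_slab() above builds a bitmap of every object reachable from the freelist and then checks the marked objects as free and the unmarked ones as allocated. A compact model of that pass, using indices and a plain bool array instead of DECLARE_BITMAP() and pointer arithmetic:

/* Illustrative: mark free objects, then classify the rest as allocated. */
#include <stdio.h>
#include <stdbool.h>

#define OBJECTS 8

int main(void)
{
	/* next_free[i] = index of the next free object, -1 terminates. */
	int next_free[OBJECTS] = { 3, -1, -1, 5, -1, -1, -1, -1 };
	int freelist = 0;                 /* objects 0 -> 3 -> 5 are free */
	bool is_free[OBJECTS] = { false };

	for (int i = freelist; i != -1; i = next_free[i])
		is_free[i] = true;        /* ~set_bit(slab_index(...), map) */

	for (int i = 0; i < OBJECTS; i++)
		printf("object %d: %s\n", i,
		       is_free[i] ? "free (checked as free)"
				  : "allocated (checked as allocated)");
	return 0;
}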
@@ -2645,12 +2705,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page) | |||
2645 | s->name, page); | 2705 | s->name, page); |
2646 | 2706 | ||
2647 | if (s->flags & DEBUG_DEFAULT_FLAGS) { | 2707 | if (s->flags & DEBUG_DEFAULT_FLAGS) { |
2648 | if (!PageError(page)) | 2708 | if (!SlabDebug(page)) |
2649 | printk(KERN_ERR "SLUB %s: PageError not set " | 2709 | printk(KERN_ERR "SLUB %s: SlabDebug not set " |
2650 | "on slab 0x%p\n", s->name, page); | 2710 | "on slab 0x%p\n", s->name, page); |
2651 | } else { | 2711 | } else { |
2652 | if (PageError(page)) | 2712 | if (SlabDebug(page)) |
2653 | printk(KERN_ERR "SLUB %s: PageError set on " | 2713 | printk(KERN_ERR "SLUB %s: SlabDebug set on " |
2654 | "slab 0x%p\n", s->name, page); | 2714 | "slab 0x%p\n", s->name, page); |
2655 | } | 2715 | } |
2656 | } | 2716 | } |
@@ -2702,14 +2762,76 @@ static unsigned long validate_slab_cache(struct kmem_cache *s) | |||
2702 | return count; | 2762 | return count; |
2703 | } | 2763 | } |
2704 | 2764 | ||
2765 | #ifdef SLUB_RESILIENCY_TEST | ||
2766 | static void resiliency_test(void) | ||
2767 | { | ||
2768 | u8 *p; | ||
2769 | |||
2770 | printk(KERN_ERR "SLUB resiliency testing\n"); | ||
2771 | printk(KERN_ERR "-----------------------\n"); | ||
2772 | printk(KERN_ERR "A. Corruption after allocation\n"); | ||
2773 | |||
2774 | p = kzalloc(16, GFP_KERNEL); | ||
2775 | p[16] = 0x12; | ||
2776 | printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" | ||
2777 | " 0x12->0x%p\n\n", p + 16); | ||
2778 | |||
2779 | validate_slab_cache(kmalloc_caches + 4); | ||
2780 | |||
2781 | /* Hmmm... The next two are dangerous */ | ||
2782 | p = kzalloc(32, GFP_KERNEL); | ||
2783 | p[32 + sizeof(void *)] = 0x34; | ||
2784 | printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" | ||
2785 | " 0x34 -> -0x%p\n", p); | ||
2786 | printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); | ||
2787 | |||
2788 | validate_slab_cache(kmalloc_caches + 5); | ||
2789 | p = kzalloc(64, GFP_KERNEL); | ||
2790 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); | ||
2791 | *p = 0x56; | ||
2792 | printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", | ||
2793 | p); | ||
2794 | printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); | ||
2795 | validate_slab_cache(kmalloc_caches + 6); | ||
2796 | |||
2797 | printk(KERN_ERR "\nB. Corruption after free\n"); | ||
2798 | p = kzalloc(128, GFP_KERNEL); | ||
2799 | kfree(p); | ||
2800 | *p = 0x78; | ||
2801 | printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); | ||
2802 | validate_slab_cache(kmalloc_caches + 7); | ||
2803 | |||
2804 | p = kzalloc(256, GFP_KERNEL); | ||
2805 | kfree(p); | ||
2806 | p[50] = 0x9a; | ||
2807 | printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); | ||
2808 | validate_slab_cache(kmalloc_caches + 8); | ||
2809 | |||
2810 | p = kzalloc(512, GFP_KERNEL); | ||
2811 | kfree(p); | ||
2812 | p[512] = 0xab; | ||
2813 | printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); | ||
2814 | validate_slab_cache(kmalloc_caches + 9); | ||
2815 | } | ||
2816 | #else | ||
2817 | static void resiliency_test(void) {}; | ||
2818 | #endif | ||
2819 | |||
2705 | /* | 2820 | /* |
2706 | * Generate lists of locations where slabcache objects are allocated | 2821 | * Generate lists of code addresses where slabcache objects are allocated |
2707 | * and freed. | 2822 | * and freed. |
2708 | */ | 2823 | */ |
2709 | 2824 | ||
2710 | struct location { | 2825 | struct location { |
2711 | unsigned long count; | 2826 | unsigned long count; |
2712 | void *addr; | 2827 | void *addr; |
2828 | long long sum_time; | ||
2829 | long min_time; | ||
2830 | long max_time; | ||
2831 | long min_pid; | ||
2832 | long max_pid; | ||
2833 | cpumask_t cpus; | ||
2834 | nodemask_t nodes; | ||
2713 | }; | 2835 | }; |
2714 | 2836 | ||
2715 | struct loc_track { | 2837 | struct loc_track { |
@@ -2750,11 +2872,12 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max) | |||
2750 | } | 2872 | } |
2751 | 2873 | ||
2752 | static int add_location(struct loc_track *t, struct kmem_cache *s, | 2874 | static int add_location(struct loc_track *t, struct kmem_cache *s, |
2753 | void *addr) | 2875 | const struct track *track) |
2754 | { | 2876 | { |
2755 | long start, end, pos; | 2877 | long start, end, pos; |
2756 | struct location *l; | 2878 | struct location *l; |
2757 | void *caddr; | 2879 | void *caddr; |
2880 | unsigned long age = jiffies - track->when; | ||
2758 | 2881 | ||
2759 | start = -1; | 2882 | start = -1; |
2760 | end = t->count; | 2883 | end = t->count; |
@@ -2770,19 +2893,36 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, | |||
2770 | break; | 2893 | break; |
2771 | 2894 | ||
2772 | caddr = t->loc[pos].addr; | 2895 | caddr = t->loc[pos].addr; |
2773 | if (addr == caddr) { | 2896 | if (track->addr == caddr) { |
2774 | t->loc[pos].count++; | 2897 | |
2898 | l = &t->loc[pos]; | ||
2899 | l->count++; | ||
2900 | if (track->when) { | ||
2901 | l->sum_time += age; | ||
2902 | if (age < l->min_time) | ||
2903 | l->min_time = age; | ||
2904 | if (age > l->max_time) | ||
2905 | l->max_time = age; | ||
2906 | |||
2907 | if (track->pid < l->min_pid) | ||
2908 | l->min_pid = track->pid; | ||
2909 | if (track->pid > l->max_pid) | ||
2910 | l->max_pid = track->pid; | ||
2911 | |||
2912 | cpu_set(track->cpu, l->cpus); | ||
2913 | } | ||
2914 | node_set(page_to_nid(virt_to_page(track)), l->nodes); | ||
2775 | return 1; | 2915 | return 1; |
2776 | } | 2916 | } |
2777 | 2917 | ||
2778 | if (addr < caddr) | 2918 | if (track->addr < caddr) |
2779 | end = pos; | 2919 | end = pos; |
2780 | else | 2920 | else |
2781 | start = pos; | 2921 | start = pos; |
2782 | } | 2922 | } |
2783 | 2923 | ||
2784 | /* | 2924 | /* |
2785 | * Not found. Insert new tracking element | 2925 | * Not found. Insert new tracking element. |
2786 | */ | 2926 | */ |
2787 | if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) | 2927 | if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) |
2788 | return 0; | 2928 | return 0; |
@@ -2793,7 +2933,16 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, | |||
2793 | (t->count - pos) * sizeof(struct location)); | 2933 | (t->count - pos) * sizeof(struct location)); |
2794 | t->count++; | 2934 | t->count++; |
2795 | l->count = 1; | 2935 | l->count = 1; |
2796 | l->addr = addr; | 2936 | l->addr = track->addr; |
2937 | l->sum_time = age; | ||
2938 | l->min_time = age; | ||
2939 | l->max_time = age; | ||
2940 | l->min_pid = track->pid; | ||
2941 | l->max_pid = track->pid; | ||
2942 | cpus_clear(l->cpus); | ||
2943 | cpu_set(track->cpu, l->cpus); | ||
2944 | nodes_clear(l->nodes); | ||
2945 | node_set(page_to_nid(virt_to_page(track)), l->nodes); | ||
2797 | return 1; | 2946 | return 1; |
2798 | } | 2947 | } |
2799 | 2948 | ||
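add_location() keeps t->loc[] sorted by call-site address: a binary search locates the slot, a hit merely folds the new count and age/pid/cpu data into the existing entry, and a miss grows the array if necessary and splices a new entry in with memmove(). A stripped-down, user-space version of that search-or-insert pattern (statistics omitted, simplified types) looks roughly like this:

#include <stdio.h>
#include <string.h>

struct loc {
        unsigned long count;
        void *addr;
};

struct loc_track {
        long count;                     /* entries in use    */
        long max;                       /* capacity of loc[] */
        struct loc *loc;
};

/* Record one event for 'addr'; returns 0 when the table is full. */
static int add_location(struct loc_track *t, void *addr)
{
        long start = -1, end = t->count, pos;

        for (;;) {
                pos = start + (end - start + 1) / 2;
                if (pos == end)         /* nothing at 'end': insert before it */
                        break;
                if (addr == t->loc[pos].addr) {
                        t->loc[pos].count++;
                        return 1;       /* call site already tracked */
                }
                if (addr < t->loc[pos].addr)
                        end = pos;
                else
                        start = pos;
        }

        if (t->count >= t->max)
                return 0;               /* the kernel grows the array here */

        /* Open a hole at 'pos' and drop the new entry in. */
        memmove(t->loc + pos + 1, t->loc + pos,
                (t->count - pos) * sizeof(struct loc));
        t->count++;
        t->loc[pos].count = 1;
        t->loc[pos].addr = addr;
        return 1;
}

int main(void)
{
        struct loc slots[8] = { { 0, NULL } };
        struct loc_track t = { 0, 8, slots };
        int x, y;

        add_location(&t, &x);
        add_location(&t, &y);
        add_location(&t, &x);           /* second hit on the same address */
        printf("%ld distinct call sites\n", t.count);
        return 0;
}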
@@ -2801,19 +2950,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, | |||
2801 | struct page *page, enum track_item alloc) | 2950 | struct page *page, enum track_item alloc) |
2802 | { | 2951 | { |
2803 | void *addr = page_address(page); | 2952 | void *addr = page_address(page); |
2804 | unsigned long map[BITS_TO_LONGS(s->objects)]; | 2953 | DECLARE_BITMAP(map, s->objects); |
2805 | void *p; | 2954 | void *p; |
2806 | 2955 | ||
2807 | bitmap_zero(map, s->objects); | 2956 | bitmap_zero(map, s->objects); |
2808 | for (p = page->freelist; p; p = get_freepointer(s, p)) | 2957 | for_each_free_object(p, s, page->freelist) |
2809 | set_bit((p - addr) / s->size, map); | 2958 | set_bit(slab_index(p, s, addr), map); |
2810 | |||
2811 | for (p = addr; p < addr + s->objects * s->size; p += s->size) | ||
2812 | if (!test_bit((p - addr) / s->size, map)) { | ||
2813 | void *addr = get_track(s, p, alloc)->addr; | ||
2814 | 2959 | ||
2815 | add_location(t, s, addr); | 2960 | for_each_object(p, s, addr) |
2816 | } | 2961 | if (!test_bit(slab_index(p, s, addr), map)) |
2962 | add_location(t, s, get_track(s, p, alloc)); | ||
2817 | } | 2963 | } |
2818 | 2964 | ||
2819 | static int list_locations(struct kmem_cache *s, char *buf, | 2965 | static int list_locations(struct kmem_cache *s, char *buf, |
@@ -2847,15 +2993,47 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
2847 | } | 2993 | } |
2848 | 2994 | ||
2849 | for (i = 0; i < t.count; i++) { | 2995 | for (i = 0; i < t.count; i++) { |
2850 | void *addr = t.loc[i].addr; | 2996 | struct location *l = &t.loc[i]; |
2851 | 2997 | ||
2852 | if (n > PAGE_SIZE - 100) | 2998 | if (n > PAGE_SIZE - 100) |
2853 | break; | 2999 | break; |
2854 | n += sprintf(buf + n, "%7ld ", t.loc[i].count); | 3000 | n += sprintf(buf + n, "%7ld ", l->count); |
2855 | if (addr) | 3001 | |
2856 | n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr); | 3002 | if (l->addr) |
3003 | n += sprint_symbol(buf + n, (unsigned long)l->addr); | ||
2857 | else | 3004 | else |
2858 | n += sprintf(buf + n, "<not-available>"); | 3005 | n += sprintf(buf + n, "<not-available>"); |
3006 | |||
3007 | if (l->sum_time != l->min_time) { | ||
3008 | unsigned long remainder; | ||
3009 | |||
3010 | n += sprintf(buf + n, " age=%ld/%ld/%ld", | ||
3011 | l->min_time, | ||
3012 | div_long_long_rem(l->sum_time, l->count, &remainder), | ||
3013 | l->max_time); | ||
3014 | } else | ||
3015 | n += sprintf(buf + n, " age=%ld", | ||
3016 | l->min_time); | ||
3017 | |||
3018 | if (l->min_pid != l->max_pid) | ||
3019 | n += sprintf(buf + n, " pid=%ld-%ld", | ||
3020 | l->min_pid, l->max_pid); | ||
3021 | else | ||
3022 | n += sprintf(buf + n, " pid=%ld", | ||
3023 | l->min_pid); | ||
3024 | |||
3025 | if (num_online_cpus() > 1 && !cpus_empty(l->cpus)) { | ||
3026 | n += sprintf(buf + n, " cpus="); | ||
3027 | n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, | ||
3028 | l->cpus); | ||
3029 | } | ||
3030 | |||
3031 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes)) { | ||
3032 | n += sprintf(buf + n, " nodes="); | ||
3033 | n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, | ||
3034 | l->nodes); | ||
3035 | } | ||
3036 | |||
2859 | n += sprintf(buf + n, "\n"); | 3037 | n += sprintf(buf + n, "\n"); |
2860 | } | 3038 | } |
2861 | 3039 | ||
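After this change every line produced by list_locations() carries the hit count, the resolved caller symbol, the object age as min/average/max jiffies, the pid range and, where it adds information, the CPUs and NUMA nodes seen. A purely hypothetical alloc_calls line, with every value invented for illustration, would read:

   4297 do_sys_open+0x5b/0x120 age=8/672/4921 pid=1-1523 cpus=0-3 nodes=0-1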
@@ -3035,17 +3213,6 @@ static ssize_t ctor_show(struct kmem_cache *s, char *buf) | |||
3035 | } | 3213 | } |
3036 | SLAB_ATTR_RO(ctor); | 3214 | SLAB_ATTR_RO(ctor); |
3037 | 3215 | ||
3038 | static ssize_t dtor_show(struct kmem_cache *s, char *buf) | ||
3039 | { | ||
3040 | if (s->dtor) { | ||
3041 | int n = sprint_symbol(buf, (unsigned long)s->dtor); | ||
3042 | |||
3043 | return n + sprintf(buf + n, "\n"); | ||
3044 | } | ||
3045 | return 0; | ||
3046 | } | ||
3047 | SLAB_ATTR_RO(dtor); | ||
3048 | |||
3049 | static ssize_t aliases_show(struct kmem_cache *s, char *buf) | 3216 | static ssize_t aliases_show(struct kmem_cache *s, char *buf) |
3050 | { | 3217 | { |
3051 | return sprintf(buf, "%d\n", s->refcount - 1); | 3218 | return sprintf(buf, "%d\n", s->refcount - 1); |
@@ -3277,7 +3444,6 @@ static struct attribute * slab_attrs[] = { | |||
3277 | &partial_attr.attr, | 3444 | &partial_attr.attr, |
3278 | &cpu_slabs_attr.attr, | 3445 | &cpu_slabs_attr.attr, |
3279 | &ctor_attr.attr, | 3446 | &ctor_attr.attr, |
3280 | &dtor_attr.attr, | ||
3281 | &aliases_attr.attr, | 3447 | &aliases_attr.attr, |
3282 | &align_attr.attr, | 3448 | &align_attr.attr, |
3283 | &sanity_checks_attr.attr, | 3449 | &sanity_checks_attr.attr, |
@@ -3491,6 +3657,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) | |||
3491 | 3657 | ||
3492 | static int __init slab_sysfs_init(void) | 3658 | static int __init slab_sysfs_init(void) |
3493 | { | 3659 | { |
3660 | struct list_head *h; | ||
3494 | int err; | 3661 | int err; |
3495 | 3662 | ||
3496 | err = subsystem_register(&slab_subsys); | 3663 | err = subsystem_register(&slab_subsys); |
@@ -3499,7 +3666,15 @@ static int __init slab_sysfs_init(void) | |||
3499 | return -ENOSYS; | 3666 | return -ENOSYS; |
3500 | } | 3667 | } |
3501 | 3668 | ||
3502 | finish_bootstrap(); | 3669 | slab_state = SYSFS; |
3670 | |||
3671 | list_for_each(h, &slab_caches) { | ||
3672 | struct kmem_cache *s = | ||
3673 | container_of(h, struct kmem_cache, list); | ||
3674 | |||
3675 | err = sysfs_slab_add(s); | ||
3676 | BUG_ON(err); | ||
3677 | } | ||
3503 | 3678 | ||
3504 | while (alias_list) { | 3679 | while (alias_list) { |
3505 | struct saved_alias *al = alias_list; | 3680 | struct saved_alias *al = alias_list; |
@@ -3515,6 +3690,4 @@ static int __init slab_sysfs_init(void) | |||
3515 | } | 3690 | } |
3516 | 3691 | ||
3517 | __initcall(slab_sysfs_init); | 3692 | __initcall(slab_sysfs_init); |
3518 | #else | ||
3519 | __initcall(finish_bootstrap); | ||
3520 | #endif | 3693 | #endif |
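slab_sysfs_init() now registers every cache that was created before sysfs became available by walking the global slab_caches list with list_for_each() and recovering each kmem_cache from its embedded list_head with container_of(). The same pattern as a self-contained user-space sketch, with minimal stand-ins for the kernel's list primitives:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct kmem_cache {                     /* cut-down stand-in */
        const char *name;
        struct list_head list;          /* linked into the global cache list */
};

static void list_add_tail(struct list_head *new, struct list_head *head)
{
        new->prev = head->prev;
        new->next = head;
        head->prev->next = new;
        head->prev = new;
}

static void register_all(struct list_head *caches)
{
        struct list_head *h;

        /* Open-coded list_for_each(): visit every cache on the list. */
        for (h = caches->next; h != caches; h = h->next) {
                struct kmem_cache *s = container_of(h, struct kmem_cache, list);

                printf("registering sysfs entry for %s\n", s->name);
        }
}

int main(void)
{
        struct list_head caches = { &caches, &caches };
        struct kmem_cache a = { "kmalloc-64" }, b = { "kmalloc-128" };

        list_add_tail(&a.list, &caches);
        list_add_tail(&b.list, &caches);
        register_all(&caches);
        return 0;
}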
diff --git a/mm/sparse.c b/mm/sparse.c index 893e5621c247..545e4d3afcdf 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -44,7 +44,7 @@ EXPORT_SYMBOL(page_to_nid); | |||
44 | #endif | 44 | #endif |
45 | 45 | ||
46 | #ifdef CONFIG_SPARSEMEM_EXTREME | 46 | #ifdef CONFIG_SPARSEMEM_EXTREME |
47 | static struct mem_section *sparse_index_alloc(int nid) | 47 | static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) |
48 | { | 48 | { |
49 | struct mem_section *section = NULL; | 49 | struct mem_section *section = NULL; |
50 | unsigned long array_size = SECTIONS_PER_ROOT * | 50 | unsigned long array_size = SECTIONS_PER_ROOT * |
@@ -61,7 +61,7 @@ static struct mem_section *sparse_index_alloc(int nid) | |||
61 | return section; | 61 | return section; |
62 | } | 62 | } |
63 | 63 | ||
64 | static int sparse_index_init(unsigned long section_nr, int nid) | 64 | static int __meminit sparse_index_init(unsigned long section_nr, int nid) |
65 | { | 65 | { |
66 | static DEFINE_SPINLOCK(index_init_lock); | 66 | static DEFINE_SPINLOCK(index_init_lock); |
67 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); | 67 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); |
@@ -138,7 +138,7 @@ static inline int sparse_early_nid(struct mem_section *section) | |||
138 | } | 138 | } |
139 | 139 | ||
140 | /* Record a memory area against a node. */ | 140 | /* Record a memory area against a node. */ |
141 | void memory_present(int nid, unsigned long start, unsigned long end) | 141 | void __init memory_present(int nid, unsigned long start, unsigned long end) |
142 | { | 142 | { |
143 | unsigned long pfn; | 143 | unsigned long pfn; |
144 | 144 | ||
@@ -197,7 +197,7 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn | |||
197 | return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); | 197 | return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); |
198 | } | 198 | } |
199 | 199 | ||
200 | static int sparse_init_one_section(struct mem_section *ms, | 200 | static int __meminit sparse_init_one_section(struct mem_section *ms, |
201 | unsigned long pnum, struct page *mem_map) | 201 | unsigned long pnum, struct page *mem_map) |
202 | { | 202 | { |
203 | if (!valid_section(ms)) | 203 | if (!valid_section(ms)) |
@@ -209,7 +209,13 @@ static int sparse_init_one_section(struct mem_section *ms, | |||
209 | return 1; | 209 | return 1; |
210 | } | 210 | } |
211 | 211 | ||
212 | static struct page *sparse_early_mem_map_alloc(unsigned long pnum) | 212 | __attribute__((weak)) |
213 | void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) | ||
214 | { | ||
215 | return NULL; | ||
216 | } | ||
217 | |||
218 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | ||
213 | { | 219 | { |
214 | struct page *map; | 220 | struct page *map; |
215 | struct mem_section *ms = __nr_to_section(pnum); | 221 | struct mem_section *ms = __nr_to_section(pnum); |
@@ -219,6 +225,11 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum) | |||
219 | if (map) | 225 | if (map) |
220 | return map; | 226 | return map; |
221 | 227 | ||
228 | map = alloc_bootmem_high_node(NODE_DATA(nid), | ||
229 | sizeof(struct page) * PAGES_PER_SECTION); | ||
230 | if (map) | ||
231 | return map; | ||
232 | |||
222 | map = alloc_bootmem_node(NODE_DATA(nid), | 233 | map = alloc_bootmem_node(NODE_DATA(nid), |
223 | sizeof(struct page) * PAGES_PER_SECTION); | 234 | sizeof(struct page) * PAGES_PER_SECTION); |
224 | if (map) | 235 | if (map) |
@@ -288,6 +299,7 @@ void __init sparse_init(void) | |||
288 | } | 299 | } |
289 | } | 300 | } |
290 | 301 | ||
302 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
291 | /* | 303 | /* |
292 | * returns the number of sections whose mem_maps were properly | 304 | * returns the number of sections whose mem_maps were properly |
293 | * set. If this is <=0, then that means that the passed-in | 305 | * set. If this is <=0, then that means that the passed-in |
@@ -327,3 +339,4 @@ out: | |||
327 | __kfree_section_memmap(memmap, nr_pages); | 339 | __kfree_section_memmap(memmap, nr_pages); |
328 | return ret; | 340 | return ret; |
329 | } | 341 | } |
342 | #endif | ||
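sparse_early_mem_map_alloc() now tries allocation sources in order of preference: the optional alloc_bootmem_high_node() (the weak default added above just returns NULL, so architectures that do not implement it fall straight through), then node-local bootmem, then the existing fallbacks. The try-each-source chain in schematic form, with placeholder allocators standing in for the bootmem variants:

#include <stdio.h>
#include <stdlib.h>

/* Placeholders for the bootmem variants; each may return NULL,
 * in which case the next, less preferred source is tried. */
static void *alloc_high_node(size_t size)  { (void)size; return NULL; }
static void *alloc_node_local(size_t size) { return malloc(size); }
static void *alloc_fallback(size_t size)   { return malloc(size); }

static void *alloc_section_map(size_t size)
{
        void *map;

        if ((map = alloc_high_node(size)))
                return map;
        if ((map = alloc_node_local(size)))
                return map;
        return alloc_fallback(size);    /* may still be NULL: caller checks */
}

int main(void)
{
        void *map = alloc_section_map(4096);

        printf("mem_map %sallocated\n", map ? "" : "not ");
        free(map);
        return 0;
}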
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -488,7 +488,7 @@ static int cpu_swap_callback(struct notifier_block *nfb, | |||
488 | long *committed; | 488 | long *committed; |
489 | 489 | ||
490 | committed = &per_cpu(committed_space, (long)hcpu); | 490 | committed = &per_cpu(committed_space, (long)hcpu); |
491 | if (action == CPU_DEAD) { | 491 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
492 | atomic_add(*committed, &vm_committed_space); | 492 | atomic_add(*committed, &vm_committed_space); |
493 | *committed = 0; | 493 | *committed = 0; |
494 | __lru_add_drain((long)hcpu); | 494 | __lru_add_drain((long)hcpu); |
diff --git a/mm/thrash.c b/mm/thrash.c index 9ef9071f99bc..c4c5205a9c35 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
@@ -48,9 +48,8 @@ void grab_swap_token(void) | |||
48 | if (current_interval < current->mm->last_interval) | 48 | if (current_interval < current->mm->last_interval) |
49 | current->mm->token_priority++; | 49 | current->mm->token_priority++; |
50 | else { | 50 | else { |
51 | current->mm->token_priority--; | 51 | if (likely(current->mm->token_priority > 0)) |
52 | if (unlikely(current->mm->token_priority < 0)) | 52 | current->mm->token_priority--; |
53 | current->mm->token_priority = 0; | ||
54 | } | 53 | } |
55 | /* Check if we deserve the token */ | 54 | /* Check if we deserve the token */ |
56 | if (current->mm->token_priority > | 55 | if (current->mm->token_priority > |
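The swap-token change replaces "decrement, then clamp with a < 0 test" by a guarded decrement: on an unsigned counter the old test can never be true, so the priority could wrap around to a huge value instead of stopping at zero. What the new code implements is simply a saturating decrement:

#include <assert.h>

/* Saturating decrement: never goes below zero, never wraps.
 * (The removed pattern "p--; if (p < 0) p = 0;" cannot work when
 * p is unsigned, because p-- on 0 wraps to the maximum value.) */
static void priority_dec(unsigned int *prio)
{
        if (*prio > 0)
                (*prio)--;
}

int main(void)
{
        unsigned int prio = 0;

        priority_dec(&prio);
        assert(prio == 0);              /* stays clamped at zero */
        return 0;
}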
diff --git a/mm/truncate.c b/mm/truncate.c index 0f4b6d18ab0e..4fbe1a2da5fb 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/swap.h> | 12 | #include <linux/swap.h> |
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
15 | #include <linux/highmem.h> | ||
15 | #include <linux/pagevec.h> | 16 | #include <linux/pagevec.h> |
16 | #include <linux/task_io_accounting_ops.h> | 17 | #include <linux/task_io_accounting_ops.h> |
17 | #include <linux/buffer_head.h> /* grr. try_to_release_page, | 18 | #include <linux/buffer_head.h> /* grr. try_to_release_page, |
@@ -46,7 +47,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) | |||
46 | 47 | ||
47 | static inline void truncate_partial_page(struct page *page, unsigned partial) | 48 | static inline void truncate_partial_page(struct page *page, unsigned partial) |
48 | { | 49 | { |
49 | memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); | 50 | zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0); |
50 | if (PagePrivate(page)) | 51 | if (PagePrivate(page)) |
51 | do_invalidatepage(page, partial); | 52 | do_invalidatepage(page, partial); |
52 | } | 53 | } |
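zero_user_page() replaces memclear_highpage_flush(); both zero a byte range of a possibly highmem page through a temporary kernel mapping and keep the data cache coherent so user mappings see the zeroes. In outline the helper is expected to behave like the sketch below (kernel-style pseudocode, not the exact mainline implementation):

/* Zero 'size' bytes at 'offset' in a page that may live in high memory:
 * map it briefly, clear the range, flush, unmap. */
static void zero_user_page_sketch(struct page *page, unsigned int offset,
                                  unsigned int size, enum km_type km)
{
        char *kaddr = kmap_atomic(page, km);    /* short-lived per-CPU mapping */

        memset(kaddr + offset, 0, size);
        flush_dcache_page(page);                /* for virtually indexed caches */
        kunmap_atomic(kaddr, km);
}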
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cb5aabda7046..d3a9c5368257 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -311,7 +311,7 @@ struct vm_struct *remove_vm_area(void *addr) | |||
311 | return v; | 311 | return v; |
312 | } | 312 | } |
313 | 313 | ||
314 | void __vunmap(void *addr, int deallocate_pages) | 314 | static void __vunmap(void *addr, int deallocate_pages) |
315 | { | 315 | { |
316 | struct vm_struct *area; | 316 | struct vm_struct *area; |
317 | 317 | ||
@@ -755,3 +755,10 @@ out_einval_locked: | |||
755 | } | 755 | } |
756 | EXPORT_SYMBOL(remap_vmalloc_range); | 756 | EXPORT_SYMBOL(remap_vmalloc_range); |
757 | 757 | ||
758 | /* | ||
759 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to | ||
760 | * have one. | ||
761 | */ | ||
762 | void __attribute__((weak)) vmalloc_sync_all(void) | ||
763 | { | ||
764 | } | ||
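The weak vmalloc_sync_all() stub gives every architecture a no-op default while still letting an architecture that needs real work (x86's fault path, for instance) provide a strong definition that the linker prefers. Outside the kernel the mechanism looks like the sketch below; sync_mappings() is an invented name and the three parts are separate translation units:

/* --- sync_default.c: the generic weak no-op (what mm/vmalloc.c supplies) -- */
void __attribute__((weak)) sync_mappings(void)
{
        /* nothing to do on architectures that never need syncing */
}

/* --- caller.c: callers need no #ifdef, they simply call the function ------ */
void sync_mappings(void);

int main(void)
{
        sync_mappings();                /* weak no-op, or the override if linked */
        return 0;
}

/* --- arch.c (optional): a strong definition here replaces the weak one
 *     at link time, without any Kconfig or #ifdef plumbing in the callers:
 *
 *     void sync_mappings(void)
 *     {
 *             ...walk and synchronize the kernel page tables...
 *     }
 */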
diff --git a/mm/vmscan.c b/mm/vmscan.c index 56651a10c366..1be5a6376ef0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -284,12 +284,8 @@ static void handle_write_error(struct address_space *mapping, | |||
284 | struct page *page, int error) | 284 | struct page *page, int error) |
285 | { | 285 | { |
286 | lock_page(page); | 286 | lock_page(page); |
287 | if (page_mapping(page) == mapping) { | 287 | if (page_mapping(page) == mapping) |
288 | if (error == -ENOSPC) | 288 | mapping_set_error(mapping, error); |
289 | set_bit(AS_ENOSPC, &mapping->flags); | ||
290 | else | ||
291 | set_bit(AS_EIO, &mapping->flags); | ||
292 | } | ||
293 | unlock_page(page); | 289 | unlock_page(page); |
294 | } | 290 | } |
295 | 291 | ||
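mapping_set_error() replaces the open-coded flag setting above. Reconstructed from the removed lines, its effect is essentially the sketch below; the guard against a zero error code is an assumption about the real helper (handle_write_error() itself is only called with a failing status, so the removed code never needed it):

/* Remember a writeback failure on the mapping so a later fsync()/msync()
 * can report it; AS_ENOSPC and AS_EIO are the existing flag bits. */
static inline void mapping_set_error(struct address_space *mapping, int error)
{
        if (error) {                    /* assumed guard in the real helper */
                if (error == -ENOSPC)
                        set_bit(AS_ENOSPC, &mapping->flags);
                else
                        set_bit(AS_EIO, &mapping->flags);
        }
}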
@@ -1532,7 +1528,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
1532 | pg_data_t *pgdat; | 1528 | pg_data_t *pgdat; |
1533 | cpumask_t mask; | 1529 | cpumask_t mask; |
1534 | 1530 | ||
1535 | if (action == CPU_ONLINE) { | 1531 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { |
1536 | for_each_online_pgdat(pgdat) { | 1532 | for_each_online_pgdat(pgdat) { |
1537 | mask = node_to_cpumask(pgdat->node_id); | 1533 | mask = node_to_cpumask(pgdat->node_id); |
1538 | if (any_online_cpu(mask) != NR_CPUS) | 1534 | if (any_online_cpu(mask) != NR_CPUS) |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 6c488d6ac425..38254297a494 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/sched.h> | ||
15 | 16 | ||
16 | #ifdef CONFIG_VM_EVENT_COUNTERS | 17 | #ifdef CONFIG_VM_EVENT_COUNTERS |
17 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; | 18 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; |
@@ -281,6 +282,17 @@ EXPORT_SYMBOL(dec_zone_page_state); | |||
281 | 282 | ||
282 | /* | 283 | /* |
283 | * Update the zone counters for one cpu. | 284 | * Update the zone counters for one cpu. |
285 | * | ||
286 | * Note that refresh_cpu_vm_stats strives to only access | ||
287 | * node local memory. The per cpu pagesets on remote zones are placed | ||
288 | * in the memory local to the processor using that pageset. So the | ||
289 | * loop over all zones will access a series of cachelines local to | ||
290 | * the processor. | ||
291 | * | ||
292 | * The call to zone_page_state_add updates the cachelines with the | ||
293 | * statistics in the remote zone struct as well as the global cachelines | ||
294 | * with the global counters. These could cause remote node cache line | ||
295 | * bouncing and will have to be only done when necessary. | ||
284 | */ | 296 | */ |
285 | void refresh_cpu_vm_stats(int cpu) | 297 | void refresh_cpu_vm_stats(int cpu) |
286 | { | 298 | { |
@@ -289,21 +301,54 @@ void refresh_cpu_vm_stats(int cpu) | |||
289 | unsigned long flags; | 301 | unsigned long flags; |
290 | 302 | ||
291 | for_each_zone(zone) { | 303 | for_each_zone(zone) { |
292 | struct per_cpu_pageset *pcp; | 304 | struct per_cpu_pageset *p; |
293 | 305 | ||
294 | if (!populated_zone(zone)) | 306 | if (!populated_zone(zone)) |
295 | continue; | 307 | continue; |
296 | 308 | ||
297 | pcp = zone_pcp(zone, cpu); | 309 | p = zone_pcp(zone, cpu); |
298 | 310 | ||
299 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 311 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
300 | if (pcp->vm_stat_diff[i]) { | 312 | if (p->vm_stat_diff[i]) { |
301 | local_irq_save(flags); | 313 | local_irq_save(flags); |
302 | zone_page_state_add(pcp->vm_stat_diff[i], | 314 | zone_page_state_add(p->vm_stat_diff[i], |
303 | zone, i); | 315 | zone, i); |
304 | pcp->vm_stat_diff[i] = 0; | 316 | p->vm_stat_diff[i] = 0; |
317 | #ifdef CONFIG_NUMA | ||
318 | /* 3 seconds idle till flush */ | ||
319 | p->expire = 3; | ||
320 | #endif | ||
305 | local_irq_restore(flags); | 321 | local_irq_restore(flags); |
306 | } | 322 | } |
323 | #ifdef CONFIG_NUMA | ||
324 | /* | ||
325 | * Deal with draining the remote pageset of this | ||
326 | * processor | ||
327 | * | ||
328 | * Check if there are pages remaining in this pageset | ||
329 | * if not then there is nothing to expire. | ||
330 | */ | ||
331 | if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count)) | ||
332 | continue; | ||
333 | |||
334 | /* | ||
335 | * We never drain zones local to this processor. | ||
336 | */ | ||
337 | if (zone_to_nid(zone) == numa_node_id()) { | ||
338 | p->expire = 0; | ||
339 | continue; | ||
340 | } | ||
341 | |||
342 | p->expire--; | ||
343 | if (p->expire) | ||
344 | continue; | ||
345 | |||
346 | if (p->pcp[0].count) | ||
347 | drain_zone_pages(zone, p->pcp + 0); | ||
348 | |||
349 | if (p->pcp[1].count) | ||
350 | drain_zone_pages(zone, p->pcp + 1); | ||
351 | #endif | ||
307 | } | 352 | } |
308 | } | 353 | } |
309 | 354 | ||
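The NUMA branch adds a small idle-expiry scheme for pagesets that cache pages belonging to remote nodes: any local counter activity rearms expire to 3, each pass of refresh_cpu_vm_stats() (one per stat interval, roughly a second) counts it down, and only once it hits zero are the cached pages drained back to the remote zone. The countdown in isolation, with invented names:

#include <stdio.h>

#define IDLE_INTERVALS 3                /* matches "3 seconds idle till flush" */

struct pageset {
        int expire;                     /* intervals left before draining  */
        int count;                      /* pages cached from a remote node */
};

/* Called once per stats interval for a pageset that belongs to a remote node. */
static void maybe_drain(struct pageset *p, int had_activity)
{
        if (had_activity)
                p->expire = IDLE_INTERVALS;     /* rearm on any activity */

        if (!p->expire || !p->count)
                return;                         /* nothing pending */

        if (--p->expire)
                return;                         /* not idle for long enough yet */

        printf("draining %d cached pages\n", p->count);
        p->count = 0;
}

int main(void)
{
        struct pageset p = { .expire = 0, .count = 17 };
        int i;

        maybe_drain(&p, 1);                     /* activity: rearm the timer */
        for (i = 0; i < IDLE_INTERVALS; i++)
                maybe_drain(&p, 0);             /* idle passes until the drain */
        return 0;
}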
@@ -640,6 +685,24 @@ const struct seq_operations vmstat_op = { | |||
640 | #endif /* CONFIG_PROC_FS */ | 685 | #endif /* CONFIG_PROC_FS */ |
641 | 686 | ||
642 | #ifdef CONFIG_SMP | 687 | #ifdef CONFIG_SMP |
688 | static DEFINE_PER_CPU(struct delayed_work, vmstat_work); | ||
689 | int sysctl_stat_interval __read_mostly = HZ; | ||
690 | |||
691 | static void vmstat_update(struct work_struct *w) | ||
692 | { | ||
693 | refresh_cpu_vm_stats(smp_processor_id()); | ||
694 | schedule_delayed_work(&__get_cpu_var(vmstat_work), | ||
695 | sysctl_stat_interval); | ||
696 | } | ||
697 | |||
698 | static void __devinit start_cpu_timer(int cpu) | ||
699 | { | ||
700 | struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); | ||
701 | |||
702 | INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); | ||
703 | schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu); | ||
704 | } | ||
705 | |||
643 | /* | 706 | /* |
644 | * Use the cpu notifier to insure that the thresholds are recalculated | 707 | * Use the cpu notifier to insure that the thresholds are recalculated |
645 | * when necessary. | 708 | * when necessary. |
@@ -648,10 +711,24 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
648 | unsigned long action, | 711 | unsigned long action, |
649 | void *hcpu) | 712 | void *hcpu) |
650 | { | 713 | { |
714 | long cpu = (long)hcpu; | ||
715 | |||
651 | switch (action) { | 716 | switch (action) { |
652 | case CPU_UP_PREPARE: | 717 | case CPU_ONLINE: |
653 | case CPU_UP_CANCELED: | 718 | case CPU_ONLINE_FROZEN: |
719 | start_cpu_timer(cpu); | ||
720 | break; | ||
721 | case CPU_DOWN_PREPARE: | ||
722 | case CPU_DOWN_PREPARE_FROZEN: | ||
723 | cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); | ||
724 | per_cpu(vmstat_work, cpu).work.func = NULL; | ||
725 | break; | ||
726 | case CPU_DOWN_FAILED: | ||
727 | case CPU_DOWN_FAILED_FROZEN: | ||
728 | start_cpu_timer(cpu); | ||
729 | break; | ||
654 | case CPU_DEAD: | 730 | case CPU_DEAD: |
731 | case CPU_DEAD_FROZEN: | ||
655 | refresh_zone_stat_thresholds(); | 732 | refresh_zone_stat_thresholds(); |
656 | break; | 733 | break; |
657 | default: | 734 | default: |
@@ -665,8 +742,13 @@ static struct notifier_block __cpuinitdata vmstat_notifier = | |||
665 | 742 | ||
666 | int __init setup_vmstat(void) | 743 | int __init setup_vmstat(void) |
667 | { | 744 | { |
745 | int cpu; | ||
746 | |||
668 | refresh_zone_stat_thresholds(); | 747 | refresh_zone_stat_thresholds(); |
669 | register_cpu_notifier(&vmstat_notifier); | 748 | register_cpu_notifier(&vmstat_notifier); |
749 | |||
750 | for_each_online_cpu(cpu) | ||
751 | start_cpu_timer(cpu); | ||
670 | return 0; | 752 | return 0; |
671 | } | 753 | } |
672 | module_init(setup_vmstat) | 754 | module_init(setup_vmstat) |