Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c          8
-rw-r--r--  mm/filemap_xip.c      7
-rw-r--r--  mm/hugetlb.c         33
-rw-r--r--  mm/mmap.c             2
-rw-r--r--  mm/page_alloc.c      50
-rw-r--r--  mm/slab.c            41
-rw-r--r--  mm/slub.c          1050
-rw-r--r--  mm/swap.c             2
-rw-r--r--  mm/truncate.c         3
-rw-r--r--  mm/vmscan.c           2
-rw-r--r--  mm/vmstat.c          95
11 files changed, 734 insertions, 559 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 9cbf4fea4a59..7b48b2ad00e7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -750,6 +750,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
750 read_unlock_irq(&mapping->tree_lock); 750 read_unlock_irq(&mapping->tree_lock);
751 return i; 751 return i;
752} 752}
753EXPORT_SYMBOL(find_get_pages_contig);
753 754
754/** 755/**
755 * find_get_pages_tag - find and return pages that match @tag 756 * find_get_pages_tag - find and return pages that match @tag
@@ -778,6 +779,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
778 read_unlock_irq(&mapping->tree_lock); 779 read_unlock_irq(&mapping->tree_lock);
779 return ret; 780 return ret;
780} 781}
782EXPORT_SYMBOL(find_get_pages_tag);
781 783
782/** 784/**
783 * grab_cache_page_nowait - returns locked page at given index in given cache 785 * grab_cache_page_nowait - returns locked page at given index in given cache
@@ -1782,7 +1784,7 @@ struct page *read_cache_page_async(struct address_space *mapping,
1782retry: 1784retry:
1783 page = __read_cache_page(mapping, index, filler, data); 1785 page = __read_cache_page(mapping, index, filler, data);
1784 if (IS_ERR(page)) 1786 if (IS_ERR(page))
1785 goto out; 1787 return page;
1786 mark_page_accessed(page); 1788 mark_page_accessed(page);
1787 if (PageUptodate(page)) 1789 if (PageUptodate(page))
1788 goto out; 1790 goto out;
@@ -1800,9 +1802,9 @@ retry:
1800 err = filler(data, page); 1802 err = filler(data, page);
1801 if (err < 0) { 1803 if (err < 0) {
1802 page_cache_release(page); 1804 page_cache_release(page);
1803 page = ERR_PTR(err); 1805 return ERR_PTR(err);
1804 } 1806 }
1805 out: 1807out:
1806 mark_page_accessed(page); 1808 mark_page_accessed(page);
1807 return page; 1809 return page;
1808} 1810}
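
The read_cache_page_async() hunks above return early as soon as __read_cache_page() or the filler yields an error pointer, so mark_page_accessed() can no longer be reached with an ERR_PTR value as it could before. For reference, a minimal self-contained sketch of the error-pointer idiom this relies on (mirroring the spirit of include/linux/err.h, not its exact text):

    /*
     * Error codes are folded into the topmost page of the address space,
     * so one pointer can carry either a valid struct page * or an -errno.
     */
    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error)     { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

Passing such a pointer to mark_page_accessed() would dereference an address like (void *)-ENOMEM, which is what the early returns avoid.
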
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index cbb335813ec0..1b49dab9b25d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -434,7 +434,6 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
434 unsigned blocksize; 434 unsigned blocksize;
435 unsigned length; 435 unsigned length;
436 struct page *page; 436 struct page *page;
437 void *kaddr;
438 437
439 BUG_ON(!mapping->a_ops->get_xip_page); 438 BUG_ON(!mapping->a_ops->get_xip_page);
440 439
@@ -458,11 +457,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
458 else 457 else
459 return PTR_ERR(page); 458 return PTR_ERR(page);
460 } 459 }
461 kaddr = kmap_atomic(page, KM_USER0); 460 zero_user_page(page, offset, length, KM_USER0);
462 memset(kaddr + offset, 0, length);
463 kunmap_atomic(kaddr, KM_USER0);
464
465 flush_dcache_page(page);
466 return 0; 461 return 0;
467} 462}
468EXPORT_SYMBOL_GPL(xip_truncate_page); 463EXPORT_SYMBOL_GPL(xip_truncate_page);
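
The xip_truncate_page() hunk collapses the open-coded kmap_atomic()/memset()/kunmap_atomic()/flush_dcache_page() sequence into a single zero_user_page() call. Purely as an illustration built from the removed lines (not a quote of the real helper), the replaced code is equivalent to:

    /* What the removed lines did, wrapped up; the helper used above is
     * expected to perform the same steps internally. */
    static inline void zero_user_page_sketch(struct page *page, unsigned offset,
                                             unsigned length, enum km_type km)
    {
            void *kaddr = kmap_atomic(page, km);    /* map the highmem page */
            memset(kaddr + offset, 0, length);      /* zero the byte range */
            kunmap_atomic(kaddr, km);               /* drop the temporary mapping */
            flush_dcache_page(page);                /* keep the data cache coherent */
    }
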
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 36db012b38dd..eb7180db3033 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -140,6 +140,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
140 return page; 140 return page;
141 141
142fail: 142fail:
143 if (vma->vm_flags & VM_MAYSHARE)
144 resv_huge_pages++;
143 spin_unlock(&hugetlb_lock); 145 spin_unlock(&hugetlb_lock);
144 return NULL; 146 return NULL;
145} 147}
@@ -172,6 +174,17 @@ static int __init hugetlb_setup(char *s)
172} 174}
173__setup("hugepages=", hugetlb_setup); 175__setup("hugepages=", hugetlb_setup);
174 176
177static unsigned int cpuset_mems_nr(unsigned int *array)
178{
179 int node;
180 unsigned int nr = 0;
181
182 for_each_node_mask(node, cpuset_current_mems_allowed)
183 nr += array[node];
184
185 return nr;
186}
187
175#ifdef CONFIG_SYSCTL 188#ifdef CONFIG_SYSCTL
176static void update_and_free_page(struct page *page) 189static void update_and_free_page(struct page *page)
177{ 190{
@@ -817,6 +830,26 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
817 chg = region_chg(&inode->i_mapping->private_list, from, to); 830 chg = region_chg(&inode->i_mapping->private_list, from, to);
818 if (chg < 0) 831 if (chg < 0)
819 return chg; 832 return chg;
833 /*
834 * When cpuset is configured, it breaks the strict hugetlb page
835 * reservation as the accounting is done on a global variable. Such
836 * reservation is completely rubbish in the presence of cpuset because
837 * the reservation is not checked against page availability for the
838 * current cpuset. Application can still potentially OOM'ed by kernel
839 * with lack of free htlb page in cpuset that the task is in.
840 * Attempt to enforce strict accounting with cpuset is almost
841 * impossible (or too ugly) because cpuset is too fluid that
842 * task or memory node can be dynamically moved between cpusets.
843 *
844 * The change of semantics for shared hugetlb mapping with cpuset is
845 * undesirable. However, in order to preserve some of the semantics,
846 * we fall back to check against current free page availability as
847 * a best attempt and hopefully to minimize the impact of changing
848 * semantics that cpuset has.
849 */
850 if (chg > cpuset_mems_nr(free_huge_pages_node))
851 return -ENOMEM;
852
820 ret = hugetlb_acct_memory(chg); 853 ret = hugetlb_acct_memory(chg);
821 if (ret < 0) 854 if (ret < 0)
822 return ret; 855 return ret;
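
cpuset_mems_nr() above just sums a per-node counter over the nodes the current cpuset allows, and hugetlb_reserve_pages() uses that sum as a best-effort availability check. A userspace toy of the check, with an invented node mask and invented counts (the kernel's free_huge_pages_node array plays the role of per_node here):

    #include <stdio.h>

    /* Sum a per-node counter over the allowed nodes, like cpuset_mems_nr(). */
    static unsigned int sum_allowed_nodes(const unsigned int *per_node,
                                          const int *allowed, int nr_nodes)
    {
            unsigned int nr = 0;
            for (int node = 0; node < nr_nodes; node++)
                    if (allowed[node])
                            nr += per_node[node];
            return nr;
    }

    int main(void)
    {
            unsigned int free_huge_pages_node[4] = { 4, 8, 2, 0 };  /* invented */
            int allowed[4] = { 1, 0, 1, 0 };    /* cpuset spans nodes 0 and 2 */
            long chg = 10;                      /* huge pages the mapping wants */

            if (chg > sum_allowed_nodes(free_huge_pages_node, allowed, 4))
                    printf("reservation refused: only %u huge pages in this cpuset\n",
                           sum_allowed_nodes(free_huge_pages_node, allowed, 4));
            return 0;
    }

With these numbers a reservation of 10 pages is refused even though 14 huge pages are free system wide, which is exactly the weakened, best-effort semantic the comment in the hunk describes.
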
diff --git a/mm/mmap.c b/mm/mmap.c
index cc1f543eb1b8..68b9ad2ef1d6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1720,7 +1720,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1720 1720
1721/* 1721/*
1722 * Split a vma into two pieces at address 'addr', a new vma is allocated 1722 * Split a vma into two pieces at address 'addr', a new vma is allocated
1723 * either for the first part or the the tail. 1723 * either for the first part or the tail.
1724 */ 1724 */
1725int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 1725int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1726 unsigned long addr, int new_below) 1726 unsigned long addr, int new_below)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6fd0b7455b0b..f9b5d6d5f4d6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -691,43 +691,26 @@ static void __init setup_nr_node_ids(void) {}
691 691
692#ifdef CONFIG_NUMA 692#ifdef CONFIG_NUMA
693/* 693/*
694 * Called from the slab reaper to drain pagesets on a particular node that 694 * Called from the vmstat counter updater to drain pagesets of this
695 * belongs to the currently executing processor. 695 * currently executing processor on remote nodes after they have
696 * expired.
697 *
696 * Note that this function must be called with the thread pinned to 698 * Note that this function must be called with the thread pinned to
697 * a single processor. 699 * a single processor.
698 */ 700 */
699void drain_node_pages(int nodeid) 701void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
700{ 702{
701 int i;
702 enum zone_type z;
703 unsigned long flags; 703 unsigned long flags;
704 int to_drain;
704 705
705 for (z = 0; z < MAX_NR_ZONES; z++) { 706 local_irq_save(flags);
706 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 707 if (pcp->count >= pcp->batch)
707 struct per_cpu_pageset *pset; 708 to_drain = pcp->batch;
708 709 else
709 if (!populated_zone(zone)) 710 to_drain = pcp->count;
710 continue; 711 free_pages_bulk(zone, to_drain, &pcp->list, 0);
711 712 pcp->count -= to_drain;
712 pset = zone_pcp(zone, smp_processor_id()); 713 local_irq_restore(flags);
713 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
714 struct per_cpu_pages *pcp;
715
716 pcp = &pset->pcp[i];
717 if (pcp->count) {
718 int to_drain;
719
720 local_irq_save(flags);
721 if (pcp->count >= pcp->batch)
722 to_drain = pcp->batch;
723 else
724 to_drain = pcp->count;
725 free_pages_bulk(zone, to_drain, &pcp->list, 0);
726 pcp->count -= to_drain;
727 local_irq_restore(flags);
728 }
729 }
730 }
731} 714}
732#endif 715#endif
733 716
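
drain_zone_pages() now frees at most one batch of pages per call while interrupts are disabled, instead of walking every zone of a node as drain_node_pages() did; the amount drained is simply the smaller of the pcp count and the batch size. A trivial standalone sketch of that clamping (values invented):

    #include <stdio.h>

    /* The batching rule from drain_zone_pages(): min(count, batch). */
    static int to_drain(int count, int batch)
    {
            return count >= batch ? batch : count;
    }

    int main(void)
    {
            printf("%d\n", to_drain(97, 31));   /* 31: one full batch is freed */
            printf("%d\n", to_drain(12, 31));   /* 12: drain whatever is there */
            return 0;
    }
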
@@ -2148,11 +2131,14 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
2148 2131
2149 switch (action) { 2132 switch (action) {
2150 case CPU_UP_PREPARE: 2133 case CPU_UP_PREPARE:
2134 case CPU_UP_PREPARE_FROZEN:
2151 if (process_zones(cpu)) 2135 if (process_zones(cpu))
2152 ret = NOTIFY_BAD; 2136 ret = NOTIFY_BAD;
2153 break; 2137 break;
2154 case CPU_UP_CANCELED: 2138 case CPU_UP_CANCELED:
2139 case CPU_UP_CANCELED_FROZEN:
2155 case CPU_DEAD: 2140 case CPU_DEAD:
2141 case CPU_DEAD_FROZEN:
2156 free_zone_pagesets(cpu); 2142 free_zone_pagesets(cpu);
2157 break; 2143 break;
2158 default: 2144 default:
@@ -3012,7 +2998,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
3012{ 2998{
3013 int cpu = (unsigned long)hcpu; 2999 int cpu = (unsigned long)hcpu;
3014 3000
3015 if (action == CPU_DEAD) { 3001 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3016 local_irq_disable(); 3002 local_irq_disable();
3017 __drain_pages(cpu); 3003 __drain_pages(cpu);
3018 vm_events_fold_cpu(cpu); 3004 vm_events_fold_cpu(cpu);
diff --git a/mm/slab.c b/mm/slab.c
index acda7e2d66e4..944b20581f8c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -928,12 +928,6 @@ static void next_reap_node(void)
928{ 928{
929 int node = __get_cpu_var(reap_node); 929 int node = __get_cpu_var(reap_node);
930 930
931 /*
932 * Also drain per cpu pages on remote zones
933 */
934 if (node != numa_node_id())
935 drain_node_pages(node);
936
937 node = next_node(node, node_online_map); 931 node = next_node(node, node_online_map);
938 if (unlikely(node >= MAX_NUMNODES)) 932 if (unlikely(node >= MAX_NUMNODES))
939 node = first_node(node_online_map); 933 node = first_node(node_online_map);
@@ -1186,8 +1180,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1186 int memsize = sizeof(struct kmem_list3); 1180 int memsize = sizeof(struct kmem_list3);
1187 1181
1188 switch (action) { 1182 switch (action) {
1189 case CPU_UP_PREPARE: 1183 case CPU_LOCK_ACQUIRE:
1190 mutex_lock(&cache_chain_mutex); 1184 mutex_lock(&cache_chain_mutex);
1185 break;
1186 case CPU_UP_PREPARE:
1187 case CPU_UP_PREPARE_FROZEN:
1191 /* 1188 /*
1192 * We need to do this right in the beginning since 1189 * We need to do this right in the beginning since
1193 * alloc_arraycache's are going to use this list. 1190 * alloc_arraycache's are going to use this list.
@@ -1274,17 +1271,28 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1274 } 1271 }
1275 break; 1272 break;
1276 case CPU_ONLINE: 1273 case CPU_ONLINE:
1277 mutex_unlock(&cache_chain_mutex); 1274 case CPU_ONLINE_FROZEN:
1278 start_cpu_timer(cpu); 1275 start_cpu_timer(cpu);
1279 break; 1276 break;
1280#ifdef CONFIG_HOTPLUG_CPU 1277#ifdef CONFIG_HOTPLUG_CPU
1281 case CPU_DOWN_PREPARE: 1278 case CPU_DOWN_PREPARE:
1282 mutex_lock(&cache_chain_mutex); 1279 case CPU_DOWN_PREPARE_FROZEN:
1283 break; 1280 /*
1284 case CPU_DOWN_FAILED: 1281 * Shutdown cache reaper. Note that the cache_chain_mutex is
1285 mutex_unlock(&cache_chain_mutex); 1282 * held so that if cache_reap() is invoked it cannot do
1286 break; 1283 * anything expensive but will only modify reap_work
1284 * and reschedule the timer.
1285 */
1286 cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
1287 /* Now the cache_reaper is guaranteed to be not running. */
1288 per_cpu(reap_work, cpu).work.func = NULL;
1289 break;
1290 case CPU_DOWN_FAILED:
1291 case CPU_DOWN_FAILED_FROZEN:
1292 start_cpu_timer(cpu);
1293 break;
1287 case CPU_DEAD: 1294 case CPU_DEAD:
1295 case CPU_DEAD_FROZEN:
1288 /* 1296 /*
1289 * Even if all the cpus of a node are down, we don't free the 1297 * Even if all the cpus of a node are down, we don't free the
1290 * kmem_list3 of any cache. This to avoid a race between 1298 * kmem_list3 of any cache. This to avoid a race between
@@ -1296,6 +1304,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1296 /* fall thru */ 1304 /* fall thru */
1297#endif 1305#endif
1298 case CPU_UP_CANCELED: 1306 case CPU_UP_CANCELED:
1307 case CPU_UP_CANCELED_FROZEN:
1299 list_for_each_entry(cachep, &cache_chain, next) { 1308 list_for_each_entry(cachep, &cache_chain, next) {
1300 struct array_cache *nc; 1309 struct array_cache *nc;
1301 struct array_cache *shared; 1310 struct array_cache *shared;
@@ -1354,6 +1363,8 @@ free_array_cache:
1354 continue; 1363 continue;
1355 drain_freelist(cachep, l3, l3->free_objects); 1364 drain_freelist(cachep, l3, l3->free_objects);
1356 } 1365 }
1366 break;
1367 case CPU_LOCK_RELEASE:
1357 mutex_unlock(&cache_chain_mutex); 1368 mutex_unlock(&cache_chain_mutex);
1358 break; 1369 break;
1359 } 1370 }
@@ -3742,7 +3753,6 @@ EXPORT_SYMBOL(__kmalloc);
3742 3753
3743/** 3754/**
3744 * krealloc - reallocate memory. The contents will remain unchanged. 3755 * krealloc - reallocate memory. The contents will remain unchanged.
3745 *
3746 * @p: object to reallocate memory for. 3756 * @p: object to reallocate memory for.
3747 * @new_size: how many bytes of memory are required. 3757 * @new_size: how many bytes of memory are required.
3748 * @flags: the type of memory to allocate. 3758 * @flags: the type of memory to allocate.
@@ -4140,7 +4150,6 @@ next:
4140 check_irq_on(); 4150 check_irq_on();
4141 mutex_unlock(&cache_chain_mutex); 4151 mutex_unlock(&cache_chain_mutex);
4142 next_reap_node(); 4152 next_reap_node();
4143 refresh_cpu_vm_stats(smp_processor_id());
4144out: 4153out:
4145 /* Set up the next iteration */ 4154 /* Set up the next iteration */
4146 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); 4155 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
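
The notifier changes in both page_alloc.c and slab.c above add *_FROZEN cases; those are the variants of the hotplug events raised during suspend/resume, and they are handled exactly like their normal counterparts here. Roughly, the switch statements now follow this shape (placeholder names, not the kernel's constants or handlers):

    /* Shape of the hotplug callbacks above: each FROZEN (suspend/resume)
     * action shares the handling of the corresponding regular action. */
    enum fake_action { UP_PREPARE, UP_PREPARE_FROZEN, DEAD, DEAD_FROZEN };

    static int fake_cpu_callback(enum fake_action action, int cpu)
    {
            switch (action) {
            case UP_PREPARE:
            case UP_PREPARE_FROZEN:
                    /* allocate per-cpu structures for @cpu */
                    break;
            case DEAD:
            case DEAD_FROZEN:
                    /* tear down per-cpu structures of @cpu */
                    break;
            }
            return 0;
    }
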
diff --git a/mm/slub.c b/mm/slub.c
index 5db3da5a60bf..bd2efae02bcd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -66,11 +66,11 @@
66 * SLUB assigns one slab for allocation to each processor. 66 * SLUB assigns one slab for allocation to each processor.
67 * Allocations only occur from these slabs called cpu slabs. 67 * Allocations only occur from these slabs called cpu slabs.
68 * 68 *
69 * Slabs with free elements are kept on a partial list. 69 * Slabs with free elements are kept on a partial list and during regular
70 * There is no list for full slabs. If an object in a full slab is 70 * operations no list for full slabs is used. If an object in a full slab is
71 * freed then the slab will show up again on the partial lists. 71 * freed then the slab will show up again on the partial lists.
72 * Otherwise there is no need to track full slabs unless we have to 72 * We track full slabs for debugging purposes though because otherwise we
73 * track full slabs for debugging purposes. 73 * cannot scan all objects.
74 * 74 *
75 * Slabs are freed when they become empty. Teardown and setup is 75 * Slabs are freed when they become empty. Teardown and setup is
76 * minimal so we rely on the page allocators per cpu caches for 76 * minimal so we rely on the page allocators per cpu caches for
@@ -87,13 +87,36 @@
87 * the fast path. 87 * the fast path.
88 */ 88 */
89 89
90static inline int SlabDebug(struct page *page)
91{
92#ifdef CONFIG_SLUB_DEBUG
93 return PageError(page);
94#else
95 return 0;
96#endif
97}
98
99static inline void SetSlabDebug(struct page *page)
100{
101#ifdef CONFIG_SLUB_DEBUG
102 SetPageError(page);
103#endif
104}
105
106static inline void ClearSlabDebug(struct page *page)
107{
108#ifdef CONFIG_SLUB_DEBUG
109 ClearPageError(page);
110#endif
111}
112
90/* 113/*
91 * Issues still to be resolved: 114 * Issues still to be resolved:
92 * 115 *
93 * - The per cpu array is updated for each new slab and and is a remote 116 * - The per cpu array is updated for each new slab and and is a remote
94 * cacheline for most nodes. This could become a bouncing cacheline given 117 * cacheline for most nodes. This could become a bouncing cacheline given
95 * enough frequent updates. There are 16 pointers in a cacheline.so at 118 * enough frequent updates. There are 16 pointers in a cacheline, so at
96 * max 16 cpus could compete. Likely okay. 119 * max 16 cpus could compete for the cacheline which may be okay.
97 * 120 *
98 * - Support PAGE_ALLOC_DEBUG. Should be easy to do. 121 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
99 * 122 *
@@ -137,6 +160,7 @@
137 160
138#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ 161#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
139 SLAB_POISON | SLAB_STORE_USER) 162 SLAB_POISON | SLAB_STORE_USER)
163
140/* 164/*
141 * Set of flags that will prevent slab merging 165 * Set of flags that will prevent slab merging
142 */ 166 */
@@ -157,6 +181,11 @@
157/* Internal SLUB flags */ 181/* Internal SLUB flags */
158#define __OBJECT_POISON 0x80000000 /* Poison object */ 182#define __OBJECT_POISON 0x80000000 /* Poison object */
159 183
184/* Not all arches define cache_line_size */
185#ifndef cache_line_size
186#define cache_line_size() L1_CACHE_BYTES
187#endif
188
160static int kmem_size = sizeof(struct kmem_cache); 189static int kmem_size = sizeof(struct kmem_cache);
161 190
162#ifdef CONFIG_SMP 191#ifdef CONFIG_SMP
@@ -166,7 +195,7 @@ static struct notifier_block slab_notifier;
166static enum { 195static enum {
167 DOWN, /* No slab functionality available */ 196 DOWN, /* No slab functionality available */
168 PARTIAL, /* kmem_cache_open() works but kmalloc does not */ 197 PARTIAL, /* kmem_cache_open() works but kmalloc does not */
169 UP, /* Everything works */ 198 UP, /* Everything works but does not show up in sysfs */
170 SYSFS /* Sysfs up */ 199 SYSFS /* Sysfs up */
171} slab_state = DOWN; 200} slab_state = DOWN;
172 201
@@ -174,7 +203,19 @@ static enum {
174static DECLARE_RWSEM(slub_lock); 203static DECLARE_RWSEM(slub_lock);
175LIST_HEAD(slab_caches); 204LIST_HEAD(slab_caches);
176 205
177#ifdef CONFIG_SYSFS 206/*
207 * Tracking user of a slab.
208 */
209struct track {
210 void *addr; /* Called from address */
211 int cpu; /* Was running on cpu */
212 int pid; /* Pid context */
213 unsigned long when; /* When did the operation occur */
214};
215
216enum track_item { TRACK_ALLOC, TRACK_FREE };
217
218#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
178static int sysfs_slab_add(struct kmem_cache *); 219static int sysfs_slab_add(struct kmem_cache *);
179static int sysfs_slab_alias(struct kmem_cache *, const char *); 220static int sysfs_slab_alias(struct kmem_cache *, const char *);
180static void sysfs_slab_remove(struct kmem_cache *); 221static void sysfs_slab_remove(struct kmem_cache *);
@@ -202,6 +243,63 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
202#endif 243#endif
203} 244}
204 245
246static inline int check_valid_pointer(struct kmem_cache *s,
247 struct page *page, const void *object)
248{
249 void *base;
250
251 if (!object)
252 return 1;
253
254 base = page_address(page);
255 if (object < base || object >= base + s->objects * s->size ||
256 (object - base) % s->size) {
257 return 0;
258 }
259
260 return 1;
261}
262
263/*
264 * Slow version of get and set free pointer.
265 *
266 * This version requires touching the cache lines of kmem_cache which
267 * we avoid to do in the fast alloc free paths. There we obtain the offset
268 * from the page struct.
269 */
270static inline void *get_freepointer(struct kmem_cache *s, void *object)
271{
272 return *(void **)(object + s->offset);
273}
274
275static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
276{
277 *(void **)(object + s->offset) = fp;
278}
279
280/* Loop over all objects in a slab */
281#define for_each_object(__p, __s, __addr) \
282 for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
283 __p += (__s)->size)
284
285/* Scan freelist */
286#define for_each_free_object(__p, __s, __free) \
287 for (__p = (__free); __p; __p = get_freepointer((__s), __p))
288
289/* Determine object index from a given position */
290static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
291{
292 return (p - addr) / s->size;
293}
294
295#ifdef CONFIG_SLUB_DEBUG
296/*
297 * Debug settings:
298 */
299static int slub_debug;
300
301static char *slub_debug_slabs;
302
205/* 303/*
206 * Object debugging 304 * Object debugging
207 */ 305 */
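
The helpers and macros gathered above make the slab layout explicit: objects sit back to back at a fixed stride of s->size starting at the page address, each free object stores a pointer to the next free object at offset s->offset inside itself, and an object's index is just (p - addr) / s->size. A userspace toy of that layout (all sizes invented):

    #include <stdio.h>
    #include <stdlib.h>

    #define OBJECTS 4
    #define SIZE    64      /* stride between objects (s->size) */
    #define OFFSET  0       /* where the free pointer lives (s->offset) */

    static void *get_free(void *object)           { return *(void **)((char *)object + OFFSET); }
    static void  set_free(void *object, void *fp) { *(void **)((char *)object + OFFSET) = fp; }

    int main(void)
    {
            char *base = malloc(OBJECTS * SIZE);
            void *p;

            /* Chain every object to the one after it, as new_slab() does. */
            for (int i = 0; i < OBJECTS - 1; i++)
                    set_free(base + i * SIZE, base + (i + 1) * SIZE);
            set_free(base + (OBJECTS - 1) * SIZE, NULL);

            /* Walk the freelist; the index is recovered as in slab_index(). */
            for (p = base; p; p = get_free(p))
                    printf("free object at index %ld\n",
                           (long)(((char *)p - base) / SIZE));

            free(base);
            return 0;
    }
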
@@ -237,35 +335,6 @@ static void print_section(char *text, u8 *addr, unsigned int length)
237 } 335 }
238} 336}
239 337
240/*
241 * Slow version of get and set free pointer.
242 *
243 * This requires touching the cache lines of kmem_cache.
244 * The offset can also be obtained from the page. In that
245 * case it is in the cacheline that we already need to touch.
246 */
247static void *get_freepointer(struct kmem_cache *s, void *object)
248{
249 return *(void **)(object + s->offset);
250}
251
252static void set_freepointer(struct kmem_cache *s, void *object, void *fp)
253{
254 *(void **)(object + s->offset) = fp;
255}
256
257/*
258 * Tracking user of a slab.
259 */
260struct track {
261 void *addr; /* Called from address */
262 int cpu; /* Was running on cpu */
263 int pid; /* Pid context */
264 unsigned long when; /* When did the operation occur */
265};
266
267enum track_item { TRACK_ALLOC, TRACK_FREE };
268
269static struct track *get_track(struct kmem_cache *s, void *object, 338static struct track *get_track(struct kmem_cache *s, void *object,
270 enum track_item alloc) 339 enum track_item alloc)
271{ 340{
@@ -400,24 +469,6 @@ static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
400 return 1; 469 return 1;
401} 470}
402 471
403
404static int check_valid_pointer(struct kmem_cache *s, struct page *page,
405 void *object)
406{
407 void *base;
408
409 if (!object)
410 return 1;
411
412 base = page_address(page);
413 if (object < base || object >= base + s->objects * s->size ||
414 (object - base) % s->size) {
415 return 0;
416 }
417
418 return 1;
419}
420
421/* 472/*
422 * Object layout: 473 * Object layout:
423 * 474 *
@@ -425,26 +476,34 @@ static int check_valid_pointer(struct kmem_cache *s, struct page *page,
425 * Bytes of the object to be managed. 476 * Bytes of the object to be managed.
426 * If the freepointer may overlay the object then the free 477 * If the freepointer may overlay the object then the free
427 * pointer is the first word of the object. 478 * pointer is the first word of the object.
479 *
428 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 480 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
429 * 0xa5 (POISON_END) 481 * 0xa5 (POISON_END)
430 * 482 *
431 * object + s->objsize 483 * object + s->objsize
432 * Padding to reach word boundary. This is also used for Redzoning. 484 * Padding to reach word boundary. This is also used for Redzoning.
433 * Padding is extended to word size if Redzoning is enabled 485 * Padding is extended by another word if Redzoning is enabled and
434 * and objsize == inuse. 486 * objsize == inuse.
487 *
435 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 488 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
436 * 0xcc (RED_ACTIVE) for objects in use. 489 * 0xcc (RED_ACTIVE) for objects in use.
437 * 490 *
438 * object + s->inuse 491 * object + s->inuse
492 * Meta data starts here.
493 *
439 * A. Free pointer (if we cannot overwrite object on free) 494 * A. Free pointer (if we cannot overwrite object on free)
440 * B. Tracking data for SLAB_STORE_USER 495 * B. Tracking data for SLAB_STORE_USER
441 * C. Padding to reach required alignment boundary 496 * C. Padding to reach required alignment boundary or at mininum
442 * Padding is done using 0x5a (POISON_INUSE) 497 * one word if debuggin is on to be able to detect writes
498 * before the word boundary.
499 *
500 * Padding is done using 0x5a (POISON_INUSE)
443 * 501 *
444 * object + s->size 502 * object + s->size
503 * Nothing is used beyond s->size.
445 * 504 *
446 * If slabcaches are merged then the objsize and inuse boundaries are to 505 * If slabcaches are merged then the objsize and inuse boundaries are mostly
447 * be ignored. And therefore no slab options that rely on these boundaries 506 * ignored. And therefore no slab options that rely on these boundaries
448 * may be used with merged slabcaches. 507 * may be used with merged slabcaches.
449 */ 508 */
450 509
@@ -570,8 +629,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
570 /* 629 /*
571 * No choice but to zap it and thus loose the remainder 630 * No choice but to zap it and thus loose the remainder
572 * of the free objects in this slab. May cause 631 * of the free objects in this slab. May cause
573 * another error because the object count maybe 632 * another error because the object count is now wrong.
574 * wrong now.
575 */ 633 */
576 set_freepointer(s, p, NULL); 634 set_freepointer(s, p, NULL);
577 return 0; 635 return 0;
@@ -611,9 +669,8 @@ static int check_slab(struct kmem_cache *s, struct page *page)
611} 669}
612 670
613/* 671/*
614 * Determine if a certain object on a page is on the freelist and 672 * Determine if a certain object on a page is on the freelist. Must hold the
615 * therefore free. Must hold the slab lock for cpu slabs to 673 * slab lock to guarantee that the chains are in a consistent state.
616 * guarantee that the chains are consistent.
617 */ 674 */
618static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 675static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
619{ 676{
@@ -659,7 +716,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
659} 716}
660 717
661/* 718/*
662 * Tracking of fully allocated slabs for debugging 719 * Tracking of fully allocated slabs for debugging purposes.
663 */ 720 */
664static void add_full(struct kmem_cache_node *n, struct page *page) 721static void add_full(struct kmem_cache_node *n, struct page *page)
665{ 722{
@@ -710,7 +767,7 @@ bad:
710 /* 767 /*
711 * If this is a slab page then lets do the best we can 768 * If this is a slab page then lets do the best we can
712 * to avoid issues in the future. Marking all objects 769 * to avoid issues in the future. Marking all objects
713 * as used avoids touching the remainder. 770 * as used avoids touching the remaining objects.
714 */ 771 */
715 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", 772 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
716 s->name, page); 773 s->name, page);
@@ -764,6 +821,113 @@ fail:
764 return 0; 821 return 0;
765} 822}
766 823
824static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc)
825{
826 if (s->flags & SLAB_TRACE) {
827 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
828 s->name,
829 alloc ? "alloc" : "free",
830 object, page->inuse,
831 page->freelist);
832
833 if (!alloc)
834 print_section("Object", (void *)object, s->objsize);
835
836 dump_stack();
837 }
838}
839
840static int __init setup_slub_debug(char *str)
841{
842 if (!str || *str != '=')
843 slub_debug = DEBUG_DEFAULT_FLAGS;
844 else {
845 str++;
846 if (*str == 0 || *str == ',')
847 slub_debug = DEBUG_DEFAULT_FLAGS;
848 else
849 for( ;*str && *str != ','; str++)
850 switch (*str) {
851 case 'f' : case 'F' :
852 slub_debug |= SLAB_DEBUG_FREE;
853 break;
854 case 'z' : case 'Z' :
855 slub_debug |= SLAB_RED_ZONE;
856 break;
857 case 'p' : case 'P' :
858 slub_debug |= SLAB_POISON;
859 break;
860 case 'u' : case 'U' :
861 slub_debug |= SLAB_STORE_USER;
862 break;
863 case 't' : case 'T' :
864 slub_debug |= SLAB_TRACE;
865 break;
866 default:
867 printk(KERN_ERR "slub_debug option '%c' "
868 "unknown. skipped\n",*str);
869 }
870 }
871
872 if (*str == ',')
873 slub_debug_slabs = str + 1;
874 return 1;
875}
876
877__setup("slub_debug", setup_slub_debug);
878
879static void kmem_cache_open_debug_check(struct kmem_cache *s)
880{
881 /*
882 * The page->offset field is only 16 bit wide. This is an offset
883 * in units of words from the beginning of an object. If the slab
884 * size is bigger then we cannot move the free pointer behind the
885 * object anymore.
886 *
887 * On 32 bit platforms the limit is 256k. On 64bit platforms
888 * the limit is 512k.
889 *
890 * Debugging or ctor/dtors may create a need to move the free
891 * pointer. Fail if this happens.
892 */
893 if (s->size >= 65535 * sizeof(void *)) {
894 BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON |
895 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
896 BUG_ON(s->ctor || s->dtor);
897 }
898 else
899 /*
900 * Enable debugging if selected on the kernel commandline.
901 */
902 if (slub_debug && (!slub_debug_slabs ||
903 strncmp(slub_debug_slabs, s->name,
904 strlen(slub_debug_slabs)) == 0))
905 s->flags |= slub_debug;
906}
907#else
908
909static inline int alloc_object_checks(struct kmem_cache *s,
910 struct page *page, void *object) { return 0; }
911
912static inline int free_object_checks(struct kmem_cache *s,
913 struct page *page, void *object) { return 0; }
914
915static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
916static inline void remove_full(struct kmem_cache *s, struct page *page) {}
917static inline void trace(struct kmem_cache *s, struct page *page,
918 void *object, int alloc) {}
919static inline void init_object(struct kmem_cache *s,
920 void *object, int active) {}
921static inline void init_tracking(struct kmem_cache *s, void *object) {}
922static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
923 { return 1; }
924static inline int check_object(struct kmem_cache *s, struct page *page,
925 void *object, int active) { return 1; }
926static inline void set_track(struct kmem_cache *s, void *object,
927 enum track_item alloc, void *addr) {}
928static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {}
929#define slub_debug 0
930#endif
767/* 931/*
768 * Slab allocation and freeing 932 * Slab allocation and freeing
769 */ 933 */
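
setup_slub_debug() above is the parser for the slub_debug kernel parameter: each letter turns on one debug feature (F sanity checks on free, Z red zoning, P poisoning, U user tracking, T tracing), and a name after the comma restricts debugging to caches whose name begins with that string, via the strncmp() in kmem_cache_open_debug_check(). A hedged usage illustration (the cache-name prefix is only an example):

    slub_debug              enable the default debug options for every cache
    slub_debug=ZP           red zoning and poisoning for every cache
    slub_debug=U,kmalloc-   user tracking only for caches whose name starts with "kmalloc-"
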
@@ -797,7 +961,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
797static void setup_object(struct kmem_cache *s, struct page *page, 961static void setup_object(struct kmem_cache *s, struct page *page,
798 void *object) 962 void *object)
799{ 963{
800 if (PageError(page)) { 964 if (SlabDebug(page)) {
801 init_object(s, object, 0); 965 init_object(s, object, 0);
802 init_tracking(s, object); 966 init_tracking(s, object);
803 } 967 }
@@ -832,7 +996,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
832 page->flags |= 1 << PG_slab; 996 page->flags |= 1 << PG_slab;
833 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 997 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
834 SLAB_STORE_USER | SLAB_TRACE)) 998 SLAB_STORE_USER | SLAB_TRACE))
835 page->flags |= 1 << PG_error; 999 SetSlabDebug(page);
836 1000
837 start = page_address(page); 1001 start = page_address(page);
838 end = start + s->objects * s->size; 1002 end = start + s->objects * s->size;
@@ -841,7 +1005,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
841 memset(start, POISON_INUSE, PAGE_SIZE << s->order); 1005 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
842 1006
843 last = start; 1007 last = start;
844 for (p = start + s->size; p < end; p += s->size) { 1008 for_each_object(p, s, start) {
845 setup_object(s, page, last); 1009 setup_object(s, page, last);
846 set_freepointer(s, last, p); 1010 set_freepointer(s, last, p);
847 last = p; 1011 last = p;
@@ -861,13 +1025,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
861{ 1025{
862 int pages = 1 << s->order; 1026 int pages = 1 << s->order;
863 1027
864 if (unlikely(PageError(page) || s->dtor)) { 1028 if (unlikely(SlabDebug(page) || s->dtor)) {
865 void *start = page_address(page);
866 void *end = start + (pages << PAGE_SHIFT);
867 void *p; 1029 void *p;
868 1030
869 slab_pad_check(s, page); 1031 slab_pad_check(s, page);
870 for (p = start; p <= end - s->size; p += s->size) { 1032 for_each_object(p, s, page_address(page)) {
871 if (s->dtor) 1033 if (s->dtor)
872 s->dtor(p, s, 0); 1034 s->dtor(p, s, 0);
873 check_object(s, page, p, 0); 1035 check_object(s, page, p, 0);
@@ -910,7 +1072,8 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
910 1072
911 atomic_long_dec(&n->nr_slabs); 1073 atomic_long_dec(&n->nr_slabs);
912 reset_page_mapcount(page); 1074 reset_page_mapcount(page);
913 page->flags &= ~(1 << PG_slab | 1 << PG_error); 1075 ClearSlabDebug(page);
1076 __ClearPageSlab(page);
914 free_slab(s, page); 1077 free_slab(s, page);
915} 1078}
916 1079
@@ -966,9 +1129,9 @@ static void remove_partial(struct kmem_cache *s,
966} 1129}
967 1130
968/* 1131/*
969 * Lock page and remove it from the partial list 1132 * Lock slab and remove from the partial list.
970 * 1133 *
971 * Must hold list_lock 1134 * Must hold list_lock.
972 */ 1135 */
973static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) 1136static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page)
974{ 1137{
@@ -981,7 +1144,7 @@ static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page)
981} 1144}
982 1145
983/* 1146/*
984 * Try to get a partial slab from a specific node 1147 * Try to allocate a partial slab from a specific node.
985 */ 1148 */
986static struct page *get_partial_node(struct kmem_cache_node *n) 1149static struct page *get_partial_node(struct kmem_cache_node *n)
987{ 1150{
@@ -990,7 +1153,8 @@ static struct page *get_partial_node(struct kmem_cache_node *n)
990 /* 1153 /*
991 * Racy check. If we mistakenly see no partial slabs then we 1154 * Racy check. If we mistakenly see no partial slabs then we
992 * just allocate an empty slab. If we mistakenly try to get a 1155 * just allocate an empty slab. If we mistakenly try to get a
993 * partial slab then get_partials() will return NULL. 1156 * partial slab and there is none available then get_partials()
1157 * will return NULL.
994 */ 1158 */
995 if (!n || !n->nr_partial) 1159 if (!n || !n->nr_partial)
996 return NULL; 1160 return NULL;
@@ -1006,8 +1170,7 @@ out:
1006} 1170}
1007 1171
1008/* 1172/*
1009 * Get a page from somewhere. Search in increasing NUMA 1173 * Get a page from somewhere. Search in increasing NUMA distances.
1010 * distances.
1011 */ 1174 */
1012static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1175static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1013{ 1176{
@@ -1017,24 +1180,22 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1017 struct page *page; 1180 struct page *page;
1018 1181
1019 /* 1182 /*
1020 * The defrag ratio allows to configure the tradeoffs between 1183 * The defrag ratio allows a configuration of the tradeoffs between
1021 * inter node defragmentation and node local allocations. 1184 * inter node defragmentation and node local allocations. A lower
1022 * A lower defrag_ratio increases the tendency to do local 1185 * defrag_ratio increases the tendency to do local allocations
1023 * allocations instead of scanning throught the partial 1186 * instead of attempting to obtain partial slabs from other nodes.
1024 * lists on other nodes.
1025 *
1026 * If defrag_ratio is set to 0 then kmalloc() always
1027 * returns node local objects. If its higher then kmalloc()
1028 * may return off node objects in order to avoid fragmentation.
1029 * 1187 *
1030 * A higher ratio means slabs may be taken from other nodes 1188 * If the defrag_ratio is set to 0 then kmalloc() always
1031 * thus reducing the number of partial slabs on those nodes. 1189 * returns node local objects. If the ratio is higher then kmalloc()
1190 * may return off node objects because partial slabs are obtained
1191 * from other nodes and filled up.
1032 * 1192 *
1033 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes 1193 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes
1034 * defrag_ratio = 1000) then every (well almost) allocation 1194 * defrag_ratio = 1000) then every (well almost) allocation will
1035 * will first attempt to defrag slab caches on other nodes. This 1195 * first attempt to defrag slab caches on other nodes. This means
1036 * means scanning over all nodes to look for partial slabs which 1196 * scanning over all nodes to look for partial slabs which may be
1037 * may be a bit expensive to do on every slab allocation. 1197 * expensive if we do it every time we are trying to find a slab
1198 * with available objects.
1038 */ 1199 */
1039 if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) 1200 if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio)
1040 return NULL; 1201 return NULL;
@@ -1087,18 +1248,19 @@ static void putback_slab(struct kmem_cache *s, struct page *page)
1087 1248
1088 if (page->freelist) 1249 if (page->freelist)
1089 add_partial(n, page); 1250 add_partial(n, page);
1090 else if (PageError(page) && (s->flags & SLAB_STORE_USER)) 1251 else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
1091 add_full(n, page); 1252 add_full(n, page);
1092 slab_unlock(page); 1253 slab_unlock(page);
1093 1254
1094 } else { 1255 } else {
1095 if (n->nr_partial < MIN_PARTIAL) { 1256 if (n->nr_partial < MIN_PARTIAL) {
1096 /* 1257 /*
1097 * Adding an empty page to the partial slabs in order 1258 * Adding an empty slab to the partial slabs in order
1098 * to avoid page allocator overhead. This page needs to 1259 * to avoid page allocator overhead. This slab needs
1099 * come after all the others that are not fully empty 1260 * to come after the other slabs with objects in
1100 * in order to make sure that we do maximum 1261 * order to fill them up. That way the size of the
1101 * defragmentation. 1262 * partial list stays small. kmem_cache_shrink can
1263 * reclaim empty slabs from the partial list.
1102 */ 1264 */
1103 add_partial_tail(n, page); 1265 add_partial_tail(n, page);
1104 slab_unlock(page); 1266 slab_unlock(page);
@@ -1166,11 +1328,11 @@ static void flush_all(struct kmem_cache *s)
1166 * 1. The page struct 1328 * 1. The page struct
1167 * 2. The first cacheline of the object to be allocated. 1329 * 2. The first cacheline of the object to be allocated.
1168 * 1330 *
1169 * The only cache lines that are read (apart from code) is the 1331 * The only other cache lines that are read (apart from code) is the
1170 * per cpu array in the kmem_cache struct. 1332 * per cpu array in the kmem_cache struct.
1171 * 1333 *
1172 * Fastpath is not possible if we need to get a new slab or have 1334 * Fastpath is not possible if we need to get a new slab or have
1173 * debugging enabled (which means all slabs are marked with PageError) 1335 * debugging enabled (which means all slabs are marked with SlabDebug)
1174 */ 1336 */
1175static void *slab_alloc(struct kmem_cache *s, 1337static void *slab_alloc(struct kmem_cache *s,
1176 gfp_t gfpflags, int node, void *addr) 1338 gfp_t gfpflags, int node, void *addr)
@@ -1193,7 +1355,7 @@ redo:
1193 object = page->freelist; 1355 object = page->freelist;
1194 if (unlikely(!object)) 1356 if (unlikely(!object))
1195 goto another_slab; 1357 goto another_slab;
1196 if (unlikely(PageError(page))) 1358 if (unlikely(SlabDebug(page)))
1197 goto debug; 1359 goto debug;
1198 1360
1199have_object: 1361have_object:
@@ -1220,9 +1382,11 @@ have_slab:
1220 cpu = smp_processor_id(); 1382 cpu = smp_processor_id();
1221 if (s->cpu_slab[cpu]) { 1383 if (s->cpu_slab[cpu]) {
1222 /* 1384 /*
1223 * Someone else populated the cpu_slab while we enabled 1385 * Someone else populated the cpu_slab while we
1224 * interrupts, or we have got scheduled on another cpu. 1386 * enabled interrupts, or we have gotten scheduled
1225 * The page may not be on the requested node. 1387 * on another cpu. The page may not be on the
1388 * requested node even if __GFP_THISNODE was
1389 * specified. So we need to recheck.
1226 */ 1390 */
1227 if (node == -1 || 1391 if (node == -1 ||
1228 page_to_nid(s->cpu_slab[cpu]) == node) { 1392 page_to_nid(s->cpu_slab[cpu]) == node) {
@@ -1235,7 +1399,7 @@ have_slab:
1235 slab_lock(page); 1399 slab_lock(page);
1236 goto redo; 1400 goto redo;
1237 } 1401 }
1238 /* Dump the current slab */ 1402 /* New slab does not fit our expectations */
1239 flush_slab(s, s->cpu_slab[cpu], cpu); 1403 flush_slab(s, s->cpu_slab[cpu], cpu);
1240 } 1404 }
1241 slab_lock(page); 1405 slab_lock(page);
@@ -1248,12 +1412,7 @@ debug:
1248 goto another_slab; 1412 goto another_slab;
1249 if (s->flags & SLAB_STORE_USER) 1413 if (s->flags & SLAB_STORE_USER)
1250 set_track(s, object, TRACK_ALLOC, addr); 1414 set_track(s, object, TRACK_ALLOC, addr);
1251 if (s->flags & SLAB_TRACE) { 1415 trace(s, page, object, 1);
1252 printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n",
1253 s->name, object, page->inuse,
1254 page->freelist);
1255 dump_stack();
1256 }
1257 init_object(s, object, 1); 1416 init_object(s, object, 1);
1258 goto have_object; 1417 goto have_object;
1259} 1418}
@@ -1276,7 +1435,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
1276 * The fastpath only writes the cacheline of the page struct and the first 1435 * The fastpath only writes the cacheline of the page struct and the first
1277 * cacheline of the object. 1436 * cacheline of the object.
1278 * 1437 *
1279 * No special cachelines need to be read 1438 * We read the cpu_slab cacheline to check if the slab is the per cpu
1439 * slab for this processor.
1280 */ 1440 */
1281static void slab_free(struct kmem_cache *s, struct page *page, 1441static void slab_free(struct kmem_cache *s, struct page *page,
1282 void *x, void *addr) 1442 void *x, void *addr)
@@ -1288,7 +1448,7 @@ static void slab_free(struct kmem_cache *s, struct page *page,
1288 local_irq_save(flags); 1448 local_irq_save(flags);
1289 slab_lock(page); 1449 slab_lock(page);
1290 1450
1291 if (unlikely(PageError(page))) 1451 if (unlikely(SlabDebug(page)))
1292 goto debug; 1452 goto debug;
1293checks_ok: 1453checks_ok:
1294 prior = object[page->offset] = page->freelist; 1454 prior = object[page->offset] = page->freelist;
@@ -1321,7 +1481,7 @@ out_unlock:
1321slab_empty: 1481slab_empty:
1322 if (prior) 1482 if (prior)
1323 /* 1483 /*
1324 * Slab on the partial list. 1484 * Slab still on the partial list.
1325 */ 1485 */
1326 remove_partial(s, page); 1486 remove_partial(s, page);
1327 1487
@@ -1337,13 +1497,7 @@ debug:
1337 remove_full(s, page); 1497 remove_full(s, page);
1338 if (s->flags & SLAB_STORE_USER) 1498 if (s->flags & SLAB_STORE_USER)
1339 set_track(s, x, TRACK_FREE, addr); 1499 set_track(s, x, TRACK_FREE, addr);
1340 if (s->flags & SLAB_TRACE) { 1500 trace(s, page, object, 0);
1341 printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n",
1342 s->name, object, page->inuse,
1343 page->freelist);
1344 print_section("Object", (void *)object, s->objsize);
1345 dump_stack();
1346 }
1347 init_object(s, object, 0); 1501 init_object(s, object, 0);
1348 goto checks_ok; 1502 goto checks_ok;
1349} 1503}
@@ -1370,22 +1524,16 @@ static struct page *get_object_page(const void *x)
1370} 1524}
1371 1525
1372/* 1526/*
1373 * kmem_cache_open produces objects aligned at "size" and the first object 1527 * Object placement in a slab is made very easy because we always start at
1374 * is placed at offset 0 in the slab (We have no metainformation on the 1528 * offset 0. If we tune the size of the object to the alignment then we can
1375 * slab, all slabs are in essence "off slab"). 1529 * get the required alignment by putting one properly sized object after
1376 * 1530 * another.
1377 * In order to get the desired alignment one just needs to align the
1378 * size.
1379 * 1531 *
1380 * Notice that the allocation order determines the sizes of the per cpu 1532 * Notice that the allocation order determines the sizes of the per cpu
1381 * caches. Each processor has always one slab available for allocations. 1533 * caches. Each processor has always one slab available for allocations.
1382 * Increasing the allocation order reduces the number of times that slabs 1534 * Increasing the allocation order reduces the number of times that slabs
1383 * must be moved on and off the partial lists and therefore may influence 1535 * must be moved on and off the partial lists and is therefore a factor in
1384 * locking overhead. 1536 * locking overhead.
1385 *
1386 * The offset is used to relocate the free list link in each object. It is
1387 * therefore possible to move the free list link behind the object. This
1388 * is necessary for RCU to work properly and also useful for debugging.
1389 */ 1537 */
1390 1538
1391/* 1539/*
@@ -1396,76 +1544,110 @@ static struct page *get_object_page(const void *x)
1396 */ 1544 */
1397static int slub_min_order; 1545static int slub_min_order;
1398static int slub_max_order = DEFAULT_MAX_ORDER; 1546static int slub_max_order = DEFAULT_MAX_ORDER;
1399
1400/*
1401 * Minimum number of objects per slab. This is necessary in order to
1402 * reduce locking overhead. Similar to the queue size in SLAB.
1403 */
1404static int slub_min_objects = DEFAULT_MIN_OBJECTS; 1547static int slub_min_objects = DEFAULT_MIN_OBJECTS;
1405 1548
1406/* 1549/*
1407 * Merge control. If this is set then no merging of slab caches will occur. 1550 * Merge control. If this is set then no merging of slab caches will occur.
1551 * (Could be removed. This was introduced to pacify the merge skeptics.)
1408 */ 1552 */
1409static int slub_nomerge; 1553static int slub_nomerge;
1410 1554
1411/* 1555/*
1412 * Debug settings:
1413 */
1414static int slub_debug;
1415
1416static char *slub_debug_slabs;
1417
1418/*
1419 * Calculate the order of allocation given an slab object size. 1556 * Calculate the order of allocation given an slab object size.
1420 * 1557 *
1421 * The order of allocation has significant impact on other elements 1558 * The order of allocation has significant impact on performance and other
1422 * of the system. Generally order 0 allocations should be preferred 1559 * system components. Generally order 0 allocations should be preferred since
1423 * since they do not cause fragmentation in the page allocator. Larger 1560 * order 0 does not cause fragmentation in the page allocator. Larger objects
1424 * objects may have problems with order 0 because there may be too much 1561 * be problematic to put into order 0 slabs because there may be too much
1425 * space left unused in a slab. We go to a higher order if more than 1/8th 1562 * unused space left. We go to a higher order if more than 1/8th of the slab
1426 * of the slab would be wasted. 1563 * would be wasted.
1427 * 1564 *
1428 * In order to reach satisfactory performance we must ensure that 1565 * In order to reach satisfactory performance we must ensure that a minimum
1429 * a minimum number of objects is in one slab. Otherwise we may 1566 * number of objects is in one slab. Otherwise we may generate too much
1430 * generate too much activity on the partial lists. This is less a 1567 * activity on the partial lists which requires taking the list_lock. This is
1431 * concern for large slabs though. slub_max_order specifies the order 1568 * less a concern for large slabs though which are rarely used.
1432 * where we begin to stop considering the number of objects in a slab.
1433 * 1569 *
1434 * Higher order allocations also allow the placement of more objects 1570 * slub_max_order specifies the order where we begin to stop considering the
1435 * in a slab and thereby reduce object handling overhead. If the user 1571 * number of objects in a slab as critical. If we reach slub_max_order then
1436 * has requested a higher mininum order then we start with that one 1572 * we try to keep the page order as low as possible. So we accept more waste
1437 * instead of zero. 1573 * of space in favor of a small page order.
1574 *
1575 * Higher order allocations also allow the placement of more objects in a
1576 * slab and thereby reduce object handling overhead. If the user has
1577 * requested a higher mininum order then we start with that one instead of
1578 * the smallest order which will fit the object.
1438 */ 1579 */
1439static int calculate_order(int size) 1580static inline int slab_order(int size, int min_objects,
1581 int max_order, int fract_leftover)
1440{ 1582{
1441 int order; 1583 int order;
1442 int rem; 1584 int rem;
1443 1585
1444 for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT); 1586 for (order = max(slub_min_order,
1445 order < MAX_ORDER; order++) { 1587 fls(min_objects * size - 1) - PAGE_SHIFT);
1446 unsigned long slab_size = PAGE_SIZE << order; 1588 order <= max_order; order++) {
1447 1589
1448 if (slub_max_order > order && 1590 unsigned long slab_size = PAGE_SIZE << order;
1449 slab_size < slub_min_objects * size)
1450 continue;
1451 1591
1452 if (slab_size < size) 1592 if (slab_size < min_objects * size)
1453 continue; 1593 continue;
1454 1594
1455 rem = slab_size % size; 1595 rem = slab_size % size;
1456 1596
1457 if (rem <= (PAGE_SIZE << order) / 8) 1597 if (rem <= slab_size / fract_leftover)
1458 break; 1598 break;
1459 1599
1460 } 1600 }
1461 if (order >= MAX_ORDER) 1601
1462 return -E2BIG;
1463 return order; 1602 return order;
1464} 1603}
1465 1604
1605static inline int calculate_order(int size)
1606{
1607 int order;
1608 int min_objects;
1609 int fraction;
1610
1611 /*
1612 * Attempt to find best configuration for a slab. This
1613 * works by first attempting to generate a layout with
1614 * the best configuration and backing off gradually.
1615 *
1616 * First we reduce the acceptable waste in a slab. Then
1617 * we reduce the minimum objects required in a slab.
1618 */
1619 min_objects = slub_min_objects;
1620 while (min_objects > 1) {
1621 fraction = 8;
1622 while (fraction >= 4) {
1623 order = slab_order(size, min_objects,
1624 slub_max_order, fraction);
1625 if (order <= slub_max_order)
1626 return order;
1627 fraction /= 2;
1628 }
1629 min_objects /= 2;
1630 }
1631
1632 /*
1633 * We were unable to place multiple objects in a slab. Now
1634 * lets see if we can place a single object there.
1635 */
1636 order = slab_order(size, 1, slub_max_order, 1);
1637 if (order <= slub_max_order)
1638 return order;
1639
1640 /*
1641 * Doh this slab cannot be placed using slub_max_order.
1642 */
1643 order = slab_order(size, 1, MAX_ORDER, 1);
1644 if (order <= MAX_ORDER)
1645 return order;
1646 return -ENOSYS;
1647}
1648
1466/* 1649/*
1467 * Function to figure out which alignment to use from the 1650 * Figure out what the alignment of the objects will be.
1468 * various ways of specifying it.
1469 */ 1651 */
1470static unsigned long calculate_alignment(unsigned long flags, 1652static unsigned long calculate_alignment(unsigned long flags,
1471 unsigned long align, unsigned long size) 1653 unsigned long align, unsigned long size)
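
slab_order() above returns the smallest page order that holds at least min_objects objects while wasting no more than 1/fract_leftover of the slab, and calculate_order() retries it with progressively relaxed waste fractions and object counts. A standalone re-implementation for a quick arithmetic check (4 KiB pages; the object sizes below are made up):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    static int fls_ul(unsigned long x) { int n = 0; while (x) { n++; x >>= 1; } return n; }
    static int max_int(int a, int b)   { return a > b ? a : b; }

    /* Mirrors slab_order() above, with slub_min_order passed in explicitly. */
    static int slab_order(int size, int min_objects, int max_order,
                          int fract_leftover, int min_order)
    {
            int order;

            for (order = max_int(min_order,
                                 fls_ul(min_objects * size - 1) - PAGE_SHIFT);
                 order <= max_order; order++) {
                    unsigned long slab_size = PAGE_SIZE << order;

                    if (slab_size < (unsigned long)min_objects * size)
                            continue;
                    if (slab_size % size <= slab_size / fract_leftover)
                            break;
            }
            return order;
    }

    int main(void)
    {
            /* 192-byte objects: order 0 already holds 21 of them, wasting 64 bytes. */
            printf("size 192  -> order %d\n", slab_order(192, 8, 1, 8, 0));
            /* 1100-byte objects need order 1 to reach 4 objects; 7 fit, 492 bytes wasted. */
            printf("size 1100 -> order %d\n", slab_order(1100, 4, 1, 8, 0));
            return 0;
    }
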
@@ -1480,8 +1662,8 @@ static unsigned long calculate_alignment(unsigned long flags,
1480 * then use it. 1662 * then use it.
1481 */ 1663 */
1482 if ((flags & SLAB_HWCACHE_ALIGN) && 1664 if ((flags & SLAB_HWCACHE_ALIGN) &&
1483 size > L1_CACHE_BYTES / 2) 1665 size > cache_line_size() / 2)
1484 return max_t(unsigned long, align, L1_CACHE_BYTES); 1666 return max_t(unsigned long, align, cache_line_size());
1485 1667
1486 if (align < ARCH_SLAB_MINALIGN) 1668 if (align < ARCH_SLAB_MINALIGN)
1487 return ARCH_SLAB_MINALIGN; 1669 return ARCH_SLAB_MINALIGN;
@@ -1619,22 +1801,23 @@ static int calculate_sizes(struct kmem_cache *s)
1619 */ 1801 */
1620 size = ALIGN(size, sizeof(void *)); 1802 size = ALIGN(size, sizeof(void *));
1621 1803
1804#ifdef CONFIG_SLUB_DEBUG
1622 /* 1805 /*
1623 * If we are redzoning then check if there is some space between the 1806 * If we are Redzoning then check if there is some space between the
1624 * end of the object and the free pointer. If not then add an 1807 * end of the object and the free pointer. If not then add an
1625 * additional word, so that we can establish a redzone between 1808 * additional word to have some bytes to store Redzone information.
1626 * the object and the freepointer to be able to check for overwrites.
1627 */ 1809 */
1628 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 1810 if ((flags & SLAB_RED_ZONE) && size == s->objsize)
1629 size += sizeof(void *); 1811 size += sizeof(void *);
1812#endif
1630 1813
1631 /* 1814 /*
1632 * With that we have determined how much of the slab is in actual 1815 * With that we have determined the number of bytes in actual use
1633 * use by the object. This is the potential offset to the free 1816 * by the object. This is the potential offset to the free pointer.
1634 * pointer.
1635 */ 1817 */
1636 s->inuse = size; 1818 s->inuse = size;
1637 1819
1820#ifdef CONFIG_SLUB_DEBUG
1638 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 1821 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
1639 s->ctor || s->dtor)) { 1822 s->ctor || s->dtor)) {
1640 /* 1823 /*
@@ -1656,7 +1839,7 @@ static int calculate_sizes(struct kmem_cache *s)
1656 */ 1839 */
1657 size += 2 * sizeof(struct track); 1840 size += 2 * sizeof(struct track);
1658 1841
1659 if (flags & DEBUG_DEFAULT_FLAGS) 1842 if (flags & SLAB_RED_ZONE)
1660 /* 1843 /*
1661 * Add some empty padding so that we can catch 1844 * Add some empty padding so that we can catch
1662 * overwrites from earlier objects rather than let 1845 * overwrites from earlier objects rather than let
@@ -1665,10 +1848,12 @@ static int calculate_sizes(struct kmem_cache *s)
1665 * of the object. 1848 * of the object.
1666 */ 1849 */
1667 size += sizeof(void *); 1850 size += sizeof(void *);
1851#endif
1852
1668 /* 1853 /*
1669 * Determine the alignment based on various parameters that the 1854 * Determine the alignment based on various parameters that the
1670 * user specified (this is unecessarily complex due to the attempt 1855 * user specified and the dynamic determination of cache line size
1671 * to be compatible with SLAB. Should be cleaned up some day). 1856 * on bootup.
1672 */ 1857 */
1673 align = calculate_alignment(flags, align, s->objsize); 1858 align = calculate_alignment(flags, align, s->objsize);
1674 1859
@@ -1700,23 +1885,6 @@ static int calculate_sizes(struct kmem_cache *s)
1700 1885
1701} 1886}
1702 1887
1703static int __init finish_bootstrap(void)
1704{
1705 struct list_head *h;
1706 int err;
1707
1708 slab_state = SYSFS;
1709
1710 list_for_each(h, &slab_caches) {
1711 struct kmem_cache *s =
1712 container_of(h, struct kmem_cache, list);
1713
1714 err = sysfs_slab_add(s);
1715 BUG_ON(err);
1716 }
1717 return 0;
1718}
1719
1720static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 1888static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
1721 const char *name, size_t size, 1889 const char *name, size_t size,
1722 size_t align, unsigned long flags, 1890 size_t align, unsigned long flags,
@@ -1730,32 +1898,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
1730 s->objsize = size; 1898 s->objsize = size;
1731 s->flags = flags; 1899 s->flags = flags;
1732 s->align = align; 1900 s->align = align;
1733 1901 kmem_cache_open_debug_check(s);
1734 /*
1735 * The page->offset field is only 16 bit wide. This is an offset
1736 * in units of words from the beginning of an object. If the slab
1737 * size is bigger then we cannot move the free pointer behind the
1738 * object anymore.
1739 *
1740 * On 32 bit platforms the limit is 256k. On 64bit platforms
1741 * the limit is 512k.
1742 *
1743 * Debugging or ctor/dtors may create a need to move the free
1744 * pointer. Fail if this happens.
1745 */
1746 if (s->size >= 65535 * sizeof(void *)) {
1747 BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON |
1748 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
1749 BUG_ON(ctor || dtor);
1750 }
1751 else
1752 /*
1753 * Enable debugging if selected on the kernel commandline.
1754 */
1755 if (slub_debug && (!slub_debug_slabs ||
1756 strncmp(slub_debug_slabs, name,
1757 strlen(slub_debug_slabs)) == 0))
1758 s->flags |= slub_debug;
1759 1902
1760 if (!calculate_sizes(s)) 1903 if (!calculate_sizes(s))
1761 goto error; 1904 goto error;
@@ -1783,7 +1926,6 @@ EXPORT_SYMBOL(kmem_cache_open);
1783int kmem_ptr_validate(struct kmem_cache *s, const void *object) 1926int kmem_ptr_validate(struct kmem_cache *s, const void *object)
1784{ 1927{
1785 struct page * page; 1928 struct page * page;
1786 void *addr;
1787 1929
1788 page = get_object_page(object); 1930 page = get_object_page(object);
1789 1931
@@ -1791,13 +1933,7 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object)
1791 /* No slab or wrong slab */ 1933 /* No slab or wrong slab */
1792 return 0; 1934 return 0;
1793 1935
1794 addr = page_address(page); 1936 if (!check_valid_pointer(s, page, object))
1795 if (object < addr || object >= addr + s->objects * s->size)
1796 /* Out of bounds */
1797 return 0;
1798
1799 if ((object - addr) % s->size)
1800 /* Improperly aligned */
1801 return 0; 1937 return 0;
1802 1938
1803 /* 1939 /*
@@ -1826,7 +1962,8 @@ const char *kmem_cache_name(struct kmem_cache *s)
1826EXPORT_SYMBOL(kmem_cache_name); 1962EXPORT_SYMBOL(kmem_cache_name);
1827 1963
1828/* 1964/*
1829 * Attempt to free all slabs on a node 1965 * Attempt to free all slabs on a node. Return the number of slabs we
1966 * were unable to free.
1830 */ 1967 */
1831static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, 1968static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
1832 struct list_head *list) 1969 struct list_head *list)
@@ -1847,7 +1984,7 @@ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
1847} 1984}
1848 1985
1849/* 1986/*
1850 * Release all resources used by slab cache 1987 * Release all resources used by a slab cache.
1851 */ 1988 */
1852static int kmem_cache_close(struct kmem_cache *s) 1989static int kmem_cache_close(struct kmem_cache *s)
1853{ 1990{
@@ -1932,45 +2069,6 @@ static int __init setup_slub_nomerge(char *str)
1932 2069
1933__setup("slub_nomerge", setup_slub_nomerge); 2070__setup("slub_nomerge", setup_slub_nomerge);
1934 2071
1935static int __init setup_slub_debug(char *str)
1936{
1937 if (!str || *str != '=')
1938 slub_debug = DEBUG_DEFAULT_FLAGS;
1939 else {
1940 str++;
1941 if (*str == 0 || *str == ',')
1942 slub_debug = DEBUG_DEFAULT_FLAGS;
1943 else
1944 for( ;*str && *str != ','; str++)
1945 switch (*str) {
1946 case 'f' : case 'F' :
1947 slub_debug |= SLAB_DEBUG_FREE;
1948 break;
1949 case 'z' : case 'Z' :
1950 slub_debug |= SLAB_RED_ZONE;
1951 break;
1952 case 'p' : case 'P' :
1953 slub_debug |= SLAB_POISON;
1954 break;
1955 case 'u' : case 'U' :
1956 slub_debug |= SLAB_STORE_USER;
1957 break;
1958 case 't' : case 'T' :
1959 slub_debug |= SLAB_TRACE;
1960 break;
1961 default:
1962 printk(KERN_ERR "slub_debug option '%c' "
1963 "unknown. skipped\n",*str);
1964 }
1965 }
1966
1967 if (*str == ',')
1968 slub_debug_slabs = str + 1;
1969 return 1;
1970}
1971
1972__setup("slub_debug", setup_slub_debug);
1973
1974static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 2072static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
1975 const char *name, int size, gfp_t gfp_flags) 2073 const char *name, int size, gfp_t gfp_flags)
1976{ 2074{
@@ -2108,13 +2206,14 @@ void kfree(const void *x)
2108EXPORT_SYMBOL(kfree); 2206EXPORT_SYMBOL(kfree);
2109 2207
2110/* 2208/*
2111 * kmem_cache_shrink removes empty slabs from the partial lists 2209 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
2112 * and then sorts the partially allocated slabs by the number 2210 * the remaining slabs by the number of items in use. The slabs with the
2113 * of items in use. The slabs with the most items in use 2211 * most items in use come first. New allocations will then fill those up
2114 * come first. New allocations will remove these from the 2212 * and thus they can be removed from the partial lists.
2115 * partial list because they are full. The slabs with the 2213 *
2116 * least items are placed last. If it happens that the objects 2214 * The slabs with the least items are placed last. This results in them
2117 * are freed then the page can be returned to the page allocator. 2215 * being allocated last, increasing the chance that their remaining
2216 * objects are freed and the slab can be returned to the page allocator.
2118 */ 2217 */
2119int kmem_cache_shrink(struct kmem_cache *s) 2218int kmem_cache_shrink(struct kmem_cache *s)
2120{ 2219{
@@ -2143,12 +2242,10 @@ int kmem_cache_shrink(struct kmem_cache *s)
2143 spin_lock_irqsave(&n->list_lock, flags); 2242 spin_lock_irqsave(&n->list_lock, flags);
2144 2243
2145 /* 2244 /*
2146 * Build lists indexed by the items in use in 2245 * Build lists indexed by the items in use in each slab.
2147 * each slab or free slabs if empty.
2148 * 2246 *
2149 * Note that concurrent frees may occur while 2247 * Note that concurrent frees may occur while we hold the
2150 * we hold the list_lock. page->inuse here is 2248 * list_lock. page->inuse here is the upper limit.
2151 * the upper limit.
2152 */ 2249 */
2153 list_for_each_entry_safe(page, t, &n->partial, lru) { 2250 list_for_each_entry_safe(page, t, &n->partial, lru) {
2154 if (!page->inuse && slab_trylock(page)) { 2251 if (!page->inuse && slab_trylock(page)) {
@@ -2172,8 +2269,8 @@ int kmem_cache_shrink(struct kmem_cache *s)
2172 goto out; 2269 goto out;
2173 2270
2174 /* 2271 /*
2175 * Rebuild the partial list with the slabs filled up 2272 * Rebuild the partial list with the slabs filled up most
2176 * most first and the least used slabs at the end. 2273 * first and the least used slabs at the end.
2177 */ 2274 */
2178 for (i = s->objects - 1; i >= 0; i--) 2275 for (i = s->objects - 1; i >= 0; i--)
2179 list_splice(slabs_by_inuse + i, n->partial.prev); 2276 list_splice(slabs_by_inuse + i, n->partial.prev);
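
kmem_cache_shrink() buckets each partial slab by its number of objects in use and splices the buckets back onto the partial list fullest-first, freeing empty slabs along the way. A rough userspace model of that counting-sort step, using plain arrays instead of the kernel's per-count list_heads:

#include <stdio.h>

#define OBJECTS   4      /* assumed objects per slab */
#define MAX_SLABS 16

/*
 * Bucket partial slabs by their inuse count, then emit them fullest
 * first.  Empty slabs are skipped, standing in for being handed back
 * to the page allocator.
 */
static size_t sort_partials(const int *inuse, size_t nslabs, size_t *out)
{
        size_t bucket[OBJECTS][MAX_SLABS];
        size_t count[OBJECTS] = { 0 };
        size_t n = 0;

        for (size_t i = 0; i < nslabs; i++)
                if (inuse[i] > 0)
                        bucket[inuse[i] - 1][count[inuse[i] - 1]++] = i;

        for (int b = OBJECTS - 1; b >= 0; b--)
                for (size_t i = 0; i < count[b]; i++)
                        out[n++] = bucket[b][i];
        return n;
}

int main(void)
{
        int inuse[] = { 1, 0, 3, 2, 3, 0 };
        size_t out[MAX_SLABS];
        size_t n = sort_partials(inuse, 6, out);

        for (size_t i = 0; i < n; i++)
                printf("slab %zu (inuse %d)\n", out[i], inuse[out[i]]);
        return 0;
}
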
@@ -2189,7 +2286,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2189 2286
2190/** 2287/**
2191 * krealloc - reallocate memory. The contents will remain unchanged. 2288 * krealloc - reallocate memory. The contents will remain unchanged.
2192 *
2193 * @p: object to reallocate memory for. 2289 * @p: object to reallocate memory for.
2194 * @new_size: how many bytes of memory are required. 2290 * @new_size: how many bytes of memory are required.
2195 * @flags: the type of memory to allocate. 2291 * @flags: the type of memory to allocate.
@@ -2201,9 +2297,8 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2201 */ 2297 */
2202void *krealloc(const void *p, size_t new_size, gfp_t flags) 2298void *krealloc(const void *p, size_t new_size, gfp_t flags)
2203{ 2299{
2204 struct kmem_cache *new_cache;
2205 void *ret; 2300 void *ret;
2206 struct page *page; 2301 size_t ks;
2207 2302
2208 if (unlikely(!p)) 2303 if (unlikely(!p))
2209 return kmalloc(new_size, flags); 2304 return kmalloc(new_size, flags);
@@ -2213,19 +2308,13 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
2213 return NULL; 2308 return NULL;
2214 } 2309 }
2215 2310
2216 page = virt_to_head_page(p); 2311 ks = ksize(p);
2217 2312 if (ks >= new_size)
2218 new_cache = get_slab(new_size, flags);
2219
2220 /*
2221 * If new size fits in the current cache, bail out.
2222 */
2223 if (likely(page->slab == new_cache))
2224 return (void *)p; 2313 return (void *)p;
2225 2314
2226 ret = kmalloc(new_size, flags); 2315 ret = kmalloc(new_size, flags);
2227 if (ret) { 2316 if (ret) {
2228 memcpy(ret, p, min(new_size, ksize(p))); 2317 memcpy(ret, p, min(new_size, ks));
2229 kfree(p); 2318 kfree(p);
2230 } 2319 }
2231 return ret; 2320 return ret;
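
The rewritten krealloc() asks ksize() how much usable space the current object already provides and only allocates and copies when that is insufficient. The same pattern in userspace C, with glibc's malloc_usable_size() standing in for ksize() (treat that helper as an assumption of the sketch):

#include <malloc.h>   /* malloc_usable_size(), a glibc extension */
#include <stdlib.h>
#include <string.h>

/*
 * Grow-only realloc in the spirit of the new krealloc(): reuse the old
 * block whenever its usable size already covers the request.
 */
static void *grow(void *p, size_t new_size)
{
        size_t ks;
        void *ret;

        if (!p)
                return malloc(new_size);
        if (!new_size) {
                free(p);
                return NULL;
        }

        ks = malloc_usable_size(p);
        if (ks >= new_size)
                return p;               /* current block is big enough */

        ret = malloc(new_size);
        if (ret) {
                memcpy(ret, p, ks);     /* ks < new_size here */
                free(p);
        }
        return ret;
}

int main(void)
{
        char *buf = grow(NULL, 16);

        if (!buf)
                return 1;
        strcpy(buf, "hello");
        buf = grow(buf, 8);     /* usually a no-op: block already fits */
        buf = grow(buf, 4096);  /* forces an allocation and a copy */
        free(buf);
        return 0;
}

Because ksize() reports the size of the underlying slab object rather than the originally requested size, shrinking and modest growth within the same kmalloc cache become no-ops, which is what the removed get_slab() comparison was trying to achieve without the extra cache lookup.
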
@@ -2243,7 +2332,7 @@ void __init kmem_cache_init(void)
2243#ifdef CONFIG_NUMA 2332#ifdef CONFIG_NUMA
2244 /* 2333 /*
2245 * Must first have the slab cache available for the allocations of the 2334 * Must first have the slab cache available for the allocations of the
2246 * struct kmalloc_cache_node's. There is special bootstrap code in 2335 * struct kmem_cache_node's. There is special bootstrap code in
2247 * kmem_cache_open for slab_state == DOWN. 2336 * kmem_cache_open for slab_state == DOWN.
2248 */ 2337 */
2249 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 2338 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
@@ -2280,7 +2369,7 @@ void __init kmem_cache_init(void)
2280 2369
2281 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2370 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2282 " Processors=%d, Nodes=%d\n", 2371 " Processors=%d, Nodes=%d\n",
2283 KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES, 2372 KMALLOC_SHIFT_HIGH, cache_line_size(),
2284 slub_min_order, slub_max_order, slub_min_objects, 2373 slub_min_order, slub_max_order, slub_min_objects,
2285 nr_cpu_ids, nr_node_ids); 2374 nr_cpu_ids, nr_node_ids);
2286} 2375}
@@ -2415,8 +2504,8 @@ static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu)
2415} 2504}
2416 2505
2417/* 2506/*
2418 * Use the cpu notifier to insure that the slab are flushed 2507 * Use the cpu notifier to ensure that the cpu slabs are flushed when
2419 * when necessary. 2508 * necessary.
2420 */ 2509 */
2421static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 2510static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2422 unsigned long action, void *hcpu) 2511 unsigned long action, void *hcpu)
@@ -2425,7 +2514,9 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2425 2514
2426 switch (action) { 2515 switch (action) {
2427 case CPU_UP_CANCELED: 2516 case CPU_UP_CANCELED:
2517 case CPU_UP_CANCELED_FROZEN:
2428 case CPU_DEAD: 2518 case CPU_DEAD:
2519 case CPU_DEAD_FROZEN:
2429 for_all_slabs(__flush_cpu_slab, cpu); 2520 for_all_slabs(__flush_cpu_slab, cpu);
2430 break; 2521 break;
2431 default: 2522 default:
@@ -2439,153 +2530,6 @@ static struct notifier_block __cpuinitdata slab_notifier =
2439 2530
2440#endif 2531#endif
2441 2532
2442#ifdef CONFIG_NUMA
2443
2444/*****************************************************************
2445 * Generic reaper used to support the page allocator
2446 * (the cpu slabs are reaped by a per slab workqueue).
2447 *
2448 * Maybe move this to the page allocator?
2449 ****************************************************************/
2450
2451static DEFINE_PER_CPU(unsigned long, reap_node);
2452
2453static void init_reap_node(int cpu)
2454{
2455 int node;
2456
2457 node = next_node(cpu_to_node(cpu), node_online_map);
2458 if (node == MAX_NUMNODES)
2459 node = first_node(node_online_map);
2460
2461 __get_cpu_var(reap_node) = node;
2462}
2463
2464static void next_reap_node(void)
2465{
2466 int node = __get_cpu_var(reap_node);
2467
2468 /*
2469 * Also drain per cpu pages on remote zones
2470 */
2471 if (node != numa_node_id())
2472 drain_node_pages(node);
2473
2474 node = next_node(node, node_online_map);
2475 if (unlikely(node >= MAX_NUMNODES))
2476 node = first_node(node_online_map);
2477 __get_cpu_var(reap_node) = node;
2478}
2479#else
2480#define init_reap_node(cpu) do { } while (0)
2481#define next_reap_node(void) do { } while (0)
2482#endif
2483
2484#define REAPTIMEOUT_CPUC (2*HZ)
2485
2486#ifdef CONFIG_SMP
2487static DEFINE_PER_CPU(struct delayed_work, reap_work);
2488
2489static void cache_reap(struct work_struct *unused)
2490{
2491 next_reap_node();
2492 refresh_cpu_vm_stats(smp_processor_id());
2493 schedule_delayed_work(&__get_cpu_var(reap_work),
2494 REAPTIMEOUT_CPUC);
2495}
2496
2497static void __devinit start_cpu_timer(int cpu)
2498{
2499 struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
2500
2501 /*
2502 * When this gets called from do_initcalls via cpucache_init(),
2503 * init_workqueues() has already run, so keventd will be setup
2504 * at that time.
2505 */
2506 if (keventd_up() && reap_work->work.func == NULL) {
2507 init_reap_node(cpu);
2508 INIT_DELAYED_WORK(reap_work, cache_reap);
2509 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
2510 }
2511}
2512
2513static int __init cpucache_init(void)
2514{
2515 int cpu;
2516
2517 /*
2518 * Register the timers that drain pcp pages and update vm statistics
2519 */
2520 for_each_online_cpu(cpu)
2521 start_cpu_timer(cpu);
2522 return 0;
2523}
2524__initcall(cpucache_init);
2525#endif
2526
2527#ifdef SLUB_RESILIENCY_TEST
2528static unsigned long validate_slab_cache(struct kmem_cache *s);
2529
2530static void resiliency_test(void)
2531{
2532 u8 *p;
2533
2534 printk(KERN_ERR "SLUB resiliency testing\n");
2535 printk(KERN_ERR "-----------------------\n");
2536 printk(KERN_ERR "A. Corruption after allocation\n");
2537
2538 p = kzalloc(16, GFP_KERNEL);
2539 p[16] = 0x12;
2540 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
2541 " 0x12->0x%p\n\n", p + 16);
2542
2543 validate_slab_cache(kmalloc_caches + 4);
2544
2545 /* Hmmm... The next two are dangerous */
2546 p = kzalloc(32, GFP_KERNEL);
2547 p[32 + sizeof(void *)] = 0x34;
2548 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
2549 " 0x34 -> -0x%p\n", p);
2550 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2551
2552 validate_slab_cache(kmalloc_caches + 5);
2553 p = kzalloc(64, GFP_KERNEL);
2554 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
2555 *p = 0x56;
2556 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
2557 p);
2558 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2559 validate_slab_cache(kmalloc_caches + 6);
2560
2561 printk(KERN_ERR "\nB. Corruption after free\n");
2562 p = kzalloc(128, GFP_KERNEL);
2563 kfree(p);
2564 *p = 0x78;
2565 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
2566 validate_slab_cache(kmalloc_caches + 7);
2567
2568 p = kzalloc(256, GFP_KERNEL);
2569 kfree(p);
2570 p[50] = 0x9a;
2571 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
2572 validate_slab_cache(kmalloc_caches + 8);
2573
2574 p = kzalloc(512, GFP_KERNEL);
2575 kfree(p);
2576 p[512] = 0xab;
2577 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
2578 validate_slab_cache(kmalloc_caches + 9);
2579}
2580#else
2581static void resiliency_test(void) {};
2582#endif
2583
2584/*
2585 * These are not as efficient as kmalloc for the non debug case.
2586 * We do not have the page struct available so we have to touch one
2587 * cacheline in struct kmem_cache to check slab flags.
2588 */
2589void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) 2533void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2590{ 2534{
2591 struct kmem_cache *s = get_slab(size, gfpflags); 2535 struct kmem_cache *s = get_slab(size, gfpflags);
@@ -2607,13 +2551,12 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
2607 return slab_alloc(s, gfpflags, node, caller); 2551 return slab_alloc(s, gfpflags, node, caller);
2608} 2552}
2609 2553
2610#ifdef CONFIG_SYSFS 2554#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
2611
2612static int validate_slab(struct kmem_cache *s, struct page *page) 2555static int validate_slab(struct kmem_cache *s, struct page *page)
2613{ 2556{
2614 void *p; 2557 void *p;
2615 void *addr = page_address(page); 2558 void *addr = page_address(page);
2616 unsigned long map[BITS_TO_LONGS(s->objects)]; 2559 DECLARE_BITMAP(map, s->objects);
2617 2560
2618 if (!check_slab(s, page) || 2561 if (!check_slab(s, page) ||
2619 !on_freelist(s, page, NULL)) 2562 !on_freelist(s, page, NULL))
@@ -2622,14 +2565,14 @@ static int validate_slab(struct kmem_cache *s, struct page *page)
2622 /* Now we know that a valid freelist exists */ 2565 /* Now we know that a valid freelist exists */
2623 bitmap_zero(map, s->objects); 2566 bitmap_zero(map, s->objects);
2624 2567
2625 for(p = page->freelist; p; p = get_freepointer(s, p)) { 2568 for_each_free_object(p, s, page->freelist) {
2626 set_bit((p - addr) / s->size, map); 2569 set_bit(slab_index(p, s, addr), map);
2627 if (!check_object(s, page, p, 0)) 2570 if (!check_object(s, page, p, 0))
2628 return 0; 2571 return 0;
2629 } 2572 }
2630 2573
2631 for(p = addr; p < addr + s->objects * s->size; p += s->size) 2574 for_each_object(p, s, addr)
2632 if (!test_bit((p - addr) / s->size, map)) 2575 if (!test_bit(slab_index(p, s, addr), map))
2633 if (!check_object(s, page, p, 1)) 2576 if (!check_object(s, page, p, 1))
2634 return 0; 2577 return 0;
2635 return 1; 2578 return 1;
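
validate_slab() is a two-pass walk: mark every object reachable from the freelist in a bitmap, then treat every unmarked object as allocated and check it separately. A compact model of that walk, with the freelist reduced to an array of object indices rather than SLUB's in-object free pointers:

#include <stdbool.h>
#include <stdio.h>

#define OBJECTS 8

/*
 * Two-pass walk in the style of validate_slab(): mark everything on the
 * freelist, then everything left unmarked must be an allocated object.
 * Returns -1 on a corrupt freelist, otherwise the number of allocated
 * objects found.
 */
static int walk_slab(const int *freelist, size_t nfree)
{
        bool is_free[OBJECTS] = { false };
        int allocated = 0;

        for (size_t i = 0; i < nfree; i++) {
                int idx = freelist[i];

                if (idx < 0 || idx >= OBJECTS || is_free[idx])
                        return -1;  /* bad index or cyclic freelist */
                is_free[idx] = true;
                /* the kernel would run check_object(..., 0) here */
        }

        for (int i = 0; i < OBJECTS; i++)
                if (!is_free[i]) {
                        /* the kernel would run check_object(..., 1) here */
                        allocated++;
                }
        return allocated;
}

int main(void)
{
        int freelist[] = { 2, 5, 7 };

        printf("%d\n", walk_slab(freelist, 3)); /* 5 */
        return 0;
}
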
@@ -2645,12 +2588,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page)
2645 s->name, page); 2588 s->name, page);
2646 2589
2647 if (s->flags & DEBUG_DEFAULT_FLAGS) { 2590 if (s->flags & DEBUG_DEFAULT_FLAGS) {
2648 if (!PageError(page)) 2591 if (!SlabDebug(page))
2649 printk(KERN_ERR "SLUB %s: PageError not set " 2592 printk(KERN_ERR "SLUB %s: SlabDebug not set "
2650 "on slab 0x%p\n", s->name, page); 2593 "on slab 0x%p\n", s->name, page);
2651 } else { 2594 } else {
2652 if (PageError(page)) 2595 if (SlabDebug(page))
2653 printk(KERN_ERR "SLUB %s: PageError set on " 2596 printk(KERN_ERR "SLUB %s: SlabDebug set on "
2654 "slab 0x%p\n", s->name, page); 2597 "slab 0x%p\n", s->name, page);
2655 } 2598 }
2656} 2599}
@@ -2702,14 +2645,76 @@ static unsigned long validate_slab_cache(struct kmem_cache *s)
2702 return count; 2645 return count;
2703} 2646}
2704 2647
2648#ifdef SLUB_RESILIENCY_TEST
2649static void resiliency_test(void)
2650{
2651 u8 *p;
2652
2653 printk(KERN_ERR "SLUB resiliency testing\n");
2654 printk(KERN_ERR "-----------------------\n");
2655 printk(KERN_ERR "A. Corruption after allocation\n");
2656
2657 p = kzalloc(16, GFP_KERNEL);
2658 p[16] = 0x12;
2659 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
2660 " 0x12->0x%p\n\n", p + 16);
2661
2662 validate_slab_cache(kmalloc_caches + 4);
2663
2664 /* Hmmm... The next two are dangerous */
2665 p = kzalloc(32, GFP_KERNEL);
2666 p[32 + sizeof(void *)] = 0x34;
2667 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
2668 " 0x34 -> -0x%p\n", p);
2669 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2670
2671 validate_slab_cache(kmalloc_caches + 5);
2672 p = kzalloc(64, GFP_KERNEL);
2673 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
2674 *p = 0x56;
2675 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
2676 p);
2677 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2678 validate_slab_cache(kmalloc_caches + 6);
2679
2680 printk(KERN_ERR "\nB. Corruption after free\n");
2681 p = kzalloc(128, GFP_KERNEL);
2682 kfree(p);
2683 *p = 0x78;
2684 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
2685 validate_slab_cache(kmalloc_caches + 7);
2686
2687 p = kzalloc(256, GFP_KERNEL);
2688 kfree(p);
2689 p[50] = 0x9a;
2690 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
2691 validate_slab_cache(kmalloc_caches + 8);
2692
2693 p = kzalloc(512, GFP_KERNEL);
2694 kfree(p);
2695 p[512] = 0xab;
2696 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
2697 validate_slab_cache(kmalloc_caches + 9);
2698}
2699#else
2700static void resiliency_test(void) {};
2701#endif
2702
2705/* 2703/*
2706 * Generate lists of locations where slabcache objects are allocated 2704 * Generate lists of code addresses where slabcache objects are allocated
2707 * and freed. 2705 * and freed.
2708 */ 2706 */
2709 2707
2710struct location { 2708struct location {
2711 unsigned long count; 2709 unsigned long count;
2712 void *addr; 2710 void *addr;
2711 long long sum_time;
2712 long min_time;
2713 long max_time;
2714 long min_pid;
2715 long max_pid;
2716 cpumask_t cpus;
2717 nodemask_t nodes;
2713}; 2718};
2714 2719
2715struct loc_track { 2720struct loc_track {
@@ -2750,11 +2755,12 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max)
2750} 2755}
2751 2756
2752static int add_location(struct loc_track *t, struct kmem_cache *s, 2757static int add_location(struct loc_track *t, struct kmem_cache *s,
2753 void *addr) 2758 const struct track *track)
2754{ 2759{
2755 long start, end, pos; 2760 long start, end, pos;
2756 struct location *l; 2761 struct location *l;
2757 void *caddr; 2762 void *caddr;
2763 unsigned long age = jiffies - track->when;
2758 2764
2759 start = -1; 2765 start = -1;
2760 end = t->count; 2766 end = t->count;
@@ -2770,19 +2776,36 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
2770 break; 2776 break;
2771 2777
2772 caddr = t->loc[pos].addr; 2778 caddr = t->loc[pos].addr;
2773 if (addr == caddr) { 2779 if (track->addr == caddr) {
2774 t->loc[pos].count++; 2780
2781 l = &t->loc[pos];
2782 l->count++;
2783 if (track->when) {
2784 l->sum_time += age;
2785 if (age < l->min_time)
2786 l->min_time = age;
2787 if (age > l->max_time)
2788 l->max_time = age;
2789
2790 if (track->pid < l->min_pid)
2791 l->min_pid = track->pid;
2792 if (track->pid > l->max_pid)
2793 l->max_pid = track->pid;
2794
2795 cpu_set(track->cpu, l->cpus);
2796 }
2797 node_set(page_to_nid(virt_to_page(track)), l->nodes);
2775 return 1; 2798 return 1;
2776 } 2799 }
2777 2800
2778 if (addr < caddr) 2801 if (track->addr < caddr)
2779 end = pos; 2802 end = pos;
2780 else 2803 else
2781 start = pos; 2804 start = pos;
2782 } 2805 }
2783 2806
2784 /* 2807 /*
2785 * Not found. Insert new tracking element 2808 * Not found. Insert new tracking element.
2786 */ 2809 */
2787 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) 2810 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max))
2788 return 0; 2811 return 0;
@@ -2793,7 +2816,16 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
2793 (t->count - pos) * sizeof(struct location)); 2816 (t->count - pos) * sizeof(struct location));
2794 t->count++; 2817 t->count++;
2795 l->count = 1; 2818 l->count = 1;
2796 l->addr = addr; 2819 l->addr = track->addr;
2820 l->sum_time = age;
2821 l->min_time = age;
2822 l->max_time = age;
2823 l->min_pid = track->pid;
2824 l->max_pid = track->pid;
2825 cpus_clear(l->cpus);
2826 cpu_set(track->cpu, l->cpus);
2827 nodes_clear(l->nodes);
2828 node_set(page_to_nid(virt_to_page(track)), l->nodes);
2797 return 1; 2829 return 1;
2798} 2830}
2799 2831
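
add_location() now accumulates per-call-site statistics rather than a bare counter: hit count, minimum/average/maximum object age, the pid range, and cpu/node masks. A stripped-down userspace version of the accumulation step (the track_rec and loc_stats layouts are invented for the example; the cpu/node masks are omitted):

#include <stdio.h>

/* Invented stand-ins for struct track and struct location. */
struct track_rec { unsigned long when, now; int pid; };

struct loc_stats {
        unsigned long count;
        long long     sum_age;
        long          min_age, max_age;
        int           min_pid, max_pid;
};

/* Fold one alloc/free record into the per-call-site statistics. */
static void record(struct loc_stats *l, const struct track_rec *t)
{
        long age = (long)(t->now - t->when);

        if (!l->count) {
                l->min_age = l->max_age = age;
                l->min_pid = l->max_pid = t->pid;
        } else {
                if (age < l->min_age)
                        l->min_age = age;
                if (age > l->max_age)
                        l->max_age = age;
                if (t->pid < l->min_pid)
                        l->min_pid = t->pid;
                if (t->pid > l->max_pid)
                        l->max_pid = t->pid;
        }
        l->count++;
        l->sum_age += age;
}

int main(void)
{
        struct loc_stats l = { 0 };
        struct track_rec samples[] = { {100, 130, 42}, {90, 200, 17}, {10, 15, 42} };

        for (int i = 0; i < 3; i++)
                record(&l, &samples[i]);

        /* Same shape as the "age=min/avg/max pid=min-max" output below. */
        printf("count=%lu age=%ld/%lld/%ld pid=%d-%d\n",
               l.count, l.min_age, l.sum_age / (long long)l.count, l.max_age,
               l.min_pid, l.max_pid);
        return 0;
}
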
@@ -2801,19 +2833,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
2801 struct page *page, enum track_item alloc) 2833 struct page *page, enum track_item alloc)
2802{ 2834{
2803 void *addr = page_address(page); 2835 void *addr = page_address(page);
2804 unsigned long map[BITS_TO_LONGS(s->objects)]; 2836 DECLARE_BITMAP(map, s->objects);
2805 void *p; 2837 void *p;
2806 2838
2807 bitmap_zero(map, s->objects); 2839 bitmap_zero(map, s->objects);
2808 for (p = page->freelist; p; p = get_freepointer(s, p)) 2840 for_each_free_object(p, s, page->freelist)
2809 set_bit((p - addr) / s->size, map); 2841 set_bit(slab_index(p, s, addr), map);
2810
2811 for (p = addr; p < addr + s->objects * s->size; p += s->size)
2812 if (!test_bit((p - addr) / s->size, map)) {
2813 void *addr = get_track(s, p, alloc)->addr;
2814 2842
2815 add_location(t, s, addr); 2843 for_each_object(p, s, addr)
2816 } 2844 if (!test_bit(slab_index(p, s, addr), map))
2845 add_location(t, s, get_track(s, p, alloc));
2817} 2846}
2818 2847
2819static int list_locations(struct kmem_cache *s, char *buf, 2848static int list_locations(struct kmem_cache *s, char *buf,
@@ -2847,15 +2876,47 @@ static int list_locations(struct kmem_cache *s, char *buf,
2847 } 2876 }
2848 2877
2849 for (i = 0; i < t.count; i++) { 2878 for (i = 0; i < t.count; i++) {
2850 void *addr = t.loc[i].addr; 2879 struct location *l = &t.loc[i];
2851 2880
2852 if (n > PAGE_SIZE - 100) 2881 if (n > PAGE_SIZE - 100)
2853 break; 2882 break;
2854 n += sprintf(buf + n, "%7ld ", t.loc[i].count); 2883 n += sprintf(buf + n, "%7ld ", l->count);
2855 if (addr) 2884
2856 n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr); 2885 if (l->addr)
2886 n += sprint_symbol(buf + n, (unsigned long)l->addr);
2857 else 2887 else
2858 n += sprintf(buf + n, "<not-available>"); 2888 n += sprintf(buf + n, "<not-available>");
2889
2890 if (l->sum_time != l->min_time) {
2891 unsigned long remainder;
2892
2893 n += sprintf(buf + n, " age=%ld/%ld/%ld",
2894 l->min_time,
2895 div_long_long_rem(l->sum_time, l->count, &remainder),
2896 l->max_time);
2897 } else
2898 n += sprintf(buf + n, " age=%ld",
2899 l->min_time);
2900
2901 if (l->min_pid != l->max_pid)
2902 n += sprintf(buf + n, " pid=%ld-%ld",
2903 l->min_pid, l->max_pid);
2904 else
2905 n += sprintf(buf + n, " pid=%ld",
2906 l->min_pid);
2907
2908 if (num_online_cpus() > 1 && !cpus_empty(l->cpus)) {
2909 n += sprintf(buf + n, " cpus=");
2910 n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50,
2911 l->cpus);
2912 }
2913
2914 if (num_online_nodes() > 1 && !nodes_empty(l->nodes)) {
2915 n += sprintf(buf + n, " nodes=");
2916 n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50,
2917 l->nodes);
2918 }
2919
2859 n += sprintf(buf + n, "\n"); 2920 n += sprintf(buf + n, "\n");
2860 } 2921 }
2861 2922
@@ -3491,6 +3552,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
3491 3552
3492static int __init slab_sysfs_init(void) 3553static int __init slab_sysfs_init(void)
3493{ 3554{
3555 struct list_head *h;
3494 int err; 3556 int err;
3495 3557
3496 err = subsystem_register(&slab_subsys); 3558 err = subsystem_register(&slab_subsys);
@@ -3499,7 +3561,15 @@ static int __init slab_sysfs_init(void)
3499 return -ENOSYS; 3561 return -ENOSYS;
3500 } 3562 }
3501 3563
3502 finish_bootstrap(); 3564 slab_state = SYSFS;
3565
3566 list_for_each(h, &slab_caches) {
3567 struct kmem_cache *s =
3568 container_of(h, struct kmem_cache, list);
3569
3570 err = sysfs_slab_add(s);
3571 BUG_ON(err);
3572 }
3503 3573
3504 while (alias_list) { 3574 while (alias_list) {
3505 struct saved_alias *al = alias_list; 3575 struct saved_alias *al = alias_list;
@@ -3515,6 +3585,4 @@ static int __init slab_sysfs_init(void)
3515} 3585}
3516 3586
3517__initcall(slab_sysfs_init); 3587__initcall(slab_sysfs_init);
3518#else
3519__initcall(finish_bootstrap);
3520#endif 3588#endif
diff --git a/mm/swap.c b/mm/swap.c
index 218c52a24a21..d3cb966fe992 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -488,7 +488,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
488 long *committed; 488 long *committed;
489 489
490 committed = &per_cpu(committed_space, (long)hcpu); 490 committed = &per_cpu(committed_space, (long)hcpu);
491 if (action == CPU_DEAD) { 491 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
492 atomic_add(*committed, &vm_committed_space); 492 atomic_add(*committed, &vm_committed_space);
493 *committed = 0; 493 *committed = 0;
494 __lru_add_drain((long)hcpu); 494 __lru_add_drain((long)hcpu);
diff --git a/mm/truncate.c b/mm/truncate.c
index 0f4b6d18ab0e..4fbe1a2da5fb 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -12,6 +12,7 @@
12#include <linux/swap.h> 12#include <linux/swap.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/highmem.h>
15#include <linux/pagevec.h> 16#include <linux/pagevec.h>
16#include <linux/task_io_accounting_ops.h> 17#include <linux/task_io_accounting_ops.h>
17#include <linux/buffer_head.h> /* grr. try_to_release_page, 18#include <linux/buffer_head.h> /* grr. try_to_release_page,
@@ -46,7 +47,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
46 47
47static inline void truncate_partial_page(struct page *page, unsigned partial) 48static inline void truncate_partial_page(struct page *page, unsigned partial)
48{ 49{
49 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); 50 zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0);
50 if (PagePrivate(page)) 51 if (PagePrivate(page))
51 do_invalidatepage(page, partial); 52 do_invalidatepage(page, partial);
52} 53}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1c8e75a1cfcd..1be5a6376ef0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1528,7 +1528,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
1528 pg_data_t *pgdat; 1528 pg_data_t *pgdat;
1529 cpumask_t mask; 1529 cpumask_t mask;
1530 1530
1531 if (action == CPU_ONLINE) { 1531 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
1532 for_each_online_pgdat(pgdat) { 1532 for_each_online_pgdat(pgdat) {
1533 mask = node_to_cpumask(pgdat->node_id); 1533 mask = node_to_cpumask(pgdat->node_id);
1534 if (any_online_cpu(mask) != NR_CPUS) 1534 if (any_online_cpu(mask) != NR_CPUS)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6c488d6ac425..9832d9a41d8c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -281,6 +281,17 @@ EXPORT_SYMBOL(dec_zone_page_state);
281 281
282/* 282/*
283 * Update the zone counters for one cpu. 283 * Update the zone counters for one cpu.
284 *
285 * Note that refresh_cpu_vm_stats strives to only access
286 * node local memory. The per cpu pagesets on remote zones are placed
287 * in the memory local to the processor using that pageset. So the
288 * loop over all zones will access a series of cachelines local to
289 * the processor.
290 *
291 * The call to zone_page_state_add updates the cachelines with the
292 * statistics in the remote zone struct as well as the global cachelines
293 * with the global counters. These could cause remote node cache line
294 * bouncing and should therefore only be done when necessary.
284 */ 295 */
285void refresh_cpu_vm_stats(int cpu) 296void refresh_cpu_vm_stats(int cpu)
286{ 297{
@@ -289,21 +300,54 @@ void refresh_cpu_vm_stats(int cpu)
289 unsigned long flags; 300 unsigned long flags;
290 301
291 for_each_zone(zone) { 302 for_each_zone(zone) {
292 struct per_cpu_pageset *pcp; 303 struct per_cpu_pageset *p;
293 304
294 if (!populated_zone(zone)) 305 if (!populated_zone(zone))
295 continue; 306 continue;
296 307
297 pcp = zone_pcp(zone, cpu); 308 p = zone_pcp(zone, cpu);
298 309
299 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 310 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
300 if (pcp->vm_stat_diff[i]) { 311 if (p->vm_stat_diff[i]) {
301 local_irq_save(flags); 312 local_irq_save(flags);
302 zone_page_state_add(pcp->vm_stat_diff[i], 313 zone_page_state_add(p->vm_stat_diff[i],
303 zone, i); 314 zone, i);
304 pcp->vm_stat_diff[i] = 0; 315 p->vm_stat_diff[i] = 0;
316#ifdef CONFIG_NUMA
317 /* 3 seconds idle till flush */
318 p->expire = 3;
319#endif
305 local_irq_restore(flags); 320 local_irq_restore(flags);
306 } 321 }
322#ifdef CONFIG_NUMA
323 /*
324 * Deal with draining the remote pageset of this
325 * processor
326 *
327 * Check if there are pages remaining in this pageset;
328 * if not, there is nothing to expire.
329 */
330 if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
331 continue;
332
333 /*
334 * We never drain zones local to this processor.
335 */
336 if (zone_to_nid(zone) == numa_node_id()) {
337 p->expire = 0;
338 continue;
339 }
340
341 p->expire--;
342 if (p->expire)
343 continue;
344
345 if (p->pcp[0].count)
346 drain_zone_pages(zone, p->pcp + 0);
347
348 if (p->pcp[1].count)
349 drain_zone_pages(zone, p->pcp + 1);
350#endif
307 } 351 }
308} 352}
309 353
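
The NUMA branch gives each remote pageset a small expire countdown: it is re-armed to 3 whenever this cpu updated the zone's counters, ticked down on every refresh pass, and the cached remote pages are drained only when it reaches zero. A userspace model of that countdown, with the pageset reduced to a single pending count:

#include <stdbool.h>
#include <stdio.h>

struct pageset {
        int pending;   /* pages cached locally for a remote zone */
        int expire;    /* refresh passes left before we drain    */
};

/* One refresh pass over a remote pageset, mirroring the new NUMA branch. */
static void refresh(struct pageset *p, bool counters_touched)
{
        if (counters_touched)
                p->expire = 3;          /* ~3 seconds idle till flush */

        if (!p->expire || !p->pending)
                return;                 /* nothing to expire */

        if (--p->expire)
                return;                 /* not idle for long enough yet */

        printf("draining %d pages\n", p->pending);
        p->pending = 0;
}

int main(void)
{
        struct pageset p = { .pending = 12, .expire = 0 };

        refresh(&p, true);    /* activity re-arms the countdown (and ticks it) */
        refresh(&p, false);   /* idle tick */
        refresh(&p, false);   /* countdown reaches zero: drain */
        refresh(&p, false);   /* nothing left to do */
        return 0;
}

With refresh_cpu_vm_stats() driven roughly once per second by the new vmstat work, three ticks matches the "3 seconds idle till flush" comment in the hunk.
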
@@ -640,6 +684,24 @@ const struct seq_operations vmstat_op = {
640#endif /* CONFIG_PROC_FS */ 684#endif /* CONFIG_PROC_FS */
641 685
642#ifdef CONFIG_SMP 686#ifdef CONFIG_SMP
687static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
688int sysctl_stat_interval __read_mostly = HZ;
689
690static void vmstat_update(struct work_struct *w)
691{
692 refresh_cpu_vm_stats(smp_processor_id());
693 schedule_delayed_work(&__get_cpu_var(vmstat_work),
694 sysctl_stat_interval);
695}
696
697static void __devinit start_cpu_timer(int cpu)
698{
699 struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);
700
701 INIT_DELAYED_WORK(vmstat_work, vmstat_update);
702 schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu);
703}
704
643/* 705/*
644 * Use the cpu notifier to insure that the thresholds are recalculated 706 * Use the cpu notifier to insure that the thresholds are recalculated
645 * when necessary. 707 * when necessary.
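
vmstat_update() re-arms itself every sysctl_stat_interval, and start_cpu_timer() staggers the initial runs by HZ + cpu so the cpus do not all refresh at once. Kernel delayed work has no direct userspace equivalent, but the timing structure can be sketched with POSIX threads (the cpu count, interval, and iteration count below are arbitrary):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define NCPUS    4
#define INTERVAL 1   /* stand-in for sysctl_stat_interval, in seconds */

/* Each "cpu" refreshes its statistics periodically, with a staggered start. */
static void *vmstat_worker(void *arg)
{
        long cpu = (long)arg;

        usleep(100000 * cpu);           /* stagger the start, like HZ + cpu   */
        for (int pass = 0; pass < 3; pass++) {
                printf("cpu %ld: refreshing stats\n", cpu);
                sleep(INTERVAL);        /* re-arm, like schedule_delayed_work */
        }
        return NULL;
}

int main(void)
{
        pthread_t tid[NCPUS];

        for (long cpu = 0; cpu < NCPUS; cpu++)
                pthread_create(&tid[cpu], NULL, vmstat_worker, (void *)cpu);
        for (long cpu = 0; cpu < NCPUS; cpu++)
                pthread_join(tid[cpu], NULL);
        return 0;
}
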
@@ -648,10 +710,24 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
648 unsigned long action, 710 unsigned long action,
649 void *hcpu) 711 void *hcpu)
650{ 712{
713 long cpu = (long)hcpu;
714
651 switch (action) { 715 switch (action) {
652 case CPU_UP_PREPARE: 716 case CPU_ONLINE:
653 case CPU_UP_CANCELED: 717 case CPU_ONLINE_FROZEN:
718 start_cpu_timer(cpu);
719 break;
720 case CPU_DOWN_PREPARE:
721 case CPU_DOWN_PREPARE_FROZEN:
722 cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
723 per_cpu(vmstat_work, cpu).work.func = NULL;
724 break;
725 case CPU_DOWN_FAILED:
726 case CPU_DOWN_FAILED_FROZEN:
727 start_cpu_timer(cpu);
728 break;
654 case CPU_DEAD: 729 case CPU_DEAD:
730 case CPU_DEAD_FROZEN:
655 refresh_zone_stat_thresholds(); 731 refresh_zone_stat_thresholds();
656 break; 732 break;
657 default: 733 default:
@@ -665,8 +741,13 @@ static struct notifier_block __cpuinitdata vmstat_notifier =
665 741
666int __init setup_vmstat(void) 742int __init setup_vmstat(void)
667{ 743{
744 int cpu;
745
668 refresh_zone_stat_thresholds(); 746 refresh_zone_stat_thresholds();
669 register_cpu_notifier(&vmstat_notifier); 747 register_cpu_notifier(&vmstat_notifier);
748
749 for_each_online_cpu(cpu)
750 start_cpu_timer(cpu);
670 return 0; 751 return 0;
671} 752}
672module_init(setup_vmstat) 753module_init(setup_vmstat)