author		Kumar Gala <galak@kernel.crashing.org>	2006-03-20 12:58:02 -0500
committer	Kumar Gala <galak@kernel.crashing.org>	2006-03-20 12:58:02 -0500
commit		1a02e59a2970f9ed28ab51d3b08624b79e54d848 (patch)
tree		470cce472be3b08c160e0c569648e7228651b12a /mm/slab.c
parent		ebcff3c773b42bce6182ec16485abca4e53fba97 (diff)
parent		2c276603c3e5ebf38155a9d1fbbda656d52d138e (diff)
Merge branch 'master'
Diffstat (limited to 'mm/slab.c')
-rw-r--r--	mm/slab.c	132
1 file changed, 99 insertions(+), 33 deletions(-)
diff --git a/mm/slab.c b/mm/slab.c
index d66c2b0d9715..d0bd7f07ab04 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
 	dump_stack();
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, reap_node);
+
+static void init_reap_node(int cpu)
+{
+	int node;
+
+	node = next_node(cpu_to_node(cpu), node_online_map);
+	if (node == MAX_NUMNODES)
+		node = 0;
+
+	__get_cpu_var(reap_node) = node;
+}
+
+static void next_reap_node(void)
+{
+	int node = __get_cpu_var(reap_node);
+
+	/*
+	 * Also drain per cpu pages on remote zones
+	 */
+	if (node != numa_node_id())
+		drain_node_pages(node);
+
+	node = next_node(node, node_online_map);
+	if (unlikely(node >= MAX_NUMNODES))
+		node = first_node(node_online_map);
+	__get_cpu_var(reap_node) = node;
+}
+
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
+
 /*
  * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
  * via the workqueue/eventd.
@@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
 	 * at that time.
 	 */
 	if (keventd_up() && reap_work->func == NULL) {
+		init_reap_node(cpu);
 		INIT_WORK(reap_work, cache_reap, NULL);
 		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
 	}
@@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
 	}
 }
 
+/*
+ * Called from cache_reap() to regularly drain alien caches round robin.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+{
+	int node = __get_cpu_var(reap_node);
+
+	if (l3->alien) {
+		struct array_cache *ac = l3->alien[node];
+		if (ac && ac->avail) {
+			spin_lock_irq(&ac->lock);
+			__drain_alien_cache(cachep, ac, node);
+			spin_unlock_irq(&ac->lock);
+		}
+	}
+}
+
 static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 {
 	int i = 0;
@@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al
 #else
 
 #define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
 
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -1124,6 +1184,7 @@ void __init kmem_cache_init(void)
 	struct cache_sizes *sizes;
 	struct cache_names *names;
 	int i;
+	int order;
 
 	for (i = 0; i < NUM_INIT_LISTS; i++) {
 		kmem_list3_init(&initkmem_list3[i]);
@@ -1167,11 +1228,15 @@ void __init kmem_cache_init(void)
 
 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
 
-	cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
-		       &left_over, &cache_cache.num);
+	for (order = 0; order < MAX_ORDER; order++) {
+		cache_estimate(order, cache_cache.buffer_size,
+			cache_line_size(), 0, &left_over, &cache_cache.num);
+		if (cache_cache.num)
+			break;
+	}
 	if (!cache_cache.num)
 		BUG();
-
+	cache_cache.gfporder = order;
 	cache_cache.colour = left_over / cache_cache.colour_off;
 	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
 				      sizeof(struct slab), cache_line_size());
@@ -1628,36 +1693,44 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 			size_t size, size_t align, unsigned long flags)
 {
 	size_t left_over = 0;
+	int gfporder;
 
-	for (;; cachep->gfporder++) {
+	for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) {
 		unsigned int num;
 		size_t remainder;
 
-		if (cachep->gfporder > MAX_GFP_ORDER) {
-			cachep->num = 0;
-			break;
-		}
-
-		cache_estimate(cachep->gfporder, size, align, flags,
-			       &remainder, &num);
+		cache_estimate(gfporder, size, align, flags, &remainder, &num);
 		if (!num)
 			continue;
+
 		/* More than offslab_limit objects will cause problems */
-		if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
+		if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit)
 			break;
 
+		/* Found something acceptable - save it away */
 		cachep->num = num;
+		cachep->gfporder = gfporder;
 		left_over = remainder;
 
 		/*
+		 * A VFS-reclaimable slab tends to have most allocations
+		 * as GFP_NOFS and we really don't want to have to be allocating
+		 * higher-order pages when we are unable to shrink dcache.
+		 */
+		if (flags & SLAB_RECLAIM_ACCOUNT)
+			break;
+
+		/*
 		 * Large number of objects is good, but very large slabs are
 		 * currently bad for the gfp()s.
 		 */
-		if (cachep->gfporder >= slab_break_gfp_order)
+		if (gfporder >= slab_break_gfp_order)
 			break;
 
-		if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
-			/* Acceptable internal fragmentation */
+		/*
+		 * Acceptable internal fragmentation?
+		 */
+		if ((left_over * 8) <= (PAGE_SIZE << gfporder))
 			break;
 	}
 	return left_over;
@@ -1717,6 +1790,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		BUG();
 	}
 
+	/*
+	 * Prevent CPUs from coming and going.
+	 * lock_cpu_hotplug() nests outside cache_chain_mutex
+	 */
+	lock_cpu_hotplug();
+
 	mutex_lock(&cache_chain_mutex);
 
 	list_for_each(p, &cache_chain) {
@@ -1863,17 +1942,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 	size = ALIGN(size, align);
 
-	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
-		/*
-		 * A VFS-reclaimable slab tends to have most allocations
-		 * as GFP_NOFS and we really don't want to have to be allocating
-		 * higher-order pages when we are unable to shrink dcache.
-		 */
-		cachep->gfporder = 0;
-		cache_estimate(cachep->gfporder, size, align, flags,
-			       &left_over, &cachep->num);
-	} else
-		left_over = calculate_slab_order(cachep, size, align, flags);
+	left_over = calculate_slab_order(cachep, size, align, flags);
 
 	if (!cachep->num) {
 		printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -1918,8 +1987,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->dtor = dtor;
 	cachep->name = name;
 
-	/* Don't let CPUs to come and go */
-	lock_cpu_hotplug();
 
 	if (g_cpucache_up == FULL) {
 		enable_cpucache(cachep);
@@ -1978,12 +2045,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 	/* cache setup completed, link it into the list */
 	list_add(&cachep->next, &cache_chain);
-	unlock_cpu_hotplug();
 oops:
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
 	mutex_unlock(&cache_chain_mutex);
+	unlock_cpu_hotplug();
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -2550,7 +2617,7 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
 		       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
 		       cachep->name, cachep->num, slabp, slabp->inuse);
 	for (i = 0;
-	     i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
+	     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
 	     i++) {
 		if ((i % 16) == 0)
 			printk("\n%03x:", i);
@@ -3490,8 +3557,7 @@ static void cache_reap(void *unused)
 		check_irq_on();
 
 		l3 = searchp->nodelists[numa_node_id()];
-		if (l3->alien)
-			drain_alien_cache(searchp, l3->alien);
+		reap_alien(searchp, l3);
 		spin_lock_irq(&l3->list_lock);
 
 		drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3541,7 +3607,7 @@ static void cache_reap(void *unused)
 	}
 	check_irq_on();
 	mutex_unlock(&cache_chain_mutex);
-	drain_remote_pages();
+	next_reap_node();
 	/* Setup the next iteration */
 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }
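
For readers skimming the merge, the per-CPU round-robin reaping added above amounts to: remember one node per CPU, drain its alien cache on each cache_reap() tick, then advance to the next online node and wrap around. The following is a minimal standalone C sketch of just that node-selection logic; the 64-bit bitmask and the next_node()/first_node() helpers here are simplified stand-ins for the kernel's nodemask API (illustrative assumptions, not the real implementation).

/* Standalone sketch of round-robin reap-node selection (illustration only). */
#include <stdio.h>

#define MAX_NUMNODES 64

/* Return the first node greater than 'node' that is set in 'mask',
 * or MAX_NUMNODES if there is none. Simplified stand-in for next_node(). */
static int next_node(int node, unsigned long long mask)
{
	for (int n = node + 1; n < MAX_NUMNODES; n++)
		if (mask & (1ULL << n))
			return n;
	return MAX_NUMNODES;
}

/* Return the lowest node set in 'mask'. Stand-in for first_node(). */
static int first_node(unsigned long long mask)
{
	return next_node(-1, mask);
}

int main(void)
{
	/* Pretend nodes 0, 1 and 3 are online. */
	unsigned long long online = (1ULL << 0) | (1ULL << 1) | (1ULL << 3);
	int reap_node = first_node(online);

	/* Each cache_reap() tick would drain the alien cache of 'reap_node'
	 * and then advance, wrapping back to the first online node. */
	for (int tick = 0; tick < 8; tick++) {
		printf("tick %d: reap node %d\n", tick, reap_node);
		reap_node = next_node(reap_node, online);
		if (reap_node >= MAX_NUMNODES)
			reap_node = first_node(online);
	}
	return 0;
}

Run as-is it prints the node sequence 0, 1, 3, 0, 1, 3, ..., which is the same wrap-around behaviour next_reap_node() implements per CPU.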
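Similarly, the reworked calculate_slab_order() above walks page orders from zero upward and stops at the first order whose leftover space is small relative to the slab (left_over * 8 <= PAGE_SIZE << order). The sketch below reproduces only that fragmentation heuristic, assuming a fixed 4 KiB page and ignoring the per-object management overhead that the real cache_estimate() accounts for, so its numbers are indicative rather than what the slab allocator would actually pick.

/* Standalone sketch of the slab order-selection heuristic (illustration only). */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL
#define MAX_ORDER 11

/* Find the smallest page order whose internal fragmentation is acceptable. */
static int pick_order(size_t obj_size, unsigned int *num_out, size_t *left_out)
{
	for (int order = 0; order <= MAX_ORDER; order++) {
		size_t slab_size = PAGE_SIZE << order;
		unsigned int num = slab_size / obj_size;	/* objects per slab */
		size_t left_over = slab_size - (size_t)num * obj_size;

		if (!num)
			continue;	/* object does not fit at this order yet */

		*num_out = num;
		*left_out = left_over;

		/* Acceptable internal fragmentation? */
		if (left_over * 8 <= slab_size)
			return order;
	}
	return MAX_ORDER;
}

int main(void)
{
	size_t sizes[] = { 96, 700, 5000 };

	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		unsigned int num = 0;
		size_t left = 0;
		int order = pick_order(sizes[i], &num, &left);

		printf("obj %5zu: order %d, %u objs/slab, %zu bytes left over\n",
		       sizes[i], order, num, left);
	}
	return 0;
}

The real function additionally stops early for SLAB_RECLAIM_ACCOUNT caches and once gfporder reaches slab_break_gfp_order, as the diff shows; this sketch deliberately leaves those checks out to keep the core loop visible.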