aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRavikiran G Thirumalai <kiran@scalex86.org>2006-02-05 02:27:59 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-02-05 14:06:53 -0500
commit4484ebf12bdb0ebcdc6e8951243cbab3d7f6f4c1 (patch)
tree9feabea0bac1e6401742bc95bf381e36d2651fbc
parentca3b9b91735316f0ec7f01976f85842e0bfe5c6e (diff)
[PATCH] NUMA slab locking fixes: fix cpu down and up locking
This fixes locking and bugs in cpu_down and cpu_up paths of the NUMA slab allocator. Sonny Rao <sonny@burdell.org> reported problems sometime back on POWER5 boxes, when the last cpu on the nodes were being offlined. We could not reproduce the same on x86_64 because the cpumask (node_to_cpumask) was not being updated on cpu down. Since that issue is now fixed, we can reproduce Sonny's problems on x86_64 NUMA, and here is the fix. The problem earlier was on CPU_DOWN, if it was the last cpu on the node to go down, the array_caches (shared, alien) and the kmem_list3 of the node were being freed (kfree) with the kmem_list3 lock held. If the l3 or the array_caches were to come from the same cache being cleared, we hit on badness. This patch cleans up the locking in cpu_up and cpu_down path. We cannot really free l3 on cpu down because, there is no node offlining yet and even though a cpu is not yet up, node local memory can be allocated for it. So l3s are usually allocated at kmem_cache_create and destroyed at kmem_cache_destroy. Hence, we don't need cachep->spinlock protection to get to the cachep->nodelist[nodeid] either. Patch survived onlining and offlining on a 4 core 2 node Tyan box with 4 dbench processes running all the time. Signed-off-by: Alok N Kataria <alokk@calsoftinc.com> Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org> Cc: Christoph Lameter <christoph@lameter.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--mm/slab.c123
1 files changed, 85 insertions, 38 deletions
diff --git a/mm/slab.c b/mm/slab.c
index d3f68543f9f4..9cc049a942c6 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -884,14 +884,14 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
884 } 884 }
885} 885}
886 886
887static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3) 887static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
888{ 888{
889 int i = 0; 889 int i = 0;
890 struct array_cache *ac; 890 struct array_cache *ac;
891 unsigned long flags; 891 unsigned long flags;
892 892
893 for_each_online_node(i) { 893 for_each_online_node(i) {
894 ac = l3->alien[i]; 894 ac = alien[i];
895 if (ac) { 895 if (ac) {
896 spin_lock_irqsave(&ac->lock, flags); 896 spin_lock_irqsave(&ac->lock, flags);
897 __drain_alien_cache(cachep, ac, i); 897 __drain_alien_cache(cachep, ac, i);
@@ -901,8 +901,11 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
901} 901}
902#else 902#else
903#define alloc_alien_cache(node, limit) do { } while (0) 903#define alloc_alien_cache(node, limit) do { } while (0)
904#define free_alien_cache(ac_ptr) do { } while (0) 904#define drain_alien_cache(cachep, alien) do { } while (0)
905#define drain_alien_cache(cachep, l3) do { } while (0) 905
906static inline void free_alien_cache(struct array_cache **ac_ptr)
907{
908}
906#endif 909#endif
907 910
908static int __devinit cpuup_callback(struct notifier_block *nfb, 911static int __devinit cpuup_callback(struct notifier_block *nfb,
@@ -936,6 +939,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
936 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 939 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
937 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 940 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
938 941
942 /*
943 * The l3s don't come and go as CPUs come and
944 * go. cache_chain_mutex is sufficient
945 * protection here.
946 */
939 cachep->nodelists[node] = l3; 947 cachep->nodelists[node] = l3;
940 } 948 }
941 949
@@ -950,26 +958,47 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
950 & array cache's */ 958 & array cache's */
951 list_for_each_entry(cachep, &cache_chain, next) { 959 list_for_each_entry(cachep, &cache_chain, next) {
952 struct array_cache *nc; 960 struct array_cache *nc;
961 struct array_cache *shared;
962 struct array_cache **alien;
953 963
954 nc = alloc_arraycache(node, cachep->limit, 964 nc = alloc_arraycache(node, cachep->limit,
955 cachep->batchcount); 965 cachep->batchcount);
956 if (!nc) 966 if (!nc)
957 goto bad; 967 goto bad;
968 shared = alloc_arraycache(node,
969 cachep->shared * cachep->batchcount,
970 0xbaadf00d);
971 if (!shared)
972 goto bad;
973#ifdef CONFIG_NUMA
974 alien = alloc_alien_cache(node, cachep->limit);
975 if (!alien)
976 goto bad;
977#endif
958 cachep->array[cpu] = nc; 978 cachep->array[cpu] = nc;
959 979
960 l3 = cachep->nodelists[node]; 980 l3 = cachep->nodelists[node];
961 BUG_ON(!l3); 981 BUG_ON(!l3);
962 if (!l3->shared) {
963 if (!(nc = alloc_arraycache(node,
964 cachep->shared *
965 cachep->batchcount,
966 0xbaadf00d)))
967 goto bad;
968 982
969 /* we are serialised from CPU_DEAD or 983 spin_lock_irq(&l3->list_lock);
970 CPU_UP_CANCELLED by the cpucontrol lock */ 984 if (!l3->shared) {
971 l3->shared = nc; 985 /*
986 * We are serialised from CPU_DEAD or
987 * CPU_UP_CANCELLED by the cpucontrol lock
988 */
989 l3->shared = shared;
990 shared = NULL;
972 } 991 }
992#ifdef CONFIG_NUMA
993 if (!l3->alien) {
994 l3->alien = alien;
995 alien = NULL;
996 }
997#endif
998 spin_unlock_irq(&l3->list_lock);
999
1000 kfree(shared);
1001 free_alien_cache(alien);
973 } 1002 }
974 mutex_unlock(&cache_chain_mutex); 1003 mutex_unlock(&cache_chain_mutex);
975 break; 1004 break;
@@ -978,23 +1007,32 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
978 break; 1007 break;
979#ifdef CONFIG_HOTPLUG_CPU 1008#ifdef CONFIG_HOTPLUG_CPU
980 case CPU_DEAD: 1009 case CPU_DEAD:
1010 /*
1011 * Even if all the cpus of a node are down, we don't free the
1012 * kmem_list3 of any cache. This to avoid a race between
1013 * cpu_down, and a kmalloc allocation from another cpu for
1014 * memory from the node of the cpu going down. The list3
1015 * structure is usually allocated from kmem_cache_create() and
1016 * gets destroyed at kmem_cache_destroy().
1017 */
981 /* fall thru */ 1018 /* fall thru */
982 case CPU_UP_CANCELED: 1019 case CPU_UP_CANCELED:
983 mutex_lock(&cache_chain_mutex); 1020 mutex_lock(&cache_chain_mutex);
984 1021
985 list_for_each_entry(cachep, &cache_chain, next) { 1022 list_for_each_entry(cachep, &cache_chain, next) {
986 struct array_cache *nc; 1023 struct array_cache *nc;
1024 struct array_cache *shared;
1025 struct array_cache **alien;
987 cpumask_t mask; 1026 cpumask_t mask;
988 1027
989 mask = node_to_cpumask(node); 1028 mask = node_to_cpumask(node);
990 spin_lock(&cachep->spinlock);
991 /* cpu is dead; no one can alloc from it. */ 1029 /* cpu is dead; no one can alloc from it. */
992 nc = cachep->array[cpu]; 1030 nc = cachep->array[cpu];
993 cachep->array[cpu] = NULL; 1031 cachep->array[cpu] = NULL;
994 l3 = cachep->nodelists[node]; 1032 l3 = cachep->nodelists[node];
995 1033
996 if (!l3) 1034 if (!l3)
997 goto unlock_cache; 1035 goto free_array_cache;
998 1036
999 spin_lock_irq(&l3->list_lock); 1037 spin_lock_irq(&l3->list_lock);
1000 1038
@@ -1005,33 +1043,43 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1005 1043
1006 if (!cpus_empty(mask)) { 1044 if (!cpus_empty(mask)) {
1007 spin_unlock_irq(&l3->list_lock); 1045 spin_unlock_irq(&l3->list_lock);
1008 goto unlock_cache; 1046 goto free_array_cache;
1009 } 1047 }
1010 1048
1011 if (l3->shared) { 1049 shared = l3->shared;
1050 if (shared) {
1012 free_block(cachep, l3->shared->entry, 1051 free_block(cachep, l3->shared->entry,
1013 l3->shared->avail, node); 1052 l3->shared->avail, node);
1014 kfree(l3->shared);
1015 l3->shared = NULL; 1053 l3->shared = NULL;
1016 } 1054 }
1017 if (l3->alien) {
1018 drain_alien_cache(cachep, l3);
1019 free_alien_cache(l3->alien);
1020 l3->alien = NULL;
1021 }
1022 1055
1023 /* free slabs belonging to this node */ 1056 alien = l3->alien;
1024 if (__node_shrink(cachep, node)) { 1057 l3->alien = NULL;
1025 cachep->nodelists[node] = NULL; 1058
1026 spin_unlock_irq(&l3->list_lock); 1059 spin_unlock_irq(&l3->list_lock);
1027 kfree(l3); 1060
1028 } else { 1061 kfree(shared);
1029 spin_unlock_irq(&l3->list_lock); 1062 if (alien) {
1063 drain_alien_cache(cachep, alien);
1064 free_alien_cache(alien);
1030 } 1065 }
1031 unlock_cache: 1066free_array_cache:
1032 spin_unlock(&cachep->spinlock);
1033 kfree(nc); 1067 kfree(nc);
1034 } 1068 }
1069 /*
1070 * In the previous loop, all the objects were freed to
1071 * the respective cache's slabs, now we can go ahead and
1072 * shrink each nodelist to its limit.
1073 */
1074 list_for_each_entry(cachep, &cache_chain, next) {
1075 l3 = cachep->nodelists[node];
1076 if (!l3)
1077 continue;
1078 spin_lock_irq(&l3->list_lock);
1079 /* free slabs belonging to this node */
1080 __node_shrink(cachep, node);
1081 spin_unlock_irq(&l3->list_lock);
1082 }
1035 mutex_unlock(&cache_chain_mutex); 1083 mutex_unlock(&cache_chain_mutex);
1036 break; 1084 break;
1037#endif 1085#endif
@@ -2011,7 +2059,6 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
2011 2059
2012 smp_call_function_all_cpus(do_drain, cachep); 2060 smp_call_function_all_cpus(do_drain, cachep);
2013 check_irq_on(); 2061 check_irq_on();
2014 spin_lock(&cachep->spinlock);
2015 for_each_online_node(node) { 2062 for_each_online_node(node) {
2016 l3 = cachep->nodelists[node]; 2063 l3 = cachep->nodelists[node];
2017 if (l3) { 2064 if (l3) {
@@ -2019,10 +2066,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
2019 drain_array_locked(cachep, l3->shared, 1, node); 2066 drain_array_locked(cachep, l3->shared, 1, node);
2020 spin_unlock_irq(&l3->list_lock); 2067 spin_unlock_irq(&l3->list_lock);
2021 if (l3->alien) 2068 if (l3->alien)
2022 drain_alien_cache(cachep, l3); 2069 drain_alien_cache(cachep, l3->alien);
2023 } 2070 }
2024 } 2071 }
2025 spin_unlock(&cachep->spinlock);
2026} 2072}
2027 2073
2028static int __node_shrink(struct kmem_cache *cachep, int node) 2074static int __node_shrink(struct kmem_cache *cachep, int node)
@@ -3440,7 +3486,7 @@ static void cache_reap(void *unused)
3440 3486
3441 l3 = searchp->nodelists[numa_node_id()]; 3487 l3 = searchp->nodelists[numa_node_id()];
3442 if (l3->alien) 3488 if (l3->alien)
3443 drain_alien_cache(searchp, l3); 3489 drain_alien_cache(searchp, l3->alien);
3444 spin_lock_irq(&l3->list_lock); 3490 spin_lock_irq(&l3->list_lock);
3445 3491
3446 drain_array_locked(searchp, cpu_cache_get(searchp), 0, 3492 drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3598,7 +3644,8 @@ static int s_show(struct seq_file *m, void *p)
3598 num_slabs++; 3644 num_slabs++;
3599 } 3645 }
3600 free_objects += l3->free_objects; 3646 free_objects += l3->free_objects;
3601 shared_avail += l3->shared->avail; 3647 if (l3->shared)
3648 shared_avail += l3->shared->avail;
3602 3649
3603 spin_unlock_irq(&l3->list_lock); 3650 spin_unlock_irq(&l3->list_lock);
3604 } 3651 }