aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRavikiran G Thirumalai <kiran@scalex86.org>2006-02-05 02:27:59 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-02-05 14:06:53 -0500
commit4484ebf12bdb0ebcdc6e8951243cbab3d7f6f4c1 (patch)
tree9feabea0bac1e6401742bc95bf381e36d2651fbc
parentca3b9b91735316f0ec7f01976f85842e0bfe5c6e (diff)
[PATCH] NUMA slab locking fixes: fix cpu down and up locking
This fixes locking and bugs in cpu_down and cpu_up paths of the NUMA slab allocator. Sonny Rao <sonny@burdell.org> reported problems sometime back on POWER5 boxes, when the last cpu on the nodes were being offlined. We could not reproduce the same on x86_64 because the cpumask (node_to_cpumask) was not being updated on cpu down. Since that issue is now fixed, we can reproduce Sonny's problems on x86_64 NUMA, and here is the fix. The problem earlier was on CPU_DOWN, if it was the last cpu on the node to go down, the array_caches (shared, alien) and the kmem_list3 of the node were being freed (kfree) with the kmem_list3 lock held. If the l3 or the array_caches were to come from the same cache being cleared, we hit on badness. This patch cleans up the locking in cpu_up and cpu_down path. We cannot really free l3 on cpu down because, there is no node offlining yet and even though a cpu is not yet up, node local memory can be allocated for it. So l3s are usually allocated at kmem_cache_create and destroyed at kmem_cache_destroy. Hence, we don't need cachep->spinlock protection to get to the cachep->nodelist[nodeid] either. Patch survived onlining and offlining on a 4 core 2 node Tyan box with 4 dbench processes running all the time. Signed-off-by: Alok N Kataria <alokk@calsoftinc.com> Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org> Cc: Christoph Lameter <christoph@lameter.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--mm/slab.c123
1 files changed, 85 insertions, 38 deletions
diff --git a/mm/slab.c b/mm/slab.c
index d3f68543f9f4..9cc049a942c6 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -884,14 +884,14 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
884 } 884 }
885} 885}
886 886
887static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3) 887static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
888{ 888{
889 int i = 0; 889 int i = 0;
890 struct array_cache *ac; 890 struct array_cache *ac;
891 unsigned long flags; 891 unsigned long flags;
892 892
893 for_each_online_node(i) { 893 for_each_online_node(i) {
894 ac = l3->alien[i]; 894 ac = alien[i];
895 if (ac) { 895 if (ac) {
896 spin_lock_irqsave(&ac->lock, flags); 896 spin_lock_irqsave(&ac->lock, flags);
897 __drain_alien_cache(cachep, ac, i); 897 __drain_alien_cache(cachep, ac, i);
@@ -901,8 +901,11 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
901} 901}
902#else 902#else
903#define alloc_alien_cache(node, limit) do { } while (0) 903#define alloc_alien_cache(node, limit) do { } while (0)
904#define free_alien_cache(ac_ptr) do { } while (0) 904#define drain_alien_cache(cachep, alien) do { } while (0)
905#define drain_alien_cache(cachep, l3) do { } while (0) 905
906static inline void free_alien_cache(struct array_cache **ac_ptr)
907{
908}
906#endif 909#endif
907 910
908static int __devinit cpuup_callback(struct notifier_block *nfb, 911static int __devinit cpuup_callback(struct notifier_block *nfb,
@@ -936,6 +939,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
936 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 939 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
937 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 940 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
938 941
942 /*
943 * The l3s don't come and go as CPUs come and
944 * go. cache_chain_mutex is sufficient
945 * protection here.
946 */
939 cachep->nodelists[node] = l3; 947 cachep->nodelists[node] = l3;
940 } 948 }
941 949
@@ -950,26 +958,47 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
950 & array cache's */ 958 & array cache's */
951 list_for_each_entry(cachep, &cache_chain, next) { 959 list_for_each_entry(cachep, &cache_chain, next) {
952 struct array_cache *nc; 960 struct array_cache *nc;
961 struct array_cache *shared;
962 struct array_cache **alien;
953 963
954 nc = alloc_arraycache(node, cachep->limit, 964 nc = alloc_arraycache(node, cachep->limit,
955 cachep->batchcount); 965 cachep->batchcount);
956 if (!nc) 966 if (!nc)
957 goto bad; 967 goto bad;
968 shared = alloc_arraycache(node,
969 cachep->shared * cachep->batchcount,
970 0xbaadf00d);
971 if (!shared)
972 goto bad;
973#ifdef CONFIG_NUMA
974 alien = alloc_alien_cache(node, cachep->limit);
975 if (!alien)
976 goto bad;
977#endif
958 cachep->array[cpu] = nc; 978 cachep->array[cpu] = nc;
959 979
960 l3 = cachep->nodelists[node]; 980 l3 = cachep->nodelists[node];
961 BUG_ON(!l3); 981 BUG_ON(!l3);
962 if (!l3->shared) {
963 if (!(nc = alloc_arraycache(node,
964 cachep->shared *
965 cachep->batchcount,
966 0xbaadf00d)))
967 goto bad;
968 982
969 /* we are serialised from CPU_DEAD or 983 spin_lock_irq(&l3->list_lock);
970 CPU_UP_CANCELLED by the cpucontrol lock */ 984 if (!l3->shared) {
971 l3->shared = nc; 985 /*
986 * We are serialised from CPU_DEAD or
987 * CPU_UP_CANCELLED by the cpucontrol lock
988 */
989 l3->shared = shared;
990 shared = NULL;
972 } 991 }
992#ifdef CONFIG_NUMA
993 if (!l3->alien) {
994 l3->alien = alien;
995 alien = NULL;
996 }
997#endif
998 spin_unlock_irq(&l3->list_lock);
999
1000 kfree(shared);
1001 free_alien_cache(alien);
973 } 1002 }
974 mutex_unlock(&cache_chain_mutex); 1003 mutex_unlock(&cache_chain_mutex);
975 break; 1004 break;
@@ -978,23 +1007,32 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
978 break; 1007 break;
979#ifdef CONFIG_HOTPLUG_CPU 1008#ifdef CONFIG_HOTPLUG_CPU
980 case CPU_DEAD: 1009 case CPU_DEAD:
1010 /*
1011 * Even if all the cpus of a node are down, we don't free the
1012 * kmem_list3 of any cache. This to avoid a race between
1013 * cpu_down, and a kmalloc allocation from another cpu for
1014 * memory from the node of the cpu going down. The list3
1015 * structure is usually allocated from kmem_cache_create() and
1016 * gets destroyed at kmem_cache_destroy().
1017 */
981 /* fall thru */ 1018 /* fall thru */
982 case CPU_UP_CANCELED: 1019 case CPU_UP_CANCELED:
983 mutex_lock(&cache_chain_mutex); 1020 mutex_lock(&cache_chain_mutex);
984 1021
985 list_for_each_entry(cachep, &cache_chain, next) { 1022 list_for_each_entry(cachep, &cache_chain, next) {
986 struct array_cache *nc; 1023 struct array_cache *nc;
1024 struct array_cache *shared;
1025 struct array_cache **alien;
987 cpumask_t mask; 1026 cpumask_t mask;
988 1027
989 mask = node_to_cpumask(node); 1028 mask = node_to_cpumask(node);
990 spin_lock(&cachep->spinlock);
991 /* cpu is dead; no one can alloc from it. */ 1029 /* cpu is dead; no one can alloc from it. */
992 nc = cachep->array[cpu]; 1030 nc = cachep->array[cpu];
993 cachep->array[cpu] = NULL; 1031 cachep->array[cpu] = NULL;
994 l3 = cachep->nodelists[node]; 1032 l3 = cachep->nodelists[node];
995 1033
996 if (!l3) 1034 if (!l3)
997 goto unlock_cache; 1035 goto free_array_cache;
998 1036
999 spin_lock_irq(&l3->list_lock); 1037 spin_lock_irq(&l3->list_lock);
1000 1038
@@ -1005,33 +1043,43 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1005 1043
1006 if (!cpus_empty(mask)) { 1044 if (!cpus_empty(mask)) {
1007 spin_unlock_irq(&l3->list_lock); 1045 spin_unlock_irq(&l3->list_lock);
1008 goto unlock_cache; 1046 goto free_array_cache;
1009 } 1047 }
1010 1048
1011 if (l3->shared) { 1049 shared = l3->shared;
1050 if (shared) {
1012 free_block(cachep, l3->shared->entry, 1051 free_block(cachep, l3->shared->entry,
1013 l3->shared->avail, node); 1052 l3->shared->avail, node);
1014 kfree(l3->shared);
1015 l3->shared = NULL; 1053 l3->shared = NULL;
1016 } 1054 }
1017 if (l3->alien) {
1018 drain_alien_cache(cachep, l3);
1019 free_alien_cache(l3->alien);
1020 l3->alien = NULL;
1021 }
1022 1055
1023 /* free slabs belonging to this node */ 1056 alien = l3->alien;
1024 if (__node_shrink(cachep, node)) { 1057 l3->alien = NULL;
1025 cachep->nodelists[node] = NULL; 1058
1026 spin_unlock_irq(&l3->list_lock); 1059 spin_unlock_irq(&l3->list_lock);
1027 kfree(l3); 1060
1028 } else { 1061 kfree(shared);
1029 spin_unlock_irq(&l3->list_lock); 1062 if (alien) {
1063 drain_alien_cache(cachep, alien);
1064 free_alien_cache(alien);
1030 } 1065 }
1031 unlock_cache: 1066free_array_cache:
1032 spin_unlock(&cachep->spinlock);
1033 kfree(nc); 1067 kfree(nc);
1034 } 1068 }
1069 /*
1070 * In the previous loop, all the objects were freed to
1071 * the respective cache's slabs, now we can go ahead and
1072 * shrink each nodelist to its limit.
1073 */
1074 list_for_each_entry(cachep, &cache_chain, next) {
1075 l3 = cachep->nodelists[node];
1076 if (!l3)
1077 continue;
1078 spin_lock_irq(&l3->list_lock);
1079 /* free slabs belonging to this node */
1080 __node_shrink(cachep, node);
1081 spin_unlock_irq(&l3->list_lock);
1082 }
1035 mutex_unlock(&cache_chain_mutex); 1083 mutex_unlock(&cache_chain_mutex);
1036 break; 1084 break;
1037#endif 1085#endif
@@ -2011,7 +2059,6 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
2011 2059
2012 smp_call_function_all_cpus(do_drain, cachep); 2060 smp_call_function_all_cpus(do_drain, cachep);
2013 check_irq_on(); 2061 check_irq_on();
2014 spin_lock(&cachep->spinlock);
2015 for_each_online_node(node) { 2062 for_each_online_node(node) {
2016 l3 = cachep->nodelists[node]; 2063 l3 = cachep->nodelists[node];
2017 if (l3) { 2064 if (l3) {
@@ -2019,10 +2066,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
2019 drain_array_locked(cachep, l3->shared, 1, node); 2066 drain_array_locked(cachep, l3->shared, 1, node);
2020 spin_unlock_irq(&l3->list_lock); 2067 spin_unlock_irq(&l3->list_lock);
2021 if (l3->alien) 2068 if (l3->alien)
2022 drain_alien_cache(cachep, l3); 2069 drain_alien_cache(cachep, l3->alien);
2023 } 2070 }
2024 } 2071 }
2025 spin_unlock(&cachep->spinlock);
2026} 2072}
2027 2073
2028static int __node_shrink(struct kmem_cache *cachep, int node) 2074static int __node_shrink(struct kmem_cache *cachep, int node)
@@ -3440,7 +3486,7 @@ static void cache_reap(void *unused)
3440 3486
3441 l3 = searchp->nodelists[numa_node_id()]; 3487 l3 = searchp->nodelists[numa_node_id()];
3442 if (l3->alien) 3488 if (l3->alien)
3443 drain_alien_cache(searchp, l3); 3489 drain_alien_cache(searchp, l3->alien);
3444 spin_lock_irq(&l3->list_lock); 3490 spin_lock_irq(&l3->list_lock);
3445 3491
3446 drain_array_locked(searchp, cpu_cache_get(searchp), 0, 3492 drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3598,7 +3644,8 @@ static int s_show(struct seq_file *m, void *p)
3598 num_slabs++; 3644 num_slabs++;
3599 } 3645 }
3600 free_objects += l3->free_objects; 3646 free_objects += l3->free_objects;
3601 shared_avail += l3->shared->avail; 3647 if (l3->shared)
3648 shared_avail += l3->shared->avail;
3602 3649
3603 spin_unlock_irq(&l3->list_lock); 3650 spin_unlock_irq(&l3->list_lock);
3604 } 3651 }