author | Ravikiran G Thirumalai <kiran@scalex86.org> | 2006-02-05 02:27:59 -0500
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-02-05 14:06:53 -0500
commit | 4484ebf12bdb0ebcdc6e8951243cbab3d7f6f4c1 (patch)
tree | 9feabea0bac1e6401742bc95bf381e36d2651fbc
parent | ca3b9b91735316f0ec7f01976f85842e0bfe5c6e (diff)
[PATCH] NUMA slab locking fixes: fix cpu down and up locking
This fixes locking and bugs in the cpu_down and cpu_up paths of the NUMA slab
allocator. Sonny Rao <sonny@burdell.org> reported problems some time back on
POWER5 boxes, when the last cpu on a node was being offlined. We could not
reproduce the same on x86_64 because the cpumask (node_to_cpumask) was not
being updated on cpu down. Since that issue is now fixed, we can reproduce
Sonny's problems on x86_64 NUMA, and here is the fix.
The problem earlier was that on CPU_DOWN, if it was the last cpu on the node to
go down, the array_caches (shared, alien) and the kmem_list3 of the node were
being freed (kfree) with the kmem_list3 lock held. If the l3 or the
array_caches came from the same cache being cleared, we hit badness.
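
To make the ordering problem concrete, here is a minimal userspace sketch (an assumption-laden illustration, not kernel code: the struct and function names are invented, and a pthread mutex stands in for l3->list_lock). It contrasts freeing the per-node caches while the list lock is held with the ordering the patch switches to: detach the pointers under the lock, drop the lock, then free.

    /* Illustrative userspace model of the CPU_DEAD teardown ordering. */
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node_list {                      /* stand-in for struct kmem_list3 */
            pthread_mutex_t list_lock;      /* models l3->list_lock */
            void *shared;                   /* models the shared array_cache */
            void *alien;                    /* models the alien array_caches */
    };

    /* Old ordering: frees happen with list_lock held.  In the kernel, if the
     * freed object came from the very cache being torn down, the free path
     * could need the lock/cache state we are holding -- badness. */
    static void teardown_old(struct node_list *l3)
    {
            pthread_mutex_lock(&l3->list_lock);
            free(l3->shared);               /* freed under the lock */
            l3->shared = NULL;
            free(l3->alien);
            l3->alien = NULL;
            pthread_mutex_unlock(&l3->list_lock);
    }

    /* Patched ordering: detach under the lock, free after dropping it. */
    static void teardown_new(struct node_list *l3)
    {
            void *shared, *alien;

            pthread_mutex_lock(&l3->list_lock);
            shared = l3->shared;
            l3->shared = NULL;
            alien = l3->alien;
            l3->alien = NULL;
            pthread_mutex_unlock(&l3->list_lock);

            free(shared);                   /* no list_lock held here */
            free(alien);
    }

    int main(void)
    {
            struct node_list a = { PTHREAD_MUTEX_INITIALIZER, malloc(32), malloc(32) };
            struct node_list b = { PTHREAD_MUTEX_INITIALIZER, malloc(32), malloc(32) };

            teardown_old(&a);       /* only "works" because free() here is independent */
            teardown_new(&b);       /* the ordering the patch uses */
            printf("teardowns done\n");
            return 0;
    }

In userspace both variants happen to run, because free() never takes list_lock; the point is only the ordering, so that nothing which can recurse into the allocator is called while the per-node lock is held.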
This patch cleans up the locking in the cpu_up and cpu_down paths. We cannot
really free l3 on cpu down because there is no node offlining yet, and even
though a cpu is not yet up, node-local memory can be allocated for it. So l3s
are usually allocated at kmem_cache_create and destroyed at
kmem_cache_destroy. Hence, we don't need cachep->spinlock protection to get
to cachep->nodelists[nodeid] either.
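
As a rough sketch of the lifetime rule described above (again an illustrative userspace model with invented toy_* names, not the real slab.c structures): the per-node list pointer is populated at cache creation and freed only at cache destruction, while cpu up/down only attach and detach the array caches hanging off it.

    /* Illustrative model: l3 pointers outlive cpu up/down. */
    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define TOY_MAX_NODES 2

    struct toy_l3 {
            void *shared;                   /* per-node shared array cache */
    };

    struct toy_cache {
            /* stable from create to destroy, like cachep->nodelists[] */
            struct toy_l3 *nodelists[TOY_MAX_NODES];
    };

    static void toy_cache_create(struct toy_cache *c)
    {
            for (int n = 0; n < TOY_MAX_NODES; n++)
                    c->nodelists[n] = calloc(1, sizeof(struct toy_l3));
    }

    static void toy_cpu_up(struct toy_cache *c, int node)
    {
            c->nodelists[node]->shared = malloc(64);    /* attach only */
    }

    static void toy_cpu_down(struct toy_cache *c, int node)
    {
            struct toy_l3 *l3 = c->nodelists[node];

            free(l3->shared);               /* detach and free the caches... */
            l3->shared = NULL;
            /* ...but l3 itself is deliberately NOT freed here */
    }

    static void toy_cache_destroy(struct toy_cache *c)
    {
            for (int n = 0; n < TOY_MAX_NODES; n++)
                    free(c->nodelists[n]);
    }

    int main(void)
    {
            struct toy_cache c = { { NULL } };

            toy_cache_create(&c);
            toy_cpu_up(&c, 1);
            toy_cpu_down(&c, 1);
            assert(c.nodelists[1] != NULL); /* still valid after cpu down */
            toy_cache_destroy(&c);
            printf("nodelists freed only at destroy\n");
            return 0;
    }

Because the nodelists[] slots never change between create and destroy, a reader of cachep->nodelists[node] only needs the cache to exist; the per-node contents are then protected by l3->list_lock rather than cachep->spinlock.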
The patch survived onlining and offlining on a 4-core, 2-node Tyan box with 4
dbench processes running all the time.
Signed-off-by: Alok N Kataria <alokk@calsoftinc.com>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | mm/slab.c | 123
1 file changed, 85 insertions(+), 38 deletions(-)
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -884,14 +884,14 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
         }
 }
 
-static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
+static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 {
         int i = 0;
         struct array_cache *ac;
         unsigned long flags;
 
         for_each_online_node(i) {
-                ac = l3->alien[i];
+                ac = alien[i];
                 if (ac) {
                         spin_lock_irqsave(&ac->lock, flags);
                         __drain_alien_cache(cachep, ac, i);
@@ -901,8 +901,11 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
 }
 #else
 #define alloc_alien_cache(node, limit) do { } while (0)
-#define free_alien_cache(ac_ptr) do { } while (0)
-#define drain_alien_cache(cachep, l3) do { } while (0)
+#define drain_alien_cache(cachep, alien) do { } while (0)
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+}
 #endif
 
 static int __devinit cpuup_callback(struct notifier_block *nfb,
@@ -936,6 +939,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
                                 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
                                     ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 
+                                /*
+                                 * The l3s don't come and go as CPUs come and
+                                 * go. cache_chain_mutex is sufficient
+                                 * protection here.
+                                 */
                                 cachep->nodelists[node] = l3;
                         }
 
@@ -950,26 +958,47 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
                    & array cache's */
                 list_for_each_entry(cachep, &cache_chain, next) {
                         struct array_cache *nc;
+                        struct array_cache *shared;
+                        struct array_cache **alien;
 
                         nc = alloc_arraycache(node, cachep->limit,
                                                 cachep->batchcount);
                         if (!nc)
                                 goto bad;
+                        shared = alloc_arraycache(node,
+                                        cachep->shared * cachep->batchcount,
+                                        0xbaadf00d);
+                        if (!shared)
+                                goto bad;
+#ifdef CONFIG_NUMA
+                        alien = alloc_alien_cache(node, cachep->limit);
+                        if (!alien)
+                                goto bad;
+#endif
                         cachep->array[cpu] = nc;
 
                         l3 = cachep->nodelists[node];
                         BUG_ON(!l3);
-                        if (!l3->shared) {
-                                if (!(nc = alloc_arraycache(node,
-                                        cachep->shared *
-                                        cachep->batchcount,
-                                        0xbaadf00d)))
-                                        goto bad;
 
-                                /* we are serialised from CPU_DEAD or
-                                   CPU_UP_CANCELLED by the cpucontrol lock */
-                                l3->shared = nc;
+                        spin_lock_irq(&l3->list_lock);
+                        if (!l3->shared) {
+                                /*
+                                 * We are serialised from CPU_DEAD or
+                                 * CPU_UP_CANCELLED by the cpucontrol lock
+                                 */
+                                l3->shared = shared;
+                                shared = NULL;
                         }
+#ifdef CONFIG_NUMA
+                        if (!l3->alien) {
+                                l3->alien = alien;
+                                alien = NULL;
+                        }
+#endif
+                        spin_unlock_irq(&l3->list_lock);
+
+                        kfree(shared);
+                        free_alien_cache(alien);
                 }
                 mutex_unlock(&cache_chain_mutex);
                 break;
@@ -978,23 +1007,32 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
                 break;
 #ifdef CONFIG_HOTPLUG_CPU
         case CPU_DEAD:
+                /*
+                 * Even if all the cpus of a node are down, we don't free the
+                 * kmem_list3 of any cache. This to avoid a race between
+                 * cpu_down, and a kmalloc allocation from another cpu for
+                 * memory from the node of the cpu going down. The list3
+                 * structure is usually allocated from kmem_cache_create() and
+                 * gets destroyed at kmem_cache_destroy().
+                 */
                 /* fall thru */
         case CPU_UP_CANCELED:
                 mutex_lock(&cache_chain_mutex);
 
                 list_for_each_entry(cachep, &cache_chain, next) {
                         struct array_cache *nc;
+                        struct array_cache *shared;
+                        struct array_cache **alien;
                         cpumask_t mask;
 
                         mask = node_to_cpumask(node);
-                        spin_lock(&cachep->spinlock);
                         /* cpu is dead; no one can alloc from it. */
                         nc = cachep->array[cpu];
                         cachep->array[cpu] = NULL;
                         l3 = cachep->nodelists[node];
 
                         if (!l3)
-                                goto unlock_cache;
+                                goto free_array_cache;
 
                         spin_lock_irq(&l3->list_lock);
 
@@ -1005,33 +1043,43 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 
                         if (!cpus_empty(mask)) {
                                 spin_unlock_irq(&l3->list_lock);
-                                goto unlock_cache;
+                                goto free_array_cache;
                         }
 
-                        if (l3->shared) {
+                        shared = l3->shared;
+                        if (shared) {
                                 free_block(cachep, l3->shared->entry,
                                            l3->shared->avail, node);
-                                kfree(l3->shared);
                                 l3->shared = NULL;
                         }
-                        if (l3->alien) {
-                                drain_alien_cache(cachep, l3);
-                                free_alien_cache(l3->alien);
-                                l3->alien = NULL;
-                        }
 
-                        /* free slabs belonging to this node */
-                        if (__node_shrink(cachep, node)) {
-                                cachep->nodelists[node] = NULL;
-                                spin_unlock_irq(&l3->list_lock);
-                                kfree(l3);
-                        } else {
-                                spin_unlock_irq(&l3->list_lock);
+                        alien = l3->alien;
+                        l3->alien = NULL;
+
+                        spin_unlock_irq(&l3->list_lock);
+
+                        kfree(shared);
+                        if (alien) {
+                                drain_alien_cache(cachep, alien);
+                                free_alien_cache(alien);
                         }
-unlock_cache:
-                        spin_unlock(&cachep->spinlock);
+free_array_cache:
                         kfree(nc);
                 }
+                /*
+                 * In the previous loop, all the objects were freed to
+                 * the respective cache's slabs, now we can go ahead and
+                 * shrink each nodelist to its limit.
+                 */
+                list_for_each_entry(cachep, &cache_chain, next) {
+                        l3 = cachep->nodelists[node];
+                        if (!l3)
+                                continue;
+                        spin_lock_irq(&l3->list_lock);
+                        /* free slabs belonging to this node */
+                        __node_shrink(cachep, node);
+                        spin_unlock_irq(&l3->list_lock);
+                }
                 mutex_unlock(&cache_chain_mutex);
                 break;
 #endif
@@ -2011,7 +2059,6 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
 
         smp_call_function_all_cpus(do_drain, cachep);
         check_irq_on();
-        spin_lock(&cachep->spinlock);
         for_each_online_node(node) {
                 l3 = cachep->nodelists[node];
                 if (l3) {
@@ -2019,10 +2066,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
                         drain_array_locked(cachep, l3->shared, 1, node);
                         spin_unlock_irq(&l3->list_lock);
                         if (l3->alien)
-                                drain_alien_cache(cachep, l3);
+                                drain_alien_cache(cachep, l3->alien);
                 }
         }
-        spin_unlock(&cachep->spinlock);
 }
 
 static int __node_shrink(struct kmem_cache *cachep, int node)
@@ -3440,7 +3486,7 @@ static void cache_reap(void *unused)
 
                 l3 = searchp->nodelists[numa_node_id()];
                 if (l3->alien)
-                        drain_alien_cache(searchp, l3);
+                        drain_alien_cache(searchp, l3->alien);
                 spin_lock_irq(&l3->list_lock);
 
                 drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3598,7 +3644,8 @@ static int s_show(struct seq_file *m, void *p)
                         num_slabs++;
                 }
                 free_objects += l3->free_objects;
-                shared_avail += l3->shared->avail;
+                if (l3->shared)
+                        shared_avail += l3->shared->avail;
 
                 spin_unlock_irq(&l3->list_lock);
         }