about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorMikulas Patocka <mpatocka@redhat.com>2009-01-05 22:05:10 -0500
committerAlasdair G Kergon <agk@redhat.com>2009-01-05 22:05:10 -0500
commitd58168763f74d1edbc296d7038c60efe6493fdd4 (patch)
tree03866d641211fe16961a5b8aab6d9132bf07d9c8
parentab4c1424882be9cd70b89abf2b484add355712fa (diff)
dm table: rework reference counting
Rework table reference counting.

The existing code uses a reference counter. When the last reference is dropped
and the counter reaches zero, the table destructor is called. Table reference
counters are acquired/released from upcalls from other kernel code
(dm_any_congested, dm_merge_bvec, dm_unplug_all). If the reference counter
reaches zero in one of the upcalls, the table destructor is called from almost
random kernel code. This leads to various problems:

* dm_any_congested being called under a spinlock, which calls the destructor,
  which calls some sleeping function.
* the destructor attempting to take a lock that is already taken by the same
  process.
* stale reference from some other kernel code keeps the table constructed,
  which keeps some devices open, even after successful return from
  "dmsetup remove". This can confuse lvm and prevent closing of underlying
  devices or reusing device minor numbers.

The patch changes reference counting so that the table destructor can be called
only at predetermined places. The table has always exactly one reference from
either mapped_device->map or hash_cell->new_map. After this patch, this
reference is not counted in table->holders. A pair of
dm_create_table/dm_destroy_table functions is used for table
creation/destruction.

Temporary references from the other code increase table->holders. A pair of
dm_table_get/dm_table_put functions is used to manipulate it.

When the table is about to be destroyed, we wait for table->holders to reach 0.
Then, we call the table destructor. We use active waiting with msleep(1),
because the situation happens rarely (to one user in 5 years) and removing the
device isn't a performance-critical task: the user doesn't care if it takes one
tick more or not.

This way, the destructor is called only at specific points (the
dm_table_destroy function) and the above problems associated with lazy
destruction can't happen.

Finally, remove the temporary protection added to dm_any_congested().
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
-rw-r--r--drivers/md/dm-ioctl.c10
-rw-r--r--drivers/md/dm-table.c28
-rw-r--r--drivers/md/dm.c14
-rw-r--r--drivers/md/dm.h1
4 files changed, 33 insertions, 20 deletions
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 8da7a017b4ef..54d0588fc1f6 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -233,7 +233,7 @@ static void __hash_remove(struct hash_cell *hc)
233 } 233 }
234 234
235 if (hc->new_map) 235 if (hc->new_map)
236 dm_table_put(hc->new_map); 236 dm_table_destroy(hc->new_map);
237 dm_put(hc->md); 237 dm_put(hc->md);
238 free_cell(hc); 238 free_cell(hc);
239} 239}
@@ -827,8 +827,8 @@ static int do_resume(struct dm_ioctl *param)
827 827
828 r = dm_swap_table(md, new_map); 828 r = dm_swap_table(md, new_map);
829 if (r) { 829 if (r) {
830 dm_table_destroy(new_map);
830 dm_put(md); 831 dm_put(md);
831 dm_table_put(new_map);
832 return r; 832 return r;
833 } 833 }
834 834
@@ -836,8 +836,6 @@ static int do_resume(struct dm_ioctl *param)
836 set_disk_ro(dm_disk(md), 0); 836 set_disk_ro(dm_disk(md), 0);
837 else 837 else
838 set_disk_ro(dm_disk(md), 1); 838 set_disk_ro(dm_disk(md), 1);
839
840 dm_table_put(new_map);
841 } 839 }
842 840
843 if (dm_suspended(md)) 841 if (dm_suspended(md))
@@ -1080,7 +1078,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
1080 } 1078 }
1081 1079
1082 if (hc->new_map) 1080 if (hc->new_map)
1083 dm_table_put(hc->new_map); 1081 dm_table_destroy(hc->new_map);
1084 hc->new_map = t; 1082 hc->new_map = t;
1085 up_write(&_hash_lock); 1083 up_write(&_hash_lock);
1086 1084
@@ -1109,7 +1107,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1109 } 1107 }
1110 1108
1111 if (hc->new_map) { 1109 if (hc->new_map) {
1112 dm_table_put(hc->new_map); 1110 dm_table_destroy(hc->new_map);
1113 hc->new_map = NULL; 1111 hc->new_map = NULL;
1114 } 1112 }
1115 1113
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index ebaaf72cd822..2fd66c30f7f8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001 Sistina Software (UK) Limited. 2 * Copyright (C) 2001 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -15,6 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/mutex.h> 17#include <linux/mutex.h>
18#include <linux/delay.h>
18#include <asm/atomic.h> 19#include <asm/atomic.h>
19 20
20#define DM_MSG_PREFIX "table" 21#define DM_MSG_PREFIX "table"
@@ -24,6 +25,19 @@
24#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) 25#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
25#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) 26#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
26 27
28/*
29 * The table has always exactly one reference from either mapped_device->map
30 * or hash_cell->new_map. This reference is not counted in table->holders.
31 * A pair of dm_create_table/dm_destroy_table functions is used for table
32 * creation/destruction.
33 *
34 * Temporary references from the other code increase table->holders. A pair
35 * of dm_table_get/dm_table_put functions is used to manipulate it.
36 *
37 * When the table is about to be destroyed, we wait for table->holders to
38 * drop to zero.
39 */
40
27struct dm_table { 41struct dm_table {
28 struct mapped_device *md; 42 struct mapped_device *md;
29 atomic_t holders; 43 atomic_t holders;
@@ -228,7 +242,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
228 return -ENOMEM; 242 return -ENOMEM;
229 243
230 INIT_LIST_HEAD(&t->devices); 244 INIT_LIST_HEAD(&t->devices);
231 atomic_set(&t->holders, 1); 245 atomic_set(&t->holders, 0);
232 t->barriers_supported = 1; 246 t->barriers_supported = 1;
233 247
234 if (!num_targets) 248 if (!num_targets)
@@ -259,10 +273,14 @@ static void free_devices(struct list_head *devices)
259 } 273 }
260} 274}
261 275
262static void table_destroy(struct dm_table *t) 276void dm_table_destroy(struct dm_table *t)
263{ 277{
264 unsigned int i; 278 unsigned int i;
265 279
280 while (atomic_read(&t->holders))
281 msleep(1);
282 smp_mb();
283
266 /* free the indexes (see dm_table_complete) */ 284 /* free the indexes (see dm_table_complete) */
267 if (t->depth >= 2) 285 if (t->depth >= 2)
268 vfree(t->index[t->depth - 2]); 286 vfree(t->index[t->depth - 2]);
@@ -300,8 +318,8 @@ void dm_table_put(struct dm_table *t)
300 if (!t) 318 if (!t)
301 return; 319 return;
302 320
303 if (atomic_dec_and_test(&t->holders)) 321 smp_mb__before_atomic_dec();
304 table_destroy(t); 322 atomic_dec(&t->holders);
305} 323}
306 324
307/* 325/*
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dd953b189f45..9f9aa64f7336 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -977,8 +977,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
977 struct mapped_device *md = congested_data; 977 struct mapped_device *md = congested_data;
978 struct dm_table *map; 978 struct dm_table *map;
979 979
980 atomic_inc(&md->pending);
981
982 if (!test_bit(DMF_BLOCK_IO, &md->flags)) { 980 if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
983 map = dm_get_table(md); 981 map = dm_get_table(md);
984 if (map) { 982 if (map) {
@@ -987,10 +985,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
987 } 985 }
988 } 986 }
989 987
990 if (!atomic_dec_return(&md->pending))
991 /* nudge anyone waiting on suspend queue */
992 wake_up(&md->wait);
993
994 return r; 988 return r;
995} 989}
996 990
@@ -1250,10 +1244,12 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
1250 1244
1251 if (md->suspended_bdev) 1245 if (md->suspended_bdev)
1252 __set_size(md, size); 1246 __set_size(md, size);
1253 if (size == 0) 1247
1248 if (!size) {
1249 dm_table_destroy(t);
1254 return 0; 1250 return 0;
1251 }
1255 1252
1256 dm_table_get(t);
1257 dm_table_event_callback(t, event_callback, md); 1253 dm_table_event_callback(t, event_callback, md);
1258 1254
1259 write_lock(&md->map_lock); 1255 write_lock(&md->map_lock);
@@ -1275,7 +1271,7 @@ static void __unbind(struct mapped_device *md)
1275 write_lock(&md->map_lock); 1271 write_lock(&md->map_lock);
1276 md->map = NULL; 1272 md->map = NULL;
1277 write_unlock(&md->map_lock); 1273 write_unlock(&md->map_lock);
1278 dm_table_put(map); 1274 dm_table_destroy(map);
1279} 1275}
1280 1276
1281/* 1277/*
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 5b5d08ba9e92..bbbe9110f3bf 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -36,6 +36,7 @@ struct dm_table;
36/*----------------------------------------------------------------- 36/*-----------------------------------------------------------------
37 * Internal table functions. 37 * Internal table functions.
38 *---------------------------------------------------------------*/ 38 *---------------------------------------------------------------*/
39void dm_table_destroy(struct dm_table *t);
39void dm_table_event_callback(struct dm_table *t, 40void dm_table_event_callback(struct dm_table *t,
40 void (*fn)(void *), void *context); 41 void (*fn)(void *), void *context);
41struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); 42struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);