aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEd Cashin <ecashin@coraid.com>2012-12-17 19:04:09 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-17 20:15:25 -0500
commite52a29326462badd9ceec90a9eb2ac2a8550e02e (patch)
tree28d732cd1bbae93d3e2b490127a6afbc0af279c8
parentbbb44e30d07fdc111e34a5ec935b57521cea9499 (diff)
aoe: avoid races between device destruction and discovery
This change avoids a race that could result in a NULL pointer derference following a WARNing from kobject_add_internal, "don't try to register things with the same name in the same directory." The problem was found with a test that forgets and discovers an aoe device in a loop: while test ! -r /tmp/stop; do aoe-flush -a aoe-discover done The race was between aoedev_flush taking aoedevs out of the devlist, allowing a new discovery of the same AoE target to take place before the driver gets around to calling sysfs_remove_group. Fixing that one revealed another race between do_open and add_disk, and this patch avoids that, too. The fix required some care, because for flushing (forgetting) an aoedev, some of the steps must be performed under lock and some must be able to sleep. Also, for discovering a new aoedev, some steps might sleep. The check for a bad aoedev pointer remains from a time when about half of this patch was done, and it was possible for the bdev->bd_disk->private_data to become corrupted. The check should be removed eventually, but it is not expected to add significant overhead, occurring in the aoeblk_open routine. Signed-off-by: Ed Cashin <ecashin@coraid.com> Cc: Jens Axboe <axboe@kernel.dk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--drivers/block/aoe/aoe.h7
-rw-r--r--drivers/block/aoe/aoeblk.c36
-rw-r--r--drivers/block/aoe/aoedev.c166
3 files changed, 146 insertions, 63 deletions
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index b6d2b16358be..d50e9455b937 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -74,8 +74,11 @@ enum {
74 DEVFL_TKILL = (1<<1), /* flag for timer to know when to kill self */ 74 DEVFL_TKILL = (1<<1), /* flag for timer to know when to kill self */
75 DEVFL_EXT = (1<<2), /* device accepts lba48 commands */ 75 DEVFL_EXT = (1<<2), /* device accepts lba48 commands */
76 DEVFL_GDALLOC = (1<<3), /* need to alloc gendisk */ 76 DEVFL_GDALLOC = (1<<3), /* need to alloc gendisk */
77 DEVFL_KICKME = (1<<4), /* slow polling network card catch */ 77 DEVFL_GD_NOW = (1<<4), /* allocating gendisk */
78 DEVFL_NEWSIZE = (1<<5), /* need to update dev size in block layer */ 78 DEVFL_KICKME = (1<<5), /* slow polling network card catch */
79 DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */
80 DEVFL_FREEING = (1<<7), /* set when device is being cleaned up */
81 DEVFL_FREED = (1<<8), /* device has been cleaned up */
79}; 82};
80 83
81enum { 84enum {
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 57ac72c1715a..6b5b7876ecf3 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -147,9 +147,18 @@ aoeblk_open(struct block_device *bdev, fmode_t mode)
147 struct aoedev *d = bdev->bd_disk->private_data; 147 struct aoedev *d = bdev->bd_disk->private_data;
148 ulong flags; 148 ulong flags;
149 149
150 if (!virt_addr_valid(d)) {
151 pr_crit("aoe: invalid device pointer in %s\n",
152 __func__);
153 WARN_ON(1);
154 return -ENODEV;
155 }
156 if (!(d->flags & DEVFL_UP) || d->flags & DEVFL_TKILL)
157 return -ENODEV;
158
150 mutex_lock(&aoeblk_mutex); 159 mutex_lock(&aoeblk_mutex);
151 spin_lock_irqsave(&d->lock, flags); 160 spin_lock_irqsave(&d->lock, flags);
152 if (d->flags & DEVFL_UP) { 161 if (d->flags & DEVFL_UP && !(d->flags & DEVFL_TKILL)) {
153 d->nopen++; 162 d->nopen++;
154 spin_unlock_irqrestore(&d->lock, flags); 163 spin_unlock_irqrestore(&d->lock, flags);
155 mutex_unlock(&aoeblk_mutex); 164 mutex_unlock(&aoeblk_mutex);
@@ -259,6 +268,18 @@ aoeblk_gdalloc(void *vp)
259 struct request_queue *q; 268 struct request_queue *q;
260 enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, }; 269 enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, };
261 ulong flags; 270 ulong flags;
271 int late = 0;
272
273 spin_lock_irqsave(&d->lock, flags);
274 if (d->flags & DEVFL_GDALLOC
275 && !(d->flags & DEVFL_TKILL)
276 && !(d->flags & DEVFL_GD_NOW))
277 d->flags |= DEVFL_GD_NOW;
278 else
279 late = 1;
280 spin_unlock_irqrestore(&d->lock, flags);
281 if (late)
282 return;
262 283
263 gd = alloc_disk(AOE_PARTITIONS); 284 gd = alloc_disk(AOE_PARTITIONS);
264 if (gd == NULL) { 285 if (gd == NULL) {
@@ -282,6 +303,11 @@ aoeblk_gdalloc(void *vp)
282 } 303 }
283 304
284 spin_lock_irqsave(&d->lock, flags); 305 spin_lock_irqsave(&d->lock, flags);
306 WARN_ON(!(d->flags & DEVFL_GD_NOW));
307 WARN_ON(!(d->flags & DEVFL_GDALLOC));
308 WARN_ON(d->flags & DEVFL_TKILL);
309 WARN_ON(d->gd);
310 WARN_ON(d->flags & DEVFL_UP);
285 blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); 311 blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS);
286 q->backing_dev_info.name = "aoe"; 312 q->backing_dev_info.name = "aoe";
287 q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; 313 q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
@@ -306,6 +332,11 @@ aoeblk_gdalloc(void *vp)
306 332
307 add_disk(gd); 333 add_disk(gd);
308 aoedisk_add_sysfs(d); 334 aoedisk_add_sysfs(d);
335
336 spin_lock_irqsave(&d->lock, flags);
337 WARN_ON(!(d->flags & DEVFL_GD_NOW));
338 d->flags &= ~DEVFL_GD_NOW;
339 spin_unlock_irqrestore(&d->lock, flags);
309 return; 340 return;
310 341
311err_mempool: 342err_mempool:
@@ -314,7 +345,8 @@ err_disk:
314 put_disk(gd); 345 put_disk(gd);
315err: 346err:
316 spin_lock_irqsave(&d->lock, flags); 347 spin_lock_irqsave(&d->lock, flags);
317 d->flags &= ~DEVFL_GDALLOC; 348 d->flags &= ~DEVFL_GD_NOW;
349 schedule_work(&d->work);
318 spin_unlock_irqrestore(&d->lock, flags); 350 spin_unlock_irqrestore(&d->lock, flags);
319} 351}
320 352
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index f0c0c7416aed..3776715eb255 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -15,7 +15,6 @@
15#include "aoe.h" 15#include "aoe.h"
16 16
17static void dummy_timer(ulong); 17static void dummy_timer(ulong);
18static void aoedev_freedev(struct aoedev *);
19static void freetgt(struct aoedev *d, struct aoetgt *t); 18static void freetgt(struct aoedev *d, struct aoetgt *t);
20static void skbpoolfree(struct aoedev *d); 19static void skbpoolfree(struct aoedev *d);
21 20
@@ -236,29 +235,6 @@ aoedev_downdev(struct aoedev *d)
236 set_capacity(d->gd, 0); 235 set_capacity(d->gd, 0);
237} 236}
238 237
239static void
240aoedev_freedev(struct aoedev *d)
241{
242 struct aoetgt **t, **e;
243
244 cancel_work_sync(&d->work);
245 if (d->gd) {
246 aoedisk_rm_sysfs(d);
247 del_gendisk(d->gd);
248 put_disk(d->gd);
249 blk_cleanup_queue(d->blkq);
250 }
251 t = d->targets;
252 e = t + NTARGETS;
253 for (; t < e && *t; t++)
254 freetgt(d, *t);
255 if (d->bufpool)
256 mempool_destroy(d->bufpool);
257 skbpoolfree(d);
258 minor_free(d->sysminor);
259 kfree(d);
260}
261
262/* return whether the user asked for this particular 238/* return whether the user asked for this particular
263 * device to be flushed 239 * device to be flushed
264 */ 240 */
@@ -283,17 +259,62 @@ user_req(char *s, size_t slen, struct aoedev *d)
283 return !strncmp(s, p, lim); 259 return !strncmp(s, p, lim);
284} 260}
285 261
286int 262static void
287aoedev_flush(const char __user *str, size_t cnt) 263freedev(struct aoedev *d)
264{
265 struct aoetgt **t, **e;
266 int freeing = 0;
267 unsigned long flags;
268
269 spin_lock_irqsave(&d->lock, flags);
270 if (d->flags & DEVFL_TKILL
271 && !(d->flags & DEVFL_FREEING)) {
272 d->flags |= DEVFL_FREEING;
273 freeing = 1;
274 }
275 spin_unlock_irqrestore(&d->lock, flags);
276 if (!freeing)
277 return;
278
279 del_timer_sync(&d->timer);
280 if (d->gd) {
281 aoedisk_rm_sysfs(d);
282 del_gendisk(d->gd);
283 put_disk(d->gd);
284 blk_cleanup_queue(d->blkq);
285 }
286 t = d->targets;
287 e = t + NTARGETS;
288 for (; t < e && *t; t++)
289 freetgt(d, *t);
290 if (d->bufpool)
291 mempool_destroy(d->bufpool);
292 skbpoolfree(d);
293 minor_free(d->sysminor);
294
295 spin_lock_irqsave(&d->lock, flags);
296 d->flags |= DEVFL_FREED;
297 spin_unlock_irqrestore(&d->lock, flags);
298}
299
300enum flush_parms {
301 NOT_EXITING = 0,
302 EXITING = 1,
303};
304
305static int
306flush(const char __user *str, size_t cnt, int exiting)
288{ 307{
289 ulong flags; 308 ulong flags;
290 struct aoedev *d, **dd; 309 struct aoedev *d, **dd;
291 struct aoedev *rmd = NULL;
292 char buf[16]; 310 char buf[16];
293 int all = 0; 311 int all = 0;
294 int specified = 0; /* flush a specific device */ 312 int specified = 0; /* flush a specific device */
313 unsigned int skipflags;
314
315 skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL;
295 316
296 if (cnt >= 3) { 317 if (!exiting && cnt >= 3) {
297 if (cnt > sizeof buf) 318 if (cnt > sizeof buf)
298 cnt = sizeof buf; 319 cnt = sizeof buf;
299 if (copy_from_user(buf, str, cnt)) 320 if (copy_from_user(buf, str, cnt))
@@ -303,39 +324,71 @@ aoedev_flush(const char __user *str, size_t cnt)
303 specified = 1; 324 specified = 1;
304 } 325 }
305 326
327 flush_scheduled_work();
328 /* pass one: without sleeping, do aoedev_downdev */
306 spin_lock_irqsave(&devlist_lock, flags); 329 spin_lock_irqsave(&devlist_lock, flags);
307 dd = &devlist; 330 for (d = devlist; d; d = d->next) {
308 while ((d = *dd)) {
309 spin_lock(&d->lock); 331 spin_lock(&d->lock);
310 if (specified) { 332 if (exiting) {
333 /* unconditionally take each device down */
334 } else if (specified) {
311 if (!user_req(buf, cnt, d)) 335 if (!user_req(buf, cnt, d))
312 goto skip; 336 goto cont;
313 } else if ((!all && (d->flags & DEVFL_UP)) 337 } else if ((!all && (d->flags & DEVFL_UP))
314 || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE)) 338 || d->flags & skipflags
315 || d->nopen 339 || d->nopen
316 || d->ref) 340 || d->ref)
317 goto skip; 341 goto cont;
318 342
319 *dd = d->next;
320 aoedev_downdev(d); 343 aoedev_downdev(d);
321 d->flags |= DEVFL_TKILL; 344 d->flags |= DEVFL_TKILL;
345cont:
322 spin_unlock(&d->lock); 346 spin_unlock(&d->lock);
323 d->next = rmd;
324 rmd = d;
325 continue;
326skip:
327 spin_unlock(&d->lock);
328 dd = &d->next;
329 } 347 }
330 spin_unlock_irqrestore(&devlist_lock, flags); 348 spin_unlock_irqrestore(&devlist_lock, flags);
331 while ((d = rmd)) { 349
332 rmd = d->next; 350 /* pass two: call freedev, which might sleep,
333 del_timer_sync(&d->timer); 351 * for aoedevs marked with DEVFL_TKILL
334 aoedev_freedev(d); /* must be able to sleep */ 352 */
353restart:
354 spin_lock_irqsave(&devlist_lock, flags);
355 for (d = devlist; d; d = d->next) {
356 spin_lock(&d->lock);
357 if (d->flags & DEVFL_TKILL
358 && !(d->flags & DEVFL_FREEING)) {
359 spin_unlock(&d->lock);
360 spin_unlock_irqrestore(&devlist_lock, flags);
361 freedev(d);
362 goto restart;
363 }
364 spin_unlock(&d->lock);
335 } 365 }
366
367 /* pass three: remove aoedevs marked with DEVFL_FREED */
368 for (dd = &devlist, d = *dd; d; d = *dd) {
369 struct aoedev *doomed = NULL;
370
371 spin_lock(&d->lock);
372 if (d->flags & DEVFL_FREED) {
373 *dd = d->next;
374 doomed = d;
375 } else {
376 dd = &d->next;
377 }
378 spin_unlock(&d->lock);
379 kfree(doomed);
380 }
381 spin_unlock_irqrestore(&devlist_lock, flags);
382
336 return 0; 383 return 0;
337} 384}
338 385
386int
387aoedev_flush(const char __user *str, size_t cnt)
388{
389 return flush(str, cnt, NOT_EXITING);
390}
391
339/* This has been confirmed to occur once with Tms=3*1000 due to the 392/* This has been confirmed to occur once with Tms=3*1000 due to the
340 * driver changing link and not processing its transmit ring. The 393 * driver changing link and not processing its transmit ring. The
341 * problem is hard enough to solve by returning an error that I'm 394 * problem is hard enough to solve by returning an error that I'm
@@ -388,7 +441,14 @@ aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
388 441
389 for (d=devlist; d; d=d->next) 442 for (d=devlist; d; d=d->next)
390 if (d->aoemajor == maj && d->aoeminor == min) { 443 if (d->aoemajor == maj && d->aoeminor == min) {
444 spin_lock(&d->lock);
445 if (d->flags & DEVFL_TKILL) {
446 spin_unlock(&d->lock);
447 d = NULL;
448 goto out;
449 }
391 d->ref++; 450 d->ref++;
451 spin_unlock(&d->lock);
392 break; 452 break;
393 } 453 }
394 if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0) 454 if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
@@ -448,21 +508,9 @@ freetgt(struct aoedev *d, struct aoetgt *t)
448void 508void
449aoedev_exit(void) 509aoedev_exit(void)
450{ 510{
451 struct aoedev *d; 511 flush_scheduled_work();
452 ulong flags;
453
454 aoe_flush_iocq(); 512 aoe_flush_iocq();
455 while ((d = devlist)) { 513 flush(NULL, 0, EXITING);
456 devlist = d->next;
457
458 spin_lock_irqsave(&d->lock, flags);
459 aoedev_downdev(d);
460 d->flags |= DEVFL_TKILL;
461 spin_unlock_irqrestore(&d->lock, flags);
462
463 del_timer_sync(&d->timer);
464 aoedev_freedev(d);
465 }
466} 514}
467 515
468int __init 516int __init