aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlasdair G Kergon <agk@redhat.com>2006-03-27 04:17:44 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-03-27 11:44:58 -0500
commitb4b610f684d13bf8691feeae5d4d7a8bd1f1033e (patch)
tree3226c70f318f494d3c6a6707879ba82ebec900b9
parenteccf081799be8d83852f183838bf26e1ca099db4 (diff)
[PATCH] device-mapper snapshot: replace sibling list
The siblings "list" is used unsafely at the moment. Firstly, only the element on the list being changed gets locked (via the snapshot lock), not the next and previous elements which have pointers that are also being changed. Secondly, if you have two or more snapshots and write to the same chunk a second time before every snapshot has finished making its private copy of the data, if you're unlucky, _origin_write() could attempt its list_merge() and dereference a 'last' pointer to a pending_exception structure that has just been freed. Analysis reveals that the list is actually only there for reference counting. If 5 pending_exceptions are needed in origin_write, then the 5 are joined together into a 5-element list - without a separate list head because there's nowhere suitable to store it. As the pending_exceptions complete, they are removed from the list one-by-one and any contents of origin_bios get moved across to one of the remaining pending_exceptions on the list. Whichever one is last is detected because list_empty() is then true and the origin_bios get submitted. The fix proposed here uses an alternative reference counting mechanism by choosing one of the pending_exceptions as primary and maintaining an atomic counter there. Signed-off-by: Alasdair G Kergon <agk@redhat.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--drivers/md/dm-snap.c122
1 files changed, 80 insertions, 42 deletions
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 874f145431d8..475514bda9d0 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -54,11 +54,21 @@ struct pending_exception {
54 struct list_head list; 54 struct list_head list;
55 55
56 /* 56 /*
57 * Other pending_exceptions that are processing this 57 * The primary pending_exception is the one that holds
58 * chunk. When this list is empty, we know we can 58 * the sibling_count and the list of origin_bios for a
59 * complete the origins. 59 * group of pending_exceptions. It is always last to get freed.
60 * These fields get set up when writing to the origin.
60 */ 61 */
61 struct list_head siblings; 62 struct pending_exception *primary_pe;
63
64 /*
65 * Number of pending_exceptions processing this chunk.
66 * When this drops to zero we must complete the origin bios.
67 * If incrementing or decrementing this, hold pe->snap->lock for
68 * the sibling concerned and not pe->primary_pe->snap->lock unless
69 * they are the same.
70 */
71 atomic_t sibling_count;
62 72
63 /* Pointer back to snapshot context */ 73 /* Pointer back to snapshot context */
64 struct dm_snapshot *snap; 74 struct dm_snapshot *snap;
@@ -593,20 +603,15 @@ static void error_bios(struct bio *bio)
593 603
594static struct bio *__flush_bios(struct pending_exception *pe) 604static struct bio *__flush_bios(struct pending_exception *pe)
595{ 605{
596 struct pending_exception *sibling; 606 /*
597 607 * If this pe is involved in a write to the origin and
598 if (list_empty(&pe->siblings)) 608 * it is the last sibling to complete then release
599 return bio_list_get(&pe->origin_bios); 609 * the bios for the original write to the origin.
600
601 sibling = list_entry(pe->siblings.next,
602 struct pending_exception, siblings);
603
604 list_del(&pe->siblings);
605
606 /* This is fine as long as kcopyd is single-threaded. If kcopyd
607 * becomes multi-threaded, we'll need some locking here.
608 */ 610 */
609 bio_list_merge(&sibling->origin_bios, &pe->origin_bios); 611
612 if (pe->primary_pe &&
613 atomic_dec_and_test(&pe->primary_pe->sibling_count))
614 return bio_list_get(&pe->primary_pe->origin_bios);
610 615
611 return NULL; 616 return NULL;
612} 617}
@@ -614,6 +619,7 @@ static struct bio *__flush_bios(struct pending_exception *pe)
614static void pending_complete(struct pending_exception *pe, int success) 619static void pending_complete(struct pending_exception *pe, int success)
615{ 620{
616 struct exception *e; 621 struct exception *e;
622 struct pending_exception *primary_pe;
617 struct dm_snapshot *s = pe->snap; 623 struct dm_snapshot *s = pe->snap;
618 struct bio *flush = NULL; 624 struct bio *flush = NULL;
619 625
@@ -662,7 +668,20 @@ static void pending_complete(struct pending_exception *pe, int success)
662 } 668 }
663 669
664 out: 670 out:
665 free_pending_exception(pe); 671 primary_pe = pe->primary_pe;
672
673 /*
674 * Free the pe if it's not linked to an origin write or if
675 * it's not itself a primary pe.
676 */
677 if (!primary_pe || primary_pe != pe)
678 free_pending_exception(pe);
679
680 /*
681 * Free the primary pe if nothing references it.
682 */
683 if (primary_pe && !atomic_read(&primary_pe->sibling_count))
684 free_pending_exception(primary_pe);
666 685
667 if (flush) 686 if (flush)
668 flush_bios(flush); 687 flush_bios(flush);
@@ -757,7 +776,8 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
757 pe->e.old_chunk = chunk; 776 pe->e.old_chunk = chunk;
758 bio_list_init(&pe->origin_bios); 777 bio_list_init(&pe->origin_bios);
759 bio_list_init(&pe->snapshot_bios); 778 bio_list_init(&pe->snapshot_bios);
760 INIT_LIST_HEAD(&pe->siblings); 779 pe->primary_pe = NULL;
780 atomic_set(&pe->sibling_count, 1);
761 pe->snap = s; 781 pe->snap = s;
762 pe->started = 0; 782 pe->started = 0;
763 783
@@ -916,26 +936,12 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
916/*----------------------------------------------------------------- 936/*-----------------------------------------------------------------
917 * Origin methods 937 * Origin methods
918 *---------------------------------------------------------------*/ 938 *---------------------------------------------------------------*/
919static void list_merge(struct list_head *l1, struct list_head *l2)
920{
921 struct list_head *l1_n, *l2_p;
922
923 l1_n = l1->next;
924 l2_p = l2->prev;
925
926 l1->next = l2;
927 l2->prev = l1;
928
929 l2_p->next = l1_n;
930 l1_n->prev = l2_p;
931}
932
933static int __origin_write(struct list_head *snapshots, struct bio *bio) 939static int __origin_write(struct list_head *snapshots, struct bio *bio)
934{ 940{
935 int r = 1, first = 1; 941 int r = 1, first = 0;
936 struct dm_snapshot *snap; 942 struct dm_snapshot *snap;
937 struct exception *e; 943 struct exception *e;
938 struct pending_exception *pe, *next_pe, *last = NULL; 944 struct pending_exception *pe, *next_pe, *primary_pe = NULL;
939 chunk_t chunk; 945 chunk_t chunk;
940 LIST_HEAD(pe_queue); 946 LIST_HEAD(pe_queue);
941 947
@@ -962,6 +968,9 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
962 * Check exception table to see if block 968 * Check exception table to see if block
963 * is already remapped in this snapshot 969 * is already remapped in this snapshot
964 * and trigger an exception if not. 970 * and trigger an exception if not.
971 *
972 * sibling_count is initialised to 1 so pending_complete()
973 * won't destroy the primary_pe while we're inside this loop.
965 */ 974 */
966 e = lookup_exception(&snap->complete, chunk); 975 e = lookup_exception(&snap->complete, chunk);
967 if (!e) { 976 if (!e) {
@@ -971,31 +980,60 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
971 snap->valid = 0; 980 snap->valid = 0;
972 981
973 } else { 982 } else {
974 if (first) { 983 if (!primary_pe) {
975 bio_list_add(&pe->origin_bios, bio); 984 /*
985 * Either every pe here has same
986 * primary_pe or none has one yet.
987 */
988 if (pe->primary_pe)
989 primary_pe = pe->primary_pe;
990 else {
991 primary_pe = pe;
992 first = 1;
993 }
994
995 bio_list_add(&primary_pe->origin_bios,
996 bio);
976 r = 0; 997 r = 0;
977 first = 0;
978 } 998 }
979 if (last && list_empty(&pe->siblings)) 999 if (!pe->primary_pe) {
980 list_merge(&pe->siblings, 1000 atomic_inc(&primary_pe->sibling_count);
981 &last->siblings); 1001 pe->primary_pe = primary_pe;
1002 }
982 if (!pe->started) { 1003 if (!pe->started) {
983 pe->started = 1; 1004 pe->started = 1;
984 list_add_tail(&pe->list, &pe_queue); 1005 list_add_tail(&pe->list, &pe_queue);
985 } 1006 }
986 last = pe;
987 } 1007 }
988 } 1008 }
989 1009
990 up_write(&snap->lock); 1010 up_write(&snap->lock);
991 } 1011 }
992 1012
1013 if (!primary_pe)
1014 goto out;
1015
1016 /*
1017 * If this is the first time we're processing this chunk and
1018 * sibling_count is now 1 it means all the pending exceptions
1019 * got completed while we were in the loop above, so it falls to
1020 * us here to remove the primary_pe and submit any origin_bios.
1021 */
1022
1023 if (first && atomic_dec_and_test(&primary_pe->sibling_count)) {
1024 flush_bios(bio_list_get(&primary_pe->origin_bios));
1025 free_pending_exception(primary_pe);
1026 /* If we got here, pe_queue is necessarily empty. */
1027 goto out;
1028 }
1029
993 /* 1030 /*
994 * Now that we have a complete pe list we can start the copying. 1031 * Now that we have a complete pe list we can start the copying.
995 */ 1032 */
996 list_for_each_entry_safe(pe, next_pe, &pe_queue, list) 1033 list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
997 start_copy(pe); 1034 start_copy(pe);
998 1035
1036 out:
999 return r; 1037 return r;
1000} 1038}
1001 1039