aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorNeilBrown <neilb@cse.unsw.edu.au>2005-06-21 20:17:14 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-21 22:07:43 -0400
commit32a7627cf3a35396a8e834faf34e38ae9f3b1309 (patch)
tree3fe7764f5d8e39d835a397e1099358d924b02981 /include
parent57afd89f98a990747445f01c458ecae64263b2f8 (diff)
[PATCH] md: optimised resync using Bitmap based intent logging
With this patch, the intent to write to some block in the array can be logged to a bitmap file. Each bit represents some number of sectors and is set before any update happens, and only cleared when all writes relating to all sectors are complete. After an unclean shutdown, information in this bitmap can be used to optimise resync - only sectors which could be out-of-sync need to be updated. Also if a drive is removed and then added back into an array, the recovery can make use of the bitmap to optimise reconstruction. This is not implemented in this patch. Currently the bitmap is stored in a file which must (obviously) be stored on a separate device. The patch only provided infrastructure. It does not update any personalities to bitmap intent logging. Md arrays can still be used with no bitmap file. This patch has minimal impact on such arrays. Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/raid/bitmap.h280
-rw-r--r--include/linux/raid/md_k.h4
-rw-r--r--include/linux/raid/md_u.h7
3 files changed, 291 insertions, 0 deletions
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
new file mode 100644
index 000000000000..f785cf26cbad
--- /dev/null
+++ b/include/linux/raid/bitmap.h
@@ -0,0 +1,280 @@
1/*
2 * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
3 *
4 * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
5 */
6#ifndef BITMAP_H
7#define BITMAP_H 1
8
9#define BITMAP_MAJOR 3
10#define BITMAP_MINOR 38
11
12/*
13 * in-memory bitmap:
14 *
15 * Use 16 bit block counters to track pending writes to each "chunk".
16 * The 2 high order bits are special-purpose, the first is a flag indicating
17 * whether a resync is needed. The second is a flag indicating whether a
18 * resync is active.
19 * This means that the counter is actually 14 bits:
20 *
21 * +--------+--------+------------------------------------------------+
22 * | resync | resync | counter |
23 * | needed | active | |
24 * | (0-1) | (0-1) | (0-16383) |
25 * +--------+--------+------------------------------------------------+
26 *
27 * The "resync needed" bit is set when:
28 * a '1' bit is read from storage at startup.
29 * a write request fails on some drives
30 * a resync is aborted on a chunk with 'resync active' set
31 * It is cleared (and resync-active set) when a resync starts across all drives
32 * of the chunk.
33 *
34 *
35 * The "resync active" bit is set when:
36 * a resync is started on all drives, and resync_needed is set.
37 * resync_needed will be cleared (as long as resync_active wasn't already set).
38 * It is cleared when a resync completes.
39 *
40 * The counter counts pending write requests, plus the on-disk bit.
41 * When the counter is '1' and the resync bits are clear, the on-disk
42 * bit can be cleared aswell, thus setting the counter to 0.
43 * When we set a bit, or in the counter (to start a write), if the fields is
44 * 0, we first set the disk bit and set the counter to 1.
45 *
46 * If the counter is 0, the on-disk bit is clear and the stipe is clean
47 * Anything that dirties the stipe pushes the counter to 2 (at least)
48 * and sets the on-disk bit (lazily).
49 * If a periodic sweep find the counter at 2, it is decremented to 1.
50 * If the sweep find the counter at 1, the on-disk bit is cleared and the
51 * counter goes to zero.
52 *
53 * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
54 * counters as a fallback when "page" memory cannot be allocated:
55 *
56 * Normal case (page memory allocated):
57 *
58 * page pointer (32-bit)
59 *
60 * [ ] ------+
61 * |
62 * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
63 * c1 c2 c2048
64 *
65 * Hijacked case (page memory allocation failed):
66 *
67 * hijacked page pointer (32-bit)
68 *
69 * [ ][ ] (no page memory allocated)
70 * counter #1 (16-bit) counter #2 (16-bit)
71 *
72 */
73
74#ifdef __KERNEL__
75
76#define PAGE_BITS (PAGE_SIZE << 3)
77#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
78
79typedef __u16 bitmap_counter_t;
80#define COUNTER_BITS 16
81#define COUNTER_BIT_SHIFT 4
82#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8)
83#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
84
85#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
86#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
87#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
88#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
89#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
90#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
91
92/* how many counters per page? */
93#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
94/* same, except a shift value for more efficient bitops */
95#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
96/* same, except a mask value for more efficient bitops */
97#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
98
99#define BITMAP_BLOCK_SIZE 512
100#define BITMAP_BLOCK_SHIFT 9
101
102/* how many blocks per chunk? (this is variable) */
103#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)
104#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
105#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
106
107/* when hijacked, the counters and bits represent even larger "chunks" */
108/* there will be 1024 chunks represented by each counter in the page pointers */
109#define PAGEPTR_BLOCK_RATIO(bitmap) \
110 (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
111#define PAGEPTR_BLOCK_SHIFT(bitmap) \
112 (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
113#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
114
115/*
116 * on-disk bitmap:
117 *
118 * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
119 * file a page at a time. There's a superblock at the start of the file.
120 */
121
122/* map chunks (bits) to file pages - offset by the size of the superblock */
123#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
124
125#endif
126
127/*
128 * bitmap structures:
129 */
130
131#define BITMAP_MAGIC 0x6d746962
132
133/* use these for bitmap->flags and bitmap->sb->state bit-fields */
134enum bitmap_state {
135 BITMAP_ACTIVE = 0x001, /* the bitmap is in use */
136 BITMAP_STALE = 0x002 /* the bitmap file is out of date or had -EIO */
137};
138
139/* the superblock at the front of the bitmap file -- little endian */
140typedef struct bitmap_super_s {
141 __u32 magic; /* 0 BITMAP_MAGIC */
142 __u32 version; /* 4 the bitmap major for now, could change... */
143 __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */
144 __u64 events; /* 24 event counter for the bitmap (1)*/
145 __u64 events_cleared;/*32 event counter when last bit cleared (2) */
146 __u64 sync_size; /* 40 the size of the md device's sync range(3) */
147 __u32 state; /* 48 bitmap state information */
148 __u32 chunksize; /* 52 the bitmap chunk size in bytes */
149 __u32 daemon_sleep; /* 56 seconds between disk flushes */
150
151 __u8 pad[256 - 60]; /* set to zero */
152} bitmap_super_t;
153
154/* notes:
155 * (1) This event counter is updated before the eventcounter in the md superblock
156 * When a bitmap is loaded, it is only accepted if this event counter is equal
157 * to, or one greater than, the event counter in the superblock.
158 * (2) This event counter is updated when the other one is *if*and*only*if* the
159 * array is not degraded. As bits are not cleared when the array is degraded,
160 * this represents the last time that any bits were cleared.
161 * If a device is being added that has an event count with this value or
162 * higher, it is accepted as conforming to the bitmap.
163 * (3)This is the number of sectors represented by the bitmap, and is the range that
164 * resync happens across. For raid1 and raid5/6 it is the size of individual
165 * devices. For raid10 it is the size of the array.
166 */
167
168#ifdef __KERNEL__
169
170/* the in-memory bitmap is represented by bitmap_pages */
171struct bitmap_page {
172 /*
173 * map points to the actual memory page
174 */
175 char *map;
176 /*
177 * in emergencies (when map cannot be alloced), hijack the map
178 * pointer and use it as two counters itself
179 */
180 unsigned int hijacked:1;
181 /*
182 * count of dirty bits on the page
183 */
184 unsigned int count:31;
185};
186
187/* keep track of bitmap file pages that have pending writes on them */
188struct page_list {
189 struct list_head list;
190 struct page *page;
191};
192
193/* the main bitmap structure - one per mddev */
194struct bitmap {
195 struct bitmap_page *bp;
196 unsigned long pages; /* total number of pages in the bitmap */
197 unsigned long missing_pages; /* number of pages not yet allocated */
198
199 mddev_t *mddev; /* the md device that the bitmap is for */
200
201 int counter_bits; /* how many bits per block counter */
202
203 /* bitmap chunksize -- how much data does each bit represent? */
204 unsigned long chunksize;
205 unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
206 unsigned long chunks; /* total number of data chunks for the array */
207
208 /* We hold a count on the chunk currently being synced, and drop
209 * it when the last block is started. If the resync is aborted
210 * midway, we need to be able to drop that count, so we remember
211 * the counted chunk..
212 */
213 unsigned long syncchunk;
214
215 __u64 events_cleared;
216
217 /* bitmap spinlock */
218 spinlock_t lock;
219
220 struct file *file; /* backing disk file */
221 struct page *sb_page; /* cached copy of the bitmap file superblock */
222 struct page **filemap; /* list of cache pages for the file */
223 unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
224 unsigned long file_pages; /* number of pages in the file */
225
226 unsigned long flags;
227
228 /*
229 * the bitmap daemon - periodically wakes up and sweeps the bitmap
230 * file, cleaning up bits and flushing out pages to disk as necessary
231 */
232 unsigned long daemon_lastrun; /* jiffies of last run */
233 unsigned long daemon_sleep; /* how many seconds between updates? */
234
235 /*
236 * bitmap write daemon - this daemon performs writes to the bitmap file
237 * this thread is only needed because of a limitation in ext3 (jbd)
238 * that does not allow a task to have two journal transactions ongoing
239 * simultaneously (even if the transactions are for two different
240 * filesystems) -- in the case of bitmap, that would be the filesystem
241 * that the bitmap file resides on and the filesystem that is mounted
242 * on the md device -- see current->journal_info in jbd/transaction.c
243 */
244 mdk_thread_t *writeback_daemon;
245 spinlock_t write_lock;
246 struct semaphore write_ready;
247 struct semaphore write_done;
248 unsigned long writes_pending;
249 wait_queue_head_t write_wait;
250 struct list_head write_pages;
251 struct list_head complete_pages;
252 mempool_t *write_pool;
253};
254
255/* the bitmap API */
256
257/* these are used only by md/bitmap */
258int bitmap_create(mddev_t *mddev);
259void bitmap_destroy(mddev_t *mddev);
260int bitmap_active(struct bitmap *bitmap);
261
262char *file_path(struct file *file, char *buf, int count);
263void bitmap_print_sb(struct bitmap *bitmap);
264int bitmap_update_sb(struct bitmap *bitmap);
265
266int bitmap_setallbits(struct bitmap *bitmap);
267
268/* these are exported */
269int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors);
270void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
271 int success);
272int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks);
273void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
274void bitmap_close_sync(struct bitmap *bitmap);
275
276int bitmap_unplug(struct bitmap *bitmap);
277int bitmap_daemon_work(struct bitmap *bitmap);
278#endif
279
280#endif
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index bce0032decff..16e94a9f0f8c 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -267,6 +267,9 @@ struct mddev_s
267 atomic_t writes_pending; 267 atomic_t writes_pending;
268 request_queue_t *queue; /* for plugging ... */ 268 request_queue_t *queue; /* for plugging ... */
269 269
270 struct bitmap *bitmap; /* the bitmap for the device */
271 struct file *bitmap_file; /* the bitmap file */
272
270 struct list_head all_mddevs; 273 struct list_head all_mddevs;
271}; 274};
272 275
@@ -341,6 +344,7 @@ typedef struct mdk_thread_s {
341 unsigned long flags; 344 unsigned long flags;
342 struct completion *event; 345 struct completion *event;
343 struct task_struct *tsk; 346 struct task_struct *tsk;
347 unsigned long timeout;
344 const char *name; 348 const char *name;
345} mdk_thread_t; 349} mdk_thread_t;
346 350
diff --git a/include/linux/raid/md_u.h b/include/linux/raid/md_u.h
index a2df5c2a42af..81da20ccec4d 100644
--- a/include/linux/raid/md_u.h
+++ b/include/linux/raid/md_u.h
@@ -23,6 +23,7 @@
23#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) 23#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t)
24#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) 24#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13)
25#define RAID_AUTORUN _IO (MD_MAJOR, 0x14) 25#define RAID_AUTORUN _IO (MD_MAJOR, 0x14)
26#define GET_BITMAP_FILE _IOR (MD_MAJOR, 0x15, mdu_bitmap_file_t)
26 27
27/* configuration */ 28/* configuration */
28#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20) 29#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20)
@@ -36,6 +37,7 @@
36#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) 37#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28)
37#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) 38#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29)
38#define HOT_GENERATE_ERROR _IO (MD_MAJOR, 0x2a) 39#define HOT_GENERATE_ERROR _IO (MD_MAJOR, 0x2a)
40#define SET_BITMAP_FILE _IOW (MD_MAJOR, 0x2b, int)
39 41
40/* usage */ 42/* usage */
41#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t) 43#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t)
@@ -106,6 +108,11 @@ typedef struct mdu_start_info_s {
106 108
107} mdu_start_info_t; 109} mdu_start_info_t;
108 110
111typedef struct mdu_bitmap_file_s
112{
113 char pathname[4096];
114} mdu_bitmap_file_t;
115
109typedef struct mdu_param_s 116typedef struct mdu_param_s
110{ 117{
111 int personality; /* 1,2,3,4 */ 118 int personality; /* 1,2,3,4 */