aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeilBrown <neilb@cse.unsw.edu.au>2005-06-21 20:17:14 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-21 22:07:43 -0400
commit32a7627cf3a35396a8e834faf34e38ae9f3b1309 (patch)
tree3fe7764f5d8e39d835a397e1099358d924b02981
parent57afd89f98a990747445f01c458ecae64263b2f8 (diff)
[PATCH] md: optimised resync using Bitmap based intent logging
With this patch, the intent to write to some block in the array can be logged to a bitmap file. Each bit represents some number of sectors and is set before any update happens, and only cleared when all writes relating to all sectors are complete. After an unclean shutdown, information in this bitmap can be used to optimise resync - only sectors which could be out-of-sync need to be updated. Also if a drive is removed and then added back into an array, the recovery can make use of the bitmap to optimise reconstruction. This is not implemented in this patch. Currently the bitmap is stored in a file which must (obviously) be stored on a separate device. This patch only provides infrastructure. It does not update any personalities to use bitmap intent logging. Md arrays can still be used with no bitmap file. This patch has minimal impact on such arrays. Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--drivers/md/Makefile3
-rw-r--r--drivers/md/bitmap.c1519
-rw-r--r--drivers/md/md.c172
-rw-r--r--include/linux/raid/bitmap.h280
-rw-r--r--include/linux/raid/md_k.h4
-rw-r--r--include/linux/raid/md_u.h7
6 files changed, 1970 insertions, 15 deletions
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 90de9c146a5f..d3efedf6a6ad 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -7,6 +7,7 @@ dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
7dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o 7dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
8dm-snapshot-objs := dm-snap.o dm-exception-store.o 8dm-snapshot-objs := dm-snap.o dm-exception-store.o
9dm-mirror-objs := dm-log.o dm-raid1.o 9dm-mirror-objs := dm-log.o dm-raid1.o
10md-mod-objs := md.o bitmap.o
10raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \ 11raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \
11 raid6int1.o raid6int2.o raid6int4.o \ 12 raid6int1.o raid6int2.o raid6int4.o \
12 raid6int8.o raid6int16.o raid6int32.o \ 13 raid6int8.o raid6int16.o raid6int32.o \
@@ -28,7 +29,7 @@ obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
28obj-$(CONFIG_MD_RAID6) += raid6.o xor.o 29obj-$(CONFIG_MD_RAID6) += raid6.o xor.o
29obj-$(CONFIG_MD_MULTIPATH) += multipath.o 30obj-$(CONFIG_MD_MULTIPATH) += multipath.o
30obj-$(CONFIG_MD_FAULTY) += faulty.o 31obj-$(CONFIG_MD_FAULTY) += faulty.o
31obj-$(CONFIG_BLK_DEV_MD) += md.o 32obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
32obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o 33obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
33obj-$(CONFIG_DM_CRYPT) += dm-crypt.o 34obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
34obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o 35obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
new file mode 100644
index 000000000000..34ffc133db05
--- /dev/null
+++ b/drivers/md/bitmap.c
@@ -0,0 +1,1519 @@
1/*
2 * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
3 *
4 * bitmap_create - sets up the bitmap structure
5 * bitmap_destroy - destroys the bitmap structure
6 *
7 * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
8 * - added disk storage for bitmap
9 * - changes to allow various bitmap chunk sizes
10 * - added bitmap daemon (to asynchronously clear bitmap bits from disk)
11 */
12
13/*
14 * Still to do:
15 *
16 * flush after percent set rather than just time based. (maybe both).
17 * wait if count gets too high, wake when it drops to half.
18 * allow bitmap to be mirrored with superblock (before or after...)
19 * allow hot-add to re-instate a current device.
20 * allow hot-add of bitmap after quiescing device
21 */
22
23#include <linux/module.h>
24#include <linux/version.h>
25#include <linux/errno.h>
26#include <linux/slab.h>
27#include <linux/init.h>
28#include <linux/config.h>
29#include <linux/timer.h>
30#include <linux/sched.h>
31#include <linux/list.h>
32#include <linux/file.h>
33#include <linux/mount.h>
34#include <linux/buffer_head.h>
35#include <linux/raid/md.h>
36#include <linux/raid/bitmap.h>
37
/* debug macros */

#define DEBUG 0

#if DEBUG
/* these are for debugging purposes only! */

/* define one and only one of these */
#define INJECT_FAULTS_1 0 /* cause bitmap_alloc_page to fail always */
#define INJECT_FAULTS_2 0 /* cause bitmap file to be kicked when first bit set*/
#define INJECT_FAULTS_3 0 /* treat bitmap file as kicked at init time */
#define INJECT_FAULTS_4 0 /* undef */
#define INJECT_FAULTS_5 0 /* undef */
#define INJECT_FAULTS_6 0

/* if these are defined, the driver will fail! debug only */
#define INJECT_FATAL_FAULT_1 0 /* fail kmalloc, causing bitmap_create to fail */
#define INJECT_FATAL_FAULT_2 0 /* undef */
#define INJECT_FATAL_FAULT_3 0 /* undef */
#endif

/*
 * DPRINTK is for very verbose debug output and is compiled out here;
 * define it as PRINTK instead to enable it.
 */
#define DPRINTK(x...) do { } while(0)

/*
 * PRINTK prints at KERN_DEBUG level when DEBUG is non-zero.  Otherwise it
 * is a no-op statement, written as do/while(0) like DPRINTK so that it
 * stays a single well-formed statement wherever it is used.
 */
#ifndef PRINTK
# if DEBUG > 0
# define PRINTK(x...) printk(KERN_DEBUG x)
# else
# define PRINTK(x...) do { } while (0)
# endif
#endif
69
70static inline char * bmname(struct bitmap *bitmap)
71{
72 return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
73}
74
75
76/*
77 * test if the bitmap is active
78 */
79int bitmap_active(struct bitmap *bitmap)
80{
81 unsigned long flags;
82 int res = 0;
83
84 if (!bitmap)
85 return res;
86 spin_lock_irqsave(&bitmap->lock, flags);
87 res = bitmap->flags & BITMAP_ACTIVE;
88 spin_unlock_irqrestore(&bitmap->lock, flags);
89 return res;
90}
91
92#define WRITE_POOL_SIZE 256
93/* mempool for queueing pending writes on the bitmap file */
94static void *write_pool_alloc(unsigned int gfp_flags, void *data)
95{
96 return kmalloc(sizeof(struct page_list), gfp_flags);
97}
98
/*
 * Free callback for the write pool: releases an entry with kfree().
 * The pool's private data pointer is not used.
 */
static void write_pool_free(void *ptr, void *data)
{
	void *entry = ptr;

	kfree(entry);
}
103
104/*
105 * just a placeholder - calls kmalloc for bitmap pages
106 */
107static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
108{
109 unsigned char *page;
110
111#if INJECT_FAULTS_1
112 page = NULL;
113#else
114 page = kmalloc(PAGE_SIZE, GFP_NOIO);
115#endif
116 if (!page)
117 printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
118 else
119 printk("%s: bitmap_alloc_page: allocated page at %p\n",
120 bmname(bitmap), page);
121 return page;
122}
123
/*
 * Release a page obtained from bitmap_alloc_page().
 *
 * For now just a placeholder that calls kfree(); @bitmap is only used to
 * name the array in the debug message.
 */
static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
{
	PRINTK("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);

	kfree(page);
}
132
133/*
134 * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
135 *
136 * 1) check to see if this page is allocated, if it's not then try to alloc
137 * 2) if the alloc fails, set the page's hijacked flag so we'll use the
138 * page pointer directly as a counter
139 *
140 * if we find our page, we increment the page's refcount so that it stays
141 * allocated while we're using it
142 */
143static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create)
144{
145 unsigned char *mappage;
146
147 if (page >= bitmap->pages) {
148 printk(KERN_ALERT
149 "%s: invalid bitmap page request: %lu (> %lu)\n",
150 bmname(bitmap), page, bitmap->pages-1);
151 return -EINVAL;
152 }
153
154
155 if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */
156 return 0;
157
158 if (bitmap->bp[page].map) /* page is already allocated, just return */
159 return 0;
160
161 if (!create)
162 return -ENOENT;
163
164 spin_unlock_irq(&bitmap->lock);
165
166 /* this page has not been allocated yet */
167
168 if ((mappage = bitmap_alloc_page(bitmap)) == NULL) {
169 PRINTK("%s: bitmap map page allocation failed, hijacking\n",
170 bmname(bitmap));
171 /* failed - set the hijacked flag so that we can use the
172 * pointer as a counter */
173 spin_lock_irq(&bitmap->lock);
174 if (!bitmap->bp[page].map)
175 bitmap->bp[page].hijacked = 1;
176 goto out;
177 }
178
179 /* got a page */
180
181 spin_lock_irq(&bitmap->lock);
182
183 /* recheck the page */
184
185 if (bitmap->bp[page].map || bitmap->bp[page].hijacked) {
186 /* somebody beat us to getting the page */
187 bitmap_free_page(bitmap, mappage);
188 return 0;
189 }
190
191 /* no page was in place and we have one, so install it */
192
193 memset(mappage, 0, PAGE_SIZE);
194 bitmap->bp[page].map = mappage;
195 bitmap->missing_pages--;
196out:
197 return 0;
198}
199
200
201/* if page is completely empty, put it back on the free list, or dealloc it */
202/* if page was hijacked, unmark the flag so it might get alloced next time */
203/* Note: lock should be held when calling this */
204static inline void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
205{
206 char *ptr;
207
208 if (bitmap->bp[page].count) /* page is still busy */
209 return;
210
211 /* page is no longer in use, it can be released */
212
213 if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */
214 bitmap->bp[page].hijacked = 0;
215 bitmap->bp[page].map = NULL;
216 return;
217 }
218
219 /* normal case, free the page */
220
221#if 0
222/* actually ... let's not. We will probably need the page again exactly when
223 * memory is tight and we are flusing to disk
224 */
225 return;
226#else
227 ptr = bitmap->bp[page].map;
228 bitmap->bp[page].map = NULL;
229 bitmap->missing_pages++;
230 bitmap_free_page(bitmap, ptr);
231 return;
232#endif
233}
234
235
236/*
237 * bitmap file handling - read and write the bitmap file and its superblock
238 */
239
240/* copy the pathname of a file to a buffer */
241char *file_path(struct file *file, char *buf, int count)
242{
243 struct dentry *d;
244 struct vfsmount *v;
245
246 if (!buf)
247 return NULL;
248
249 d = file->f_dentry;
250 v = file->f_vfsmnt;
251
252 buf = d_path(d, v, buf, count);
253
254 return IS_ERR(buf) ? NULL : buf;
255}
256
257/*
258 * basic page I/O operations
259 */
260
261/*
262 * write out a page
263 */
264static int write_page(struct page *page, int wait)
265{
266 int ret = -ENOMEM;
267
268 lock_page(page);
269
270 if (page->mapping == NULL)
271 goto unlock_out;
272 else if (i_size_read(page->mapping->host) < page->index << PAGE_SHIFT) {
273 ret = -ENOENT;
274 goto unlock_out;
275 }
276
277 ret = page->mapping->a_ops->prepare_write(NULL, page, 0, PAGE_SIZE);
278 if (!ret)
279 ret = page->mapping->a_ops->commit_write(NULL, page, 0,
280 PAGE_SIZE);
281 if (ret) {
282unlock_out:
283 unlock_page(page);
284 return ret;
285 }
286
287 set_page_dirty(page); /* force it to be written out */
288 return write_one_page(page, wait);
289}
290
291/* read a page from a file, pinning it into cache, and return bytes_read */
292static struct page *read_page(struct file *file, unsigned long index,
293 unsigned long *bytes_read)
294{
295 struct inode *inode = file->f_mapping->host;
296 struct page *page = NULL;
297 loff_t isize = i_size_read(inode);
298 unsigned long end_index = isize >> PAGE_CACHE_SHIFT;
299
300 PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_CACHE_SIZE,
301 (unsigned long long)index << PAGE_CACHE_SHIFT);
302
303 page = read_cache_page(inode->i_mapping, index,
304 (filler_t *)inode->i_mapping->a_ops->readpage, file);
305 if (IS_ERR(page))
306 goto out;
307 wait_on_page_locked(page);
308 if (!PageUptodate(page) || PageError(page)) {
309 page_cache_release(page);
310 page = ERR_PTR(-EIO);
311 goto out;
312 }
313
314 if (index > end_index) /* we have read beyond EOF */
315 *bytes_read = 0;
316 else if (index == end_index) /* possible short read */
317 *bytes_read = isize & ~PAGE_CACHE_MASK;
318 else
319 *bytes_read = PAGE_CACHE_SIZE; /* got a full page */
320out:
321 if (IS_ERR(page))
322 printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n",
323 (int)PAGE_CACHE_SIZE,
324 (unsigned long long)index << PAGE_CACHE_SHIFT,
325 PTR_ERR(page));
326 return page;
327}
328
329/*
330 * bitmap file superblock operations
331 */
332
333/* update the event counter and sync the superblock to disk */
334int bitmap_update_sb(struct bitmap *bitmap)
335{
336 bitmap_super_t *sb;
337 unsigned long flags;
338
339 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
340 return 0;
341 spin_lock_irqsave(&bitmap->lock, flags);
342 if (!bitmap->sb_page) { /* no superblock */
343 spin_unlock_irqrestore(&bitmap->lock, flags);
344 return 0;
345 }
346 page_cache_get(bitmap->sb_page);
347 spin_unlock_irqrestore(&bitmap->lock, flags);
348 sb = (bitmap_super_t *)kmap(bitmap->sb_page);
349 sb->events = cpu_to_le64(bitmap->mddev->events);
350 if (!bitmap->mddev->degraded)
351 sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
352 kunmap(bitmap->sb_page);
353 write_page(bitmap->sb_page, 0);
354 return 0;
355}
356
357/* print out the bitmap file superblock */
358void bitmap_print_sb(struct bitmap *bitmap)
359{
360 bitmap_super_t *sb;
361
362 if (!bitmap || !bitmap->sb_page)
363 return;
364 sb = (bitmap_super_t *)kmap(bitmap->sb_page);
365 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
366 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
367 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
368 printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n",
369 *(__u32 *)(sb->uuid+0),
370 *(__u32 *)(sb->uuid+4),
371 *(__u32 *)(sb->uuid+8),
372 *(__u32 *)(sb->uuid+12));
373 printk(KERN_DEBUG " events: %llu\n",
374 (unsigned long long) le64_to_cpu(sb->events));
375 printk(KERN_DEBUG "events_clred: %llu\n",
376 (unsigned long long) le64_to_cpu(sb->events_cleared));
377 printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state));
378 printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize));
379 printk(KERN_DEBUG "daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
380 printk(KERN_DEBUG " sync size: %llu KB\n", le64_to_cpu(sb->sync_size));
381 kunmap(bitmap->sb_page);
382}
383
384/* read the superblock from the bitmap file and initialize some bitmap fields */
385static int bitmap_read_sb(struct bitmap *bitmap)
386{
387 char *reason = NULL;
388 bitmap_super_t *sb;
389 unsigned long chunksize, daemon_sleep;
390 unsigned long bytes_read;
391 unsigned long long events;
392 int err = -EINVAL;
393
394 /* page 0 is the superblock, read it... */
395 bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read);
396 if (IS_ERR(bitmap->sb_page)) {
397 err = PTR_ERR(bitmap->sb_page);
398 bitmap->sb_page = NULL;
399 return err;
400 }
401
402 sb = (bitmap_super_t *)kmap(bitmap->sb_page);
403
404 if (bytes_read < sizeof(*sb)) { /* short read */
405 printk(KERN_INFO "%s: bitmap file superblock truncated\n",
406 bmname(bitmap));
407 err = -ENOSPC;
408 goto out;
409 }
410
411 chunksize = le32_to_cpu(sb->chunksize);
412 daemon_sleep = le32_to_cpu(sb->daemon_sleep);
413
414 /* verify that the bitmap-specific fields are valid */
415 if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
416 reason = "bad magic";
417 else if (sb->version != cpu_to_le32(BITMAP_MAJOR))
418 reason = "unrecognized superblock version";
419 else if (chunksize < 512 || chunksize > (1024 * 1024 * 4))
420 reason = "bitmap chunksize out of range (512B - 4MB)";
421 else if ((1 << ffz(~chunksize)) != chunksize)
422 reason = "bitmap chunksize not a power of 2";
423 else if (daemon_sleep < 1 || daemon_sleep > 15)
424 reason = "daemon sleep period out of range";
425 if (reason) {
426 printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n",
427 bmname(bitmap), reason);
428 goto out;
429 }
430
431 /* keep the array size field of the bitmap superblock up to date */
432 sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
433
434 if (!bitmap->mddev->persistent)
435 goto success;
436
437 /*
438 * if we have a persistent array superblock, compare the
439 * bitmap's UUID and event counter to the mddev's
440 */
441 if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
442 printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n",
443 bmname(bitmap));
444 goto out;
445 }
446 events = le64_to_cpu(sb->events);
447 if (events < bitmap->mddev->events) {
448 printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) "
449 "-- forcing full recovery\n", bmname(bitmap), events,
450 (unsigned long long) bitmap->mddev->events);
451 sb->state |= BITMAP_STALE;
452 }
453success:
454 /* assign fields using values from superblock */
455 bitmap->chunksize = chunksize;
456 bitmap->daemon_sleep = daemon_sleep;
457 bitmap->flags |= sb->state;
458 bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
459 err = 0;
460out:
461 kunmap(bitmap->sb_page);
462 if (err)
463 bitmap_print_sb(bitmap);
464 return err;
465}
466
/* operation selector for bitmap_mask_state() */
enum bitmap_mask_op {
	MASK_SET = 0,	/* OR the given bits into the superblock state */
	MASK_UNSET = 1	/* clear the given bits from the superblock state */
};
471
472/* record the state of the bitmap in the superblock */
473static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
474 enum bitmap_mask_op op)
475{
476 bitmap_super_t *sb;
477 unsigned long flags;
478
479 spin_lock_irqsave(&bitmap->lock, flags);
480 if (!bitmap || !bitmap->sb_page) { /* can't set the state */
481 spin_unlock_irqrestore(&bitmap->lock, flags);
482 return;
483 }
484 page_cache_get(bitmap->sb_page);
485 spin_unlock_irqrestore(&bitmap->lock, flags);
486 sb = (bitmap_super_t *)kmap(bitmap->sb_page);
487 switch (op) {
488 case MASK_SET: sb->state |= bits;
489 break;
490 case MASK_UNSET: sb->state &= ~bits;
491 break;
492 default: BUG();
493 }
494 kunmap(bitmap->sb_page);
495 page_cache_release(bitmap->sb_page);
496}
497
498/*
499 * general bitmap file operations
500 */
501
502/* calculate the index of the page that contains this bit */
503static inline unsigned long file_page_index(unsigned long chunk)
504{
505 return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT;
506}
507
508/* calculate the (bit) offset of this bit within a page */
509static inline unsigned long file_page_offset(unsigned long chunk)
510{
511 return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1);
512}
513
514/*
515 * return a pointer to the page in the filemap that contains the given bit
516 *
517 * this lookup is complicated by the fact that the bitmap sb might be exactly
518 * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page
519 * 0 or page 1
520 */
521static inline struct page *filemap_get_page(struct bitmap *bitmap,
522 unsigned long chunk)
523{
524 return bitmap->filemap[file_page_index(chunk) - file_page_index(0)];
525}
526
527
528static void bitmap_file_unmap(struct bitmap *bitmap)
529{
530 struct page **map, *sb_page;
531 unsigned long *attr;
532 int pages;
533 unsigned long flags;
534
535 spin_lock_irqsave(&bitmap->lock, flags);
536 map = bitmap->filemap;
537 bitmap->filemap = NULL;
538 attr = bitmap->filemap_attr;
539 bitmap->filemap_attr = NULL;
540 pages = bitmap->file_pages;
541 bitmap->file_pages = 0;
542 sb_page = bitmap->sb_page;
543 bitmap->sb_page = NULL;
544 spin_unlock_irqrestore(&bitmap->lock, flags);
545
546 while (pages--)
547 if (map[pages]->index != 0) /* 0 is sb_page, release it below */
548 page_cache_release(map[pages]);
549 kfree(map);
550 kfree(attr);
551
552 if (sb_page)
553 page_cache_release(sb_page);
554}
555
556static void bitmap_stop_daemons(struct bitmap *bitmap);
557
558/* dequeue the next item in a page list -- don't call from irq context */
559static struct page_list *dequeue_page(struct bitmap *bitmap,
560 struct list_head *head)
561{
562 struct page_list *item = NULL;
563
564 spin_lock(&bitmap->write_lock);
565 if (list_empty(head))
566 goto out;
567 item = list_entry(head->prev, struct page_list, list);
568 list_del(head->prev);
569out:
570 spin_unlock(&bitmap->write_lock);
571 return item;
572}
573
574static void drain_write_queues(struct bitmap *bitmap)
575{
576 struct list_head *queues[] = { &bitmap->complete_pages, NULL };
577 struct list_head *head;
578 struct page_list *item;
579 int i;
580
581 for (i = 0; queues[i]; i++) {
582 head = queues[i];
583 while ((item = dequeue_page(bitmap, head))) {
584 page_cache_release(item->page);
585 mempool_free(item, bitmap->write_pool);
586 }
587 }
588
589 spin_lock(&bitmap->write_lock);
590 bitmap->writes_pending = 0; /* make sure waiters continue */
591 wake_up(&bitmap->write_wait);
592 spin_unlock(&bitmap->write_lock);
593}
594
595static void bitmap_file_put(struct bitmap *bitmap)
596{
597 struct file *file;
598 struct inode *inode;
599 unsigned long flags;
600
601 spin_lock_irqsave(&bitmap->lock, flags);
602 file = bitmap->file;
603 bitmap->file = NULL;
604 spin_unlock_irqrestore(&bitmap->lock, flags);
605
606 bitmap_stop_daemons(bitmap);
607
608 drain_write_queues(bitmap);
609
610 bitmap_file_unmap(bitmap);
611
612 if (file) {
613 inode = file->f_mapping->host;
614 spin_lock(&inode->i_lock);
615 atomic_set(&inode->i_writecount, 1); /* allow writes again */
616 spin_unlock(&inode->i_lock);
617 fput(file);
618 }
619}
620
621
622/*
623 * bitmap_file_kick - if an error occurs while manipulating the bitmap file
624 * then it is no longer reliable, so we stop using it and we mark the file
625 * as failed in the superblock
626 */
627static void bitmap_file_kick(struct bitmap *bitmap)
628{
629 char *path, *ptr = NULL;
630
631 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET);
632 bitmap_update_sb(bitmap);
633
634 path = kmalloc(PAGE_SIZE, GFP_KERNEL);
635 if (path)
636 ptr = file_path(bitmap->file, path, PAGE_SIZE);
637
638 printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n",
639 bmname(bitmap), ptr ? ptr : "");
640
641 kfree(path);
642
643 bitmap_file_put(bitmap);
644
645 return;
646}
647
/* per-page attribute bits kept in bitmap->filemap_attr[] */
enum bitmap_page_attr {
	/* there are set bits that need to be synced */
	BITMAP_PAGE_DIRTY = 1,
	/* there are bits that might need to be cleared */
	BITMAP_PAGE_CLEAN = 2,
	/* there are cleared bits that need to be synced */
	BITMAP_PAGE_NEEDWRITE = 4,
};
653
654static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
655 enum bitmap_page_attr attr)
656{
657 bitmap->filemap_attr[page->index] |= attr;
658}
659
660static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
661 enum bitmap_page_attr attr)
662{
663 bitmap->filemap_attr[page->index] &= ~attr;
664}
665
666static inline unsigned long get_page_attr(struct bitmap *bitmap, struct page *page)
667{
668 return bitmap->filemap_attr[page->index];
669}
670
671/*
672 * bitmap_file_set_bit -- called before performing a write to the md device
673 * to set (and eventually sync) a particular bit in the bitmap file
674 *
675 * we set the bit immediately, then we record the page number so that
676 * when an unplug occurs, we can flush the dirty pages out to disk
677 */
678static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
679{
680 unsigned long bit;
681 struct page *page;
682 void *kaddr;
683 unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
684
685 if (!bitmap->file || !bitmap->filemap) {
686 return;
687 }
688
689 page = filemap_get_page(bitmap, chunk);
690 bit = file_page_offset(chunk);
691
692
693 /* make sure the page stays cached until it gets written out */
694 if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY))
695 page_cache_get(page);
696
697 /* set the bit */
698 kaddr = kmap_atomic(page, KM_USER0);
699 set_bit(bit, kaddr);
700 kunmap_atomic(kaddr, KM_USER0);
701 PRINTK("set file bit %lu page %lu\n", bit, page->index);
702
703 /* record page number so it gets flushed to disk when unplug occurs */
704 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
705
706}
707
708/* this gets called when the md device is ready to unplug its underlying
709 * (slave) device queues -- before we let any writes go down, we need to
710 * sync the dirty pages of the bitmap file to disk */
711int bitmap_unplug(struct bitmap *bitmap)
712{
713 unsigned long i, attr, flags;
714 struct page *page;
715 int wait = 0;
716
717 if (!bitmap)
718 return 0;
719
720 /* look at each page to see if there are any set bits that need to be
721 * flushed out to disk */
722 for (i = 0; i < bitmap->file_pages; i++) {
723 spin_lock_irqsave(&bitmap->lock, flags);
724 if (!bitmap->file || !bitmap->filemap) {
725 spin_unlock_irqrestore(&bitmap->lock, flags);
726 return 0;
727 }
728 page = bitmap->filemap[i];
729 attr = get_page_attr(bitmap, page);
730 clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
731 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
732 if ((attr & BITMAP_PAGE_DIRTY))
733 wait = 1;
734 spin_unlock_irqrestore(&bitmap->lock, flags);
735
736 if (attr & (BITMAP_PAGE_DIRTY | BITMAP_PAGE_NEEDWRITE))
737 write_page(page, 0);
738 }
739 if (wait) { /* if any writes were performed, we need to wait on them */
740 spin_lock_irq(&bitmap->write_lock);
741 wait_event_lock_irq(bitmap->write_wait,
742 bitmap->writes_pending == 0, bitmap->write_lock,
743 wake_up_process(bitmap->writeback_daemon->tsk));
744 spin_unlock_irq(&bitmap->write_lock);
745 }
746 return 0;
747}
748
749static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset,
750 unsigned long sectors, int set);
751/* * bitmap_init_from_disk -- called at bitmap_create time to initialize
752 * the in-memory bitmap from the on-disk bitmap -- also, sets up the
753 * memory mapping of the bitmap file
754 * Special cases:
755 * if there's no bitmap file, or if the bitmap file had been
756 * previously kicked from the array, we mark all the bits as
757 * 1's in order to cause a full resync.
758 */
759static int bitmap_init_from_disk(struct bitmap *bitmap)
760{
761 unsigned long i, chunks, index, oldindex, bit;
762 struct page *page = NULL, *oldpage = NULL;
763 unsigned long num_pages, bit_cnt = 0;
764 struct file *file;
765 unsigned long bytes, offset, dummy;
766 int outofdate;
767 int ret = -ENOSPC;
768
769 chunks = bitmap->chunks;
770 file = bitmap->file;
771
772 if (!file) { /* no file, dirty all the in-memory bits */
773 printk(KERN_INFO "%s: no bitmap file, doing full recovery\n",
774 bmname(bitmap));
775 bitmap_set_memory_bits(bitmap, 0,
776 chunks << CHUNK_BLOCK_SHIFT(bitmap), 1);
777 return 0;
778 }
779
780#if INJECT_FAULTS_3
781 outofdate = 1;
782#else
783 outofdate = bitmap->flags & BITMAP_STALE;
784#endif
785 if (outofdate)
786 printk(KERN_INFO "%s: bitmap file is out of date, doing full "
787 "recovery\n", bmname(bitmap));
788
789 bytes = (chunks + 7) / 8;
790 num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
791 if (i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) {
792 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
793 bmname(bitmap),
794 (unsigned long) i_size_read(file->f_mapping->host),
795 bytes + sizeof(bitmap_super_t));
796 goto out;
797 }
798 num_pages++;
799 bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
800 if (!bitmap->filemap) {
801 ret = -ENOMEM;
802 goto out;
803 }
804
805 bitmap->filemap_attr = kmalloc(sizeof(long) * num_pages, GFP_KERNEL);
806 if (!bitmap->filemap_attr) {
807 ret = -ENOMEM;
808 goto out;
809 }
810
811 memset(bitmap->filemap_attr, 0, sizeof(long) * num_pages);
812
813 oldindex = ~0L;
814
815 for (i = 0; i < chunks; i++) {
816 index = file_page_index(i);
817 bit = file_page_offset(i);
818 if (index != oldindex) { /* this is a new page, read it in */
819 /* unmap the old page, we're done with it */
820 if (oldpage != NULL)
821 kunmap(oldpage);
822 if (index == 0) {
823 /*
824 * if we're here then the superblock page
825 * contains some bits (PAGE_SIZE != sizeof sb)
826 * we've already read it in, so just use it
827 */
828 page = bitmap->sb_page;
829 offset = sizeof(bitmap_super_t);
830 } else {
831 page = read_page(file, index, &dummy);
832 if (IS_ERR(page)) { /* read error */
833 ret = PTR_ERR(page);
834 goto out;
835 }
836 offset = 0;
837 }
838 oldindex = index;
839 oldpage = page;
840 kmap(page);
841
842 if (outofdate) {
843 /*
844 * if bitmap is out of date, dirty the
845 * whole page and write it out
846 */
847 memset(page_address(page) + offset, 0xff,
848 PAGE_SIZE - offset);
849 ret = write_page(page, 1);
850 if (ret) {
851 kunmap(page);
852 /* release, page not in filemap yet */
853 page_cache_release(page);
854 goto out;
855 }
856 }
857
858 bitmap->filemap[bitmap->file_pages++] = page;
859 }
860 if (test_bit(bit, page_address(page))) {
861 /* if the disk bit is set, set the memory bit */
862 bitmap_set_memory_bits(bitmap,
863 i << CHUNK_BLOCK_SHIFT(bitmap), 1, 1);
864 bit_cnt++;
865 }
866#if 0
867 else
868 bitmap_set_memory_bits(bitmap,
869 i << CHUNK_BLOCK_SHIFT(bitmap), 1, 0);
870#endif
871 }
872
873 /* everything went OK */
874 ret = 0;
875 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
876
877 if (page) /* unmap the last page */
878 kunmap(page);
879
880 if (bit_cnt) { /* Kick recovery if any bits were set */
881 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
882 md_wakeup_thread(bitmap->mddev->thread);
883 }
884
885out:
886 printk(KERN_INFO "%s: bitmap initialized from disk: "
887 "read %lu/%lu pages, set %lu bits, status: %d\n",
888 bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, ret);
889
890 return ret;
891}
892
893
894static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
895{
896 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
897 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
898 bitmap->bp[page].count += inc;
899/*
900 if (page == 0) printk("count page 0, offset %llu: %d gives %d\n",
901 (unsigned long long)offset, inc, bitmap->bp[page].count);
902*/
903 bitmap_checkfree(bitmap, page);
904}
905static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
906 sector_t offset, int *blocks,
907 int create);
908
909/*
910 * bitmap daemon -- periodically wakes up to clean bits and flush pages
911 * out to disk
912 */
913
914int bitmap_daemon_work(struct bitmap *bitmap)
915{
916 unsigned long bit, j;
917 unsigned long flags;
918 struct page *page = NULL, *lastpage = NULL;
919 int err = 0;
920 int blocks;
921 int attr;
922
923 if (bitmap == NULL)
924 return 0;
925 if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ))
926 return 0;
927 bitmap->daemon_lastrun = jiffies;
928
929 for (j = 0; j < bitmap->chunks; j++) {
930 bitmap_counter_t *bmc;
931 spin_lock_irqsave(&bitmap->lock, flags);
932 if (!bitmap->file || !bitmap->filemap) {
933 /* error or shutdown */
934 spin_unlock_irqrestore(&bitmap->lock, flags);
935 break;
936 }
937
938 page = filemap_get_page(bitmap, j);
939 /* skip this page unless it's marked as needing cleaning */
940 if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) {
941 if (attr & BITMAP_PAGE_NEEDWRITE) {
942 page_cache_get(page);
943 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
944 }
945 spin_unlock_irqrestore(&bitmap->lock, flags);
946 if (attr & BITMAP_PAGE_NEEDWRITE) {
947 if (write_page(page, 0))
948 bitmap_file_kick(bitmap);
949 page_cache_release(page);
950 }
951 continue;
952 }
953
954 bit = file_page_offset(j);
955
956 if (page != lastpage) {
957 /* grab the new page, sync and release the old */
958 page_cache_get(page);
959 if (lastpage != NULL) {
960 if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) {
961 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
962 spin_unlock_irqrestore(&bitmap->lock, flags);
963 write_page(lastpage, 0);
964 } else {
965 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
966 spin_unlock_irqrestore(&bitmap->lock, flags);
967 }
968 kunmap(lastpage);
969 page_cache_release(lastpage);
970 if (err)
971 bitmap_file_kick(bitmap);
972 } else
973 spin_unlock_irqrestore(&bitmap->lock, flags);
974 lastpage = page;
975 kmap(page);
976/*
977 printk("bitmap clean at page %lu\n", j);
978*/
979 spin_lock_irqsave(&bitmap->lock, flags);
980 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
981 }
982 bmc = bitmap_get_counter(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
983 &blocks, 0);
984 if (bmc) {
985/*
986 if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc);
987*/
988 if (*bmc == 2) {
989 *bmc=1; /* maybe clear the bit next time */
990 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
991 } else if (*bmc == 1) {
992 /* we can clear the bit */
993 *bmc = 0;
994 bitmap_count_page(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
995 -1);
996
997 /* clear the bit */
998 clear_bit(bit, page_address(page));
999 }
1000 }
1001 spin_unlock_irqrestore(&bitmap->lock, flags);
1002 }
1003
1004 /* now sync the final page */
1005 if (lastpage != NULL) {
1006 kunmap(lastpage);
1007 spin_lock_irqsave(&bitmap->lock, flags);
1008 if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) {
1009 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1010 spin_unlock_irqrestore(&bitmap->lock, flags);
1011 write_page(lastpage, 0);
1012 } else {
1013 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1014 spin_unlock_irqrestore(&bitmap->lock, flags);
1015 }
1016
1017 page_cache_release(lastpage);
1018 }
1019
1020 return err;
1021}
1022
1023static void daemon_exit(struct bitmap *bitmap, mdk_thread_t **daemon)
1024{
1025 mdk_thread_t *dmn;
1026 unsigned long flags;
1027
1028 /* if no one is waiting on us, we'll free the md thread struct
1029 * and exit, otherwise we let the waiter clean things up */
1030 spin_lock_irqsave(&bitmap->lock, flags);
1031 if ((dmn = *daemon)) { /* no one is waiting, cleanup and exit */
1032 *daemon = NULL;
1033 spin_unlock_irqrestore(&bitmap->lock, flags);
1034 kfree(dmn);
1035 complete_and_exit(NULL, 0); /* do_exit not exported */
1036 }
1037 spin_unlock_irqrestore(&bitmap->lock, flags);
1038}
1039
1040static void bitmap_writeback_daemon(mddev_t *mddev)
1041{
1042 struct bitmap *bitmap = mddev->bitmap;
1043 struct page *page;
1044 struct page_list *item;
1045 int err = 0;
1046
1047 while (1) {
1048 PRINTK("%s: bitmap writeback daemon waiting...\n", bmname(bitmap));
1049 down_interruptible(&bitmap->write_done);
1050 if (signal_pending(current)) {
1051 printk(KERN_INFO
1052 "%s: bitmap writeback daemon got signal, exiting...\n",
1053 bmname(bitmap));
1054 break;
1055 }
1056
1057 PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap));
1058 /* wait on bitmap page writebacks */
1059 while ((item = dequeue_page(bitmap, &bitmap->complete_pages))) {
1060 page = item->page;
1061 mempool_free(item, bitmap->write_pool);
1062 PRINTK("wait on page writeback: %p %lu\n", page, bitmap->writes_pending);
1063 wait_on_page_writeback(page);
1064 PRINTK("finished page writeback: %p %lu\n", page, bitmap->writes_pending);
1065 spin_lock(&bitmap->write_lock);
1066 if (!--bitmap->writes_pending)
1067 wake_up(&bitmap->write_wait);
1068 spin_unlock(&bitmap->write_lock);
1069 err = PageError(page);
1070 page_cache_release(page);
1071 if (err) {
1072 printk(KERN_WARNING "%s: bitmap file writeback "
1073 "failed (page %lu): %d\n",
1074 bmname(bitmap), page->index, err);
1075 bitmap_file_kick(bitmap);
1076 goto out;
1077 }
1078 }
1079 }
1080out:
1081 if (err) {
1082 printk(KERN_INFO "%s: bitmap writeback daemon exiting (%d)\n",
1083 bmname(bitmap), err);
1084 daemon_exit(bitmap, &bitmap->writeback_daemon);
1085 }
1086 return;
1087}
1088
1089static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr,
1090 void (*func)(mddev_t *), char *name)
1091{
1092 mdk_thread_t *daemon;
1093 unsigned long flags;
1094 char namebuf[32];
1095
1096 spin_lock_irqsave(&bitmap->lock, flags);
1097 *ptr = NULL;
1098 if (!bitmap->file) /* no need for daemon if there's no backing file */
1099 goto out_unlock;
1100
1101 spin_unlock_irqrestore(&bitmap->lock, flags);
1102
1103#if INJECT_FATAL_FAULT_2
1104 daemon = NULL;
1105#else
1106 sprintf(namebuf, "%%s_%s", name);
1107 daemon = md_register_thread(func, bitmap->mddev, namebuf);
1108#endif
1109 if (!daemon) {
1110 printk(KERN_ERR "%s: failed to start bitmap daemon\n",
1111 bmname(bitmap));
1112 return -ECHILD;
1113 }
1114
1115 spin_lock_irqsave(&bitmap->lock, flags);
1116 *ptr = daemon;
1117
1118 md_wakeup_thread(daemon); /* start it running */
1119
1120 PRINTK("%s: %s daemon (pid %d) started...\n",
1121 bmname(bitmap), name, bitmap->daemon->tsk->pid);
1122out_unlock:
1123 spin_unlock_irqrestore(&bitmap->lock, flags);
1124 return 0;
1125}
1126
1127static int bitmap_start_daemons(struct bitmap *bitmap)
1128{
1129 int err = bitmap_start_daemon(bitmap, &bitmap->writeback_daemon,
1130 bitmap_writeback_daemon, "bitmap_wb");
1131 return err;
1132}
1133
1134static void bitmap_stop_daemon(struct bitmap *bitmap, mdk_thread_t **ptr)
1135{
1136 mdk_thread_t *daemon;
1137 unsigned long flags;
1138
1139 spin_lock_irqsave(&bitmap->lock, flags);
1140 daemon = *ptr;
1141 *ptr = NULL;
1142 spin_unlock_irqrestore(&bitmap->lock, flags);
1143 if (daemon)
1144 md_unregister_thread(daemon); /* destroy the thread */
1145}
1146
1147static void bitmap_stop_daemons(struct bitmap *bitmap)
1148{
1149 /* the daemons can't stop themselves... they'll just exit instead... */
1150 if (bitmap->writeback_daemon &&
1151 current->pid != bitmap->writeback_daemon->tsk->pid)
1152 bitmap_stop_daemon(bitmap, &bitmap->writeback_daemon);
1153}
1154
1155static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1156 sector_t offset, int *blocks,
1157 int create)
1158{
1159 /* If 'create', we might release the lock and reclaim it.
1160 * The lock must have been taken with interrupts enabled.
1161 * If !create, we don't release the lock.
1162 */
1163 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
1164 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1165 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
1166 sector_t csize;
1167
1168 if (bitmap_checkpage(bitmap, page, create) < 0) {
1169 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
1170 *blocks = csize - (offset & (csize- 1));
1171 return NULL;
1172 }
1173 /* now locked ... */
1174
1175 if (bitmap->bp[page].hijacked) { /* hijacked pointer */
1176 /* should we use the first or second counter field
1177 * of the hijacked pointer? */
1178 int hi = (pageoff > PAGE_COUNTER_MASK);
1179 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) +
1180 PAGE_COUNTER_SHIFT - 1);
1181 *blocks = csize - (offset & (csize- 1));
1182 return &((bitmap_counter_t *)
1183 &bitmap->bp[page].map)[hi];
1184 } else { /* page is allocated */
1185 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
1186 *blocks = csize - (offset & (csize- 1));
1187 return (bitmap_counter_t *)
1188 &(bitmap->bp[page].map[pageoff]);
1189 }
1190}
1191
1192int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors)
1193{
1194 if (!bitmap) return 0;
1195 while (sectors) {
1196 int blocks;
1197 bitmap_counter_t *bmc;
1198
1199 spin_lock_irq(&bitmap->lock);
1200 bmc = bitmap_get_counter(bitmap, offset, &blocks, 1);
1201 if (!bmc) {
1202 spin_unlock_irq(&bitmap->lock);
1203 return 0;
1204 }
1205
1206 switch(*bmc) {
1207 case 0:
1208 bitmap_file_set_bit(bitmap, offset);
1209 bitmap_count_page(bitmap,offset, 1);
1210 blk_plug_device(bitmap->mddev->queue);
1211 /* fall through */
1212 case 1:
1213 *bmc = 2;
1214 }
1215 if ((*bmc & COUNTER_MAX) == COUNTER_MAX) BUG();
1216 (*bmc)++;
1217
1218 spin_unlock_irq(&bitmap->lock);
1219
1220 offset += blocks;
1221 if (sectors > blocks)
1222 sectors -= blocks;
1223 else sectors = 0;
1224 }
1225 return 0;
1226}
1227
1228void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
1229 int success)
1230{
1231 if (!bitmap) return;
1232 while (sectors) {
1233 int blocks;
1234 unsigned long flags;
1235 bitmap_counter_t *bmc;
1236
1237 spin_lock_irqsave(&bitmap->lock, flags);
1238 bmc = bitmap_get_counter(bitmap, offset, &blocks, 0);
1239 if (!bmc) {
1240 spin_unlock_irqrestore(&bitmap->lock, flags);
1241 return;
1242 }
1243
1244 if (!success && ! (*bmc & NEEDED_MASK))
1245 *bmc |= NEEDED_MASK;
1246
1247 (*bmc)--;
1248 if (*bmc <= 2) {
1249 set_page_attr(bitmap,
1250 filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
1251 BITMAP_PAGE_CLEAN);
1252 }
1253 spin_unlock_irqrestore(&bitmap->lock, flags);
1254 offset += blocks;
1255 if (sectors > blocks)
1256 sectors -= blocks;
1257 else sectors = 0;
1258 }
1259}
1260
1261int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks)
1262{
1263 bitmap_counter_t *bmc;
1264 int rv;
1265 if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
1266 *blocks = 1024;
1267 return 1; /* always resync if no bitmap */
1268 }
1269 spin_lock_irq(&bitmap->lock);
1270 bmc = bitmap_get_counter(bitmap, offset, blocks, 0);
1271 rv = 0;
1272 if (bmc) {
1273 /* locked */
1274 if (RESYNC(*bmc))
1275 rv = 1;
1276 else if (NEEDED(*bmc)) {
1277 rv = 1;
1278 *bmc |= RESYNC_MASK;
1279 *bmc &= ~NEEDED_MASK;
1280 }
1281 }
1282 spin_unlock_irq(&bitmap->lock);
1283 return rv;
1284}
1285
1286void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted)
1287{
1288 bitmap_counter_t *bmc;
1289 unsigned long flags;
1290/*
1291 if (offset == 0) printk("bitmap_end_sync 0 (%d)\n", aborted);
1292*/ if (bitmap == NULL) {
1293 *blocks = 1024;
1294 return;
1295 }
1296 spin_lock_irqsave(&bitmap->lock, flags);
1297 bmc = bitmap_get_counter(bitmap, offset, blocks, 0);
1298 if (bmc == NULL)
1299 goto unlock;
1300 /* locked */
1301/*
1302 if (offset == 0) printk("bitmap_end sync found 0x%x, blocks %d\n", *bmc, *blocks);
1303*/
1304 if (RESYNC(*bmc)) {
1305 *bmc &= ~RESYNC_MASK;
1306
1307 if (!NEEDED(*bmc) && aborted)
1308 *bmc |= NEEDED_MASK;
1309 else {
1310 if (*bmc <= 2) {
1311 set_page_attr(bitmap,
1312 filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
1313 BITMAP_PAGE_CLEAN);
1314 }
1315 }
1316 }
1317 unlock:
1318 spin_unlock_irqrestore(&bitmap->lock, flags);
1319}
1320
1321void bitmap_close_sync(struct bitmap *bitmap)
1322{
1323 /* Sync has finished, and any bitmap chunks that weren't synced
1324 * properly have been aborted. It remains to us to clear the
1325 * RESYNC bit wherever it is still on
1326 */
1327 sector_t sector = 0;
1328 int blocks;
1329 if (!bitmap) return;
1330 while (sector < bitmap->mddev->resync_max_sectors) {
1331 bitmap_end_sync(bitmap, sector, &blocks, 0);
1332/*
1333 if (sector < 500) printk("bitmap_close_sync: sec %llu blks %d\n",
1334 (unsigned long long)sector, blocks);
1335*/ sector += blocks;
1336 }
1337}
1338
1339static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset,
1340 unsigned long sectors, int set)
1341{
1342 /* For each chunk covered by any of these sectors, set the
1343 * resync needed bit, and the counter to 1. They should all
1344 * be 0 at this point
1345 */
1346 while (sectors) {
1347 int secs;
1348 bitmap_counter_t *bmc;
1349 spin_lock_irq(&bitmap->lock);
1350 bmc = bitmap_get_counter(bitmap, offset, &secs, 1);
1351 if (!bmc) {
1352 spin_unlock_irq(&bitmap->lock);
1353 return;
1354 }
1355 if (set && !NEEDED(*bmc)) {
1356 BUG_ON(*bmc);
1357 *bmc = NEEDED_MASK | 1;
1358 bitmap_count_page(bitmap, offset, 1);
1359 }
1360 spin_unlock_irq(&bitmap->lock);
1361 if (sectors > secs)
1362 sectors -= secs;
1363 else
1364 sectors = 0;
1365 }
1366}
1367
1368/* dirty the entire bitmap */
1369int bitmap_setallbits(struct bitmap *bitmap)
1370{
1371 unsigned long flags;
1372 unsigned long j;
1373
1374 /* dirty the in-memory bitmap */
1375 bitmap_set_memory_bits(bitmap, 0, bitmap->chunks << CHUNK_BLOCK_SHIFT(bitmap), 1);
1376
1377 /* dirty the bitmap file */
1378 for (j = 0; j < bitmap->file_pages; j++) {
1379 struct page *page = bitmap->filemap[j];
1380
1381 spin_lock_irqsave(&bitmap->lock, flags);
1382 page_cache_get(page);
1383 spin_unlock_irqrestore(&bitmap->lock, flags);
1384 memset(kmap(page), 0xff, PAGE_SIZE);
1385 kunmap(page);
1386 write_page(page, 0);
1387 }
1388
1389 return 0;
1390}
1391
1392/*
1393 * free memory that was allocated
1394 */
1395void bitmap_destroy(mddev_t *mddev)
1396{
1397 unsigned long k, pages;
1398 struct bitmap_page *bp;
1399 struct bitmap *bitmap = mddev->bitmap;
1400
1401 if (!bitmap) /* there was no bitmap */
1402 return;
1403
1404 mddev->bitmap = NULL; /* disconnect from the md device */
1405
1406 /* release the bitmap file and kill the daemon */
1407 bitmap_file_put(bitmap);
1408
1409 bp = bitmap->bp;
1410 pages = bitmap->pages;
1411
1412 /* free all allocated memory */
1413
1414 mempool_destroy(bitmap->write_pool);
1415
1416 if (bp) /* deallocate the page memory */
1417 for (k = 0; k < pages; k++)
1418 if (bp[k].map && !bp[k].hijacked)
1419 kfree(bp[k].map);
1420 kfree(bp);
1421 kfree(bitmap);
1422}
1423
1424/*
1425 * initialize the bitmap structure
1426 * if this returns an error, bitmap_destroy must be called to do clean up
1427 */
1428int bitmap_create(mddev_t *mddev)
1429{
1430 struct bitmap *bitmap;
1431 unsigned long blocks = mddev->resync_max_sectors;
1432 unsigned long chunks;
1433 unsigned long pages;
1434 struct file *file = mddev->bitmap_file;
1435 int err;
1436
1437 BUG_ON(sizeof(bitmap_super_t) != 256);
1438
1439 if (!file) /* bitmap disabled, nothing to do */
1440 return 0;
1441
1442 bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL);
1443 if (!bitmap)
1444 return -ENOMEM;
1445
1446 memset(bitmap, 0, sizeof(*bitmap));
1447
1448 spin_lock_init(&bitmap->lock);
1449 bitmap->mddev = mddev;
1450 mddev->bitmap = bitmap;
1451
1452 spin_lock_init(&bitmap->write_lock);
1453 init_MUTEX_LOCKED(&bitmap->write_done);
1454 INIT_LIST_HEAD(&bitmap->complete_pages);
1455 init_waitqueue_head(&bitmap->write_wait);
1456 bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc,
1457 write_pool_free, NULL);
1458 if (!bitmap->write_pool)
1459 return -ENOMEM;
1460
1461 bitmap->file = file;
1462 get_file(file);
1463 /* read superblock from bitmap file (this sets bitmap->chunksize) */
1464 err = bitmap_read_sb(bitmap);
1465 if (err)
1466 return err;
1467
1468 bitmap->chunkshift = find_first_bit(&bitmap->chunksize,
1469 sizeof(bitmap->chunksize));
1470
1471 /* now that chunksize and chunkshift are set, we can use these macros */
1472 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) /
1473 CHUNK_BLOCK_RATIO(bitmap);
1474 pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
1475
1476 BUG_ON(!pages);
1477
1478 bitmap->chunks = chunks;
1479 bitmap->pages = pages;
1480 bitmap->missing_pages = pages;
1481 bitmap->counter_bits = COUNTER_BITS;
1482
1483 bitmap->syncchunk = ~0UL;
1484
1485#if INJECT_FATAL_FAULT_1
1486 bitmap->bp = NULL;
1487#else
1488 bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
1489#endif
1490 if (!bitmap->bp)
1491 return -ENOMEM;
1492 memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp));
1493
1494 bitmap->flags |= BITMAP_ACTIVE;
1495
1496 /* now that we have some pages available, initialize the in-memory
1497 * bitmap from the on-disk bitmap */
1498 err = bitmap_init_from_disk(bitmap);
1499 if (err)
1500 return err;
1501
1502 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
1503 pages, bmname(bitmap));
1504
1505 /* kick off the bitmap daemons */
1506 err = bitmap_start_daemons(bitmap);
1507 if (err)
1508 return err;
1509 return bitmap_update_sb(bitmap);
1510}
1511
1512/* the bitmap API -- for raid personalities */
1513EXPORT_SYMBOL(bitmap_startwrite);
1514EXPORT_SYMBOL(bitmap_endwrite);
1515EXPORT_SYMBOL(bitmap_start_sync);
1516EXPORT_SYMBOL(bitmap_end_sync);
1517EXPORT_SYMBOL(bitmap_unplug);
1518EXPORT_SYMBOL(bitmap_close_sync);
1519EXPORT_SYMBOL(bitmap_daemon_work);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index fa608a1a5c20..c402f6cc7047 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -19,6 +19,9 @@
19 19
20 Neil Brown <neilb@cse.unsw.edu.au>. 20 Neil Brown <neilb@cse.unsw.edu.au>.
21 21
22 - persistent bitmap code
23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
22 This program is free software; you can redistribute it and/or modify 25 This program is free software; you can redistribute it and/or modify
23 it under the terms of the GNU General Public License as published by 26 it under the terms of the GNU General Public License as published by
24 the Free Software Foundation; either version 2, or (at your option) 27 the Free Software Foundation; either version 2, or (at your option)
@@ -33,6 +36,7 @@
33#include <linux/config.h> 36#include <linux/config.h>
34#include <linux/linkage.h> 37#include <linux/linkage.h>
35#include <linux/raid/md.h> 38#include <linux/raid/md.h>
39#include <linux/raid/bitmap.h>
36#include <linux/sysctl.h> 40#include <linux/sysctl.h>
37#include <linux/devfs_fs_kernel.h> 41#include <linux/devfs_fs_kernel.h>
38#include <linux/buffer_head.h> /* for invalidate_bdev */ 42#include <linux/buffer_head.h> /* for invalidate_bdev */
@@ -40,6 +44,8 @@
40 44
41#include <linux/init.h> 45#include <linux/init.h>
42 46
47#include <linux/file.h>
48
43#ifdef CONFIG_KMOD 49#ifdef CONFIG_KMOD
44#include <linux/kmod.h> 50#include <linux/kmod.h>
45#endif 51#endif
@@ -1198,8 +1204,11 @@ void md_print_devices(void)
1198 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1204 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
1199 printk("md: **********************************\n"); 1205 printk("md: **********************************\n");
1200 ITERATE_MDDEV(mddev,tmp) { 1206 ITERATE_MDDEV(mddev,tmp) {
1201 printk("%s: ", mdname(mddev));
1202 1207
1208 if (mddev->bitmap)
1209 bitmap_print_sb(mddev->bitmap);
1210 else
1211 printk("%s: ", mdname(mddev));
1203 ITERATE_RDEV(mddev,rdev,tmp2) 1212 ITERATE_RDEV(mddev,rdev,tmp2)
1204 printk("<%s>", bdevname(rdev->bdev,b)); 1213 printk("<%s>", bdevname(rdev->bdev,b));
1205 printk("\n"); 1214 printk("\n");
@@ -1287,7 +1296,7 @@ repeat:
1287 "md: updating %s RAID superblock on device (in sync %d)\n", 1296 "md: updating %s RAID superblock on device (in sync %d)\n",
1288 mdname(mddev),mddev->in_sync); 1297 mdname(mddev),mddev->in_sync);
1289 1298
1290 err = 0; 1299 err = bitmap_update_sb(mddev->bitmap);
1291 ITERATE_RDEV(mddev,rdev,tmp) { 1300 ITERATE_RDEV(mddev,rdev,tmp) {
1292 char b[BDEVNAME_SIZE]; 1301 char b[BDEVNAME_SIZE];
1293 dprintk(KERN_INFO "md: "); 1302 dprintk(KERN_INFO "md: ");
@@ -1624,12 +1633,19 @@ static int do_md_run(mddev_t * mddev)
1624 1633
1625 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 1634 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
1626 1635
1627 err = mddev->pers->run(mddev); 1636 /* before we start the array running, initialise the bitmap */
1637 err = bitmap_create(mddev);
1638 if (err)
1639 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
1640 mdname(mddev), err);
1641 else
1642 err = mddev->pers->run(mddev);
1628 if (err) { 1643 if (err) {
1629 printk(KERN_ERR "md: pers->run() failed ...\n"); 1644 printk(KERN_ERR "md: pers->run() failed ...\n");
1630 module_put(mddev->pers->owner); 1645 module_put(mddev->pers->owner);
1631 mddev->pers = NULL; 1646 mddev->pers = NULL;
1632 return -EINVAL; 1647 bitmap_destroy(mddev);
1648 return err;
1633 } 1649 }
1634 atomic_set(&mddev->writes_pending,0); 1650 atomic_set(&mddev->writes_pending,0);
1635 mddev->safemode = 0; 1651 mddev->safemode = 0;
@@ -1742,6 +1758,14 @@ static int do_md_stop(mddev_t * mddev, int ro)
1742 if (ro) 1758 if (ro)
1743 set_disk_ro(disk, 1); 1759 set_disk_ro(disk, 1);
1744 } 1760 }
1761
1762 bitmap_destroy(mddev);
1763 if (mddev->bitmap_file) {
1764 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1);
1765 fput(mddev->bitmap_file);
1766 mddev->bitmap_file = NULL;
1767 }
1768
1745 /* 1769 /*
1746 * Free resources if final stop 1770 * Free resources if final stop
1747 */ 1771 */
@@ -2000,6 +2024,42 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
2000 return 0; 2024 return 0;
2001} 2025}
2002 2026
2027static int get_bitmap_file(mddev_t * mddev, void * arg)
2028{
2029 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
2030 char *ptr, *buf = NULL;
2031 int err = -ENOMEM;
2032
2033 file = kmalloc(sizeof(*file), GFP_KERNEL);
2034 if (!file)
2035 goto out;
2036
2037 /* bitmap disabled, zero the first byte and copy out */
2038 if (!mddev->bitmap || !mddev->bitmap->file) {
2039 file->pathname[0] = '\0';
2040 goto copy_out;
2041 }
2042
2043 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
2044 if (!buf)
2045 goto out;
2046
2047 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
2048 if (!ptr)
2049 goto out;
2050
2051 strcpy(file->pathname, ptr);
2052
2053copy_out:
2054 err = 0;
2055 if (copy_to_user(arg, file, sizeof(*file)))
2056 err = -EFAULT;
2057out:
2058 kfree(buf);
2059 kfree(file);
2060 return err;
2061}
2062
2003static int get_disk_info(mddev_t * mddev, void __user * arg) 2063static int get_disk_info(mddev_t * mddev, void __user * arg)
2004{ 2064{
2005 mdu_disk_info_t info; 2065 mdu_disk_info_t info;
@@ -2275,6 +2335,48 @@ abort_export:
2275 return err; 2335 return err;
2276} 2336}
2277 2337
2338/* similar to deny_write_access, but accounts for our holding a reference
2339 * to the file ourselves */
2340static int deny_bitmap_write_access(struct file * file)
2341{
2342 struct inode *inode = file->f_mapping->host;
2343
2344 spin_lock(&inode->i_lock);
2345 if (atomic_read(&inode->i_writecount) > 1) {
2346 spin_unlock(&inode->i_lock);
2347 return -ETXTBSY;
2348 }
2349 atomic_set(&inode->i_writecount, -1);
2350 spin_unlock(&inode->i_lock);
2351
2352 return 0;
2353}
2354
2355static int set_bitmap_file(mddev_t *mddev, int fd)
2356{
2357 int err;
2358
2359 if (mddev->pers)
2360 return -EBUSY;
2361
2362 mddev->bitmap_file = fget(fd);
2363
2364 if (mddev->bitmap_file == NULL) {
2365 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
2366 mdname(mddev));
2367 return -EBADF;
2368 }
2369
2370 err = deny_bitmap_write_access(mddev->bitmap_file);
2371 if (err) {
2372 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
2373 mdname(mddev));
2374 fput(mddev->bitmap_file);
2375 mddev->bitmap_file = NULL;
2376 }
2377 return err;
2378}
2379
2278/* 2380/*
2279 * set_array_info is used two different ways 2381 * set_array_info is used two different ways
2280 * The original usage is when creating a new array. 2382 * The original usage is when creating a new array.
@@ -2586,8 +2688,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
2586 /* 2688 /*
2587 * Commands querying/configuring an existing array: 2689 * Commands querying/configuring an existing array:
2588 */ 2690 */
2589 /* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ 2691 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
2590 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { 2692 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
2693 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
2694 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
2591 err = -ENODEV; 2695 err = -ENODEV;
2592 goto abort_unlock; 2696 goto abort_unlock;
2593 } 2697 }
@@ -2601,6 +2705,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
2601 err = get_array_info(mddev, argp); 2705 err = get_array_info(mddev, argp);
2602 goto done_unlock; 2706 goto done_unlock;
2603 2707
2708 case GET_BITMAP_FILE:
2709 err = get_bitmap_file(mddev, (void *)arg);
2710 goto done_unlock;
2711
2604 case GET_DISK_INFO: 2712 case GET_DISK_INFO:
2605 err = get_disk_info(mddev, argp); 2713 err = get_disk_info(mddev, argp);
2606 goto done_unlock; 2714 goto done_unlock;
@@ -2681,6 +2789,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
2681 err = do_md_run (mddev); 2789 err = do_md_run (mddev);
2682 goto done_unlock; 2790 goto done_unlock;
2683 2791
2792 case SET_BITMAP_FILE:
2793 err = set_bitmap_file(mddev, (int)arg);
2794 goto done_unlock;
2795
2684 default: 2796 default:
2685 if (_IOC_TYPE(cmd) == MD_MAJOR) 2797 if (_IOC_TYPE(cmd) == MD_MAJOR)
2686 printk(KERN_WARNING "md: %s(pid %d) used" 2798 printk(KERN_WARNING "md: %s(pid %d) used"
@@ -2792,8 +2904,9 @@ static int md_thread(void * arg)
2792 while (thread->run) { 2904 while (thread->run) {
2793 void (*run)(mddev_t *); 2905 void (*run)(mddev_t *);
2794 2906
2795 wait_event_interruptible(thread->wqueue, 2907 wait_event_interruptible_timeout(thread->wqueue,
2796 test_bit(THREAD_WAKEUP, &thread->flags)); 2908 test_bit(THREAD_WAKEUP, &thread->flags),
2909 thread->timeout);
2797 if (current->flags & PF_FREEZE) 2910 if (current->flags & PF_FREEZE)
2798 refrigerator(PF_FREEZE); 2911 refrigerator(PF_FREEZE);
2799 2912
@@ -2839,6 +2952,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
2839 thread->run = run; 2952 thread->run = run;
2840 thread->mddev = mddev; 2953 thread->mddev = mddev;
2841 thread->name = name; 2954 thread->name = name;
2955 thread->timeout = MAX_SCHEDULE_TIMEOUT;
2842 ret = kernel_thread(md_thread, thread, 0); 2956 ret = kernel_thread(md_thread, thread, 0);
2843 if (ret < 0) { 2957 if (ret < 0) {
2844 kfree(thread); 2958 kfree(thread);
@@ -2877,13 +2991,13 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
2877 2991
2878 if (!rdev || rdev->faulty) 2992 if (!rdev || rdev->faulty)
2879 return; 2993 return;
2880 2994/*
2881 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 2995 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
2882 mdname(mddev), 2996 mdname(mddev),
2883 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 2997 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
2884 __builtin_return_address(0),__builtin_return_address(1), 2998 __builtin_return_address(0),__builtin_return_address(1),
2885 __builtin_return_address(2),__builtin_return_address(3)); 2999 __builtin_return_address(2),__builtin_return_address(3));
2886 3000*/
2887 if (!mddev->pers->error_handler) 3001 if (!mddev->pers->error_handler)
2888 return; 3002 return;
2889 mddev->pers->error_handler(mddev,rdev); 3003 mddev->pers->error_handler(mddev,rdev);
@@ -3037,6 +3151,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
3037 struct list_head *tmp2; 3151 struct list_head *tmp2;
3038 mdk_rdev_t *rdev; 3152 mdk_rdev_t *rdev;
3039 int i; 3153 int i;
3154 struct bitmap *bitmap;
3040 3155
3041 if (v == (void*)1) { 3156 if (v == (void*)1) {
3042 seq_printf(seq, "Personalities : "); 3157 seq_printf(seq, "Personalities : ");
@@ -3089,10 +3204,36 @@ static int md_seq_show(struct seq_file *seq, void *v)
3089 if (mddev->pers) { 3204 if (mddev->pers) {
3090 mddev->pers->status (seq, mddev); 3205 mddev->pers->status (seq, mddev);
3091 seq_printf(seq, "\n "); 3206 seq_printf(seq, "\n ");
3092 if (mddev->curr_resync > 2) 3207 if (mddev->curr_resync > 2) {
3093 status_resync (seq, mddev); 3208 status_resync (seq, mddev);
3094 else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 3209 seq_printf(seq, "\n ");
3095 seq_printf(seq, " resync=DELAYED"); 3210 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
3211 seq_printf(seq, " resync=DELAYED\n ");
3212 } else
3213 seq_printf(seq, "\n ");
3214
3215 if ((bitmap = mddev->bitmap)) {
3216 char *buf, *path;
3217 unsigned long chunk_kb;
3218 unsigned long flags;
3219 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
3220 spin_lock_irqsave(&bitmap->lock, flags);
3221 chunk_kb = bitmap->chunksize >> 10;
3222 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
3223 "%lu%s chunk",
3224 bitmap->pages - bitmap->missing_pages,
3225 bitmap->pages,
3226 (bitmap->pages - bitmap->missing_pages)
3227 << (PAGE_SHIFT - 10),
3228 chunk_kb ? chunk_kb : bitmap->chunksize,
3229 chunk_kb ? "KB" : "B");
3230 if (bitmap->file && buf) {
3231 path = file_path(bitmap->file, buf, PAGE_SIZE);
3232 seq_printf(seq, ", file: %s", path ? path : "");
3233 }
3234 seq_printf(seq, "\n");
3235 spin_unlock_irqrestore(&bitmap->lock, flags);
3236 kfree(buf);
3096 } 3237 }
3097 3238
3098 seq_printf(seq, "\n"); 3239 seq_printf(seq, "\n");
@@ -3328,7 +3469,8 @@ static void md_do_sync(mddev_t *mddev)
3328 sysctl_speed_limit_max); 3469 sysctl_speed_limit_max);
3329 3470
3330 is_mddev_idle(mddev); /* this also initializes IO event counters */ 3471 is_mddev_idle(mddev); /* this also initializes IO event counters */
3331 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3472 /* we don't use the checkpoint if there's a bitmap */
3473 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap)
3332 j = mddev->recovery_cp; 3474 j = mddev->recovery_cp;
3333 else 3475 else
3334 j = 0; 3476 j = 0;
@@ -3673,6 +3815,8 @@ static int __init md_init(void)
3673 " MD_SB_DISKS=%d\n", 3815 " MD_SB_DISKS=%d\n",
3674 MD_MAJOR_VERSION, MD_MINOR_VERSION, 3816 MD_MAJOR_VERSION, MD_MINOR_VERSION,
3675 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 3817 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
3818 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR,
3819 BITMAP_MINOR);
3676 3820
3677 if (register_blkdev(MAJOR_NR, "md")) 3821 if (register_blkdev(MAJOR_NR, "md"))
3678 return -1; 3822 return -1;
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
new file mode 100644
index 000000000000..f785cf26cbad
--- /dev/null
+++ b/include/linux/raid/bitmap.h
@@ -0,0 +1,280 @@
1/*
2 * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
3 *
4 * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
5 */
6#ifndef BITMAP_H
7#define BITMAP_H 1
8
9#define BITMAP_MAJOR 3
10#define BITMAP_MINOR 38
11
12/*
13 * in-memory bitmap:
14 *
15 * Use 16 bit block counters to track pending writes to each "chunk".
16 * The 2 high order bits are special-purpose, the first is a flag indicating
17 * whether a resync is needed. The second is a flag indicating whether a
18 * resync is active.
19 * This means that the counter is actually 14 bits:
20 *
21 * +--------+--------+------------------------------------------------+
22 * | resync | resync | counter |
23 * | needed | active | |
24 * | (0-1) | (0-1) | (0-16383) |
25 * +--------+--------+------------------------------------------------+
26 *
27 * The "resync needed" bit is set when:
28 * a '1' bit is read from storage at startup.
29 * a write request fails on some drives
30 * a resync is aborted on a chunk with 'resync active' set
31 * It is cleared (and resync-active set) when a resync starts across all drives
32 * of the chunk.
33 *
34 *
35 * The "resync active" bit is set when:
36 * a resync is started on all drives, and resync_needed is set.
37 * resync_needed will be cleared (as long as resync_active wasn't already set).
38 * It is cleared when a resync completes.
39 *
40 * The counter counts pending write requests, plus the on-disk bit.
41 * When the counter is '1' and the resync bits are clear, the on-disk
42 * bit can be cleared as well, thus setting the counter to 0.
43 * When we set a bit, or in the counter (to start a write), if the field is
44 * 0, we first set the disk bit and set the counter to 1.
45 *
46 * If the counter is 0, the on-disk bit is clear and the stripe is clean.
47 * Anything that dirties the stripe pushes the counter to 2 (at least)
48 * and sets the on-disk bit (lazily).
49 * If a periodic sweep finds the counter at 2, it is decremented to 1.
50 * If the sweep finds the counter at 1, the on-disk bit is cleared and the
51 * counter goes to zero.
52 *
53 * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
54 * counters as a fallback when "page" memory cannot be allocated:
55 *
56 * Normal case (page memory allocated):
57 *
58 * page pointer (32-bit)
59 *
60 * [ ] ------+
61 * |
62 * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
63 * c1 c2 c2048
64 *
65 * Hijacked case (page memory allocation failed):
66 *
67 * hijacked page pointer (32-bit)
68 *
69 * [ ][ ] (no page memory allocated)
70 * counter #1 (16-bit) counter #2 (16-bit)
71 *
72 */
73
74#ifdef __KERNEL__
75
76#define PAGE_BITS (PAGE_SIZE << 3)
77#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
78
79typedef __u16 bitmap_counter_t;
80#define COUNTER_BITS 16
81#define COUNTER_BIT_SHIFT 4
82#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8)
83#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
84
85#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
86#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
87#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
88#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
89#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
90#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
91
92/* how many counters per page? */
93#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
94/* same, except a shift value for more efficient bitops */
95#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
96/* same, except a mask value for more efficient bitops */
97#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
98
99#define BITMAP_BLOCK_SIZE 512
100#define BITMAP_BLOCK_SHIFT 9
101
102/* how many blocks per chunk? (this is variable) */
103#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)
104#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
105#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
106
107/* when hijacked, the counters and bits represent even larger "chunks" */
108/* there will be 1024 chunks represented by each counter in the page pointers */
109#define PAGEPTR_BLOCK_RATIO(bitmap) \
110 (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
111#define PAGEPTR_BLOCK_SHIFT(bitmap) \
112 (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
113#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
114
115/*
116 * on-disk bitmap:
117 *
118 * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
119 * file a page at a time. There's a superblock at the start of the file.
120 */
121
122/* map chunks (bits) to file pages - offset by the size of the superblock */
123#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
124
125#endif
126
127/*
128 * bitmap structures:
129 */
130
131#define BITMAP_MAGIC 0x6d746962
132
133/* use these for bitmap->flags and bitmap->sb->state bit-fields */
134enum bitmap_state {
135 BITMAP_ACTIVE = 0x001, /* the bitmap is in use */
136 BITMAP_STALE = 0x002 /* the bitmap file is out of date or had -EIO */
137};
138
139/* the superblock at the front of the bitmap file -- little endian */
140typedef struct bitmap_super_s {
141 __u32 magic; /* 0 BITMAP_MAGIC */
142 __u32 version; /* 4 the bitmap major for now, could change... */
143 __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */
144 __u64 events; /* 24 event counter for the bitmap (1)*/
145 __u64 events_cleared;/*32 event counter when last bit cleared (2) */
146 __u64 sync_size; /* 40 the size of the md device's sync range(3) */
147 __u32 state; /* 48 bitmap state information */
148 __u32 chunksize; /* 52 the bitmap chunk size in bytes */
149 __u32 daemon_sleep; /* 56 seconds between disk flushes */
150
151 __u8 pad[256 - 60]; /* set to zero */
152} bitmap_super_t;
153
154/* notes:
155 * (1) This event counter is updated before the event counter in the md superblock.
156 * When a bitmap is loaded, it is only accepted if this event counter is equal
157 * to, or one greater than, the event counter in the superblock.
158 * (2) This event counter is updated when the other one is *if*and*only*if* the
159 * array is not degraded. As bits are not cleared when the array is degraded,
160 * this represents the last time that any bits were cleared.
161 * If a device is being added that has an event count with this value or
162 * higher, it is accepted as conforming to the bitmap.
163 * (3) This is the number of sectors represented by the bitmap, and is the range that
164 * resync happens across. For raid1 and raid5/6 it is the size of individual
165 * devices. For raid10 it is the size of the array.
166 */
167
168#ifdef __KERNEL__
169
170/* the in-memory bitmap is represented by bitmap_pages */
171struct bitmap_page {
172 /*
173 * map points to the actual memory page
174 */
175 char *map;
176 /*
177 * in emergencies (when map cannot be alloced), hijack the map
178 * pointer and use it as two counters itself
179 */
180 unsigned int hijacked:1;
181 /*
182 * count of dirty bits on the page
183 */
184 unsigned int count:31;
185};
186
187/* keep track of bitmap file pages that have pending writes on them */
188struct page_list {
189 struct list_head list;
190 struct page *page;
191};
192
193/* the main bitmap structure - one per mddev */
194struct bitmap {
195 struct bitmap_page *bp;
196 unsigned long pages; /* total number of pages in the bitmap */
197 unsigned long missing_pages; /* number of pages not yet allocated */
198
199 mddev_t *mddev; /* the md device that the bitmap is for */
200
201 int counter_bits; /* how many bits per block counter */
202
203 /* bitmap chunksize -- how much data does each bit represent? */
204 unsigned long chunksize;
205 unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
206 unsigned long chunks; /* total number of data chunks for the array */
207
208 /* We hold a count on the chunk currently being synced, and drop
209 * it when the last block is started. If the resync is aborted
210 * midway, we need to be able to drop that count, so we remember
211 * the counted chunk..
212 */
213 unsigned long syncchunk;
214
215 __u64 events_cleared;
216
217 /* bitmap spinlock */
218 spinlock_t lock;
219
220 struct file *file; /* backing disk file */
221 struct page *sb_page; /* cached copy of the bitmap file superblock */
222 struct page **filemap; /* list of cache pages for the file */
223 unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
224 unsigned long file_pages; /* number of pages in the file */
225
226 unsigned long flags;
227
228 /*
229 * the bitmap daemon - periodically wakes up and sweeps the bitmap
230 * file, cleaning up bits and flushing out pages to disk as necessary
231 */
232 unsigned long daemon_lastrun; /* jiffies of last run */
233 unsigned long daemon_sleep; /* how many seconds between updates? */
234
235 /*
236 * bitmap write daemon - this daemon performs writes to the bitmap file
237 * this thread is only needed because of a limitation in ext3 (jbd)
238 * that does not allow a task to have two journal transactions ongoing
239 * simultaneously (even if the transactions are for two different
240 * filesystems) -- in the case of bitmap, that would be the filesystem
241 * that the bitmap file resides on and the filesystem that is mounted
242 * on the md device -- see current->journal_info in jbd/transaction.c
243 */
244 mdk_thread_t *writeback_daemon;
245 spinlock_t write_lock;
246 struct semaphore write_ready;
247 struct semaphore write_done;
248 unsigned long writes_pending;
249 wait_queue_head_t write_wait;
250 struct list_head write_pages;
251 struct list_head complete_pages;
252 mempool_t *write_pool;
253};
254
255/* the bitmap API */
256
257/* these are used only by md/bitmap */
258int bitmap_create(mddev_t *mddev);
259void bitmap_destroy(mddev_t *mddev);
260int bitmap_active(struct bitmap *bitmap);
261
262char *file_path(struct file *file, char *buf, int count);
263void bitmap_print_sb(struct bitmap *bitmap);
264int bitmap_update_sb(struct bitmap *bitmap);
265
266int bitmap_setallbits(struct bitmap *bitmap);
267
268/* these are exported */
269int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors);
270void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
271 int success);
272int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks);
273void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
274void bitmap_close_sync(struct bitmap *bitmap);
275
276int bitmap_unplug(struct bitmap *bitmap);
277int bitmap_daemon_work(struct bitmap *bitmap);
278#endif
279
280#endif
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index bce0032decff..16e94a9f0f8c 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -267,6 +267,9 @@ struct mddev_s
267 atomic_t writes_pending; 267 atomic_t writes_pending;
268 request_queue_t *queue; /* for plugging ... */ 268 request_queue_t *queue; /* for plugging ... */
269 269
270 struct bitmap *bitmap; /* the bitmap for the device */
271 struct file *bitmap_file; /* the bitmap file */
272
270 struct list_head all_mddevs; 273 struct list_head all_mddevs;
271}; 274};
272 275
@@ -341,6 +344,7 @@ typedef struct mdk_thread_s {
341 unsigned long flags; 344 unsigned long flags;
342 struct completion *event; 345 struct completion *event;
343 struct task_struct *tsk; 346 struct task_struct *tsk;
347 unsigned long timeout;
344 const char *name; 348 const char *name;
345} mdk_thread_t; 349} mdk_thread_t;
346 350
diff --git a/include/linux/raid/md_u.h b/include/linux/raid/md_u.h
index a2df5c2a42af..81da20ccec4d 100644
--- a/include/linux/raid/md_u.h
+++ b/include/linux/raid/md_u.h
@@ -23,6 +23,7 @@
23#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) 23#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t)
24#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) 24#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13)
25#define RAID_AUTORUN _IO (MD_MAJOR, 0x14) 25#define RAID_AUTORUN _IO (MD_MAJOR, 0x14)
26#define GET_BITMAP_FILE _IOR (MD_MAJOR, 0x15, mdu_bitmap_file_t)
26 27
27/* configuration */ 28/* configuration */
28#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20) 29#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20)
@@ -36,6 +37,7 @@
36#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) 37#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28)
37#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) 38#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29)
38#define HOT_GENERATE_ERROR _IO (MD_MAJOR, 0x2a) 39#define HOT_GENERATE_ERROR _IO (MD_MAJOR, 0x2a)
40#define SET_BITMAP_FILE _IOW (MD_MAJOR, 0x2b, int)
39 41
40/* usage */ 42/* usage */
41#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t) 43#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t)
@@ -106,6 +108,11 @@ typedef struct mdu_start_info_s {
106 108
107} mdu_start_info_t; 109} mdu_start_info_t;
108 110
111typedef struct mdu_bitmap_file_s
112{
113 char pathname[4096];
114} mdu_bitmap_file_t;
115
109typedef struct mdu_param_s 116typedef struct mdu_param_s
110{ 117{
111 int personality; /* 1,2,3,4 */ 118 int personality; /* 1,2,3,4 */