author    Jeff Garzik <jgarzik@pretzel.yyz.us>  2005-06-22 21:50:57 -0400
committer Jeff Garzik <jgarzik@pobox.com>       2005-06-22 21:50:57 -0400
commit    a5324343955997d1439f26518ddac567cd5d134b
tree      f43558389c41e3a0f076c4ee55d77c4aa1561779 /drivers/md
parent    8199d3a79c224bbe5943fa08684e1f93a17881b0
parent    a4936044001694f033fe4ea94d6034d51a6b465c

Merge /spare/repo/linux-2.6/
Diffstat (limited to 'drivers/md')
 drivers/md/Makefile    |    3
 drivers/md/bitmap.c    | 1586
 drivers/md/dm-crypt.c  |    3
 drivers/md/dm-mpath.c  |    3
 drivers/md/linear.c    |    8
 drivers/md/md.c        |  525
 drivers/md/multipath.c |   11
 drivers/md/raid0.c     |   12
 drivers/md/raid1.c     |  249
 drivers/md/raid10.c    |   36
 drivers/md/raid5.c     |   19
 drivers/md/raid6main.c |   18
 12 files changed, 2234 insertions(+), 239 deletions(-)
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 90de9c146a5f..d3efedf6a6ad 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -7,6 +7,7 @@ dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
 dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
 dm-snapshot-objs := dm-snap.o dm-exception-store.o
 dm-mirror-objs := dm-log.o dm-raid1.o
+md-mod-objs := md.o bitmap.o
 raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \
		   raid6int1.o raid6int2.o raid6int4.o \
		   raid6int8.o raid6int16.o raid6int32.o \
@@ -28,7 +29,7 @@ obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
 obj-$(CONFIG_MD_RAID6) += raid6.o xor.o
 obj-$(CONFIG_MD_MULTIPATH) += multipath.o
 obj-$(CONFIG_MD_FAULTY) += faulty.o
-obj-$(CONFIG_BLK_DEV_MD) += md.o
+obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
 obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
 obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
 obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
new file mode 100644
index 000000000000..95980ad6b27b
--- /dev/null
+++ b/drivers/md/bitmap.c
@@ -0,0 +1,1586 @@
1/*
2 * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
3 *
4 * bitmap_create - sets up the bitmap structure
5 * bitmap_destroy - destroys the bitmap structure
6 *
7 * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
8 * - added disk storage for bitmap
9 * - changes to allow various bitmap chunk sizes
10 * - added bitmap daemon (to asynchronously clear bitmap bits from disk)
11 */
12
13/*
14 * Still to do:
15 *
16 * flush once a given percentage of bits is set, rather than just time-based (maybe both).
17 * wait if count gets too high, wake when it drops to half.
18 * allow bitmap to be mirrored with superblock (before or after...)
19 * allow hot-add to re-instate a current device.
20 * allow hot-add of bitmap after quiescing device
21 */
22
23#include <linux/module.h>
24#include <linux/version.h>
25#include <linux/errno.h>
26#include <linux/slab.h>
27#include <linux/init.h>
28#include <linux/config.h>
29#include <linux/timer.h>
30#include <linux/sched.h>
31#include <linux/list.h>
32#include <linux/file.h>
33#include <linux/mount.h>
34#include <linux/buffer_head.h>
35#include <linux/raid/md.h>
36#include <linux/raid/bitmap.h>
37
38/* debug macros */
39
40#define DEBUG 0
41
42#if DEBUG
43/* these are for debugging purposes only! */
44
45/* define one and only one of these */
46#define INJECT_FAULTS_1 0 /* cause bitmap_alloc_page to fail always */
47#define INJECT_FAULTS_2 0 /* cause bitmap file to be kicked when first bit set */
48#define INJECT_FAULTS_3 0 /* treat bitmap file as kicked at init time */
49#define INJECT_FAULTS_4 0 /* undef */
50#define INJECT_FAULTS_5 0 /* undef */
51#define INJECT_FAULTS_6 0
52
53/* if these are defined, the driver will fail! debug only */
54#define INJECT_FATAL_FAULT_1 0 /* fail kmalloc, causing bitmap_create to fail */
55#define INJECT_FATAL_FAULT_2 0 /* undef */
56#define INJECT_FATAL_FAULT_3 0 /* undef */
57#endif
58
59//#define DPRINTK PRINTK /* uncomment this line for verbose debug output */
60#define DPRINTK(x...) do { } while(0)
61
62#ifndef PRINTK
63# if DEBUG > 0
64# define PRINTK(x...) printk(KERN_DEBUG x)
65# else
66# define PRINTK(x...)
67# endif
68#endif
69
70static inline char * bmname(struct bitmap *bitmap)
71{
72 return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
73}
74
75
76/*
77 * test if the bitmap is active
78 */
79int bitmap_active(struct bitmap *bitmap)
80{
81 unsigned long flags;
82 int res = 0;
83
84 if (!bitmap)
85 return res;
86 spin_lock_irqsave(&bitmap->lock, flags);
87 res = bitmap->flags & BITMAP_ACTIVE;
88 spin_unlock_irqrestore(&bitmap->lock, flags);
89 return res;
90}
91
92#define WRITE_POOL_SIZE 256
93/* mempool for queueing pending writes on the bitmap file */
94static void *write_pool_alloc(unsigned int gfp_flags, void *data)
95{
96 return kmalloc(sizeof(struct page_list), gfp_flags);
97}
98
99static void write_pool_free(void *ptr, void *data)
100{
101 kfree(ptr);
102}
103
104/*
105 * just a placeholder - calls kmalloc for bitmap pages
106 */
107static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
108{
109 unsigned char *page;
110
111#if INJECT_FAULTS_1
112 page = NULL;
113#else
114 page = kmalloc(PAGE_SIZE, GFP_NOIO);
115#endif
116 if (!page)
117 printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
118 else
119 PRINTK("%s: bitmap_alloc_page: allocated page at %p\n",
120 bmname(bitmap), page);
121 return page;
122}
123
124/*
125 * for now just a placeholder -- just calls kfree for bitmap pages
126 */
127static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
128{
129 PRINTK("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
130 kfree(page);
131}
132
133/*
134 * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
135 *
136 * 1) check to see if this page is allocated, if it's not then try to alloc
137 * 2) if the alloc fails, set the page's hijacked flag so we'll use the
138 * page pointer directly as a counter
139 *
140 * if we find our page, we increment the page's refcount so that it stays
141 * allocated while we're using it
142 */
143static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create)
144{
145 unsigned char *mappage;
146
147 if (page >= bitmap->pages) {
148 printk(KERN_ALERT
149 "%s: invalid bitmap page request: %lu (> %lu)\n",
150 bmname(bitmap), page, bitmap->pages-1);
151 return -EINVAL;
152 }
153
154
155 if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */
156 return 0;
157
158 if (bitmap->bp[page].map) /* page is already allocated, just return */
159 return 0;
160
161 if (!create)
162 return -ENOENT;
163
164 spin_unlock_irq(&bitmap->lock);
165
166 /* this page has not been allocated yet */
167
168 if ((mappage = bitmap_alloc_page(bitmap)) == NULL) {
169 PRINTK("%s: bitmap map page allocation failed, hijacking\n",
170 bmname(bitmap));
171 /* failed - set the hijacked flag so that we can use the
172 * pointer as a counter */
173 spin_lock_irq(&bitmap->lock);
174 if (!bitmap->bp[page].map)
175 bitmap->bp[page].hijacked = 1;
176 goto out;
177 }
178
179 /* got a page */
180
181 spin_lock_irq(&bitmap->lock);
182
183 /* recheck the page */
184
185 if (bitmap->bp[page].map || bitmap->bp[page].hijacked) {
186 /* somebody beat us to getting the page */
187 bitmap_free_page(bitmap, mappage);
188 return 0;
189 }
190
191 /* no page was in place and we have one, so install it */
192
193 memset(mappage, 0, PAGE_SIZE);
194 bitmap->bp[page].map = mappage;
195 bitmap->missing_pages--;
196out:
197 return 0;
198}
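
/*
 * [Editor's note -- illustrative sketch, not part of the patch] The
 * "hijack" fallback above reuses the storage of the page pointer itself
 * as two 16-bit counters when the kmalloc fails, so a hijacked slot
 * tracks only 2 chunks instead of the PAGE_SIZE/2 a real page holds:
 *
 *	struct bitmap_page bp = { .map = NULL, .hijacked = 1 };
 *	bitmap_counter_t *ctr = (bitmap_counter_t *)&bp.map;
 *	ctr[0] = 2;	// counter for the first half of the page's range
 *	ctr[1] = 0;	// counter for the second half
 *
 * bitmap_get_counter() below selects ctr[0] or ctr[1] from the chunk
 * offset whenever bp.hijacked is set.
 */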
199
200
201/* if page is completely empty, put it back on the free list, or dealloc it */
202/* if page was hijacked, unmark the flag so it might get alloced next time */
203/* Note: lock should be held when calling this */
204static inline void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
205{
206 char *ptr;
207
208 if (bitmap->bp[page].count) /* page is still busy */
209 return;
210
211 /* page is no longer in use, it can be released */
212
213 if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */
214 bitmap->bp[page].hijacked = 0;
215 bitmap->bp[page].map = NULL;
216 return;
217 }
218
219 /* normal case, free the page */
220
221#if 0
222/* actually ... let's not. We will probably need the page again exactly when
223 * memory is tight and we are flushing to disk
224 */
225 return;
226#else
227 ptr = bitmap->bp[page].map;
228 bitmap->bp[page].map = NULL;
229 bitmap->missing_pages++;
230 bitmap_free_page(bitmap, ptr);
231 return;
232#endif
233}
234
235
236/*
237 * bitmap file handling - read and write the bitmap file and its superblock
238 */
239
240/* copy the pathname of a file to a buffer */
241char *file_path(struct file *file, char *buf, int count)
242{
243 struct dentry *d;
244 struct vfsmount *v;
245
246 if (!buf)
247 return NULL;
248
249 d = file->f_dentry;
250 v = file->f_vfsmnt;
251
252 buf = d_path(d, v, buf, count);
253
254 return IS_ERR(buf) ? NULL : buf;
255}
256
257/*
258 * basic page I/O operations
259 */
260
261/* IO operations when bitmap is stored near all superblocks */
262static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long index)
263{
264 /* choose a good rdev and read the page from there */
265
266 mdk_rdev_t *rdev;
267 struct list_head *tmp;
268 struct page *page = alloc_page(GFP_KERNEL);
269 sector_t target;
270
271 if (!page)
272 return ERR_PTR(-ENOMEM);
273 do {
274 ITERATE_RDEV(mddev, rdev, tmp)
275 if (rdev->in_sync && !rdev->faulty)
276 goto found;
277 return ERR_PTR(-EIO);
278
279 found:
280 target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
281
282 } while (!sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ));
283
284 page->index = index;
285 return page;
286}
287
288static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait)
289{
290 mdk_rdev_t *rdev;
291 struct list_head *tmp;
292
293 ITERATE_RDEV(mddev, rdev, tmp)
294 if (rdev->in_sync && !rdev->faulty)
295 md_super_write(mddev, rdev,
296 (rdev->sb_offset<<1) + offset
297 + page->index * (PAGE_SIZE/512),
298 PAGE_SIZE,
299 page);
300
301 if (wait)
302 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
303 return 0;
304}
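
/*
 * [Editor's note -- worked example; the concrete numbers are assumed]
 * With 4K pages, PAGE_SIZE/512 == 8 sectors per bitmap page, so for a
 * bitmap stored 'offset' sectors past the v0.90 superblock at
 * (rdev->sb_offset << 1):
 *
 *	target = (sb_offset << 1) + offset + index * 8;
 *
 * e.g. offset = 8 and index = 2 give superblock + 24 sectors: pages are
 * laid out contiguously behind the superblock on every in-sync rdev.
 */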
305
306/*
307 * write out a page to a file
308 */
309static int write_page(struct bitmap *bitmap, struct page *page, int wait)
310{
311 int ret = -ENOMEM;
312
313 if (bitmap->file == NULL)
314 return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
315
316 if (wait)
317 lock_page(page);
318 else {
319 if (TestSetPageLocked(page))
320 return -EAGAIN; /* already locked */
321 if (PageWriteback(page)) {
322 unlock_page(page);
323 return -EAGAIN;
324 }
325 }
326
327 ret = page->mapping->a_ops->prepare_write(NULL, page, 0, PAGE_SIZE);
328 if (!ret)
329 ret = page->mapping->a_ops->commit_write(NULL, page, 0,
330 PAGE_SIZE);
331 if (ret) {
332 unlock_page(page);
333 return ret;
334 }
335
336 set_page_dirty(page); /* force it to be written out */
337
338 if (!wait) {
339 /* add to list to be waited for by daemon */
340 struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO);
341 item->page = page;
342 page_cache_get(page);
343 spin_lock(&bitmap->write_lock);
344 list_add(&item->list, &bitmap->complete_pages);
345 spin_unlock(&bitmap->write_lock);
346 md_wakeup_thread(bitmap->writeback_daemon);
347 }
348 return write_one_page(page, wait);
349}
350
351/* read a page from a file, pinning it into cache, and return bytes_read */
352static struct page *read_page(struct file *file, unsigned long index,
353 unsigned long *bytes_read)
354{
355 struct inode *inode = file->f_mapping->host;
356 struct page *page = NULL;
357 loff_t isize = i_size_read(inode);
358 unsigned long end_index = isize >> PAGE_CACHE_SHIFT;
359
360 PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_CACHE_SIZE,
361 (unsigned long long)index << PAGE_CACHE_SHIFT);
362
363 page = read_cache_page(inode->i_mapping, index,
364 (filler_t *)inode->i_mapping->a_ops->readpage, file);
365 if (IS_ERR(page))
366 goto out;
367 wait_on_page_locked(page);
368 if (!PageUptodate(page) || PageError(page)) {
369 page_cache_release(page);
370 page = ERR_PTR(-EIO);
371 goto out;
372 }
373
374 if (index > end_index) /* we have read beyond EOF */
375 *bytes_read = 0;
376 else if (index == end_index) /* possible short read */
377 *bytes_read = isize & ~PAGE_CACHE_MASK;
378 else
379 *bytes_read = PAGE_CACHE_SIZE; /* got a full page */
380out:
381 if (IS_ERR(page))
382 printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n",
383 (int)PAGE_CACHE_SIZE,
384 (unsigned long long)index << PAGE_CACHE_SHIFT,
385 PTR_ERR(page));
386 return page;
387}
388
389/*
390 * bitmap file superblock operations
391 */
392
393/* update the event counter and sync the superblock to disk */
394int bitmap_update_sb(struct bitmap *bitmap)
395{
396 bitmap_super_t *sb;
397 unsigned long flags;
398
399 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
400 return 0;
401 spin_lock_irqsave(&bitmap->lock, flags);
402 if (!bitmap->sb_page) { /* no superblock */
403 spin_unlock_irqrestore(&bitmap->lock, flags);
404 return 0;
405 }
406 spin_unlock_irqrestore(&bitmap->lock, flags);
407 sb = (bitmap_super_t *)kmap(bitmap->sb_page);
408 sb->events = cpu_to_le64(bitmap->mddev->events);
409 if (!bitmap->mddev->degraded)
410 sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
411 kunmap(bitmap->sb_page);
412 return write_page(bitmap, bitmap->sb_page, 1);
413}
414
415/* print out the bitmap file superblock */
416void bitmap_print_sb(struct bitmap *bitmap)
417{
418 bitmap_super_t *sb;
419
420 if (!bitmap || !bitmap->sb_page)
421 return;
422 sb = (bitmap_super_t *)kmap(bitmap->sb_page);
423 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
424 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
425 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
426 printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n",
427 *(__u32 *)(sb->uuid+0),
428 *(__u32 *)(sb->uuid+4),
429 *(__u32 *)(sb->uuid+8),
430 *(__u32 *)(sb->uuid+12));
431 printk(KERN_DEBUG " events: %llu\n",
432 (unsigned long long) le64_to_cpu(sb->events));
433 printk(KERN_DEBUG "events cleared: %llu\n",
434 (unsigned long long) le64_to_cpu(sb->events_cleared));
435 printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state));
436 printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize));
437 printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
438 printk(KERN_DEBUG " sync size: %llu KB\n",
439 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
440 kunmap(bitmap->sb_page);
441}
442
443/* read the superblock from the bitmap file and initialize some bitmap fields */
444static int bitmap_read_sb(struct bitmap *bitmap)
445{
446 char *reason = NULL;
447 bitmap_super_t *sb;
448 unsigned long chunksize, daemon_sleep;
449 unsigned long bytes_read;
450 unsigned long long events;
451 int err = -EINVAL;
452
453 /* page 0 is the superblock, read it... */
454 if (bitmap->file)
455 bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read);
456 else {
457 bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 0);
458 bytes_read = PAGE_SIZE;
459 }
460 if (IS_ERR(bitmap->sb_page)) {
461 err = PTR_ERR(bitmap->sb_page);
462 bitmap->sb_page = NULL;
463 return err;
464 }
465
466 sb = (bitmap_super_t *)kmap(bitmap->sb_page);
467
468 if (bytes_read < sizeof(*sb)) { /* short read */
469 printk(KERN_INFO "%s: bitmap file superblock truncated\n",
470 bmname(bitmap));
471 err = -ENOSPC;
472 goto out;
473 }
474
475 chunksize = le32_to_cpu(sb->chunksize);
476 daemon_sleep = le32_to_cpu(sb->daemon_sleep);
477
478 /* verify that the bitmap-specific fields are valid */
479 if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
480 reason = "bad magic";
481 else if (sb->version != cpu_to_le32(BITMAP_MAJOR))
482 reason = "unrecognized superblock version";
483 else if (chunksize < 512 || chunksize > (1024 * 1024 * 4))
484 reason = "bitmap chunksize out of range (512B - 4MB)";
485 else if ((1 << ffz(~chunksize)) != chunksize)
486 reason = "bitmap chunksize not a power of 2";
487 else if (daemon_sleep < 1 || daemon_sleep > 15)
488 reason = "daemon sleep period out of range";
489 if (reason) {
490 printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n",
491 bmname(bitmap), reason);
492 goto out;
493 }
494
495 /* keep the array size field of the bitmap superblock up to date */
496 sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
497
498 if (!bitmap->mddev->persistent)
499 goto success;
500
501 /*
502 * if we have a persistent array superblock, compare the
503 * bitmap's UUID and event counter to the mddev's
504 */
505 if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
506 printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n",
507 bmname(bitmap));
508 goto out;
509 }
510 events = le64_to_cpu(sb->events);
511 if (events < bitmap->mddev->events) {
512 printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) "
513 "-- forcing full recovery\n", bmname(bitmap), events,
514 (unsigned long long) bitmap->mddev->events);
515 sb->state |= BITMAP_STALE;
516 }
517success:
518 /* assign fields using values from superblock */
519 bitmap->chunksize = chunksize;
520 bitmap->daemon_sleep = daemon_sleep;
521 bitmap->flags |= sb->state;
522 bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
523 err = 0;
524out:
525 kunmap(bitmap->sb_page);
526 if (err)
527 bitmap_print_sb(bitmap);
528 return err;
529}
530
531enum bitmap_mask_op {
532 MASK_SET,
533 MASK_UNSET
534};
535
536/* record the state of the bitmap in the superblock */
537static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
538 enum bitmap_mask_op op)
539{
540 bitmap_super_t *sb;
541 unsigned long flags;
542
543 spin_lock_irqsave(&bitmap->lock, flags);
544 if (!bitmap || !bitmap->sb_page) { /* can't set the state */
545 spin_unlock_irqrestore(&bitmap->lock, flags);
546 return;
547 }
548 page_cache_get(bitmap->sb_page);
549 spin_unlock_irqrestore(&bitmap->lock, flags);
550 sb = (bitmap_super_t *)kmap(bitmap->sb_page);
551 switch (op) {
552 case MASK_SET: sb->state |= bits;
553 break;
554 case MASK_UNSET: sb->state &= ~bits;
555 break;
556 default: BUG();
557 }
558 kunmap(bitmap->sb_page);
559 page_cache_release(bitmap->sb_page);
560}
561
562/*
563 * general bitmap file operations
564 */
565
566/* calculate the index of the page that contains this bit */
567static inline unsigned long file_page_index(unsigned long chunk)
568{
569 return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT;
570}
571
572/* calculate the (bit) offset of this bit within a page */
573static inline unsigned long file_page_offset(unsigned long chunk)
574{
575 return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1);
576}
577
578/*
579 * return a pointer to the page in the filemap that contains the given bit
580 *
581 * this lookup is complicated by the fact that the bitmap sb might be exactly
582 * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page
583 * 0 or page 1
584 */
585static inline struct page *filemap_get_page(struct bitmap *bitmap,
586 unsigned long chunk)
587{
588 return bitmap->filemap[file_page_index(chunk) - file_page_index(0)];
589}
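
/*
 * [Editor's note -- worked example] Assuming, as in the 2.6-era
 * <linux/raid/bitmap.h>, that CHUNK_BIT_OFFSET(chunk) is
 * chunk + (sizeof(bitmap_super_t) << 3), the 256-byte sb contributes
 * 2048 leading bits. With 4K pages (PAGE_BITS == 32768),
 * file_page_index(0) == 0, the subtraction is a no-op, and chunks
 * 0..30719 live on filemap[0]; were the sb to fill a whole page,
 * file_page_index(0) would be 1 and the subtraction re-bases every
 * lookup so filemap[0] is still the first bitmap page.
 */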
590
591
592static void bitmap_file_unmap(struct bitmap *bitmap)
593{
594 struct page **map, *sb_page;
595 unsigned long *attr;
596 int pages;
597 unsigned long flags;
598
599 spin_lock_irqsave(&bitmap->lock, flags);
600 map = bitmap->filemap;
601 bitmap->filemap = NULL;
602 attr = bitmap->filemap_attr;
603 bitmap->filemap_attr = NULL;
604 pages = bitmap->file_pages;
605 bitmap->file_pages = 0;
606 sb_page = bitmap->sb_page;
607 bitmap->sb_page = NULL;
608 spin_unlock_irqrestore(&bitmap->lock, flags);
609
610 while (pages--)
611 if (map[pages]->index != 0) /* 0 is sb_page, release it below */
612 page_cache_release(map[pages]);
613 kfree(map);
614 kfree(attr);
615
616 if (sb_page)
617 page_cache_release(sb_page);
618}
619
620static void bitmap_stop_daemons(struct bitmap *bitmap);
621
622/* dequeue the next item in a page list -- don't call from irq context */
623static struct page_list *dequeue_page(struct bitmap *bitmap)
624{
625 struct page_list *item = NULL;
626 struct list_head *head = &bitmap->complete_pages;
627
628 spin_lock(&bitmap->write_lock);
629 if (list_empty(head))
630 goto out;
631 item = list_entry(head->prev, struct page_list, list);
632 list_del(head->prev);
633out:
634 spin_unlock(&bitmap->write_lock);
635 return item;
636}
637
638static void drain_write_queues(struct bitmap *bitmap)
639{
640 struct page_list *item;
641
642 while ((item = dequeue_page(bitmap))) {
643 /* don't bother to wait */
644 page_cache_release(item->page);
645 mempool_free(item, bitmap->write_pool);
646 }
647
648 wake_up(&bitmap->write_wait);
649}
650
651static void bitmap_file_put(struct bitmap *bitmap)
652{
653 struct file *file;
654 struct inode *inode;
655 unsigned long flags;
656
657 spin_lock_irqsave(&bitmap->lock, flags);
658 file = bitmap->file;
659 bitmap->file = NULL;
660 spin_unlock_irqrestore(&bitmap->lock, flags);
661
662 bitmap_stop_daemons(bitmap);
663
664 drain_write_queues(bitmap);
665
666 bitmap_file_unmap(bitmap);
667
668 if (file) {
669 inode = file->f_mapping->host;
670 spin_lock(&inode->i_lock);
671 atomic_set(&inode->i_writecount, 1); /* allow writes again */
672 spin_unlock(&inode->i_lock);
673 fput(file);
674 }
675}
676
677
678/*
679 * bitmap_file_kick - if an error occurs while manipulating the bitmap file
680 * then it is no longer reliable, so we stop using it and we mark the file
681 * as failed in the superblock
682 */
683static void bitmap_file_kick(struct bitmap *bitmap)
684{
685 char *path, *ptr = NULL;
686
687 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET);
688 bitmap_update_sb(bitmap);
689
690 if (bitmap->file) {
691 path = kmalloc(PAGE_SIZE, GFP_KERNEL);
692 if (path)
693 ptr = file_path(bitmap->file, path, PAGE_SIZE);
694
695 printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n",
696 bmname(bitmap), ptr ? ptr : "");
697
698 kfree(path);
699 }
700
701 bitmap_file_put(bitmap);
702
703 return;
704}
705
706enum bitmap_page_attr {
707 BITMAP_PAGE_DIRTY = 1, // there are set bits that need to be synced
708 BITMAP_PAGE_CLEAN = 2, // there are bits that might need to be cleared
709 BITMAP_PAGE_NEEDWRITE=4, // there are cleared bits that need to be synced
710};
711
712static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
713 enum bitmap_page_attr attr)
714{
715 bitmap->filemap_attr[page->index] |= attr;
716}
717
718static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
719 enum bitmap_page_attr attr)
720{
721 bitmap->filemap_attr[page->index] &= ~attr;
722}
723
724static inline unsigned long get_page_attr(struct bitmap *bitmap, struct page *page)
725{
726 return bitmap->filemap_attr[page->index];
727}
728
729/*
730 * bitmap_file_set_bit -- called before performing a write to the md device
731 * to set (and eventually sync) a particular bit in the bitmap file
732 *
733 * we set the bit immediately, then we record the page number so that
734 * when an unplug occurs, we can flush the dirty pages out to disk
735 */
736static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
737{
738 unsigned long bit;
739 struct page *page;
740 void *kaddr;
741 unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
742
743 if (!bitmap->filemap) {
744 return;
745 }
746
747 page = filemap_get_page(bitmap, chunk);
748 bit = file_page_offset(chunk);
749
750
751 /* make sure the page stays cached until it gets written out */
752 if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY))
753 page_cache_get(page);
754
755 /* set the bit */
756 kaddr = kmap_atomic(page, KM_USER0);
757 set_bit(bit, kaddr);
758 kunmap_atomic(kaddr, KM_USER0);
759 PRINTK("set file bit %lu page %lu\n", bit, page->index);
760
761 /* record page number so it gets flushed to disk when unplug occurs */
762 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
763
764}
765
766/* this gets called when the md device is ready to unplug its underlying
767 * (slave) device queues -- before we let any writes go down, we need to
768 * sync the dirty pages of the bitmap file to disk */
769int bitmap_unplug(struct bitmap *bitmap)
770{
771 unsigned long i, attr, flags;
772 struct page *page;
773 int wait = 0;
774 int err;
775
776 if (!bitmap)
777 return 0;
778
779 /* look at each page to see if there are any set bits that need to be
780 * flushed out to disk */
781 for (i = 0; i < bitmap->file_pages; i++) {
782 spin_lock_irqsave(&bitmap->lock, flags);
783 if (!bitmap->filemap) {
784 spin_unlock_irqrestore(&bitmap->lock, flags);
785 return 0;
786 }
787 page = bitmap->filemap[i];
788 attr = get_page_attr(bitmap, page);
789 clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
790 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
791 if ((attr & BITMAP_PAGE_DIRTY))
792 wait = 1;
793 spin_unlock_irqrestore(&bitmap->lock, flags);
794
795 if (attr & (BITMAP_PAGE_DIRTY | BITMAP_PAGE_NEEDWRITE)) {
796 err = write_page(bitmap, page, 0);
797 if (err == -EAGAIN) {
798 if (attr & BITMAP_PAGE_DIRTY)
799 err = write_page(bitmap, page, 1);
800 else
801 err = 0;
802 }
803 if (err)
804 return 1;
805 }
806 }
807 if (wait) { /* if any writes were performed, we need to wait on them */
808 if (bitmap->file) {
809 spin_lock_irq(&bitmap->write_lock);
810 wait_event_lock_irq(bitmap->write_wait,
811 list_empty(&bitmap->complete_pages), bitmap->write_lock,
812 wake_up_process(bitmap->writeback_daemon->tsk));
813 spin_unlock_irq(&bitmap->write_lock);
814 } else
815 wait_event(bitmap->mddev->sb_wait,
816 atomic_read(&bitmap->mddev->pending_writes)==0);
817 }
818 return 0;
819}
820
821static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset,
822 unsigned long sectors, int in_sync);
823/* bitmap_init_from_disk -- called at bitmap_create time to initialize
824 * the in-memory bitmap from the on-disk bitmap -- also, sets up the
825 * memory mapping of the bitmap file
826 * Special cases:
827 * if there's no bitmap file, or if the bitmap file had been
828 * previously kicked from the array, we mark all the bits as
829 * 1's in order to cause a full resync.
830 */
831static int bitmap_init_from_disk(struct bitmap *bitmap, int in_sync)
832{
833 unsigned long i, chunks, index, oldindex, bit;
834 struct page *page = NULL, *oldpage = NULL;
835 unsigned long num_pages, bit_cnt = 0;
836 struct file *file;
837 unsigned long bytes, offset, dummy;
838 int outofdate;
839 int ret = -ENOSPC;
840
841 chunks = bitmap->chunks;
842 file = bitmap->file;
843
844 BUG_ON(!file && !bitmap->offset);
845
846#if INJECT_FAULTS_3
847 outofdate = 1;
848#else
849 outofdate = bitmap->flags & BITMAP_STALE;
850#endif
851 if (outofdate)
852 printk(KERN_INFO "%s: bitmap file is out of date, doing full "
853 "recovery\n", bmname(bitmap));
854
855 bytes = (chunks + 7) / 8;
856
857 num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE;
858
859 if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) {
860 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
861 bmname(bitmap),
862 (unsigned long) i_size_read(file->f_mapping->host),
863 bytes + sizeof(bitmap_super_t));
864 goto out;
865 }
866
867 ret = -ENOMEM;
868
869 bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
870 if (!bitmap->filemap)
871 goto out;
872
873 bitmap->filemap_attr = kmalloc(sizeof(long) * num_pages, GFP_KERNEL);
874 if (!bitmap->filemap_attr)
875 goto out;
876
877 memset(bitmap->filemap_attr, 0, sizeof(long) * num_pages);
878
879 oldindex = ~0L;
880
881 for (i = 0; i < chunks; i++) {
882 index = file_page_index(i);
883 bit = file_page_offset(i);
884 if (index != oldindex) { /* this is a new page, read it in */
885 /* unmap the old page, we're done with it */
886 if (oldpage != NULL)
887 kunmap(oldpage);
888 if (index == 0) {
889 /*
890 * if we're here then the superblock page
891 * contains some bits (PAGE_SIZE != sizeof sb)
892 * we've already read it in, so just use it
893 */
894 page = bitmap->sb_page;
895 offset = sizeof(bitmap_super_t);
896 } else if (file) {
897 page = read_page(file, index, &dummy);
898 offset = 0;
899 } else {
900 page = read_sb_page(bitmap->mddev, bitmap->offset, index);
901 offset = 0;
902 }
903 if (IS_ERR(page)) { /* read error */
904 ret = PTR_ERR(page);
905 goto out;
906 }
907
908 oldindex = index;
909 oldpage = page;
910 kmap(page);
911
912 if (outofdate) {
913 /*
914 * if bitmap is out of date, dirty the
915 * whole page and write it out
916 */
917 memset(page_address(page) + offset, 0xff,
918 PAGE_SIZE - offset);
919 ret = write_page(bitmap, page, 1);
920 if (ret) {
921 kunmap(page);
922 /* release, page not in filemap yet */
923 page_cache_release(page);
924 goto out;
925 }
926 }
927
928 bitmap->filemap[bitmap->file_pages++] = page;
929 }
930 if (test_bit(bit, page_address(page))) {
931 /* if the disk bit is set, set the memory bit */
932 bitmap_set_memory_bits(bitmap,
933 i << CHUNK_BLOCK_SHIFT(bitmap), 1, in_sync);
934 bit_cnt++;
935 }
936 }
937
938 /* everything went OK */
939 ret = 0;
940 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
941
942 if (page) /* unmap the last page */
943 kunmap(page);
944
945 if (bit_cnt) { /* Kick recovery if any bits were set */
946 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
947 md_wakeup_thread(bitmap->mddev->thread);
948 }
949
950out:
951 printk(KERN_INFO "%s: bitmap initialized from disk: "
952 "read %lu/%lu pages, set %lu bits, status: %d\n",
953 bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, ret);
954
955 return ret;
956}
957
958void bitmap_write_all(struct bitmap *bitmap)
959{
960 /* We don't actually write all bitmap blocks here,
961 * just flag them as needing to be written
962 */
963
964 unsigned long chunks = bitmap->chunks;
965 unsigned long bytes = (chunks+7)/8 + sizeof(bitmap_super_t);
966 unsigned long num_pages = (bytes + PAGE_SIZE-1) / PAGE_SIZE;
967 while (num_pages--)
968 bitmap->filemap_attr[num_pages] |= BITMAP_PAGE_NEEDWRITE;
969}
970
971
972static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
973{
974 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
975 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
976 bitmap->bp[page].count += inc;
977/*
978 if (page == 0) printk("count page 0, offset %llu: %d gives %d\n",
979 (unsigned long long)offset, inc, bitmap->bp[page].count);
980*/
981 bitmap_checkfree(bitmap, page);
982}
983static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
984 sector_t offset, int *blocks,
985 int create);
986
987/*
988 * bitmap daemon -- periodically wakes up to clean bits and flush pages
989 * out to disk
990 */
991
992int bitmap_daemon_work(struct bitmap *bitmap)
993{
994 unsigned long j;
995 unsigned long flags;
996 struct page *page = NULL, *lastpage = NULL;
997 int err = 0;
998 int blocks;
999 int attr;
1000
1001 if (bitmap == NULL)
1002 return 0;
1003 if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ))
1004 return 0;
1005 bitmap->daemon_lastrun = jiffies;
1006
1007 for (j = 0; j < bitmap->chunks; j++) {
1008 bitmap_counter_t *bmc;
1009 spin_lock_irqsave(&bitmap->lock, flags);
1010 if (!bitmap->filemap) {
1011 /* error or shutdown */
1012 spin_unlock_irqrestore(&bitmap->lock, flags);
1013 break;
1014 }
1015
1016 page = filemap_get_page(bitmap, j);
1017
1018 if (page != lastpage) {
1019 /* skip this page unless it's marked as needing cleaning */
1020 if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) {
1021 if (attr & BITMAP_PAGE_NEEDWRITE) {
1022 page_cache_get(page);
1023 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
1024 }
1025 spin_unlock_irqrestore(&bitmap->lock, flags);
1026 if (attr & BITMAP_PAGE_NEEDWRITE) {
1027 switch (write_page(bitmap, page, 0)) {
1028 case -EAGAIN:
1029 set_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
1030 break;
1031 case 0:
1032 break;
1033 default:
1034 bitmap_file_kick(bitmap);
1035 }
1036 page_cache_release(page);
1037 }
1038 continue;
1039 }
1040
1041 /* grab the new page, sync and release the old */
1042 page_cache_get(page);
1043 if (lastpage != NULL) {
1044 if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) {
1045 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1046 spin_unlock_irqrestore(&bitmap->lock, flags);
1047 err = write_page(bitmap, lastpage, 0);
1048 if (err == -EAGAIN) {
1049 err = 0;
1050 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1051 }
1052 } else {
1053 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1054 spin_unlock_irqrestore(&bitmap->lock, flags);
1055 }
1056 kunmap(lastpage);
1057 page_cache_release(lastpage);
1058 if (err)
1059 bitmap_file_kick(bitmap);
1060 } else
1061 spin_unlock_irqrestore(&bitmap->lock, flags);
1062 lastpage = page;
1063 kmap(page);
1064/*
1065 printk("bitmap clean at page %lu\n", j);
1066*/
1067 spin_lock_irqsave(&bitmap->lock, flags);
1068 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1069 }
1070 bmc = bitmap_get_counter(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
1071 &blocks, 0);
1072 if (bmc) {
1073/*
1074 if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc);
1075*/
1076 if (*bmc == 2) {
1077 *bmc=1; /* maybe clear the bit next time */
1078 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1079 } else if (*bmc == 1) {
1080 /* we can clear the bit */
1081 *bmc = 0;
1082 bitmap_count_page(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
1083 -1);
1084
1085 /* clear the bit */
1086 clear_bit(file_page_offset(j), page_address(page));
1087 }
1088 }
1089 spin_unlock_irqrestore(&bitmap->lock, flags);
1090 }
1091
1092 /* now sync the final page */
1093 if (lastpage != NULL) {
1094 kunmap(lastpage);
1095 spin_lock_irqsave(&bitmap->lock, flags);
1096 if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) {
1097 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1098 spin_unlock_irqrestore(&bitmap->lock, flags);
1099 err = write_page(bitmap, lastpage, 0);
1100 if (err == -EAGAIN) {
1101 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1102 err = 0;
1103 }
1104 } else {
1105 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1106 spin_unlock_irqrestore(&bitmap->lock, flags);
1107 }
1108
1109 page_cache_release(lastpage);
1110 }
1111
1112 return err;
1113}
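
/*
 * [Editor's note -- illustrative timeline] The 2 -> 1 -> 0 decay above
 * means an on-disk bit is only cleared after two idle daemon passes
 * (daemon_sleep seconds apart). One chunk's typical lifetime, roughly:
 *
 *	bitmap_startwrite:  *bmc 0 -> 3  (base 2 + one write in flight;
 *	                                  file bit set, page DIRTY)
 *	bitmap_endwrite:    *bmc 3 -> 2  (page marked CLEAN)
 *	daemon pass 1:      *bmc 2 -> 1  (bit still set on disk)
 *	daemon pass 2:      *bmc 1 -> 0  (bit cleared in the page cache,
 *	                                  page flagged NEEDWRITE)
 *	daemon pass 3:      page written out
 *
 * so a crash at any point leaves the bit set until the chunk is known
 * clean, which is exactly the property resync relies on.
 */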
1114
1115static void daemon_exit(struct bitmap *bitmap, mdk_thread_t **daemon)
1116{
1117 mdk_thread_t *dmn;
1118 unsigned long flags;
1119
1120 /* if no one is waiting on us, we'll free the md thread struct
1121 * and exit, otherwise we let the waiter clean things up */
1122 spin_lock_irqsave(&bitmap->lock, flags);
1123 if ((dmn = *daemon)) { /* no one is waiting, cleanup and exit */
1124 *daemon = NULL;
1125 spin_unlock_irqrestore(&bitmap->lock, flags);
1126 kfree(dmn);
1127 complete_and_exit(NULL, 0); /* do_exit not exported */
1128 }
1129 spin_unlock_irqrestore(&bitmap->lock, flags);
1130}
1131
1132static void bitmap_writeback_daemon(mddev_t *mddev)
1133{
1134 struct bitmap *bitmap = mddev->bitmap;
1135 struct page *page;
1136 struct page_list *item;
1137 int err = 0;
1138
1139 if (signal_pending(current)) {
1140 printk(KERN_INFO
1141 "%s: bitmap writeback daemon got signal, exiting...\n",
1142 bmname(bitmap));
1143 err = -EINTR;
1144 goto out;
1145 }
1146
1147 PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap));
1148 /* wait on bitmap page writebacks */
1149 while ((item = dequeue_page(bitmap))) {
1150 page = item->page;
1151 mempool_free(item, bitmap->write_pool);
1152 PRINTK("wait on page writeback: %p\n", page);
1153 wait_on_page_writeback(page);
1154 PRINTK("finished page writeback: %p\n", page);
1155
1156 err = PageError(page);
1157 page_cache_release(page);
1158 if (err) {
1159 printk(KERN_WARNING "%s: bitmap file writeback "
1160 "failed (page %lu): %d\n",
1161 bmname(bitmap), page->index, err);
1162 bitmap_file_kick(bitmap);
1163 goto out;
1164 }
1165 }
1166 out:
1167 wake_up(&bitmap->write_wait);
1168 if (err) {
1169 printk(KERN_INFO "%s: bitmap writeback daemon exiting (%d)\n",
1170 bmname(bitmap), err);
1171 daemon_exit(bitmap, &bitmap->writeback_daemon);
1172 }
1173}
1174
1175static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr,
1176 void (*func)(mddev_t *), char *name)
1177{
1178 mdk_thread_t *daemon;
1179 unsigned long flags;
1180 char namebuf[32];
1181
1182 spin_lock_irqsave(&bitmap->lock, flags);
1183 *ptr = NULL;
1184
1185 if (!bitmap->file) /* no need for daemon if there's no backing file */
1186 goto out_unlock;
1187
1188 spin_unlock_irqrestore(&bitmap->lock, flags);
1189
1190#if INJECT_FATAL_FAULT_2
1191 daemon = NULL;
1192#else
1193 sprintf(namebuf, "%%s_%s", name);
1194 daemon = md_register_thread(func, bitmap->mddev, namebuf);
1195#endif
1196 if (!daemon) {
1197 printk(KERN_ERR "%s: failed to start bitmap daemon\n",
1198 bmname(bitmap));
1199 return -ECHILD;
1200 }
1201
1202 spin_lock_irqsave(&bitmap->lock, flags);
1203 *ptr = daemon;
1204
1205 md_wakeup_thread(daemon); /* start it running */
1206
1207 PRINTK("%s: %s daemon (pid %d) started...\n",
1208 bmname(bitmap), name, daemon->tsk->pid);
1209out_unlock:
1210 spin_unlock_irqrestore(&bitmap->lock, flags);
1211 return 0;
1212}
1213
1214static int bitmap_start_daemons(struct bitmap *bitmap)
1215{
1216 int err = bitmap_start_daemon(bitmap, &bitmap->writeback_daemon,
1217 bitmap_writeback_daemon, "bitmap_wb");
1218 return err;
1219}
1220
1221static void bitmap_stop_daemon(struct bitmap *bitmap, mdk_thread_t **ptr)
1222{
1223 mdk_thread_t *daemon;
1224 unsigned long flags;
1225
1226 spin_lock_irqsave(&bitmap->lock, flags);
1227 daemon = *ptr;
1228 *ptr = NULL;
1229 spin_unlock_irqrestore(&bitmap->lock, flags);
1230 if (daemon)
1231 md_unregister_thread(daemon); /* destroy the thread */
1232}
1233
1234static void bitmap_stop_daemons(struct bitmap *bitmap)
1235{
1236 /* the daemons can't stop themselves... they'll just exit instead... */
1237 if (bitmap->writeback_daemon &&
1238 current->pid != bitmap->writeback_daemon->tsk->pid)
1239 bitmap_stop_daemon(bitmap, &bitmap->writeback_daemon);
1240}
1241
1242static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1243 sector_t offset, int *blocks,
1244 int create)
1245{
1246 /* If 'create', we might release the lock and reclaim it.
1247 * The lock must have been taken with interrupts enabled.
1248 * If !create, we don't release the lock.
1249 */
1250 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
1251 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1252 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
1253 sector_t csize;
1254
1255 if (bitmap_checkpage(bitmap, page, create) < 0) {
1256 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
1257 *blocks = csize - (offset & (csize- 1));
1258 return NULL;
1259 }
1260 /* now locked ... */
1261
1262 if (bitmap->bp[page].hijacked) { /* hijacked pointer */
1263 /* should we use the first or second counter field
1264 * of the hijacked pointer? */
1265 int hi = (pageoff > PAGE_COUNTER_MASK);
1266 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) +
1267 PAGE_COUNTER_SHIFT - 1);
1268 *blocks = csize - (offset & (csize- 1));
1269 return &((bitmap_counter_t *)
1270 &bitmap->bp[page].map)[hi];
1271 } else { /* page is allocated */
1272 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
1273 *blocks = csize - (offset & (csize- 1));
1274 return (bitmap_counter_t *)
1275 &(bitmap->bp[page].map[pageoff]);
1276 }
1277}
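
/*
 * [Editor's note -- assumed 2.6-era <linux/raid/bitmap.h> layout]
 * bitmap_counter_t is a 16-bit word: bit 15 is NEEDED (chunk must be
 * resynced), bit 14 is RESYNC (resync in progress), and bits 0-13 hold
 * the write count (base value 2 while the chunk is active). Under that
 * layout a counter of 0x8005 returned above decodes as NEEDED set,
 * RESYNC clear, 3 writes in flight (5 minus the base of 2).
 */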
1278
1279int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors)
1280{
1281 if (!bitmap) return 0;
1282 while (sectors) {
1283 int blocks;
1284 bitmap_counter_t *bmc;
1285
1286 spin_lock_irq(&bitmap->lock);
1287 bmc = bitmap_get_counter(bitmap, offset, &blocks, 1);
1288 if (!bmc) {
1289 spin_unlock_irq(&bitmap->lock);
1290 return 0;
1291 }
1292
1293 switch(*bmc) {
1294 case 0:
1295 bitmap_file_set_bit(bitmap, offset);
1296 bitmap_count_page(bitmap,offset, 1);
1297 blk_plug_device(bitmap->mddev->queue);
1298 /* fall through */
1299 case 1:
1300 *bmc = 2;
1301 }
1302 if ((*bmc & COUNTER_MAX) == COUNTER_MAX) BUG();
1303 (*bmc)++;
1304
1305 spin_unlock_irq(&bitmap->lock);
1306
1307 offset += blocks;
1308 if (sectors > blocks)
1309 sectors -= blocks;
1310 else sectors = 0;
1311 }
1312 return 0;
1313}
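
/*
 * [Editor's note -- hedged usage sketch; surrounding names are
 * illustrative] A raid personality is expected to bracket each write
 * with these calls, roughly as the raid1 changes in this merge do:
 *
 *	bitmap_startwrite(mddev->bitmap, bio->bi_sector,
 *			  bio->bi_size >> 9);	// before issuing the I/O
 *	// ... submit the write(s) to the member disks ...
 *	// in the completion path, once all member writes finish:
 *	bitmap_endwrite(mddev->bitmap, sector, nr_sectors, uptodate);
 *
 * bitmap_unplug() must run before the queued data writes themselves so
 * the newly set bits reach stable storage first.
 */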
1314
1315void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
1316 int success)
1317{
1318 if (!bitmap) return;
1319 while (sectors) {
1320 int blocks;
1321 unsigned long flags;
1322 bitmap_counter_t *bmc;
1323
1324 spin_lock_irqsave(&bitmap->lock, flags);
1325 bmc = bitmap_get_counter(bitmap, offset, &blocks, 0);
1326 if (!bmc) {
1327 spin_unlock_irqrestore(&bitmap->lock, flags);
1328 return;
1329 }
1330
1331 if (!success && ! (*bmc & NEEDED_MASK))
1332 *bmc |= NEEDED_MASK;
1333
1334 (*bmc)--;
1335 if (*bmc <= 2) {
1336 set_page_attr(bitmap,
1337 filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
1338 BITMAP_PAGE_CLEAN);
1339 }
1340 spin_unlock_irqrestore(&bitmap->lock, flags);
1341 offset += blocks;
1342 if (sectors > blocks)
1343 sectors -= blocks;
1344 else sectors = 0;
1345 }
1346}
1347
1348int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks)
1349{
1350 bitmap_counter_t *bmc;
1351 int rv;
1352 if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
1353 *blocks = 1024;
1354 return 1; /* always resync if no bitmap */
1355 }
1356 spin_lock_irq(&bitmap->lock);
1357 bmc = bitmap_get_counter(bitmap, offset, blocks, 0);
1358 rv = 0;
1359 if (bmc) {
1360 /* locked */
1361 if (RESYNC(*bmc))
1362 rv = 1;
1363 else if (NEEDED(*bmc)) {
1364 rv = 1;
1365 *bmc |= RESYNC_MASK;
1366 *bmc &= ~NEEDED_MASK;
1367 }
1368 }
1369 spin_unlock_irq(&bitmap->lock);
1370 return rv;
1371}
1372
1373void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted)
1374{
1375 bitmap_counter_t *bmc;
1376 unsigned long flags;
1377/*
1378 if (offset == 0) printk("bitmap_end_sync 0 (%d)\n", aborted);
1379*/ if (bitmap == NULL) {
1380 *blocks = 1024;
1381 return;
1382 }
1383 spin_lock_irqsave(&bitmap->lock, flags);
1384 bmc = bitmap_get_counter(bitmap, offset, blocks, 0);
1385 if (bmc == NULL)
1386 goto unlock;
1387 /* locked */
1388/*
1389 if (offset == 0) printk("bitmap_end sync found 0x%x, blocks %d\n", *bmc, *blocks);
1390*/
1391 if (RESYNC(*bmc)) {
1392 *bmc &= ~RESYNC_MASK;
1393
1394 if (!NEEDED(*bmc) && aborted)
1395 *bmc |= NEEDED_MASK;
1396 else {
1397 if (*bmc <= 2) {
1398 set_page_attr(bitmap,
1399 filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
1400 BITMAP_PAGE_CLEAN);
1401 }
1402 }
1403 }
1404 unlock:
1405 spin_unlock_irqrestore(&bitmap->lock, flags);
1406}
1407
1408void bitmap_close_sync(struct bitmap *bitmap)
1409{
1410 /* Sync has finished, and any bitmap chunks that weren't synced
1411 * properly have been aborted. It remains to us to clear the
1412 * RESYNC bit wherever it is still on
1413 */
1414 sector_t sector = 0;
1415 int blocks;
1416 if (!bitmap) return;
1417 while (sector < bitmap->mddev->resync_max_sectors) {
1418 bitmap_end_sync(bitmap, sector, &blocks, 0);
1419/*
1420 if (sector < 500) printk("bitmap_close_sync: sec %llu blks %d\n",
1421 (unsigned long long)sector, blocks);
1422*/ sector += blocks;
1423 }
1424}
1425
1426static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset,
1427 unsigned long sectors, int in_sync)
1428{
1429 /* For each chunk covered by any of these sectors, set the
1430 * counter to 1 and set resync_needed unless in_sync. They should all
1431 * be 0 at this point
1432 */
1433 while (sectors) {
1434 int secs;
1435 bitmap_counter_t *bmc;
1436 spin_lock_irq(&bitmap->lock);
1437 bmc = bitmap_get_counter(bitmap, offset, &secs, 1);
1438 if (!bmc) {
1439 spin_unlock_irq(&bitmap->lock);
1440 return;
1441 }
1442 if (! *bmc) {
1443 struct page *page;
1444 *bmc = 1 | (in_sync? 0 : NEEDED_MASK);
1445 bitmap_count_page(bitmap, offset, 1);
1446 page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
1447 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1448 }
1449 spin_unlock_irq(&bitmap->lock);
1450 if (sectors > secs)
1451 sectors -= secs;
1452 else
1453 sectors = 0;
1454 }
1455}
1456
1457/*
1458 * free memory that was allocated
1459 */
1460void bitmap_destroy(mddev_t *mddev)
1461{
1462 unsigned long k, pages;
1463 struct bitmap_page *bp;
1464 struct bitmap *bitmap = mddev->bitmap;
1465
1466 if (!bitmap) /* there was no bitmap */
1467 return;
1468
1469 mddev->bitmap = NULL; /* disconnect from the md device */
1470
1471 /* release the bitmap file and kill the daemon */
1472 bitmap_file_put(bitmap);
1473
1474 bp = bitmap->bp;
1475 pages = bitmap->pages;
1476
1477 /* free all allocated memory */
1478
1479 mempool_destroy(bitmap->write_pool);
1480
1481 if (bp) /* deallocate the page memory */
1482 for (k = 0; k < pages; k++)
1483 if (bp[k].map && !bp[k].hijacked)
1484 kfree(bp[k].map);
1485 kfree(bp);
1486 kfree(bitmap);
1487}
1488
1489/*
1490 * initialize the bitmap structure
1491 * if this returns an error, bitmap_destroy must be called to do clean up
1492 */
1493int bitmap_create(mddev_t *mddev)
1494{
1495 struct bitmap *bitmap;
1496 unsigned long blocks = mddev->resync_max_sectors;
1497 unsigned long chunks;
1498 unsigned long pages;
1499 struct file *file = mddev->bitmap_file;
1500 int err;
1501
1502 BUG_ON(sizeof(bitmap_super_t) != 256);
1503
1504 if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */
1505 return 0;
1506
1507 BUG_ON(file && mddev->bitmap_offset);
1508
1509 bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL);
1510 if (!bitmap)
1511 return -ENOMEM;
1512
1513 memset(bitmap, 0, sizeof(*bitmap));
1514
1515 spin_lock_init(&bitmap->lock);
1516 bitmap->mddev = mddev;
1517 mddev->bitmap = bitmap;
1518
1519 spin_lock_init(&bitmap->write_lock);
1520 INIT_LIST_HEAD(&bitmap->complete_pages);
1521 init_waitqueue_head(&bitmap->write_wait);
1522 bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc,
1523 write_pool_free, NULL);
1524 if (!bitmap->write_pool)
1525 return -ENOMEM;
1526
1527 bitmap->file = file;
1528 bitmap->offset = mddev->bitmap_offset;
1529 if (file) get_file(file);
1530 /* read superblock from bitmap file (this sets bitmap->chunksize) */
1531 err = bitmap_read_sb(bitmap);
1532 if (err)
1533 return err;
1534
1535 bitmap->chunkshift = find_first_bit(&bitmap->chunksize,
1536 sizeof(bitmap->chunksize));
1537
1538 /* now that chunksize and chunkshift are set, we can use these macros */
1539 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) /
1540 CHUNK_BLOCK_RATIO(bitmap);
1541 pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
1542
1543 BUG_ON(!pages);
1544
1545 bitmap->chunks = chunks;
1546 bitmap->pages = pages;
1547 bitmap->missing_pages = pages;
1548 bitmap->counter_bits = COUNTER_BITS;
1549
1550 bitmap->syncchunk = ~0UL;
1551
1552#if INJECT_FATAL_FAULT_1
1553 bitmap->bp = NULL;
1554#else
1555 bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
1556#endif
1557 if (!bitmap->bp)
1558 return -ENOMEM;
1559 memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp));
1560
1561 bitmap->flags |= BITMAP_ACTIVE;
1562
1563 /* now that we have some pages available, initialize the in-memory
1564 * bitmap from the on-disk bitmap */
1565 err = bitmap_init_from_disk(bitmap, mddev->recovery_cp == MaxSector);
1566 if (err)
1567 return err;
1568
1569 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
1570 pages, bmname(bitmap));
1571
1572 /* kick off the bitmap daemons */
1573 err = bitmap_start_daemons(bitmap);
1574 if (err)
1575 return err;
1576 return bitmap_update_sb(bitmap);
1577}
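
/*
 * [Editor's note -- worked sizing example; concrete numbers assumed]
 * For a 100 GiB array (209715200 sectors) with a 1 MiB bitmap chunk,
 * assuming 4K pages and 16-bit counters:
 *
 *	CHUNK_BLOCK_RATIO = 1 MiB / 512       = 2048 sectors per chunk
 *	chunks            = 209715200 / 2048  = 102400
 *	on-disk bitmap    = 102400 bits + 256 B sb  (~13 KB)
 *	counter pages     = 102400 / 2048     = 50
 *
 * so the in-memory cost is at most 50 pages of counters, and
 * missing_pages lets them be allocated lazily on first write.
 */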
1578
1579/* the bitmap API -- for raid personalities */
1580EXPORT_SYMBOL(bitmap_startwrite);
1581EXPORT_SYMBOL(bitmap_endwrite);
1582EXPORT_SYMBOL(bitmap_start_sync);
1583EXPORT_SYMBOL(bitmap_end_sync);
1584EXPORT_SYMBOL(bitmap_unplug);
1585EXPORT_SYMBOL(bitmap_close_sync);
1586EXPORT_SYMBOL(bitmap_daemon_work);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0dd6c2b5391b..d0a4bab220e5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -704,8 +704,7 @@ static void crypt_dtr(struct dm_target *ti)
 	mempool_destroy(cc->page_pool);
 	mempool_destroy(cc->io_pool);
 
-	if (cc->iv_mode)
-		kfree(cc->iv_mode);
+	kfree(cc->iv_mode);
 	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
 		cc->iv_gen_ops->dtr(cc);
 	crypto_free_tfm(cc->tfm);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 1e97b3c12bd5..0c1b8520ef86 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -985,6 +985,9 @@ static int do_end_io(struct multipath *m, struct bio *bio,
 	if (!error)
 		return 0;	/* I/O complete */
 
+	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+		return error;
+
 	spin_lock(&m->lock);
 	if (!m->nr_valid_paths) {
 		if (!m->queue_if_no_path || m->suspended) {
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 161e9aa87291..8d740013d74d 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -217,8 +217,7 @@ static int linear_run (mddev_t *mddev)
 	return 0;
 
 out:
-	if (conf)
-		kfree(conf);
+	kfree(conf);
 	return 1;
 }
 
@@ -269,9 +268,8 @@ static int linear_make_request (request_queue_t *q, struct bio *bio)
 		 * split it.
 		 */
 		struct bio_pair *bp;
 		bp = bio_split(bio, bio_split_pool,
-			(bio->bi_sector + (bio->bi_size >> 9) -
-			 (tmp_dev->offset + tmp_dev->size))<<1);
+			((tmp_dev->offset + tmp_dev->size)<<1) - bio->bi_sector);
 		if (linear_make_request(q, &bp->bio1))
 			generic_make_request(&bp->bio1);
 		if (linear_make_request(q, &bp->bio2))
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d899204d3743..0c6b5b6baff6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -19,6 +19,9 @@
 
    Neil Brown <neilb@cse.unsw.edu.au>.
 
+   - persistent bitmap code
+     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2, or (at your option)
@@ -33,6 +36,7 @@
 #include <linux/config.h>
 #include <linux/linkage.h>
 #include <linux/raid/md.h>
+#include <linux/raid/bitmap.h>
 #include <linux/sysctl.h>
 #include <linux/devfs_fs_kernel.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
@@ -40,6 +44,8 @@
 
 #include <linux/init.h>
 
+#include <linux/file.h>
+
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
 #endif
@@ -189,8 +195,7 @@ static mddev_t * mddev_find(dev_t unit)
 		if (mddev->unit == unit) {
 			mddev_get(mddev);
 			spin_unlock(&all_mddevs_lock);
-			if (new)
-				kfree(new);
+			kfree(new);
 			return mddev;
 		}
 
@@ -218,6 +223,8 @@ static mddev_t * mddev_find(dev_t unit)
 	INIT_LIST_HEAD(&new->all_mddevs);
 	init_timer(&new->safemode_timer);
 	atomic_set(&new->active, 1);
+	spin_lock_init(&new->write_lock);
+	init_waitqueue_head(&new->sb_wait);
 
 	new->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!new->queue) {
@@ -320,6 +327,40 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 }
 
 
+static int super_written(struct bio *bio, unsigned int bytes_done, int error)
+{
+	mdk_rdev_t *rdev = bio->bi_private;
+	if (bio->bi_size)
+		return 1;
+
+	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
+		md_error(rdev->mddev, rdev);
+
+	if (atomic_dec_and_test(&rdev->mddev->pending_writes))
+		wake_up(&rdev->mddev->sb_wait);
+	return 0;
+}
+
+void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+		    sector_t sector, int size, struct page *page)
+{
+	/* write first size bytes of page to sector of rdev
+	 * Increment mddev->pending_writes before returning
+	 * and decrement it on completion, waking up sb_wait
+	 * if zero is reached.
+	 * If an error occurred, call md_error
+	 */
+	struct bio *bio = bio_alloc(GFP_NOIO, 1);
+
+	bio->bi_bdev = rdev->bdev;
+	bio->bi_sector = sector;
+	bio_add_page(bio, page, size, 0);
+	bio->bi_private = rdev;
+	bio->bi_end_io = super_written;
+	atomic_inc(&mddev->pending_writes);
+	submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio);
+}
+
 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
 {
 	if (bio->bi_size)
@@ -329,7 +370,7 @@ static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
 	return 0;
 }
 
-static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 		   struct page *page, int rw)
 {
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
@@ -416,11 +457,8 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 	ret = 1;
 
 abort:
-	if (tmp1)
-		kfree(tmp1);
-	if (tmp2)
-		kfree(tmp2);
-
+	kfree(tmp1);
+	kfree(tmp2);
 	return ret;
 }
 
@@ -569,6 +607,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	mdp_disk_t *desc;
 	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
 
+	rdev->raid_disk = -1;
+	rdev->in_sync = 0;
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 0;
 		mddev->minor_version = sb->minor_version;
@@ -599,16 +639,35 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
 
 		mddev->max_disks = MD_SB_DISKS;
-	} else {
-		__u64 ev1;
-		ev1 = md_event(sb);
+
+		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
+		    mddev->bitmap_file == NULL) {
+			if (mddev->level != 1) {
+				/* FIXME use a better test */
+				printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
+				return -EINVAL;
+			}
+			mddev->bitmap_offset = (MD_SB_BYTES >> 9);
+		}
+
+	} else if (mddev->pers == NULL) {
+		/* Insist on good event counter while assembling */
+		__u64 ev1 = md_event(sb);
 		++ev1;
 		if (ev1 < mddev->events)
 			return -EINVAL;
-	}
+	} else if (mddev->bitmap) {
+		/* if adding to array with a bitmap, then we can accept an
+		 * older device ... but not too old.
+		 */
+		__u64 ev1 = md_event(sb);
+		if (ev1 < mddev->bitmap->events_cleared)
+			return 0;
+	} else /* just a hot-add of a new device, leave raid_disk at -1 */
+		return 0;
+
 	if (mddev->level != LEVEL_MULTIPATH) {
-		rdev->raid_disk = -1;
-		rdev->in_sync = rdev->faulty = 0;
+		rdev->faulty = 0;
 		desc = sb->disks + rdev->desc_nr;
 
 		if (desc->state & (1<<MD_DISK_FAULTY))
@@ -618,7 +677,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
618 rdev->in_sync = 1; 677 rdev->in_sync = 1;
619 rdev->raid_disk = desc->raid_disk; 678 rdev->raid_disk = desc->raid_disk;
620 } 679 }
621 } 680 } else /* MULTIPATH are always insync */
681 rdev->in_sync = 1;
622 return 0; 682 return 0;
623} 683}
624 684
@@ -683,6 +743,9 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
683 sb->layout = mddev->layout; 743 sb->layout = mddev->layout;
684 sb->chunk_size = mddev->chunk_size; 744 sb->chunk_size = mddev->chunk_size;
685 745
746 if (mddev->bitmap && mddev->bitmap_file == NULL)
747 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
748
686 sb->disks[0].state = (1<<MD_DISK_REMOVED); 749 sb->disks[0].state = (1<<MD_DISK_REMOVED);
687 ITERATE_RDEV(mddev,rdev2,tmp) { 750 ITERATE_RDEV(mddev,rdev2,tmp) {
688 mdp_disk_t *d; 751 mdp_disk_t *d;
@@ -780,7 +843,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
780 case 0: 843 case 0:
781 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 844 sb_offset = rdev->bdev->bd_inode->i_size >> 9;
782 sb_offset -= 8*2; 845 sb_offset -= 8*2;
783 sb_offset &= ~(4*2-1); 846 sb_offset &= ~(sector_t)(4*2-1);
784 /* convert from sectors to K */ 847 /* convert from sectors to K */
785 sb_offset /= 2; 848 sb_offset /= 2;
786 break; 849 break;
@@ -860,6 +923,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
860{ 923{
861 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 924 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
862 925
926 rdev->raid_disk = -1;
927 rdev->in_sync = 0;
863 if (mddev->raid_disks == 0) { 928 if (mddev->raid_disks == 0) {
864 mddev->major_version = 1; 929 mddev->major_version = 1;
865 mddev->patch_version = 0; 930 mddev->patch_version = 0;
@@ -877,13 +942,30 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
877 memcpy(mddev->uuid, sb->set_uuid, 16); 942 memcpy(mddev->uuid, sb->set_uuid, 16);
878 943
879 mddev->max_disks = (4096-256)/2; 944 mddev->max_disks = (4096-256)/2;
880 } else { 945
881 __u64 ev1; 946 if ((le32_to_cpu(sb->feature_map) & 1) &&
882 ev1 = le64_to_cpu(sb->events); 947 mddev->bitmap_file == NULL ) {
948 if (mddev->level != 1) {
949 printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
950 return -EINVAL;
951 }
952 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
953 }
954 } else if (mddev->pers == NULL) {
955 /* Insist on good event counter while assembling */
956 __u64 ev1 = le64_to_cpu(sb->events);
883 ++ev1; 957 ++ev1;
884 if (ev1 < mddev->events) 958 if (ev1 < mddev->events)
885 return -EINVAL; 959 return -EINVAL;
886 } 960 } else if (mddev->bitmap) {
961 /* If adding to array with a bitmap, then we can accept an
962 * older device, but not too old.
963 */
964 __u64 ev1 = le64_to_cpu(sb->events);
965 if (ev1 < mddev->bitmap->events_cleared)
966 return 0;
967 } else /* just a hot-add of a new device, leave raid_disk at -1 */
968 return 0;
887 969
888 if (mddev->level != LEVEL_MULTIPATH) { 970 if (mddev->level != LEVEL_MULTIPATH) {
889 int role; 971 int role;
@@ -891,14 +973,10 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
891 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 973 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
892 switch(role) { 974 switch(role) {
893 case 0xffff: /* spare */ 975 case 0xffff: /* spare */
894 rdev->in_sync = 0;
895 rdev->faulty = 0; 976 rdev->faulty = 0;
896 rdev->raid_disk = -1;
897 break; 977 break;
898 case 0xfffe: /* faulty */ 978 case 0xfffe: /* faulty */
899 rdev->in_sync = 0;
900 rdev->faulty = 1; 979 rdev->faulty = 1;
901 rdev->raid_disk = -1;
902 break; 980 break;
903 default: 981 default:
904 rdev->in_sync = 1; 982 rdev->in_sync = 1;
@@ -906,7 +984,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
906 rdev->raid_disk = role; 984 rdev->raid_disk = role;
907 break; 985 break;
908 } 986 }
909 } 987 } else /* MULTIPATH are always insync */
988 rdev->in_sync = 1;
989
910 return 0; 990 return 0;
911} 991}
912 992
@@ -933,6 +1013,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
933 else 1013 else
934 sb->resync_offset = cpu_to_le64(0); 1014 sb->resync_offset = cpu_to_le64(0);
935 1015
1016 if (mddev->bitmap && mddev->bitmap_file == NULL) {
1017 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1018 sb->feature_map = cpu_to_le32(1);
1019 }
1020
936 max_dev = 0; 1021 max_dev = 0;
937 ITERATE_RDEV(mddev,rdev2,tmp) 1022 ITERATE_RDEV(mddev,rdev2,tmp)
938 if (rdev2->desc_nr+1 > max_dev) 1023 if (rdev2->desc_nr+1 > max_dev)
@@ -1196,8 +1281,11 @@ void md_print_devices(void)
1196 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1281 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
1197 printk("md: **********************************\n"); 1282 printk("md: **********************************\n");
1198 ITERATE_MDDEV(mddev,tmp) { 1283 ITERATE_MDDEV(mddev,tmp) {
1199 printk("%s: ", mdname(mddev));
1200 1284
1285 if (mddev->bitmap)
1286 bitmap_print_sb(mddev->bitmap);
1287 else
1288 printk("%s: ", mdname(mddev));
1201 ITERATE_RDEV(mddev,rdev,tmp2) 1289 ITERATE_RDEV(mddev,rdev,tmp2)
1202 printk("<%s>", bdevname(rdev->bdev,b)); 1290 printk("<%s>", bdevname(rdev->bdev,b));
1203 printk("\n"); 1291 printk("\n");
@@ -1210,30 +1298,6 @@ void md_print_devices(void)
1210} 1298}
1211 1299
1212 1300
1213static int write_disk_sb(mdk_rdev_t * rdev)
1214{
1215 char b[BDEVNAME_SIZE];
1216 if (!rdev->sb_loaded) {
1217 MD_BUG();
1218 return 1;
1219 }
1220 if (rdev->faulty) {
1221 MD_BUG();
1222 return 1;
1223 }
1224
1225 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1226 bdevname(rdev->bdev,b),
1227 (unsigned long long)rdev->sb_offset);
1228
1229 if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
1230 return 0;
1231
1232 printk("md: write_disk_sb failed for device %s\n",
1233 bdevname(rdev->bdev,b));
1234 return 1;
1235}
1236
1237static void sync_sbs(mddev_t * mddev) 1301static void sync_sbs(mddev_t * mddev)
1238{ 1302{
1239 mdk_rdev_t *rdev; 1303 mdk_rdev_t *rdev;
@@ -1248,12 +1312,14 @@ static void sync_sbs(mddev_t * mddev)
1248 1312
1249static void md_update_sb(mddev_t * mddev) 1313static void md_update_sb(mddev_t * mddev)
1250{ 1314{
1251 int err, count = 100; 1315 int err;
1252 struct list_head *tmp; 1316 struct list_head *tmp;
1253 mdk_rdev_t *rdev; 1317 mdk_rdev_t *rdev;
1318 int sync_req;
1254 1319
1255 mddev->sb_dirty = 0;
1256repeat: 1320repeat:
1321 spin_lock(&mddev->write_lock);
1322 sync_req = mddev->in_sync;
1257 mddev->utime = get_seconds(); 1323 mddev->utime = get_seconds();
1258 mddev->events ++; 1324 mddev->events ++;
1259 1325
@@ -1266,20 +1332,26 @@ repeat:
1266 MD_BUG(); 1332 MD_BUG();
1267 mddev->events --; 1333 mddev->events --;
1268 } 1334 }
1335 mddev->sb_dirty = 2;
1269 sync_sbs(mddev); 1336 sync_sbs(mddev);
1270 1337
1271 /* 1338 /*
1272 * do not write anything to disk if using 1339 * do not write anything to disk if using
1273 * nonpersistent superblocks 1340 * nonpersistent superblocks
1274 */ 1341 */
1275 if (!mddev->persistent) 1342 if (!mddev->persistent) {
1343 mddev->sb_dirty = 0;
1344 spin_unlock(&mddev->write_lock);
1345 wake_up(&mddev->sb_wait);
1276 return; 1346 return;
1347 }
1348 spin_unlock(&mddev->write_lock);
1277 1349
1278 dprintk(KERN_INFO 1350 dprintk(KERN_INFO
1279 "md: updating %s RAID superblock on device (in sync %d)\n", 1351 "md: updating %s RAID superblock on device (in sync %d)\n",
1280 mdname(mddev),mddev->in_sync); 1352 mdname(mddev),mddev->in_sync);
1281 1353
1282 err = 0; 1354 err = bitmap_update_sb(mddev->bitmap);
1283 ITERATE_RDEV(mddev,rdev,tmp) { 1355 ITERATE_RDEV(mddev,rdev,tmp) {
1284 char b[BDEVNAME_SIZE]; 1356 char b[BDEVNAME_SIZE];
1285 dprintk(KERN_INFO "md: "); 1357 dprintk(KERN_INFO "md: ");
@@ -1288,22 +1360,32 @@ repeat:
1288 1360
1289 dprintk("%s ", bdevname(rdev->bdev,b)); 1361 dprintk("%s ", bdevname(rdev->bdev,b));
1290 if (!rdev->faulty) { 1362 if (!rdev->faulty) {
1291 err += write_disk_sb(rdev); 1363 md_super_write(mddev,rdev,
1364 rdev->sb_offset<<1, MD_SB_BYTES,
1365 rdev->sb_page);
1366 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1367 bdevname(rdev->bdev,b),
1368 (unsigned long long)rdev->sb_offset);
1369
1292 } else 1370 } else
1293 dprintk(")\n"); 1371 dprintk(")\n");
1294 if (!err && mddev->level == LEVEL_MULTIPATH) 1372 if (mddev->level == LEVEL_MULTIPATH)
1295 /* only need to write one superblock... */ 1373 /* only need to write one superblock... */
1296 break; 1374 break;
1297 } 1375 }
1298 if (err) { 1376 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
1299 if (--count) { 1377 /* if there was a failure, sb_dirty was set to 1, and we re-write super */
1300 printk(KERN_ERR "md: errors occurred during superblock" 1378
1301 " update, repeating\n"); 1379 spin_lock(&mddev->write_lock);
1302 goto repeat; 1380 if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
1303 } 1381 /* have to write it out again */
1304 printk(KERN_ERR \ 1382 spin_unlock(&mddev->write_lock);
1305 "md: excessive errors occurred during superblock update, exiting\n"); 1383 goto repeat;
1306 } 1384 }
1385 mddev->sb_dirty = 0;
1386 spin_unlock(&mddev->write_lock);
1387 wake_up(&mddev->sb_wait);
1388
1307} 1389}
1308 1390
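The reworked md_update_sb() is a snapshot-and-recheck loop: it records in_sync under write_lock, drops the lock while the superblock writes are in flight, then retakes it and goes around again if the snapshot went stale or a write failure set sb_dirty back to 1. Sketched in userspace C (the flag values mirror the patch; everything else is an illustrative stand-in):

/* "snapshot, write without the lock, re-check, repeat" */
#include <pthread.h>

static pthread_mutex_t write_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t sb_wait = PTHREAD_COND_INITIALIZER;
static int in_sync, sb_dirty;

static void write_superblocks(void)
{
    /* stands in for the md_super_write() loop plus the
     * wait_event(sb_wait, pending_writes == 0) that follows it */
}

static void update_sb(void)
{
    int sync_req;

repeat:
    pthread_mutex_lock(&write_lock);
    sync_req = in_sync;              /* snapshot the state being written */
    sb_dirty = 2;                    /* 2: a write is in progress */
    pthread_mutex_unlock(&write_lock);

    write_superblocks();             /* IO runs without the lock held */

    pthread_mutex_lock(&write_lock);
    if (in_sync != sync_req || sb_dirty == 1) {
        /* state changed underneath us, or a failure re-dirtied it */
        pthread_mutex_unlock(&write_lock);
        goto repeat;
    }
    sb_dirty = 0;
    pthread_cond_broadcast(&sb_wait);  /* release md_write_start() waiters */
    pthread_mutex_unlock(&write_lock);
}

int main(void)
{
    update_sb();
    return 0;
}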
1309/* 1391/*
@@ -1607,12 +1689,19 @@ static int do_md_run(mddev_t * mddev)
1607 1689
1608 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 1690 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
1609 1691
1610 err = mddev->pers->run(mddev); 1692 /* before we start the array running, initialise the bitmap */
1693 err = bitmap_create(mddev);
1694 if (err)
1695 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
1696 mdname(mddev), err);
1697 else
1698 err = mddev->pers->run(mddev);
1611 if (err) { 1699 if (err) {
1612 printk(KERN_ERR "md: pers->run() failed ...\n"); 1700 printk(KERN_ERR "md: pers->run() failed ...\n");
1613 module_put(mddev->pers->owner); 1701 module_put(mddev->pers->owner);
1614 mddev->pers = NULL; 1702 mddev->pers = NULL;
1615 return -EINVAL; 1703 bitmap_destroy(mddev);
1704 return err;
1616 } 1705 }
1617 atomic_set(&mddev->writes_pending,0); 1706 atomic_set(&mddev->writes_pending,0);
1618 mddev->safemode = 0; 1707 mddev->safemode = 0;
@@ -1725,6 +1814,14 @@ static int do_md_stop(mddev_t * mddev, int ro)
1725 if (ro) 1814 if (ro)
1726 set_disk_ro(disk, 1); 1815 set_disk_ro(disk, 1);
1727 } 1816 }
1817
1818 bitmap_destroy(mddev);
1819 if (mddev->bitmap_file) {
1820 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1);
1821 fput(mddev->bitmap_file);
1822 mddev->bitmap_file = NULL;
1823 }
1824
1728 /* 1825 /*
1729 * Free resources if final stop 1826 * Free resources if final stop
1730 */ 1827 */
@@ -1983,6 +2080,42 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
1983 return 0; 2080 return 0;
1984} 2081}
1985 2082
2083static int get_bitmap_file(mddev_t * mddev, void * arg)
2084{
2085 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
2086 char *ptr, *buf = NULL;
2087 int err = -ENOMEM;
2088
2089 file = kmalloc(sizeof(*file), GFP_KERNEL);
2090 if (!file)
2091 goto out;
2092
2093 /* bitmap disabled, zero the first byte and copy out */
2094 if (!mddev->bitmap || !mddev->bitmap->file) {
2095 file->pathname[0] = '\0';
2096 goto copy_out;
2097 }
2098
2099 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
2100 if (!buf)
2101 goto out;
2102
2103 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
2104 if (!ptr)
2105 goto out;
2106
2107 strcpy(file->pathname, ptr);
2108
2109copy_out:
2110 err = 0;
2111 if (copy_to_user(arg, file, sizeof(*file)))
2112 err = -EFAULT;
2113out:
2114 kfree(buf);
2115 kfree(file);
2116 return err;
2117}
2118
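get_bitmap_file() heap-allocates its reply because mdu_bitmap_file_t is too large for a kernel stack frame, and routes every exit through one label so both buffers are released exactly once (safe even when an allocation failed, since kfree(NULL) is a no-op). The idiom in plain C, with a made-up reply structure standing in for mdu_bitmap_file_t:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct big_reply { char pathname[4096]; };       /* illustrative stand-in */

static int fill_reply(void *dst, size_t len)
{
    struct big_reply *file = NULL;               /* too big for the stack */
    char *buf = NULL;
    int err = -ENOMEM;

    file = malloc(sizeof(*file));
    if (!file)
        goto out;
    buf = malloc(sizeof(file->pathname));        /* scratch for path lookup */
    if (!buf)
        goto out;

    snprintf(file->pathname, sizeof(file->pathname), "/tmp/bitmap");
    err = 0;
    if (len >= sizeof(*file))
        memcpy(dst, file, sizeof(*file));        /* stands in for copy_to_user() */
    else
        err = -EFAULT;
out:
    free(buf);                                   /* single exit: both freed once */
    free(file);
    return err;
}

int main(void)
{
    static struct big_reply reply;               /* static: off the stack here too */
    return fill_reply(&reply, sizeof(reply)) ? 1 : 0;
}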
1986static int get_disk_info(mddev_t * mddev, void __user * arg) 2119static int get_disk_info(mddev_t * mddev, void __user * arg)
1987{ 2120{
1988 mdu_disk_info_t info; 2121 mdu_disk_info_t info;
@@ -2078,11 +2211,25 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2078 PTR_ERR(rdev)); 2211 PTR_ERR(rdev));
2079 return PTR_ERR(rdev); 2212 return PTR_ERR(rdev);
2080 } 2213 }
2214 /* set save_raid_disk if appropriate */
2215 if (!mddev->persistent) {
2216 if (info->state & (1<<MD_DISK_SYNC) &&
2217 info->raid_disk < mddev->raid_disks)
2218 rdev->raid_disk = info->raid_disk;
2219 else
2220 rdev->raid_disk = -1;
2221 } else
2222 super_types[mddev->major_version].
2223 validate_super(mddev, rdev);
2224 rdev->saved_raid_disk = rdev->raid_disk;
2225
2081 rdev->in_sync = 0; /* just to be sure */ 2226 rdev->in_sync = 0; /* just to be sure */
2082 rdev->raid_disk = -1; 2227 rdev->raid_disk = -1;
2083 err = bind_rdev_to_array(rdev, mddev); 2228 err = bind_rdev_to_array(rdev, mddev);
2084 if (err) 2229 if (err)
2085 export_rdev(rdev); 2230 export_rdev(rdev);
2231
2232 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2086 if (mddev->thread) 2233 if (mddev->thread)
2087 md_wakeup_thread(mddev->thread); 2234 md_wakeup_thread(mddev->thread);
2088 return err; 2235 return err;
@@ -2256,6 +2403,49 @@ abort_export:
2256 return err; 2403 return err;
2257} 2404}
2258 2405
2406/* similar to deny_write_access, but accounts for our holding a reference
2407 * to the file ourselves */
2408static int deny_bitmap_write_access(struct file * file)
2409{
2410 struct inode *inode = file->f_mapping->host;
2411
2412 spin_lock(&inode->i_lock);
2413 if (atomic_read(&inode->i_writecount) > 1) {
2414 spin_unlock(&inode->i_lock);
2415 return -ETXTBSY;
2416 }
2417 atomic_set(&inode->i_writecount, -1);
2418 spin_unlock(&inode->i_lock);
2419
2420 return 0;
2421}
2422
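deny_bitmap_write_access() borrows the i_writecount convention from the VFS's deny_write_access(): a positive count is the number of writers, and parking it at -1 refuses new ones. The comparison is against > 1 rather than > 0 because md itself already holds one reference from fget(). A userspace model of the latch, with a mutex-guarded counter standing in for the inode field and i_lock:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_lock = PTHREAD_MUTEX_INITIALIZER;
static int i_writecount = 1;            /* 1: our own fget() reference */

static int deny_bitmap_write_access(void)
{
    int err = 0;

    pthread_mutex_lock(&i_lock);
    if (i_writecount > 1)               /* someone besides us can write */
        err = -ETXTBSY;
    else
        i_writecount = -1;              /* latched: writers now refused */
    pthread_mutex_unlock(&i_lock);
    return err;
}

static int get_write_access(void)       /* what an ordinary writer does */
{
    int err = 0;

    pthread_mutex_lock(&i_lock);
    if (i_writecount < 0)
        err = -ETXTBSY;
    else
        i_writecount++;
    pthread_mutex_unlock(&i_lock);
    return err;
}

int main(void)
{
    printf("deny:  %d\n", deny_bitmap_write_access());  /* 0: latch taken */
    printf("write: %d\n", get_write_access());          /* -ETXTBSY now */
    return 0;
}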
2423static int set_bitmap_file(mddev_t *mddev, int fd)
2424{
2425 int err;
2426
2427 if (mddev->pers)
2428 return -EBUSY;
2429
2430 mddev->bitmap_file = fget(fd);
2431
2432 if (mddev->bitmap_file == NULL) {
2433 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
2434 mdname(mddev));
2435 return -EBADF;
2436 }
2437
2438 err = deny_bitmap_write_access(mddev->bitmap_file);
2439 if (err) {
2440 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
2441 mdname(mddev));
2442 fput(mddev->bitmap_file);
2443 mddev->bitmap_file = NULL;
2444 } else
2445 mddev->bitmap_offset = 0; /* file overrides offset */
2446 return err;
2447}
2448
2259/* 2449/*
2260 * set_array_info is used two different ways 2450 * set_array_info is used two different ways
2261 * The original usage is when creating a new array. 2451 * The original usage is when creating a new array.
@@ -2567,8 +2757,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
2567 /* 2757 /*
2568 * Commands querying/configuring an existing array: 2758 * Commands querying/configuring an existing array:
2569 */ 2759 */
2570 /* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ 2760 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
2571 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { 2761 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
2762 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
2763 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
2572 err = -ENODEV; 2764 err = -ENODEV;
2573 goto abort_unlock; 2765 goto abort_unlock;
2574 } 2766 }
@@ -2582,6 +2774,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
2582 err = get_array_info(mddev, argp); 2774 err = get_array_info(mddev, argp);
2583 goto done_unlock; 2775 goto done_unlock;
2584 2776
2777 case GET_BITMAP_FILE:
2778 err = get_bitmap_file(mddev, (void *)arg);
2779 goto done_unlock;
2780
2585 case GET_DISK_INFO: 2781 case GET_DISK_INFO:
2586 err = get_disk_info(mddev, argp); 2782 err = get_disk_info(mddev, argp);
2587 goto done_unlock; 2783 goto done_unlock;
@@ -2662,6 +2858,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
2662 err = do_md_run (mddev); 2858 err = do_md_run (mddev);
2663 goto done_unlock; 2859 goto done_unlock;
2664 2860
2861 case SET_BITMAP_FILE:
2862 err = set_bitmap_file(mddev, (int)arg);
2863 goto done_unlock;
2864
2665 default: 2865 default:
2666 if (_IOC_TYPE(cmd) == MD_MAJOR) 2866 if (_IOC_TYPE(cmd) == MD_MAJOR)
2667 printk(KERN_WARNING "md: %s(pid %d) used" 2867 printk(KERN_WARNING "md: %s(pid %d) used"
@@ -2773,8 +2973,9 @@ static int md_thread(void * arg)
2773 while (thread->run) { 2973 while (thread->run) {
2774 void (*run)(mddev_t *); 2974 void (*run)(mddev_t *);
2775 2975
2776 wait_event_interruptible(thread->wqueue, 2976 wait_event_interruptible_timeout(thread->wqueue,
2777 test_bit(THREAD_WAKEUP, &thread->flags)); 2977 test_bit(THREAD_WAKEUP, &thread->flags),
2978 thread->timeout);
2778 if (current->flags & PF_FREEZE) 2979 if (current->flags & PF_FREEZE)
2779 refrigerator(PF_FREEZE); 2980 refrigerator(PF_FREEZE);
2780 2981
@@ -2820,6 +3021,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
2820 thread->run = run; 3021 thread->run = run;
2821 thread->mddev = mddev; 3022 thread->mddev = mddev;
2822 thread->name = name; 3023 thread->name = name;
3024 thread->timeout = MAX_SCHEDULE_TIMEOUT;
2823 ret = kernel_thread(md_thread, thread, 0); 3025 ret = kernel_thread(md_thread, thread, 0);
2824 if (ret < 0) { 3026 if (ret < 0) {
2825 kfree(thread); 3027 kfree(thread);
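Switching md_thread() from wait_event_interruptible() to wait_event_interruptible_timeout() turns the thread into an optionally periodic daemon: with the default timeout of MAX_SCHEDULE_TIMEOUT nothing changes, but raid1's run() (below) lowers thread->timeout to bitmap->daemon_sleep * HZ so bitmap_daemon_work() gets called even when nobody rings md_wakeup_thread(). A rough pthread model of the wake-or-timeout loop (names illustrative):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wqueue = PTHREAD_COND_INITIALIZER;
static bool wakeup;                     /* models THREAD_WAKEUP */

static void md_thread_iteration(long timeout_sec)
{
    struct timespec ts;

    clock_gettime(CLOCK_REALTIME, &ts);
    ts.tv_sec += timeout_sec;           /* models thread->timeout */

    pthread_mutex_lock(&lock);
    while (!wakeup)                     /* flag set, or deadline reached */
        if (pthread_cond_timedwait(&wqueue, &lock, &ts) == ETIMEDOUT)
            break;
    wakeup = false;
    pthread_mutex_unlock(&lock);

    /* run() is invoked either way; on a timeout, md_check_recovery()
     * still gets to call bitmap_daemon_work() */
    printf("thread ran (woken or timed out)\n");
}

int main(void)
{
    md_thread_iteration(1);             /* nobody sets the flag: runs after ~1s */
    return 0;
}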
@@ -2858,13 +3060,13 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
2858 3060
2859 if (!rdev || rdev->faulty) 3061 if (!rdev || rdev->faulty)
2860 return; 3062 return;
2861 3063/*
2862 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 3064 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
2863 mdname(mddev), 3065 mdname(mddev),
2864 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 3066 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
2865 __builtin_return_address(0),__builtin_return_address(1), 3067 __builtin_return_address(0),__builtin_return_address(1),
2866 __builtin_return_address(2),__builtin_return_address(3)); 3068 __builtin_return_address(2),__builtin_return_address(3));
2867 3069*/
2868 if (!mddev->pers->error_handler) 3070 if (!mddev->pers->error_handler)
2869 return; 3071 return;
2870 mddev->pers->error_handler(mddev,rdev); 3072 mddev->pers->error_handler(mddev,rdev);
@@ -3018,6 +3220,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
3018 struct list_head *tmp2; 3220 struct list_head *tmp2;
3019 mdk_rdev_t *rdev; 3221 mdk_rdev_t *rdev;
3020 int i; 3222 int i;
3223 struct bitmap *bitmap;
3021 3224
3022 if (v == (void*)1) { 3225 if (v == (void*)1) {
3023 seq_printf(seq, "Personalities : "); 3226 seq_printf(seq, "Personalities : ");
@@ -3070,10 +3273,35 @@ static int md_seq_show(struct seq_file *seq, void *v)
3070 if (mddev->pers) { 3273 if (mddev->pers) {
3071 mddev->pers->status (seq, mddev); 3274 mddev->pers->status (seq, mddev);
3072 seq_printf(seq, "\n "); 3275 seq_printf(seq, "\n ");
3073 if (mddev->curr_resync > 2) 3276 if (mddev->curr_resync > 2) {
3074 status_resync (seq, mddev); 3277 status_resync (seq, mddev);
3075 else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 3278 seq_printf(seq, "\n ");
3076 seq_printf(seq, " resync=DELAYED"); 3279 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
3280 seq_printf(seq, " resync=DELAYED\n ");
3281 } else
3282 seq_printf(seq, "\n ");
3283
3284 if ((bitmap = mddev->bitmap)) {
3285 unsigned long chunk_kb;
3286 unsigned long flags;
3287 spin_lock_irqsave(&bitmap->lock, flags);
3288 chunk_kb = bitmap->chunksize >> 10;
3289 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
3290 "%lu%s chunk",
3291 bitmap->pages - bitmap->missing_pages,
3292 bitmap->pages,
3293 (bitmap->pages - bitmap->missing_pages)
3294 << (PAGE_SHIFT - 10),
3295 chunk_kb ? chunk_kb : bitmap->chunksize,
3296 chunk_kb ? "KB" : "B");
3297 if (bitmap->file) {
3298 seq_printf(seq, ", file: ");
3299 seq_path(seq, bitmap->file->f_vfsmnt,
3300 bitmap->file->f_dentry," \t\n");
3301 }
3302
3303 seq_printf(seq, "\n");
3304 spin_unlock_irqrestore(&bitmap->lock, flags);
3077 } 3305 }
3078 3306
3079 seq_printf(seq, "\n"); 3307 seq_printf(seq, "\n");
@@ -3176,19 +3404,28 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
3176} 3404}
3177 3405
3178 3406
3179void md_write_start(mddev_t *mddev) 3407/* md_write_start(mddev, bi)
3408 * If we need to update some array metadata (e.g. 'active' flag
3409 * in superblock) before writing, schedule a superblock update
3410 * and wait for it to complete.
3411 */
3412void md_write_start(mddev_t *mddev, struct bio *bi)
3180{ 3413{
3181 if (!atomic_read(&mddev->writes_pending)) { 3414 DEFINE_WAIT(w);
3182 mddev_lock_uninterruptible(mddev); 3415 if (bio_data_dir(bi) != WRITE)
3416 return;
3417
3418 atomic_inc(&mddev->writes_pending);
3419 if (mddev->in_sync) {
3420 spin_lock(&mddev->write_lock);
3183 if (mddev->in_sync) { 3421 if (mddev->in_sync) {
3184 mddev->in_sync = 0; 3422 mddev->in_sync = 0;
3185 del_timer(&mddev->safemode_timer); 3423 mddev->sb_dirty = 1;
3186 md_update_sb(mddev); 3424 md_wakeup_thread(mddev->thread);
3187 } 3425 }
3188 atomic_inc(&mddev->writes_pending); 3426 spin_unlock(&mddev->write_lock);
3189 mddev_unlock(mddev); 3427 }
3190 } else 3428 wait_event(mddev->sb_wait, mddev->sb_dirty==0);
3191 atomic_inc(&mddev->writes_pending);
3192} 3429}
3193 3430
3194void md_write_end(mddev_t *mddev) 3431void md_write_end(mddev_t *mddev)
@@ -3201,37 +3438,6 @@ void md_write_end(mddev_t *mddev)
3201 } 3438 }
3202} 3439}
3203 3440
3204static inline void md_enter_safemode(mddev_t *mddev)
3205{
3206 if (!mddev->safemode) return;
3207 if (mddev->safemode == 2 &&
3208 (atomic_read(&mddev->writes_pending) || mddev->in_sync ||
3209 mddev->recovery_cp != MaxSector))
3210 return; /* avoid the lock */
3211 mddev_lock_uninterruptible(mddev);
3212 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
3213 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
3214 mddev->in_sync = 1;
3215 md_update_sb(mddev);
3216 }
3217 mddev_unlock(mddev);
3218
3219 if (mddev->safemode == 1)
3220 mddev->safemode = 0;
3221}
3222
3223void md_handle_safemode(mddev_t *mddev)
3224{
3225 if (signal_pending(current)) {
3226 printk(KERN_INFO "md: %s in immediate safe mode\n",
3227 mdname(mddev));
3228 mddev->safemode = 2;
3229 flush_signals(current);
3230 }
3231 md_enter_safemode(mddev);
3232}
3233
3234
3235static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 3441static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3236 3442
3237#define SYNC_MARKS 10 3443#define SYNC_MARKS 10
@@ -3241,12 +3447,13 @@ static void md_do_sync(mddev_t *mddev)
3241 mddev_t *mddev2; 3447 mddev_t *mddev2;
3242 unsigned int currspeed = 0, 3448 unsigned int currspeed = 0,
3243 window; 3449 window;
3244 sector_t max_sectors,j; 3450 sector_t max_sectors,j, io_sectors;
3245 unsigned long mark[SYNC_MARKS]; 3451 unsigned long mark[SYNC_MARKS];
3246 sector_t mark_cnt[SYNC_MARKS]; 3452 sector_t mark_cnt[SYNC_MARKS];
3247 int last_mark,m; 3453 int last_mark,m;
3248 struct list_head *tmp; 3454 struct list_head *tmp;
3249 sector_t last_check; 3455 sector_t last_check;
3456 int skipped = 0;
3250 3457
3251 /* just in case thread restarts... */ 3458 /* just in case thread restarts... */
3252 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 3459 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -3312,7 +3519,7 @@ static void md_do_sync(mddev_t *mddev)
3312 3519
3313 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3520 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3314 /* resync follows the size requested by the personality, 3521 /* resync follows the size requested by the personality,
3315 * which default to physical size, but can be virtual size 3522 * which defaults to physical size, but can be virtual size
3316 */ 3523 */
3317 max_sectors = mddev->resync_max_sectors; 3524 max_sectors = mddev->resync_max_sectors;
3318 else 3525 else
@@ -3327,13 +3534,15 @@ static void md_do_sync(mddev_t *mddev)
3327 sysctl_speed_limit_max); 3534 sysctl_speed_limit_max);
3328 3535
3329 is_mddev_idle(mddev); /* this also initializes IO event counters */ 3536 is_mddev_idle(mddev); /* this also initializes IO event counters */
3330 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3537 /* we don't use the checkpoint if there's a bitmap */
3538 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap)
3331 j = mddev->recovery_cp; 3539 j = mddev->recovery_cp;
3332 else 3540 else
3333 j = 0; 3541 j = 0;
3542 io_sectors = 0;
3334 for (m = 0; m < SYNC_MARKS; m++) { 3543 for (m = 0; m < SYNC_MARKS; m++) {
3335 mark[m] = jiffies; 3544 mark[m] = jiffies;
3336 mark_cnt[m] = j; 3545 mark_cnt[m] = io_sectors;
3337 } 3546 }
3338 last_mark = 0; 3547 last_mark = 0;
3339 mddev->resync_mark = mark[last_mark]; 3548 mddev->resync_mark = mark[last_mark];
@@ -3358,21 +3567,29 @@ static void md_do_sync(mddev_t *mddev)
3358 } 3567 }
3359 3568
3360 while (j < max_sectors) { 3569 while (j < max_sectors) {
3361 int sectors; 3570 sector_t sectors;
3362 3571
3363 sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); 3572 skipped = 0;
3364 if (sectors < 0) { 3573 sectors = mddev->pers->sync_request(mddev, j, &skipped,
3574 currspeed < sysctl_speed_limit_min);
3575 if (sectors == 0) {
3365 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3576 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
3366 goto out; 3577 goto out;
3367 } 3578 }
3368 atomic_add(sectors, &mddev->recovery_active); 3579
3580 if (!skipped) { /* actual IO requested */
3581 io_sectors += sectors;
3582 atomic_add(sectors, &mddev->recovery_active);
3583 }
3584
3369 j += sectors; 3585 j += sectors;
3370 if (j>1) mddev->curr_resync = j; 3586 if (j>1) mddev->curr_resync = j;
3371 3587
3372 if (last_check + window > j || j == max_sectors) 3588
3589 if (last_check + window > io_sectors || j == max_sectors)
3373 continue; 3590 continue;
3374 3591
3375 last_check = j; 3592 last_check = io_sectors;
3376 3593
3377 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 3594 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
3378 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 3595 test_bit(MD_RECOVERY_ERR, &mddev->recovery))
@@ -3386,7 +3603,7 @@ static void md_do_sync(mddev_t *mddev)
3386 mddev->resync_mark = mark[next]; 3603 mddev->resync_mark = mark[next];
3387 mddev->resync_mark_cnt = mark_cnt[next]; 3604 mddev->resync_mark_cnt = mark_cnt[next];
3388 mark[next] = jiffies; 3605 mark[next] = jiffies;
3389 mark_cnt[next] = j - atomic_read(&mddev->recovery_active); 3606 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
3390 last_mark = next; 3607 last_mark = next;
3391 } 3608 }
3392 3609
@@ -3413,7 +3630,8 @@ static void md_do_sync(mddev_t *mddev)
3413 mddev->queue->unplug_fn(mddev->queue); 3630 mddev->queue->unplug_fn(mddev->queue);
3414 cond_resched(); 3631 cond_resched();
3415 3632
3416 currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1; 3633 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
3634 /((jiffies-mddev->resync_mark)/HZ +1) +1;
3417 3635
3418 if (currspeed > sysctl_speed_limit_min) { 3636 if (currspeed > sysctl_speed_limit_min) {
3419 if ((currspeed > sysctl_speed_limit_max) || 3637 if ((currspeed > sysctl_speed_limit_max) ||
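To put numbers on the throttle: currspeed = (io_sectors - resync_mark_cnt)/2 / ((jiffies - resync_mark)/HZ + 1) + 1 now measures only real resync I/O, in KB/s. If, say, 10240 sectors of actual I/O were issued since the oldest mark and that mark is 4 seconds old, the estimate is 10240/2 = 5120 KB over (4+1) seconds, i.e. currspeed = 1025 KB/s (the figures are illustrative). Chunks the bitmap lets us skip advance j but not io_sectors, so a mostly-clean resync no longer inflates the speed estimate or trips the throttle.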
@@ -3433,7 +3651,7 @@ static void md_do_sync(mddev_t *mddev)
3433 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 3651 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
3434 3652
3435 /* tell personality that we are finished */ 3653 /* tell personality that we are finished */
3436 mddev->pers->sync_request(mddev, max_sectors, 1); 3654 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
3437 3655
3438 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 3656 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
3439 mddev->curr_resync > 2 && 3657 mddev->curr_resync > 2 &&
@@ -3447,7 +3665,6 @@ static void md_do_sync(mddev_t *mddev)
3447 mddev->recovery_cp = MaxSector; 3665 mddev->recovery_cp = MaxSector;
3448 } 3666 }
3449 3667
3450 md_enter_safemode(mddev);
3451 skip: 3668 skip:
3452 mddev->curr_resync = 0; 3669 mddev->curr_resync = 0;
3453 wake_up(&resync_wait); 3670 wake_up(&resync_wait);
@@ -3484,20 +3701,48 @@ void md_check_recovery(mddev_t *mddev)
3484 struct list_head *rtmp; 3701 struct list_head *rtmp;
3485 3702
3486 3703
3487 dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); 3704 if (mddev->bitmap)
3705 bitmap_daemon_work(mddev->bitmap);
3488 3706
3489 if (mddev->ro) 3707 if (mddev->ro)
3490 return; 3708 return;
3709
3710 if (signal_pending(current)) {
3711 if (mddev->pers->sync_request) {
3712 printk(KERN_INFO "md: %s in immediate safe mode\n",
3713 mdname(mddev));
3714 mddev->safemode = 2;
3715 }
3716 flush_signals(current);
3717 }
3718
3491 if ( ! ( 3719 if ( ! (
3492 mddev->sb_dirty || 3720 mddev->sb_dirty ||
3493 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 3721 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
3494 test_bit(MD_RECOVERY_DONE, &mddev->recovery) 3722 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
3723 (mddev->safemode == 1) ||
3724 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
3725 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
3495 )) 3726 ))
3496 return; 3727 return;
3728
3497 if (mddev_trylock(mddev)==0) { 3729 if (mddev_trylock(mddev)==0) {
3498 int spares =0; 3730 int spares =0;
3731
3732 spin_lock(&mddev->write_lock);
3733 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
3734 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
3735 mddev->in_sync = 1;
3736 mddev->sb_dirty = 1;
3737 }
3738 if (mddev->safemode == 1)
3739 mddev->safemode = 0;
3740 spin_unlock(&mddev->write_lock);
3741
3499 if (mddev->sb_dirty) 3742 if (mddev->sb_dirty)
3500 md_update_sb(mddev); 3743 md_update_sb(mddev);
3744
3745
3501 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 3746 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
3502 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 3747 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
3503 /* resync/recovery still happening */ 3748 /* resync/recovery still happening */
@@ -3515,6 +3760,14 @@ void md_check_recovery(mddev_t *mddev)
3515 mddev->pers->spare_active(mddev); 3760 mddev->pers->spare_active(mddev);
3516 } 3761 }
3517 md_update_sb(mddev); 3762 md_update_sb(mddev);
3763
3764 /* if array is no longer degraded, then any saved_raid_disk
3765 * information must be scrapped
3766 */
3767 if (!mddev->degraded)
3768 ITERATE_RDEV(mddev,rdev,rtmp)
3769 rdev->saved_raid_disk = -1;
3770
3518 mddev->recovery = 0; 3771 mddev->recovery = 0;
3519 /* flag recovery needed just to double check */ 3772 /* flag recovery needed just to double check */
3520 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3773 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -3557,6 +3810,13 @@ void md_check_recovery(mddev_t *mddev)
3557 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3810 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3558 if (!spares) 3811 if (!spares)
3559 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3812 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3813 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
3814 /* We are adding a device or devices to an array
3815 * which has the bitmap stored on all devices.
3816 * So make sure all bitmap pages get written
3817 */
3818 bitmap_write_all(mddev->bitmap);
3819 }
3560 mddev->sync_thread = md_register_thread(md_do_sync, 3820 mddev->sync_thread = md_register_thread(md_do_sync,
3561 mddev, 3821 mddev,
3562 "%s_resync"); 3822 "%s_resync");
@@ -3624,6 +3884,8 @@ static int __init md_init(void)
3624 " MD_SB_DISKS=%d\n", 3884 " MD_SB_DISKS=%d\n",
3625 MD_MAJOR_VERSION, MD_MINOR_VERSION, 3885 MD_MAJOR_VERSION, MD_MINOR_VERSION,
3626 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 3886 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
3887 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR,
3888 BITMAP_MINOR);
3627 3889
3628 if (register_blkdev(MAJOR_NR, "md")) 3890 if (register_blkdev(MAJOR_NR, "md"))
3629 return -1; 3891 return -1;
@@ -3739,7 +4001,6 @@ EXPORT_SYMBOL(md_error);
3739EXPORT_SYMBOL(md_done_sync); 4001EXPORT_SYMBOL(md_done_sync);
3740EXPORT_SYMBOL(md_write_start); 4002EXPORT_SYMBOL(md_write_start);
3741EXPORT_SYMBOL(md_write_end); 4003EXPORT_SYMBOL(md_write_end);
3742EXPORT_SYMBOL(md_handle_safemode);
3743EXPORT_SYMBOL(md_register_thread); 4004EXPORT_SYMBOL(md_register_thread);
3744EXPORT_SYMBOL(md_unregister_thread); 4005EXPORT_SYMBOL(md_unregister_thread);
3745EXPORT_SYMBOL(md_wakeup_thread); 4006EXPORT_SYMBOL(md_wakeup_thread);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 4e4bfde3db5d..2d2ca7fa0265 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -462,10 +462,6 @@ static int multipath_run (mddev_t *mddev)
462 } 462 }
463 memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks); 463 memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks);
464 464
465 mddev->queue->unplug_fn = multipath_unplug;
466
467 mddev->queue->issue_flush_fn = multipath_issue_flush;
468
469 conf->working_disks = 0; 465 conf->working_disks = 0;
470 ITERATE_RDEV(mddev,rdev,tmp) { 466 ITERATE_RDEV(mddev,rdev,tmp) {
471 disk_idx = rdev->raid_disk; 467 disk_idx = rdev->raid_disk;
@@ -528,13 +524,16 @@ static int multipath_run (mddev_t *mddev)
528 * Ok, everything is just fine now 524 * Ok, everything is just fine now
529 */ 525 */
530 mddev->array_size = mddev->size; 526 mddev->array_size = mddev->size;
527
528 mddev->queue->unplug_fn = multipath_unplug;
529 mddev->queue->issue_flush_fn = multipath_issue_flush;
530
531 return 0; 531 return 0;
532 532
533out_free_conf: 533out_free_conf:
534 if (conf->pool) 534 if (conf->pool)
535 mempool_destroy(conf->pool); 535 mempool_destroy(conf->pool);
536 if (conf->multipaths) 536 kfree(conf->multipaths);
537 kfree(conf->multipaths);
538 kfree(conf); 537 kfree(conf);
539 mddev->private = NULL; 538 mddev->private = NULL;
540out: 539out:
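Moving the unplug_fn/issue_flush_fn assignments to the end of multipath_run() (raid1's run() gets the identical treatment below) closes a window in which the block layer could call back into a half-built conf: publish a callback only once everything it dereferences exists. A compact illustration of the hazard and the fix (all types here are simplified stand-ins):

#include <stdio.h>
#include <stdlib.h>

struct conf { int working_disks; };

struct queue {
    struct conf *private;
    void (*unplug_fn)(struct queue *q);   /* callable any time once set */
};

static void unplug(struct queue *q)
{
    /* would dereference NULL if published before q->private was set */
    printf("unplug: %d working disks\n", q->private->working_disks);
}

static int run(struct queue *q)
{
    struct conf *conf = malloc(sizeof(*conf));

    if (!conf)
        return -1;
    conf->working_disks = 2;
    q->private = conf;

    q->unplug_fn = unplug;  /* publish last, when conf is fully built */
    return 0;
}

int main(void)
{
    struct queue q = { 0 };

    if (run(&q) == 0 && q.unplug_fn)
        q.unplug_fn(&q);
    free(q.private);
    return 0;
}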
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e7d934eca06f..e11dd14d0b43 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -371,10 +371,8 @@ static int raid0_run (mddev_t *mddev)
371 return 0; 371 return 0;
372 372
373out_free_conf: 373out_free_conf:
374 if (conf->strip_zone) 374 kfree(conf->strip_zone);
375 kfree(conf->strip_zone); 375 kfree(conf->devlist);
376 if (conf->devlist)
377 kfree (conf->devlist);
378 kfree(conf); 376 kfree(conf);
379 mddev->private = NULL; 377 mddev->private = NULL;
380out: 378out:
@@ -386,11 +384,11 @@ static int raid0_stop (mddev_t *mddev)
386 raid0_conf_t *conf = mddev_to_conf(mddev); 384 raid0_conf_t *conf = mddev_to_conf(mddev);
387 385
388 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 386 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
389 kfree (conf->hash_table); 387 kfree(conf->hash_table);
390 conf->hash_table = NULL; 388 conf->hash_table = NULL;
391 kfree (conf->strip_zone); 389 kfree(conf->strip_zone);
392 conf->strip_zone = NULL; 390 conf->strip_zone = NULL;
393 kfree (conf); 391 kfree(conf);
394 mddev->private = NULL; 392 mddev->private = NULL;
395 393
396 return 0; 394 return 0;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 83380b5d6593..ff1dbec864af 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -12,6 +12,15 @@
12 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk> 12 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au> 13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14 * 14 *
15 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
16 * bitmapped intelligence in resync:
17 *
18 * - bitmap marked during normal i/o
19 * - bitmap used to skip nondirty blocks during sync
20 *
21 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
22 * - persistent bitmap code
23 *
15 * This program is free software; you can redistribute it and/or modify 24 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by 25 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2, or (at your option) 26 * the Free Software Foundation; either version 2, or (at your option)
@@ -22,7 +31,16 @@
22 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */ 32 */
24 33
34#include "dm-bio-list.h"
25#include <linux/raid/raid1.h> 35#include <linux/raid/raid1.h>
36#include <linux/raid/bitmap.h>
37
38#define DEBUG 0
39#if DEBUG
40#define PRINTK(x...) printk(x)
41#else
42#define PRINTK(x...)
43#endif
26 44
27/* 45/*
28 * Number of guaranteed r1bios in case of extreme VM load: 46 * Number of guaranteed r1bios in case of extreme VM load:
@@ -287,9 +305,11 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
287 /* 305 /*
288 * this branch is our 'one mirror IO has finished' event handler: 306 * this branch is our 'one mirror IO has finished' event handler:
289 */ 307 */
290 if (!uptodate) 308 if (!uptodate) {
291 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 309 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
292 else 310 /* an I/O failed, we can't clear the bitmap */
311 set_bit(R1BIO_Degraded, &r1_bio->state);
312 } else
293 /* 313 /*
294 * Set R1BIO_Uptodate in our master bio, so that 314 * Set R1BIO_Uptodate in our master bio, so that
295 * we will return a good error code to the higher 315 * we will return a good error code to the higher
@@ -309,6 +329,10 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
309 * already. 329 * already.
310 */ 330 */
311 if (atomic_dec_and_test(&r1_bio->remaining)) { 331 if (atomic_dec_and_test(&r1_bio->remaining)) {
332 /* clear the bitmap if all writes complete successfully */
333 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
334 r1_bio->sectors,
335 !test_bit(R1BIO_Degraded, &r1_bio->state));
312 md_write_end(r1_bio->mddev); 336 md_write_end(r1_bio->mddev);
313 raid_end_bio_io(r1_bio); 337 raid_end_bio_io(r1_bio);
314 } 338 }
@@ -458,7 +482,10 @@ static void unplug_slaves(mddev_t *mddev)
458 482
459static void raid1_unplug(request_queue_t *q) 483static void raid1_unplug(request_queue_t *q)
460{ 484{
461 unplug_slaves(q->queuedata); 485 mddev_t *mddev = q->queuedata;
486
487 unplug_slaves(mddev);
488 md_wakeup_thread(mddev->thread);
462} 489}
463 490
464static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, 491static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -501,16 +528,16 @@ static void device_barrier(conf_t *conf, sector_t sect)
501{ 528{
502 spin_lock_irq(&conf->resync_lock); 529 spin_lock_irq(&conf->resync_lock);
503 wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), 530 wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
504 conf->resync_lock, unplug_slaves(conf->mddev)); 531 conf->resync_lock, raid1_unplug(conf->mddev->queue));
505 532
506 if (!conf->barrier++) { 533 if (!conf->barrier++) {
507 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, 534 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
508 conf->resync_lock, unplug_slaves(conf->mddev)); 535 conf->resync_lock, raid1_unplug(conf->mddev->queue));
509 if (conf->nr_pending) 536 if (conf->nr_pending)
510 BUG(); 537 BUG();
511 } 538 }
512 wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, 539 wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
513 conf->resync_lock, unplug_slaves(conf->mddev)); 540 conf->resync_lock, raid1_unplug(conf->mddev->queue));
514 conf->next_resync = sect; 541 conf->next_resync = sect;
515 spin_unlock_irq(&conf->resync_lock); 542 spin_unlock_irq(&conf->resync_lock);
516} 543}
@@ -522,14 +549,20 @@ static int make_request(request_queue_t *q, struct bio * bio)
522 mirror_info_t *mirror; 549 mirror_info_t *mirror;
523 r1bio_t *r1_bio; 550 r1bio_t *r1_bio;
524 struct bio *read_bio; 551 struct bio *read_bio;
525 int i, disks; 552 int i, targets = 0, disks;
526 mdk_rdev_t *rdev; 553 mdk_rdev_t *rdev;
554 struct bitmap *bitmap = mddev->bitmap;
555 unsigned long flags;
556 struct bio_list bl;
557
527 558
528 /* 559 /*
529 * Register the new request and wait if the reconstruction 560 * Register the new request and wait if the reconstruction
530 * thread has put up a barrier for new requests. 561 * thread has put up a barrier for new requests.
531 * Continue immediately if no resync is active currently. 562 * Continue immediately if no resync is active currently.
532 */ 563 */
564 md_write_start(mddev, bio); /* wait on superblock update early */
565
533 spin_lock_irq(&conf->resync_lock); 566 spin_lock_irq(&conf->resync_lock);
534 wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); 567 wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
535 conf->nr_pending++; 568 conf->nr_pending++;
@@ -552,7 +585,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
552 585
553 r1_bio->master_bio = bio; 586 r1_bio->master_bio = bio;
554 r1_bio->sectors = bio->bi_size >> 9; 587 r1_bio->sectors = bio->bi_size >> 9;
555 588 r1_bio->state = 0;
556 r1_bio->mddev = mddev; 589 r1_bio->mddev = mddev;
557 r1_bio->sector = bio->bi_sector; 590 r1_bio->sector = bio->bi_sector;
558 591
@@ -595,6 +628,13 @@ static int make_request(request_queue_t *q, struct bio * bio)
595 * bios[x] to bio 628 * bios[x] to bio
596 */ 629 */
597 disks = conf->raid_disks; 630 disks = conf->raid_disks;
631#if 0
632 { static int first=1;
633 if (first) printk("First Write sector %llu disks %d\n",
634 (unsigned long long)r1_bio->sector, disks);
635 first = 0;
636 }
637#endif
598 rcu_read_lock(); 638 rcu_read_lock();
599 for (i = 0; i < disks; i++) { 639 for (i = 0; i < disks; i++) {
600 if ((rdev=conf->mirrors[i].rdev) != NULL && 640 if ((rdev=conf->mirrors[i].rdev) != NULL &&
@@ -605,13 +645,21 @@ static int make_request(request_queue_t *q, struct bio * bio)
605 r1_bio->bios[i] = NULL; 645 r1_bio->bios[i] = NULL;
606 } else 646 } else
607 r1_bio->bios[i] = bio; 647 r1_bio->bios[i] = bio;
648 targets++;
608 } else 649 } else
609 r1_bio->bios[i] = NULL; 650 r1_bio->bios[i] = NULL;
610 } 651 }
611 rcu_read_unlock(); 652 rcu_read_unlock();
612 653
613 atomic_set(&r1_bio->remaining, 1); 654 if (targets < conf->raid_disks) {
614 md_write_start(mddev); 655 /* array is degraded, we will not clear the bitmap
656 * on I/O completion (see raid1_end_write_request) */
657 set_bit(R1BIO_Degraded, &r1_bio->state);
658 }
659
660 atomic_set(&r1_bio->remaining, 0);
661
662 bio_list_init(&bl);
615 for (i = 0; i < disks; i++) { 663 for (i = 0; i < disks; i++) {
616 struct bio *mbio; 664 struct bio *mbio;
617 if (!r1_bio->bios[i]) 665 if (!r1_bio->bios[i])
@@ -627,14 +675,23 @@ static int make_request(request_queue_t *q, struct bio * bio)
627 mbio->bi_private = r1_bio; 675 mbio->bi_private = r1_bio;
628 676
629 atomic_inc(&r1_bio->remaining); 677 atomic_inc(&r1_bio->remaining);
630 generic_make_request(mbio);
631 }
632 678
633 if (atomic_dec_and_test(&r1_bio->remaining)) { 679 bio_list_add(&bl, mbio);
634 md_write_end(mddev);
635 raid_end_bio_io(r1_bio);
636 } 680 }
637 681
682 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors);
683 spin_lock_irqsave(&conf->device_lock, flags);
684 bio_list_merge(&conf->pending_bio_list, &bl);
685 bio_list_init(&bl);
686
687 blk_plug_device(mddev->queue);
688 spin_unlock_irqrestore(&conf->device_lock, flags);
689
690#if 0
691 while ((bio = bio_list_pop(&bl)) != NULL)
692 generic_make_request(bio);
693#endif
694
638 return 0; 695 return 0;
639} 696}
640 697
@@ -714,7 +771,7 @@ static void close_sync(conf_t *conf)
714{ 771{
715 spin_lock_irq(&conf->resync_lock); 772 spin_lock_irq(&conf->resync_lock);
716 wait_event_lock_irq(conf->wait_resume, !conf->barrier, 773 wait_event_lock_irq(conf->wait_resume, !conf->barrier,
717 conf->resync_lock, unplug_slaves(conf->mddev)); 774 conf->resync_lock, raid1_unplug(conf->mddev->queue));
718 spin_unlock_irq(&conf->resync_lock); 775 spin_unlock_irq(&conf->resync_lock);
719 776
720 if (conf->barrier) BUG(); 777 if (conf->barrier) BUG();
@@ -754,9 +811,12 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
754{ 811{
755 conf_t *conf = mddev->private; 812 conf_t *conf = mddev->private;
756 int found = 0; 813 int found = 0;
757 int mirror; 814 int mirror = 0;
758 mirror_info_t *p; 815 mirror_info_t *p;
759 816
817 if (rdev->saved_raid_disk >= 0 &&
818 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
819 mirror = rdev->saved_raid_disk;
760 for (mirror=0; mirror < mddev->raid_disks; mirror++) 820 for (mirror=0; mirror < mddev->raid_disks; mirror++)
761 if ( !(p=conf->mirrors+mirror)->rdev) { 821 if ( !(p=conf->mirrors+mirror)->rdev) {
762 822
@@ -773,6 +833,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
773 p->head_position = 0; 833 p->head_position = 0;
774 rdev->raid_disk = mirror; 834 rdev->raid_disk = mirror;
775 found = 1; 835 found = 1;
836 if (rdev->saved_raid_disk != mirror)
837 conf->fullsync = 1;
776 p->rdev = rdev; 838 p->rdev = rdev;
777 break; 839 break;
778 } 840 }
@@ -828,10 +890,11 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
828 * or re-read if the read failed. 890 * or re-read if the read failed.
829 * We don't do much here, just schedule handling by raid1d 891 * We don't do much here, just schedule handling by raid1d
830 */ 892 */
831 if (!uptodate) 893 if (!uptodate) {
832 md_error(r1_bio->mddev, 894 md_error(r1_bio->mddev,
833 conf->mirrors[r1_bio->read_disk].rdev); 895 conf->mirrors[r1_bio->read_disk].rdev);
834 else 896 set_bit(R1BIO_Degraded, &r1_bio->state);
897 } else
835 set_bit(R1BIO_Uptodate, &r1_bio->state); 898 set_bit(R1BIO_Uptodate, &r1_bio->state);
836 rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); 899 rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
837 reschedule_retry(r1_bio); 900 reschedule_retry(r1_bio);
@@ -855,8 +918,10 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
855 mirror = i; 918 mirror = i;
856 break; 919 break;
857 } 920 }
858 if (!uptodate) 921 if (!uptodate) {
859 md_error(mddev, conf->mirrors[mirror].rdev); 922 md_error(mddev, conf->mirrors[mirror].rdev);
923 set_bit(R1BIO_Degraded, &r1_bio->state);
924 }
860 update_head_pos(mirror, r1_bio); 925 update_head_pos(mirror, r1_bio);
861 926
862 if (atomic_dec_and_test(&r1_bio->remaining)) { 927 if (atomic_dec_and_test(&r1_bio->remaining)) {
@@ -876,6 +941,9 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
876 941
877 bio = r1_bio->bios[r1_bio->read_disk]; 942 bio = r1_bio->bios[r1_bio->read_disk];
878 943
944/*
945 if (r1_bio->sector == 0) printk("First sync write starts\n");
946*/
879 /* 947 /*
880 * schedule writes 948 * schedule writes
881 */ 949 */
@@ -903,10 +971,12 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
903 atomic_inc(&conf->mirrors[i].rdev->nr_pending); 971 atomic_inc(&conf->mirrors[i].rdev->nr_pending);
904 atomic_inc(&r1_bio->remaining); 972 atomic_inc(&r1_bio->remaining);
905 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); 973 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
974
906 generic_make_request(wbio); 975 generic_make_request(wbio);
907 } 976 }
908 977
909 if (atomic_dec_and_test(&r1_bio->remaining)) { 978 if (atomic_dec_and_test(&r1_bio->remaining)) {
979 /* if we're here, all write(s) have completed, so clean up */
910 md_done_sync(mddev, r1_bio->sectors, 1); 980 md_done_sync(mddev, r1_bio->sectors, 1);
911 put_buf(r1_bio); 981 put_buf(r1_bio);
912 } 982 }
@@ -931,11 +1001,30 @@ static void raid1d(mddev_t *mddev)
931 mdk_rdev_t *rdev; 1001 mdk_rdev_t *rdev;
932 1002
933 md_check_recovery(mddev); 1003 md_check_recovery(mddev);
934 md_handle_safemode(mddev);
935 1004
936 for (;;) { 1005 for (;;) {
937 char b[BDEVNAME_SIZE]; 1006 char b[BDEVNAME_SIZE];
938 spin_lock_irqsave(&conf->device_lock, flags); 1007 spin_lock_irqsave(&conf->device_lock, flags);
1008
1009 if (conf->pending_bio_list.head) {
1010 bio = bio_list_get(&conf->pending_bio_list);
1011 blk_remove_plug(mddev->queue);
1012 spin_unlock_irqrestore(&conf->device_lock, flags);
1013 /* flush any pending bitmap writes to disk before proceeding w/ I/O */
1014 if (bitmap_unplug(mddev->bitmap) != 0)
1015 printk("%s: bitmap file write failed!\n", mdname(mddev));
1016
1017 while (bio) { /* submit pending writes */
1018 struct bio *next = bio->bi_next;
1019 bio->bi_next = NULL;
1020 generic_make_request(bio);
1021 bio = next;
1022 }
1023 unplug = 1;
1024
1025 continue;
1026 }
1027
939 if (list_empty(head)) 1028 if (list_empty(head))
940 break; 1029 break;
941 r1_bio = list_entry(head->prev, r1bio_t, retry_list); 1030 r1_bio = list_entry(head->prev, r1bio_t, retry_list);
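This new branch at the top of raid1d() is what makes the bitmap a trustworthy write-intent log. make_request() dirties the in-memory bitmap bits and only queues the write bios on pending_bio_list; the daemon then refuses to submit those bios until bitmap_unplug() has pushed the dirty bitmap pages to stable storage, so the intent record always reaches disk before the data it covers. The ordering, modelled in userspace (a mutex-guarded list stands in for device_lock and the bio lists; all functions are illustrative stubs):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct bio { long sector; struct bio *next; };

static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
static struct bio *pending;             /* models conf->pending_bio_list */

static void bitmap_startwrite(long sector)
{
    printf("bitmap: dirty bit for sector %ld\n", sector);
}

static void bitmap_unplug(void)
{
    printf("bitmap: dirty pages flushed to disk\n");
}

static void make_request(long sector)
{
    struct bio *b = malloc(sizeof(*b));

    if (!b)
        return;
    b->sector = sector;
    bitmap_startwrite(sector);          /* record intent first */
    pthread_mutex_lock(&device_lock);
    b->next = pending;                  /* queue; do not submit yet */
    pending = b;
    pthread_mutex_unlock(&device_lock);
}

static void raid1d(void)
{
    struct bio *bio;

    pthread_mutex_lock(&device_lock);
    bio = pending;                      /* bio_list_get() */
    pending = NULL;
    pthread_mutex_unlock(&device_lock);

    bitmap_unplug();                    /* bitmap hits disk BEFORE the data */
    while (bio) {
        struct bio *next = bio->next;
        printf("submit write to sector %ld\n", bio->sector);
        free(bio);
        bio = next;
    }
}

int main(void)
{
    make_request(8);
    make_request(16);
    raid1d();
    return 0;
}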
@@ -1009,7 +1098,7 @@ static int init_resync(conf_t *conf)
1009 * that can be installed to exclude normal IO requests. 1098 * that can be installed to exclude normal IO requests.
1010 */ 1099 */
1011 1100
1012static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) 1101static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1013{ 1102{
1014 conf_t *conf = mddev_to_conf(mddev); 1103 conf_t *conf = mddev_to_conf(mddev);
1015 mirror_info_t *mirror; 1104 mirror_info_t *mirror;
@@ -1019,17 +1108,43 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1019 int disk; 1108 int disk;
1020 int i; 1109 int i;
1021 int write_targets = 0; 1110 int write_targets = 0;
1111 int sync_blocks;
1022 1112
1023 if (!conf->r1buf_pool) 1113 if (!conf->r1buf_pool)
1114 {
1115/*
1116 printk("sync start - bitmap %p\n", mddev->bitmap);
1117*/
1024 if (init_resync(conf)) 1118 if (init_resync(conf))
1025 return -ENOMEM; 1119 return 0;
1120 }
1026 1121
1027 max_sector = mddev->size << 1; 1122 max_sector = mddev->size << 1;
1028 if (sector_nr >= max_sector) { 1123 if (sector_nr >= max_sector) {
1124 /* If we aborted, we need to abort the
1125 * sync on the 'current' bitmap chunk (there will
1126 * only be one in raid1 resync).
1127 * We can find the current address in mddev->curr_resync
1128 */
1129 if (!conf->fullsync) {
1130 if (mddev->curr_resync < max_sector)
1131 bitmap_end_sync(mddev->bitmap,
1132 mddev->curr_resync,
1133 &sync_blocks, 1);
1134 bitmap_close_sync(mddev->bitmap);
1135 }
1136 if (mddev->curr_resync >= max_sector)
1137 conf->fullsync = 0;
1029 close_sync(conf); 1138 close_sync(conf);
1030 return 0; 1139 return 0;
1031 } 1140 }
1032 1141
1142 if (!conf->fullsync &&
1143 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks)) {
1144 /* We can skip this block, and probably several more */
1145 *skipped = 1;
1146 return sync_blocks;
1147 }
1033 /* 1148 /*
1034 * If there is non-resync activity waiting for us then 1149 * If there is non-resync activity waiting for us then
1035 * put in a delay to throttle resync. 1150 * put in a delay to throttle resync.
@@ -1068,6 +1183,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1068 1183
1069 r1_bio->mddev = mddev; 1184 r1_bio->mddev = mddev;
1070 r1_bio->sector = sector_nr; 1185 r1_bio->sector = sector_nr;
1186 r1_bio->state = 0;
1071 set_bit(R1BIO_IsSync, &r1_bio->state); 1187 set_bit(R1BIO_IsSync, &r1_bio->state);
1072 r1_bio->read_disk = disk; 1188 r1_bio->read_disk = disk;
1073 1189
@@ -1102,18 +1218,24 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1102 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 1218 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1103 bio->bi_private = r1_bio; 1219 bio->bi_private = r1_bio;
1104 } 1220 }
1221
1222 if (write_targets + 1 < conf->raid_disks)
1223 /* array degraded, can't clear bitmap */
1224 set_bit(R1BIO_Degraded, &r1_bio->state);
1225
1105 if (write_targets == 0) { 1226 if (write_targets == 0) {
1106 /* There is nowhere to write, so all non-sync 1227 /* There is nowhere to write, so all non-sync
1107 * drives must be failed - so we are finished 1228 * drives must be failed - so we are finished
1108 */ 1229 */
1109 int rv = max_sector - sector_nr; 1230 sector_t rv = max_sector - sector_nr;
1110 md_done_sync(mddev, rv, 1); 1231 *skipped = 1;
1111 put_buf(r1_bio); 1232 put_buf(r1_bio);
1112 rdev_dec_pending(conf->mirrors[disk].rdev, mddev); 1233 rdev_dec_pending(conf->mirrors[disk].rdev, mddev);
1113 return rv; 1234 return rv;
1114 } 1235 }
1115 1236
1116 nr_sectors = 0; 1237 nr_sectors = 0;
1238 sync_blocks = 0;
1117 do { 1239 do {
1118 struct page *page; 1240 struct page *page;
1119 int len = PAGE_SIZE; 1241 int len = PAGE_SIZE;
@@ -1121,6 +1243,17 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1121 len = (max_sector - sector_nr) << 9; 1243 len = (max_sector - sector_nr) << 9;
1122 if (len == 0) 1244 if (len == 0)
1123 break; 1245 break;
1246 if (!conf->fullsync) {
1247 if (sync_blocks == 0) {
1248 if (!bitmap_start_sync(mddev->bitmap,
1249 sector_nr, &sync_blocks))
1250 break;
1251 if (sync_blocks < (PAGE_SIZE>>9))
1252 BUG();
1253 if (len > (sync_blocks<<9)) len = sync_blocks<<9;
1254 }
1255 }
1256
1124 for (i=0 ; i < conf->raid_disks; i++) { 1257 for (i=0 ; i < conf->raid_disks; i++) {
1125 bio = r1_bio->bios[i]; 1258 bio = r1_bio->bios[i];
1126 if (bio->bi_end_io) { 1259 if (bio->bi_end_io) {
@@ -1143,6 +1276,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1143 } 1276 }
1144 nr_sectors += len>>9; 1277 nr_sectors += len>>9;
1145 sector_nr += len>>9; 1278 sector_nr += len>>9;
1279 sync_blocks -= (len>>9);
1146 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); 1280 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
1147 bio_full: 1281 bio_full:
1148 bio = r1_bio->bios[disk]; 1282 bio = r1_bio->bios[disk];
@@ -1197,10 +1331,6 @@ static int run(mddev_t *mddev)
1197 if (!conf->r1bio_pool) 1331 if (!conf->r1bio_pool)
1198 goto out_no_mem; 1332 goto out_no_mem;
1199 1333
1200 mddev->queue->unplug_fn = raid1_unplug;
1201
1202 mddev->queue->issue_flush_fn = raid1_issue_flush;
1203
1204 ITERATE_RDEV(mddev, rdev, tmp) { 1334 ITERATE_RDEV(mddev, rdev, tmp) {
1205 disk_idx = rdev->raid_disk; 1335 disk_idx = rdev->raid_disk;
1206 if (disk_idx >= mddev->raid_disks 1336 if (disk_idx >= mddev->raid_disks
@@ -1235,6 +1365,9 @@ static int run(mddev_t *mddev)
1235 init_waitqueue_head(&conf->wait_idle); 1365 init_waitqueue_head(&conf->wait_idle);
1236 init_waitqueue_head(&conf->wait_resume); 1366 init_waitqueue_head(&conf->wait_resume);
1237 1367
1368 bio_list_init(&conf->pending_bio_list);
1369 bio_list_init(&conf->flushing_bio_list);
1370
1238 if (!conf->working_disks) { 1371 if (!conf->working_disks) {
1239 printk(KERN_ERR "raid1: no operational mirrors for %s\n", 1372 printk(KERN_ERR "raid1: no operational mirrors for %s\n",
1240 mdname(mddev)); 1373 mdname(mddev));
@@ -1263,16 +1396,15 @@ static int run(mddev_t *mddev)
1263 conf->last_used = j; 1396 conf->last_used = j;
1264 1397
1265 1398
1266 1399 mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
1267 { 1400 if (!mddev->thread) {
1268 mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); 1401 printk(KERN_ERR
1269 if (!mddev->thread) { 1402 "raid1: couldn't allocate thread for %s\n",
1270 printk(KERN_ERR 1403 mdname(mddev));
1271 "raid1: couldn't allocate thread for %s\n", 1404 goto out_free_conf;
1272 mdname(mddev));
1273 goto out_free_conf;
1274 }
1275 } 1405 }
1406 if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1407
1276 printk(KERN_INFO 1408 printk(KERN_INFO
1277 "raid1: raid set %s active with %d out of %d mirrors\n", 1409 "raid1: raid set %s active with %d out of %d mirrors\n",
1278 mdname(mddev), mddev->raid_disks - mddev->degraded, 1410 mdname(mddev), mddev->raid_disks - mddev->degraded,
@@ -1282,6 +1414,9 @@ static int run(mddev_t *mddev)
1282 */ 1414 */
1283 mddev->array_size = mddev->size; 1415 mddev->array_size = mddev->size;
1284 1416
1417 mddev->queue->unplug_fn = raid1_unplug;
1418 mddev->queue->issue_flush_fn = raid1_issue_flush;
1419
1285 return 0; 1420 return 0;
1286 1421
1287out_no_mem: 1422out_no_mem:
@@ -1292,10 +1427,8 @@ out_free_conf:
 	if (conf) {
 		if (conf->r1bio_pool)
 			mempool_destroy(conf->r1bio_pool);
-		if (conf->mirrors)
-			kfree(conf->mirrors);
-		if (conf->poolinfo)
-			kfree(conf->poolinfo);
+		kfree(conf->mirrors);
+		kfree(conf->poolinfo);
 		kfree(conf);
 		mddev->private = NULL;
 	}
@@ -1312,10 +1445,8 @@ static int stop(mddev_t *mddev)
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	if (conf->r1bio_pool)
 		mempool_destroy(conf->r1bio_pool);
-	if (conf->mirrors)
-		kfree(conf->mirrors);
-	if (conf->poolinfo)
-		kfree(conf->poolinfo);
+	kfree(conf->mirrors);
+	kfree(conf->poolinfo);
 	kfree(conf);
 	mddev->private = NULL;
 	return 0;
@@ -1350,17 +1481,26 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
 	 * We allocate a new r1bio_pool if we can.
 	 * Then raise a device barrier and wait until all IO stops.
 	 * Then resize conf->mirrors and swap in the new r1bio pool.
+	 *
+	 * At the same time, we "pack" the devices so that all the missing
+	 * devices have the higher raid_disk numbers.
 	 */
 	mempool_t *newpool, *oldpool;
 	struct pool_info *newpoolinfo;
 	mirror_info_t *newmirrors;
 	conf_t *conf = mddev_to_conf(mddev);
+	int cnt;
 
-	int d;
+	int d, d2;
 
-	for (d= raid_disks; d < conf->raid_disks; d++)
-		if (conf->mirrors[d].rdev)
+	if (raid_disks < conf->raid_disks) {
+		cnt=0;
+		for (d= 0; d < conf->raid_disks; d++)
+			if (conf->mirrors[d].rdev)
+				cnt++;
+		if (cnt > raid_disks)
 			return -EBUSY;
+	}
 
 	newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
 	if (!newpoolinfo)
@@ -1385,14 +1525,18 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
 	spin_lock_irq(&conf->resync_lock);
 	conf->barrier++;
 	wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-			    conf->resync_lock, unplug_slaves(mddev));
+			    conf->resync_lock, raid1_unplug(mddev->queue));
 	spin_unlock_irq(&conf->resync_lock);
 
 	/* ok, everything is stopped */
 	oldpool = conf->r1bio_pool;
 	conf->r1bio_pool = newpool;
-	for (d=0; d < raid_disks && d < conf->raid_disks; d++)
-		newmirrors[d] = conf->mirrors[d];
+
+	for (d=d2=0; d < conf->raid_disks; d++)
+		if (conf->mirrors[d].rdev) {
+			conf->mirrors[d].rdev->raid_disk = d2;
+			newmirrors[d2++].rdev = conf->mirrors[d].rdev;
+		}
 	kfree(conf->mirrors);
 	conf->mirrors = newmirrors;
 	kfree(conf->poolinfo);
@@ -1401,6 +1545,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
 	mddev->degraded += (raid_disks - conf->raid_disks);
 	conf->raid_disks = mddev->raid_disks = raid_disks;
 
+	conf->last_used = 0; /* just make sure it is in-range */
 	spin_lock_irq(&conf->resync_lock);
 	conf->barrier--;
 	spin_unlock_irq(&conf->resync_lock);
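The raid1_reshape hunks above pack the surviving devices into the lowest raid_disk slots before swapping in the resized mirrors array. A minimal userspace sketch of the same compaction idiom follows; struct mirror and pack_mirrors are illustrative names, not kernel API.

#include <stdio.h>
#include <stddef.h>

struct mirror { const char *rdev; int raid_disk; };

/* Compact non-NULL entries into the lowest slots, re-numbering each
 * device to its new slot, as the d/d2 loop in the hunk above does. */
static int pack_mirrors(const struct mirror *old, struct mirror *packed, int n)
{
	int d, d2 = 0;

	for (d = 0; d < n; d++)
		if (old[d].rdev) {
			packed[d2].rdev = old[d].rdev;
			packed[d2].raid_disk = d2;
			d2++;
		}
	return d2;	/* number of surviving devices */
}

int main(void)
{
	struct mirror old[4] = { { "sda", 0 }, { NULL, 1 }, { "sdc", 2 }, { NULL, 3 } };
	struct mirror packed[4] = { { NULL, 0 } };
	int cnt = pack_mirrors(old, packed, 4);

	for (int d = 0; d < cnt; d++)
		printf("slot %d -> %s\n", packed[d].raid_disk, packed[d].rdev);
	return 0;
}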
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e9dc2876a626..62ebb1bc72be 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -700,6 +700,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		return 0;
 	}
 
+	md_write_start(mddev, bio);
+
 	/*
 	 * Register the new request and wait if the reconstruction
 	 * thread has put up a bar for new requests.
@@ -774,7 +776,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	rcu_read_unlock();
 
 	atomic_set(&r10_bio->remaining, 1);
-	md_write_start(mddev);
+
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
@@ -1216,7 +1218,6 @@ static void raid10d(mddev_t *mddev)
 	mdk_rdev_t *rdev;
 
 	md_check_recovery(mddev);
-	md_handle_safemode(mddev);
 
 	for (;;) {
 		char b[BDEVNAME_SIZE];
@@ -1319,7 +1320,7 @@ static int init_resync(conf_t *conf)
  *
  */
 
-static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
 	conf_t *conf = mddev_to_conf(mddev);
 	r10bio_t *r10_bio;
@@ -1333,7 +1334,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
 
 	if (!conf->r10buf_pool)
 		if (init_resync(conf))
-			return -ENOMEM;
+			return 0;
 
 skipped:
 	max_sector = mddev->size << 1;
@@ -1341,15 +1342,15 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
 		close_sync(conf);
+		*skipped = 1;
 		return sectors_skipped;
 	}
 	if (chunks_skipped >= conf->raid_disks) {
 		/* if there has been nothing to do on any drive,
 		 * then there is nothing to do at all..
 		 */
-		sector_t sec = max_sector - sector_nr;
-		md_done_sync(mddev, sec, 1);
-		return sec + sectors_skipped;
+		*skipped = 1;
+		return (max_sector - sector_nr) + sectors_skipped;
 	}
 
 	/* make sure whole request will fit in a chunk - if chunks
@@ -1563,17 +1564,22 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
 		}
 	}
 
+	if (sectors_skipped)
+		/* pretend they weren't skipped, it makes
+		 * no important difference in this case
+		 */
+		md_done_sync(mddev, sectors_skipped, 1);
+
 	return sectors_skipped + nr_sectors;
 giveup:
 	/* There is nowhere to write, so all non-sync
 	 * drives must be failed, so try the next chunk...
 	 */
 	{
-	int sec = max_sector - sector_nr;
+	sector_t sec = max_sector - sector_nr;
 	sectors_skipped += sec;
 	chunks_skipped ++;
 	sector_nr = max_sector;
-	md_done_sync(mddev, sec, 1);
 	goto skipped;
 	}
 }
@@ -1639,9 +1645,6 @@ static int run(mddev_t *mddev)
 			mdname(mddev));
 		goto out_free_conf;
 	}
-	mddev->queue->unplug_fn = raid10_unplug;
-
-	mddev->queue->issue_flush_fn = raid10_issue_flush;
 
 	ITERATE_RDEV(mddev, rdev, tmp) {
 		disk_idx = rdev->raid_disk;
@@ -1713,6 +1716,9 @@ static int run(mddev_t *mddev)
 	mddev->array_size = size/2;
 	mddev->resync_max_sectors = size;
 
+	mddev->queue->unplug_fn = raid10_unplug;
+	mddev->queue->issue_flush_fn = raid10_issue_flush;
+
 	/* Calculate max read-ahead size.
 	 * We need to readahead at least twice a whole stripe....
 	 * maybe...
@@ -1731,8 +1737,7 @@ static int run(mddev_t *mddev)
 out_free_conf:
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
-	if (conf->mirrors)
-		kfree(conf->mirrors);
+	kfree(conf->mirrors);
 	kfree(conf);
 	mddev->private = NULL;
 out:
@@ -1748,8 +1753,7 @@ static int stop(mddev_t *mddev)
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
-	if (conf->mirrors)
-		kfree(conf->mirrors);
+	kfree(conf->mirrors);
 	kfree(conf);
 	mddev->private = NULL;
 	return 0;
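The raid10 hunks above change the sync_request() contract: instead of calling md_done_sync() itself for regions that need no I/O, the personality now returns a sector count and sets *skipped, leaving the accounting to its caller. Below is a standalone sketch of that contract with a toy personality and a simplified caller loop; the real loop lives in md.c and also throttles and reports progress, and all names here are illustrative.

#include <stdio.h>

typedef unsigned long long sector_t;

/* toy personality: the first 1024 sectors are already in sync */
static sector_t toy_sync_request(sector_t sector_nr, sector_t max_sector, int *skipped)
{
	if (sector_nr >= max_sector)
		return 0;
	if (sector_nr < 1024) {
		*skipped = 1;		/* no I/O needed for this range */
		return 1024 - sector_nr;
	}
	return 64;			/* pretend 64 sectors of resync I/O were issued */
}

int main(void)
{
	sector_t j = 0, max_sector = 2048, synced = 0, skipped_total = 0;

	while (j < max_sector) {
		int skipped = 0;
		sector_t n = toy_sync_request(j, max_sector, &skipped);

		if (!n)
			break;
		if (skipped)
			skipped_total += n;	/* advance without counting I/O */
		else
			synced += n;
		j += n;
	}
	printf("synced=%llu skipped=%llu\n", synced, skipped_total);
	return 0;
}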
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e96e2a10a9c9..93a9726cc2d6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1411,6 +1411,8 @@ static int make_request (request_queue_t *q, struct bio * bi)
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
 
+	md_write_start(mddev, bi);
+
 	if (bio_data_dir(bi)==WRITE) {
 		disk_stat_inc(mddev->gendisk, writes);
 		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
@@ -1423,8 +1425,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
-	if ( bio_data_dir(bi) == WRITE )
-		md_write_start(mddev);
+
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
 
@@ -1475,7 +1476,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
 }
 
 /* FIXME go_faster isn't used */
-static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
@@ -1498,8 +1499,8 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
 	 * nothing we can do.
 	 */
 	if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-		int rv = (mddev->size << 1) - sector_nr;
-		md_done_sync(mddev, rv, 1);
+		sector_t rv = (mddev->size << 1) - sector_nr;
+		*skipped = 1;
 		return rv;
 	}
 
@@ -1546,7 +1547,6 @@ static void raid5d (mddev_t *mddev)
 	PRINTK("+++ raid5d active\n");
 
 	md_check_recovery(mddev);
-	md_handle_safemode(mddev);
 
 	handled = 0;
 	spin_lock_irq(&conf->device_lock);
@@ -1620,9 +1620,6 @@ static int run (mddev_t *mddev)
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
 
-	mddev->queue->unplug_fn = raid5_unplug_device;
-	mddev->queue->issue_flush_fn = raid5_issue_flush;
-
 	PRINTK("raid5: run(%s) called.\n", mdname(mddev));
 
 	ITERATE_RDEV(mddev,rdev,tmp) {
@@ -1728,6 +1725,10 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
 	}
 
 	/* Ok, everything is just fine now */
+
+	mddev->queue->unplug_fn = raid5_unplug_device;
+	mddev->queue->issue_flush_fn = raid5_issue_flush;
+
 	mddev->array_size = mddev->size * (mddev->raid_disks - 1);
 	return 0;
 abort:
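Besides moving md_write_start() and dropping md_handle_safemode(), the raid5 and raid10 hunks above widen resync arithmetic from int to sector_t. The standalone sketch below shows why: past 2^31 sectors (1 TiB at 512 bytes per sector), a remaining-sector count no longer fits in a 32-bit int and wraps to the wrong value. The numbers are illustrative.

#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	/* a ~2.5 TiB array: 5 * 2^30 sectors of 512 bytes */
	sector_t max_sector = 5ULL << 30;
	sector_t sector_nr = 0;

	int narrow = (int)(max_sector - sector_nr);	/* wraps to the wrong value */
	sector_t wide = max_sector - sector_nr;		/* correct */

	printf("int: %d  sector_t: %llu\n", narrow, wide);
	return 0;
}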
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 8a33f351e092..f62ea1a73d0d 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1570,6 +1570,8 @@ static int make_request (request_queue_t *q, struct bio * bi)
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
 
+	md_write_start(mddev, bi);
+
 	if (bio_data_dir(bi)==WRITE) {
 		disk_stat_inc(mddev->gendisk, writes);
 		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
@@ -1583,8 +1585,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
 
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
-	if ( bio_data_dir(bi) == WRITE )
-		md_write_start(mddev);
+
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
 
@@ -1634,7 +1635,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
 }
 
 /* FIXME go_faster isn't used */
-static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
 	raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
 	struct stripe_head *sh;
@@ -1657,8 +1658,8 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
 	 * nothing we can do.
 	 */
 	if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-		int rv = (mddev->size << 1) - sector_nr;
-		md_done_sync(mddev, rv, 1);
+		sector_t rv = (mddev->size << 1) - sector_nr;
+		*skipped = 1;
 		return rv;
 	}
 
@@ -1705,7 +1706,6 @@ static void raid6d (mddev_t *mddev)
 	PRINTK("+++ raid6d active\n");
 
 	md_check_recovery(mddev);
-	md_handle_safemode(mddev);
 
 	handled = 0;
 	spin_lock_irq(&conf->device_lock);
@@ -1779,9 +1779,6 @@ static int run (mddev_t *mddev)
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
 
-	mddev->queue->unplug_fn = raid6_unplug_device;
-	mddev->queue->issue_flush_fn = raid6_issue_flush;
-
 	PRINTK("raid6: run(%s) called.\n", mdname(mddev));
 
 	ITERATE_RDEV(mddev,rdev,tmp) {
@@ -1895,6 +1892,9 @@ static int run (mddev_t *mddev)
 
 	/* Ok, everything is just fine now */
 	mddev->array_size = mddev->size * (mddev->raid_disks - 2);
+
+	mddev->queue->unplug_fn = raid6_unplug_device;
+	mddev->queue->issue_flush_fn = raid6_issue_flush;
 	return 0;
 abort:
 	if (conf) {
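Across the raid5, raid6 and raid10 hunks above, md_write_start() moves to the very top of make_request() and now takes the bio, so write intent is recorded before any request is queued. The standalone toy below illustrates that ordering invariant; toy_write_start and toy_submit_write are illustrative names, not kernel API.

#include <assert.h>
#include <stdio.h>

static int array_dirty;		/* stand-in for the superblock dirty state */

static void toy_write_start(void)
{
	array_dirty = 1;	/* record intent before any write I/O */
}

static void toy_submit_write(int sector)
{
	/* a crash after this point is only recoverable if intent was recorded */
	assert(array_dirty);
	printf("write queued at sector %d\n", sector);
}

int main(void)
{
	toy_write_start();	/* must come first, as in the hunks above */
	toy_submit_write(0);
	toy_submit_write(8);
	return 0;
}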